Fix quote handling and tweak the whitelist a bit. 'any' token registrations
are now merged with specific registrations by rank. It is not yet clear whether
that is a good idea overall; use cases need to be checked when implementing
template expansion and other functionality.

183 parser tests now passing.
This commit is contained in:
Gabriel Wicke 2012-01-04 14:09:05 +00:00
parent 6cd95fea37
commit 2e35171fd1
Notes: Gabriel Wicke 2012-02-27 16:40:01 +00:00
3 changed files with 44 additions and 26 deletions

View file

@ -5,20 +5,25 @@
*/
function QuoteTransformer ( ) {
// Bold and italic quote tokens are collected in the lists that reset()
// initializes, and are then processed in onNewLine.
// Rank that onNewLine stamps onto the newline token it handles.
this.quoteAndNewlineRank = 2.1;
// Rank under which onQuote registers the onAny transform.
this.anyRank = 2.101; // Just after regular quote and newline
// Initialize the chunk / italic / bold bookkeeping.
this.reset();
}
// Reset the per-line state: the chunk being built, the list of finished
// chunks, the italic / bold chunk references, and the isActive flag.
// Called from the constructor and again by onNewLine once a line is done.
QuoteTransformer.prototype.reset = function ( ) {
// NOTE(review): italics and bolds are assigned again further down; this first
// pair looks redundant (possibly the pre-change side of the diff) -- confirm.
this.italics = [];
this.bolds = [];
// A chunk starts with a token context around a quote token and is
// (optionally) followed by non-quote tokens. The quote token and its
// context is later replaced with the actual tag token for italic or bold.
this.currentChunk = [];
// List of chunks, each starting with a (potentially) bold or italic token
// and followed by plain tokens.
this.chunks = [];
// References to chunks in which the first token context / quote token
// should be converted to italic or bold tokens.
this.italics = [];
this.bolds = [];
// Set to true by onQuote once the onAny transform has been registered;
// onNewLine checks it before doing any work.
this.isActive = false;
};
@ -37,19 +42,15 @@ QuoteTransformer.prototype.register = function ( dispatcher ) {
// Finish the current chunk and start a new, empty one: the current chunk is
// appended to this.chunks and this.currentChunk is replaced by a fresh array.
QuoteTransformer.prototype._startNewChunk = function ( ) {
// Record the index the current chunk will occupy in this.chunks once pushed.
this.currentChunk.pos = this.chunks.length;
this.chunks.push( this.currentChunk );
this.currentChunk = [];
// NOTE(review): this sets pos on the new, not-yet-pushed chunk to the index of
// the chunk that was just pushed. It may be the other side of the diff for
// the pos assignment above -- confirm which of the two is current.
this.currentChunk.pos = this.chunks.length - 1;
};
// Handle QUOTE tags. These are collected in italic/bold lists depending on
// the length of quote string. Actual analysis and conversion to the
// appropriate tag tokens is deferred until the next NEWLINE token triggers
// onNewLine.
//
// XXX: Cannot use async stuff here, need to buffer things locally instead!
// FIXME: Convert to internal buffering! -> return all tokens with rank set to
// own rank to avoid reprocessing
QuoteTransformer.prototype.onQuote = function ( token, cb, frame, prevToken ) {
var qlen = token.value.length,
tokens = [], // output tokens
@ -66,9 +67,10 @@ QuoteTransformer.prototype.onQuote = function ( token, cb, frame, prevToken ) {
};
if ( this.chunks.length === 0 ) {
if ( ! this.isActive ) {
// register for any token if not yet active
this.dispatcher.addTransform( this.onAny.bind(this), this.anyRank, 'tag', 'mw-quote' );
this.dispatcher.addTransform( this.onAny.bind(this), this.anyRank, 'any' );
this.isActive = true;
}
this._startNewChunk();
@ -114,7 +116,7 @@ QuoteTransformer.prototype.onQuote = function ( token, cb, frame, prevToken ) {
break;
}
return { token: null };
return {};
};
QuoteTransformer.prototype.onAny = function ( token, cb, frame, prevToken ) {
@ -128,28 +130,28 @@ QuoteTransformer.prototype.onAny = function ( token, cb, frame, prevToken ) {
QuoteTransformer.prototype.onNewLine = function ( token, cb, frame, prevToken ) {
var res;
if( ! this.chunks.length ) {
if( ! this.isActive ) {
// Nothing to do, quick abort.
return { token: token };
}
token.rank = this.quoteAndNewlineRank;
this.currentChunk.push( token );
this._startNewChunk();
//console.log("onNewLine: " + this.italics + this.bolds);
//console.log('chunks: ' + JSON.stringify( this.chunks, null, 2 ) );
//console.log("onNewLine: " + this.italics.length + 'i/b' + this.bolds.length);
// balance out tokens, convert placeholders into tags
if (this.italics.length % 2 && this.bolds.length % 2) {
var firstsingleletterword = -1,
firstmultiletterword = -1,
firstspace = -1;
for (var j = 0; j < this.bolds.length; j++) {
var ctx = this.bolds[j];
var ctx = this.bolds[j][0];
//console.log("balancing!" + JSON.stringify(ctx.prevToken, null, 2));
if (ctx.prevToken) {
if (ctx.prevToken.type === 'TEXT') {
var lastchar = ctx.prevToken.value[ctx.prevToken.value.length - 1],
var lastchar = prevToken.value[ctx.prevToken.value.length - 1],
secondtolastchar = ctx.prevToken.value[ctx.prevToken.value.length - 2];
if (lastchar === ' ' && firstspace === -1) {
firstspace = j;
@ -189,12 +191,15 @@ QuoteTransformer.prototype.onNewLine = function ( token, cb, frame, prevToken )
this.quotesToTags( this.italics, 'i' );
this.quotesToTags( this.bolds, 'b' );
this.currentChunk.push( token );
this._startNewChunk();
//console.log('chunks: ' + JSON.stringify( this.chunks, null, 2 ) );
// return all collected tokens including the newline
res = { tokens: [].concat.apply([], this.chunks) };
// prepare for next session
// prepare for next line
this.reset();
// remove 'any' registration
@ -246,7 +251,7 @@ QuoteTransformer.prototype.quotesToTags = function ( chunks, name ) {
toggle = !toggle;
}
if (!toggle) {
// Add end tag, but don't count it towards completion.
// Add end tag
this.currentChunk.push( {type: 'ENDTAG', name: name} );
}
};

View file

@ -127,7 +127,7 @@ TokenTransformDispatcher.prototype.addTransform = function ( transformation, ran
}
transArr.push(transformer);
// sort ascending by rank
transArr.sort( function ( t1, t2 ) { return t1.rank - t2.rank; } );
transArr.sort( this._cmpTransformations );
};
/**
@ -185,6 +185,12 @@ TokenTransformDispatcher.prototype._resetTokenRank = function ( res, transformer
}
};
/**
 * Comparator that orders transformations from lowest to highest rank.
 *
 * Suitable as the callback for Array.prototype.sort.
 *
 * @param {Object} a First transformation; only its `rank` property is read.
 * @param {Object} b Second transformation; only its `rank` property is read.
 * @returns {number} Negative if a ranks below b, positive if a ranks above b,
 * and zero if both ranks are equal.
 */
TokenTransformDispatcher.prototype._cmpTransformations = function ( a, b ) {
	var rankDelta = a.rank - b.rank;
	return rankDelta;
};
/* Call all transformers on a tag.
*
@ -206,8 +212,10 @@ TokenTransformDispatcher.prototype._transformTagToken = function ( token, cb, ph
tName = token.name.toLowerCase(),
tagts = this.transformers[phaseEndRank].tag[tName];
if ( tagts ) {
if ( tagts && tagts.length ) {
// could cache this per tag type to avoid re-sorting each time
ts = ts.concat(tagts);
ts.sort( this._cmpTransformations );
}
//console.log(JSON.stringify(ts, null, 2));
if ( ts ) {
@ -252,7 +260,11 @@ TokenTransformDispatcher.prototype._transformTagToken = function ( token, cb, ph
*/
TokenTransformDispatcher.prototype._transformToken = function ( token, cb, phaseEndRank, frame, ts ) {
// prepend 'any' transformers
ts = this.transformers[phaseEndRank].any.concat(ts);
var anyTrans = this.transformers[phaseEndRank].any;
if ( anyTrans.length ) {
ts = this.transformers[phaseEndRank].any.concat(ts);
ts.sort( this._cmpTransformations );
}
var transformer,
res = { token: token },
aborted = false;

View file

@ -10,14 +10,15 @@ testWhiteList["Italics and bold"] = "<ul><li> plain</li><li> plain<i>italic</i>p
testWhiteList["Bug 2702: Mismatched <i>, <b> and <a> tags are invalid"] = "<p><i><a href=\"http://example.com\">text</a></i><a href=\"http://example.com\" data-sourcePos=\"30:61\"><b>text</b></a><i data-sourcePos=\"62:106\">Something <a href=\"http://example.com\">in italic</a></i><i data-sourcePos=\"107:164\">Something <a href=\"http://example.com\">mixed</a></i><a href=\"http://example.com\"><b>, even bold</b></a><i data-sourcePos=\"165:204\"><b data-sourcePos=\"165:204\">Now <a href=\"http://example.com\">both</a></b></i></p>";
testWhiteList["Unclosed and unmatched quotes"] = "<p><i><b>Bold italic text </b>with bold deactivated<b> in between.</b></i></p><p><i><b>Bold italic text </b></i><b>with italic deactivated<i> in between.</i></b></p><p><b></b>Bold text..</p><p>..spanning two paragraphs (should not work).<b></b></p><p><b></b>Bold tag left open</p><p><i></i>Italic tag left open</p><p>Normal text.<!-- Unmatching number of opening, closing tags: -->\n</p><p><b>This year'</b>s election <i>should</i> beat <b>last year'</b>s.</p><p><i>Tom<b>s car is bigger than </b></i><b>Susan</b>s.</p>";
testWhiteList["Unclosed and unmatched quotes"] = "<p data-sourcePos=\"0:66\"><i><b>Bold italic text </b>with bold deactivated<b> in between.</b></i></p><p><i><b>Bold italic text </b></i><b>with italic deactivated<i> in between.</i></b></p><p><b>Bold text..</b></p><p>..spanning two paragraphs (should not work).<b></b></p><p><b>Bold tag left open</b></p><p><i>Italic tag left open</i></p><p>Normal text.<!-- Unmatching number of opening, closing tags: -->\n</p><p><b>This year'</b>s election <i>should</i> beat <b>last year'</b>s.</p><p><i>Tom<b>s car is bigger than </b></i><b>Susan</b>s.</p>";
testWhiteList["Link containing double-single-quotes '' in text embedded in italics (bug 4598 sanity check)"] = "<p><i>Some <a data-type=\"internal\" href=\"Link\">pretty </a></i><a data-type=\"internal\" href=\"Link\">italics<i></i> and stuff</a>!</p>";
// The expected result for this test is really broken html.
testWhiteList["Link containing double-single-quotes '' in text embedded in italics (bug 4598 sanity check)"] = "<p data-sourcePos=\"0:45\"><i>Some <a data-type=\"internal\" href=\"Link\">pretty </a></i><a data-type=\"internal\" href=\"Link\">italics<i> and stuff</i></a><i>!</i></p>";
testWhiteList["External link containing double-single-quotes in text embedded in italics (bug 4598 sanity check)"] = "<p><i>Some <a href=\"http://example.com/\">pretty </a></i><a href=\"http://example.com/\">italics<i> and stuff</i></a><i>!</i></p>";
// This is a rare edge case, and the new behavior is arguably more consistent
testWhiteList["5 quotes, code coverage +1 line"] = "<p><i>'</i></p>";
testWhiteList["5 quotes, code coverage +1 line"] = "<p>'<i></i></p>";
// empty table tags / with only a caption are legal in HTML5.