From 2e35171fd1e65afd3916310d19b262c5245a0106 Mon Sep 17 00:00:00 2001
From: Gabriel Wicke <gwicke@users.mediawiki.org>
Date: Wed, 4 Jan 2012 14:09:05 +0000
Subject: [PATCH] Fix quote handling and tweak the whitelist a bit. 'any' token
 registrations are now merged with specific registrations by rank. Not yet
 clear if that is a good idea overall, need to check use cases when
 implementing template expansion and other functionality.

183 parser tests now passing.
---
 modules/parser/ext.core.QuoteTransformer.js   | 45 ++++++++++---------
 .../mediawiki.TokenTransformDispatcher.js     | 18 ++++++--
 tests/parser/parserTests-whitelist.js         |  7 +--
 3 files changed, 44 insertions(+), 26 deletions(-)

diff --git a/modules/parser/ext.core.QuoteTransformer.js b/modules/parser/ext.core.QuoteTransformer.js
index c4d5e4158d..e7a93dd6f7 100644
--- a/modules/parser/ext.core.QuoteTransformer.js
+++ b/modules/parser/ext.core.QuoteTransformer.js
@@ -5,20 +5,25 @@
  */
 
 function QuoteTransformer ( ) {
-	// Bold and italic tokens are collected in these lists, and then processed
-	// in onNewLine.
 	this.quoteAndNewlineRank = 2.1;
 	this.anyRank = 2.101; // Just after regular quote and newline
 	this.reset();
 }
 
 QuoteTransformer.prototype.reset = function ( ) {
-	this.italics = [];
-	this.bolds = [];
+	// A chunk starts with a token context around a quote token and is
+	// (optionally) followed by non-quote tokens. The quote token and its
+	// context are later replaced with the actual tag token for italic or bold.
 	this.currentChunk = [];
 	// List of chunks, each starting with a (potentially) bold or italic token
 	// and followed by plain tokens.
 	this.chunks = [];
+	// References to chunks in which the first token context / quote token
+	// should be converted to italic or bold tokens.
+	this.italics = [];
+	this.bolds = [];
+
+	this.isActive = false;
 };
 
 
@@ -37,19 +42,15 @@ QuoteTransformer.prototype.register = function ( dispatcher ) {
 
 // Make a copy of the token context
 QuoteTransformer.prototype._startNewChunk = function ( ) {
-	this.currentChunk.pos = this.chunks.length;
 	this.chunks.push( this.currentChunk );
 	this.currentChunk = [];
+	this.currentChunk.pos = this.chunks.length - 1;
 };
 
 // Handle QUOTE tags. These are collected in italic/bold lists depending on
 // the length of quote string. Actual analysis and conversion to the
 // appropriate tag tokens is deferred until the next NEWLINE token triggers
 // onNewLine.
-// 
-// XXX: Cannot use async stuff here, need to buffer things locally instead!
-// FIXME: Convert to internal buffering! -> return all tokens with rank set to
-// own rank to avoid reprocessing
 QuoteTransformer.prototype.onQuote = function ( token, cb, frame, prevToken ) {
 	var qlen = token.value.length,
 		tokens = [], // output tokens
@@ -66,9 +67,10 @@ QuoteTransformer.prototype.onQuote = function ( token, cb, frame, prevToken ) {
 		};
 	
 
-	if ( this.chunks.length === 0 ) {
+	if ( ! this.isActive ) {
 		// register for any token if not yet active
-		this.dispatcher.addTransform( this.onAny.bind(this), this.anyRank, 'tag', 'mw-quote' );
+		this.dispatcher.addTransform( this.onAny.bind(this), this.anyRank, 'any' );
+		this.isActive = true;
 	}
 
 	this._startNewChunk();
@@ -114,7 +116,7 @@ QuoteTransformer.prototype.onQuote = function ( token, cb, frame, prevToken ) {
 			break;
 	}
 	
-	return { token: null };
+	return {};
 };
 
 QuoteTransformer.prototype.onAny = function ( token, cb, frame, prevToken ) {
@@ -128,28 +130,28 @@ QuoteTransformer.prototype.onAny = function ( token, cb, frame, prevToken ) {
 QuoteTransformer.prototype.onNewLine = function (  token, cb, frame, prevToken ) {
 	var res;
 
-	if( ! this.chunks.length ) {
+	if( ! this.isActive ) {
 		// Nothing to do, quick abort.
 		return { token: token };
 	}
 
 
 	token.rank = this.quoteAndNewlineRank;
-	this.currentChunk.push( token );
-	this._startNewChunk();
 
-	//console.log("onNewLine: " + this.italics + this.bolds);
+	//console.log('chunks: ' + JSON.stringify( this.chunks, null, 2 ) );
+
+	//console.log("onNewLine: " + this.italics.length + 'i/b' + this.bolds.length);
 	// balance out tokens, convert placeholders into tags
 	if (this.italics.length % 2 && this.bolds.length % 2) {
 		var firstsingleletterword = -1,
 			firstmultiletterword = -1,
 			firstspace = -1;
 		for (var j = 0; j < this.bolds.length; j++) {
-			var ctx = this.bolds[j];
+			var ctx = this.bolds[j][0];
 			//console.log("balancing!" + JSON.stringify(ctx.prevToken, null, 2));
 			if (ctx.prevToken) {
 				if (ctx.prevToken.type === 'TEXT') {
-					var lastchar = ctx.prevToken.value[ctx.prevToken.value.length - 1],
+					var lastchar = prevToken.value[ctx.prevToken.value.length - 1],
 						secondtolastchar = ctx.prevToken.value[ctx.prevToken.value.length - 2];
 					if (lastchar === ' ' && firstspace === -1) {
 						firstspace = j;
@@ -189,12 +191,15 @@ QuoteTransformer.prototype.onNewLine = function (  token, cb, frame, prevToken )
 	this.quotesToTags( this.italics, 'i' );
 	this.quotesToTags( this.bolds, 'b' );
 
+	this.currentChunk.push( token );
+	this._startNewChunk();
+
 	//console.log('chunks: ' + JSON.stringify( this.chunks, null, 2 ) );
 
 	// return all collected tokens including the newline
 	res = { tokens: [].concat.apply([], this.chunks) };
 
-	// prepare for next session
+	// prepare for next line
 	this.reset();
 
 	// remove 'any' registration
@@ -246,7 +251,7 @@ QuoteTransformer.prototype.quotesToTags = function ( chunks, name ) {
 		toggle = !toggle;
 	}
 	if (!toggle) {
-		// Add end tag, but don't count it towards completion.
+		// Add end tag
 		this.currentChunk.push( {type: 'ENDTAG', name: name} );
 	}
 };
diff --git a/modules/parser/mediawiki.TokenTransformDispatcher.js b/modules/parser/mediawiki.TokenTransformDispatcher.js
index 0dfa8c9c87..d7ff612221 100644
--- a/modules/parser/mediawiki.TokenTransformDispatcher.js
+++ b/modules/parser/mediawiki.TokenTransformDispatcher.js
@@ -127,7 +127,7 @@ TokenTransformDispatcher.prototype.addTransform = function ( transformation, ran
 	}
 	transArr.push(transformer);
 	// sort ascending by rank
-	transArr.sort( function ( t1, t2 ) { return t1.rank - t2.rank; } );
+	transArr.sort( this._cmpTransformations );
 };
 
 /**
@@ -185,6 +185,12 @@ TokenTransformDispatcher.prototype._resetTokenRank = function ( res, transformer
 	}
 };
 
+/**
+ * Comparison for sorting transformations by ascending rank.
+ */
+TokenTransformDispatcher.prototype._cmpTransformations = function ( a, b ) {
+	return a.rank - b.rank;
+};
 
 /* Call all transformers on a tag.
  *
@@ -206,8 +212,10 @@ TokenTransformDispatcher.prototype._transformTagToken = function ( token, cb, ph
 		tName = token.name.toLowerCase(),
 		tagts = this.transformers[phaseEndRank].tag[tName];
 
-	if ( tagts ) {
+	if ( tagts && tagts.length ) {
+		// could cache this per tag type to avoid re-sorting each time
 		ts = ts.concat(tagts);
+		ts.sort( this._cmpTransformations );
 	}
 	//console.log(JSON.stringify(ts, null, 2));
 	if ( ts ) {
@@ -252,7 +260,11 @@ TokenTransformDispatcher.prototype._transformTagToken = function ( token, cb, ph
  */
 TokenTransformDispatcher.prototype._transformToken = function ( token, cb, phaseEndRank, frame, ts ) {
 	// prepend 'any' transformers
-	ts = this.transformers[phaseEndRank].any.concat(ts);
+	var anyTrans = this.transformers[phaseEndRank].any;
+	if ( anyTrans.length ) {
+		ts = anyTrans.concat(ts);
+		ts.sort( this._cmpTransformations );
+	}
 	var transformer,
 		res = { token: token },
 		aborted = false;
diff --git a/tests/parser/parserTests-whitelist.js b/tests/parser/parserTests-whitelist.js
index 49c48b8c29..51b9c8410c 100644
--- a/tests/parser/parserTests-whitelist.js
+++ b/tests/parser/parserTests-whitelist.js
@@ -10,14 +10,15 @@ testWhiteList["Italics and bold"] = "<ul><li> plain</li><li> plain<i>italic</i>p
 
 testWhiteList["Bug 2702: Mismatched <i>, <b> and <a> tags are invalid"] = "<p><i><a href=\"http://example.com\">text</a></i><a href=\"http://example.com\" data-sourcePos=\"30:61\"><b>text</b></a><i data-sourcePos=\"62:106\">Something <a href=\"http://example.com\">in italic</a></i><i data-sourcePos=\"107:164\">Something <a href=\"http://example.com\">mixed</a></i><a href=\"http://example.com\"><b>, even bold</b></a><i data-sourcePos=\"165:204\"><b data-sourcePos=\"165:204\">Now <a href=\"http://example.com\">both</a></b></i></p>";
 
-testWhiteList["Unclosed and unmatched quotes"] = "<p><i><b>Bold italic text </b>with bold deactivated<b> in between.</b></i></p><p><i><b>Bold italic text </b></i><b>with italic deactivated<i> in between.</i></b></p><p><b></b>Bold text..</p><p>..spanning two paragraphs (should not work).<b></b></p><p><b></b>Bold tag left open</p><p><i></i>Italic tag left open</p><p>Normal text.<!-- Unmatching number of opening, closing tags: -->\n</p><p><b>This year'</b>s election <i>should</i> beat <b>last year'</b>s.</p><p><i>Tom<b>s car is bigger than </b></i><b>Susan</b>s.</p>";
+testWhiteList["Unclosed and unmatched quotes"] = "<p data-sourcePos=\"0:66\"><i><b>Bold italic text </b>with bold deactivated<b> in between.</b></i></p><p><i><b>Bold italic text </b></i><b>with italic deactivated<i> in between.</i></b></p><p><b>Bold text..</b></p><p>..spanning two paragraphs (should not work).<b></b></p><p><b>Bold tag left open</b></p><p><i>Italic tag left open</i></p><p>Normal text.<!-- Unmatching number of opening, closing tags: -->\n</p><p><b>This year'</b>s election <i>should</i> beat <b>last year'</b>s.</p><p><i>Tom<b>s car is bigger than </b></i><b>Susan</b>s.</p>";
 
-testWhiteList["Link containing double-single-quotes '' in text embedded in italics (bug 4598 sanity check)"] = "<p><i>Some <a data-type=\"internal\" href=\"Link\">pretty </a></i><a data-type=\"internal\" href=\"Link\">italics<i></i> and stuff</a>!</p>";
+// The expected result for this test is really broken html.
+testWhiteList["Link containing double-single-quotes '' in text embedded in italics (bug 4598 sanity check)"] = "<p data-sourcePos=\"0:45\"><i>Some <a data-type=\"internal\" href=\"Link\">pretty </a></i><a data-type=\"internal\" href=\"Link\">italics<i> and stuff</i></a><i>!</i></p>";
 
 testWhiteList["External link containing double-single-quotes in text embedded in italics (bug 4598 sanity check)"] = "<p><i>Some <a href=\"http://example.com/\">pretty </a></i><a href=\"http://example.com/\">italics<i> and stuff</i></a><i>!</i></p>";
 
 // This is a rare edge case, and the new behavior is arguably more consistent
-testWhiteList["5 quotes, code coverage +1 line"] = "<p><i>'</i></p>";
+testWhiteList["5 quotes, code coverage +1 line"] = "<p>'<i></i></p>";
 
 
 // empty table tags / with only a caption are legal in HTML5.