From a146fcb8ad092c1692db221f73f2949fed05c84b Mon Sep 17 00:00:00 2001
From: Gabriel Wicke <gwicke@wikimedia.org>
Date: Tue, 5 Jun 2012 23:44:23 +0200
Subject: [PATCH] Improve the handling of newlines for round-tripping

This is an improvement, but some extra newlines are still inserted after
paragraphs. Example input:

-------

Foo:
{|
|foo
|}
-------

Extra newlines are inserted after the "Foo:" line and after the "foo" in the
table. They are not fed as tokens or text to the tree builder, so there is
likely a bug in the html5 library or JSDom.

Change-Id: I83eb6180e3cd1c4e7f9b15b31d339e1d32bccd3f
---
 .../ext.core.PostExpandParagraphHandler.js    | 19 ++++++++++---------
 .../parser/mediawiki.HTML5TreeBuilder.node.js |  3 ++-
 modules/parser/pegTokenizer.pegjs.txt         |  8 ++++----
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/modules/parser/ext.core.PostExpandParagraphHandler.js b/modules/parser/ext.core.PostExpandParagraphHandler.js
index 0c22ea23ee..0411c8be09 100644
--- a/modules/parser/ext.core.PostExpandParagraphHandler.js
+++ b/modules/parser/ext.core.PostExpandParagraphHandler.js
@@ -34,9 +34,10 @@ PostExpandParagraphHandler.prototype.register = function ( dispatcher ) {
 PostExpandParagraphHandler.prototype.reset = function ( token, frame, cb ) {
 	//console.warn( 'PostExpandParagraphHandler.reset ' + JSON.stringify( this.tokens ) );
 	if ( this.newLines ) {
-		return { tokens: this._finish() };
+		this.tokens.push( token );
+		return this._finish();
 	} else {
-		return { token: token };
+		return [token];
 	}
 };
 
@@ -66,19 +67,17 @@ PostExpandParagraphHandler.prototype.onNewLine = function (  token, frame, cb )
 };
 
 PostExpandParagraphHandler.prototype.onEnd = function (  token, frame, cb ) {
-	var tokens = this.tokens;
-	this.reset();
-	return { tokens: tokens.concat( [token] ) };
+	return { tokens: this.reset( token ) };
 }
 
 PostExpandParagraphHandler.prototype.onAny = function ( token, frame, cb ) {
 	//console.warn( 'PostExpandParagraphHandler.onAny' );
-	this.tokens.push( token );
 	if ( token.constructor === CommentTk || 
 			( token.constructor === String && token.match( /^[\t ]*$/ ) ) 
 	)
 	{
 		// Continue with collection..
+		this.tokens.push( token );
 		return {};
 	} else {
 		// XXX: Only open paragraph if inline token follows!
@@ -86,14 +85,16 @@ PostExpandParagraphHandler.prototype.onAny = function ( token, frame, cb ) {
 		// None of the tokens we are interested in, so abort processing..
 		//console.warn( 'PostExpandParagraphHandler.onAny: ' + JSON.stringify( this.tokens, null , 2 ) );
 		if ( this.newLines >= 2 && ! u.isBlockToken( token ) ) {
+			this.tokens.push( token );
 			var nlTks = [];
 			while ( this.tokens[0].constructor === NlTk ) {
 				nlTks.push( this.tokens.shift() );
 			}
-			//console.warn( 'insert p:' + JSON.stringify( token, null, 2 ) );
-			return { tokens: nlTks.concat([ new TagTk( 'p' ) ], this._finish() ) };
+			var res = { tokens: nlTks.concat([ new TagTk( 'p' ) ], this._finish() ) };
+			//console.warn( 'insert p:' + JSON.stringify( res, null, 2 ) );
+			return res;
 		} else {
-			return { tokens: this._finish() };
+			return { tokens: this.reset(token) };
 		}
 	}
 
diff --git a/modules/parser/mediawiki.HTML5TreeBuilder.node.js b/modules/parser/mediawiki.HTML5TreeBuilder.node.js
index 97ff63cccc..f776312435 100644
--- a/modules/parser/mediawiki.HTML5TreeBuilder.node.js
+++ b/modules/parser/mediawiki.HTML5TreeBuilder.node.js
@@ -72,6 +72,8 @@ FauxHTML5.TreeBuilder.prototype._att = function (maybeAttribs) {
 // Adapt the token format to internal HTML tree builder format, call the actual
 // html tree builder by emitting the token.
 FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
+	//console.warn( 'processToken: ' + JSON.stringify( token ));
+
 	var attribs = token.attribs || [];
 	if ( token.dataAttribs ) {
 		var dataMW = JSON.stringify( token.dataAttribs );
@@ -132,7 +134,6 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
 		default:
 			console.warn("Unhandled token: " + JSON.stringify(token));
 			break;
-			break;
 	}
 };
 
diff --git a/modules/parser/pegTokenizer.pegjs.txt b/modules/parser/pegTokenizer.pegjs.txt
index 164562c379..bf75f123d2 100644
--- a/modules/parser/pegTokenizer.pegjs.txt
+++ b/modules/parser/pegTokenizer.pegjs.txt
@@ -494,7 +494,7 @@ inline
 
 
 inlineline
-  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
+  = c:(urltext / !inline_breaks (inline_element / [^\r\n]))+ {
       return flatten_stringlist( c );
 }
 
@@ -723,7 +723,7 @@ urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
 //[^][<>"\\x00-\\x20\\x7F\p{Zs}]
 
 // no punctiation, and '{<' to trigger directives
-no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]
+no_punctuation_char = [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]
 
 url
   = proto:url_protocol 
@@ -1993,7 +1993,7 @@ attribute_preprocessor_text_single_line
   = r:( t:[^{}&']+ { return t.join(''); }
   / !inline_breaks ( 
       directive
-    / !'\n' [{&] )
+    / ![\r\n] [{&] )
   )* {
       return flatten_string ( r );
   }
@@ -2001,7 +2001,7 @@ attribute_preprocessor_text_double_line
   = r:( t:[^{}&"]+ { return t.join(''); }
   / !inline_breaks (
       directive
-    / !'\n' [{&] )
+    / ![\r\n] [{&] )
   )* {
       //console.warn( 'double:' + pp(r) );
       return flatten_string ( r );