Improve the handling of newlines for round-tripping

This is an improvement, but some extra newlines are still inserted after paragraphs. Example input: ------- Foo: {| |foo |} ------- Extra newlines are inserted after the "Foo:" and after the "foo" in the table. These newlines are not fed to the tree builder as tokens or text, so the cause is likely a bug in the html5 library or JSDom. Change-Id: I83eb6180e3cd1c4e7f9b15b31d339e1d32bccd3f
2024-11-24 22:35:41 +00:00 · 2012-06-05 23:44:23 +02:00 · 2012-06-05 23:44:23 +02:00 · a146fcb8ad
parent 59fc634cce
commit a146fcb8ad
3 changed files with 16 additions and 14 deletions
--- a/modules/parser/ext.core.PostExpandParagraphHandler.js
+++ b/modules/parser/ext.core.PostExpandParagraphHandler.js
@ -34,9 +34,10 @@ PostExpandParagraphHandler.prototype.register = function ( dispatcher ) {
 PostExpandParagraphHandler.prototype.reset = function ( token, frame, cb ) {
 	//console.warn( 'PostExpandParagraphHandler.reset ' + JSON.stringify( this.tokens ) );
 	if ( this.newLines ) {
-		return { tokens: this._finish() };
+		this.tokens.push( token );
+		return this._finish();
 	} else {
-		return { token: token };
+		return [token];
 	}
 };

@ -66,19 +67,17 @@ PostExpandParagraphHandler.prototype.onNewLine = function (  token, frame, cb )
 };

 PostExpandParagraphHandler.prototype.onEnd = function (  token, frame, cb ) {
-	var tokens = this.tokens;
-	this.reset();
-	return { tokens: tokens.concat( [token] ) };
+	return { tokens: this.reset( token ) };
 }

 PostExpandParagraphHandler.prototype.onAny = function ( token, frame, cb ) {
 	//console.warn( 'PostExpandParagraphHandler.onAny' );
-	this.tokens.push( token );
 	if ( token.constructor === CommentTk || 
 			( token.constructor === String && token.match( /^[\t ]*$/ ) ) 
 	)
 	{
 		// Continue with collection..
+		this.tokens.push( token );
 		return {};
 	} else {
 		// XXX: Only open paragraph if inline token follows!
@ -86,14 +85,16 @@ PostExpandParagraphHandler.prototype.onAny = function ( token, frame, cb ) {
 		// None of the tokens we are interested in, so abort processing..
 		//console.warn( 'PostExpandParagraphHandler.onAny: ' + JSON.stringify( this.tokens, null , 2 ) );
 		if ( this.newLines >= 2 && ! u.isBlockToken( token ) ) {
+			this.tokens.push( token );
 			var nlTks = [];
 			while ( this.tokens[0].constructor === NlTk ) {
 				nlTks.push( this.tokens.shift() );
 			}
-			//console.warn( 'insert p:' + JSON.stringify( token, null, 2 ) );
-			return { tokens: nlTks.concat([ new TagTk( 'p' ) ], this._finish() ) };
+			var res = { tokens: nlTks.concat([ new TagTk( 'p' ) ], this._finish() ) };
+			//console.warn( 'insert p:' + JSON.stringify( res, null, 2 ) );
+			return res;
 		} else {
-			return { tokens: this._finish() };
+			return { tokens: this.reset(token) };
 		}
 	}

--- a/modules/parser/mediawiki.HTML5TreeBuilder.node.js
+++ b/modules/parser/mediawiki.HTML5TreeBuilder.node.js
@ -72,6 +72,8 @@ FauxHTML5.TreeBuilder.prototype._att = function (maybeAttribs) {
 // Adapt the token format to internal HTML tree builder format, call the actual
 // html tree builder by emitting the token.
 FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
+	//console.warn( 'processToken: ' + JSON.stringify( token ));
+
 	var attribs = token.attribs || [];
 	if ( token.dataAttribs ) {
 		var dataMW = JSON.stringify( token.dataAttribs );
@ -132,7 +134,6 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
 		default:
 			console.warn("Unhandled token: " + JSON.stringify(token));
 			break;
-			break;
 	}
 };

--- a/modules/parser/pegTokenizer.pegjs.txt
+++ b/modules/parser/pegTokenizer.pegjs.txt
@ -494,7 +494,7 @@ inline


 inlineline
-  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
+  = c:(urltext / !inline_breaks (inline_element / [^\r\n]))+ {
      return flatten_stringlist( c );
 }

@ -723,7 +723,7 @@ urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
 //[^][<>"\\x00-\\x20\\x7F\p{Zs}]

 // no punctiation, and '{<' to trigger directives
-no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]
+no_punctuation_char = [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]

 url
  = proto:url_protocol 
@ -1993,7 +1993,7 @@ attribute_preprocessor_text_single_line
  = r:( t:[^{}&']+ { return t.join(''); }
  / !inline_breaks ( 
      directive
-    / !'\n' [{&] )
+    / ![\r\n] [{&] )
  )* {
      return flatten_string ( r );
  }
@ -2001,7 +2001,7 @@ attribute_preprocessor_text_double_line
  = r:( t:[^{}&"]+ { return t.join(''); }
  / !inline_breaks (
      directive
-    / !'\n' [{&] )
+    / ![\r\n] [{&] )
  )* {
      //console.warn( 'double:' + pp(r) );
      return flatten_string ( r );