From a146fcb8ad092c1692db221f73f2949fed05c84b Mon Sep 17 00:00:00 2001 From: Gabriel Wicke Date: Tue, 5 Jun 2012 23:44:23 +0200 Subject: [PATCH] Improve the handling of newlines for round-tripping An improvement, but there still are some extra newlines inserted after paragraphs. Example input: ------- Foo: {| |foo |} ------- Extra newlines are inserted after the Foo: and the foo in the table. They are not fed as tokens or text to the tree builder, so there is likely a bug in the html5 library or JSDom. Change-Id: I83eb6180e3cd1c4e7f9b15b31d339e1d32bccd3f --- .../ext.core.PostExpandParagraphHandler.js | 19 ++++++++++--------- .../parser/mediawiki.HTML5TreeBuilder.node.js | 3 ++- modules/parser/pegTokenizer.pegjs.txt | 8 ++++---- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/modules/parser/ext.core.PostExpandParagraphHandler.js b/modules/parser/ext.core.PostExpandParagraphHandler.js index 0c22ea23ee..0411c8be09 100644 --- a/modules/parser/ext.core.PostExpandParagraphHandler.js +++ b/modules/parser/ext.core.PostExpandParagraphHandler.js @@ -34,9 +34,10 @@ PostExpandParagraphHandler.prototype.register = function ( dispatcher ) { PostExpandParagraphHandler.prototype.reset = function ( token, frame, cb ) { //console.warn( 'PostExpandParagraphHandler.reset ' + JSON.stringify( this.tokens ) ); if ( this.newLines ) { - return { tokens: this._finish() }; + this.tokens.push( token ); + return this._finish(); } else { - return { token: token }; + return [token]; } }; @@ -66,19 +67,17 @@ PostExpandParagraphHandler.prototype.onNewLine = function ( token, frame, cb ) }; PostExpandParagraphHandler.prototype.onEnd = function ( token, frame, cb ) { - var tokens = this.tokens; - this.reset(); - return { tokens: tokens.concat( [token] ) }; + return { tokens: this.reset( token ) }; } PostExpandParagraphHandler.prototype.onAny = function ( token, frame, cb ) { //console.warn( 'PostExpandParagraphHandler.onAny' ); - this.tokens.push( token ); if ( token.constructor === CommentTk || ( token.constructor === String && token.match( /^[\t ]*$/ ) ) ) { // Continue with collection.. + this.tokens.push( token ); return {}; } else { // XXX: Only open paragraph if inline token follows! @@ -86,14 +85,16 @@ PostExpandParagraphHandler.prototype.onAny = function ( token, frame, cb ) { // None of the tokens we are interested in, so abort processing.. //console.warn( 'PostExpandParagraphHandler.onAny: ' + JSON.stringify( this.tokens, null , 2 ) ); if ( this.newLines >= 2 && ! u.isBlockToken( token ) ) { + this.tokens.push( token ); var nlTks = []; while ( this.tokens[0].constructor === NlTk ) { nlTks.push( this.tokens.shift() ); } - //console.warn( 'insert p:' + JSON.stringify( token, null, 2 ) ); - return { tokens: nlTks.concat([ new TagTk( 'p' ) ], this._finish() ) }; + var res = { tokens: nlTks.concat([ new TagTk( 'p' ) ], this._finish() ) }; + //console.warn( 'insert p:' + JSON.stringify( res, null, 2 ) ); + return res; } else { - return { tokens: this._finish() }; + return { tokens: this.reset(token) }; } } diff --git a/modules/parser/mediawiki.HTML5TreeBuilder.node.js b/modules/parser/mediawiki.HTML5TreeBuilder.node.js index 97ff63cccc..f776312435 100644 --- a/modules/parser/mediawiki.HTML5TreeBuilder.node.js +++ b/modules/parser/mediawiki.HTML5TreeBuilder.node.js @@ -72,6 +72,8 @@ FauxHTML5.TreeBuilder.prototype._att = function (maybeAttribs) { // Adapt the token format to internal HTML tree builder format, call the actual // html tree builder by emitting the token. FauxHTML5.TreeBuilder.prototype.processToken = function (token) { + //console.warn( 'processToken: ' + JSON.stringify( token )); + var attribs = token.attribs || []; if ( token.dataAttribs ) { var dataMW = JSON.stringify( token.dataAttribs ); @@ -132,7 +134,6 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) { default: console.warn("Unhandled token: " + JSON.stringify(token)); break; - break; } }; diff --git a/modules/parser/pegTokenizer.pegjs.txt b/modules/parser/pegTokenizer.pegjs.txt index 164562c379..bf75f123d2 100644 --- a/modules/parser/pegTokenizer.pegjs.txt +++ b/modules/parser/pegTokenizer.pegjs.txt @@ -494,7 +494,7 @@ inline inlineline - = c:(urltext / !inline_breaks (inline_element / [^\n]))+ { + = c:(urltext / !inline_breaks (inline_element / [^\r\n]))+ { return flatten_stringlist( c ); } @@ -723,7 +723,7 @@ urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] { //[^][<>"\\x00-\\x20\\x7F\p{Zs}] // no punctiation, and '{<' to trigger directives -no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{] +no_punctuation_char = [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{] url = proto:url_protocol @@ -1993,7 +1993,7 @@ attribute_preprocessor_text_single_line = r:( t:[^{}&']+ { return t.join(''); } / !inline_breaks ( directive - / !'\n' [{&] ) + / ![\r\n] [{&] ) )* { return flatten_string ( r ); } @@ -2001,7 +2001,7 @@ attribute_preprocessor_text_double_line = r:( t:[^{}&"]+ { return t.join(''); } / !inline_breaks ( directive - / !'\n' [{&] ) + / ![\r\n] [{&] ) )* { //console.warn( 'double:' + pp(r) ); return flatten_string ( r );