Misc improvements, now 196 parser tests passing.

* Add handler for post-expand paragraph wrapping on the token stream, to handle things like a comment on its own line post-expand
* Add general Util module
* Fix self-closing tag handling in HTML5 tree builder
https://mediawiki.org/wiki/Special:Code/MediaWiki/109172
2024-11-15 02:23:58 +00:00 · 2012-01-17 18:22:10 +00:00 · 2012-01-17 18:22:10 +00:00 · 6bd7ca1e75 · 2012-02-27 16:40:01 +00:00
parent 34a2f3f1aa
commit 6bd7ca1e75
5 changed files with 165 additions and 4 deletions
--- a/modules/parser/ext.Util.js
+++ b/modules/parser/ext.Util.js
@ -0,0 +1,55 @@
+/**
+ * General utilities for token transforms
+ */
+
+function Util () {
+}
+
+
+/**
+ * Determine if a token is block-level or not
+ *
+ * Only TAG, ENDTAG and SELFCLOSINGTAG tokens are checked, by lower-cased name.
+ * @method (defined on Util.prototype, so call it on a Util instance)
+ * @param {Object} token: The token to check
+ * @returns {Boolean}: True if token is block-level, false otherwise.
+ */
+Util.prototype.isBlock = function ( token ) {
+	if ( token.type === 'TAG' || 
+			token.type === 'ENDTAG' || 
+			token.type === 'SELFCLOSINGTAG' ) {
+		switch (token.name.toLowerCase()) {
+			case 'div':
+			case 'table':
+			case 'td':
+			case 'tr':
+			case 'tbody':
+			case 'p':
+			case 'ul':
+			case 'ol':
+			case 'li':
+			case 'dl':
+			case 'dt':
+			case 'dd':
+			case 'img': // hmm!
+			case 'pre':
+			case 'center':
+			case 'blockquote':
+			case 'h1':
+			case 'h2':
+			case 'h3':
+			case 'h4':
+			case 'h5':
+			case 'h6':
+				return true;
+			default:
+				return false;
+		}
+	} else {
+		return false;
+	}
+};
+
+if (typeof module == "object") {
+	module.exports.Util = Util;
+}
--- a/modules/parser/ext.core.PostExpandParagraphHandler.js
+++ b/modules/parser/ext.core.PostExpandParagraphHandler.js
@ -0,0 +1,98 @@
+/*
+ * Insert paragraphs for comment-only lines after template expansion
+ *
+ * @author Gabriel Wicke <gwicke@wikimedia.org>
+ */
+
+// Include general utilities
+var Util = require('./ext.Util.js').Util,
+	u = new Util();
+
+
+function PostExpandParagraphHandler ( dispatcher ) {
+	this.tokens = [];
+	this.newLines = 0;
+	this.register( dispatcher );
+}
+
+// constants
+PostExpandParagraphHandler.prototype.newlineRank = 2.2;
+PostExpandParagraphHandler.prototype.anyRank = 2.201; // Just after regular quote and newline
+
+
+// Register the 'newline' and 'end' handlers; the 'any' handler is added later by onNewLine
+PostExpandParagraphHandler.prototype.register = function ( dispatcher ) {
+	this.dispatcher = dispatcher;
+	// Register for NEWLINE tokens
+	dispatcher.addTransform( this.onNewLine.bind(this), 
+			this.newlineRank, 'newline' );
+	// Flush buffered tokens and reset internal state on the 'end' token
+	dispatcher.addTransform( this.reset.bind(this), 
+			this.newlineRank, 'end' );
+};
+
+PostExpandParagraphHandler.prototype.reset = function ( token, cb, frame, prevToken ) {
+	// 'end' handler: flush the buffer if newlines are pending. NOTE(review): `token` is not in the flushed list - confirm 'end' still propagates.
+	if ( this.newLines ) {
+		return { tokens: this._finish() };
+	} else {
+		return { token: token };
+	}
+};
+
+PostExpandParagraphHandler.prototype._finish = function ( ) {
+	var tokens = this.tokens;
+	this.tokens = [];
+	for ( var i = 0, l = tokens.length; i < l; i++ ) {
+		tokens[ i ].rank = this.anyRank;
+	}
+	// remove the 'any' registration; onNewLine re-adds it once newLines is back at 0
+	this.dispatcher.removeTransform( this.anyRank, 'any' );
+	this.newLines = 0;
+	return tokens;
+};
+
+
+// Handle NEWLINE tokens: buffer the newline and, on the first one of a run,
+// register the 'any' handler so that the tokens following it are buffered too.
+PostExpandParagraphHandler.prototype.onNewLine = function (  token, cb, frame, prevToken ) {
+	// Returns an empty result object; the newline stays in this.tokens until _finish hands it back.
+	var res;
+	this.tokens.push( token );
+
+	if( ! this.newLines ) {
+		this.dispatcher.addTransform( this.onAny.bind(this), 
+				this.anyRank, 'any' );
+	}
+
+	this.newLines++;
+	return {};
+};
+
+
+PostExpandParagraphHandler.prototype.onAny = function ( token, cb, frame, prevToken ) {
+	// Buffer each token; keep collecting while it is a COMMENT or tab/space-only TEXT.
+	this.tokens.push( token );
+	if ( token.type === 'COMMENT' || 
+			( token.type === 'TEXT' && token.value.match( /^[\t ]+$/ ) ) 
+	)
+	{
+		// Continue with collection..
+		return {};
+	} else {
+		// XXX: Only open paragraph if inline token follows!
+
+		// None of the tokens we are interested in, so abort processing..
+		// Two or more buffered newlines followed by a non-block token: prepend an opening 'p' TAG token.
+		if ( this.newLines >= 2 && ! u.isBlock( token ) ) {
+			return { tokens: [ { type: 'TAG', name: 'p' } ].concat( this._finish() ) };
+		} else {
+			return { tokens: this._finish() };
+		}
+	}
+
+};
+
+if (typeof module == "object") {
+	module.exports.PostExpandParagraphHandler = PostExpandParagraphHandler;
+}
--- a/modules/parser/mediawiki.HTML5TreeBuilder.node.js
+++ b/modules/parser/mediawiki.HTML5TreeBuilder.node.js
@ -89,9 +89,13 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
 			this.emit('token', {type: 'StartTag', 
 				name: token.name, 
 				data: att(token.attribs)});
-			this.emit('token', {type: 'EndTag', 
-				name: token.name, 
-				data: att(token.attribs)});
+			if ( HTML5.VOID_ELEMENTS.indexOf( token.name.toLowerCase() ) < 0 ) {
+				// VOID_ELEMENTS are automagically treated as self-closing by
+				// the tree builder
+				this.emit('token', {type: 'EndTag', 
+					name: token.name, 
+					data: att(token.attribs)});
+			}
 			break;
 		case "COMMENT":
 			this.emit('token', {type: 'Comment', 
--- a/modules/parser/mediawiki.parser.js
+++ b/modules/parser/mediawiki.parser.js
@ -14,8 +14,10 @@ var events = require( 'events' );
 var fs = require('fs'),
 	path = require('path'),
 	PegTokenizer                = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
-	TokenTransformManager    = require('./mediawiki.TokenTransformManager.js'),
+	TokenTransformManager       = require('./mediawiki.TokenTransformManager.js'),
 	QuoteTransformer            = require('./ext.core.QuoteTransformer.js').QuoteTransformer,
+	PostExpandParagraphHandler  = require('./ext.core.PostExpandParagraphHandler.js')
+																.PostExpandParagraphHandler,
 	TemplateHandler             = require('./ext.core.TemplateHandler.js').TemplateHandler,
 	Cite                        = require('./ext.Cite.js').Cite,
 	FauxHTML5                   = require('./mediawiki.HTML5TreeBuilder.node.js').FauxHTML5,
@ -68,6 +70,7 @@ function ParserPipeline( env, inputType ) {

 	// Add token transformations..
 	new QuoteTransformer( this.tokenPostProcessor );
+	new PostExpandParagraphHandler( this.tokenPostProcessor );
 	
 	//var citeExtension = new Cite( this.tokenTransformer );

--- a/modules/parser/pegTokenizer.pegjs.txt
+++ b/modules/parser/pegTokenizer.pegjs.txt
@ -262,6 +262,7 @@ start
 		// Append the end (for obvious reasons this should not
 		// be part of a stream, only when tokenizing complete
 		// texts)
+      //console.log( pp( flatten ( e ) ) );
      return flatten(e);
  }