Misc improvements, now 196 parser tests passing.

* Add handler for post-expand paragraph wrapping on token stream, to handle
  things like comments on their own line post-expand
* Add general Util module
* Fix self-closing tag handling in HTML5 tree builder
This commit is contained in:
Gabriel Wicke 2012-01-17 18:22:10 +00:00
parent 34a2f3f1aa
commit 6bd7ca1e75
Notes: Gabriel Wicke 2012-02-27 16:40:01 +00:00
5 changed files with 165 additions and 4 deletions

View file

@ -0,0 +1,55 @@
/**
 * General utilities for token transforms
 */
function Util () {
}

/**
 * Determine if a token is block-level or not.
 *
 * Only tag-like tokens (type 'TAG', 'ENDTAG' or 'SELFCLOSINGTAG') can be
 * block-level; for those the decision is made on the lower-cased tag name.
 * The method is defined on the prototype and does not read any instance
 * state, so it is called through an instance: new Util().isBlock( token ).
 *
 * @method
 * @param {Object} token: The token to check
 * @returns {Boolean}: True if token is block-level, false otherwise
 *                     (including for missing tokens and for tag tokens
 *                     that carry no string name).
 */
Util.prototype.isBlock = function ( token ) {
    // Missing tokens and non-tag tokens are never block-level.
    if ( ! token ||
            ( token.type !== 'TAG' &&
              token.type !== 'ENDTAG' &&
              token.type !== 'SELFCLOSINGTAG' ) ) {
        return false;
    }

    // A tag token without a usable name cannot be classified; report it as
    // non-block instead of failing on toLowerCase().
    if ( typeof token.name !== 'string' ) {
        return false;
    }

    switch ( token.name.toLowerCase() ) {
        case 'div':
        case 'table':
        case 'td':
        case 'tr':
        case 'tbody':
        case 'p':
        case 'ul':
        case 'ol':
        case 'li':
        case 'dl':
        case 'dt':
        case 'dd':
        case 'img': // hmm!
        case 'pre':
        case 'center':
        case 'blockquote':
        case 'h1':
        case 'h2':
        case 'h3':
        case 'h4':
        case 'h5':
        case 'h6':
            return true;
        default:
            return false;
    }
};

// Export for CommonJS consumers; skipped where no `module` object exists.
if ( typeof module === "object" ) {
    module.exports.Util = Util;
}

View file

@ -0,0 +1,98 @@
/*
* Insert paragraphs for comment-only lines after template expansion
*
* @author Gabriel Wicke <gwicke@wikimedia.org>
*/
// Include general utilities
var Util = require('./ext.Util.js').Util,
u = new Util();
/**
 * Token handler that buffers a run of newline tokens together with any
 * comments / whitespace-only text that follow them, and prepends an opening
 * 'p' tag token once at least two newlines were seen and the run is ended by
 * a non-block token.
 *
 * @constructor
 * @param {Object} dispatcher: Object providing addTransform( fn, rank, type )
 *                 and removeTransform( rank, type ); the handler registers
 *                 itself on it immediately.
 */
function PostExpandParagraphHandler ( dispatcher ) {
    // Tokens buffered since the first newline of the current run
    this.tokens = [];
    // Number of newline tokens seen in the current run
    this.newLines = 0;
    this.register( dispatcher );
}

// constants
// Rank used for the 'newline' and 'end' registrations
PostExpandParagraphHandler.prototype.newlineRank = 2.2;
// Rank used for the temporary 'any' registration
PostExpandParagraphHandler.prototype.anyRank = 2.201; // Just after regular quote and newline

/**
 * Register this transformer with the TokenTransformer
 *
 * @method
 * @param {Object} dispatcher: The dispatcher to register with; it is also
 *                 stored on the instance for later (un)registration of the
 *                 'any' handler.
 */
PostExpandParagraphHandler.prototype.register = function ( dispatcher ) {
    this.dispatcher = dispatcher;
    // Register for NEWLINE tokens
    dispatcher.addTransform( this.onNewLine.bind( this ),
            this.newlineRank, 'newline' );
    // Reset internal state when we are done
    dispatcher.addTransform( this.reset.bind( this ),
            this.newlineRank, 'end' );
};

/**
 * Handler for the 'end' registration: flush whatever is still buffered.
 *
 * NOTE(review): when tokens are flushed, the passed-in end token itself is
 * not part of the returned list -- confirm that the dispatcher does not rely
 * on it being returned.
 *
 * @method
 * @param {Object} token: The token that triggered the handler
 * @returns {Object}: { tokens: [...] } with the buffered tokens if a newline
 *                    run is open, { token: token } otherwise.
 */
PostExpandParagraphHandler.prototype.reset = function ( token, cb, frame, prevToken ) {
    //console.log( 'PostExpandParagraphHandler.reset' );
    if ( this.newLines ) {
        return { tokens: this._finish() };
    } else {
        return { token: token };
    }
};

/**
 * End the current collection run: hand back the buffered tokens, clear the
 * buffer and the newline counter and drop the temporary 'any' registration.
 *
 * @method
 * @returns {Array}: The tokens buffered so far, each stamped with
 *                   rank = anyRank.
 */
PostExpandParagraphHandler.prototype._finish = function ( ) {
    var tokens = this.tokens;
    this.tokens = [];
    // Stamp the tokens with our rank (presumably so that the dispatcher does
    // not feed them through this handler again -- confirm against the
    // dispatcher implementation).
    for ( var i = 0, l = tokens.length; i < l; i++ ) {
        tokens[ i ].rank = this.anyRank;
    }
    // remove 'any' registration
    this.dispatcher.removeTransform( this.anyRank, 'any' );
    this.newLines = 0;
    return tokens;
};

/**
 * Handle NEWLINE tokens: buffer the newline, count it, and on the first
 * newline of a run start listening to all following tokens via onAny.
 *
 * @method
 * @param {Object} token: The newline token
 * @returns {Object}: Always an empty result object; the token is only
 *                    handed back later by _finish().
 */
PostExpandParagraphHandler.prototype.onNewLine = function ( token, cb, frame, prevToken ) {
    //console.log( 'PostExpandParagraphHandler.onNewLine: ' + JSON.stringify( token, null , 2 ) );
    this.tokens.push( token );
    if ( ! this.newLines ) {
        // First newline of a run: register the temporary 'any' handler
        this.dispatcher.addTransform( this.onAny.bind( this ),
                this.anyRank, 'any' );
    }
    this.newLines++;
    return {};
};

/**
 * Handle any token while a newline run is open. Comments and text consisting
 * only of tabs / spaces are buffered and collection continues; any other
 * token ends the run and flushes the buffer (including that token).
 *
 * @method
 * @param {Object} token: The current token
 * @returns {Object}: {} while collecting; otherwise { tokens: [...] } with
 *                    the buffered tokens, preceded by an opening 'p' tag
 *                    token if at least two newlines were seen and the
 *                    terminating token is not block-level.
 */
PostExpandParagraphHandler.prototype.onAny = function ( token, cb, frame, prevToken ) {
    //console.log( 'PostExpandParagraphHandler.onAny' );
    this.tokens.push( token );
    if ( token.type === 'COMMENT' ||
            // A TEXT token without a string value is treated as
            // non-whitespace instead of raising a TypeError.
            ( token.type === 'TEXT' &&
              typeof token.value === 'string' &&
              /^[\t ]+$/.test( token.value ) )
    )
    {
        // Continue with collection..
        return {};
    } else {
        // XXX: Only open paragraph if inline token follows!

        // None of the tokens we are interested in, so abort processing..
        //console.log( 'PostExpandParagraphHandler.onAny: ' + JSON.stringify( this.tokens, null , 2 ) );
        if ( this.newLines >= 2 && ! u.isBlock( token ) ) {
            return { tokens: [ { type: 'TAG', name: 'p' } ].concat( this._finish() ) };
        } else {
            return { tokens: this._finish() };
        }
    }
};

// Export for CommonJS consumers; skipped where no `module` object exists.
if ( typeof module === "object" ) {
    module.exports.PostExpandParagraphHandler = PostExpandParagraphHandler;
}

View file

@ -89,9 +89,13 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
this.emit('token', {type: 'StartTag',
name: token.name,
data: att(token.attribs)});
this.emit('token', {type: 'EndTag',
name: token.name,
data: att(token.attribs)});
if ( HTML5.VOID_ELEMENTS.indexOf( token.name.toLowerCase() ) < 0 ) {
// VOID_ELEMENTS are automagically treated as self-closing by
// the tree builder
this.emit('token', {type: 'EndTag',
name: token.name,
data: att(token.attribs)});
}
break;
case "COMMENT":
this.emit('token', {type: 'Comment',

View file

@ -14,8 +14,10 @@ var events = require( 'events' );
var fs = require('fs'),
path = require('path'),
PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
TokenTransformManager = require('./mediawiki.TokenTransformManager.js'),
TokenTransformManager = require('./mediawiki.TokenTransformManager.js'),
QuoteTransformer = require('./ext.core.QuoteTransformer.js').QuoteTransformer,
PostExpandParagraphHandler = require('./ext.core.PostExpandParagraphHandler.js')
.PostExpandParagraphHandler,
TemplateHandler = require('./ext.core.TemplateHandler.js').TemplateHandler,
Cite = require('./ext.Cite.js').Cite,
FauxHTML5 = require('./mediawiki.HTML5TreeBuilder.node.js').FauxHTML5,
@ -68,6 +70,7 @@ function ParserPipeline( env, inputType ) {
// Add token transformations..
new QuoteTransformer( this.tokenPostProcessor );
new PostExpandParagraphHandler( this.tokenPostProcessor );
//var citeExtension = new Cite( this.tokenTransformer );

View file

@ -262,6 +262,7 @@ start
// Append the end (for obvious reasons this should not
// be part of a stream, only when tokenizing complete
// texts)
//console.log( pp( flatten ( e ) ) );
return flatten(e);
}