Gabriel Wicke 3be4992782 'Obama finally expands' ;) Misc fixes and documentation updates
* [[:en:Barack Obama]] can now be expanded in 77 seconds using 330MB RAM,
  while it would previously run out of RAM after ~30 minutes. Wohoooo!
  The token transform framework rework really paid off.
* 303 parser tests are passing in the new record time of 5.5 seconds. Two more
  tests are passing since these tests expect the day of the week to be
  Thursday.  Won't be the case tomorrow.

2012-04-26 18:18:08 +02:00

448 lines
14 KiB

/**
 * Template and template argument handling, first cut.
 *
 * AsyncTokenTransformManager objects provide preprocessor-frame-like
 * functionality once template args etc are fully expanded, and isolate
 * individual transforms from concurrency issues. Template expansion is
 * controlled using a tplExpandData structure created independently for each
 * handled template tag.
 *
 * @author Gabriel Wicke <>
 * @author Brion Vibber <>
 */
var $ = require('jquery'),
request = require('request'),
events = require('events'),
qs = require('querystring'),
ParserFunctions = require('./ext.core.ParserFunctions.js').ParserFunctions,
AttributeTransformManager = require('./mediawiki.TokenTransformManager.js')
defines = require('./mediawiki.parser.defines.js');
/**
 * Handler for template and template-argument tokens.
 *
 * Construction registers the handler's transforms with the given manager
 * (see register()) and creates the ParserFunctions instance that
 * parser-function style targets are dispatched to.
 *
 * @constructor
 * @param {Object} manager Token transform manager; must provide
 *   addTransform().
 */
function TemplateHandler ( manager ) {
	this.register( manager );
	this.parserFunctions = new ParserFunctions( manager );
}
// constants
// Rank value handed to manager.addTransform() for both the 'template' and
// the 'templatearg' transform registered by this handler.
TemplateHandler.prototype.rank = 1.1;
/**
 * Register this handler's transforms with a token transform manager.
 *
 * Stores the manager on the instance and hooks onTemplate / onTemplateArg
 * up for 'template' and 'templatearg' tag tokens, both at this.rank.
 *
 * @param {Object} manager Token transform manager providing addTransform().
 */
TemplateHandler.prototype.register = function ( manager ) {
	this.manager = manager;

	// Register for template and templatearg tag tokens
	manager.addTransform( this.onTemplate.bind( this ),
			this.rank, 'tag', 'template' );

	// Template argument expansion
	manager.addTransform( this.onTemplateArg.bind( this ),
			this.rank, 'tag', 'templatearg' );
};
/**
 * Main template token handler
 *
 * Starts expansion of the token's attribute keys and sets up the callback
 * to _expandTemplate, which then fetches and processes the template.
 *
 * @param {Object} token Template tag token; its attribs are processed.
 * @param {Object} frame Passed through unchanged to _expandTemplate.
 * @param {Function} cb Called immediately with { async: true }; also passed
 *   on to _expandTemplate.
 */
TemplateHandler.prototype.onTemplate = function ( token, frame, cb ) {
	//console.warn('onTemplate! ' + JSON.stringify( token, null, 2 ) +
	//		' args: ' + JSON.stringify( this.manager.args ));

	// create a new temporary frame for argument and title expansions
	// XXX: only expand keys, and leave value expansion to template parameter
	// replacement or parser functions as needed!

	// expand argument keys, with callback set to next processing step
	// XXX: would likely be faster to do this in a tight loop here
	// NOTE(review): only the completion callback is visible as a constructor
	// argument here; confirm against AttributeTransformManager's signature.
	var atm = new AttributeTransformManager(
				this._expandTemplate.bind( this, token, frame, cb )
			);

	// Announce asynchronous completion before key processing is started.
	cb( { async: true } );
	atm.processKeys( token.attribs );
};
/**
 * Create positional (number) keys for arguments without explicit keys
 *
 * Arguments whose key is empty receive the next positional index
 * ("1", "2", ...) wrapped in a single-element array; arguments that already
 * have a key are passed through as the same object. The input array is not
 * modified.
 *
 * @param {Array} attribs List of {k, v} argument objects.
 * @return {Array} New list with positional keys filled in.
 */
TemplateHandler.prototype._nameArgs = function ( attribs ) {
	var n = 1,
		out = [];
	for ( var i = 0, l = attribs.length; i < l; i++ ) {
		// FIXME: Also check for whitespace-only named args!
		if ( ! attribs[i].k.length ) {
			out.push( {k: [ n.toString() ], v: attribs[i].v } );
			// Advance the counter so the next unnamed argument gets the
			// following position instead of reusing "1".
			n++;
		} else {
			out.push( attribs[i] );
		}
	}
	this.manager.env.dp( '_nameArgs: ', out );
	return out;
};
/**
 * Fetch, tokenize and token-transform a template after all arguments and the
 * target were expanded.
 */
// Second stage of template handling: onTemplate binds this method (with
// token, frame and cb) as the AttributeTransformManager callback, which
// supplies attribs as the last argument.
// NOTE(review): this function appears to have lost lines (closing braces,
// call arguments, list terminators, some callees); the code is kept as
// found and the visibly incomplete spots are marked below.
TemplateHandler.prototype._expandTemplate = function ( token, frame, cb, attribs ) {
//console.warn('TemplateHandler.expandTemplate: ' +
// JSON.stringify( tplExpandData, null, 2 ) );
// The template target is carried as the key of the first attribute.
var target = attribs[0].k;
if ( ! target ) {
this.manager.env.ap( 'No target! ', attribs );
// TODO:
// check for 'subst:'
// check for variable magic names
// check for msg, msgnw, raw magics
// check for parser functions
// First, check the target for loops
// Flatten the target tokens to a trimmed string.
target = this.manager.env.tokensToString( target ).trim();
//var args = this.manager.env.KVtoHash( tplExpandData.expandedArgs );
// strip subst for now.
target = target.replace( /^(safe)?subst:/, '' );
// XXX: wrap attribs in object with .dict() and .named() methods,
// and each member (key/value) into object with .tokens(), .dom() and
// .wikitext() methods (subclass of Array)
// The lower-cased text before the first ':' selects a parser function when
// this.parserFunctions has a member named 'pf_' + prefix.
var prefix = target.split(':', 1)[0].toLowerCase().trim();
if ( prefix && 'pf_' + prefix in this.parserFunctions ) {
var pfAttribs = new Params( this.manager.env, attribs );
// The first entry is replaced by the part of the target that follows
// '<prefix>:', stored as the key of a KV with an empty value list.
pfAttribs[0] = new KV( target.substr( prefix.length + 1 ), [] );
//this.manager.env.dp( 'func prefix/args: ', prefix,
// tplExpandData.expandedArgs,
// 'unnamedArgs', tplExpandData.origToken.attribs,
// 'funcArg:', funcArg
// );
//this.manager.env.dp( 'entering prefix', funcArg, tplExpandData.expandedArgs );
this.parserFunctions[ 'pf_' + prefix ]
( token, this.manager.frame, cb, pfAttribs );
// NOTE(review): the next line is truncated; the callee that received this
// message is missing, and so is whatever followed the closing brace.
} 'template target: ' + target );
// now normalize the target before template processing
target = this.manager.env.normalizeTitle( target );
// Resolve a possibly relative link
// NOTE(review): the arguments and closing parenthesis of resolveTitle( are
// missing here.
var templateName = this.manager.env.resolveTitle(
var checkRes = this.manager.frame.loopAndDepthCheck( templateName, this.manager.env.maxDepth );
if( checkRes ) {
// Loop detected or depth limit exceeded, abort!
// NOTE(review): res is assigned without a declaration in the visible code,
// and the array literal is not terminated; entries may be missing.
res = [
new TagTk( 'a', [{k: 'href', v: target}] ),
new EndTagTk( 'a' )
cb( { tokens: res, allTokensProcessed: true } );
// XXX: notes from brion's mediawiki.parser.environment
// resolve template name
// load template w/ canonical name
// load template w/ variant names (language variants)
// For now, just fetch the template and pass the callback for further
// processing along.
// NOTE(review): the enclosing call is missing; presumably this bound
// function is the callback handed to _fetchTemplateAndTitle - confirm.
this._processTemplateAndTitle.bind( this, token, frame, cb, templateName, attribs )
/**
 * Process a fetched template source
 *
 * Obtains a nested pipeline for the source type, points it at the current
 * frame, forwards its 'chunk' and 'end' events to _onChunk / _onEnd (both
 * bound with cb) and then feeds it the source.
 *
 * @param {Object} token Not used by this method.
 * @param {Object} frame Not used by this method; the pipeline receives
 *   this.manager.frame instead.
 * @param {Function} cb Callback handed to _onChunk and _onEnd.
 * @param {string} name Template name passed to setFrame() and process().
 * @param {Array} attribs Template arguments passed to setFrame().
 * @param {string} src Template source fed to the pipeline.
 * @param {string} [type] Input type; 'text/x-mediawiki' when falsy.
 */
TemplateHandler.prototype._processTemplateAndTitle = function( token, frame, cb, name, attribs, src, type ) {
	// Get a nested transformation pipeline for the input type. The input
	// pipeline includes the tokenizer, synchronous stage-1 transforms for
	// 'text/wiki' input and asynchronous stage-2 transforms).
	var pipeline = this.manager.pipeFactory.getPipeline(
				type || 'text/x-mediawiki', true
			);

	pipeline.setFrame( this.manager.frame, name, attribs );

	// Hook up the inputPipeline output events to our handlers
	pipeline.addListener( 'chunk', this._onChunk.bind( this, cb ) );
	pipeline.addListener( 'end', this._onEnd.bind( this, cb ) );

	// Feed the pipeline. XXX: Support different formats.
	this.manager.env.dp( 'TemplateHandler._processTemplateAndTitle', name, src, attribs );
	pipeline.process( src, name );
};
/**
 * Handle chunk emitted from the input pipeline after feeding it a template
 *
 * @param {Function} cb Receives { tokens: chunk, async: true }.
 * @param {Array} chunk Tokens emitted by the pipeline; handed to
 *   env.stripEOFTkfromTokens() before being forwarded.
 */
TemplateHandler.prototype._onChunk = function( cb, chunk ) {
	// We encapsulate the output by default, so collect tokens here.
	this.manager.env.stripEOFTkfromTokens( chunk );
	this.manager.env.dp( 'TemplateHandler._onChunk', chunk );
	// async: true tells the caller that more output is still to come.
	cb( { tokens: chunk, async: true } );
};
/**
 * Handle the end event emitted by the parser pipeline after fully processing
 * the template source.
 *
 * @param {Function} cb Receives a final result with an empty token list and
 *   no async flag.
 */
TemplateHandler.prototype._onEnd = function( cb ) {
	this.manager.env.dp( 'TemplateHandler._onEnd' );
	cb( { tokens: [] } );
};
/**
 * Fetch a template
 */
// Obtain the source for title and pass it to cb: served from
// env.pageCache when present, otherwise via a TemplateRequest kept in
// env.requestQueue. parentCB receives a warning when fetching is disabled,
// or an { async: true } notification on the fetch path.
// NOTE(review): this function appears to have lost lines and fragments
// (closing braces, a callee on the 'Starting new request' line, the
// receivers of the two .listeners() calls); code is kept as found.
TemplateHandler.prototype._fetchTemplateAndTitle = function ( title, parentCB, cb ) {
// @fixme normalize name?
var self = this;
if ( title in this.manager.env.pageCache ) {
// XXX: store type too (and cache tokens/x-mediawiki)
cb( self.manager.env.pageCache[title] /* , type */ );
} else if ( ! this.manager.env.fetchTemplates ) {
parentCB( { tokens: [ 'Warning: Page/template fetching disabled, and no cache for ' +
title ] } );
} else {
// We are about to start an async request for a template
this.manager.env.dp( 'Note: trying to fetch ', title );
// Start a new request if none is outstanding
this.manager.env.dp( 'requestQueue: ', this.manager.env.requestQueue );
// NOTE(review): the callee of the message after '{' is missing on this line.
if ( this.manager.env.requestQueue[title] === undefined ) { 'Note: Starting new request for ' + title );
this.manager.env.requestQueue[title] = new TemplateRequest( this.manager, title );
// Append a listener to the request at the toplevel, but prepend at
// lower levels to enforce depth-first processing
// The 'false &&' makes this condition always false, so only the append
// branch can run.
if ( false && this.manager.isInclude ) {
// prepend request: deal with requests from includes first
// NOTE(review): receiver missing; presumably the queued request,
// this.manager.env.requestQueue[title] - confirm.
.listeners( 'src' ).unshift( cb );
} else {
// append request, process in document order
// NOTE(review): receiver missing here as well.
.listeners( 'src' ).push( cb );
parentCB ( { async: true } );
/*********************** Template argument expansion *******************/
/**
 * Expand template arguments with tokens from the containing frame.
 *
 * Processes a shallow copy of the token's attributes and continues in
 * _returnArgAttributes, which is bound with token, cb and frame.
 *
 * @param {Object} token Template argument tag token.
 * @param {Object} frame Passed through to _returnArgAttributes.
 * @param {Function} cb Result callback passed through to
 *   _returnArgAttributes.
 */
TemplateHandler.prototype.onTemplateArg = function ( token, frame, cb ) {
	new AttributeTransformManager(
				this._returnArgAttributes.bind( this, token, cb, frame )
			// slice() so that processing works on a copy of the attribute list
			).process( token.attribs.slice() );
};
/**
 * Return the tokens for a template argument once its attributes are ready.
 *
 * The argument name is the trimmed string form of the first attribute's
 * value. If the current frame has a named argument of that name, its value
 * is returned (directly for strings, otherwise after conversion through its
 * to() method). Otherwise the second attribute's value is used as the
 * default, or the literal '{{{name}}}' text when there is no default.
 *
 * @param {Object} token Not used by this method.
 * @param {Function} cb Receives { tokens: ... }.
 * @param {Object} frame Not used by this method; this.manager.frame is
 *   consulted instead.
 * @param {Array} attributes Processed {k, v} attributes of the token.
 */
TemplateHandler.prototype._returnArgAttributes = function ( token, cb, frame, attributes ) {
	//console.warn( '_returnArgAttributes: ' + JSON.stringify( attributes ));
	var argName = this.manager.env.tokensToString( attributes[0].v ).trim(),
		dict = this.manager.frame.args.named(),
		// Declared locally: assigning to an undeclared name would create a
		// global, and throws in strict mode.
		res;
	this.manager.env.dp( 'args', argName, dict );
	if ( argName in dict ) {
		// return tokens for argument
		//console.warn( 'templateArg found: ' + argName +
		//		' vs. ' + JSON.stringify( this.manager.args ) );
		res = dict[argName];
		if ( res.constructor === String ) {
			cb( { tokens: [res] } );
		} else {
			res.to( 'tokens/x-mediawiki/expanded', function( chunk ) {
				cb( { tokens: chunk } );
			} );
		}
	} else {
		this.manager.env.dp( 'templateArg not found: ', argName,
				' vs. ', dict );
		if ( attributes.length > 1 ) {
			res = attributes[1].v;
		} else {
			//console.warn('no default for ' + argName + JSON.stringify( attributes ));
			res = [ '{{{' + argName + '}}}' ];
		}
		cb( { tokens: res } );
	}
};
/***************** Template fetch request helper class ********/
// Helper that requests the revision content of title through the wiki's
// api endpoint (action=query, prop=revisions, rvprop=content) and hands the
// resulting source to the 'src' listeners registered on this object.
// NOTE(review): this function appears to have lost lines and fragments (the
// opening of the request call, part of a declaration list, many closing
// braces, comment delimiters); the code is kept as found and visibly
// incomplete spots are marked below.
function TemplateRequest ( manager, title ) {
// Increase the number of maximum listeners a bit..
this.setMaxListeners( 50000 );
var self = this,
url = manager.env.wgScript + '/api' +
manager.env.wgScriptExtension +
'?' +
qs.stringify( {
format: 'json',
action: 'query',
prop: 'revisions',
rvprop: 'content',
titles: title
} );
//'?format=json&action=query&prop=revisions&rvprop=content&titles=' + title;
// NOTE(review): the call that takes the following options object and the
// response callback is missing; presumably request({ ... }, function ...)
// using the 'request' module required at the top of the file - confirm.
method: 'GET',
followRedirect: true,
url: url,
headers: {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:9.0.1) ' +
'Gecko/20100101 Firefox/9.0.1 Iceweasel/9.0.1'
function (error, response, body) {
//console.warn( 'response for ' + title + ' :' + body + ':' );
if(error) {
// On a transport error the 'src' listeners receive a failure message
// instead of page source.
// NOTE(review): the emit call is cut off after the trailing comma.
self.emit('src', 'Page/template fetch failure for title ' + title,
} else if(response.statusCode == 200) {
// NOTE(review): the declaration list is cut off after the comma; a
// declaration for data (assigned below) presumably followed.
var src = '',
try {
//console.warn( 'body: ' + body );
data = JSON.parse( body );
} catch(e) {
console.warn( "Error: while parsing result. Error was: " );
console.warn( e );
console.warn( "Response that didn't parse was:");
console.warn( "------------------------------------------\n" + body );
console.warn( "------------------------------------------" );
try {
// Take the content of the first revision of every returned page that has
// revisions; each such page overwrites src.
$.each( data.query.pages, function(i, page) {
if (page.revisions && page.revisions.length) {
src = page.revisions[0]['*'];
// NOTE(review): normalizeTitle is not declared anywhere in the visible
// code, so this assignment targets an outer/global name - confirm intent.
normalizeTitle = page.title;
} catch ( e2 ) {
console.warn( 'Did not find page revisions in the returned body:' + body );
src = '';
//console.warn( 'Page ' + title + ': got ' + src ); 'Retrieved ' + title );
// Add the source to the cache
manager.env.pageCache[title] = src;
// Process only a few callbacks in each event loop iteration to
// reduce memory usage.
var listeners = self.listeners( 'src' );
var processSome = function () {
// XXX: experiment a bit with the number of callback per
// iteration!
// Math.min(1, ...) caps the batch at one listener per invocation.
var maxIters = Math.min(1, listeners.length);
for ( var it = 0; it < maxIters; it++ ) {
var nextListener = listeners.shift();
// We only retrieve text/x-mediawiki source currently.
nextListener( src, 'text/x-mediawiki' );
// Re-schedule on the next tick while listeners remain.
// NOTE(review): with the closing braces lost it is unclear which of the
// two nextTick calls below is the re-schedule and which is the initial
// kick-off.
if ( listeners.length ) {
process.nextTick( processSome );
process.nextTick( processSome );
//self.emit( 'src', src, title );
// XXX: handle other status codes
// Remove self from request queue
manager.env.dp( 'trying to remove ', title, ' from requestQueue' );
delete manager.env.requestQueue[title];
manager.env.dp( 'after deletion:', manager.env.requestQueue );
// NOTE(review): everything from here to the end of this block looks like
// the remains of a disabled jQuery-based variant whose comment delimiters
// and call opening were lost; it references names (callback, this.manager)
// that the live code above does not define - confirm it is dead code.
* XXX: The jQuery version does not quite work with node, but we keep
* it around for now.
url: url,
data: {
format: 'json',
action: 'query',
prop: 'revisions',
rvprop: 'content',
titles: title
success: function(data, statusString, xhr) {
console.warn( 'Page ' + title + ' success ' + JSON.stringify( data ) );
var src = null, title = null;
$.each(data.query.pages, function(i, page) {
if (page.revisions && page.revisions.length) {
src = page.revisions[0]['*'];
title = page.title;
if (typeof src !== 'string') {
console.warn( 'Page ' + title + 'not found! Got ' + src );
callback( 'Page ' + title + ' not found' );
} else {
// Add to cache
console.warn( 'Page ' + title + ': got ' + src );
this.manager.env.pageCache[title] = src;
callback(src, title);
error: function(xhr, msg, err) {
console.warn( 'Page/template fetch failure for title ' +
title + ', url=' + url + JSON.stringify(xhr) + ', err=' + err );
callback('Page/template fetch failure for title ' + title);
dataType: 'json',
cache: false, // @fixme caching, versions etc?
crossDomain: true
// Inherit from EventEmitter
// Object.create() is used instead of an EventEmitter instance so that no
// emitter state lives on the shared prototype; each TemplateRequest then
// builds its own listener table on first use.
TemplateRequest.prototype = Object.create( events.EventEmitter.prototype );
// Restore the constructor reference on the prototype that was just replaced
// (this previously overwrote TemplateHandler.prototype.constructor instead).
TemplateRequest.prototype.constructor = TemplateRequest;

// Export only when running under a CommonJS-style module system.
if (typeof module === "object") {
	module.exports.TemplateHandler = TemplateHandler;
}