mediawiki-extensions-Visual.../modules/parser/mediawiki.parser.js
Gabriel Wicke bf84638bc0 Add tokenizer cache and clone token state on mutation
* Added an LRU cache (using the lru-cache node module) for tokenizer output
* Mutation of nested attributes now replaces the containers. A shallow copy of
  tokens is sufficient to isolate token transformations. Need to investigate
  if we can actually get away without isolation and re-transformation for most
  ordinary tokens.

Change-Id: I9136b1d7a1fbcc538183a319d4ecaa290d616fdf
2012-04-18 14:40:47 +02:00

458 lines
15 KiB
JavaScript

/**
* This module assembles parser pipelines from parser stages with
* asynchronous communication between stages based on events. Apart from the
* default pipeline which converts WikiText to HTML DOM, it also provides
* sub-pipelines for the processing of template transclusions.
*
* See http://www.mediawiki.org/wiki/Parsoid and
* http://www.mediawiki.org/wiki/Parsoid/Token_stream_transformations
* for illustrations of the pipeline architecture.
*
* @author Gabriel Wicke <gwicke@wikimedia.org>
* @author Neil Kandalgaonkar <neilk@wikimedia.org>
*/
// make this global for now
// XXX: figure out a way to get away without a global for PEG actions!
$ = require('jquery');
var events = require( 'events' );
var fs = require('fs'),
path = require('path'),
PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
TokenTransformManager = require('./mediawiki.TokenTransformManager.js'),
NoIncludeOnly = require('./ext.core.NoIncludeOnly.js'),
IncludeOnly = NoIncludeOnly.IncludeOnly,
NoInclude = NoIncludeOnly.NoInclude,
OnlyInclude = NoIncludeOnly.OnlyInclude,
QuoteTransformer = require('./ext.core.QuoteTransformer.js').QuoteTransformer,
PostExpandParagraphHandler = require('./ext.core.PostExpandParagraphHandler.js')
.PostExpandParagraphHandler,
Sanitizer = require('./ext.core.Sanitizer.js').Sanitizer,
TemplateHandler = require('./ext.core.TemplateHandler.js').TemplateHandler,
AttributeExpander = require('./ext.core.AttributeExpander.js').AttributeExpander,
LinkHandler = require('./ext.core.LinkHandler.js'),
WikiLinkHandler = LinkHandler.WikiLinkHandler,
ExternalLinkHandler = LinkHandler.ExternalLinkHandler,
Cite = require('./ext.Cite.js').Cite,
BehaviorSwitchHandler = require('./ext.core.BehaviorSwitchHandler.js').BehaviorSwitchHandler,
FauxHTML5 = require('./mediawiki.HTML5TreeBuilder.node.js').FauxHTML5,
DOMPostProcessor = require('./mediawiki.DOMPostProcessor.js').DOMPostProcessor,
DOMConverter = require('./mediawiki.DOMConverter.js').DOMConverter,
ConvertDOMToLM = require('./mediawiki.LinearModelConverter.js').ConvertDOMToLM;
/**
 * Set up a simple parser pipeline. There will be a single pipeline overall,
 * but there can be multiple sub-pipelines for template expansions etc, which
 * in turn differ by input type. The main input type will be fixed at
 * construction time though.
 *
 * Stages wired up here, in order: input pipeline (makeInputPipeline) ->
 * sync23 token post-processor -> tree builder -> DOM post-processor. The
 * finished document is re-emitted on this object as a 'document' event.
 *
 * @class
 * @constructor
 * @param {Object} env Parser environment, normally a full-fledged one based
 * on mediawiki.parser.environment.js. Optional; an empty object is used when
 * omitted.
 * @param {String} inputType Optional input type. Defaults to 'text/wiki',
 * which is the only type supported for now.
 */
function ParserPipeline( env, inputType ) {
	// Initialise per-instance EventEmitter state, so that listeners are never
	// stored on (and shared through) the prototype.
	events.EventEmitter.call( this );

	// 'text/wiki' is actually the only type supported for now, but others
	// could be created for serialized tokens etc.
	this.inputType = inputType || 'text/wiki';
	this.env = env || {};

	// set up a sub-pipeline cache
	this.pipelineCache = {};
	this.pipelineCache[this.inputType] = {
		'input-toplevel': [],
		'input-include': [],
		'attribute-include': [],
		'attribute-toplevel': []
	};

	// Create an input pipeline for the given input type.
	this.inputPipeline = this.makeInputPipeline( this.inputType, {}, false );

	// Mark this pipeline as the top-level input pipeline, so that it is not
	// cached and its listeners removed
	this.inputPipeline.atTopLevel = true;
	this.inputPipeline.last.atTopLevel = true;

	// Use this.env rather than the raw argument: when no environment was
	// passed in, every stage has to see the same default object instead of
	// undefined.
	this.tokenPostProcessor = new TokenTransformManager.SyncTokenTransformManager(
		this.env, this.inputType, 3.0, false
	);
	this.tokenPostProcessor.listenForTokensFrom( this.inputPipeline );

	// Add token transformations..
	this._addTransformers( this.inputType, 'sync23', this.tokenPostProcessor, false );

	/**
	 * The tree builder creates a DOM tree from the token soup emitted from
	 * the TokenTransformDispatcher.
	 */
	this.treeBuilder = new FauxHTML5.TreeBuilder( this.env );
	this.treeBuilder.listenForTokensFrom( this.tokenPostProcessor );

	/**
	 * Final processing on the HTML DOM.
	 */

	/* Generic DOM transformer.
	 * This currently performs minor tree-dependent clean up like wrapping
	 * plain text in paragraphs. For HTML output, it would also be configured
	 * to perform more aggressive nesting cleanup.
	 */
	this.postProcessor = new DOMPostProcessor();
	this.postProcessor.listenForDocumentFrom( this.treeBuilder );

	/**
	 * Conversion from HTML DOM to WikiDOM. This is not needed if plain HTML
	 * DOM output is desired, so it should only be registered to the
	 * DOMPostProcessor 'document' event if WikiDom output is requested. We
	 * could emit events for 'dom', 'wikidom', 'html' and so on, but only
	 * actually set up the needed pipeline stages if a listener is registered.
	 * Overriding the addListener method should make this possible.
	 */
	this.DOMConverter = new DOMConverter();

	// Lame version for now, see above for an idea for the external async
	// interface and pipeline setup
	this.postProcessor.addListener( 'document', this.forwardDocument.bind( this ) );
}

// Inherit from EventEmitter. The prototype is created without running the
// EventEmitter constructor, so it carries no listener state of its own.
ParserPipeline.prototype = Object.create( events.EventEmitter.prototype );
ParserPipeline.prototype.constructor = ParserPipeline;
/**
* Token stream transformations to register by type and per phase. The
* possible ranks for individual transformation registrations are [0,1)
* (excluding 1.0) for sync01, [1,2) for async12 and [2,3) for sync23.
*
* Layout: input type -> phase name -> ordered array of transformer
* constructors. _addTransformers() walks a phase's array front to back and
* calls each entry as `new Transformer( manager, isInclude )`, so the array
* order is the instantiation order.
*
* Should perhaps be moved to mediawiki.parser.environment.js, so that all
* configuration can be found in a single place.
*/
ParserPipeline.prototype._transformers = {
'text/wiki': {
// Synchronous in-order per input. Attached to the rank-1
// SyncTokenTransformManager built by makeInputPipeline() and
// makeAttributePipeline().
sync01:
[
OnlyInclude,
IncludeOnly,
NoInclude
// Insert TokenCollectors for extensions here (don't expand
// templates in extension contents); wrap collected tokens in
// special extension token.
/* Extension1, */
/* Extension2, */
],
/*
* Asynchronous out-of-order per input. Each async transform can only
* operate on a single input token, but can emit multiple output
* tokens. If multiple tokens need to be collected per-input, then a
* separate collection transform in sync01 can be used to wrap the
* collected tokens into a single one later processed in an async12
* transform.
*
* Attached to the rank-2 AsyncTokenTransformManager that forms the last
* stage of the input and attribute pipelines.
*/
async12:
[
TemplateHandler,
// Expand attributes after templates to avoid expanding unused branches
AttributeExpander,
WikiLinkHandler,
ExternalLinkHandler,
BehaviorSwitchHandler
/* ExtensionHandler1, */
/* ExtensionHandler2, */
],
// Synchronous in-order on fully expanded token stream (including
// expanded templates etc). Attached by the ParserPipeline constructor to
// the rank-3 SyncTokenTransformManager that listens to the top-level
// input pipeline.
sync23:
[
QuoteTransformer,
PostExpandParagraphHandler,
/* Cite, */
/* ListHandler, */
Sanitizer
]
}
};
/**
 * Add all transformers to a token transform manager for a given input type
 * and phase.
 *
 * If no transformer list is registered for the type / phase combination, a
 * warning is logged and nothing is added. This covers both an unknown type
 * and an unknown phase of a known type.
 *
 * @method
 * @param {String} type Input type key into _transformers, e.g. 'text/wiki'.
 * @param {String} phase Phase name: 'sync01', 'async12' or 'sync23'.
 * @param {Object} manager Token transform manager passed to each transformer
 * constructor.
 * @param {Boolean} isInclude Passed through to each transformer constructor.
 */
ParserPipeline.prototype._addTransformers = function ( type, phase, manager, isInclude ) {
	var byType = this._transformers && this._transformers[type],
		transformers = byType && byType[phase];

	if ( !transformers ) {
		console.warn( 'Error while looking for token transformers for ' +
				type + ' and phase ' + phase );
		return;
	}

	for ( var i = 0, l = transformers.length; i < l; i++ ) {
		// The instance is discarded; presumably each constructor registers
		// itself with the manager it is handed.
		new transformers[i]( manager, isInclude );
	}
};
/**
 * Factory method for the input (up to async token transforms / phase two)
 * parts of the parser pipeline.
 *
 * An idle pipeline is taken from the pipeline cache when one is available
 * (only its template arguments are replaced); otherwise a new tokenizer ->
 * sync01 -> async12 chain is built.
 *
 * @method
 * @param {String} inputType Input type. Only 'text/wiki' is supported.
 * @param {Object} args Expanded template arguments to pass to the
 * AsyncTokenTransformManager.
 * @param {Boolean} isInclude Whether the pipeline processes included
 * (transcluded) content; selects the cache bucket and is passed to all stages.
 * @returns {Object} { first: <first stage>, last: AsyncTokenTransformManager }
 * First stage is supposed to implement a process() function
 * that can accept all input at once. The wikitext tokenizer for example
 * accepts the wiki text this way. The last stage of the input pipeline is
 * always an AsyncTokenTransformManager, which emits its output in events.
 * @throws {Error} If inputType is not supported.
 */
ParserPipeline.prototype.makeInputPipeline = function ( inputType, args, isInclude ) {
	if ( inputType !== 'text/wiki' ) {
		// Throw a real Error rather than a bare string, so that the failure
		// carries a stack trace of its own.
		throw new Error(
			'ParserPipeline.makeInputPipeline: Unsupported input type ' + inputType
		);
	}

	var pipelinePart = isInclude ? 'input-include' : 'input-toplevel',
		idlePipes = this.pipelineCache['text/wiki'][pipelinePart];

	// Recycle a cached pipeline if possible
	if ( idlePipes.length ) {
		var pipe = idlePipes.pop();
		pipe.last.args = args;
		return pipe;
	}

	var wikiTokenizer = new PegTokenizer( this.env, isInclude );

	/**
	 * Token stream transformations.
	 * This is where all the wiki-specific functionality is implemented.
	 * See
	 * https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations
	 */
	var tokenPreProcessor = new TokenTransformManager.SyncTokenTransformManager(
		this.env, 'text/wiki', 1, isInclude
	);
	tokenPreProcessor.listenForTokensFrom( wikiTokenizer );
	this._addTransformers( 'text/wiki', 'sync01', tokenPreProcessor, isInclude );

	var tokenExpander = new TokenTransformManager.AsyncTokenTransformManager(
		{
			// Factories handed to the async stage for building sub-pipelines
			'input': this.makeInputPipeline.bind( this ),
			'attributes': this.makeAttributePipeline.bind( this )
		},
		args, this.env, inputType, 2.0, isInclude
	);

	// Register template expansion extension
	this._addTransformers( 'text/wiki', 'async12', tokenExpander, isInclude );
	tokenExpander.listenForTokensFrom( tokenPreProcessor );

	// XXX: hack.
	tokenExpander.inputType = inputType;
	tokenPreProcessor.inputType = inputType;

	return new CachedTokenPipeline(
		this.cachePipeline.bind( this, 'text/wiki', pipelinePart ),
		wikiTokenizer,
		tokenExpander,
		isInclude
	);
};
/**
 * Factory for attribute transformation pipelines.
 *
 * Unlike the input pipeline there is no tokenizer in front: the first stage
 * is the rank-1 sync transform manager, the last stage the rank-2 async
 * transform manager. Idle pipelines are recycled from the pipeline cache,
 * in which case only the template arguments on the last stage are replaced.
 *
 * @method
 * @param {String} inputType Input type; selects the pipeline cache and the
 * sync01 transformer set.
 * @param {Object} args Expanded template arguments for the async stage.
 * @param {Boolean} isInclude Selects the 'attribute-include' or
 * 'attribute-toplevel' cache bucket and is passed to the stages.
 * @returns {CachedTokenPipeline} Wrapped pipeline with first / last stages.
 */
ParserPipeline.prototype.makeAttributePipeline = function ( inputType, args, isInclude ) {
	var pipelinePart = isInclude ? 'attribute-include' : 'attribute-toplevel',
		idlePipes = this.pipelineCache[inputType][pipelinePart];

	// Fast path: hand out a recycled pipeline with fresh arguments
	if ( idlePipes.length ) {
		var recycled = idlePipes.pop();
		recycled.last.args = args;
		return recycled;
	}

	/**
	 * Token stream transformations.
	 * This is where all the wiki-specific functionality is implemented.
	 * See https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations
	 */
	var syncStage = new TokenTransformManager.SyncTokenTransformManager(
		this.env, inputType, 1, isInclude
	);
	this._addTransformers( inputType, 'sync01', syncStage, isInclude );
	// NOTE(review): the 'text/wiki' sync01 list already contains NoInclude, so
	// this looks like a second registration (without isInclude) - confirm that
	// this is intended.
	new NoInclude( syncStage );

	// Factories handed to the async stage for building sub-pipelines
	var subPipelineFactories = {
		'input': this.makeInputPipeline.bind( this ),
		'attributes': this.makeAttributePipeline.bind( this )
	};
	var asyncStage = new TokenTransformManager.AsyncTokenTransformManager(
		subPipelineFactories, args, this.env, inputType, 2, isInclude
	);

	// Add token transformers
	this._addTransformers( 'text/wiki', 'async12', asyncStage, isInclude );
	asyncStage.listenForTokensFrom( syncStage );

	var returnToCache = this.cachePipeline.bind( this, inputType, pipelinePart );
	return new CachedTokenPipeline( returnToCache, syncStage, asyncStage, isInclude );
};
/**
 * Return an idle pipeline to the sub-pipeline cache so that it can be handed
 * out again by makeInputPipeline / makeAttributePipeline.
 *
 * At most five pipelines are kept per input type and pipeline part; any
 * further pipeline is simply not stored. Pipelines for an input type or
 * pipeline part without a cache bucket are ignored as well.
 *
 * @method
 * @param {String} inputType Input type the pipeline was built for.
 * @param {String} pipelinePart Cache bucket name, e.g. 'input-include'.
 * @param {Object} pipe The pipeline to cache.
 */
ParserPipeline.prototype.cachePipeline = function ( inputType, pipelinePart, pipe ) {
	// Guard both lookup levels: this is called from an 'end' event handler,
	// where a missing bucket should not turn into a TypeError.
	var byType = this.pipelineCache[inputType],
		cache = byType && byType[pipelinePart];
	if ( cache && cache.length < 5 ) {
		cache.push( pipe );
	}
};
/**
 * Feed the parser pipeline with some input. Nothing is returned; the output
 * is emitted in events.
 *
 * @method
 * @param {Mixed} All arguments are handed unchanged to the process() method
 * of the top-level input pipeline. For a wikitext pipeline (the default),
 * this would be the wikitext to tokenize:
 * pipeline.parse ( wikiText );
 */
ParserPipeline.prototype.parse = function ( ) {
	// Kick off processing at the entry point of the pipeline, passing the
	// caller's arguments straight through.
	var entryPoint = this.inputPipeline;
	entryPoint.process.apply( entryPoint, arguments );
};
/**
 * Re-emit a finished document on the parser pipeline itself, as a
 * 'document' event.
 *
 * @method
 * @param {Object} document The document to pass on to 'document' listeners.
 */
ParserPipeline.prototype.forwardDocument = function ( document ) {
	var eventArgs = [ 'document', document ];
	this.emit.apply( this, eventArgs );
};
/**
 * Convert the body of a document to WikiDom and serialize the result as
 * pretty-printed JSON (two-space indent).
 *
 * XXX: remove JSON serialization here, that should only be performed when
 * needed (and normally without pretty-printing).
 *
 * @method
 * @param {Object} document Document whose body is converted.
 * @returns {String} JSON text of the WikiDom.
 */
ParserPipeline.prototype.getWikiDom = function ( document ) {
	var wikiDom = this.DOMConverter.HTMLtoWiki( document.body );
	return JSON.stringify( wikiDom, null, 2 );
};
/**
 * Convert the body of a document with ConvertDOMToLM and serialize the
 * result as pretty-printed JSON (two-space indent).
 *
 * @method
 * @param {Object} document Document whose body is converted.
 * @returns {String} JSON text of the linear model.
 */
ParserPipeline.prototype.getLinearModel = function( document ) {
	var linearModel = ConvertDOMToLM( document.body );
	return JSON.stringify( linearModel, null, 2 );
};
/************************ CachedTokenPipeline ********************************/

/**
 * Wrap a part of a pipeline. The last member of the pipeline is supposed to
 * emit 'end' and 'chunk' events, while the first is supposed to support a
 * process() method that sets the pipeline in motion.
 *
 * The wrapper re-emits the 'chunk' and 'end' events of the last stage on
 * itself. After 'end' it drops its own listeners and hands itself back to
 * the cache, unless it has been marked with atTopLevel.
 *
 * @class
 * @constructor
 * @param {Function} returnToCacheCB: Callback to return the
 * CachedTokenPipeline to a cache when processing has finished
 * @param {Object} first: First stage of the pipeline
 * @param {Object} last: Last stage of the pipeline
 * @param {Boolean} isInclude: Stored on the instance as isInclude; not
 * otherwise used by this class
 */
function CachedTokenPipeline ( returnToCacheCB, first, last, isInclude ) {
	// Initialise per-instance EventEmitter state. Without this, listeners
	// can end up in a table shared by all pipelines through the prototype,
	// so that one pipeline's chunks reach the consumers of another, and
	// recycling one pipeline strips the listeners of all of them.
	events.EventEmitter.call( this );

	this.returnToCacheCB = returnToCacheCB;
	this.first = first;
	this.last = last;
	this.last.addListener( 'end', this.forwardEndAndRecycleSelf.bind( this ) );
	this.last.addListener( 'chunk', this.forwardChunk.bind( this ) );
	this.isInclude = isInclude;
}

// Inherit from EventEmitter. The prototype is created without running the
// EventEmitter constructor, so it carries no listener state of its own.
CachedTokenPipeline.prototype = Object.create( events.EventEmitter.prototype );
CachedTokenPipeline.prototype.constructor = CachedTokenPipeline;

/**
 * Feed input tokens to the first pipeline stage
 *
 * @method
 * @param {Mixed} chunk Input handed to the first stage's process() method.
 * @param {Mixed} key Passed through to the first stage unchanged.
 */
CachedTokenPipeline.prototype.process = function ( chunk, key ) {
	this.first.process( chunk, key );
};

/**
 * Forward chunks to our listeners
 *
 * @method
 * @param {Mixed} chunk Chunk emitted by the last stage.
 * @param {Mixed} key Passed through to the listeners unchanged.
 */
CachedTokenPipeline.prototype.forwardChunk = function ( chunk, key ) {
	this.emit( 'chunk', chunk, key );
};

/**
 * End event consumer and emitter, that forwards 'end' and then removes all
 * 'end' and 'chunk' listeners from this wrapper and returns it to a cache.
 *
 * A wrapper flagged with atTopLevel (set from the outside by ParserPipeline
 * on its main input pipeline) keeps its listeners and is never cached.
 *
 * @method
 */
CachedTokenPipeline.prototype.forwardEndAndRecycleSelf = function ( ) {
	// first, forward the event
	this.emit( 'end' );
	// now recycle self
	if ( ! this.atTopLevel ) {
		this.removeAllListeners( 'end' );
		this.removeAllListeners( 'chunk' );
		this.returnToCacheCB ( this );
	}
};
// Export the pipeline constructor when a CommonJS-style `module` object is
// available (node).
if ( typeof module === "object" ) {
	module.exports.ParserPipeline = ParserPipeline;
}