mediawiki-extensions-Visual.../modules/parser/mediawiki.tokenizer.peg.js
Gabriel Wicke bd98eb4c5a Land big TokenTransformDispatcher and eventization refactoring.
The TokenTransformDispatcher now actually implements an asynchronous, phased
token transformation framework as described in
https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations.

Additionally, the parser pipeline is now mostly held together using events.
The tokenizer still emits a lame single events with all tokens, as block-level
emission failed with scoping issues specific to the PEGJS parser generator.
All stages clean up when receiving the end tokens, so that the full pipeline
can be used for repeated parsing.

The QuoteTransformer is not yet 100% fixed to work with the new interface, and
the Cite extension is disabled for now pending adaptation. Bold-italic related
tests are failing currently.
2012-01-03 18:44:31 +00:00

163 lines
3.9 KiB
JavaScript

/**
* Tokenizer for wikitext, using PEG.js and a separate PEG grammar file
* (pegTokenizer.pegjs.txt)
*
* Use along with a HTML5TreeBuilder and the DOMPostProcessor(s) for HTML
* output.
*/
var PEG = require('pegjs'),
path = require('path'),
fs = require('fs'),
events = require('events');
function PegTokenizer() {
var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' );
this.src = fs.readFileSync( pegSrcPath, 'utf8' );
}
// Inherit from EventEmitter
PegTokenizer.prototype = new events.EventEmitter();
PegTokenizer.src = false;
PegTokenizer.prototype.tokenize = function( text ) {
var out, err;
if ( !this.parser ) {
this.parser = PEG.buildParser(this.src);
// add reference to this for event emission
this.parser._tokenizer = this;
}
// some normalization
if ( text.substring(text.length - 1) !== "\n" ) {
text += "\n";
}
// XXX: Commented out exception handling during development to get
// reasonable traces. Calling a trace on the extension does not really cut
// it.
//try {
out = this.parser.parse(text);
// emit tokens here until we get that to work per toplevelblock in the
// actual tokenizer
this.emit('chunk', out);
this.emit('end');
//} catch (e) {
//err = e;
//console.trace();
//} finally {
return { err: err };
//}
};
/*****************************************************************************
* LEGACY stuff
*
* This is kept around as a template for the ongoing template expansion work!
* It won't work with the token infrastructure.
*/
/**
* @param {object} tree
* @param {function(tree, error)} callback
*/
PegTokenizer.prototype.expandTree = function(tree, callback) {
var self = this;
var subParseArray = function(listOfTrees) {
var content = [];
$.each(listOfTrees, function(i, subtree) {
self.expandTree(subtree, function(substr, err) {
content.push(tree);
});
});
return content;
};
var src;
if (typeof tree === "string") {
callback(tree);
return;
}
if (tree.type == 'template') {
// expand a template node!
// Resolve a possibly relative link
var templateName = this.env.resolveTitle( tree.target, 'Template' );
this.env.fetchTemplate( tree.target, tree.params || {}, function( templateSrc, error ) {
// @fixme should pre-parse/cache these too?
self.parseToTree( templateSrc, function( templateTree, error ) {
if ( error ) {
callback({
type: 'placeholder',
orig: tree,
content: [
{
// @fixme broken link?
type: 'link',
target: templateName
}
]
});
} else {
callback({
type: 'placeholder',
orig: tree,
content: self.env.expandTemplateArgs( templateTree, tree.params )
});
}
});
} );
// Wait for async...
return;
}
var out = $.extend( tree ); // @fixme prefer a deep copy?
if (tree.content) {
out.content = subParseArray(tree.content);
}
callback(out);
};
PegTokenizer.prototype.initSource = function(callback) {
if (PegTokenizer.src) {
callback();
} else {
if ( typeof parserPlaygroundPegPage !== 'undefined' ) {
$.ajax({
url: wgScriptPath + '/api' + wgScriptExtension,
data: {
format: 'json',
action: 'query',
prop: 'revisions',
rvprop: 'content',
titles: parserPlaygroundPegPage
},
success: function(data, xhr) {
$.each(data.query.pages, function(i, page) {
if (page.revisions && page.revisions.length) {
PegTokenizer.src = page.revisions[0]['*'];
}
});
callback();
},
dataType: 'json',
cache: false
}, 'json');
} else {
$.ajax({
url: mw.config.get('wgParserPlaygroundAssetsPath', mw.config.get('wgExtensionAssetsPath')) + '/ParserPlayground/modules/pegParser.pegjs.txt',
success: function(data) {
PegTokenizer.src = data;
callback();
},
dataType: 'text',
cache: false
});
}
}
};
if (typeof module == "object") {
module.exports.PegTokenizer = PegTokenizer;
}