2011-11-02 21:07:51 +00:00
|
|
|
/**
|
2011-12-08 11:40:59 +00:00
|
|
|
* Tokenizer for wikitext, using PEG.js and a separate PEG grammar file
|
|
|
|
* (pegTokenizer.pegjs.txt)
|
2011-11-02 21:07:51 +00:00
|
|
|
*
|
2011-12-08 11:40:59 +00:00
|
|
|
* Use along with a HTML5TreeBuilder and the DOMPostProcessor(s) for HTML
|
|
|
|
* output.
|
2011-11-02 21:07:51 +00:00
|
|
|
*/
|
2011-12-12 14:03:54 +00:00
|
|
|
|
2011-12-28 17:04:16 +00:00
|
|
|
var PEG = require('pegjs'),
|
|
|
|
path = require('path'),
|
|
|
|
fs = require('fs');
|
2011-12-12 14:03:54 +00:00
|
|
|
|
2011-12-28 17:04:16 +00:00
|
|
|
/**
 * Tokenizer constructor.
 *
 * Synchronously reads the grammar file 'pegTokenizer.pegjs.txt' located in
 * this module's directory (as UTF-8) and keeps its text in this.src.
 */
function PegTokenizer() {
	this.src = fs.readFileSync(
		path.join( __dirname, 'pegTokenizer.pegjs.txt' ),
		'utf8'
	);
}
|
|
|
|
|
2011-12-08 10:59:44 +00:00
|
|
|
// Grammar source stored on the constructor itself: starts out false and is
// filled in by initSource(). Note that tokenize() builds its parser from the
// per-instance `this.src` assigned in the constructor, not from this property.
PegTokenizer.src = false;
|
2011-11-02 21:07:51 +00:00
|
|
|
|
2011-12-12 14:03:54 +00:00
|
|
|
/**
 * Tokenize a complete wikitext string.
 *
 * The PEG parser is built from this.src on first use and cached in
 * this.parser. If the input does not end in a newline, one is appended
 * before parsing.
 *
 * @param {string} text Wikitext to tokenize
 * @returns {{tokens: Array, err: *}} `tokens` is the parser output followed
 *   by a `{type: 'END'}` marker; `err` is whatever the parser threw, or
 *   undefined on success. When parsing fails, `tokens` holds only the END
 *   marker.
 */
PegTokenizer.prototype.tokenize = function( text ) {
	var out, err;
	if ( !this.parser ) {
		this.parser = PEG.buildParser( this.src );
	}

	// some normalization
	if ( text.substring( text.length - 1 ) !== "\n" ) {
		text += "\n";
	}

	try {
		out = this.parser.parse( text );
	} catch ( e ) {
		err = e;
		// The parser produced nothing. Fall back to an empty token list so
		// that the END marker can still be appended and the error is handed
		// to the caller instead of being replaced by a TypeError.
		out = [];
		console.trace();
	}

	// Append the end (for obvious reasons this should not
	// be part of a stream, only when tokenizing complete
	// texts)
	out.push( { type: 'END' } );

	return { tokens: out, err: err };
};
|
|
|
|
|
|
|
|
/**
 * Expand a parse tree node and hand the result to a callback.
 *
 * - A string is passed to the callback unchanged.
 * - A node whose type is 'template' is fetched with this.env.fetchTemplate
 *   and parsed with this.parseToTree. The callback receives a 'placeholder'
 *   node that wraps either the result of this.env.expandTemplateArgs or,
 *   when fetching or parsing reported an error, a single 'link' node
 *   pointing at the resolved template title.
 * - Any other node is shallow-copied; if it has `content`, every entry is
 *   expanded recursively and the callback fires once all entries are done,
 *   with the expanded entries in their original order.
 *
 * @param {object|string} tree
 * @param {function(tree, error)} callback `error` is only set when the
 *   expansion of a content entry reported one.
 */
PegTokenizer.prototype.expandTree = function( tree, callback ) {
	var self = this;

	if ( typeof tree === 'string' ) {
		callback( tree );
		return;
	}

	if ( tree.type === 'template' ) {
		// expand a template node!

		// Resolve a possibly relative link
		var templateName = this.env.resolveTitle( tree.target, 'Template' );

		// Placeholder used whenever the template cannot be expanded
		var brokenTemplate = function() {
			return {
				type: 'placeholder',
				orig: tree,
				content: [
					{
						// @fixme broken link?
						type: 'link',
						target: templateName
					}
				]
			};
		};

		// NOTE(review): the fetch uses the raw tree.target rather than the
		// resolved templateName -- confirm which one fetchTemplate expects.
		this.env.fetchTemplate( tree.target, tree.params || {}, function( templateSrc, fetchError ) {
			if ( fetchError ) {
				// Nothing to parse; report the template as broken
				callback( brokenTemplate() );
				return;
			}
			// @fixme should pre-parse/cache these too?
			self.parseToTree( templateSrc, function( templateTree, parseError ) {
				if ( parseError ) {
					callback( brokenTemplate() );
				} else {
					callback( {
						type: 'placeholder',
						orig: tree,
						content: self.env.expandTemplateArgs( templateTree, tree.params )
					} );
				}
			} );
		} );
		// Wait for async...
		return;
	}

	// Copy into a fresh object: $.extend() with a single argument would
	// merge the node into jQuery itself and return jQuery.
	var out = $.extend( {}, tree ); // @fixme prefer a deep copy?
	if ( !tree.content ) {
		callback( out );
		return;
	}

	// Snapshot the entries first so the number of pending expansions is
	// known before any of them can complete.
	var children = [];
	$.each( tree.content, function( i, subtree ) {
		children.push( subtree );
	} );

	var expanded = [],
		pending = children.length,
		firstError;

	if ( pending === 0 ) {
		out.content = expanded;
		callback( out );
		return;
	}

	$.each( children, function( index, subtree ) {
		self.expandTree( subtree, function( expandedSubtree, err ) {
			// Store by index: template entries may finish out of order
			expanded[index] = expandedSubtree;
			if ( err && !firstError ) {
				firstError = err;
			}
			pending -= 1;
			if ( pending === 0 ) {
				out.content = expanded;
				callback( out, firstError );
			}
		} );
	} );
};
|
|
|
|
|
2011-12-08 10:59:44 +00:00
|
|
|
/**
 * Make sure the grammar source is available in PegTokenizer.src, loading it
 * with $.ajax when it is not set yet.
 *
 * If the global `parserPlaygroundPegPage` is defined, the source is taken
 * from the content of that wiki page through the MediaWiki API; otherwise it
 * is fetched as a text file from the extension's assets path.
 *
 * @param {function(Error=)} callback Called without arguments once the
 *   source is available, or with an Error when it could not be loaded.
 */
PegTokenizer.prototype.initSource = function( callback ) {
	if ( PegTokenizer.src ) {
		callback();
		return;
	}

	// Without a failure handler a failed request would never invoke the
	// callback and the caller would wait forever.
	var onFailure = function( xhr, textStatus, errorThrown ) {
		callback( new Error(
			'Could not load the PEG grammar source: ' + ( errorThrown || textStatus )
		) );
	};

	if ( typeof parserPlaygroundPegPage !== 'undefined' ) {
		$.ajax( {
			url: wgScriptPath + '/api' + wgScriptExtension,
			data: {
				format: 'json',
				action: 'query',
				prop: 'revisions',
				rvprop: 'content',
				titles: parserPlaygroundPegPage
			},
			success: function( data ) {
				// The response may lack query.pages altogether
				var pages = data && data.query && data.query.pages;
				if ( pages ) {
					$.each( pages, function( i, page ) {
						if ( page.revisions && page.revisions.length ) {
							PegTokenizer.src = page.revisions[0]['*'];
						}
					} );
				}
				if ( PegTokenizer.src ) {
					callback();
				} else {
					callback( new Error(
						'No grammar source found on page ' + parserPlaygroundPegPage
					) );
				}
			},
			error: onFailure,
			dataType: 'json',
			cache: false
		} );
	} else {
		$.ajax( {
			// NOTE(review): this still requests pegParser.pegjs.txt while the
			// constructor reads pegTokenizer.pegjs.txt -- confirm which file
			// name is deployed under the assets path.
			url: mw.config.get( 'wgParserPlaygroundAssetsPath', mw.config.get( 'wgExtensionAssetsPath' ) ) + '/ParserPlayground/modules/pegParser.pegjs.txt',
			success: function( data ) {
				PegTokenizer.src = data;
				callback();
			},
			error: onFailure,
			dataType: 'text',
			cache: false
		} );
	}
};
|
|
|
|
|
|
|
|
// Publish the constructor as module.exports.PegTokenizer whenever a `module`
// object is present; where none exists this is a no-op.
if ( typeof module === 'object' ) {
	module.exports.PegTokenizer = PegTokenizer;
}
|