2011-11-02 21:07:51 +00:00
|
|
|
/**
 * Tokenizer for wikitext, using PEG.js and a separate PEG grammar file
 * (pegTokenizer.pegjs.txt)
 *
 * Use along with a HTML5TreeBuilder and the DOMPostProcessor(s) for HTML
 * output.
 */
|
2011-12-12 14:03:54 +00:00
|
|
|
|
2011-12-28 17:04:16 +00:00
|
|
|
var PEG = require('pegjs'),
|
|
|
|
path = require('path'),
|
2012-01-03 18:44:31 +00:00
|
|
|
fs = require('fs'),
|
2012-01-04 08:39:45 +00:00
|
|
|
$ = require('jquery'),
|
2012-01-03 18:44:31 +00:00
|
|
|
events = require('events');
|
2011-12-12 14:03:54 +00:00
|
|
|
|
2011-12-28 17:04:16 +00:00
|
|
|
/**
 * Wikitext tokenizer backed by a PEG.js grammar.
 *
 * Construction synchronously reads the grammar file
 * 'pegTokenizer.pegjs.txt' located next to this module and stores its text
 * in this.src.
 *
 * @constructor
 */
function PegTokenizer() {
	// The grammar lives alongside this module, so resolve it against __dirname.
	var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' );
	this.src = fs.readFileSync( pegSrcPath, 'utf8' );
}
|
|
|
|
|
2012-01-03 18:44:31 +00:00
|
|
|
// Inherit from EventEmitter.
//
// The prototype is created with Object.create() rather than by constructing
// an EventEmitter: a constructed emitter carries its own listener store, and
// placing that on the prototype would make every PegTokenizer instance share
// the same listeners. With a bare prototype object, the listener store is
// created on each instance the first time a listener is added.
PegTokenizer.prototype = Object.create( events.EventEmitter.prototype );
PegTokenizer.prototype.constructor = PegTokenizer;

// Static grammar source slot; false until something assigns the grammar text.
PegTokenizer.src = false;
|
2011-11-02 21:07:51 +00:00
|
|
|
|
2012-01-09 19:33:49 +00:00
|
|
|
/**
 * Tokenize wikitext and emit the resulting tokens.
 *
 * The parser is built from this.src on first use and stored on the
 * prototype, so it is shared by all tokenizer instances. The value returned
 * by the parser is emitted in a single 'chunk' event with an END token
 * appended, followed by an 'end' event.
 *
 * @param {string} text Wikitext source. A trailing newline is appended when
 *   the text does not already end with one.
 * @returns {{err: *}} Result object. err is currently always undefined,
 *   because parser exceptions are deliberately not caught (see XXX below).
 */
PegTokenizer.prototype.process = function( text ) {
	var out, err;
	if ( !this.parser ) {
		// Only create a single parser, as parse() is a static method.
		PegTokenizer.prototype.parser = PEG.buildParser(this.src);
		// XXX: to emit events from within the grammar, pass a cb into
		// parse() instead, but need to modify pegjs a bit for that.

		// To inspect the generated parser source, log this.parser.toSource().
	}

	// some normalization
	if ( text.substring(text.length - 1) !== "\n" ) {
		text += "\n";
	}

	// XXX: Exception handling is left out during development to get
	// reasonable traces. Calling a trace on the exception does not really
	// cut it. Parser errors therefore propagate to the caller.
	out = this.parser.parse(text);

	// emit tokens here until we get that to work per toplevelblock in the
	// actual tokenizer
	this.emit('chunk', out.concat( [{ type: 'END' }] ) );
	this.emit('end');

	return { err: err };
};
|
2011-12-28 01:37:15 +00:00
|
|
|
|
2012-01-03 18:44:31 +00:00
|
|
|
/*****************************************************************************
 * LEGACY stuff
 *
 * This is kept around as a template for the ongoing template expansion work!
 * It won't work with the token infrastructure.
 */
|
2011-12-28 01:37:15 +00:00
|
|
|
|
2011-11-02 21:07:51 +00:00
|
|
|
|
|
|
|
/**
 * Expand a parse tree node, replacing template nodes with placeholders.
 *
 * - A string node is handed to the callback unchanged.
 * - A node of type 'template' is resolved through this.env: its source is
 *   fetched and parsed, and the callback receives a 'placeholder' node whose
 *   content is the expanded template. If fetching or parsing reports an
 *   error, the placeholder instead contains a link to the resolved template
 *   title.
 * - Any other node is shallow-copied and its content array, if present, is
 *   expanded recursively.
 *
 * The callback is only ever invoked with a single argument here; errors are
 * turned into placeholder nodes rather than being passed along.
 *
 * NOTE: child results are collected as their callbacks fire. A template
 * child whose expansion calls back later is appended to the content array at
 * that point, which may be after the parent callback has already run.
 *
 * @param {object} tree
 * @param {function(tree, error)} callback
 */
PegTokenizer.prototype.expandTree = function(tree, callback) {
	var self = this;

	// Expand each child and collect the expanded results.
	var subParseArray = function(listOfTrees) {
		var content = [];
		$.each(listOfTrees, function(i, subtree) {
			self.expandTree(subtree, function(expanded) {
				// Collect the expanded child, not the enclosing parent node.
				content.push(expanded);
			});
		});
		return content;
	};

	if (typeof tree === "string") {
		callback(tree);
		return;
	}

	if (tree.type === 'template') {
		// expand a template node!

		// Resolve a possibly relative link
		var templateName = this.env.resolveTitle( tree.target, 'Template' );

		// Placeholder used when the template cannot be fetched or parsed.
		var brokenLinkPlaceholder = function() {
			return {
				type: 'placeholder',
				orig: tree,
				content: [
					{
						// @fixme broken link?
						type: 'link',
						target: templateName
					}
				]
			};
		};

		this.env.fetchTemplate( tree.target, tree.params || {}, function( templateSrc, fetchError ) {
			if ( fetchError ) {
				// Do not try to parse a source that failed to load.
				callback( brokenLinkPlaceholder() );
				return;
			}
			// @fixme should pre-parse/cache these too?
			self.parseToTree( templateSrc, function( templateTree, parseError ) {
				if ( parseError ) {
					callback( brokenLinkPlaceholder() );
				} else {
					callback({
						type: 'placeholder',
						orig: tree,
						content: self.env.expandTemplateArgs( templateTree, tree.params )
					});
				}
			});
		} );
		// Wait for async...
		return;
	}

	// Copy into a fresh object: with a single argument, $.extend would copy
	// the node's properties onto jQuery itself and return jQuery.
	var out = $.extend( {}, tree ); // @fixme prefer a deep copy?
	if (tree.content) {
		out.content = subParseArray(tree.content);
	}
	callback(out);
};
|
|
|
|
|
2011-12-08 10:59:44 +00:00
|
|
|
/**
 * Ensure the grammar source is available in the static PegTokenizer.src,
 * then invoke the callback.
 *
 * If PegTokenizer.src is already set, the callback runs immediately.
 * Otherwise the grammar is requested with $.ajax:
 * - when a global parserPlaygroundPegPage is defined, the content of that
 *   wiki page is requested through the api endpoint under wgScriptPath;
 * - otherwise the grammar file is requested from the ParserPlayground
 *   extension assets path taken from mw.config.
 *
 * No failure handler is registered, so the callback is not invoked when a
 * request fails.
 *
 * @param {function()} callback Called without arguments once loading is done.
 */
PegTokenizer.prototype.initSource = function(callback) {
	if (PegTokenizer.src) {
		callback();
		return;
	}

	if ( typeof parserPlaygroundPegPage !== 'undefined' ) {
		// Fetch the grammar from a wiki page via the query API.
		$.ajax({
			url: wgScriptPath + '/api' + wgScriptExtension,
			data: {
				format: 'json',
				action: 'query',
				prop: 'revisions',
				rvprop: 'content',
				titles: parserPlaygroundPegPage
			},
			success: function(data) {
				// Pages are keyed by id; take the revision text of each page
				// that has one (the last such page wins).
				$.each(data.query.pages, function(i, page) {
					if (page.revisions && page.revisions.length) {
						PegTokenizer.src = page.revisions[0]['*'];
					}
				});
				callback();
			},
			dataType: 'json',
			cache: false
		});
	} else {
		// Fetch the grammar file shipped with the extension.
		$.ajax({
			url: mw.config.get('wgParserPlaygroundAssetsPath', mw.config.get('wgExtensionAssetsPath')) + '/ParserPlayground/modules/pegParser.pegjs.txt',
			success: function(data) {
				PegTokenizer.src = data;
				callback();
			},
			dataType: 'text',
			cache: false
		});
	}
};
|
|
|
|
|
|
|
|
// Export the tokenizer when a CommonJS-style module object is available;
// in environments without one, this is a no-op.
if (typeof module === "object") {
	module.exports.PegTokenizer = PegTokenizer;
}
|