
/**
* Tokenizer for wikitext, using PEG.js and a separate PEG grammar file
* (pegTokenizer.pegjs.txt)
*
* Use along with an HTML5TreeBuilder and the DOMPostProcessor(s) for HTML
* output.
*/
var PEG = require('pegjs'),
	path = require('path'),
	fs = require('fs'),
	$ = require('jquery'),
	events = require('events'),
	defines = require('./mediawiki.parser.defines.js');
function PegTokenizer() {
	var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' );
	this.src = fs.readFileSync( pegSrcPath, 'utf8' );
}
// Inherit from EventEmitter
PegTokenizer.prototype = new events.EventEmitter();
PegTokenizer.prototype.constructor = PegTokenizer;
PegTokenizer.src = false;
PegTokenizer.prototype.process = function( text ) {
	var out, err;
	if ( !this.parser ) {
		// Only create the parser once and share it via the prototype, as the
		// generated parse() keeps no per-instance state.
		var parserSource = PEG.buildParser(this.src).toSource();
		//console.warn( parserSource );
		parserSource = parserSource.replace( 'parse: function(input, startRule) {',
				'parse: function(input, startRule) { var __parseArgs = arguments;' );
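		// The source rewrite above injects `var __parseArgs = arguments;`
		// into the generated parse() function, presumably so that grammar
		// actions can reach the extra arguments (the chunk callback and the
		// tokenizer instance) passed to parse() below.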
		//console.warn( parserSource );
		PegTokenizer.prototype.parser = eval( parserSource );
		// add reference to this for event emission
		// XXX: pass a cb into parse() instead, but need to modify pegjs a bit
		// for that.
		//PegTokenizer.prototype.parser._tokenizer = undefined;
		// Print the generated parser source
		//console.warn(this.parser.toSource());
	}
	// some normalization
	if ( text.substring(text.length - 1) !== "\n" ) {
		text += "\n";
	}

	// XXX: Commented out exception handling during development to get
	// reasonable traces. Calling a trace on the exception does not really
	// cut it.
	//try {
	this.parser.parse(text, 'start',
			// callback
			this.emit.bind( this, 'chunk' ),
			// inline break test
			this
	);
	// emit tokens here until we get that to work per toplevelblock in the
	// actual tokenizer
	//this.emit('chunk', out.concat( [{ type: 'END' }] ) );
	this.emit('end');
	//} catch (e) {
	//	err = e;
	//	console.trace();
	//} finally {
	return { err: err };
	//}
};
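
// Illustrative usage sketch (not part of the original module; the input
// wikitext is made up). The tokenizer is an EventEmitter, so consumers
// subscribe to 'chunk' and 'end' and then feed it text:
//
//   var tokenizer = new PegTokenizer();
//   tokenizer.on( 'chunk', function ( tokens ) {
//       console.log( 'chunk:', JSON.stringify( tokens ) );
//   } );
//   tokenizer.on( 'end', function () {
//       console.log( 'tokenization complete' );
//   } );
//   var res = tokenizer.process( "== A heading ==\nSome ''wikitext''." );
//   if ( res.err ) {
//       console.error( res.err );
//   }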
/**
 * Map from a break character to a predicate that decides whether, in the
 * current syntactic context (syntaxFlags), that character terminates the
 * current run of inline text. Each predicate returns a truthy value to
 * break, or null to keep going.
 */
PegTokenizer.prototype.breakMap = {
	'=': function ( input, pos, syntaxFlags ) {
		return syntaxFlags.equal ||
			( syntaxFlags.h &&
				input.substr( pos + 1, 200 )
					.match( /[ \t]*[\r\n]/ ) !== null ) || null;
	},
	'|': function ( input, pos, syntaxFlags ) {
		return syntaxFlags.template ||
			( syntaxFlags.table &&
				( input[pos + 1].match( /[|}]/ ) !== null ||
					syntaxFlags.tableCellArg
				)
			) || null;
	},
	"!": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table && input[pos + 1] === "!" ||
			null;
	},
	"}": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.template && input[pos + 1] === "}" || null;
	},
	":": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.colon &&
			!syntaxFlags.extlink &&
			!syntaxFlags.linkdesc || null;
	},
	"\r": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table &&
			input[pos + 1] !== '!' &&
			input[pos + 1] !== '|' ||
			null;
	},
	"\n": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table &&
			input[pos + 1] !== '!' &&
			input[pos + 1] !== '|' ||
			null;
	},
	"]": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.extlink ||
			( syntaxFlags.linkdesc && input[pos + 1] === ']' ) ||
			null;
	},
	"<": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.pre && input.substr( pos, 6 ) === '</pre>' || null;
	}
};
PegTokenizer.prototype.inline_breaks = function ( input, pos, syntaxFlags ) {
	var breakFn = this.breakMap[ input[pos] ];
	// Characters without an entry in the break map never break inline text.
	if ( !breakFn ) {
		return null;
	}
	var res = breakFn( input, pos, syntaxFlags );
	//console.warn( 'ilb res: ' + JSON.stringify( [ res, input.substr( pos, 4 ) ] ) );
	return res;
};
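
// Examples of the break predicates in action (illustrative, with made-up
// inputs and flags; not from the original source):
//
//   var t = new PegTokenizer();
//   // '|' separates template parameters, so it breaks inside a template:
//   t.inline_breaks( 'a|b', 1, { template: true } );  // => true
//   // ...but a '|' in plain text does not break:
//   t.inline_breaks( 'a|b', 1, {} );                  // => null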
/*****************************************************************************
 * LEGACY stuff
 *
 * This is kept around as a template for the ongoing template expansion work!
 * It won't work with the token infrastructure.
 *****************************************************************************/
/**
 * @param {object} tree
 * @param {function(tree, error)} callback
 */
PegTokenizer.prototype.expandTree = function( tree, callback ) {
	var self = this;
	var subParseArray = function( listOfTrees ) {
		var content = [];
		$.each( listOfTrees, function( i, subtree ) {
			self.expandTree( subtree, function( substr, err ) {
				// Collect the expanded subtree passed to the callback.
				content.push( substr );
			});
		});
		return content;
	};
	var src;
	if ( typeof tree === "string" ) {
		callback(tree);
		return;
	}
	if ( tree.type === 'template' ) {
		// expand a template node!

		// Resolve a possibly relative link
		var templateName = this.env.resolveTitle( tree.target, 'Template' );
		this.env.fetchTemplate( tree.target, tree.params || {}, function( templateSrc, error ) {
			// @fixme should pre-parse/cache these too?
			self.parseToTree( templateSrc, function( templateTree, error ) {
				if ( error ) {
					callback({
						type: 'placeholder',
						orig: tree,
						content: [
							{
								// @fixme broken link?
								type: 'link',
								target: templateName
							}
						]
					});
				} else {
					callback({
						type: 'placeholder',
						orig: tree,
						content: self.env.expandTemplateArgs( templateTree, tree.params )
					});
				}
			});
		} );
		// Wait for async...
		return;
	}
	var out = $.extend( {}, tree ); // shallow copy; @fixme prefer a deep copy?
	if ( tree.content ) {
		out.content = subParseArray( tree.content );
	}
	callback(out);
};
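
// Shapes of the nodes the legacy expandTree code above handles, inferred
// from its property accesses (not documented in the original):
//
//   'some text'                                         // plain string: passed through
//   { type: 'template', target: 'Foo', params: { } }    // expanded via env.fetchTemplate
//   { type: 'link', target: 'Foo' }                     // error placeholder content
//   { type: 'placeholder', orig: node, content: [ ] }   // wraps expansion results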
PegTokenizer.prototype.initSource = function( callback ) {
	if ( PegTokenizer.src ) {
		callback();
	} else {
		if ( typeof parserPlaygroundPegPage !== 'undefined' ) {
			$.ajax({
				url: wgScriptPath + '/api' + wgScriptExtension,
				data: {
					format: 'json',
					action: 'query',
					prop: 'revisions',
					rvprop: 'content',
					titles: parserPlaygroundPegPage
				},
				success: function( data, xhr ) {
					$.each( data.query.pages, function( i, page ) {
						if ( page.revisions && page.revisions.length ) {
							PegTokenizer.src = page.revisions[0]['*'];
						}
					});
					callback();
				},
				dataType: 'json',
				cache: false
			});
		} else {
			$.ajax({
				url: mw.config.get('wgParserPlaygroundAssetsPath', mw.config.get('wgExtensionAssetsPath')) + '/ParserPlayground/modules/pegParser.pegjs.txt',
				success: function( data ) {
					PegTokenizer.src = data;
					callback();
				},
				dataType: 'text',
				cache: false
			});
		}
	}
};
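
// Illustrative browser-side usage (assumes the MediaWiki globals referenced
// above are available; not part of the original file):
//
//   tokenizer.initSource( function () {
//       // PegTokenizer.src now holds the grammar source.
//   } );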
if ( typeof module === "object" ) {
	module.exports.PegTokenizer = PegTokenizer;
}