/** * Tokenizer for wikitext, using PEG.js and a separate PEG grammar file * (pegTokenizer.pegjs.txt) * * Use along with a HTML5TreeBuilder and the DOMPostProcessor(s) for HTML * output. * */ var PEG = require('pegjs'), path = require('path'), LRU = require("lru-cache"), fs = require('fs'), $ = require('jquery'), events = require('events'), //colors = require('colors'), defines = require('./mediawiki.parser.defines.js'); function PegTokenizer( env, canCache ) { this.env = env; this.canCache = canCache; if ( this.canCache ) { this.cacheAccum = { chunks: [] }; } } // Inherit from EventEmitter PegTokenizer.prototype = new events.EventEmitter(); PegTokenizer.prototype.constructor = PegTokenizer; PegTokenizer.src = false; /* * The main worker. Sets up event emission ('chunk' and 'end' events). * Consumers are supposed to register with PegTokenizer before calling * process(). */ PegTokenizer.prototype.process = function( text, cacheKey ) { var out, err; if ( !this.tokenizer ) { // Construct a singleton static tokenizer. var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' ); this.src = fs.readFileSync( pegSrcPath, 'utf8' ); var tokenizerSource = PEG.buildParser(this.src).toSource(); /* We patch the generated source to assign the arguments array for the * parse function to a function-scoped variable. We use this to pass * in callbacks and other information, which can be used from actions * run when matching a production. In particular, we pass in a * callback called for a chunk of tokens in toplevelblock. Setting this * callback per call to parse() keeps the tokenizer reentrant, so that it * can be reused to expand templates while a main parse is ongoing. * PEG tokenizer construction is very expensive, so having a single * reentrant tokenizer is a big win. * * We could also make modules available to the tokenizer by prepending * requires to the source. */ tokenizerSource = tokenizerSource.replace( 'parse: function(input, startRule) {', 'parse: function(input, startRule) { var __parseArgs = arguments;' ); //console.warn( tokenizerSource ); PegTokenizer.prototype.tokenizer = eval( tokenizerSource ); // alias the parse method this.tokenizer.tokenize = this.tokenizer.parse; // Also, while we are at it, create a tokenizer cache. PegTokenizer.prototype.cache = LRU(20); } if ( this.canCache ) { var maybeCached = this.cache.get(cacheKey); if ( maybeCached ) { this.env.tp( 'tokenizer cache hit for ' + cacheKey ); //console.warn( JSON.stringify( maybeCached, null, 2 ) ); for ( var i = 0, l = maybeCached.length; i < l; i++ ) { // emit a clone of this chunk this.emit('chunk', this.env.cloneTokens( maybeCached[i] )); } this.emit('end'); return; } else { this.cacheAccum.key = cacheKey; } } // Some input normalization: force a trailing newline if ( text.substring(text.length - 1) !== "\n" ) { text += "\n"; } // XXX: Commented out exception handling during development to get // reasonable traces. //try { var chunkCB; if ( this.canCache ) { chunkCB = this.onCacheChunk.bind( this ); } else { chunkCB = this.emit.bind( this, 'chunk' ); } this.tokenizer.tokenize(text, 'start', // callback chunkCB, // inline break test this ); this.onEnd(); //} catch (e) { //err = e; //console.trace(); //} finally { return { err: err }; //} }; PegTokenizer.prototype.onCacheChunk = function ( chunk ) { // make a deep copy of the chunk for now this.cacheAccum.chunks.push( chunk.slice() ); //console.warn( 'onCacheChunk: ' + this.cacheAccum.key + JSON.stringify( chunk, null, 2 ) ); this.emit('chunk', chunk); }; PegTokenizer.prototype.onEnd = function ( ) { if ( this.canCache ) { this.cache.set(this.cacheAccum.key, this.cacheAccum.chunks); // reset cacheAccum this.cacheAccum = { chunks: [] }; } this.emit('end'); }; PegTokenizer.prototype.processImageOptions = function( text ) { return this.tokenizer.tokenize(text, 'img_options', null, this ); }; /** * Tokenize a URL */ PegTokenizer.prototype.tokenizeURL = function( text ) { try { return this.tokenizer.tokenize(text, 'url', null, this ); } catch ( e ) { return false; } }; /* * Inline breaks, flag-enabled production which detects end positions for * active higher-level productions in inline and other nested productions. * Those inner productions are then exited, so that the outer production can * handle the end marker. */ PegTokenizer.prototype.inline_breaks = function (input, pos, stops ) { var counters = stops.counters; switch( input[pos] ) { case '=': return stops.onStack( 'equal' ) || ( counters.h && input.substr( pos + 1, 200) .match(/[ \t]*[\r\n]/) !== null ) || null; case '|': return counters.pipe || counters.template || counters.linkdesc || ( stops.onStack('table') && ( input[pos + 1].match(/[|}]/) !== null || counters.tableCellArg ) ) || null; case '{': // {{!}} pipe templates.. return ( counters.pipe || ( stops.onStack( 'table' ) && ( input.substr(pos, 10) === '{{!}}{{!}}' || counters.tableCellArg ) ) ) && input.substr( pos, 5 ) === '{{!}}' || null; case "!": return stops.onStack( 'table' ) && input[pos + 1] === "!" || null; case "}": return counters.template && input[pos + 1] === "}" || null; case ":": return counters.colon && ! stops.onStack( 'extlink' ) && ! counters.linkdesc || null; case "\r": return stops.onStack( 'table' ) && input.substr(pos, 4).match(/\r\n?[!|]/) !== null || null; case "\n": return stops.onStack( 'table' ) && input[pos + 1] === '!' || input[pos + 1] === '|' || null; case "]": return stops.onStack( 'extlink' ) || ( counters.linkdesc && input[pos + 1] === ']' ) || null; case "<": return ( counters.pre && input.substr( pos, 6 ) === '
' ) ||
				( counters.noinclude && input.substr(pos, 12) === '' ) ||
				( counters.includeonly && input.substr(pos, 14) === '' ) ||
				( counters.onlyinclude && input.substr(pos, 14) === '' ) ||
				null;
		default:
			return null;
	}
};

// Alternate version of the above. The hash is likely faster, but the nested
// function calls seem to cancel that out.
PegTokenizer.prototype.breakMap = {
	'=': function(input, pos, syntaxFlags) { 
		return syntaxFlags.equal ||
			( syntaxFlags.h &&
				input.substr( pos + 1, 200)
				.match(/[ \t]*[\r\n]/) !== null ) || null;
	},
	'|': function ( input, pos, syntaxFlags ) {
		return syntaxFlags.template ||
			syntaxFlags.linkdesc ||
			( syntaxFlags.table &&
				( 
					input[pos + 1].match(/[|}]/) !== null ||
					syntaxFlags.tableCellArg
				) 
			) || null;
	},
	"!": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table && input[pos + 1] === "!" ||
			null;
	},
	"}": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.template && input[pos + 1] === "}" || null;
	},
	":": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.colon &&
			! syntaxFlags.extlink &&
			! syntaxFlags.linkdesc || null;
	},
	"\r": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table &&
			input.substr(pos, 4).match(/\r\n?[!|]/) !== null ||
			null;
	},
	"\n": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table &&
			input[pos + 1] === '!' ||
			input[pos + 1] === '|' ||
			null;
	},
	"]": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.extlink ||
			( syntaxFlags.linkdesc && input[pos + 1] === ']' ) ||
			null;
	},
	"<": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.pre &&  input.substr( pos, 6 ) === '
' || null; } }; PegTokenizer.prototype.inline_breaks_hash = function (input, pos, syntaxFlags ) { return this.breakMap[ input[pos] ]( input, pos, syntaxFlags); //console.warn( 'ilbn res: ' + JSON.stringify( [ res, input.substr( pos, 4 ) ] ) ); //return res; }; if (typeof module == "object") { module.exports.PegTokenizer = PegTokenizer; }