mediawiki-extensions-Visual.../modules/parser/mediawiki.tokenizer.peg.js
Gabriel Wicke 3be4992782 'Obama finally expands' ;) Misc fixes and documentation updates
* [[:en:Barack Obama]] can now be expanded in 77 seconds using 330MB RAM,
  while it would previously run out of RAM after ~30 minutes. Wohoooo!
  The token transform framework rework really paid off.
* 303 parser tests are passing in the new record time of 5.5 seconds. Two more
  tests are passing since these tests expect the day of the week to be
  Thursday.  Won't be the case tomorrow.

Change-Id: I56e850838476b546df10c6a239c8c9e29a1a3136
2012-04-26 18:18:08 +02:00

279 lines
8 KiB
JavaScript

/**
* Tokenizer for wikitext, using PEG.js and a separate PEG grammar file
* (pegTokenizer.pegjs.txt)
*
* Use along with a HTML5TreeBuilder and the DOMPostProcessor(s) for HTML
* output.
*
*/
var PEG = require('pegjs'),
path = require('path'),
LRU = require("lru-cache"),
fs = require('fs'),
$ = require('jquery'),
events = require('events'),
//colors = require('colors'),
defines = require('./mediawiki.parser.defines.js');
/**
 * Wikitext tokenizer front-end. Emits 'chunk' and 'end' events while
 * tokenizing.
 *
 * @param {Object} env Parser environment; process() calls env.tp() and
 *   env.cloneTokens() on it.
 * @param {boolean} canCache When truthy, token chunks produced by process()
 *   are accumulated in this.cacheAccum so they can be stored in the cache.
 */
function PegTokenizer( env, canCache ) {
	// The prototype of PegTokenizer is itself an EventEmitter instance. Run
	// the EventEmitter initialisation on this object so that it gets its own
	// listener table; otherwise listeners added to one tokenizer are stored
	// on the shared prototype and fire for every tokenizer.
	events.EventEmitter.call( this );
	this.env = env;
	this.canCache = canCache;
	if ( this.canCache ) {
		// Chunks collected during the current process() run.
		this.cacheAccum = { chunks: [] };
	}
}
// Inherit from EventEmitter: a single EventEmitter instance is used as the
// prototype, so every PegTokenizer gets on()/emit() through it.
PegTokenizer.prototype = new events.EventEmitter();
// Replacing the prototype object drops the default `constructor` link;
// restore it.
PegTokenizer.prototype.constructor = PegTokenizer;
// Static placeholder for the grammar source.
// NOTE(review): process() stores the loaded grammar on `this.src` (the
// instance), so nothing in this file appears to update this static - confirm
// whether it is still needed.
PegTokenizer.src = false;
/*
 * The main worker. Sets up event emission ('chunk' and 'end' events).
 * Consumers are supposed to register with PegTokenizer before calling
 * process().
 *
 * @param {string} text Wikitext to tokenize. A trailing newline is appended
 *   if it is missing.
 * @param {string} [cacheKey] Key under which the emitted chunks are stored
 *   in / replayed from the shared cache. Only used when this.canCache is set.
 *
 * Errors thrown while tokenizing are logged, not rethrown. Note that 'end' is
 * not emitted in that case.
 */
PegTokenizer.prototype.process = function( text, cacheKey ) {
	if ( !this.tokenizer ) {
		// Construct a singleton static tokenizer.
		var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' );
		this.src = fs.readFileSync( pegSrcPath, 'utf8' );
		var tokenizerSource = PEG.buildParser(this.src,
				{ cache: true, trackLineAndColumn: false }).toSource();

		/* We patch the generated source to assign the arguments array for the
		 * parse function to a function-scoped variable. We use this to pass
		 * in callbacks and other information, which can be used from actions
		 * run when matching a production. In particular, we pass in a
		 * callback called for a chunk of tokens in toplevelblock. Setting this
		 * callback per call to parse() keeps the tokenizer reentrant, so that it
		 * can be reused to expand templates while a main parse is ongoing.
		 * PEG tokenizer construction is very expensive, so having a single
		 * reentrant tokenizer is a big win.
		 *
		 * We could also make modules available to the tokenizer by prepending
		 * requires to the source.
		 */
		tokenizerSource = tokenizerSource.replace( 'parse: function(input, startRule) {',
				'parse: function(input, startRule) { var __parseArgs = arguments;' );

		// The evaluated source is generated from the grammar file shipped next
		// to this module, not from user input.
		PegTokenizer.prototype.tokenizer = eval( tokenizerSource );
		// alias the parse method
		this.tokenizer.tokenize = this.tokenizer.parse;

		// Also, while we are at it, create a tokenizer cache. It lives on the
		// prototype and is therefore shared by all instances.
		PegTokenizer.prototype.cache = LRU(20);
	}

	if ( this.canCache ) {
		var maybeCached = this.cache.get(cacheKey);
		if ( maybeCached ) {
			this.env.tp( 'tokenizer cache hit for ' + cacheKey );
			for ( var i = 0, l = maybeCached.length; i < l; i++ ) {
				// emit a clone of this chunk so that consumers cannot modify
				// the cached copy
				this.emit('chunk', this.env.cloneTokens( maybeCached[i] ));
			}
			this.emit('end');
			return;
		} else {
			// Remember the key; onEnd() stores the accumulated chunks under it.
			this.cacheAccum.key = cacheKey;
		}
	}

	// Some input normalization: force a trailing newline
	if ( text.substring(text.length - 1) !== "\n" ) {
		text += "\n";
	}

	// Tokenizer errors are caught and logged so that a broken input does not
	// take down the caller.
	try {
		var chunkCB;
		if ( this.canCache ) {
			chunkCB = this.onCacheChunk.bind( this );
		} else {
			chunkCB = this.emit.bind( this, 'chunk' );
		}
		this.tokenizer.tokenize(text, 'start',
				// callback
				chunkCB,
				// inline break test
				this
				);
		this.onEnd();
	} catch (e) {
		if ( this.canCache ) {
			// Discard the chunks collected by the failed run. Without this
			// they would be prepended to the chunks of the next process()
			// call and cached under that call's key.
			this.cacheAccum = { chunks: [] };
		}
		console.warn( e );
		console.trace();
	}
};
/**
 * Chunk callback used when caching is enabled: remembers the chunk for the
 * cache, then forwards it to the 'chunk' listeners.
 *
 * @param {Array} chunk Tokens produced by the tokenizer.
 */
PegTokenizer.prototype.onCacheChunk = function ( chunk ) {
	// Array#slice gives a shallow copy: a new array holding the same token
	// objects.
	var snapshot = chunk.slice();
	this.cacheAccum.chunks.push( snapshot );
	this.emit( 'chunk', chunk );
};
/**
 * Finish a tokenizer run: store the accumulated chunks in the cache (when
 * caching is enabled), start a fresh accumulator and emit 'end'.
 */
PegTokenizer.prototype.onEnd = function ( ) {
	if ( this.canCache ) {
		var finished = this.cacheAccum;
		this.cache.set( finished.key, finished.chunks );
		// start over with an empty accumulator for the next run
		this.cacheAccum = { chunks: [] };
	}
	this.emit( 'end' );
};
/**
 * Tokenize image options using the 'img_options' start rule.
 *
 * No chunk callback is passed. Unlike tokenizeURL(), exceptions thrown by the
 * tokenizer propagate to the caller.
 *
 * @param {string} text Image option source text.
 * @return {Mixed} Whatever the tokenizer returns for the 'img_options' rule.
 */
PegTokenizer.prototype.processImageOptions = function( text ) {
	var tokenizer = this.tokenizer;
	return tokenizer.tokenize( text, 'img_options', null, this );
};
/**
 * Tokenize a URL using the 'url' start rule.
 *
 * @param {string} text Candidate URL.
 * @return {Mixed} The tokenizer result, or false if tokenizing threw. This
 *   includes the case where this.tokenizer has not been built yet, since the
 *   resulting TypeError is caught as well.
 */
PegTokenizer.prototype.tokenizeURL = function( text ) {
	var result;
	try {
		result = this.tokenizer.tokenize( text, 'url', null, this );
	} catch ( e ) {
		result = false;
	}
	return result;
};
/*
 * Inline breaks, flag-enabled production which detects end positions for
 * active higher-level productions in inline and other nested productions.
 * Those inner productions are then exited, so that the outer production can
 * handle the end marker.
 *
 * @param {string} input The full input being tokenized.
 * @param {number} pos Offset of the character to test.
 * @param {Object} stops Syntax state; this function reads stops.counters
 *   (flag/counter properties) and calls stops.onStack( name ).
 * @return {Mixed} A truthy value if an enclosing production ends at pos,
 *   null otherwise.
 */
PegTokenizer.prototype.inline_breaks = function (input, pos, stops ) {
	var counters = stops.counters,
		// Character after pos; undefined when pos is the last character.
		next = input[pos + 1];
	switch( input[pos] ) {
		case '=':
			// NOTE(review): the regexp is not anchored, so any line break
			// within the next 200 characters satisfies it - confirm whether
			// /^[ \t]*[\r\n]/ was intended.
			return stops.onStack( 'equal' ) ||
				( counters.h &&
					input.substr( pos + 1, 200)
					.match(/[ \t]*[\r\n]/) !== null ) || null;
		case '|':
			// Plain comparisons instead of next.match(): a '|' at the very
			// end of the input has no next character to call .match() on.
			return counters.pipe ||
				counters.template ||
				counters.linkdesc ||
				( stops.onStack('table') &&
					( next === '|' ||
						next === '}' ||
						counters.tableCellArg
					)
				) || null;
		case '{':
			// {{!}} pipe templates..
			return (
					counters.pipe ||
					( stops.onStack( 'table' ) &&
						(
							input.substr(pos, 10) === '{{!}}{{!}}' ||
							counters.tableCellArg
						)
					)
				) && input.substr( pos, 5 ) === '{{!}}' || null;
		case "!":
			return stops.onStack( 'table' ) && next === "!" ||
				null;
		case "}":
			return counters.template && next === "}" || null;
		case ":":
			return counters.colon &&
				! stops.onStack( 'extlink' ) &&
				! counters.linkdesc || null;
		case "\r":
			return stops.onStack( 'table' ) &&
				input.substr(pos, 4).match(/\r\n?[!|]/) !== null ||
				null;
		case "\n":
			// Both table markers only count inside a table, as in the "\r"
			// case. The parentheses matter: && binds tighter than ||, so
			// without them "\n|" would break outside of tables too.
			return stops.onStack( 'table' ) &&
				( next === '!' || next === '|' ) ||
				null;
		case "]":
			return stops.onStack( 'extlink' ) ||
				( counters.linkdesc && next === ']' ) ||
				null;
		case "<":
			// Each alternative looks for the closing tag of an active
			// production; the substr length equals the tag length.
			return ( counters.pre && input.substr( pos, 6 ) === '</pre>' ) ||
				( counters.noinclude && input.substr(pos, 12) === '</noinclude>' ) ||
				( counters.includeonly && input.substr(pos, 14) === '</includeonly>' ) ||
				( counters.onlyinclude && input.substr(pos, 14) === '</onlyinclude>' ) ||
				null;
		default:
			return null;
	}
};
// Alternate version of the above. The hash is likely faster, but the nested
// function calls seem to cancel that out.
//
// Maps a character to a test function( input, pos, syntaxFlags ) which
// returns a truthy value if an enclosing production ends at pos, and null
// otherwise. syntaxFlags is read as a plain object of flags.
PegTokenizer.prototype.breakMap = {
	'=': function(input, pos, syntaxFlags) {
		return syntaxFlags.equal ||
			( syntaxFlags.h &&
				input.substr( pos + 1, 200)
				.match(/[ \t]*[\r\n]/) !== null ) || null;
	},
	'|': function ( input, pos, syntaxFlags ) {
		// Plain comparisons instead of .match(): a '|' at the very end of
		// the input has no next character to call .match() on.
		var next = input[pos + 1];
		return syntaxFlags.template ||
			syntaxFlags.linkdesc ||
			( syntaxFlags.table &&
				(
					next === '|' ||
					next === '}' ||
					syntaxFlags.tableCellArg
				)
			) || null;
	},
	"!": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table && input[pos + 1] === "!" ||
			null;
	},
	"}": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.template && input[pos + 1] === "}" || null;
	},
	":": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.colon &&
			! syntaxFlags.extlink &&
			! syntaxFlags.linkdesc || null;
	},
	"\r": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table &&
			input.substr(pos, 4).match(/\r\n?[!|]/) !== null ||
			null;
	},
	"\n": function ( input, pos, syntaxFlags ) {
		// Both table markers only count inside a table, as in the "\r"
		// entry. The parentheses matter: && binds tighter than ||.
		var next = input[pos + 1];
		return syntaxFlags.table &&
			( next === '!' || next === '|' ) ||
			null;
	},
	"]": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.extlink ||
			( syntaxFlags.linkdesc && input[pos + 1] === ']' ) ||
			null;
	},
	"<": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.pre && input.substr( pos, 6 ) === '</pre>' || null;
	}
};
/**
 * Hash-based variant of inline_breaks: looks up the test for the character
 * at pos in this.breakMap and runs it.
 *
 * @param {string} input The full input being tokenized.
 * @param {number} pos Offset of the character to test.
 * @param {Object} syntaxFlags Flags passed through to the breakMap test.
 * @return {Mixed} The result of the breakMap test, or null if there is no
 *   test for the character at pos (matching the default case of
 *   inline_breaks).
 */
PegTokenizer.prototype.inline_breaks_hash = function (input, pos, syntaxFlags ) {
	var handler = this.breakMap[ input[pos] ];
	if ( typeof handler !== 'function' ) {
		// Most characters have no entry; calling the missing entry would
		// throw a TypeError.
		return null;
	}
	// Keep breakMap as the receiver, as a direct this.breakMap[c]( ... ) call
	// would.
	return handler.call( this.breakMap, input, pos, syntaxFlags );
};
// Export the constructor when a CommonJS `module` object is available
// (Node.js); do nothing otherwise.
if ( typeof module === 'object' ) {
	module.exports.PegTokenizer = PegTokenizer;
}