mediawiki-extensions-Visual.../modules/parser/mediawiki.tokenizer.peg.js
Gabriel Wicke bf84638bc0 Add tokenizer cache and clone token state on mutation
* Added an LRU cache (using the lru-cache node module) for tokenizer output
* Mutation of nested attributes now replaces the containers. A shallow copy of
  tokens is sufficient to isolate token transformations. Need to investigate
  if we can actually get away without isolation and re-transformation for most
  ordinary tokens.

Change-Id: I9136b1d7a1fbcc538183a319d4ecaa290d616fdf
2012-04-18 14:40:47 +02:00

279 lines
7.9 KiB
JavaScript

/**
* Tokenizer for wikitext, using PEG.js and a separate PEG grammar file
* (pegTokenizer.pegjs.txt)
*
* Use along with a HTML5TreeBuilder and the DOMPostProcessor(s) for HTML
* output.
*
*/
var PEG = require('pegjs'),
path = require('path'),
LRU = require("lru-cache"),
fs = require('fs'),
$ = require('jquery'),
events = require('events'),
//colors = require('colors'),
defines = require('./mediawiki.parser.defines.js');
/**
 * Wikitext tokenizer. Instances are event emitters; the methods of this
 * file emit 'chunk' and 'end' events on them.
 *
 * @constructor
 * @param {Object} env Environment object, stored as this.env.
 * @param {boolean} canCache When truthy, an accumulator for tokenizer output
 *   (this.cacheAccum) is set up so that chunks can be collected for caching.
 */
function PegTokenizer( env, canCache ) {
    // Run the EventEmitter constructor on the instance so that every
    // tokenizer gets its own listener table. Without this, Node versions
    // that create the listener table in the constructor would share a single
    // table (living on the prototype) between all tokenizer instances.
    events.EventEmitter.call( this );
    this.env = env;
    this.canCache = canCache;
    if ( this.canCache ) {
        this.cacheAccum = { chunks: [] };
    }
}
// Inherit from EventEmitter. The prototype is created without running the
// EventEmitter constructor, so it carries no per-instance listener state.
PegTokenizer.prototype = Object.create( events.EventEmitter.prototype );
PegTokenizer.prototype.constructor = PegTokenizer;
PegTokenizer.src = false;
/*
 * The main worker. Sets up event emission ('chunk' and 'end' events).
 * Consumers are supposed to register with PegTokenizer before calling
 * process().
 *
 * On first use this builds the PEG parser from pegTokenizer.pegjs.txt and
 * stores it, together with an LRU cache for tokenizer output, on the
 * prototype.
 *
 * @param {string} text Text to tokenize. A trailing newline is appended if
 *   it is missing.
 * @param {string} cacheKey Key used to look up and store chunks in the
 *   tokenizer cache. Only used when this.canCache is set.
 * @returns {Object} An object of the form { err: ... }, for cache hits as
 *   well as for fresh tokenizer runs. Exception handling is currently
 *   disabled, so err is always undefined.
 */
PegTokenizer.prototype.process = function( text, cacheKey ) {
    // Never assigned while exception handling is disabled (see below).
    var err;
    if ( !this.tokenizer ) {
        // Construct a singleton static tokenizer.
        var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' );
        this.src = fs.readFileSync( pegSrcPath, 'utf8' );
        var tokenizerSource = PEG.buildParser(this.src).toSource();
        /* We patch the generated source to assign the arguments array for the
         * parse function to a function-scoped variable. We use this to pass
         * in callbacks and other information, which can be used from actions
         * run when matching a production. In particular, we pass in a
         * callback called for a chunk of tokens in toplevelblock. Setting this
         * callback per call to parse() keeps the tokenizer reentrant, so that it
         * can be reused to expand templates while a main parse is ongoing.
         * PEG tokenizer construction is very expensive, so having a single
         * reentrant tokenizer is a big win.
         *
         * We could also make modules available to the tokenizer by prepending
         * requires to the source.
         */
        tokenizerSource = tokenizerSource.replace( 'parse: function(input, startRule) {',
                'parse: function(input, startRule) { var __parseArgs = arguments;' );
        // NOTE(review): eval() runs the parser source generated from the
        // grammar file read above. Keep that file trusted; never feed
        // externally supplied grammar text through this path.
        PegTokenizer.prototype.tokenizer = eval( tokenizerSource );
        // alias the parse method
        this.tokenizer.tokenize = this.tokenizer.parse;
        // Also, while we are at it, create a tokenizer cache.
        PegTokenizer.prototype.cache = LRU(20);
    }

    if ( this.canCache ) {
        var maybeCached = this.cache.get(cacheKey);
        if ( maybeCached ) {
            this.env.ap( 'tokenizer cache hit for ' + cacheKey );
            for ( var i = 0, l = maybeCached.length; i < l; i++ ) {
                // emit a clone of this chunk
                this.emit('chunk', this.env.cloneTokens( maybeCached[i] ));
            }
            this.emit('end');
            // Return the same shape as a fresh tokenizer run, so callers can
            // inspect .err without first checking for a cache hit.
            return { err: err };
        } else {
            // Remember the key; onEnd() stores the accumulated chunks under it.
            this.cacheAccum.key = cacheKey;
        }
    }

    // Some input normalization: force a trailing newline
    if ( text.substring(text.length - 1) !== "\n" ) {
        text += "\n";
    }

    // XXX: Exception handling is disabled during development to get
    // reasonable traces; errors thrown by the tokenizer propagate to the
    // caller and err stays undefined.
    var chunkCB;
    if ( this.canCache ) {
        // Record a clone of each chunk for the cache before emitting it.
        chunkCB = this.onCacheChunk.bind( this );
    } else {
        chunkCB = this.emit.bind( this, 'chunk' );
    }
    this.tokenizer.tokenize(text, 'start',
            // callback
            chunkCB,
            // inline break test
            this
    );
    this.onEnd();
    return { err: err };
};
/*
 * Chunk callback used while caching is enabled: stores a clone of the chunk
 * (produced by env.cloneTokens) in the cache accumulator, then forwards the
 * original chunk to listeners as a 'chunk' event.
 *
 * @param {Array} chunk The chunk of tokens produced by the tokenizer.
 */
PegTokenizer.prototype.onCacheChunk = function ( chunk ) {
    // Keep a separate copy for the cache, so that the cached chunk is not the
    // same object as the one handed to listeners.
    var snapshot = this.env.cloneTokens( chunk );
    this.cacheAccum.chunks.push( snapshot );
    this.emit( 'chunk', chunk );
};
/*
 * Finish a tokenizer run. When caching is enabled, the accumulated chunks
 * are stored in the cache under the accumulator's key and a fresh
 * accumulator is installed. An 'end' event is emitted in either case.
 */
PegTokenizer.prototype.onEnd = function ( ) {
    if ( this.canCache ) {
        var finished = this.cacheAccum;
        this.cache.set( finished.key, finished.chunks );
        // Start over with an empty accumulator for the next run.
        this.cacheAccum = { chunks: [] };
    }
    this.emit( 'end' );
};
/**
 * Tokenize image options, using the 'img_options' start rule.
 *
 * No chunk callback is passed; the tokenizer's result is returned directly.
 * Errors thrown by the tokenizer propagate to the caller.
 *
 * @param {string} text The image option text to tokenize.
 * @returns {*} Whatever the tokenizer returns for the 'img_options' rule.
 */
PegTokenizer.prototype.processImageOptions = function( text ) {
    var tokenizer = this.tokenizer,
        startRule = 'img_options';
    return tokenizer.tokenize( text, startRule, null, this );
};
/**
 * Tokenize a URL, using the 'url' start rule.
 *
 * This is best-effort: any exception raised while tokenizing is swallowed
 * and reported as a false return value.
 *
 * @param {string} text The URL text to tokenize.
 * @returns {*} The tokenizer result, or false if tokenizing threw.
 */
PegTokenizer.prototype.tokenizeURL = function( text ) {
    var result;
    try {
        result = this.tokenizer.tokenize( text, 'url', null, this );
    } catch ( e ) {
        // Deliberately ignored: failure is signalled through the return value.
        result = false;
    }
    return result;
};
/*
 * Inline breaks, flag-enabled production which detects end positions for
 * active higher-level productions in inline and other nested productions.
 * Those inner productions are then exited, so that the outer production can
 * handle the end marker.
 *
 * @param {string} input The text being tokenized.
 * @param {number} pos Offset in input of the character to test.
 * @param {Object} stops State object providing `counters` (a map of flags /
 *   counters) and an `onStack( name )` test.
 * @returns {*} A truthy value if the character at pos ends an enclosing
 *   production, null otherwise.
 */
PegTokenizer.prototype.inline_breaks = function (input, pos, stops ) {
    var counters = stops.counters,
        // Look-ahead character; undefined when pos is the last offset.
        next = input[pos + 1];
    switch( input[pos] ) {
        case '=':
            return stops.onStack( 'equal' ) ||
                ( counters.h &&
                    input.substr( pos + 1, 200)
                    .match(/[ \t]*[\r\n]/) !== null ) || null;
        case '|':
            return counters.pipe ||
                counters.template ||
                counters.linkdesc ||
                ( stops.onStack('table') &&
                    // Guard the look-ahead: a pipe at the very end of the
                    // input has no next character to test.
                    ( ( next !== undefined && /[|}]/.test( next ) ) ||
                        counters.tableCellArg
                    )
                ) || null;
        case '{':
            // {{!}} pipe templates..
            return (
                    counters.pipe ||
                    ( stops.onStack( 'table' ) &&
                        (
                            input.substr(pos, 10) === '{{!}}{{!}}' ||
                            counters.tableCellArg
                        )
                    )
                ) && input.substr( pos, 5 ) === '{{!}}' || null;
        case "!":
            return stops.onStack( 'table' ) && next === "!" ||
                null;
        case "}":
            return counters.template && next === "}" || null;
        case ":":
            return counters.colon &&
                ! stops.onStack( 'extlink' ) &&
                ! counters.linkdesc || null;
        case "\r":
            return stops.onStack( 'table' ) &&
                input.substr(pos, 4).match(/\r\n?[!|]/) !== null ||
                null;
        case "\n":
            // Both look-ahead tests are scoped to tables, matching the "\r"
            // case. Without the inner parentheses, && binds tighter than ||
            // and a newline followed by '|' would break outside tables too.
            return ( stops.onStack( 'table' ) &&
                    ( next === '!' || next === '|' ) ) ||
                null;
        case "]":
            return stops.onStack( 'extlink' ) ||
                ( counters.linkdesc && next === ']' ) ||
                null;
        case "<":
            // The six characters compared here are the closing </pre> tag.
            return ( counters.pre && input.substr( pos, 6 ) === '</pre>' ) ||
                ( counters.noinclude && input.substr(pos, 12) === '</noinclude>' ) ||
                ( counters.includeonly && input.substr(pos, 14) === '</includeonly>' ) ||
                ( counters.onlyinclude && input.substr(pos, 14) === '</onlyinclude>' ) ||
                null;
        default:
            return null;
    }
};
// Alternate version of the above. The hash is likely faster, but the nested
// function calls seem to cancel that out.
//
// Maps a character to a test function( input, pos, syntaxFlags ). Each test
// returns a truthy value if the character at pos ends an enclosing
// production according to the flags set in syntaxFlags, and null otherwise.
PegTokenizer.prototype.breakMap = {
    '=': function(input, pos, syntaxFlags) {
        return syntaxFlags.equal ||
            ( syntaxFlags.h &&
                input.substr( pos + 1, 200)
                .match(/[ \t]*[\r\n]/) !== null ) || null;
    },
    '|': function ( input, pos, syntaxFlags ) {
        // Look-ahead character; undefined when pos is the last offset, so
        // it must be checked before it is tested against the pattern.
        var next = input[pos + 1];
        return syntaxFlags.template ||
            syntaxFlags.linkdesc ||
            ( syntaxFlags.table &&
                (
                    ( next !== undefined && /[|}]/.test( next ) ) ||
                    syntaxFlags.tableCellArg
                )
            ) || null;
    },
    "!": function ( input, pos, syntaxFlags ) {
        return syntaxFlags.table && input[pos + 1] === "!" ||
            null;
    },
    "}": function ( input, pos, syntaxFlags ) {
        return syntaxFlags.template && input[pos + 1] === "}" || null;
    },
    ":": function ( input, pos, syntaxFlags ) {
        return syntaxFlags.colon &&
            ! syntaxFlags.extlink &&
            ! syntaxFlags.linkdesc || null;
    },
    "\r": function ( input, pos, syntaxFlags ) {
        return syntaxFlags.table &&
            input.substr(pos, 4).match(/\r\n?[!|]/) !== null ||
            null;
    },
    "\n": function ( input, pos, syntaxFlags ) {
        // Both look-ahead tests are scoped to tables, matching the "\r"
        // entry. Without the inner parentheses, && binds tighter than ||
        // and a newline followed by '|' would break outside tables too.
        var next = input[pos + 1];
        return ( syntaxFlags.table &&
                ( next === '!' || next === '|' ) ) ||
            null;
    },
    "]": function ( input, pos, syntaxFlags ) {
        return syntaxFlags.extlink ||
            ( syntaxFlags.linkdesc && input[pos + 1] === ']' ) ||
            null;
    },
    "<": function ( input, pos, syntaxFlags ) {
        return syntaxFlags.pre && input.substr( pos, 6 ) === '</pre>' || null;
    }
};
/*
 * Table-driven variant of inline_breaks: dispatches on the character at pos
 * to the matching test in this.breakMap.
 *
 * @param {string} input The text being tokenized.
 * @param {number} pos Offset in input of the character to test.
 * @param {Object} syntaxFlags Flags passed through to the breakMap test.
 * @returns {*} The result of the breakMap test for the character at pos, or
 *   null if breakMap has no test for it (including pos past the end of input).
 */
PegTokenizer.prototype.inline_breaks_hash = function (input, pos, syntaxFlags ) {
    var map = this.breakMap,
        handler = map[ input[pos] ];
    // Most characters have no entry in the map; they never break.
    if ( typeof handler !== 'function' ) {
        return null;
    }
    return handler.call( map, input, pos, syntaxFlags );
};
// Export the constructor when a CommonJS-style `module` object is available.
if ( typeof module === 'object' ) {
    module.exports.PegTokenizer = PegTokenizer;
}