mediawiki-extensions-Visual.../modules/parser/mediawiki.tokenizer.peg.js

/**
 * Tokenizer for wikitext, using PEG.js and a separate PEG grammar file
 * (pegTokenizer.pegjs.txt)
 *
 * Use along with an HTML5TreeBuilder and the DOMPostProcessor(s) for HTML
 * output.
 *
 */

var PEG = require('pegjs'),
	path = require('path'),
	fs = require('fs'),
	$ = require('jquery'),
	events = require('events'),
	defines = require('./mediawiki.parser.defines.js');

/**
 * Wikitext tokenizer. Inherits from EventEmitter so that results can be
 * delivered to consumers through events.
 *
 * @constructor
 */
function PegTokenizer() {
	// Run the EventEmitter constructor so that listener bookkeeping is set
	// up on this instance rather than being looked up on the prototype.
	events.EventEmitter.call( this );
}


// Inherit from EventEmitter without instantiating it: an EventEmitter
// instance used as the prototype can carry a listener table that every
// PegTokenizer would then share.
PegTokenizer.prototype = Object.create( events.EventEmitter.prototype );
PegTokenizer.prototype.constructor = PegTokenizer;

// Static placeholder. Note that process() stores the grammar source on the
// instance (this.src), not on this property.
PegTokenizer.src = false;

/*
 * The main worker. Runs the generated parser over the input; the parser is
 * handed a callback that emits 'chunk' events, and an 'end' event is emitted
 * once parse() has returned. Consumers are supposed to register with
 * PegTokenizer before calling process().
 *
 * text: the source string; a trailing newline is appended if it is missing.
 *
 * Returns an object with a single 'err' member. With the exception handling
 * disabled (see below) nothing ever assigns it, so it is always undefined.
 */
PegTokenizer.prototype.process = function( text ) {
	var err;

	if ( !this.parser ) {
		// First use: load the grammar that lives next to this module.
		this.src = fs.readFileSync(
			path.join( __dirname, 'pegTokenizer.pegjs.txt' ),
			'utf8'
		);

		// Patch the generated parser source so that parse() keeps its
		// complete argument list around under the name __parseArgs.
		var generatedSource = PEG.buildParser( this.src ).toSource(),
			patchedSource = generatedSource.replace(
				'parse: function(input, startRule) {',
				'parse: function(input, startRule) { var __parseArgs = arguments;'
			);

		// Only create a single parser, as parse() is a static method. It is
		// stored on the prototype and thus shared by all instances.
		PegTokenizer.prototype.parser = eval( patchedSource );
	}

	// Some input normalization: force a trailing newline
	if ( text.charAt( text.length - 1 ) !== "\n" ) {
		text += "\n";
	}

	// XXX: There is deliberately no try/catch around the parse during
	// development, so that exceptions propagate with reasonable traces.
	var emitChunk = this.emit.bind( this, 'chunk' );
	this.parser.parse(
		text,
		'start',
		// callback
		emitChunk,
		// inline break test
		this
	);
	this.emit( 'end' );

	return { err: err };
};


/*
 * Inline breaks, flag-enabled production which detects end positions for
 * active higher-level productions in inline and other nested productions.
 * Those inner productions are then exited, so that the outer production can
 * handle the end marker.
 *
 * input:       the string being tokenized
 * pos:         index of the character to test
 * syntaxFlags: object whose members equal, h, template, table, tableCellArg,
 *              colon, extlink, linkdesc and pre are consulted
 *
 * Returns a truthy value (true, or the value of the deciding flag) if the
 * character at pos ends an active production, and null otherwise.
 */
PegTokenizer.prototype.inline_breaks = function (input, pos, syntaxFlags ) {
	switch( input[pos] ) {
		case '=':
			// With the h flag set, look at up to 200 following characters.
			// The pattern is not anchored, so a line terminator anywhere in
			// that window counts.
			return syntaxFlags.equal ||
				( syntaxFlags.h &&
				  input.substr( pos + 1, 200)
				  .match(/[ \t]*[\r\n]/) !== null ) || null;
		case '|':
			// charAt() yields '' past the end of input, so a '|' as the very
			// last character no longer throws; it simply has no '|' or '}'
			// following it.
			return syntaxFlags.template ||
				( syntaxFlags.table &&
				  ( input.charAt( pos + 1 ).match(/[|}]/) !== null ||
					syntaxFlags.tableCellArg
				  )
				) || null;
		case "!":
			return syntaxFlags.table && input[pos + 1] === "!" ||
				null;
		case "}":
			return syntaxFlags.template && input[pos + 1] === "}" || null;
		case ":":
			return syntaxFlags.colon &&
				! syntaxFlags.extlink &&
				! syntaxFlags.linkdesc || null;
		case "\r":
			return syntaxFlags.table &&
				input.substr(pos, 4).match(/\r\n?[!|]/) !== null ||
				null;
		case "\n":
			// NOTE(review): && binds tighter than ||, so only the '!' test
			// is gated by syntaxFlags.table; a newline followed by '|' breaks
			// regardless of the flag. The "\r" case gates both characters.
			// Confirm which behaviour is intended before changing this.
			return syntaxFlags.table &&
				input[pos + 1] === '!' ||
				input[pos + 1] === '|' ||
				null;
		case "]":
			return syntaxFlags.extlink ||
				( syntaxFlags.linkdesc && input[pos + 1] === ']' ) ||
				null;
		case "<":
			return syntaxFlags.pre &&  input.substr( pos, 6 ) === '</pre>' || null;
		default:
			return null;
	}
};

// Alternate version of the above. The hash is likely faster, but the nested
// function calls seem to cancel that out.
//
// Maps a character to a function ( input, pos, syntaxFlags ) that returns a
// truthy value if that character at pos ends an active production, and null
// otherwise. Each entry mirrors the matching case of inline_breaks.
PegTokenizer.prototype.breakMap = {
	'=': function(input, pos, syntaxFlags) { 
		// With the h flag set, look at up to 200 following characters. The
		// pattern is not anchored, so a line terminator anywhere in that
		// window counts.
		return syntaxFlags.equal ||
			( syntaxFlags.h &&
			  input.substr( pos + 1, 200)
			  .match(/[ \t]*[\r\n]/) !== null ) || null;
	},
	'|': function ( input, pos, syntaxFlags ) {
		// charAt() yields '' past the end of input, so a '|' as the very
		// last character no longer throws.
		return syntaxFlags.template ||
			( syntaxFlags.table &&
			  ( input.charAt( pos + 1 ).match(/[|}]/) !== null ||
				syntaxFlags.tableCellArg
			  ) 
			) || null;
	},
	"!": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table && input[pos + 1] === "!" ||
			null;
	},
	"}": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.template && input[pos + 1] === "}" || null;
	},
	":": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.colon &&
			! syntaxFlags.extlink &&
			! syntaxFlags.linkdesc || null;
	},
	"\r": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table &&
			input.substr(pos, 4).match(/\r\n?[!|]/) !== null ||
			null;
	},
	"\n": function ( input, pos, syntaxFlags ) {
		// NOTE(review): && binds tighter than ||, so only the '!' test is
		// gated by syntaxFlags.table; a newline followed by '|' breaks
		// regardless of the flag. The "\r" entry gates both characters.
		// Confirm which behaviour is intended before changing this.
		return syntaxFlags.table &&
			input[pos + 1] === '!' ||
			input[pos + 1] === '|' ||
			null;
	},
	"]": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.extlink ||
			( syntaxFlags.linkdesc && input[pos + 1] === ']' ) ||
			null;
	},
	"<": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.pre &&  input.substr( pos, 6 ) === '</pre>' || null;
	}
};

/*
 * Hash-based variant of inline_breaks: looks up the handler registered in
 * this.breakMap for the character at pos and delegates to it.
 *
 * Returns whatever the handler returns, or null if there is no handler for
 * that character (this includes pos lying past the end of input), matching
 * the default branch of the switch-based inline_breaks.
 */
PegTokenizer.prototype.inline_breaks_hash = function (input, pos, syntaxFlags ) {
	var handler = this.breakMap[ input[pos] ];
	if ( typeof handler !== 'function' ) {
		// Not one of the break characters: calling the missing entry would
		// throw, so report "no break" instead.
		return null;
	}
	// Keep breakMap as the receiver, as with the direct member call.
	return handler.call( this.breakMap, input, pos, syntaxFlags );
};


// Export the tokenizer when a CommonJS-style 'module' object is available.
if ( typeof module === "object" ) {
	module.exports.PegTokenizer = PegTokenizer;
}
Copy several of the experimental JS parser bits from ParserPlayground to VisualEditor. They'll need retooling to hook up with the wikidom stuff. 2011-11-02 21:07:51 +00:00			`/**`
A bit of comment clean-up and wrapping of tree building into try/catch block to actually count failures. 2011-12-08 11:40:59 +00:00			`* Tokenizer for wikitext, using PEG.js and a separate PEG grammar file`
			`* (pegTokenizer.pegjs.txt)`
Copy several of the experimental JS parser bits from ParserPlayground to VisualEditor. They'll need retooling to hook up with the wikidom stuff. 2011-11-02 21:07:51 +00:00			`*`
A bit of comment clean-up and wrapping of tree building into try/catch block to actually count failures. 2011-12-08 11:40:59 +00:00			`* Use along with a HTML5TreeBuilder and the DOMPostProcessor(s) for HTML`
			`* output.`
Remove some more unused code and tidy up some more. 2012-02-21 18:26:40 +00:00			`*`
Copy several of the experimental JS parser bits from ParserPlayground to VisualEditor. They'll need retooling to hook up with the wikidom stuff. 2011-11-02 21:07:51 +00:00			`*/`
Refactor parserTests somewhat into a class-like structure, and wire up the TokenTransformer. 2011-12-12 14:03:54 +00:00
Remove env and load grammar in tokenizer constructor. Re-add property hack to keep parserTests running for now. Really need a different pipeline for html serialization or a reference to the HTML DOM. 2011-12-28 17:04:16 +00:00			`var PEG = require('pegjs'),`
			`path = require('path'),`
Land big TokenTransformDispatcher and eventization refactoring. The TokenTransformDispatcher now actually implements an asynchronous, phased token transformation framework as described in https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations. Additionally, the parser pipeline is now mostly held together using events. The tokenizer still emits a lame single events with all tokens, as block-level emission failed with scoping issues specific to the PEGJS parser generator. All stages clean up when receiving the end tokens, so that the full pipeline can be used for repeated parsing. The QuoteTransformer is not yet 100% fixed to work with the new interface, and the Cite extension is disabled for now pending adaptation. Bold-italic related tests are failing currently. 2012-01-03 18:44:31 +00:00			`fs = require('fs'),`
Rename ParseThingy to ParserPipeline and fix up broken WikiDom generation and commandline runner. 2012-01-04 08:39:45 +00:00			`$ = require('jquery'),`
Change token format to plain strings for text tokens, and specific objects for other tokens. This is only the first half of the conversion. The next step is to drop the type attribute on most tokens and match on the constructor in the token transform machinery. 2012-02-01 16:30:43 +00:00			`events = require('events'),`
			`defines = require('./mediawiki.parser.defines.js');`
Refactor parserTests somewhat into a class-like structure, and wire up the TokenTransformer. 2011-12-12 14:03:54 +00:00
Remove env and load grammar in tokenizer constructor. Re-add property hack to keep parserTests running for now. Really need a different pipeline for html serialization or a reference to the HTML DOM. 2011-12-28 17:04:16 +00:00			`function PegTokenizer() {`
Copy several of the experimental JS parser bits from ParserPlayground to VisualEditor. They'll need retooling to hook up with the wikidom stuff. 2011-11-02 21:07:51 +00:00			`}`

Change token format to plain strings for text tokens, and specific objects for other tokens. This is only the first half of the conversion. The next step is to drop the type attribute on most tokens and match on the constructor in the token transform machinery. 2012-02-01 16:30:43 +00:00

Land big TokenTransformDispatcher and eventization refactoring. The TokenTransformDispatcher now actually implements an asynchronous, phased token transformation framework as described in https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations. Additionally, the parser pipeline is now mostly held together using events. The tokenizer still emits a lame single events with all tokens, as block-level emission failed with scoping issues specific to the PEGJS parser generator. All stages clean up when receiving the end tokens, so that the full pipeline can be used for repeated parsing. The QuoteTransformer is not yet 100% fixed to work with the new interface, and the Cite extension is disabled for now pending adaptation. Bold-italic related tests are failing currently. 2012-01-03 18:44:31 +00:00			`// Inherit from EventEmitter`
			`PegTokenizer.prototype = new events.EventEmitter();`
Fix up constructors in EventEmitter inheritance and tweak a few more comments. 2012-01-04 12:28:41 +00:00			`PegTokenizer.prototype.constructor = PegTokenizer;`
Land big TokenTransformDispatcher and eventization refactoring. The TokenTransformDispatcher now actually implements an asynchronous, phased token transformation framework as described in https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations. Additionally, the parser pipeline is now mostly held together using events. The tokenizer still emits a lame single events with all tokens, as block-level emission failed with scoping issues specific to the PEGJS parser generator. All stages clean up when receiving the end tokens, so that the full pipeline can be used for repeated parsing. The QuoteTransformer is not yet 100% fixed to work with the new interface, and the Cite extension is disabled for now pending adaptation. Bold-italic related tests are failing currently. 2012-01-03 18:44:31 +00:00
Further renaming, this time from pegParser to pegTokenizer. 2011-12-08 10:59:44 +00:00			`PegTokenizer.src = false;`
Copy several of the experimental JS parser bits from ParserPlayground to VisualEditor. They'll need retooling to hook up with the wikidom stuff. 2011-11-02 21:07:51 +00:00
Remove some more unused code and tidy up some more. 2012-02-21 18:26:40 +00:00			`/*`
			`* The main worker. Sets up event emission ('chunk' and 'end' events).`
			`* Consumers are supposed to register with PegTokenizer before calling`
			`* process().`
			`*/`
A bit of cleanup in ParserPipeline, with better and more consistent support for multiple input types. 2012-01-09 19:33:49 +00:00			`PegTokenizer.prototype.process = function( text ) {`
Refactor parserTests somewhat into a class-like structure, and wire up the TokenTransformer. 2011-12-12 14:03:54 +00:00			`var out, err;`
			`if ( !this.parser ) {`
Add productions for image option tokenization, and prepare to call those from the LinkHandler token stream transformer. 2012-03-01 18:07:20 +00:00			`var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' );`
			`this.src = fs.readFileSync( pegSrcPath, 'utf8' );`
A bit of cleanup in ParserPipeline, with better and more consistent support for multiple input types. 2012-01-09 19:33:49 +00:00			`// Only create a single parser, as parse() is a static method.`
* Emit token chunks for top-level block elements by patching the source of the tokenizer * Fix a bug uncovered by this * Increase the number of outstanding listeners on a single download to 10000 2012-01-22 23:21:53 +00:00			`var parserSource = PEG.buildParser(this.src).toSource();`
Replace console.log with console.warn in all debug statements 2012-02-14 20:56:14 +00:00			`//console.warn( parserSource );`
* Emit token chunks for top-level block elements by patching the source of the tokenizer * Fix a bug uncovered by this * Increase the number of outstanding listeners on a single download to 10000 2012-01-22 23:21:53 +00:00			`parserSource = parserSource.replace( 'parse: function(input, startRule) {',`
			`'parse: function(input, startRule) { var __parseArgs = arguments;' );`
Replace console.log with console.warn in all debug statements 2012-02-14 20:56:14 +00:00			`//console.warn( parserSource );`
* Emit token chunks for top-level block elements by patching the source of the tokenizer * Fix a bug uncovered by this * Increase the number of outstanding listeners on a single download to 10000 2012-01-22 23:21:53 +00:00			`PegTokenizer.prototype.parser = eval( parserSource );`
Refactor parserTests somewhat into a class-like structure, and wire up the TokenTransformer. 2011-12-12 14:03:54 +00:00			`}`
remove need to add newline at end of input 2011-12-28 01:37:11 +00:00
Remove some more unused code and tidy up some more. 2012-02-21 18:26:40 +00:00			`// Some input normalization: force a trailing newline`
fix substr for IE, followup r107464 2011-12-30 21:51:03 +00:00			`if ( text.substring(text.length - 1) !== "\n" ) {`
remove need to add newline at end of input 2011-12-28 01:37:11 +00:00			`text += "\n";`
			`}`

Land big TokenTransformDispatcher and eventization refactoring. The TokenTransformDispatcher now actually implements an asynchronous, phased token transformation framework as described in https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations. Additionally, the parser pipeline is now mostly held together using events. The tokenizer still emits a lame single events with all tokens, as block-level emission failed with scoping issues specific to the PEGJS parser generator. All stages clean up when receiving the end tokens, so that the full pipeline can be used for repeated parsing. The QuoteTransformer is not yet 100% fixed to work with the new interface, and the Cite extension is disabled for now pending adaptation. Bold-italic related tests are failing currently. 2012-01-03 18:44:31 +00:00			`// XXX: Commented out exception handling during development to get`
Remove some more unused code and tidy up some more. 2012-02-21 18:26:40 +00:00			`// reasonable traces.`
Land big TokenTransformDispatcher and eventization refactoring. The TokenTransformDispatcher now actually implements an asynchronous, phased token transformation framework as described in https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations. Additionally, the parser pipeline is now mostly held together using events. The tokenizer still emits a lame single events with all tokens, as block-level emission failed with scoping issues specific to the PEGJS parser generator. All stages clean up when receiving the end tokens, so that the full pipeline can be used for repeated parsing. The QuoteTransformer is not yet 100% fixed to work with the new interface, and the Cite extension is disabled for now pending adaptation. Bold-italic related tests are failing currently. 2012-01-03 18:44:31 +00:00			`//try {`
Tidy up and comment the tokenizer a bit more. Start to move code into mediawiki.tokenizer.js module, and pass a reference to parse(). Faster inline_breaks production using a JS function which seems to be generally correct, but still breaks five tests when enabled. Seems to be some weird interaction with peg.js, possibly something to do with caching. 2012-02-21 17:21:42 +00:00			`this.parser.parse(text, 'start',`
			`// callback`
			`this.emit.bind( this, 'chunk' ),`
			`// inline break test`
			`this`
			`);`
Land big TokenTransformDispatcher and eventization refactoring. The TokenTransformDispatcher now actually implements an asynchronous, phased token transformation framework as described in https://www.mediawiki.org/wiki/Future/Parser_development/Token_stream_transformations. Additionally, the parser pipeline is now mostly held together using events. The tokenizer still emits a lame single events with all tokens, as block-level emission failed with scoping issues specific to the PEGJS parser generator. All stages clean up when receiving the end tokens, so that the full pipeline can be used for repeated parsing. The QuoteTransformer is not yet 100% fixed to work with the new interface, and the Cite extension is disabled for now pending adaptation. Bold-italic related tests are failing currently. 2012-01-03 18:44:31 +00:00			`this.emit('end');`
			`//} catch (e) {`
			`//err = e;`
			`//console.trace();`
			`//} finally {`
			`return { err: err };`
			`//}`
			`};`
put add terminal token inside tokenize method (will pull it out again for streaming interface) 2011-12-28 01:37:15 +00:00
Remove some more unused code and tidy up some more. 2012-02-21 18:26:40 +00:00
			`/*`
			`* Inline breaks, flag-enabled production which detects end positions for`
			`* active higher-level productions in inline and other nested productions.`
			`* Those inner productions are then exited, so that the outer production can`
			`* handle the end marker.`
			`*/`
			`PegTokenizer.prototype.inline_breaks = function (input, pos, syntaxFlags ) {`
			`switch( input[pos] ) {`
			`case '=':`
			`return syntaxFlags.equal \|\|`
			`( syntaxFlags.h &&`
			`input.substr( pos + 1, 200)`
			`.match(/[ \t]*[\r\n]/) !== null ) \|\| null;`
			`case '\|':`
			`return syntaxFlags.template \|\|`
			`( syntaxFlags.table &&`
			`( input[pos + 1].match(/[\|}]/) !== null \|\|`
			`syntaxFlags.tableCellArg`
			`)`
			`) \|\| null;`
			`case "!":`
			`return syntaxFlags.table && input[pos + 1] === "!" \|\|`
			`null;`
			`case "}":`
			`return syntaxFlags.template && input[pos + 1] === "}" \|\| null;`
			`case ":":`
			`return syntaxFlags.colon &&`
			`! syntaxFlags.extlink &&`
			`! syntaxFlags.linkdesc \|\| null;`
			`case "\r":`
			`return syntaxFlags.table &&`
			`input.substr(pos, 4).match(/\r\n?[!\|]/) !== null \|\|`
			`null;`
			`case "\n":`
			`return syntaxFlags.table &&`
			`input[pos + 1] === '!' \|\|`
			`input[pos + 1] === '\|' \|\|`
			`null;`
			`case "]":`
			`return syntaxFlags.extlink \|\|`
			`( syntaxFlags.linkdesc && input[pos + 1] === ']' ) \|\|`
			`null;`
			`case "<":`
			`return syntaxFlags.pre && input.substr( pos, 6 ) === '</pre>' \|\| null;`
			`default:`
			`return null;`
			`}`
			`};`

			`// Alternate version of the above. The hash is likely faster, but the nested`
			`// function calls seem to cancel that out.`
Tidy up and comment the tokenizer a bit more. Start to move code into mediawiki.tokenizer.js module, and pass a reference to parse(). Faster inline_breaks production using a JS function which seems to be generally correct, but still breaks five tests when enabled. Seems to be some weird interaction with peg.js, possibly something to do with caching. 2012-02-21 17:21:42 +00:00			`PegTokenizer.prototype.breakMap = {`
			`'=': function(input, pos, syntaxFlags) {`
			`return syntaxFlags.equal \|\|`
			`( syntaxFlags.h &&`
			`input.substr( pos + 1, 200)`
			`.match(/[ \t]*[\r\n]/) !== null ) \|\| null;`
			`},`
			`'\|': function ( input, pos, syntaxFlags ) {`
			`return syntaxFlags.template \|\|`
			`( syntaxFlags.table &&`
			`( input[pos + 1].match(/[\|}]/) !== null \|\|`
			`syntaxFlags.tableCellArg`
			`)`
			`) \|\| null;`
			`},`
			`"!": function ( input, pos, syntaxFlags ) {`
			`return syntaxFlags.table && input[pos + 1] === "!" \|\|`
			`null;`
			`},`
			`"}": function ( input, pos, syntaxFlags ) {`
			`return syntaxFlags.template && input[pos + 1] === "}" \|\| null;`
			`},`
			`":": function ( input, pos, syntaxFlags ) {`
			`return syntaxFlags.colon &&`
			`! syntaxFlags.extlink &&`
			`! syntaxFlags.linkdesc \|\| null;`
			`},`
			`"\r": function ( input, pos, syntaxFlags ) {`
			`return syntaxFlags.table &&`
Fix the bug in the inline_breaks replacement, and write another switch-based version, which is slightly faster and shorter. Performance is improved by about 5% for parserTests. 2012-02-21 17:57:30 +00:00			`input.substr(pos, 4).match(/\r\n?[!\|]/) !== null \|\|`
Tidy up and comment the tokenizer a bit more. Start to move code into mediawiki.tokenizer.js module, and pass a reference to parse(). Faster inline_breaks production using a JS function which seems to be generally correct, but still breaks five tests when enabled. Seems to be some weird interaction with peg.js, possibly something to do with caching. 2012-02-21 17:21:42 +00:00			`null;`
			`},`
			`"\n": function ( input, pos, syntaxFlags ) {`
			`return syntaxFlags.table &&`
Fix the bug in the inline_breaks replacement, and write another switch-based version, which is slightly faster and shorter. Performance is improved by about 5% for parserTests. 2012-02-21 17:57:30 +00:00			`input[pos + 1] === '!' \|\|`
			`input[pos + 1] === '\|' \|\|`
Tidy up and comment the tokenizer a bit more. Start to move code into mediawiki.tokenizer.js module, and pass a reference to parse(). Faster inline_breaks production using a JS function which seems to be generally correct, but still breaks five tests when enabled. Seems to be some weird interaction with peg.js, possibly something to do with caching. 2012-02-21 17:21:42 +00:00			`null;`
			`},`
			`"]": function ( input, pos, syntaxFlags ) {`
			`return syntaxFlags.extlink \|\|`
			`( syntaxFlags.linkdesc && input[pos + 1] === ']' ) \|\|`
			`null;`
			`},`
			`"<": function ( input, pos, syntaxFlags ) {`
			`return syntaxFlags.pre && input.substr( pos, 6 ) === '</pre>' \|\| null;`
			`}`
			`};`

Remove some more unused code and tidy up some more. 2012-02-21 18:26:40 +00:00			`PegTokenizer.prototype.inline_breaks_hash = function (input, pos, syntaxFlags ) {`
Fix the bug in the inline_breaks replacement, and write another switch-based version, which is slightly faster and shorter. Performance is improved by about 5% for parserTests. 2012-02-21 17:57:30 +00:00			`return this.breakMap[ input[pos] ]( input, pos, syntaxFlags);`
			`//console.warn( 'ilbn res: ' + JSON.stringify( [ res, input.substr( pos, 4 ) ] ) );`
			`//return res;`
			`};`

Copy several of the experimental JS parser bits from ParserPlayground to VisualEditor. They'll need retooling to hook up with the wikidom stuff. 2011-11-02 21:07:51 +00:00
Tidy up and comment the tokenizer a bit more. Start to move code into mediawiki.tokenizer.js module, and pass a reference to parse(). Faster inline_breaks production using a JS function which seems to be generally correct, but still breaks five tests when enabled. Seems to be some weird interaction with peg.js, possibly something to do with caching. 2012-02-21 17:21:42 +00:00
Copy several of the experimental JS parser bits from ParserPlayground to VisualEditor. They'll need retooling to hook up with the wikidom stuff. 2011-11-02 21:07:51 +00:00			`if (typeof module == "object") {`
Further renaming, this time from pegParser to pegTokenizer. 2011-12-08 10:59:44 +00:00			`module.exports.PegTokenizer = PegTokenizer;`
Copy several of the experimental JS parser bits from ParserPlayground to VisualEditor. They'll need retooling to hook up with the wikidom stuff. 2011-11-02 21:07:51 +00:00			`}`