A bit more documentation and naming cleanup in the tokenizer wrapper.

https://mediawiki.org/wiki/Special:Code/MediaWiki/113351
2024-09-27 12:16:51 +00:00 · 2012-03-08 09:00:45 +00:00 · 2012-03-08 09:00:45 +00:00 · b1e131d568 · 2012-03-08 09:00:45 +00:00
parent 459c4fa271
commit b1e131d568
2 changed files with 31 additions and 13 deletions
--- a/modules/parser/ext.core.LinkHandler.js
+++ b/modules/parser/ext.core.LinkHandler.js
@@ -249,10 +249,10 @@ ExternalLinkHandler.prototype.onExtLink = function ( token, manager, cb ) {
 	//console.warn('extlink href: ' + href );
 	//console.warn( 'content: ' + JSON.stringify( content, null, 2 ) );
 	// validate the href
-	if ( this.imageParser.parseURL( href ) ) {
+	if ( this.imageParser.tokenizeURL( href ) ) {
 		if ( content.length === 1 && 
 				content[0].constructor === String &&
-				this.imageParser.parseURL( content[0] ) &&
+				this.imageParser.tokenizeURL( content[0] ) &&
 				this._isImageLink( content[0] ) )
 		{
 			var src = content[0];
--- a/modules/parser/mediawiki.tokenizer.peg.js
+++ b/modules/parser/mediawiki.tokenizer.peg.js
@@ -32,16 +32,31 @@ PegTokenizer.src = false;
 */
 PegTokenizer.prototype.process = function( text ) {
 	var out, err;
-	if ( !this.parser ) {
+	if ( !this.tokenizer ) {
+		// Construct a singleton static tokenizer.
 		var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' );
 		this.src = fs.readFileSync( pegSrcPath, 'utf8' );
-		// Only create a single parser, as parse() is a static method.
-		var parserSource = PEG.buildParser(this.src).toSource();
-		//console.warn( parserSource );
-		parserSource = parserSource.replace( 'parse: function(input, startRule) {',
+		var tokenizerSource = PEG.buildParser(this.src).toSource();
+
+		/* We patch the generated source to assign the arguments array for the
+		* parse function to a function-scoped variable. We use this to pass
+		* in callbacks and other information, which can be used from actions
+		* run when matching a production. In particular, we pass in a
+		* callback called for a chunk of tokens in toplevelblock. Setting this
+		* callback per call to parse() keeps the tokenizer reentrant, so that it
+		* can be reused to expand templates while a main parse is ongoing.
+		* PEG tokenizer construction is very expensive, so having a single
+		* reentrant tokenizer is a big win.
+		*
+		* We could also make modules available to the tokenizer by prepending
+		* requires to the source.
+		*/
+		tokenizerSource = tokenizerSource.replace( 'parse: function(input, startRule) {',
 					'parse: function(input, startRule) { var __parseArgs = arguments;' );
-		//console.warn( parserSource );
-		PegTokenizer.prototype.parser = eval( parserSource );
+		//console.warn( tokenizerSource );
+		PegTokenizer.prototype.tokenizer = eval( tokenizerSource );
+		// alias the parse method
+		this.tokenizer.tokenize = this.tokenizer.parse;
 	}

 	// Some input normalization: force a trailing newline
@@ -52,7 +67,7 @@ PegTokenizer.prototype.process = function( text ) {
 	// XXX: Commented out exception handling during development to get
 	// reasonable traces.
 	//try {
-		this.parser.parse(text, 'start', 
+		this.tokenizer.tokenize(text, 'start', 
 				// callback
 				this.emit.bind( this, 'chunk' ),
 				// inline break test
@@ -68,12 +83,15 @@ PegTokenizer.prototype.process = function( text ) {
 };

 PegTokenizer.prototype.processImageOptions = function( text ) {
-		return this.parser.parse(text, 'img_options', null, this );
+		return this.tokenizer.tokenize(text, 'img_options', null, this );
 };

-PegTokenizer.prototype.parseURL = function( text ) {
+/**
+ * Tokenize a URL
+ */
+PegTokenizer.prototype.tokenizeURL = function( text ) {
 	try {
-		return this.parser.parse(text, 'url', null, this );
+		return this.tokenizer.tokenize(text, 'url', null, this );
 	} catch ( e ) {
 		return false;
 	}