2011-11-02 21:07:51 +00:00
|
|
|
/**
|
2011-12-08 11:40:59 +00:00
|
|
|
* Tokenizer for wikitext, using PEG.js and a separate PEG grammar file
|
|
|
|
* (pegTokenizer.pegjs.txt)
|
2011-11-02 21:07:51 +00:00
|
|
|
*
|
2011-12-08 11:40:59 +00:00
|
|
|
* Use along with a HTML5TreeBuilder and the DOMPostProcessor(s) for HTML
|
|
|
|
* output.
|
2012-02-21 18:26:40 +00:00
|
|
|
*
|
2011-11-02 21:07:51 +00:00
|
|
|
*/
|
2011-12-12 14:03:54 +00:00
|
|
|
|
2011-12-28 17:04:16 +00:00
|
|
|
var PEG = require('pegjs'),
|
|
|
|
path = require('path'),
|
2012-01-03 18:44:31 +00:00
|
|
|
fs = require('fs'),
|
2012-01-04 08:39:45 +00:00
|
|
|
$ = require('jquery'),
|
2012-02-01 16:30:43 +00:00
|
|
|
events = require('events'),
|
|
|
|
defines = require('./mediawiki.parser.defines.js');
|
2011-12-12 14:03:54 +00:00
|
|
|
|
2011-12-28 17:04:16 +00:00
|
|
|
/**
 * Tokenizer constructor. Emits 'chunk' and 'end' events while parsing;
 * consumers register listeners before calling process().
 */
function PegTokenizer() {
	// Initialize per-instance emitter state (listener storage) so that
	// listeners are never shared between tokenizer instances.
	events.EventEmitter.call( this );
}

// Inherit from EventEmitter.
// Use Object.create instead of `new events.EventEmitter()`: placing a
// *constructed* emitter on the prototype lets the emitter's internal
// listener storage be reached through the prototype chain, so a listener
// added to one tokenizer could fire on (or be visible from) every other
// tokenizer instance.
PegTokenizer.prototype = Object.create( events.EventEmitter.prototype );
PegTokenizer.prototype.constructor = PegTokenizer;

// Grammar source cache flag; the source is read lazily in process().
PegTokenizer.src = false;
|
2011-11-02 21:07:51 +00:00
|
|
|
|
2012-02-21 18:26:40 +00:00
|
|
|
/*
 * The main worker. Sets up event emission ('chunk' and 'end' events).
 * Consumers are supposed to register with PegTokenizer before calling
 * process().
 */
PegTokenizer.prototype.process = function( text ) {
	var err;

	// Lazily compile the PEG grammar the first time any tokenizer runs.
	if ( !this.parser ) {
		var pegSrcPath = path.join( __dirname, 'pegTokenizer.pegjs.txt' );
		this.src = fs.readFileSync( pegSrcPath, 'utf8' );
		// Only create a single parser, as parse() is a static method.
		var parserSource = PEG.buildParser(this.src).toSource();
		//console.warn( parserSource );
		// Patch the generated source so grammar actions can reach the
		// extra arguments passed to parse() below (the chunk callback and
		// the inline-break tester) via the injected __parseArgs alias.
		parserSource = parserSource.replace( 'parse: function(input, startRule) {',
				'parse: function(input, startRule) { var __parseArgs = arguments;' );
		//console.warn( parserSource );
		// Cache the compiled parser on the prototype so it is shared by
		// all PegTokenizer instances.
		PegTokenizer.prototype.parser = eval( parserSource );
	}

	// Some input normalization: force a trailing newline
	if ( text.substring(text.length - 1) !== "\n" ) {
		text += "\n";
	}

	// XXX: Commented out exception handling during development to get
	// reasonable traces.
	//try {
		this.parser.parse(text, 'start',
				// callback
				this.emit.bind( this, 'chunk' ),
				// inline break test
				this
				);
		this.emit('end');
	//} catch (e) {
		//err = e;
		//console.trace();
	//} finally {
		// NOTE: while the try/catch above stays disabled, err is never
		// assigned and this always returns { err: undefined }.
		return { err: err };
	//}
};
|
2011-12-28 01:37:15 +00:00
|
|
|
|
2012-03-02 13:36:37 +00:00
|
|
|
PegTokenizer.prototype.processImageOptions = function( text ) {
|
|
|
|
return this.parser.parse(text, 'img_options', null, this );
|
|
|
|
};
|
2012-02-21 18:26:40 +00:00
|
|
|
|
2012-03-05 15:34:27 +00:00
|
|
|
PegTokenizer.prototype.parseURL = function( text ) {
|
|
|
|
try {
|
|
|
|
return this.parser.parse(text, 'url', null, this );
|
|
|
|
} catch ( e ) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2012-02-21 18:26:40 +00:00
|
|
|
/*
 * Inline breaks, flag-enabled production which detects end positions for
 * active higher-level productions in inline and other nested productions.
 * Those inner productions are then exited, so that the outer production can
 * handle the end marker.
 *
 * Returns a truthy value when the character at `pos` ends the current
 * inline production under the given syntaxFlags, and null otherwise.
 */
PegTokenizer.prototype.inline_breaks = function (input, pos, syntaxFlags ) {
	switch( input[pos] ) {
		case '=':
			return syntaxFlags.equal ||
				( syntaxFlags.h &&
					input.substr( pos + 1, 200)
					.match(/[ \t]*[\r\n]/) !== null ) || null;
		case '|':
			// Guard the lookahead with || '' so that a trailing '|' at the
			// very end of input (no pos+1 character) does not throw a
			// TypeError from calling .match on undefined.
			return syntaxFlags.pipe ||
				syntaxFlags.template ||
				( syntaxFlags.table &&
					( ( input[pos + 1] || '' ).match(/[|}]/) !== null ||
						syntaxFlags.tableCellArg
					)
				) || null;
		case "!":
			return syntaxFlags.table && input[pos + 1] === "!" ||
				null;
		case "}":
			return syntaxFlags.template && input[pos + 1] === "}" || null;
		case ":":
			return syntaxFlags.colon &&
				! syntaxFlags.extlink &&
				! syntaxFlags.linkdesc || null;
		case "\r":
			return syntaxFlags.table &&
				input.substr(pos, 4).match(/\r\n?[!|]/) !== null ||
				null;
		case "\n":
			// Parenthesize the alternatives: previously the higher
			// precedence of && over || made "\n|" (and "\n!" only when in
			// a table) break inline productions even with syntaxFlags.table
			// unset, contradicting the parallel "\r" case above.
			return ( syntaxFlags.table &&
				( input[pos + 1] === '!' ||
					input[pos + 1] === '|' ) ) ||
				null;
		case "]":
			return syntaxFlags.extlink ||
				( syntaxFlags.linkdesc && input[pos + 1] === ']' ) ||
				null;
		case "<":
			return syntaxFlags.pre && input.substr( pos, 6 ) === '</pre>' || null;
		default:
			return null;
	}
};
|
|
|
|
|
|
|
|
// Alternate version of the above. The hash is likely faster, but the nested
// function calls seem to cancel that out. Each handler takes
// (input, pos, syntaxFlags) and returns truthy on an inline break, else null.
PegTokenizer.prototype.breakMap = {
	'=': function(input, pos, syntaxFlags) {
		return syntaxFlags.equal ||
			( syntaxFlags.h &&
				input.substr( pos + 1, 200)
				.match(/[ \t]*[\r\n]/) !== null ) || null;
	},
	// NOTE(review): this handler tests `linkdesc` where inline_breaks'
	// '|' case tests `pipe` -- the two versions have diverged; confirm
	// which flag set is current before relying on either.
	'|': function ( input, pos, syntaxFlags ) {
		// || '' guards the end-of-input lookahead against a TypeError
		// from calling .match on undefined.
		return syntaxFlags.template ||
			syntaxFlags.linkdesc ||
			( syntaxFlags.table &&
				( ( input[pos + 1] || '' ).match(/[|}]/) !== null ||
					syntaxFlags.tableCellArg
				)
			) || null;
	},
	"!": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table && input[pos + 1] === "!" ||
			null;
	},
	"}": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.template && input[pos + 1] === "}" || null;
	},
	":": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.colon &&
			! syntaxFlags.extlink &&
			! syntaxFlags.linkdesc || null;
	},
	"\r": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.table &&
			input.substr(pos, 4).match(/\r\n?[!|]/) !== null ||
			null;
	},
	"\n": function ( input, pos, syntaxFlags ) {
		// Parenthesize the alternatives: previously && bound tighter than
		// ||, so "\n|" broke inline productions even outside tables,
		// unlike the parallel "\r" handler above.
		return ( syntaxFlags.table &&
			( input[pos + 1] === '!' ||
				input[pos + 1] === '|' ) ) ||
			null;
	},
	"]": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.extlink ||
			( syntaxFlags.linkdesc && input[pos + 1] === ']' ) ||
			null;
	},
	"<": function ( input, pos, syntaxFlags ) {
		return syntaxFlags.pre && input.substr( pos, 6 ) === '</pre>' || null;
	}
};
|
|
|
|
|
2012-02-21 18:26:40 +00:00
|
|
|
/**
 * Hash-dispatch variant of inline_breaks, using breakMap above.
 * Returns null for characters without a handler, matching inline_breaks'
 * default case -- the previous bare lookup threw a TypeError ("is not a
 * function") on any unmapped character.
 */
PegTokenizer.prototype.inline_breaks_hash = function (input, pos, syntaxFlags ) {
	var handler = this.breakMap[ input[pos] ];
	if ( !handler ) {
		return null;
	}
	return handler( input, pos, syntaxFlags );
};
|
|
|
|
|
2011-11-02 21:07:51 +00:00
|
|
|
|
2012-02-21 17:21:42 +00:00
|
|
|
|
2011-11-02 21:07:51 +00:00
|
|
|
// Export for CommonJS consumers; silently a no-op in environments
// without a `module` object. Use strict equality per house style.
if (typeof module === "object") {
	module.exports.PegTokenizer = PegTokenizer;
}
|