refactor parser to ParseThingy in different module, can be invoked with command line utility parse.js

This commit is contained in:
Neil Kandalgaonkar 2011-12-28 01:37:06 +00:00
parent aedc6751ae
commit 4158f82d7e
Notes: Gabriel Wicke 2012-02-27 16:40:01 +00:00
4 changed files with 166 additions and 93 deletions

View file

@ -22,6 +22,8 @@
* integrate the general environment (configuration, cache etc). (gwicke)
* */
$ = require('jquery');
/**
* Central dispatcher for potentially asynchronous token transformations.
*

View file

@ -0,0 +1,106 @@
/**
*
* Simple parser class. Should have lots of options for observing parse stages (or, use events).
*
* @author Gabriel Wicke <gwicke@wikimedia.org>
* @author Neil Kandalgaonkar <neilk@wikimedia.org>
*/
var fs = require('fs'),
path = require('path'),
$ = require('jquery'),
PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
TokenTransformDispatcher = require('./mediawiki.TokenTransformDispatcher.js').TokenTransformDispatcher,
DOMPostProcessor = require('./mediawiki.DOMPostProcessor.js').DOMPostProcessor,
DOMConverter = require('./mediawiki.DOMConverter.js').DOMConverter,
QuoteTransformer = require('./ext.core.QuoteTransformer.js').QuoteTransformer,
Cite = require('./ext.Cite.js').Cite,
MWRefTagHook = require('./ext.cite.taghook.ref.js').MWRefTagHook,
FauxHTML5 = require('./mediawiki.HTML5TreeBuilder.node.js').FauxHTML5;
/**
 * Assembles the wikitext parsing pipeline: a PEG tokenizer, a token transform
 * dispatcher (with the quote and cite transformers registered on it) and the
 * DOM post-processing / conversion helpers.
 *
 * After the dispatcher has delivered its tokens, the instance additionally
 * carries `out` (the body's innerHTML) and `getWikiDom()` (a JSON string of the
 * converted body followed by a newline).
 *
 * @constructor
 * @param {Object} [config] Optional settings; an empty object is used when falsy.
 * @param {String} [config.peg] PEG grammar source. When falsy it is read
 *   synchronously from 'pegTokenizer.pegjs.txt' in this module's directory and
 *   stored back on the config object.
 * @param {Object} [config.parserEnv] Handed to the PegTokenizer as-is.
 */
function ParseThingy( config ) {
	config = config || {};

	if ( !config.peg ) {
		// n.b. __dirname is relative to the module.
		config.peg = fs.readFileSync(
			path.join( __dirname, 'pegTokenizer.pegjs.txt' ),
			'utf8'
		);
	}

	var self = this;

	/**
	 * Dispatcher callback: turns the transformed tokens into a DOM,
	 * post-processes it and publishes the results on the instance.
	 *
	 * @param {Array} tokens
	 */
	function onTokensTransformed( tokens ) {
		//console.log("TOKENS: " + JSON.stringify(tokens, null, 2));

		// A new tree builder per run, which also creates a new document.
		var builder = new FauxHTML5.TreeBuilder();

		// Tokens -> DOM via the HTML tree builder/parser, then post-process.
		self.buildTree( tokens, builder );
		self.postProcessor.doPostProcess( builder.document );

		// Serialized HTML result.
		// XXX fix this -- make it a method
		self.out = builder.document.body.innerHTML;

		// XXX fix this -- make it a method
		self.getWikiDom = function () {
			var wikiDom = self.DOMConverter.HTMLtoWiki( builder.document.body );
			return JSON.stringify( wikiDom, null, 2 ) + "\n";
		};

		// XXX pull HTML5 htmlparser fixups into this module? Or leave in tests?
	}

	// XXX parser environment? Will be needed for fetching templates, etc.
	this.wikiTokenizer = new PegTokenizer( config.parserEnv, config.peg );
	this.postProcessor = new DOMPostProcessor();
	this.DOMConverter = new DOMConverter();

	// The dispatcher hands the remaining processing to the callback above.
	this.tokenDispatcher = new TokenTransformDispatcher( onTokensTransformed );

	// Add token transformations..
	new QuoteTransformer().register( this.tokenDispatcher );
	new Cite().register( this.tokenDispatcher );
}
ParseThingy.prototype = {
	/**
	 * Feed a token list into an HTML tree builder.
	 *
	 * @param {Array} tokens Tokens to process, in order.
	 * @param {Object} treeBuilder Builder providing processToken(), a
	 *   `document` and a `parser.document`.
	 */
	buildTree: function ( tokens, treeBuilder ) {
		// Open with a body tag, just to be sure one exists.
		treeBuilder.processToken( { type: 'TAG', name: 'body' } );

		// Hand every token to the builder.
		var total = tokens.length,
			pos = 0;
		while ( pos < total ) {
			treeBuilder.processToken( tokens[pos] );
			pos += 1;
		}

		// FIXME HACK: For some reason the end token is not processed sometimes,
		// which normally fixes the body reference up.
		var doc = treeBuilder.document;
		doc.body = treeBuilder.parser.document.getElementsByTagName( 'body' )[0];
	}
};
// Export the constructor when a `module` object is available (e.g. under node).
if ( typeof module === 'object' ) {
	module.exports.ParseThingy = ParseThingy;
}

49
modules/parser/parse.js Normal file
View file

@ -0,0 +1,49 @@
/**
* Command line wikidom parse utility.
* Read from STDIN, write to STDOUT.
*/
( function() {

	var ParseThingy = require('./mediawiki.parser.js').ParseThingy,
		// Currently not referenced anywhere in this script.
		optimist = require('optimist');

	var parser = new ParseThingy();

	process.stdin.resume();
	process.stdin.setEncoding('utf8');

	// Collect all of STDIN before parsing; the tokenizer wants the full text.
	var inputChunks = [];
	process.stdin.on( 'data', function( chunk ) {
		inputChunks.push( chunk );
	} );

	process.stdin.on( 'end', function() {
		var input = inputChunks.join('');
		var output = getOutput(parser, input);
		// Exit only once the write has been handed off: calling process.exit()
		// straight after write() can drop output when stdout is written
		// asynchronously (e.g. when piped into another program).
		process.stdout.write( output, function() {
			process.exit(0);
		} );
	} );

	/**
	 * Tokenize the input, run the token transformations and return the
	 * serialized WikiDom. Terminates the process with status 1 if the
	 * tokenizer reports an error.
	 *
	 * @param {ParseThingy} parser
	 * @param {String} input Wikitext to parse; a newline is appended before tokenizing.
	 * @returns {String} Result of parser.getWikiDom()
	 */
	function getOutput( parser, input ) {
		var res = parser.wikiTokenizer.tokenize(input + "\n");
		if (res.err) {
			// Diagnostics go to STDERR so they never mix with the parse
			// result that consumers read from STDOUT.
			console.error('PARSE FAIL', res.err);
			process.exit(1);
		}

		// Append the end
		res.tokens.push({type: 'END'});

		parser.tokenDispatcher.transformTokens( res.tokens );

		// NOTE(review): getWikiDom is only attached to the parser inside the
		// dispatcher callback; this assumes transformTokens() invokes that
		// callback before returning -- confirm.
		return parser.getWikiDom();
	}

} )();

View file

@ -14,83 +14,6 @@
(function() {
// temporary
/**
 * Builds the parsing pipeline used by the tests: PEG tokenizer, token
 * transform dispatcher (quote and cite transformers registered) and the DOM
 * post-processing / conversion helpers.
 *
 * @constructor
 * @param {Object} config
 * @param {String} config.pegSrcPath Path of the PEG grammar file, read
 *   synchronously as UTF-8.
 */
function ParseThingy( config ) {
	var self = this;

	// create tokenizer
	// Preload the grammar file...
	var grammar = fs.readFileSync( config.pegSrcPath, 'utf8' );

	var env = {};
	//var parserEnv = new MWParserEnvironment({
	//	tagHooks: {
	//		'ref': MWRefTagHook,
	//		'references': MWReferencesTagHook
	//	}
	//});

	/**
	 * Dispatcher callback: builds and post-processes a DOM from the
	 * transformed tokens and publishes the results on the instance.
	 *
	 * @param {Array} tokens
	 */
	function onTokensTransformed( tokens ) {
		//console.log("TOKENS: " + JSON.stringify(tokens, null, 2));

		// A new tree builder per run, which also creates a new document.
		var builder = new FauxHTML5.TreeBuilder();

		// Tokens -> DOM via the HTML tree builder/parser, then post-process.
		self.buildTree( tokens, builder );
		self.postProcessor.doPostProcess( builder.document );

		// Serialized HTML result.
		// XXX fix this -- make it a method
		self.out = builder.document.body.innerHTML;

		// XXX fix this -- make it a method
		self.getWikiDom = function () {
			var wikiDom = self.DOMConverter.HTMLtoWiki( builder.document.body );
			return JSON.stringify( wikiDom, null, 2 ) + "\n";
		};
	}

	this.wikiTokenizer = new PegTokenizer( env, grammar );
	this.postProcessor = new DOMPostProcessor();
	this.DOMConverter = new DOMConverter();

	// The dispatcher hands the remaining processing to the callback above.
	this.tokenDispatcher = new TokenTransformDispatcher( onTokensTransformed );

	// Add token transformations..
	new QuoteTransformer().register( this.tokenDispatcher );
	new Cite().register( this.tokenDispatcher );
}
ParseThingy.prototype = {
	/**
	 * Feed a token list into an HTML tree builder.
	 *
	 * @param {Array} tokens Tokens to process, in order.
	 * @param {Object} treeBuilder Builder providing processToken(), a
	 *   `document` and a `parser.document`.
	 */
	buildTree: function ( tokens, treeBuilder ) {
		// Open with a body tag, just to be sure one exists.
		treeBuilder.processToken( { type: 'TAG', name: 'body' } );

		// Hand every token to the builder.
		var total = tokens.length,
			pos = 0;
		while ( pos < total ) {
			treeBuilder.processToken( tokens[pos] );
			pos += 1;
		}

		// FIXME HACK: For some reason the end token is not processed sometimes,
		// which normally fixes the body reference up.
		var doc = treeBuilder.document;
		doc.body = treeBuilder.parser.document.getElementsByTagName( 'body' )[0];
	}
};
console.log( "Starting up JS parser tests" );
@ -100,13 +23,11 @@ var fs = require('fs'),
jsDiff = require('diff'),
colors = require('colors'),
util = require( 'util' ),
HTML5 = require('html5').HTML5,
HTML5 = require('html5').HTML5, //TODO is this fixup for tests only, or part of real parsing...
PEG = require('pegjs'),
// Handle options/arguments with optimist module
optimist = require('optimist');
// @fixme wrap more of this setup in a common module
// track files imported / required
var fileDependencies = [];
@ -137,19 +58,8 @@ var pj = path.join;
var testWhiteList = require('./parserTests-whitelist.js').testWhiteList;
_import(pj('parser', 'mediawiki.tokenizer.peg.js'), ['PegTokenizer']);
_import(pj('parser', 'mediawiki.parser.environment.js'), ['MWParserEnvironment']);
_import(pj('parser', 'mediawiki.TokenTransformDispatcher.js'), ['TokenTransformDispatcher']);
_import(pj('parser', 'ext.cite.taghook.ref.js'), ['MWRefTagHook']);
_import(pj('parser', 'mediawiki.HTML5TreeBuilder.node.js'), ['FauxHTML5']);
_import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']);
_import(pj('parser', 'mediawiki.DOMConverter.js'), ['DOMConverter']);
_import(pj('parser', 'ext.core.QuoteTransformer.js'), ['QuoteTransformer']);
_import(pj('parser', 'ext.Cite.js'), ['Cite']);
_import(pj('parser', 'mediawiki.parser.js'), ['ParseThingy']);
// WikiDom and serializers
//_require(pj('es', 'es.js'));
@ -653,10 +563,16 @@ ParserTests.prototype.reportSummary = function () {
ParserTests.prototype.main = function () {
console.log( "Initialisation complete. Now launching tests." );
//var parserEnv = new MWParserEnvironment({
// tagHooks: {
// 'ref': MWRefTagHook,
// 'references': MWReferencesTagHook
// }
//});
// move this config out of here
var config = {
pegSrcPath: path.join(basePath, 'parser', 'pegTokenizer.pegjs.txt')
parserEnv: {}
};
var pThingy = new ParseThingy(config);