// mediawiki-extensions-Visual.../modules/parser/parse.js

/**
 * Command line parse utility.
 * Read from STDIN, write to STDOUT.
 *
 * @author Neil Kandalgaonkar <neilk@wikimedia.org>
 * @author Gabriel Wicke <gwicke@wikimedia.org>
 */
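
// Example invocations (a sketch, assuming this script is run as parse.js
// under node; the flags are defined in the option list below):
//   echo "''italic'' text" | node parse.js              # wikitext -> HTML
//   echo "''italic'' text" | node parse.js --wikitext   # round-trip via the serializer
//   echo '<i>italic</i>' | node parse.js --html2wt      # HTML -> wikitext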
var ParserPipelineFactory = require('./mediawiki.parser.js').ParserPipelineFactory,
	ParserEnv = require('./mediawiki.parser.environment.js').MWParserEnvironment,
	ConvertDOMToLM = require('./mediawiki.LinearModelConverter.js').ConvertDOMToLM,
	DOMConverter = require('./mediawiki.DOMConverter.js').DOMConverter,
	WikitextSerializer = require('./mediawiki.WikitextSerializer.js').WikitextSerializer,
	optimist = require('optimist'),
	html5 = require('html5');
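
// Main entry point: parse the command line options, read wikitext (or HTML)
// from STDIN and write the converted output to STDOUT.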
( function() {
	var opts = optimist.usage( 'Usage: echo wikitext | $0', {
		'help': {
			description: 'Show this message',
			'boolean': true,
			'default': false
		},
		'linearmodel': {
			description: 'Output linear model data instead of HTML',
			'boolean': true,
			'default': false
		},
		'wikidom': {
			description: 'Output WikiDOM instead of HTML',
			'boolean': true,
			'default': false
		},
		'html2wt': {
			description: 'Convert input HTML to wikitext',
			'boolean': true,
			'default': false
		},
		'wikitext': {
			description: 'Output wikitext instead of HTML',
			'boolean': true,
			'default': false
		},
		'debug': {
			description: 'Debug mode',
			'boolean': true,
			'default': false
		},
		'trace': {
			description: 'Trace mode (light debugging), implied by --debug',
			'boolean': true,
			'default': false
		},
		'maxdepth': {
			description: 'Maximum expansion depth',
			'boolean': false,
			'default': 40
		},
		'wgScript': {
			description: 'HTTP path to the remote API, e.g. http://wiki.sample.com/w',
			'boolean': false,
			'default': 'http://en.wikipedia.org/w'
		},
		'wgScriptPath': {
			description: 'HTTP path to the remote web interface, e.g. http://wiki.sample.com/wiki',
			'boolean': false,
			'default': 'http://en.wikipedia.org/wiki/'
		},
		'wgScriptExtension': {
			description: 'Extension for PHP files on the remote API server, if any. Include the period, e.g. ".php"',
			'boolean': false,
			'default': '.php'
		},
		'fetchTemplates': {
			description: 'Whether to fetch included templates recursively',
			'boolean': true,
			'default': true
		},
		'pagename': {
			description: 'The page name, returned for {{PAGENAME}}.',
			'boolean': false,
			'default': 'Main page'
		}
	} );
	var argv = opts.argv;

	if ( argv.help ) {
		optimist.showHelp();
		return;
	}
	var env = new ParserEnv( {
		// fetch templates from enwiki by default
		wgScript: argv.wgScript,
		wgScriptPath: argv.wgScriptPath,
		wgScriptExtension: argv.wgScriptExtension,
		// XXX: add options for this!
		wgUploadPath: 'http://upload.wikimedia.org/wikipedia/commons',
		fetchTemplates: argv.fetchTemplates,
		// enable/disable debug output using this switch
		debug: argv.debug,
		trace: argv.trace,
		maxDepth: argv.maxdepth,
		pageName: argv.pagename
	} );
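
	// Accumulate STDIN chunks and process the complete input once the stream ends.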
	process.stdin.resume();
	process.stdin.setEncoding('utf8');

	var inputChunks = [];
	process.stdin.on( 'data', function( chunk ) {
		inputChunks.push( chunk );
	} );

	process.stdin.on( 'end', function() {
		var input = inputChunks.join('');
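
		// --html2wt: parse the input HTML into a DOM using the html5 parser,
		// then serialize that DOM back to wikitext.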
		if ( argv.html2wt ) {
			var p = new html5.Parser();
			p.parse( '<html><body>' + input.replace(/\r/g, '') + '</body></html>' );
			// document -> <html> -> its second child is the <body> wrapped above
			var content = p.tree.document.childNodes[0].childNodes[1];
			var stdout = process.stdout;
			new WikitextSerializer( { env: env } ).serializeDOM( content, stdout.write.bind( stdout ) );
			// add a trailing newline for the shell user's benefit
			stdout.write( "\n" );
			process.exit( 0 );
		} else {
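			// Otherwise, run the full wikitext parsing pipeline and write the
			// resulting document out in the requested format.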
			var parserPipelineFactory = new ParserPipelineFactory( env );
			var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' );

			parser.on( 'document', function ( document ) {
				// Write the document out in the requested format
				if ( argv.linearmodel ) {
					process.stdout.write(
						JSON.stringify( ConvertDOMToLM( document.body ), null, 2 ) );
				} else if ( argv.wikidom ) {
					process.stdout.write(
						JSON.stringify(
							new DOMConverter().HTMLtoWiki( document.body ),
							null,
							2
						) );
				} else if ( argv.wikitext ) {
					new WikitextSerializer( { env: env } ).serializeDOM( document.body,
						process.stdout.write.bind( process.stdout ) );
				} else {
					process.stdout.write( document.body.innerHTML );
				}
				// add a trailing newline for the shell user's benefit
				process.stdout.write( "\n" );
				process.exit( 0 );
			} );

			// Kick off the parse by feeding the input into the parser pipeline
			parser.process( input );
		}
	} );
} )();