From ddc6899b1b5a25ebe78e2bbd696953329ad9253b Mon Sep 17 00:00:00 2001 From: Subramanya Sastry Date: Tue, 24 Jul 2012 10:30:44 -0500 Subject: [PATCH] Added html2wt command-line option to parse.js * A quick way to see how an HTML fragment serializes * Minor JSHint tweaks. Change-Id: I901398ec15700905c53de6f39cde93400b2f964a --- modules/parser/parse.js | 79 +++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/modules/parser/parse.js b/modules/parser/parse.js index ae3b45d3ff..7209b73134 100644 --- a/modules/parser/parse.js +++ b/modules/parser/parse.js @@ -1,7 +1,7 @@ /** * Command line parse utility. * Read from STDIN, write to STDOUT. - * + * * @author Neil Kandalgaonkar * @author Gabriel Wicke */ @@ -11,7 +11,8 @@ var ParserPipelineFactory = require('./mediawiki.parser.js').ParserPipelineFacto ConvertDOMToLM = require('./mediawiki.LinearModelConverter.js').ConvertDOMToLM, DOMConverter = require('./mediawiki.DOMConverter.js').DOMConverter, WikitextSerializer = require('./mediawiki.WikitextSerializer.js').WikitextSerializer, - optimist = require('optimist'); + optimist = require('optimist'), + html5 = require('html5'); ( function() { var opts = optimist.usage( 'Usage: echo wikitext | $0', { @@ -30,6 +31,11 @@ var ParserPipelineFactory = require('./mediawiki.parser.js').ParserPipelineFacto 'boolean': true, 'default': false }, + 'html2wt': { + description: 'Convert input HTML to Wikitext', + 'boolean': true, + 'default': false + }, 'wikitext': { description: 'Output WikiText instead of HTML', 'boolean': true, @@ -83,7 +89,7 @@ var ParserPipelineFactory = require('./mediawiki.parser.js').ParserPipelineFacto return; } - var env = new ParserEnv( { + var env = new ParserEnv( { // fetch templates from enwiki by default.. wgScript: argv.wgScript, wgScriptPath: argv.wgScriptPath, @@ -91,14 +97,12 @@ var ParserPipelineFactory = require('./mediawiki.parser.js').ParserPipelineFacto // XXX: add options for this! 
wgUploadPath: 'http://upload.wikimedia.org/wikipedia/commons', fetchTemplates: argv.fetchTemplates, - // enable/disable debug output using this switch + // enable/disable debug output using this switch debug: argv.debug, trace: argv.trace, maxDepth: argv.maxdepth, pageName: argv.pagename } ); - var parserPipelineFactory = new ParserPipelineFactory( env ); - var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' ); process.stdin.resume(); process.stdin.setEncoding('utf8'); @@ -108,34 +112,47 @@ var ParserPipelineFactory = require('./mediawiki.parser.js').ParserPipelineFacto inputChunks.push( chunk ); } ); - - - process.stdin.on( 'end', function() { + process.stdin.on( 'end', function() { var input = inputChunks.join(''); - parser.on('document', function ( document ) { - // Print out the html - if ( argv.linearmodel ) { - process.stdout.write( - JSON.stringify( ConvertDOMToLM( document.body ), null, 2 ) ); - } else if ( argv.wikidom ) { - process.stdout.write( - JSON.stringify( - new DOMConverter().HTMLtoWiki( document.body ), - null, - 2 - )); - } else if ( argv.wikitext ) { - new WikitextSerializer({env: env}).serializeDOM( document.body, - process.stdout.write.bind( process.stdout ) ); - } else { - process.stdout.write( document.body.innerHTML ); - } + if (argv.html2wt) { + var p = new html5.Parser(); + p.parse('<html><body>' + input.replace(/\r/g, '') + '</body></html>'); + var content = p.tree.document.childNodes[0].childNodes[1]; + var stdout = process.stdout; + new WikitextSerializer({env: env}).serializeDOM(content, stdout.write.bind(stdout)); + // add a trailing newline for shell user's benefit - process.stdout.write( "\n" ); + stdout.write( "\n" ); process.exit(0); - }); - // Kick off the pipeline by feeding the input into the parser pipeline - parser.process( input ); + } else { + var parserPipelineFactory = new ParserPipelineFactory( env ); + var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' ); + parser.on('document', function ( document ) 
{ + // Print out the html + if ( argv.linearmodel ) { + process.stdout.write( + JSON.stringify( ConvertDOMToLM( document.body ), null, 2 ) ); + } else if ( argv.wikidom ) { + process.stdout.write( + JSON.stringify( + new DOMConverter().HTMLtoWiki( document.body ), + null, + 2 + )); + } else if ( argv.wikitext ) { + new WikitextSerializer({env: env}).serializeDOM( document.body, + process.stdout.write.bind( process.stdout ) ); + } else { + process.stdout.write( document.body.innerHTML ); + } + + // add a trailing newline for shell user's benefit + process.stdout.write( "\n" ); + process.exit(0); + }); + // Kick off the pipeline by feeding the input into the parser pipeline + parser.process( input ); + } } ); } )();