/** * A very basic parser / serializer web service. */ /** * Config section * * Could move this to a separate file later. */ // default to en.wikipedia.org var defaultInterwiki = 'en'; // alternative: default to www.mediawiki.org //var defaultInterwiki = 'mw'; // for development: default to localhost //var defaultInterwiki = 'localhost'; /** * End config section */ // global includes var express = require('express'), jsDiff = require('diff'), html5 = require('html5'); // local includes var mp = '../modules/parser/'; var ParserPipelineFactory = require(mp + 'mediawiki.parser.js').ParserPipelineFactory, ParserEnv = require(mp + 'mediawiki.parser.environment.js').MWParserEnvironment, WikitextSerializer = require(mp + 'mediawiki.WikitextSerializer.js').WikitextSerializer, TemplateRequest = require(mp + 'mediawiki.ApiRequest.js').TemplateRequest; var env = new ParserEnv( { // stay within the 'proxied' content, so that we can click around wgScriptPath: '/', //http://en.wikipedia.org/wiki', wgScriptExtension: '.php', // XXX: add options for this! wgUploadPath: 'http://upload.wikimedia.org/wikipedia/commons', fetchTemplates: true, // enable/disable debug output using this switch debug: false, trace: false, maxDepth: 40 } ); // add mediawiki.org env.addInterwiki( 'mw', 'http://www.mediawiki.org/w' ); // For development: //env.addInterwiki( 'localhost', 'http://localhost/w' ); var parserPipelineFactory = new ParserPipelineFactory( env ); //var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' ); var app = express.createServer(); app.use(express.bodyParser()); app.get('/', function(req, res){ res.write('

Welcome to the alpha test web service for the ' + 'Parsoid project.

'); res.write( '

Usage:

'); res.write('

There are also some tools for experiments:

'); res.end('

We are currently focusing on round-tripping of basic formatting like inline/bold, headings, lists, tables and links. Templates, citations and thumbnails are not expected to round-trip properly yet. Please report issues you see at :mw:Talk:Parsoid/Todo. Thanks!

'); }); var htmlSpecialChars = function ( s ) { return s.replace(/&/g,'&') .replace(/
'); }; /** * Form-based HTML DOM -> wikitext interface for manual testing */ app.get(/\/_html\/(.*)/, function(req, res){ env.pageName = req.params[0]; res.setHeader('Content-Type', 'text/html; charset=UTF-8'); res.write( "Your HTML DOM:" ); textarea( res ); res.end(''); }); app.post(/\/_html\/(.*)/, function(req, res){ env.pageName = req.params[0]; res.setHeader('Content-Type', 'text/html; charset=UTF-8'); var p = new html5.Parser(); p.parse( '' + req.body.content.replace(/\r/g, '') + '' ); res.write('
');
	new WikitextSerializer({env: env}).serializeDOM( 
		p.tree.document.childNodes[0].childNodes[1], 
		function( c ) {
			res.write( htmlSpecialChars( c ) );
		});
	res.write('
'); res.write( "
Your HTML DOM:" ); textarea( res, req.body.content.replace(/\r/g, '') ); res.end(''); }); /** * Form-based wikitext -> HTML DOM interface for manual testing */ app.get(/\/_wikitext\/(.*)/, function(req, res){ env.pageName = req.params[0]; res.setHeader('Content-Type', 'text/html; charset=UTF-8'); res.write( "Your wikitext:" ); textarea( res ); res.end(''); }); app.post(/\/_wikitext\/(.*)/, function(req, res){ env.pageName = req.params[0]; res.setHeader('Content-Type', 'text/html; charset=UTF-8'); var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' ); parser.on('document', function ( document ) { res.write(document.body.innerHTML); //res.write('
'); //res.end("hello world\n" + req.method + ' ' + req.params.title); res.write( "
Your wikitext:" ); textarea( res, req.body.content.replace(/\r/g, '') ); res.end(''); }); try { res.setHeader('Content-Type', 'text/html; charset=UTF-8'); console.log('starting parsing of ' + req.params[0]); // FIXME: This does not handle includes or templates correctly parser.process( req.body.content.replace(/\r/g, '') ); } catch (e) { console.log( e ); res.write( e ); } }); /** * Perform word-based diff on a line-based diff. The word-based algorithm is * practically unusable for inputs > 5k bytes, so we only perform it on the * output of the more efficient line-based diff. */ var refineDiff = function( diff ) { // Attempt to accumulate consecutive add-delete pairs // with short text separating them (short = 2 chars right now) // // This is equivalent to the ... minimization // to expand range of and tags, except there is no optimal // solution except as determined by heuristics ("short text" = <= 2 chars). function mergeConsecutiveSegments(wordDiffs) { var n = wordDiffs.length; var currIns = null, currDel = null; var newDiffs = []; for (var i = 0; i < n; i++) { var d = wordDiffs[i]; var dVal = d.value; if (d.added) { // Attempt to accumulate if (currIns === null) { currIns = d; } else { currIns.value = currIns.value + dVal; } } else if (d.removed) { // Attempt to accumulate if (currDel === null) { currDel = d; } else { currDel.value = currDel.value + dVal; } } else if (((dVal.length < 4) || !dVal.match(/\s/)) && currIns && currDel) { // Attempt to accumulate currIns.value = currIns.value + dVal; currDel.value = currDel.value + dVal; } else { // Accumulation ends. Purge! if (currIns !== null) { newDiffs.push(currIns); currIns = null; } if (currDel !== null) { newDiffs.push(currDel); currDel = null; } newDiffs.push(d); } } // Purge buffered diffs if (currIns !== null) newDiffs.push(currIns); if (currDel !== null) newDiffs.push(currDel); return newDiffs; } var added = null, out = []; for ( var i = 0, l = diff.length; i < l; i++ ) { var d = diff[i]; if ( d.added ) { if ( added ) { out.push( added ); } added = d; } else if ( d.removed ) { if ( added ) { var fineDiff = jsDiff.diffWords( d.value, added.value ); fineDiff = mergeConsecutiveSegments(fineDiff); out.push.apply( out, fineDiff ); added = null; } else { out.push( d ); } } else { if ( added ) { out.push( added ); added = null; } out.push(d); } } if ( added ) { out.push(added); } return out; }; var roundTripDiff = function ( req, res, src, document ) { res.write(''); res.write( '

Wikitext parsed to HTML DOM


' ); res.write(document.body.innerHTML + '
'); res.write( '

HTML DOM converted back to Wikitext


' ); var out = new WikitextSerializer({env: env}).serializeDOM( document.body ); res.write('
' + htmlSpecialChars( out ) + '

\n'); res.write( '

Diff between original Wikitext (green) and round-tripped wikitext (red)


\n' ); var patch; src = src.replace(/\n(?=\n)/g, '\n '); out = out.replace(/\n(?=\n)/g, '\n '); //console.log(JSON.stringify( jsDiff.diffLines( out, src ) )); //patch = jsDiff.convertChangesToXML( jsDiff.diffLines( src, out ) ); patch = jsDiff.convertChangesToXML( refineDiff( jsDiff.diffLines( src, out ) ) ); res.write( '
' + patch);
	// Add a 'report issue' link
	res.end('

'+ '' + 'Report a parser issue in this page at ' + ''+ '[[:mw:Talk:Parsoid/Todo]]


'); }; var parse = function ( req, res, cb, src ) { var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' ); parser.on('document', cb.bind( null, req, res, src ) ); try { res.setHeader('Content-Type', 'text/html; charset=UTF-8'); parser.process( src ); } catch (e) { console.log( e ); res.end( e ); } }; /** * Round-trip article testing */ app.get( new RegExp('/_rt/(?:(?:(?:' + env.interwikiRegexp + '):+)?(' + env.interwikiRegexp + '):)?(.*)') , function(req, res){ env.pageName = req.params[1]; if ( req.params[0] ) { env.wgScriptPath = '/_rt/' + req.params[0] + ':'; env.wgScript = env.interwikiMap[req.params[0]]; } else { env.wgScriptPath = '/_rt/'; env.wgScript = env.interwikiMap[defaultInterwiki]; } if ( env.pageName === 'favicon.ico' ) { res.end( 'no favicon yet..' ); return; } var target = env.resolveTitle( env.normalizeTitle( env.pageName ), '' ); console.log('starting parsing of ' + target); var tpr = new TemplateRequest( env, target ); tpr.once('src', parse.bind( null, req, res, roundTripDiff )); }); /** * Round-trip article testing with newline stripping for editor-created HTML * simulation */ app.get( new RegExp('/_rtve/(?:(?:(?:' + env.interwikiRegexp + '):+)?(' + env.interwikiRegexp + '):)?(.*)') , function(req, res){ env.pageName = req.params[1]; if ( req.params[0] ) { env.wgScriptPath = '/_rtve/' + req.params[0] + ':'; env.wgScript = env.interwikiMap[req.params[0]]; } else { env.wgScriptPath = '/_rtve/'; env.wgScript = env.interwikiMap[defaultInterwiki]; } if ( env.pageName === 'favicon.ico' ) { res.end( 'no favicon yet..' ); return; } var target = env.resolveTitle( env.normalizeTitle( env.pageName ), '' ); console.log('starting parsing of ' + target); var tpr = new TemplateRequest( env, target ), cb = function ( req, res, src, document ) { // strip newlines from the html var html = document.innerHTML.replace(/[\r\n]/g, ''), p = new html5.Parser(); p.parse( html ); var newDocument = p.tree.document; // monkey-patch document.body reference for now.. newDocument.body = newDocument.childNodes[0].childNodes[1]; roundTripDiff( req, res, src, newDocument ); }; tpr.once('src', parse.bind( null, req, res, cb )); }); /** * Form-based round-tripping for manual testing */ app.get(/\/_rtform\/(.*)/, function(req, res){ env.pageName = req.params[0]; res.setHeader('Content-Type', 'text/html; charset=UTF-8'); res.write( "Your wikitext:" ); textarea( res ); res.end(''); }); app.post(/\/_rtform\/(.*)/, function(req, res){ env.pageName = req.params[0]; res.setHeader('Content-Type', 'text/html; charset=UTF-8'); // we don't care about \r, and normalize everything to \n parse( req, res, roundTripDiff, req.body.content.replace(/\r/g, '')); }); /** * Regular article parsing */ app.get(new RegExp( '/(?:(?:(?:' + env.interwikiRegexp + '):+)?(' + env.interwikiRegexp + '):)?(.*)' ), function(req, res){ env.pageName = req.params[1]; if ( req.params[0] ) { env.wgScriptPath = '/' + req.params[0] + ':'; env.wgScript = env.interwikiMap[req.params[0]]; } else { env.wgScriptPath = '/'; env.wgScript = env.interwikiMap[defaultInterwiki]; } if ( env.pageName === 'favicon.ico' ) { res.end( 'no favicon yet..'); return; } var target = env.resolveTitle( env.normalizeTitle( env.pageName ), '' ); console.log('starting parsing of ' + target); var tpr = new TemplateRequest( env, target ); tpr.once('src', parse.bind( null, req, res, function ( req, res, src, document ) { res.end(document.body.innerHTML); })); }); /** * Regular article serialization using POST */ app.post(/\/(.*)/, function(req, res){ env.pageName = req.params[0]; env.wgScriptPath = '/'; res.setHeader('Content-Type', 'text/x-mediawiki; charset=UTF-8'); var p = new html5.Parser(); p.parse( req.body.content ); new WikitextSerializer({env: env}).serializeDOM( p.tree.document.childNodes[0].childNodes[1], res.write.bind( res ) ); res.end(''); }); module.exports = app;