/** * A very basic parser / serializer web service. */ /** * Config section * * Could move this to a separate file later. */ // default to en.wikipedia.org //var defaultInterwiki = 'en'; // alternative: default to www.mediawiki.org // Use this in production for now var defaultInterwiki = 'mw'; // for development: default to localhost //var defaultInterwiki = 'localhost'; /** * End config section */ // global includes var express = require('express'), jsDiff = require('diff'), html5 = require('html5'); // local includes var mp = '../modules/parser/'; var ParserPipelineFactory = require(mp + 'mediawiki.parser.js').ParserPipelineFactory, ParserEnv = require(mp + 'mediawiki.parser.environment.js').MWParserEnvironment, WikitextSerializer = require(mp + 'mediawiki.WikitextSerializer.js').WikitextSerializer, TemplateRequest = require(mp + 'mediawiki.ApiRequest.js').TemplateRequest; var env = new ParserEnv( { // stay within the 'proxied' content, so that we can click around wgScriptPath: '/', //http://en.wikipedia.org/wiki', wgScriptExtension: '.php', // XXX: add options for this! wgUploadPath: 'http://upload.wikimedia.org/wikipedia/commons', fetchTemplates: true, // enable/disable debug output using this switch debug: false, trace: false, maxDepth: 40 } ); // add mediawiki.org env.addInterwiki( 'mw', 'http://www.mediawiki.org/w' ); // For development: //env.addInterwiki( 'localhost', 'http://localhost/w' ); var parserPipelineFactory = new ParserPipelineFactory( env ); //var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' ); var app = express.createServer(); app.use(express.bodyParser()); app.get('/', function(req, res){ res.write('
Usage:
There are also some tools for experiments:
We are currently focusing on round-tripping of basic formatting like inline/bold, headings, lists, tables and links. Templates, citations and thumbnails are not expected to round-trip properly yet. Please report issues you see at :mw:Talk:Parsoid/Todo. Thanks!
'); }); var htmlSpecialChars = function ( s ) { return s.replace(/&/g,'&') .replace(/'); new WikitextSerializer({env: env}).serializeDOM( p.tree.document.childNodes[0].childNodes[1], function( c ) { res.write( htmlSpecialChars( c ) ); }); res.write(''); res.write( "
' + htmlSpecialChars( out ) + '
' + patch); // Add a 'report issue' link res.end(''+ '' + 'Report a parser issue in this page at ' + ''+ '[[:mw:Talk:Parsoid/Todo]]
'); }; var parse = function ( req, res, cb, src ) { var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' ); parser.on('document', cb.bind( null, req, res, src ) ); try { res.setHeader('Content-Type', 'text/html; charset=UTF-8'); parser.process( src ); } catch (e) { console.log( e ); res.end( e ); } }; /** * Round-trip article testing */ app.get( new RegExp('/_rt/(?:(?:(?:' + env.interwikiRegexp + '):+)?(' + env.interwikiRegexp + '):)?(.*)') , function(req, res){ env.pageName = req.params[1]; if ( req.params[0] ) { env.wgScriptPath = '/_rt/' + req.params[0] + ':'; env.wgScript = env.interwikiMap[req.params[0]]; } else { env.wgScriptPath = '/_rt/'; env.wgScript = env.interwikiMap[defaultInterwiki]; } if ( env.pageName === 'favicon.ico' ) { res.end( 'no favicon yet..' ); return; } var target = env.resolveTitle( env.normalizeTitle( env.pageName ), '' ); console.log('starting parsing of ' + target); var tpr = new TemplateRequest( env, target ); tpr.once('src', parse.bind( null, req, res, roundTripDiff )); }); /** * Round-trip article testing with newline stripping for editor-created HTML * simulation */ app.get( new RegExp('/_rtve/(?:(?:(?:' + env.interwikiRegexp + '):+)?(' + env.interwikiRegexp + '):)?(.*)') , function(req, res){ env.pageName = req.params[1]; if ( req.params[0] ) { env.wgScriptPath = '/_rtve/' + req.params[0] + ':'; env.wgScript = env.interwikiMap[req.params[0]]; } else { env.wgScriptPath = '/_rtve/'; env.wgScript = env.interwikiMap[defaultInterwiki]; } if ( env.pageName === 'favicon.ico' ) { res.end( 'no favicon yet..' ); return; } var target = env.resolveTitle( env.normalizeTitle( env.pageName ), '' ); console.log('starting parsing of ' + target); var tpr = new TemplateRequest( env, target ), cb = function ( req, res, src, document ) { // strip newlines from the html var html = document.innerHTML.replace(/[\r\n]/g, ''), p = new html5.Parser(); p.parse( html ); var newDocument = p.tree.document; // monkey-patch document.body reference for now.. newDocument.body = newDocument.childNodes[0].childNodes[1]; roundTripDiff( req, res, src, newDocument ); }; tpr.once('src', parse.bind( null, req, res, cb )); }); /** * Form-based round-tripping for manual testing */ app.get(/\/_rtform\/(.*)/, function(req, res){ env.pageName = req.params[0]; res.setHeader('Content-Type', 'text/html; charset=UTF-8'); res.write( "Your wikitext:" ); textarea( res ); res.end(''); }); app.post(/\/_rtform\/(.*)/, function(req, res){ env.pageName = req.params[0]; res.setHeader('Content-Type', 'text/html; charset=UTF-8'); // we don't care about \r, and normalize everything to \n parse( req, res, roundTripDiff, req.body.content.replace(/\r/g, '')); }); /** * Regular article parsing */ app.get(new RegExp( '/(?:(?:(?:' + env.interwikiRegexp + '):+)?(' + env.interwikiRegexp + '):)?(.*)' ), function(req, res){ env.pageName = req.params[1]; if ( req.params[0] ) { env.wgScriptPath = '/' + req.params[0] + ':'; env.wgScript = env.interwikiMap[req.params[0]]; } else { env.wgScriptPath = '/'; env.wgScript = env.interwikiMap[defaultInterwiki]; } if ( env.pageName === 'favicon.ico' ) { res.end( 'no favicon yet..'); return; } var target = env.resolveTitle( env.normalizeTitle( env.pageName ), '' ); console.log('starting parsing of ' + target); var tpr = new TemplateRequest( env, target ); tpr.once('src', parse.bind( null, req, res, function ( req, res, src, document ) { res.end(document.body.innerHTML); })); }); /** * Regular article serialization using POST */ app.post(/\/(.*)/, function(req, res){ env.pageName = req.params[0]; env.wgScriptPath = '/'; res.setHeader('Content-Type', 'text/x-mediawiki; charset=UTF-8'); var p = new html5.Parser(); p.parse( req.body.content ); new WikitextSerializer({env: env}).serializeDOM( p.tree.document.childNodes[0].childNodes[1], res.write.bind( res ) ); res.end(''); }); module.exports = app;