mediawiki-extensions-Visual.../api/ParserService.js

259 lines
8.1 KiB
JavaScript
Raw Normal View History

/**
* A very basic parser / serializer web service.
*/
var express = require('express'),
jsDiff = require('diff'),
html5 = require('html5');
var mp = '../modules/parser/';
var ParserPipelineFactory = require(mp + 'mediawiki.parser.js').ParserPipelineFactory,
ParserEnv = require(mp + 'mediawiki.parser.environment.js').MWParserEnvironment,
WikitextSerializer = require(mp + 'mediawiki.WikitextSerializer.js').WikitextSerializer,
TemplateRequest = require(mp + 'mediawiki.ApiRequest.js').TemplateRequest;
// Parser environment shared by every request handler in this file. Handlers
// assign env.pageName (and some assign env.wgScriptPath) per request, so this
// object is mutable global state.
// NOTE(review): concurrent requests overwrite each other's pageName /
// wgScriptPath on this single instance — confirm whether that is acceptable.
var env = new ParserEnv( {
// fetch templates from enwiki for now..
wgScript: 'http://en.wikipedia.org/w',
// stay within the 'proxied' content, so that we can click around
// (the round-trip handler overrides this with '/_roundtrip')
wgScriptPath: '', //http://en.wikipedia.org/wiki',
wgScriptExtension: '.php',
// XXX: add options for this!
wgUploadPath: 'http://upload.wikimedia.org/wikipedia/commons',
fetchTemplates: true,
// enable/disable debug output using this switch
debug: false,
trace: false,
// presumably a cap on expansion/recursion depth — TODO confirm against
// MWParserEnvironment
maxDepth: 40
} );
// Factory bound to the shared env; each handler below asks it for a fresh
// 'text/x-mediawiki/full' pipeline per request instead of reusing one parser.
var parserPipelineFactory = new ParserPipelineFactory( env );
//var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' );
// The HTTP application that all routes below are registered on and that is
// exported at the end of the file.
var app = express.createServer();
// The POST handlers below read the submitted form field from req.body.content,
// which relies on this body-parsing middleware being installed.
app.use(express.bodyParser());
/**
 * Landing page: short usage notes plus links to the experimental tools
 * (/_wikitext/, /_html/ and /_roundtrip/).
 *
 * The markup is emitted in several chunks; every element opened here is also
 * closed here so browsers do not have to guess at the structure.
 */
app.get('/', function(req, res){
	// Same content type as every other HTML-producing handler in this file.
	res.setHeader('Content-Type', 'text/html; charset=UTF-8');
	res.write('<body><h3>Welcome to the alpha test web service for the ' +
		'<a href="http://www.mediawiki.org/wiki/Parsoid">Parsoid project</a>.</h3>');
	// A <ul> cannot live inside a <p>, so the paragraph is closed first.
	res.write( '<p>Usage:</p><ul><li>GET /title for the DOM. ' +
		'Example: <strong><a href="/Main_Page">Main Page</a></strong></li>');
	res.write('<li>POST a DOM as parameter "content" to /title for the wikitext</li></ul>');
	res.write('<p>There are also some tools for experiments:</p>' +
		'<ul><li><strong><a href="/_wikitext/">WikiText -&gt; HTML DOM form</a></strong></li>' +
		'<li><strong><a href="/_html/">HTML DOM -&gt; WikiText form</a></strong></li>');
	res.end('<li>Round-trip testing arbitrary pages: ' +
		'<strong><a href="/_roundtrip/Help:Magic">/_roundtrip/Help:Magic</a></strong></li></ul></body>');
});
/**
 * Escape the characters that are significant in HTML text and attribute
 * values: & < > " and '.
 *
 * Non-string input is converted with String() first, so callers can pass an
 * Error object directly; null and undefined yield the empty string.
 *
 * @param {*} s Value to escape.
 * @returns {string} The escaped text.
 */
var htmlSpecialChars = function ( s ) {
	var text = ( s === null || s === undefined ) ? '' : String( s );
	// '&' must be replaced first, otherwise the entities produced by the
	// later replacements would themselves be escaped again.
	return text.replace(/&/g,'&amp;')
		.replace(/</g,'&lt;')
		.replace(/>/g,'&gt;')
		.replace(/"/g,'&quot;')
		.replace(/'/g,'&#039;');
};
/**
 * Write a self-posting form containing a single "content" textarea to the
 * response. The form has no action attribute, so it posts back to the URL it
 * was served from.
 *
 * @param {Object} res Response-like object; only res.write() is used.
 * @param {string} [content] Initial textarea text; it is HTML-escaped before
 *   being written. Falsy values leave the textarea empty.
 */
var textarea = function ( res, content ) {
	var FORM_OPEN = '<form method=POST><textarea name="content" cols=90 rows=9>',
		FORM_CLOSE = '</textarea><br><input type="submit"></form>';

	res.write( FORM_OPEN );
	// Escaping happens after the opening markup has been written, and an
	// empty result is still written as its own (empty) chunk.
	var escaped = content ? htmlSpecialChars( content ) : '';
	res.write( escaped || '' );
	res.write( FORM_CLOSE );
};
/**
 * Manual testing, HTML DOM -> wikitext direction: GET serves the empty input
 * form. Everything after "/_html/" in the URL becomes the current page name.
 */
app.get(/\/_html\/(.*)/, function(req, res){
	var pageTitle = req.params[0];

	env.pageName = pageTitle;
	res.setHeader('Content-Type', 'text/html; charset=UTF-8');

	// A label followed by an empty textarea form.
	res.write( 'Your HTML DOM:' );
	textarea( res );
	res.end('');
});
/**
 * Manual testing, HTML DOM -> wikitext direction: POST handler.
 *
 * Parses the submitted "content" field as the body of an HTML document,
 * serializes the resulting <body> node to wikitext, and writes the escaped
 * wikitext in a <pre> followed by the form pre-filled with the submitted HTML.
 */
app.post(/\/_html\/(.*)/, function(req, res){
	// A POST without a "content" field used to be concatenated into the
	// wrapper below as the literal text "undefined"; treat it as empty input.
	var content = ( req.body && req.body.content ) || '';

	env.pageName = req.params[0];
	res.setHeader('Content-Type', 'text/html; charset=UTF-8');
	var p = new html5.Parser();
	p.parse( '<html><body>' + content + '</body></html>' );
	res.write('<pre style="background-color: #efefef">');
	new WikitextSerializer({env: env}).serializeDOM(
		// childNodes[0].childNodes[1] is taken to be <html> -> <body>
		// (presumably <head> sits at index 0 — confirm against the html5
		// tree builder).
		p.tree.document.childNodes[0].childNodes[1],
		function( c ) {
			// Serializer output is wikitext; escape it for display as HTML.
			res.write( htmlSpecialChars( c ) );
		});
	res.write('</pre>');
	res.write( "<hr>Your HTML DOM:" );
	textarea( res, content );
	res.end('');
});
/**
 * Manual testing, wikitext -> HTML DOM direction: GET serves the empty input
 * form. Everything after "/_wikitext/" in the URL becomes the current page
 * name.
 */
app.get(/\/_wikitext\/(.*)/, function(req, res){
	var pageTitle = req.params[0];

	env.pageName = pageTitle;
	res.setHeader('Content-Type', 'text/html; charset=UTF-8');

	// A label followed by an empty textarea form.
	res.write( 'Your wikitext:' );
	textarea( res );
	res.end('');
});
/**
 * Manual testing, wikitext -> HTML DOM direction: POST handler.
 *
 * Feeds the submitted "content" field through a fresh parser pipeline. When
 * the pipeline emits 'document', the rendered body HTML is written, followed
 * by the form pre-filled with the submitted wikitext.
 */
app.post(/\/_wikitext\/(.*)/, function(req, res){
	// Treat a missing "content" field as empty input rather than handing
	// undefined to the parser.
	var content = ( req.body && req.body.content ) || '';

	env.pageName = req.params[0];
	res.setHeader('Content-Type', 'text/html; charset=UTF-8');
	var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' );
	parser.on('document', function ( document ) {
		res.write(document.body.innerHTML);
		res.write( "<hr>Your wikitext:" );
		textarea( res, content );
		res.end('');
	});
	try {
		console.log('starting parsing of ' + req.params[0]);
		// FIXME: This does not handle includes or templates correctly
		parser.process( content );
	} catch (e) {
		console.log( e );
		// The response only accepts strings/Buffers, so the error is
		// stringified (and escaped, since it may echo user input). Using
		// end() rather than write() ensures the request does not hang.
		res.end( htmlSpecialChars( String( e ) ) );
	}
});
/**
 * Refine a line-based diff with word-level detail.
 *
 * Word-based diffing is practically unusable on inputs larger than about 5k
 * bytes, so it is applied only to individual changed chunks of the cheaper
 * line-based diff: whenever an 'added' part is immediately followed by a
 * 'removed' part, the pair is replaced by jsDiff.diffWords( removed, added ).
 * Every other part is passed through in its original order.
 *
 * @param {Array} diff Parts with a `value` and optional `added` / `removed`
 *   flags.
 * @returns {Array} New array of parts; the input array is not modified.
 */
var refineDiff = function( diff ) {
	var refined = [],
		// The most recent 'added' part, held back until we know whether a
		// 'removed' part follows it directly.
		heldAddition = null;

	var releaseHeld = function () {
		if ( heldAddition ) {
			refined.push( heldAddition );
			heldAddition = null;
		}
	};

	for ( var idx = 0, count = diff.length; idx < count; idx++ ) {
		var part = diff[idx];

		if ( part.added ) {
			// Two additions in a row: the earlier one has no partner.
			releaseHeld();
			heldAddition = part;
			continue;
		}

		if ( part.removed && heldAddition ) {
			// added + removed pair: substitute the word-level diff.
			Array.prototype.push.apply(
				refined,
				jsDiff.diffWords( part.value, heldAddition.value )
			);
			heldAddition = null;
			continue;
		}

		// Unchanged part, or a removal with nothing held: emit as-is after
		// any addition that was still waiting.
		releaseHeld();
		refined.push( part );
	}

	releaseHeld();
	return refined;
};
/**
 * Round-trip article testing.
 *
 * Fetches the source of the page named after "/_roundtrip/", parses it to an
 * HTML DOM, serializes that DOM back to wikitext, and responds with three
 * sections: the rendered HTML, the round-tripped wikitext, and a diff between
 * the round-tripped and the original wikitext.
 */
app.get(/\/_roundtrip\/(.*)/, function(req, res){
	env.pageName = req.params[0];
	env.wgScriptPath = '/_roundtrip';

	if ( env.pageName === 'favicon.ico' ) {
		res.end( 'no favicon yet..' );
		return;
	}

	var target = env.resolveTitle( env.normalizeTitle( env.pageName ), '' );

	console.log('retrieving ' + req.params[0]);
	var tpr = new TemplateRequest( env, target );
	tpr.once('src', function ( src ) {
		var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' );
		parser.on('document', function ( document ) {
			res.write('<html><head><style>del { background: #ff9191; text-decoration: none; } ins { background: #99ff7e; text-decoration: none; } </style></head><body>');
			res.write( '<h2>Wikitext parsed to HTML DOM</h2><hr>' );
			res.write(document.body.innerHTML + '<hr>');
			res.write( '<h2>HTML DOM converted back to Wikitext</h2><hr>' );
			var out = new WikitextSerializer({env: env}).serializeDOM( document.body );
			res.write('<pre>' + htmlSpecialChars( out ) + '</pre><hr>');
			res.write( '<h2>Diff between original Wikitext (green) and round-tripped wikitext (red)</h2><hr>' );
			var patch;
			if ( src.length < 4000 ) {
				// Use word-based diff for small articles
				patch = jsDiff.convertChangesToXML( jsDiff.diffWords( out, src ) );
			} else {
				// Word diffing is too slow on large inputs: diff by line
				// first, then refine only the changed chunks.
				patch = jsDiff.convertChangesToXML( refineDiff( jsDiff.diffLines( out, src ) ) );
			}
			// Close everything that was opened so the page is well-formed.
			res.end( '<pre>' + patch + '</pre></body></html>' );
		});
		try {
			res.setHeader('Content-Type', 'text/html; charset=UTF-8');
			console.log('starting parsing of ' + req.params[0]);
			// FIXME: This does not handle includes or templates correctly
			parser.process( src );
		} catch (e) {
			console.log( e );
			// The response only accepts strings/Buffers, so the error is
			// stringified, and escaped since it may echo the requested title.
			res.end( htmlSpecialChars( String( e ) ) );
		}
	});
});
/**
 * Regular article parsing.
 *
 * Catch-all GET route: the whole path after the leading slash is used as the
 * page name. The page is parsed by processing a `{{:Title}}` transclusion of
 * it, and the resulting body HTML is sent as the response.
 */
app.get(/\/(.*)/, function(req, res){
	env.pageName = req.params[0];
	env.wgScriptPath = '';
	if ( env.pageName === 'favicon.ico' ) {
		res.end( 'no favicon yet..');
		return;
	}
	var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' );
	parser.on('document', function ( document ) {
		res.end(document.body.innerHTML);
	});
	try {
		res.setHeader('Content-Type', 'text/html; charset=UTF-8');
		console.log('starting parsing of ' + req.params[0]);
		// FIXME: This does not handle includes or templates correctly
		parser.process('{{:' + req.params[0] + '}}' );
	} catch (e) {
		console.log( e );
		// The response only accepts strings/Buffers, so the error is
		// stringified, and escaped since it may echo the requested title.
		// Nothing may be written after end(), and a GET carries no form
		// content, so no textarea is rendered here.
		res.end( htmlSpecialChars( String( e ) ) );
	}
});
/**
 * Regular article serialization using POST.
 *
 * Catch-all POST route: parses the posted "content" field as HTML and streams
 * the wikitext serialization of the document's second top-level child node
 * straight into the response.
 */
app.post(/\/(.*)/, function(req, res){
	var pageTitle = req.params[0],
		htmlParser = new html5.Parser(),
		// Forward every serializer callback invocation, with all of its
		// arguments, to res.write on the response object.
		emitChunk = function () {
			return res.write.apply( res, arguments );
		};

	env.pageName = pageTitle;
	env.wgScriptPath = '';
	res.setHeader('Content-Type', 'text/x-mediawiki; charset=UTF-8');

	htmlParser.parse( req.body.content );
	var serializer = new WikitextSerializer({env: env});
	serializer.serializeDOM(
		htmlParser.tree.document.childNodes[0].childNodes[1],
		emitChunk
	);
	res.end('');
});
// Expose the configured express app. No listen() call is visible in this
// file, so starting the server is presumably left to whoever requires it.
module.exports = app;