2012-05-23 10:35:00 +00:00
/**
 * A very basic parser / serializer web service.
 */
2012-06-07 09:50:40 +00:00
/**
 * Config section
 *
 * Could move this to a separate file later.
 */

// Interwiki prefix used by the article routes when the request URL does not
// name one (it is looked up in env.interwikiMap).

// default to en.wikipedia.org
//var defaultInterwiki = 'en';

// alternative: default to www.mediawiki.org
// Use this in production for now
var defaultInterwiki = 'mw';

// for development: default to localhost
//var defaultInterwiki = 'localhost';

/**
 * End config section
 */
// global includes
2012-05-23 10:35:00 +00:00
var express = require ( 'express' ) ,
2012-05-30 18:15:08 +00:00
jsDiff = require ( 'diff' ) ,
2012-05-23 10:35:00 +00:00
html5 = require ( 'html5' ) ;
2012-06-07 09:50:40 +00:00
// local includes
2012-05-23 10:35:00 +00:00
var mp = '../modules/parser/' ;
var ParserPipelineFactory = require ( mp + 'mediawiki.parser.js' ) . ParserPipelineFactory ,
ParserEnv = require ( mp + 'mediawiki.parser.environment.js' ) . MWParserEnvironment ,
2012-05-30 18:15:08 +00:00
WikitextSerializer = require ( mp + 'mediawiki.WikitextSerializer.js' ) . WikitextSerializer ,
TemplateRequest = require ( mp + 'mediawiki.ApiRequest.js' ) . TemplateRequest ;
2012-05-23 10:35:00 +00:00
// Parser environment shared by every route in this file.
// NOTE(review): the request handlers assign pageName / wgScriptPath /
// wgScript on this single object, so overlapping requests can observe each
// other's settings -- confirm whether a per-request environment is needed.
var env = new ParserEnv({
    // stay within the 'proxied' content, so that we can click around
    wgScriptPath: '/', //http://en.wikipedia.org/wiki',
    wgScriptExtension: '.php',
    // XXX: add options for this!
    wgUploadPath: 'http://upload.wikimedia.org/wikipedia/commons',
    fetchTemplates: true,
    // enable/disable debug output using this switch
    debug: false,
    trace: false,
    maxDepth: 40
});

// add mediawiki.org
env.addInterwiki('mw', 'http://www.mediawiki.org/w');

// For development:
env.addInterwiki('localhost', 'http://localhost/w');

// Factory used by the handlers to build a fresh pipeline per request.
var parserPipelineFactory = new ParserPipelineFactory(env);
//var parser = parserPipelineFactory.makePipeline( 'text/x-mediawiki/full' );
var app = express.createServer();
app.use(express.bodyParser());

/**
 * Landing page: short usage notes plus links to the experimental tools
 * served by the other routes in this file.
 */
app.get('/', function (req, res) {
    // Same content type as the other HTML-producing routes.
    res.setHeader('Content-Type', 'text/html; charset=UTF-8');
    res.write('<body><h3>Welcome to the alpha test web service for the ' +
        '<a href="http://www.mediawiki.org/wiki/Parsoid">Parsoid project</a>.</h3>');
    res.write('<p>Usage: <ul><li>GET /title for the DOM. ' +
        'Example: <strong><a href="/en:Main_Page">Main Page</a></strong>');
    res.write('<li>POST a DOM as parameter "content" to /title for the wikitext</ul>');
    res.write('<p>There are also some tools for experiments:<ul>');
    res.write('<li>Round-trip test pages from the English Wikipedia: ' +
        '<strong><a href="/_rt/en:Help:Magic">/_rt/Help:Magic</a></strong></li>');
    res.write('<li><strong><a href="/_rtform/">WikiText -> HTML DOM -> WikiText round-trip form</a></strong></li>');
    res.write('<li><strong><a href="/_wikitext/">WikiText -> HTML DOM form</a></strong></li>' +
        '<li><strong><a href="/_html/">HTML DOM -> WikiText form</a></strong></li>');
    res.write('</ul>');
    // The query-string ampersands are written as &amp; so that "&section"
    // cannot be read as the "&sect" character reference.
    res.end('<p>We are currently focusing on round-tripping of basic formatting like inline/bold, headings, lists, tables and links. Templates, citations and thumbnails are not expected to round-trip properly yet. <strong>Please report issues you see at <a href="http://www.mediawiki.org/w/index.php?title=Talk:Parsoid/Todo&amp;action=edit&amp;section=new">:mw:Talk:Parsoid/Todo</a>. Thanks!</strong></p>');
});
2012-05-23 15:50:35 +00:00
/**
 * Escape the characters that are significant in HTML text and attribute
 * values, so that arbitrary text can be embedded in a page.
 *
 * @param {string} s Text to escape.
 * @returns {string} `s` with &, <, >, " and ' replaced by character
 *   references.
 */
var htmlSpecialChars = function (s) {
    // '&' must be handled first; otherwise the ampersands introduced by the
    // later replacements would themselves be escaped.
    return s.replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#039;');
};
2012-05-23 10:35:00 +00:00
/**
 * Write a POST form containing a single "content" textarea to the response.
 *
 * @param {Object} res Response object; only its write() method is used.
 * @param {string} [content] Text to pre-fill the textarea with. It is
 *   HTML-escaped before being written; a missing or empty value yields an
 *   empty textarea.
 */
var textarea = function (res, content) {
    res.write('<form method=POST><textarea name="content" cols=90 rows=9>');
    res.write(content ? htmlSpecialChars(content) : '');
    res.write('</textarea><br><input type="submit"></form>');
};
/**
 * Form-based HTML DOM -> wikitext interface for manual testing
 *
 * GET shows an empty form; the matching POST handler does the conversion.
 */
app.get(/\/_html\/(.*)/, function (req, res) {
    // Whatever follows /_html/ in the URL becomes the current page name.
    env.pageName = req.params[0];
    res.setHeader('Content-Type', 'text/html; charset=UTF-8');
    res.write("Your HTML DOM:");
    textarea(res);
    res.end('');
});
/**
 * Serialize the HTML posted in the "content" field to wikitext and show the
 * result above a form pre-filled with the submitted HTML.
 */
app.post(/\/_html\/(.*)/, function (req, res) {
    env.pageName = req.params[0];
    res.setHeader('Content-Type', 'text/html; charset=UTF-8');
    // A request without a "content" field is treated as empty input instead
    // of throwing. Carriage returns are dropped (\r\n -> \n).
    var html = String((req.body && req.body.content) || '').replace(/\r/g, '');
    var p = new html5.Parser();
    p.parse('<html><body>' + html + '</body></html>');
    res.write('<pre style="background-color: #efefef">');
    new WikitextSerializer({env: env}).serializeDOM(
        // second child of the root element: the <body> built above
        p.tree.document.childNodes[0].childNodes[1],
        function (chunk) {
            res.write(htmlSpecialChars(chunk));
        });
    res.write('</pre>');
    res.write("<hr>Your HTML DOM:");
    textarea(res, html);
    res.end('');
});
/**
 * Form-based wikitext -> HTML DOM interface for manual testing
 *
 * GET shows an empty form; the matching POST handler does the conversion.
 */
app.get(/\/_wikitext\/(.*)/, function (req, res) {
    // Whatever follows /_wikitext/ in the URL becomes the current page name.
    env.pageName = req.params[0];
    res.setHeader('Content-Type', 'text/html; charset=UTF-8');
    res.write("Your wikitext:");
    textarea(res);
    res.end('');
});
/**
 * Parse the wikitext posted in the "content" field and show the resulting
 * HTML above a form pre-filled with the submitted wikitext.
 */
app.post(/\/_wikitext\/(.*)/, function (req, res) {
    env.pageName = req.params[0];
    res.setHeader('Content-Type', 'text/html; charset=UTF-8');
    // A request without a "content" field is treated as empty input instead
    // of throwing. Carriage returns are dropped (\r\n -> \n).
    var wikitext = String((req.body && req.body.content) || '').replace(/\r/g, '');
    var parser = parserPipelineFactory.makePipeline('text/x-mediawiki/full');
    parser.on('document', function (document) {
        res.write(document.body.innerHTML);
        res.write("<hr>Your wikitext:");
        textarea(res, wikitext);
        res.end('');
    });
    try {
        console.log('starting parsing of ' + req.params[0]);
        // FIXME: This does not handle includes or templates correctly
        parser.process(wikitext);
    } catch (e) {
        console.log(e);
        // The response body has to be a string (not the Error object), and
        // the response must be finished so the client is not left waiting.
        res.end(htmlSpecialChars(String(e)));
    }
});
2012-06-03 21:46:57 +00:00
/**
 * Perform word-based diff on a line-based diff. The word-based algorithm is
 * practically unusable for inputs > 5k bytes, so we only perform it on the
 * output of the more efficient line-based diff.
 *
 * An added chunk that is immediately followed by a removed chunk is replaced
 * by the merged word diff of the pair; every other chunk is passed through
 * unchanged and in order.
 *
 * @param {Array} diff Change objects carrying a `value` string and optional
 *   `added` / `removed` flags.
 * @returns {Array} The refined list of change objects.
 */
var refineDiff = function (diff) {
    // Attempt to accumulate consecutive add-delete pairs
    // with short text separating them.
    //
    // This is equivalent to the <b><i> ... </i></b> minimization
    // to expand range of <b> and <i> tags, except there is no optimal
    // solution except as determined by heuristics: an unchanged segment is
    // absorbed when it is shorter than 4 characters or contains no
    // whitespace, and both an insertion and a deletion are pending.
    //
    // Note that the buffered change objects are extended in place.
    function mergeConsecutiveSegments(wordDiffs) {
        var n = wordDiffs.length;
        var currIns = null, currDel = null;
        var newDiffs = [];
        for (var i = 0; i < n; i++) {
            var d = wordDiffs[i];
            var dVal = d.value;
            if (d.added) {
                // Attempt to accumulate
                if (currIns === null) {
                    currIns = d;
                } else {
                    currIns.value = currIns.value + dVal;
                }
            } else if (d.removed) {
                // Attempt to accumulate
                if (currDel === null) {
                    currDel = d;
                } else {
                    currDel.value = currDel.value + dVal;
                }
            } else if (((dVal.length < 4) || !dVal.match(/\s/)) && currIns && currDel) {
                // Short unchanged text: fold it into both pending segments
                currIns.value = currIns.value + dVal;
                currDel.value = currDel.value + dVal;
            } else {
                // Accumulation ends. Purge!
                if (currIns !== null) {
                    newDiffs.push(currIns);
                    currIns = null;
                }
                if (currDel !== null) {
                    newDiffs.push(currDel);
                    currDel = null;
                }
                newDiffs.push(d);
            }
        }

        // Purge buffered diffs
        if (currIns !== null) {
            newDiffs.push(currIns);
        }
        if (currDel !== null) {
            newDiffs.push(currDel);
        }
        return newDiffs;
    }

    // `added` buffers the most recent added chunk until we know whether a
    // removed chunk follows it directly.
    var added = null,
        out = [];
    for (var i = 0, l = diff.length; i < l; i++) {
        var d = diff[i];
        if (d.added) {
            if (added) {
                out.push(added);
            }
            added = d;
        } else if (d.removed) {
            if (added) {
                // added + removed pair: refine to word granularity
                var fineDiff = jsDiff.diffWords(d.value, added.value);
                fineDiff = mergeConsecutiveSegments(fineDiff);
                out.push.apply(out, fineDiff);
                added = null;
            } else {
                out.push(d);
            }
        } else {
            if (added) {
                out.push(added);
                added = null;
            }
            out.push(d);
        }
    }
    // Flush a trailing added chunk
    if (added) {
        out.push(added);
    }
    return out;
};
2012-06-07 11:37:40 +00:00
/**
 * Write a round-trip report to the response: the parsed HTML, the wikitext
 * serialized back from that DOM, a line diff between the original and the
 * round-tripped wikitext, and a 'report issue' link. Ends the response.
 *
 * @param {Object} req Request; only req.url is read (for the report link).
 * @param {Object} res Response to write the report to.
 * @param {string} src Original wikitext.
 * @param {Object} document Parsed document; document.body is rendered and
 *   serialized.
 */
var roundTripDiff = function (req, res, src, document) {
    res.write('<html><head><style>ins { background: #ff9191; text-decoration: none; } del { background: #99ff7e; text-decoration: none }; </style></head><body>');
    res.write('<h2>Wikitext parsed to HTML DOM</h2><hr>');
    res.write(document.body.innerHTML + '<hr>');
    res.write('<h2>HTML DOM converted back to Wikitext</h2><hr>');
    var out = new WikitextSerializer({env: env}).serializeDOM(document.body);
    res.write('<pre>' + htmlSpecialChars(out) + '</pre><hr>\n');
    res.write('<h2>Diff between original Wikitext (green) and round-tripped wikitext (red)</h2><hr>\n');

    // Put a space on every empty line of both texts before diffing.
    var paddedSrc = src.replace(/\n(?=\n)/g, '\n ');
    var paddedOut = out.replace(/\n(?=\n)/g, '\n ');
    //console.log(JSON.stringify( jsDiff.diffLines( out, src ) ));
    var patch = jsDiff.convertChangesToXML(jsDiff.diffLines(paddedSrc, paddedOut));
    //patch = jsDiff.convertChangesToXML( refineDiff( jsDiff.diffLines( src, out ) ) );
    res.write('<pre>' + patch + '</pre>');

    // Add a 'report issue' link. The ampersands are written as &amp; so
    // "&section" cannot be read as the "&sect" character reference, and the
    // page URL is percent-encoded because it is a query parameter value
    // inside an attribute.
    res.end('<hr><h2>' +
        '<a style="color: red" ' +
        'href="http://www.mediawiki.org/w/index.php?title=Talk:Parsoid/Todo' +
        '&amp;action=edit&amp;section=new&amp;preloadtitle=' +
        'Issue%20on%20' +
        encodeURIComponent('http://parsoid.wmflabs.org' + req.url) + '">' +
        'Report a parser issue in this page</a> at ' +
        '<a href="http://www.mediawiki.org/wiki/Talk:Parsoid/Todo">' +
        '[[:mw:Talk:Parsoid/Todo]]</a></h2><hr>' +
        '</body></html>');
};
2012-06-07 11:37:40 +00:00
/**
 * Run `src` through a fresh full wikitext pipeline and hand the resulting
 * document to `cb`.
 *
 * @param {Object} req Request, passed through to `cb`.
 * @param {Object} res Response; its Content-Type is set to HTML here.
 * @param {Function} cb Called as cb(req, res, src, document) when the
 *   pipeline emits 'document'.
 * @param {string} src Wikitext to parse.
 */
var parse = function (req, res, cb, src) {
    var parser = parserPipelineFactory.makePipeline('text/x-mediawiki/full');
    parser.on('document', cb.bind(null, req, res, src));
    try {
        res.setHeader('Content-Type', 'text/html; charset=UTF-8');
        parser.process(src);
    } catch (e) {
        console.log(e);
        // The response body has to be a string, not the Error object; it is
        // escaped because the response is served as HTML.
        res.statusCode = 500;
        res.end(htmlSpecialChars(String(e)));
    }
};
2012-05-30 18:15:08 +00:00
/**
 * Round-trip article testing
 *
 * URL form: /_rt/[interwiki:]Title. The title's source is requested through
 * TemplateRequest, parsed, and rendered as a round-trip report by
 * roundTripDiff.
 */
app.get(new RegExp('/_rt/(?:(?:(?:' + env.interwikiRegexp + '):+)?(' + env.interwikiRegexp + '):)?(.*)'), function (req, res) {
    // params[0]: optional interwiki prefix, params[1]: page title
    if (req.params[1] === 'favicon.ico') {
        // Answer before touching the shared environment.
        res.end('no favicon yet..');
        return;
    }

    env.pageName = req.params[1];
    if (req.params[0]) {
        env.wgScriptPath = '/_rt/' + req.params[0] + ':';
        env.wgScript = env.interwikiMap[req.params[0]];
    } else {
        env.wgScriptPath = '/_rt/';
        env.wgScript = env.interwikiMap[defaultInterwiki];
    }

    var target = env.resolveTitle(env.normalizeTitle(env.pageName), '');

    console.log('starting parsing of ' + target);
    var tpr = new TemplateRequest(env, target);
    // presumably 'src' is emitted with the page source as its first
    // argument, which becomes parse()'s `src` -- confirm against ApiRequest
    tpr.once('src', parse.bind(null, req, res, roundTripDiff));
});
2012-06-17 15:36:39 +00:00
/**
 * Round-trip article testing with newline stripping for editor-created HTML
 * simulation
 *
 * URL form: /_rtve/[interwiki:]Title. Same flow as /_rt/, except that the
 * parsed HTML has its newlines removed and is re-parsed before the
 * round-trip report is produced.
 */
app.get(new RegExp('/_rtve/(?:(?:(?:' + env.interwikiRegexp + '):+)?(' + env.interwikiRegexp + '):)?(.*)'), function (req, res) {
    // params[0]: optional interwiki prefix, params[1]: page title
    if (req.params[1] === 'favicon.ico') {
        // Answer before touching the shared environment.
        res.end('no favicon yet..');
        return;
    }

    env.pageName = req.params[1];
    if (req.params[0]) {
        env.wgScriptPath = '/_rtve/' + req.params[0] + ':';
        env.wgScript = env.interwikiMap[req.params[0]];
    } else {
        env.wgScriptPath = '/_rtve/';
        env.wgScript = env.interwikiMap[defaultInterwiki];
    }

    var target = env.resolveTitle(env.normalizeTitle(env.pageName), '');

    console.log('starting parsing of ' + target);
    var tpr = new TemplateRequest(env, target),
        cb = function (req, res, src, document) {
            // strip newlines from the html
            // NOTE(review): relies on the document object exposing
            // innerHTML (the other routes read document.body.innerHTML).
            var html = document.innerHTML.replace(/[\r\n]/g, ''),
                p = new html5.Parser();
            p.parse(html);
            var newDocument = p.tree.document;
            // monkey-patch document.body reference for now..
            newDocument.body = newDocument.childNodes[0].childNodes[1];
            roundTripDiff(req, res, src, newDocument);
        };

    // presumably 'src' is emitted with the page source as its first
    // argument, which becomes parse()'s `src` -- confirm against ApiRequest
    tpr.once('src', parse.bind(null, req, res, cb));
});
2012-06-04 08:49:59 +00:00
/**
 * Form-based round-tripping for manual testing
 */
// GET: show the empty wikitext form; the POST handler for the same path
// processes the submission.
app.get(/\/_rtform\/(.*)/, function showRoundTripForm(request, response) {
    var requestedTitle = request.params[0];
    env.pageName = requestedTitle;
    response.setHeader('Content-Type', 'text/html; charset=UTF-8');
    response.write('Your wikitext:');
    textarea(response);
    response.end('');
});
// POST: parse the submitted wikitext and render the round-trip report.
app.post(/\/_rtform\/(.*)/, function (req, res) {
    env.pageName = req.params[0];
    res.setHeader('Content-Type', 'text/html; charset=UTF-8');
    // A request without a "content" field is treated as empty input instead
    // of throwing.
    var content = String((req.body && req.body.content) || '');
    // we don't care about \r, and normalize everything to \n
    parse(req, res, roundTripDiff, content.replace(/\r/g, ''));
});
2012-05-23 10:35:00 +00:00
/**
 * Regular article parsing
 *
 * URL form: /[interwiki:]Title. The title's source is requested through
 * TemplateRequest, parsed, and the resulting body HTML is returned.
 */
app.get(new RegExp('/(?:(?:(?:' + env.interwikiRegexp + '):+)?(' + env.interwikiRegexp + '):)?(.*)'), function (req, res) {
    // params[0]: optional interwiki prefix, params[1]: page title
    if (req.params[1] === 'favicon.ico') {
        // Answer before touching the shared environment.
        res.end('no favicon yet..');
        return;
    }

    env.pageName = req.params[1];
    if (req.params[0]) {
        env.wgScriptPath = '/' + req.params[0] + ':';
        env.wgScript = env.interwikiMap[req.params[0]];
    } else {
        env.wgScriptPath = '/';
        env.wgScript = env.interwikiMap[defaultInterwiki];
    }

    var target = env.resolveTitle(env.normalizeTitle(env.pageName), '');
    console.log('starting parsing of ' + target);
    var tpr = new TemplateRequest(env, target);
    // presumably 'src' is emitted with the page source as its first
    // argument, which becomes parse()'s `src` -- confirm against ApiRequest
    tpr.once('src', parse.bind(null, req, res, function (req, res, src, document) {
        res.end(document.body.innerHTML);
    }));
});
/**
 * Regular article serialization using POST
 *
 * The HTML posted in the "content" field is parsed and its wikitext
 * serialization is streamed back as text/x-mediawiki.
 */
app.post(/\/(.*)/, function (req, res) {
    env.pageName = req.params[0];
    env.wgScriptPath = '/';
    res.setHeader('Content-Type', 'text/x-mediawiki; charset=UTF-8');
    // A request without a "content" field is treated as empty input instead
    // of handing undefined to the HTML parser.
    var html = String((req.body && req.body.content) || '');
    var p = new html5.Parser();
    p.parse(html);
    new WikitextSerializer({env: env}).serializeDOM(
        // second child of the root element, used as the body to serialize
        p.tree.document.childNodes[0].childNodes[1],
        res.write.bind(res));
    res.end('');
});
module . exports = app ;