Pluck a few low-hanging fruit in external link tokenization, and add a simple localurl parser function implementation. 230 parser tests now passing.
This commit is contained in:
Gabriel Wicke 2012-02-07 10:28:23 +00:00
parent cf8b7bf45d
commit 1f6db903e9
Notes: Gabriel Wicke 2012-02-27 16:40:01 +00:00
5 changed files with 27 additions and 6 deletions

View file

@ -212,6 +212,18 @@ ParserFunctions.prototype['pf_#expr'] = function ( target, argList, argDict ) {
return [ res.toString() ];
};
/**
 * Simple implementation of the {{localurl:}} parser function.
 *
 * Builds '<wgScriptPath>/index<wgScriptExtension>?title=<target>' and, when
 * query arguments are supplied, appends them joined with '&'.
 *
 * @param {string} target Page title, inserted verbatim after '?title='.
 * @param {Array} argList List of objects with 'k' and 'v' members. An entry
 *   with an empty 'v' contributes only its 'k'; otherwise it contributes
 *   'k=v'.
 * @param {Object} argDict Unused here.
 * @returns {string} The assembled local URL.
 *
 * NOTE(review): the pf_#expr handler just above returns an array while this
 * one returns a plain string -- confirm callers accept both.
 * NOTE(review): neither the title nor the arguments are URL-encoded here.
 */
ParserFunctions.prototype['pf_localurl'] = function ( target, argList, argDict ) {
	var env = this.manager.env,
		url = env.wgScriptPath + '/index' + env.wgScriptExtension +
			'?title=' + target,
		query = ( argList || [] ).map( function ( kv ) {
			// A bare argument (empty value) is passed through as-is.
			return kv.v !== '' ? kv.k + '=' + kv.v : kv.k;
		} );

	// Only add the separator when there is a query string to append, so a
	// call without arguments does not end in a dangling '&'.
	return query.length ? url + '&' + query.join( '&' ) : url;
};
/**

View file

@ -77,7 +77,7 @@ function ParserPipeline( env, inputType ) {
// Add token transformations..
new QuoteTransformer( this.tokenPostProcessor );
new PostExpandParagraphHandler( this.tokenPostProcessor );
//new Sanitizer( this.tokenPostProcessor );
new Sanitizer( this.tokenPostProcessor );
//var citeExtension = new Cite( this.tokenTransformer );

View file

@ -351,11 +351,11 @@ spaceless_preprocessor_text
}
link_preprocessor_text
= r:( t:[^'<~[{\n\r|!\]}\t &=]+ { return t.join(''); }
= r:( t:[^'<~[{\n\r|!\]}\t &="']+ { return t.join(''); }
/ directive
/ urlencoded_char
/ !inline_breaks no_punctuation_char
/ s:[.:,] !(space / eolf) { return s }
/ urlencoded_char
/ [&%] )+ {
return flatten_string ( r );
}
@ -687,7 +687,8 @@ comment_chars
urllink
= target:url {
= ! { return syntaxFlags['extlink'] }
target:url {
return [ new TagTk( 'a', [new KV('href', target)] )
, target
, new EndTagTk( 'a' )

View file

@ -59,6 +59,13 @@ testWhiteList["Table security: embedded pipes (http://lists.wikimedia.org/mailma
// Sanitizer, but UTF8 in link might actually be ok in HTML5
testWhiteList["External link containing double-single-quotes with no space separating the url from text in italics"] = "<p><a href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\"><i>La muerte de Casagemas</i> (1901) en el sitio de </a><a data-type=\"internal\" href=\"Museo Picasso (París)\">Museo Picasso</a>.</p>";
// plain percent sign is also valid in HTML5
testWhiteList["Bug 4781, 5267: %28, %29 in URL"] = "<p><a href=\"http://www.example.com/?title=Ben-Hur_(1959_film)\" data-sourcePos=\"0:53\">http://www.example.com/?title=Ben-Hur_(1959_film)</a></p>";
testWhiteList["External links: wiki links within external link (Bug 3695)"] = "<p><a href=\"http://example.com\" data-type=\"external\" data-sourcePos=\"0:54\"></a><a data-type=\"internal\" href=\"wikilink\">wikilink</a> embedded in ext link</p>";
testWhiteList["Bug 4781, 5267: %25 in URL"] = "<p><a href=\"http://www.example.com/?title=100%_Bran\" data-sourcePos=\"0:41\">http://www.example.com/?title=100%_Bran</a></p>";
if (typeof module == "object") {
module.exports.testWhiteList = testWhiteList;

View file

@ -199,7 +199,8 @@ function ParserTests () {
this.env = new MWParserEnvironment({
fetchTemplates: false,
debug: this.argv.debug,
trace: this.argv.trace
trace: this.argv.trace,
wgScriptPath: ''
});
}
@ -313,7 +314,7 @@ ParserTests.prototype.normalizeHTML = function (source) {
// general class and titles, typically on links
.replace(/(title|class|rel)="[^"]+"/g, '')
// strip red link markup, we do not check if a page exists yet
.replace(/\/index.php\?title=|&amp;action=edit&amp;redlink=1/g, '')
.replace(/\/index.php\?title=([^']+)&amp;action=edit&amp;redlink=1/g, '$1')
// the expected html has some extra space in tags, strip it
.replace(/<a +href/g, '<a href')
.replace(/" +>/g, '">');