Shorten data-mw-rt to data-mw and clean up whitelist

Instead of a proliferation of data-mw-* attributes, it should be easier to
stash all private / non-semantic round-trip information in a JSON object
stored in data-mw.

Change-Id: Id200a6a8789fa152f29ea530e5a24b6ee7b4b285
This commit is contained in:
Gabriel Wicke 2012-04-02 18:12:49 +02:00
parent 5ef3438ee5
commit f662690d02
4 changed files with 14 additions and 14 deletions

View file

@ -86,7 +86,7 @@ Sanitizer.prototype._stripIDNs = function ( host ) {
* Sanitize any tag.
*
* XXX: Make attribute sanitation reversible by storing round-trip info in
* token.dataAttribs object (which is serialized as JSON in a data-mw-rt
* token.dataAttribs object (which is serialized as JSON in a data-mw
* attribute in the DOM).
*/
Sanitizer.prototype.onAny = function ( token ) {

View file

@ -76,8 +76,8 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
}
token.attribs.push(
{
// 'Mediawiki round-trip' information
k: 'data-mw-rt',
// Mediawiki-specific round-trip / non-semantic information
k: 'data-mw',
v: JSON.stringify( token.dataAttribs )
} );
}

View file

@ -8,12 +8,12 @@ testWhiteList = {};
// formatting is identical
testWhiteList["Italics and bold"] = "<ul><li> plain</li><li> plain<i>italic</i>plain</li><li> plain<i>italic</i>plain<i>italic</i>plain</li><li> plain<b>bold</b>plain</li><li> plain<b>bold</b>plain<b>bold</b>plain</li><li> plain<i>italic</i>plain<b>bold</b>plain</li><li> plain<b>bold</b>plain<i>italic</i>plain</li><li> plain<i>italic<b>bold-italic</b>italic</i>plain</li><li> plain<b>bold<i>bold-italic</i>bold</b>plain</li><li> plain<i><b>bold-italic</b>italic</i>plain</li><li> plain<i><b>bold-italic</b></i><b>bold</b>plain</li><li> plain<i>italic<b>bold-italic</b></i>plain</li><li> plain<b>bold<i>bold-italic</i></b>plain</li><li> plain l'<i>italic</i>plain</li><li> plain l'<b>bold</b> plain</li></ul>";
testWhiteList["Bug 2702: Mismatched <i>, <b> and <a> tags are invalid"] = "<p><i><a href=\"http://example.com\">text</a></i><a href=\"http://example.com\" data-mw-sourcePos=\"30:61\"><b>text</b></a><i data-mw-sourcePos=\"62:106\">Something <a href=\"http://example.com\">in italic</a></i><i data-mw-sourcePos=\"107:164\">Something <a href=\"http://example.com\">mixed</a></i><a href=\"http://example.com\"><b>, even bold</b></a><i data-mw-sourcePos=\"165:204\"><b data-mw-sourcePos=\"165:204\">Now <a href=\"http://example.com\">both</a></b></i></p>";
testWhiteList["Bug 2702: Mismatched <i>, <b> and <a> tags are invalid"] = "<p><i><a href=\"http://example.com\">text</a></i><a href=\"http://example.com\"><b>text</b></a><i>Something <a href=\"http://example.com\">in italic</a></i><i>Something <a href=\"http://example.com\">mixed</a></i><a href=\"http://example.com\"><b>, even bold</b></a><i><b>Now <a href=\"http://example.com\">both</a></b></i></p>";
testWhiteList["Unclosed and unmatched quotes"] = "<p data-mw-sourcePos=\"0:66\"><i><b>Bold italic text </b>with bold deactivated<b> in between.</b></i></p><p><i><b>Bold italic text </b></i><b>with italic deactivated<i> in between.</i></b></p><p><b>Bold text..</b></p><p>..spanning two paragraphs (should not work).<b></b></p><p><b>Bold tag left open</b></p><p><i>Italic tag left open</i></p><p>Normal text.<!-- Unmatching number of opening, closing tags: -->\n</p><p><b>This year'</b>s election <i>should</i> beat <b>last year'</b>s.</p><p><i>Tom<b>s car is bigger than </b></i><b>Susan</b>s.</p>";
testWhiteList["Unclosed and unmatched quotes"] = "<p><i><b>Bold italic text </b>with bold deactivated<b> in between.</b></i></p><p><i><b>Bold italic text </b></i><b>with italic deactivated<i> in between.</i></b></p><p><b>Bold text..</b></p><p>..spanning two paragraphs (should not work).<b></b></p><p><b>Bold tag left open</b></p><p><i>Italic tag left open</i></p><p>Normal text.<!-- Unmatching number of opening, closing tags: -->\n</p><p><b>This year'</b>s election <i>should</i> beat <b>last year'</b>s.</p><p><i>Tom<b>s car is bigger than </b></i><b>Susan</b>s.</p>";
// The expected result for this test is really broken html.
testWhiteList["Link containing double-single-quotes '' in text embedded in italics (bug 4598 sanity check)"] = "<p data-mw-sourcePos=\"0:45\"><i>Some <a data-mw-type=\"internal\" href=\"/wiki/Link\">pretty </a></i><a data-mw-type=\"internal\" href=\"/wiki/Link\">italics<i> and stuff</i></a><i>!</i></p>";
testWhiteList["Link containing double-single-quotes '' in text embedded in italics (bug 4598 sanity check)"] = "<p><i>Some <a href=\"/wiki/Link\">pretty </a></i><a href=\"/wiki/Link\">italics<i> and stuff</i></a><i>!</i></p>";
testWhiteList["External link containing double-single-quotes in text embedded in italics (bug 4598 sanity check)"] = "<p><i>Some <a href=\"http://example.com/\">pretty </a></i><a href=\"http://example.com/\">italics<i> and stuff</i></a><i>!</i></p>";
@ -22,13 +22,13 @@ testWhiteList["5 quotes, code coverage +1 line"] = "<p>'<i></i></p>";
// The comment in the test already suggests this result as correct, but
// supplies the old result without preformatting.
testWhiteList["Bug 6200: Preformatted in <blockquote>"] = "<blockquote data-mw-sourcePos=\"0:12\"><pre>\nBlah</pre></blockquote>";
testWhiteList["Bug 6200: Preformatted in <blockquote>"] = "<blockquote><pre>\nBlah</pre></blockquote>";
// empty table tags / with only a caption are legal in HTML5.
testWhiteList["A table with no data."] = "<table></table>";
testWhiteList["A table with nothing but a caption"] = "<table><caption> caption</caption></table>";
testWhiteList["Fuzz testing: Parser22"] = "<p data-mw-sourcePos=\"0:23\"><a href=\"http://===r:::https://b\">http://===r:::https://b</a></p><table></table>";
testWhiteList["Fuzz testing: Parser22"] = "<p><a href=\"http://===r:::https://b\">http://===r:::https://b</a></p><table></table>";
// MediaWiki changes the order of attributes in tables, ignore that
testWhiteList["Multiplication table"] = "<table border=\"1\" cellpadding=\"2\"><caption>Multiplication table</caption><tbody><tr><th> × </th><th> 1 </th><th> 2 </th><th> 3</th></tr><tr><th> 1</th><td> 1 </td><td> 2 </td><td> 3</td></tr><tr><th> 2</th><td> 2 </td><td> 4 </td><td> 6</td></tr><tr><th> 3</th><td> 3 </td><td> 6 </td><td> 9</td></tr><tr><th> 4</th><td> 4 </td><td> 8 </td><td> 12</td></tr><tr><th> 5</th><td> 5 </td><td> 10 </td><td> 15</td></tr></tbody></table>";
@ -37,7 +37,7 @@ testWhiteList["Nested table"] = "<table border=\"1\"><tbody><tr><td> α</td><td>
// Very minor whitespace difference at end of cell (MediaWiki inserts a
// newline before the close tag even if there was no trailing space in the cell)
testWhiteList["Table rowspan"] = "<table border=\"1\" data-mw-sourcePos=\"0:121\"><tbody><tr><td> Cell 1, row 1 </td><td rowspan=\"2\"> Cell 2, row 1 (and 2) </td><td> Cell 3, row 1 </td></tr><tr><td> Cell 1, row 2 </td><td> Cell 3, row 2 </td></tr></tbody></table>";
testWhiteList["Table rowspan"] = "<table border=\"1\"><tbody><tr><td> Cell 1, row 1 </td><td rowspan=\"2\"> Cell 2, row 1 (and 2) </td><td> Cell 3, row 1 </td></tr><tr><td> Cell 1, row 2 </td><td> Cell 3, row 2 </td></tr></tbody></table>";
// Inter-element whitespace only
testWhiteList["Indented table markup mixed with indented pre content (proposed in bug 6200)"] = " \n\n<table><tbody><tr><td><pre>\nText that should be rendered preformatted\n</pre></td></tr></tbody></table>";
@ -48,7 +48,7 @@ testWhiteList["Indented table markup mixed with indented pre content (proposed i
// Single quotes are legal in HTML5 URIs. See
// http://www.whatwg.org/specs/web-apps/current-work/multipage/urls.html#url-manipulation-and-creation
testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e paise d''o munno</a></p>";
testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a href=\"/wiki/Lista_d''e_paise_d''o_munno\">Lista d''e paise d''o munno</a></p>";
// Sanitizer
@ -56,15 +56,15 @@ testWhiteList["Invalid attributes in table cell (bug 1830)"] = "<table><tbody><t
testWhiteList["Table security: embedded pipes (http://lists.wikimedia.org/mailman/htdig/wikitech-l/2006-April/022293.html)"] = "<table><tbody><tr><td> |<a href=\"ftp://|x||\">[1]</a>\" onmouseover=\"alert(document.cookie)\"&gt;test</td></tr></tbody></table>";
// Sanitizer, but UTF8 in link is ok in HTML5
testWhiteList["External link containing double-single-quotes with no space separating the url from text in italics"] = "<p><a href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\" data-mw-type=\"external\" data-mw-rt=\"{&quot;sourcePos&quot;:[0,146]}\"><i>La muerte de Casagemas</i> (1901) en el sitio de </a><a href=\"/wiki/Museo_Picasso_(París)\" data-mw-type=\"internal\">Museo Picasso</a>.</p>";
testWhiteList["External link containing double-single-quotes with no space separating the url from text in italics"] = "<p><a href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\" data-mw=\"{&quot;sourcePos&quot;:[0,146]}\"><i>La muerte de Casagemas</i> (1901) en el sitio de </a><a href=\"/wiki/Museo_Picasso_(París)\">Museo Picasso</a>.</p>";
testWhiteList["External links: wiki links within external link (Bug 3695)"] = "<p><a href=\"http://example.com\" data-mw-type=\"external\" data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\" href=\"/wiki/Wikilink\">wikilink</a> embedded in ext link</p>";
testWhiteList["External links: wiki links within external link (Bug 3695)"] = "<p><a href=\"http://example.com\"></a><a href=\"/wiki/Wikilink\">wikilink</a> embedded in ext link</p>";
testWhiteList["<pre> with forbidden attribute values (bug 3202)"] = "<pre width=\"8\" style=\"\">Narrow screen goodies</pre>";
// This is valid, just confusing for humans. The reason for disallowing this
// might be history by now. XXX: Check this!
testWhiteList["Link containing % as a double hex sequence interpreted to hex sequence"] = "<p><a href=\"/wiki/7%2525_Solution\" data-mw-type=\"internal\">7%25 Solution</a></p>";
testWhiteList["Link containing % as a double hex sequence interpreted to hex sequence"] = "<p><a href=\"/wiki/7%2525_Solution\">7%25 Solution</a></p>";
if (typeof module == "object") {
module.exports.testWhiteList = testWhiteList;

View file

@ -333,7 +333,7 @@ ParserTests.prototype.normalizeHTML = function (source) {
// known-ok differences.
ParserTests.prototype.normalizeOut = function ( out ) {
// TODO: Do not strip newlines in pre and nowiki blocks!
return out.replace(/[\r\n]| data-mw-[a-zA-Z-]+="[^">]*"/g, '')
return out.replace(/[\r\n]| data-mw="[^">]*"/g, '')
.replace(/<!--.*?-->\n?/gm, '')
.replace(/<\/?meta[^>]*>/g, '');
};