mediawiki-extensions-Visual.../modules/parser/ext.core.Sanitizer.js
Gabriel Wicke 3172afb750 Rename data-mw into data-rt
This hopefully makes it clearer that data-rt contains private round-trip info
instead of semantically interesting data.

Change-Id: I03b476ed112a4b627c9871ee3677c450f943429a
2012-07-16 12:10:08 -07:00

139 lines
4.1 KiB
JavaScript

/*
* General token sanitizer. Strips out (or encapsulates) unsafe and disallowed
* tag types and attributes. Should run last in the third, synchronous
* expansion stage. Tokens from extensions which should not be sanitized
* can bypass sanitation by setting their rank to 3.
*
* @author Gabriel Wicke <gwicke@wikimedia.org>
*/
// Include general utilities
var Util = require('./ext.Util.js').Util,
u = new Util();
function Sanitizer ( manager ) {
this.manager = manager;
this.register( manager );
}
// constants
Sanitizer.prototype.handledRank = 2.99;
Sanitizer.prototype.anyRank = 2.9901;
// Register this transformer with the TokenTransformer
Sanitizer.prototype.register = function ( manager ) {
this.manager = manager;
manager.addTransform( this.onAnchor.bind(this), this.handledRank, 'tag', 'a' );
manager.addTransform( this.onAny.bind(this), this.anyRank, 'any' );
};
Sanitizer.prototype.onAnchor = function ( token ) {
// perform something similar to Sanitizer::cleanUrl
if ( token.constructor === EndTagTk ) {
return { token: token };
}
var hrefKV = this.manager.env.lookupKV( token.attribs, 'href' );
// FIXME: Should the flattening of attributes to text be performed as part
// of the AttributeTransformManager processing? This certainly is not the
// right place!
if ( hrefKV !== null ) {
hrefKV.v = this.manager.env.tokensToString( hrefKV.v );
var bits = hrefKV.v.match( /(.*?\/\/)([^\/]+)(\/?.*)/ );
if ( bits ) {
proto = bits[1];
host = bits[2];
path = bits[3];
} else {
proto = '';
host = '';
path = hrefKV.v;
}
host = this._stripIDNs( host );
hrefKV.v = proto + host + path;
}
return { token: token };
};
// XXX: We actually need to strip IDN ignored characters in the link text as
// well, so that readers are not mislead. This should perhaps happen at an
// earlier stage, while converting links to html.
Sanitizer.prototype._IDNRegexp = new RegExp(
"[\t ]|" + // general whitespace
"\u00ad|" + // 00ad SOFT HYPHEN
"\u1806|" + // 1806 MONGOLIAN TODO SOFT HYPHEN
"\u200b|" + // 200b ZERO WIDTH SPACE
"\u2060|" + // 2060 WORD JOINER
"\ufeff|" + // feff ZERO WIDTH NO-BREAK SPACE
"\u034f|" + // 034f COMBINING GRAPHEME JOINER
"\u180b|" + // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
"\u180c|" + // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
"\u180d|" + // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
"\u200c|" + // 200c ZERO WIDTH NON-JOINER
"\u200d|" + // 200d ZERO WIDTH JOINER
"[\ufe00-\ufe0f]", // fe00-fe0f VARIATION SELECTOR-1-16
'g'
);
Sanitizer.prototype._stripIDNs = function ( host ) {
return host.replace( this._IDNRegexp, '' );
};
/**
* Sanitize any tag.
*
* XXX: Make attribute sanitation reversible by storing round-trip info in
* token.dataAttribs object (which is serialized as JSON in a data-rt
* attribute in the DOM).
*/
Sanitizer.prototype.onAny = function ( token ) {
// XXX: validate token type according to whitelist and convert non-ok ones
// back to text.
// Convert attributes to string, if necessary.
// XXX: Likely better done in AttributeTransformManager when processing is
// complete
if ( token.attribs && token.attribs.length ) {
var attribs = token.attribs.slice();
var newToken = $.extend( {}, token );
for ( var i = 0, l = attribs.length; i < l; i++ ) {
var kv = attribs[i],
k = kv.k,
v = kv.v;
if ( k.constructor === Array ) {
k = this.manager.env.tokensToString ( k );
}
if ( v.constructor === Array ) {
v = this.manager.env.tokensToString ( v );
}
if ( k === 'style' ) {
v = this.checkCss(v);
}
attribs[i] = new KV( k, v );
}
//console.warn( 'sanitizer: ' + JSON.stringify([attribs, token], null, 2));
newToken.attribs = attribs;
token = newToken;
}
// XXX: Validate attributes
return { token: token };
};
Sanitizer.prototype.checkCss = function ( value ) {
if (/[\000-\010\016-\037\177]/.test(value)) {
return '/* invalid control char */';
}
if (/expression|filter\s*:|accelerator\s*:|url\s*\(/i.test(value)) {
return '/* insecure input */';
}
return value;
};
if (typeof module == "object") {
module.exports.Sanitizer = Sanitizer;
}