mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-26 23:31:02 +00:00
d918fa18ac
* Tokens are now immutable. The progress of transformations is tracked on chunks instead of tokens. Tokenizer output is cached and can be directly returned without a need for cloning. Transforms are required to clone or newly create tokens they are modifying. * Expansions per chunk are now shared between equivalent frames via a cache stored on the chunk itself. Equivalence of frames is not yet ideal though, as right now a hash tree of *unexpanded* arguments is used. This should be switched to a hash of the fully expanded local parameters instead. * There is now a vastly improved maybeSyncReturn wrapper for async transforms that either forwards processing to the iterative transformTokens if the current transform is still ongoing, or manages a recursive transformation if needed. * Parameters for parser functions are now wrapped in abstract Params and ParserValue objects, which support some handy on-demand *value* expansions. Keys are always expanded. Parser functions are converted to use these interfaces, and now properly expand their values in the correct frame. Making this expansion lazier is certainly possible, but would complicate transformTokens and other token-handling machinery. Need to investigate if it would really be worth it. Dead branch elimination is certainly a bigger win overall. * Complex recursive asynchronous expansions should now be closer to correct for both the iterative (transformTokens) and recursive (maybeSyncReturn after transformTokens has returned) code paths. * Performance degraded slightly. There are no micro-optimizations done yet and the shared expansion cache still has a low hit rate. The progress tracking on chunks is not yet perfect, so there are likely a lot of unneeded re-expansions that can be easily eliminated. There is also more debug tracing right now. Obama currently expands in 54 seconds on my laptop. Change-Id: I4a603f3d3c70ca657ebda9fbb8570269f943d6b6
139 lines
4.1 KiB
JavaScript
139 lines
4.1 KiB
JavaScript
/*
|
|
* General token sanitizer. Strips out (or encapsulates) unsafe and disallowed
|
|
* tag types and attributes. Should run last in the third, synchronous
|
|
* expansion stage. Tokens from extensions which should not be sanitized
|
|
* can bypass sanitation by setting their rank to 3.
|
|
*
|
|
* @author Gabriel Wicke <gwicke@wikimedia.org>
|
|
*/
|
|
|
|
// Include general utilities
|
|
var Util = require('./ext.Util.js').Util,
|
|
u = new Util();
|
|
|
|
|
|
function Sanitizer ( manager ) {
|
|
this.manager = manager;
|
|
this.register( manager );
|
|
}
|
|
|
|
// constants
|
|
Sanitizer.prototype.handledRank = 2.99;
|
|
Sanitizer.prototype.anyRank = 2.9901;
|
|
|
|
|
|
// Register this transformer with the TokenTransformer
|
|
Sanitizer.prototype.register = function ( manager ) {
|
|
this.manager = manager;
|
|
manager.addTransform( this.onAnchor.bind(this), this.handledRank, 'tag', 'a' );
|
|
manager.addTransform( this.onAny.bind(this), this.anyRank, 'any' );
|
|
};
|
|
|
|
Sanitizer.prototype.onAnchor = function ( token ) {
|
|
// perform something similar to Sanitizer::cleanUrl
|
|
if ( token.constructor === EndTagTk ) {
|
|
return { token: token };
|
|
}
|
|
var hrefKV = this.manager.env.lookupKV( token.attribs, 'href' );
|
|
// FIXME: Should the flattening of attributes to text be performed as part
|
|
// of the AttributeTransformManager processing? This certainly is not the
|
|
// right place!
|
|
if ( hrefKV !== null ) {
|
|
hrefKV.v = this.manager.env.tokensToString( hrefKV.v );
|
|
var bits = hrefKV.v.match( /(.*?\/\/)([^\/]+)(\/?.*)/ );
|
|
if ( bits ) {
|
|
proto = bits[1];
|
|
host = bits[2];
|
|
path = bits[3];
|
|
} else {
|
|
proto = '';
|
|
host = '';
|
|
path = hrefKV.v;
|
|
}
|
|
host = this._stripIDNs( host );
|
|
hrefKV.v = proto + host + path;
|
|
}
|
|
return { token: token };
|
|
};
|
|
|
|
// XXX: We actually need to strip IDN ignored characters in the link text as
|
|
// well, so that readers are not mislead. This should perhaps happen at an
|
|
// earlier stage, while converting links to html.
|
|
Sanitizer.prototype._IDNRegexp = new RegExp(
|
|
"[\t ]|" + // general whitespace
|
|
"\u00ad|" + // 00ad SOFT HYPHEN
|
|
"\u1806|" + // 1806 MONGOLIAN TODO SOFT HYPHEN
|
|
"\u200b|" + // 200b ZERO WIDTH SPACE
|
|
"\u2060|" + // 2060 WORD JOINER
|
|
"\ufeff|" + // feff ZERO WIDTH NO-BREAK SPACE
|
|
"\u034f|" + // 034f COMBINING GRAPHEME JOINER
|
|
"\u180b|" + // 180b MONGOLIAN FREE VARIATION SELECTOR ONE
|
|
"\u180c|" + // 180c MONGOLIAN FREE VARIATION SELECTOR TWO
|
|
"\u180d|" + // 180d MONGOLIAN FREE VARIATION SELECTOR THREE
|
|
"\u200c|" + // 200c ZERO WIDTH NON-JOINER
|
|
"\u200d|" + // 200d ZERO WIDTH JOINER
|
|
"[\ufe00-\ufe0f]", // fe00-fe0f VARIATION SELECTOR-1-16
|
|
'g'
|
|
);
|
|
|
|
Sanitizer.prototype._stripIDNs = function ( host ) {
|
|
return host.replace( this._IDNRegexp, '' );
|
|
};
|
|
|
|
|
|
|
|
/**
|
|
* Sanitize any tag.
|
|
*
|
|
* XXX: Make attribute sanitation reversible by storing round-trip info in
|
|
* token.dataAttribs object (which is serialized as JSON in a data-mw
|
|
* attribute in the DOM).
|
|
*/
|
|
Sanitizer.prototype.onAny = function ( token ) {
|
|
// XXX: validate token type according to whitelist and convert non-ok ones
|
|
// back to text.
|
|
|
|
// Convert attributes to string, if necessary.
|
|
// XXX: Likely better done in AttributeTransformManager when processing is
|
|
// complete
|
|
if ( token.attribs && token.attribs.length ) {
|
|
var attribs = token.attribs.slice();
|
|
var newToken = $.extend( {}, token );
|
|
for ( var i = 0, l = attribs.length; i < l; i++ ) {
|
|
var kv = attribs[i],
|
|
k = kv.k,
|
|
v = kv.v;
|
|
|
|
if ( k.constructor === Array ) {
|
|
k = this.manager.env.tokensToString ( k );
|
|
}
|
|
if ( v.constructor === Array ) {
|
|
v = this.manager.env.tokensToString ( v );
|
|
}
|
|
if ( k === 'style' ) {
|
|
v = this.checkCss(v);
|
|
}
|
|
attribs[i] = new KV( k, v );
|
|
}
|
|
//console.warn( 'sanitizer: ' + JSON.stringify([attribs, token], null, 2));
|
|
newToken.attribs = attribs;
|
|
token = newToken;
|
|
}
|
|
// XXX: Validate attributes
|
|
return { token: token };
|
|
};
|
|
|
|
Sanitizer.prototype.checkCss = function ( value ) {
|
|
if (/[\000-\010\016-\037\177]/.test(value)) {
|
|
return '/* invalid control char */';
|
|
}
|
|
if (/expression|filter\s*:|accelerator\s*:|url\s*\(/i.test(value)) {
|
|
return '/* insecure input */';
|
|
}
|
|
return value;
|
|
};
|
|
|
|
if (typeof module == "object") {
|
|
module.exports.Sanitizer = Sanitizer;
|
|
}
|