// Source view: mediawiki-extensions-Visual.../modules/parser/mediawiki.DOMPostProcessor.js
// (469 lines, 13 KiB, JavaScript; page controls: Raw / Normal View / History)

/* Perform post-processing steps on an already-built HTML DOM. */
// Blame annotation: 2012-01-17 18:30:22 +00:00
var events = require('events'),
Util = require('./mediawiki.Util.js').Util;
// Quick HACK: the DOM `Node` interface constants are not available as a
// global here, so define the node types this file checks for.
// See https://developer.mozilla.org/en/nodeType
var Node = {
    ELEMENT_NODE: 1, // an element such as <p> or <b>
    TEXT_NODE: 3,    // character data inside an element
    COMMENT_NODE: 8  // <!-- ... -->
};
/**
 * Tell whether a character-data node holds nothing but whitespace.
 *
 * @param {Object} e Node whose `data` string is inspected.
 * @returns {boolean} true when `data` is empty or consists only of spaces,
 *   carriage returns, newlines and tabs.
 */
var isElementContentWhitespace = function ( e ) {
    var whitespaceOnly = e.data.match(/^[ \r\n\t]*$/);
    return whitespaceOnly !== null;
};
/**
 * Minimize the number of inline formatting elements below `root` by merging
 * tags that adjacent children have in common.
 *
 * For every node with more than one child, the "linear path" of each element
 * child is collected (the chain of elements in which every element has exactly
 * one child). Runs of adjacent children whose paths share rewriteable tags are
 * re-rooted under a single shared chain of those tags; the remaining path
 * elements and the (already rewritten) subtrees are re-attached below it.
 *
 * Example: <i><b><u>biu</u></b></i><b><u>bu</u></b><u>u</u>
 *     ==>  <u><b><i>biu</i>bu</b>u</u>
 *
 * The DOM is modified in place. Errors thrown while rewriting are caught and
 * logged to the console instead of being propagated to the caller.
 *
 * @param {Node} root Root of the subtree to rewrite.
 * @param {string[]} rewriteable_nodes Tag names (any case) that may be merged
 *   and reordered.
 */
function minimize_inline_tags(root, rewriteable_nodes) {
    // Lower-cased tag name -> true for every tag listed in rewriteable_nodes.
    var rewriteable_node_map = null;

    // Last element of an array.
    function tail(a) {
        return a[a.length - 1];
    }

    function remove_all_children(node) {
        while (node.hasChildNodes()) {
            node.removeChild(node.firstChild);
        }
    }

    function add_children(node, children) {
        for (var i = 0, n = children.length; i < n; i++) {
            node.appendChild(children[i]);
        }
    }

    function init() {
        // Prototype-less map so that tag names can never hit inherited keys
        // such as 'constructor'.
        rewriteable_node_map = Object.create(null);
        for (var i = 0, n = rewriteable_nodes.length; i < n; i++) {
            rewriteable_node_map[rewriteable_nodes[i].toLowerCase()] = true;
        }
    }

    function is_rewriteable_node(node_name) {
        return rewriteable_node_map[node_name];
    }

    // Main routine
    function rewrite(node) {
        var children = node.childNodes;
        var n = children.length;

        // If we have a single node, no restructuring is possible at this level
        // Descend ...
        if (n === 1) {
            var sole_node = children[0];
            if (sole_node.nodeType === Node.ELEMENT_NODE) rewrite(sole_node);
            return;
        }

        // * Collect longest linear paths for all children
        // * Process subtrees attached to the end of those paths
        // * Restructure the list of linear paths (and reattach processed subtrees at the tips).
        var P = [];
        for (var i = 0; i < n; i++) {
            var s = children[i];
            if (s.nodeType === Node.ELEMENT_NODE) {
                // The path of an element always contains at least the element itself.
                var p = longest_linear_path(s);
                var p_tail = tail(p);

                // process subtree (depth-first)
                rewrite(p_tail);

                // collect the restructured p_tail subtree (children); copy it
                // because the child list changes as nodes are moved around
                var child_nodes = p_tail.childNodes;
                var new_children = [];
                for (var j = 0, n2 = child_nodes.length; j < n2; j++) {
                    new_children.push(child_nodes[j]);
                }

                P.push({path: p, orig_parent: p_tail, children: new_children});
            } else {
                // Non-element nodes have an empty path and are carried along as-is.
                P.push({path: [], orig_parent: node, children: [s]});
            }
        }

        // Rewrite paths in 'P'
        if (P.length > 0) rewrite_paths(node, P);
    }

    // Follow single-child element links downwards starting at 'node' and
    // return the chain of elements visited.
    function longest_linear_path(node) {
        var children, path = [];
        while (node.nodeType === Node.ELEMENT_NODE) {
            path.push(node);
            children = node.childNodes;
            if ((children.length === 0) || (children.length > 1)) return path;
            node = children[0];
        }
        return path;
    }

    function rewrite_paths(parent_node, P) {
        // 1. Split P into maximal sublists where each sublist has a non-null path intersection.
        // 2. Process each sublist separately and accumulate the result.
        //
        // lcs = longest common sublist
        remove_all_children(parent_node);

        var sublists = split_into_disjoint_sublists(P);
        for (var i = 0, num_sublists = sublists.length; i < num_sublists; i++) {
            var s = sublists[i];
            var lcs = s.lcs;

            if (lcs.length > 0) {
                // Connect up LCS
                var prev = lcs[0];
                for (var k = 1, lcs_len = lcs.length; k < lcs_len; k++) {
                    var curr = lcs[k];
                    // SSS FIXME: this add/remove can be optimized
                    remove_all_children(prev);
                    prev.appendChild(curr);
                    prev = curr;
                }

                // Lastly, attach lcs to the incoming parent
                parent_node.appendChild(lcs[0]);
            }

            var paths = s.paths;
            var num_paths = paths.length;
            if (num_paths === 1) {
                // Nothing more to do!  Stitch things up
                // two possible scenarios:
                // (a) we have an empty path    ==> attach the children to parent_node
                // (b) we have a non-empty path ==> attach the children to the end of the path
                var p = paths[0].path;
                var children = paths[0].children;
                if (p.length > 0) {
                    var p_tail = tail(p);
                    remove_all_children(p_tail);
                    add_children(p_tail, children);
                } else {
                    add_children(parent_node, children);
                }
            } else {
                // Process the sublist
                rewrite_paths(tail(lcs), strip_lcs(paths, lcs));
            }
        }
    }

    // Return the elements of 'old' whose (rewriteable) tag also occurs in 'new_path'.
    function common_path(old, new_path) {
        var i, n;
        var hash = Object.create(null);
        for (i = 0, n = new_path.length; i < n; i++) {
            var e = new_path[i].nodeName.toLowerCase();
            if (is_rewriteable_node(e)) hash[e] = new_path[i];
        }

        var cp = [];
        for (i = 0, n = old.length; i < n; i++) {
            var hit = hash[old[i].nodeName.toLowerCase()];
            // Add old path element always.  This effectively picks elements from the leftmost path.
            if (hit) cp.push(old[i]);
        }

        return cp;
    }

    // For each 'p' in 'paths', eliminate 'lcs' from 'p'
    function strip_lcs(paths, lcs) {
        // SSS FIXME: Implicit assumption: there are no duplicate elements in lcs or path!
        // Ex: <b><i><b>BIB</b></i></b> will
        // Fix this to be more robust

        // Match by tag name, exactly like common_path() does. Using the node
        // object itself as a key would depend on how the DOM implementation
        // stringifies nodes.
        var i, n, lcs_map = Object.create(null);
        for (i = 0, n = lcs.length; i < n; i++) {
            lcs_map[lcs[i].nodeName.toLowerCase()] = true;
        }

        for (i = 0, n = paths.length; i < n; i++) {
            var p = paths[i].path;
            for (var j = 0, l = p.length; j < l; j++) {
                // remove matching element
                if (lcs_map[p[j].nodeName.toLowerCase()]) {
                    p.splice(j, 1);
                    l--;
                    j--;
                }
            }
        }

        return paths;
    }

    // Split 'P' into sublists where each sublist has the property that
    // the elements of the sublist have an intersection that is non-zero
    // Ex: [BIUS, SB, BUS, IU, I, U, US, B, I] will get split into 5 sublists
    // - (lcs: BS, paths: [BIUS, SB, BUS])
    // - (lcs: I,  paths: [IU, I])
    // - (lcs: U,  paths: [U, US])
    // - (lcs: B,  paths: [B])
    // - (lcs: I,  paths: [I])
    //
    // Note: 'P' is consumed (emptied) in the process.
    function split_into_disjoint_sublists(P) {
        var p = P.shift();
        var lcs = p.path;
        var curr = [p];

        for (var i = 0, n = P.length; i < n; i++) {
            p = P.shift();
            var new_lcs = common_path(lcs, p.path);
            if (new_lcs.length === 0) {
                // 'p' starts the next sublist
                P.unshift(p);
                return [{lcs: lcs, paths: curr}].concat(split_into_disjoint_sublists(P));
            }
            lcs = new_lcs;
            curr.push(p);
        }

        return [{lcs: lcs, paths: curr}];
    }

    // Init
    init();

    // Kick it off
    try {
        rewrite(root);
    } catch (e) {
        // Best effort: report the problem but do not abort post-processing.
        console.log("------- error errrror errrrrror! ----------");
        console.log(e.stack);
    }
}
/*
 * Blame annotation (commit of 2012-05-25 01:10:47 +00:00,
 * Change-Id: Ie97bcb164eb62c34ba61aa76ba2f4c232aa713d8):
 *
 * First attempt implementing rewriting rules on the DOM
 *
 * - This is implemented as a post-processing pass.
 * - Might require additional checks to verify rewriteability.
 * - Implemented as a pair-wise tag DOM minimization strategy, i.e. it takes
 *   tag pairs (B, I) for ex, and attempts to normalize the tree just for
 *   those tag pairs. Normalizing across multiple tags is implemented as
 *   pairwise rewriting across all pairs: Ex: (b,i), (b,u), (i,u) for (b,i,u)
 * - Copied over attributes as part of rewriting, but some of the attributes
 *   lose their meaning on rewriting since tags are reordered
 *   (ex: sourcePosn, sourceTagPosn). How do we handle this?
 *
 * Output examples and possible issues to fix:
 *
 *   <i><b><u>biu</u></b></i><b><u>bu</u></b><u>u</u>
 * gets rewritten to:
 *   <u><b><i>biu</i>bu</b>u</u>
 *
 * But, the equivalent wikitext form:
 *   '''''<u>biu</u>''''''''<u>bu</u>'''<u>u</u>
 * does not get rewritten because of parsing differences. This wikitext gets
 * parsed into:
 *   <i><b><u>biu</u>'''</b></i><u>bu<b>u</b></u>
 * The extra ''' token in the middle thwarts DOM rewriting.
 *
 * However, a slightly different version:
 *   "'''''<u>biu</u>''<u>bu</u>'''<u>u</u>"
 * gets properly normalized to:
 *   <u>'''''biu''bu'''u</u>
 *
 * An alternative, but fun strategy to play with is to use the following two
 * normalization primitives: S(wap) and M(erge).
 * - S rewrites T1(T2(x)) into T2(T1(x))
 *   (ex: <b><i>foo</i></b> ==> <i><b>foo</b></i>)
 * - M rewrites (T(x),T(y)) into (T(x,y)).
 *   (ex: <b>foo</b><b>bar</b> ==> <b>foobar</b>)
 * The current rewriting strategy could possibly be re-implemented as S-M
 * rewriting. The problem to solve there would be to find an efficient
 * rewriting strategy that is guaranteed to lead to a normal form. I may not
 * play with it now, but just documenting it for later (to play with in my
 * spare time).
 *
 * This commit is just as a record of fun/experimental code where I get to
 * learn details of JS, wikitext, parsing, and DOM manipulation.
 *
 * Next version of this code will attempt to introduce minimal DOM
 * restructuring across multiple tags at once which can be more efficient.
 *
 * gwicke: Removed now passing test from whitelist, and updated another
 * whitelist entry which is now improved.
 */
// [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
// [..., T2[*x], T1[T2[*y]], ...] ==> [..., T2[*x, T1[*y]], ...]
// where T1 and T2 are different and can be one of [i, b]
//
// Strictly speaking, we can rewrite even when T1 has more than one child as long as all of them are T2's
// but for now, we are going to restrict ourselves to a single child.

// Tag name -> list of tag names it may be swapped with. Built lazily from the
// 'tag_pairs' argument and cached across calls.
var tag_pair_map = null;
// The 'tag_pairs' array the cached map was built from, so that a call with a
// different array rebuilds the map instead of silently reusing a stale one.
var tag_pair_map_source = null;

/**
 * Apply the nested-tag-pair rewrite rules above to the direct children of
 * 'node'. Rewritten elements are replaced by newly created ones that receive
 * the attributes of the outer elements they stand in for; the DOM is modified
 * in place.
 *
 * @param {Document} document Used to create the replacement elements.
 * @param {Array} tag_pairs List of two-element arrays of lower-case tag names,
 *   e.g. [['b','i'], ['b','u']]. Each pair is rewriteable in both orders.
 * @param {Node} node Parent whose adjacent children are inspected.
 */
var rewrite_nested_tag_pairs = function(document, tag_pairs, node) {
    function add_pair(from, to) {
        var partners = tag_pair_map[from];
        if (partners === undefined) {
            partners = [];
            tag_pair_map[from] = partners;
        }
        partners.push(to);
    }

    function init_tag_pairs(pairs) {
        tag_pair_map = Object.create(null);
        tag_pair_map_source = pairs;
        for (var i = 0, n = pairs.length; i < n; i++) {
            // Ex: t1 = 'b', t2 = 'i'
            var t1 = pairs[i][0];
            var t2 = pairs[i][1];
            add_pair(t1, t2); // Update map with {t1: t2}
            add_pair(t2, t1); // Update map with {t2: t1}
        }
    }

    function migrate_all_children(src, tgt) {
        // Appending a child to 'tgt' also pulls it out of 'src', so keep
        // taking the first remaining child until none are left.
        while (src.firstChild) {
            tgt.appendChild(src.firstChild);
        }
    }

    function copy_attributes(src, tgt) {
        var attrs = src.attributes;
        for (var i = 0, n = attrs.length; i < n; i++) {
            var a = attrs.item(i);
            tgt.setAttribute(a.nodeName, a.nodeValue);
        }
    }

    function rewriteable_tag_pair(t1, t2) {
        // SSS FIXME: I dont think this check is sufficient.  We also need to
        // verify that nodes having these tags t1 and t2 are deletable as well.
        var partners = tag_pair_map[t1];
        return partners !== undefined && partners.indexOf(t2) !== -1;
    }

    // Initialization (also when a different set of pairs is passed in)
    if (tag_pair_map === null || tag_pair_map_source !== tag_pairs) {
        init_tag_pairs(tag_pairs);
    }

    var children = node.childNodes;
    var num_children = children.length;

    // no re-writing possible
    if (num_children < 2) return;

    // look for the pattern
    for (var i = 0, n = num_children - 1; i < n; i++) {
        var child = children[i];
        var sibling = children[i + 1];
        var t1 = child.nodeName.toLowerCase();
        var t2 = sibling.nodeName.toLowerCase();

        if (rewriteable_tag_pair(t1, t2)) {
            var newChild = null, newGrandChild = null;
            if (child.childNodes.length === 1 && child.childNodes[0].nodeName.toLowerCase() === t2) {
                // rewritable: t1[t2[*x]], t2[*y] ==> t2[t1[*x], *y]
                newChild = document.createElement(t2);
                newGrandChild = document.createElement(t1);
                newChild.appendChild(newGrandChild);
                migrate_all_children(sibling, newChild);
                migrate_all_children(child.childNodes[0], newGrandChild);
                copy_attributes(child, newGrandChild);
                copy_attributes(sibling, newChild);
            } else if (sibling.childNodes.length === 1 && sibling.childNodes[0].nodeName.toLowerCase() === t1) {
                // rewritable: t1[*x], t2[t1[*y]] ==> t1[*x, t2[*y]]
                newChild = document.createElement(t1);
                newGrandChild = document.createElement(t2);
                migrate_all_children(child, newChild);
                migrate_all_children(sibling.childNodes[0], newGrandChild);
                newChild.appendChild(newGrandChild);
                copy_attributes(child, newChild);
                copy_attributes(sibling, newGrandChild);
            }

            // modify DOM here
            if (newChild !== null) {
                node.insertBefore(newChild, child);
                node.removeChild(child);
                node.removeChild(sibling);

                // modify indexes: two children were replaced by one
                n--;

                // Reprocess newChild
                rewrite_nested_tag_pairs(document, tag_pairs, newChild);

                // Revisit the new node and see if it will merge with its sibling.
                // Hence the decrement.
                i--;
            }
        }
    }
};
// Rewrite rules currently implemented:
// 1. [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
//
// Other possible rewrite rules:
// 2. [..., T1[T2[*x]], ...] ==> [..., T2[T1[*x], ...]
// 3. [..., T[*x], T[*y], ...] ==> [..., T[*x,*y], ...]
//
// Note: Rewriting 2. and 3. in that sequence will effectively yield 1.
// but implementing 1. directly is faster
//
// Not sure we want to implement all kinds of rewriting rules yet.
// We have to figure out what rules are worth implementing.

/**
 * Run every rewrite rule on each node of a subtree, bottom-up: all children
 * of a node are normalized before the rules are applied to the node itself.
 *
 * @param {Node} node Root of the subtree.
 * @param {Function[]} rewrite_rules Rules, each invoked with the node to
 *   rewrite. With an empty list the function returns without touching 'node'.
 */
var normalize_subtree = function(node, rewrite_rules) {
    if (rewrite_rules.length === 0) {
        return;
    }

    // Children first ...
    var kids = node.childNodes;
    var kid_count = kids.length;
    var kid_index = 0;
    while (kid_index < kid_count) {
        normalize_subtree(kids[kid_index], rewrite_rules);
        kid_index += 1;
    }

    // ... then the node itself, one rule after the other.
    var rule_count = rewrite_rules.length;
    var rule_index = 0;
    while (rule_index < rule_count) {
        rewrite_rules[rule_index](node);
        rule_index += 1;
    }
};
/**
 * Normalize the inline formatting of a whole document by minimizing the
 * nesting of <b>, <u>, <i> and <s> elements below its body.
 *
 * @param {Document} document Document whose body is rewritten in place.
 */
var normalize_document = function(document) {
    // Alternative pair-wise strategy, currently disabled:
    // normalize_subtree(document.body, [rewrite_nested_tag_pairs.bind(null, document, [['b','i'], ['b','u'], ['i','u']])]);
    minimize_inline_tags(document.body, ['b','u','i','s']);
};
// Wrap all top-level inline elements in paragraphs. This should also be
// applied inside block-level elements, but in that case the first paragraph
// usually remains plain inline.
//
// Consecutive inline children of <body> are moved into one new <p>:
//   - a text node starts a paragraph unless it is whitespace-only,
//   - whitespace-only text and comments only join a paragraph that is
//     already open,
//   - any element that is not a block tag (per Util.isBlockTag) is inline.
// A block-level child closes the open paragraph, which is inserted before it.
var process_inlines_in_p = function ( document ) {
    var body = document.body,
        newP = document.createElement('p'),
        cnodes = body.childNodes,
        inParagraph = false,
        // Net number of nodes removed from 'cnodes' so far. The child list
        // changes while we iterate, so this keeps the index in step.
        deleted = 0;

    for (var i = 0, length = cnodes.length; i < length; i++) {
        var child = cnodes[i - deleted],
            ctype = child.nodeType;

        if ((ctype === Node.TEXT_NODE && (inParagraph || !isElementContentWhitespace( child ))) ||
            (ctype === Node.COMMENT_NODE && inParagraph ) ||
            (ctype !== Node.TEXT_NODE &&
                ctype !== Node.COMMENT_NODE &&
                !Util.isBlockTag(child.nodeName.toLowerCase()))
        )
        {
            // wrap in paragraph (this removes 'child' from the body)
            newP.appendChild(child);
            inParagraph = true;
            deleted++;
        } else if (inParagraph) {
            // close the paragraph in front of the non-inline node
            body.insertBefore(newP, child);
            deleted--;
            newP = document.createElement('p');
            inParagraph = false;
        }
    }

    // Flush a paragraph that is still open at the end of the body
    if (inParagraph) {
        body.appendChild(newP);
    }
};
/**
 * Event emitter that runs a list of post-processing passes over a built DOM
 * and then re-emits the document.
 *
 * @constructor
 */
function DOMPostProcessor () {
    // Initialize the emitter state on the instance itself; otherwise all
    // instances would fall back to the listener table of the shared prototype.
    events.EventEmitter.call( this );

    // Passes applied, in order, by doPostProcess()
    this.processors = [process_inlines_in_p, normalize_document];
}

// Inherit from EventEmitter
DOMPostProcessor.prototype = Object.create( events.EventEmitter.prototype );
DOMPostProcessor.prototype.constructor = DOMPostProcessor;

/**
 * Run every registered processor on the document, then emit it as a
 * 'document' event.
 *
 * @param {Document} document DOM to post-process; it is passed to each
 *   processor and then to the 'document' listeners.
 */
DOMPostProcessor.prototype.doPostProcess = function ( document ) {
    for (var i = 0; i < this.processors.length; i++) {
        this.processors[i](document);
    }
    this.emit( 'document', document );
};

/**
 * Register for the 'document' event, normally emitted from the HTML5 tree
 * builder.
 *
 * @param {EventEmitter} emitter Source of 'document' events.
 */
DOMPostProcessor.prototype.addListenersOn = function ( emitter ) {
    emitter.addListener( 'document', this.doPostProcess.bind( this ) );
};

if (typeof module === "object") {
    module.exports.DOMPostProcessor = DOMPostProcessor;
}