mediawiki-extensions-Visual.../modules/parser/mediawiki.DOMPostProcessor.js
Subramanya Sastry 1c80e2d7f0 First pass implementing a general tag minimization routine
* This routine attempts to rewrite the DOM to maximize tag overlap
  and thus minimize tag uses.

* This takes as input a set of tags which participate in the
  minimization.

* Tested on the following example
  <b><i><u><s>BIUS</s></u></i></b><b><i><s>BIS</s></i></b><b><u><s>BUS</s></u></b><u><i>UI</i></u>
  with multiple combinations of the 2^4 possible variations of i,b,u,s
  tags: [], ['i','b','u','s'], ['i'], ['b','s'], ['i','b','u']

  - But, I am not fully sure if this implements the right behavior when
    only a subset of inline tags are provided.  Needs discussion and tweaking
	 as necessary.

* Also tested on a few others:
  <b>B</b><b><i>BI</i></b><b><i><u>BIU</u></i></b><b><i><u><s>BIUS</s></u></i></b>
  <s><i><b>SIB</s></i></b><s><i><u>SIU</u></i></s><i><u>IU</u></i><i>I</i>

* The previous pairwise tag rewriting version fails on several of these
  examples, so this new version is a definite improvement.

* No change in parserTests run (203 passing before and after).

* Possible improvements that could/should be undertaken:
  - get rid of useless/idempotent add/remove of nodes that don't change
    the DOM.
  - ensure that node attributes post-restructuring are correct.

Change-Id: Ib4a8b39583fa96a2be880a77021ca81cefa06484
2012-05-31 12:10:28 -05:00

470 lines
13 KiB
JavaScript

/* Perform post-processing steps on an already-built HTML DOM. */
var events = require('events'),
util = require('./ext.Util.js'),
Util = new util.Util();
// Quick HACK: local table of the DOM nodeType values this file compares
// against (see https://developer.mozilla.org/en/nodeType).
var Node = {
	// An element such as <p> or <b>
	ELEMENT_NODE: 1,
	// Character data inside an element
	TEXT_NODE: 3,
	// A comment node
	COMMENT_NODE: 8
};
/**
 * Tell whether a node's character data is empty or made up solely of
 * spaces, carriage returns, newlines and tabs.
 *
 * @param {Object} e Node whose 'data' property holds its text.
 * @returns {boolean} true when 'data' contains nothing but that whitespace.
 */
var isElementContentWhitespace = function ( e ) {
	// The pattern is anchored at both ends, so a hit means the whole string
	// matched; search() reports -1 when there is no hit.
	var whitespaceOnly = /^[ \r\n\t]*$/;
	return e.data.search(whitespaceOnly) !== -1;
};
/**
 * Restructure the DOM below 'root' so that neighbouring siblings which are
 * wrapped in the same rewriteable inline tags end up sharing those wrappers,
 * which reduces the number of tags needed.
 *
 * The tree is modified in place. Any error raised while rewriting is caught
 * and logged to the console instead of being propagated to the caller.
 *
 * @param {Object} root DOM node whose descendants are rearranged.
 * @param {string[]} rewriteable_nodes Tag names (any case) that may take part
 *   in the minimization, e.g. ['b', 'i', 'u', 's'].
 */
function minimize_inline_tags(root, rewriteable_nodes) {
	// Set of lower-cased rewriteable tag names, filled in by init()
	var rewriteable_node_map = null;

	// Last element of an array
	function tail(a) {
		return a[a.length - 1];
	}

	function remove_all_children(node) {
		while (node.hasChildNodes()) {
			node.removeChild(node.firstChild);
		}
	}

	function add_children(node, children) {
		for (var i = 0, n = children.length; i < n; i++) {
			node.appendChild(children[i]);
		}
	}

	function init() {
		// Prototype-less object so that tag names can never collide with
		// inherited Object properties.
		rewriteable_node_map = Object.create(null);
		for (var i = 0, n = rewriteable_nodes.length; i < n; i++) {
			rewriteable_node_map[rewriteable_nodes[i].toLowerCase()] = true;
		}
	}

	function is_rewriteable_node(node_name) {
		return rewriteable_node_map[node_name];
	}

	// Main routine
	function rewrite(node) {
		var children = node.childNodes;
		var n = children.length;

		// If we have a single node, no restructuring is possible at this level
		// Descend ...
		if (n === 1) {
			var sole_node = children[0];
			if (sole_node.nodeType === Node.ELEMENT_NODE) {
				rewrite(sole_node);
			}
			return;
		}

		// * Collect longest linear paths for all children
		// * Process subtrees attached to the end of those paths
		// * Restructure the list of linear paths (and reattach processed subtrees at the tips).
		var P = [];
		for (var i = 0; i < n; i++) {
			var s = children[i];
			if (s.nodeType === Node.ELEMENT_NODE) {
				// The path of an element always contains at least the element itself
				var p = longest_linear_path(s);
				var p_tail = tail(p);

				// process subtree (depth-first)
				rewrite(p_tail);

				// Snapshot the restructured p_tail children into a plain array,
				// because they get detached and re-attached later on.
				var child_nodes = p_tail.childNodes;
				var new_children = [];
				for (var j = 0, n2 = child_nodes.length; j < n2; j++) {
					new_children.push(child_nodes[j]);
				}
				P.push({path: p, orig_parent: p_tail, children: new_children});
			} else {
				// Non-element nodes are carried along with an empty path
				P.push({path: [], orig_parent: node, children: [s]});
			}
		}

		// Rewrite paths in 'P'
		if (P.length > 0) {
			rewrite_paths(node, P);
		}
	}

	// Follow the chain of elements starting at 'node' for as long as each
	// element has exactly one child and that child is an element too.
	function longest_linear_path(node) {
		var children, path = [];
		while (node.nodeType === Node.ELEMENT_NODE) {
			path.push(node);
			children = node.childNodes;
			if (children.length !== 1) {
				return path;
			}
			node = children[0];
		}
		return path;
	}

	function rewrite_paths(parent_node, P) {
		// 1. Split P into maximal sublists where each sublist has a non-null path intersection.
		// 2. Process each sublist separately and accumulate the result.
		//
		// lcs = longest common sublist
		remove_all_children(parent_node);

		var sublists = split_into_disjoint_sublists(P);
		for (var i = 0, num_sublists = sublists.length; i < num_sublists; i++) {
			var s = sublists[i];
			var lcs = s.lcs;

			if (lcs.length > 0) {
				// Connect up LCS: make every element the only child of its predecessor
				var prev = lcs[0];
				for (var k = 1, lcs_len = lcs.length; k < lcs_len; k++) {
					var curr = lcs[k];
					// SSS FIXME: this add/remove can be optimized
					remove_all_children(prev);
					prev.appendChild(curr);
					prev = curr;
				}

				// Lastly, attach lcs to the incoming parent
				parent_node.appendChild(lcs[0]);
			}

			var paths = s.paths;
			if (paths.length === 1) {
				// Nothing more to do! Stitch things up
				// two possible scenarios:
				// (a) we have an empty path ==> attach the children to parent_node
				// (b) we have a non-empty path ==> attach the children to the end of the path
				var p = paths[0].path;
				var children = paths[0].children;
				if (p.length > 0) {
					var p_tail = tail(p);
					remove_all_children(p_tail);
					add_children(p_tail, children);
				} else {
					add_children(parent_node, children);
				}
			} else {
				// Several paths share this lcs: hang what is left of each
				// path below the innermost lcs element.
				rewrite_paths(tail(lcs), strip_lcs(paths, lcs));
			}
		}
	}

	// Elements of 'old' whose tag is rewriteable and also occurs in 'new_path'
	function common_path(old, new_path) {
		var i, n, hash = Object.create(null);
		for (i = 0, n = new_path.length; i < n; i++) {
			var e = new_path[i].nodeName.toLowerCase();
			if (is_rewriteable_node(e)) {
				hash[e] = new_path[i];
			}
		}

		var cp = [];
		for (i = 0, n = old.length; i < n; i++) {
			// Add old path element always. This effectively picks elements from the leftmost path.
			if (hash[old[i].nodeName.toLowerCase()]) {
				cp.push(old[i]);
			}
		}
		return cp;
	}

	// For each 'p' in 'paths', eliminate 'lcs' from 'p'
	function strip_lcs(paths, lcs) {
		// The lcs holds element objects taken from the leftmost path only, so
		// the other paths have to be matched by tag name, not by object. Using
		// the nodes themselves as object keys would depend on how the DOM
		// implementation converts an element to a string.
		//
		// SSS FIXME: Implicit assumption: there are no duplicate elements in lcs or path!
		// Ex: <b><i><b>BIB</b></i></b>
		// Fix this to be more robust
		var i, n, lcs_tags = Object.create(null);
		for (i = 0, n = lcs.length; i < n; i++) {
			lcs_tags[lcs[i].nodeName.toLowerCase()] = true;
		}

		function not_in_lcs(node) {
			return !lcs_tags[node.nodeName.toLowerCase()];
		}

		for (i = 0, n = paths.length; i < n; i++) {
			paths[i].path = paths[i].path.filter(not_in_lcs);
		}
		return paths;
	}

	// Split 'P' into sublists where each sublist has the property that
	// the elements of the sublist have an intersection that is non-zero
	// Ex: [BIUS, SB, BUS, IU, I, U, US, B, I] will get split into 5 sublists
	// - (lcs: BS, paths: [BIUS, SB, BUS])
	// - (lcs: I,  paths: [IU, I])
	// - (lcs: U,  paths: [U, US])
	// - (lcs: B,  paths: [B])
	// - (lcs: I,  paths: [I])
	//
	// 'P' must be non-empty; it is left unmodified.
	function split_into_disjoint_sublists(P) {
		var sublists = [];
		var lcs = P[0].path;
		var curr = [P[0]];

		for (var i = 1, n = P.length; i < n; i++) {
			var p = P[i];
			var new_lcs = common_path(lcs, p.path);
			if (new_lcs.length === 0) {
				// Nothing in common any more: close this sublist and start a new one at 'p'
				sublists.push({lcs: lcs, paths: curr});
				lcs = p.path;
				curr = [p];
			} else {
				lcs = new_lcs;
				curr.push(p);
			}
		}

		sublists.push({lcs: lcs, paths: curr});
		return sublists;
	}

	// Init
	init();

	// Kick it off
	try {
		rewrite(root);
	} catch (e) {
		console.log("------- error errrror errrrrror! ----------");
		console.log(e.stack);
	}
}
// Pairwise rewrite of adjacent siblings:
//
//   [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
//   [..., T2[*x], T1[T2[*y]], ...] ==> [..., T2[*x, T1[*y]], ...]
//
// T1 and T2 are two different tags that were registered together as a pair
// (for example 'i' and 'b').
//
// Strictly speaking the rewrite would also be valid when T1 has several
// children that are all T2's, but for now only a T1 with a single child is
// handled.
//
// Lookup table from a tag name to the list of tags it is paired with. It is
// built on the first call and reused afterwards, so the tag_pairs handed to
// later calls do not change it.
var tag_pair_map = null;
var rewrite_nested_tag_pairs = function(document, tag_pairs, node) {
	// Register 'to' as a partner of 'from', creating the list on first use
	function link_tags(from, to) {
		var partners = tag_pair_map[from];
		if (partners === undefined) {
			partners = [];
			tag_pair_map[from] = partners;
		}
		partners.push(to);
	}

	// Build the lookup table; every pair is recorded in both directions.
	// Ex: ['b', 'i'] yields {b: ['i'], i: ['b']}
	function init_tag_pairs(tag_pairs) {
		tag_pair_map = {};
		for (var idx = 0, count = tag_pairs.length; idx < count; idx++) {
			var pair = tag_pairs[idx];
			link_tags(pair[0], pair[1]);
			link_tags(pair[1], pair[0]);
		}
	}

	// Move every child of 'src' to the end of 'tgt'
	function migrate_all_children(src, tgt) {
		var kids = src.childNodes;
		var remaining = kids.length;
		while (remaining > 0) {
			// IMPORTANT: always take the item at the head of the list. Appending
			// a child to 'tgt' also pulls it out of this list, so a running
			// index would skip nodes.
			tgt.appendChild(kids.item(0));
			remaining--;
		}
	}

	// Set every attribute of 'src' on 'tgt'
	function copy_attributes(src, tgt) {
		var attrs = src.attributes;
		for (var idx = 0, count = attrs.length; idx < count; idx++) {
			var attr = attrs.item(idx);
			tgt.setAttribute(attr.nodeName, attr.nodeValue);
		}
	}

	// Were t1 and t2 registered as a pair?
	function rewriteable_tag_pair(t1, t2) {
		// SSS FIXME: I dont think this check is sufficient. We also need to
		// verify that nodes having these tags t1 and t2 are deletable as well.
		var partners = tag_pair_map[t1];
		return partners !== undefined && partners.indexOf(t2) !== -1;
	}

	// Build the replacement for two adjacent siblings if they match one of
	// the two patterns. Returns the new outer element, or null when the
	// siblings cannot be merged. The siblings stay in the tree (their
	// children have been moved out when a replacement is returned).
	function merge_pair(first, second) {
		var t1 = first.nodeName.toLowerCase();
		var t2 = second.nodeName.toLowerCase();
		if (!rewriteable_tag_pair(t1, t2)) {
			return null;
		}

		var outer, inner;
		if (first.childNodes.length === 1 &&
			first.childNodes[0].nodeName.toLowerCase() === t2)
		{
			// T1[T2[*x]], T2[*y] ==> T2[T1[*x], *y]
			outer = document.createElement(t2);
			inner = document.createElement(t1);
			outer.appendChild(inner);
			migrate_all_children(second, outer);
			migrate_all_children(first.childNodes[0], inner);
			copy_attributes(first, inner);
			copy_attributes(second, outer);
			return outer;
		}

		if (second.childNodes.length === 1 &&
			second.childNodes[0].nodeName.toLowerCase() === t1)
		{
			// T2[*x], T1[T2[*y]] ==> T2[*x, T1[*y]]
			outer = document.createElement(t1);
			inner = document.createElement(t2);
			migrate_all_children(first, outer);
			migrate_all_children(second.childNodes[0], inner);
			outer.appendChild(inner);
			copy_attributes(first, outer);
			copy_attributes(second, inner);
			return outer;
		}

		return null;
	}

	// Initialization (first call only)
	if (tag_pair_map === null) {
		init_tag_pairs(tag_pairs);
	}

	var siblings = node.childNodes;

	// no re-writing possible
	if (siblings.length < 2) {
		return;
	}

	// Walk over adjacent sibling pairs looking for the pattern
	var pos = 0;
	var last = siblings.length - 1;
	while (pos < last) {
		var first = siblings[pos];
		var second = siblings[pos + 1];
		var merged = merge_pair(first, second);
		if (merged === null) {
			pos++;
			continue;
		}

		// Two siblings are replaced by one node, so there is one pair less to check
		node.insertBefore(merged, first);
		node.removeChild(first);
		node.removeChild(second);
		last--;

		// Reprocess the children of the new node
		rewrite_nested_tag_pairs(document, tag_pairs, merged);

		// 'pos' is left alone on purpose: the new node sits at this position
		// and is revisited to see whether it merges with its next sibling.
	}
};
// Rewrite rules currently implemented:
// 1. [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
//
// Other possible rewrite rules:
// 2. [..., T1[T2[*x]], ...] ==> [..., T2[T1[*x]], ...]
// 3. [..., T[*x], T[*y], ...] ==> [..., T[*x,*y], ...]
//
// Note: Rewriting 2. and 3. in that sequence will effectively yield 1.
// but implementing 1. directly is faster
//
// Not sure we want to implement all kinds of rewriting rules yet.
// We have to figure out what rules are worth implementing.
/**
 * Apply every rewrite rule to each node of a subtree, bottom-up: all
 * children of a node are normalized before the rules run on the node itself.
 *
 * @param {Object} node Root of the subtree; must expose 'childNodes'.
 * @param {Function[]} rewrite_rules Rules, each invoked with a node as its
 *   only argument. Nothing is visited when the list is empty.
 */
var normalize_subtree = function(node, rewrite_rules) {
	if (rewrite_rules.length === 0) {
		return;
	}

	// Children first ...
	var kids = node.childNodes;
	var kid_count = kids.length;
	var k = 0;
	while (k < kid_count) {
		normalize_subtree(kids[k], rewrite_rules);
		k++;
	}

	// ... then the node itself, one rule after the other
	var rule_count = rewrite_rules.length;
	var r = 0;
	while (r < rule_count) {
		rewrite_rules[r](node);
		r++;
	}
};
/**
 * Normalize the inline formatting tags of a document by running the tag
 * minimization over its body for the b, u, i and s tags.
 *
 * The older pairwise approach is kept here for reference only:
 *   normalize_subtree(document.body, [rewrite_nested_tag_pairs.bind(null, document, [['b','i'], ['b','u'], ['i','u']])]);
 *
 * @param {Object} document Document whose 'body' is rewritten in place.
 */
var normalize_document = function(document) {
	var inline_tags = ['b','u','i','s'];
	minimize_inline_tags(document.body, inline_tags);
};
// Wrap every run of top-level inline content of the document body in a <p>.
// A run is started by non-whitespace text or by an element that is not a
// block tag; whitespace text and comments are only taken along once a run
// is open. A run ends at the next block-level element. This should also be
// applied inside block-level elements, but in that case the first paragraph
// usually remains plain inline.
var process_inlines_in_p = function ( document ) {
	var body = document.body,
		pending = document.createElement('p'),
		cnodes = body.childNodes,
		total = cnodes.length,
		open = false,
		// Position in the (live) child list of the node to look at next
		pos = 0;

	// Does 'child' go into the paragraph that is being collected?
	function belongs_in_paragraph(child) {
		var ctype = child.nodeType;
		if (ctype === Node.TEXT_NODE) {
			// Whitespace-only text never starts a paragraph
			return open || !isElementContentWhitespace( child );
		}
		if (ctype === Node.COMMENT_NODE) {
			return open;
		}
		return !Util.isBlockTag(child.nodeName.toLowerCase());
	}

	// One step per node that was in the body to begin with
	for (var step = 0; step < total; step++) {
		var child = cnodes[pos];
		if (belongs_in_paragraph(child)) {
			// Moving the child into the paragraph takes it out of the body,
			// so the next node slides into the current position.
			pending.appendChild(child);
			open = true;
		} else if (open) {
			// Close the run: the paragraph goes in front of this node, which
			// shifts this node one position further down.
			body.insertBefore(pending, child);
			pending = document.createElement('p');
			open = false;
			pos += 2;
		} else {
			pos += 1;
		}
	}

	// Flush a run that reaches the end of the body
	if (open) {
		body.appendChild(pending);
	}
};
/**
 * Post-processor that runs a fixed list of DOM passes over a document and
 * then re-emits the document (see doPostProcess).
 *
 * @constructor
 */
function DOMPostProcessor () {
	// Run the EventEmitter initialisation on this instance so that every
	// post-processor has listener state of its own.
	events.EventEmitter.call( this );
	this.processors = [process_inlines_in_p, normalize_document];
}
// Inherit from EventEmitter. The prototype is created without constructing
// an emitter: an emitter instance used as prototype would carry one listener
// table that all post-processors inherit and therefore share.
DOMPostProcessor.prototype = Object.create( events.EventEmitter.prototype );
DOMPostProcessor.prototype.constructor = DOMPostProcessor;
/**
 * Run all registered processors over the document, in list order, and then
 * emit a 'document' event carrying the processed document.
 *
 * @param {Object} document Document handed to every processor and then
 *   passed along with the event.
 */
DOMPostProcessor.prototype.doPostProcess = function ( document ) {
	var step = 0;
	while (step < this.processors.length) {
		this.processors[step](document);
		step += 1;
	}
	this.emit( 'document', document );
};
/**
 * Subscribe this post-processor to the 'document' event of the given
 * emitter (normally the HTML5 tree builder), so that every emitted document
 * is run through doPostProcess.
 *
 * @param {Object} emitter Event source providing addListener().
 */
DOMPostProcessor.prototype.addListenersOn = function ( emitter ) {
	// Bind now so the handler runs against this post-processor no matter
	// how the emitter invokes it.
	var onDocument = this.doPostProcess.bind( this );
	emitter.addListener( 'document', onDocument );
};
// Expose the post-processor when a 'module' object is available.
if (typeof module === "object") {
	module.exports.DOMPostProcessor = DOMPostProcessor;
}