2011-11-29 15:11:51 +00:00
|
|
|
/* Perform post-processing steps on an already-built HTML DOM. */
|
|
|
|
|
2012-01-17 18:30:22 +00:00
|
|
|
var events = require('events'),
|
|
|
|
util = require('./ext.Util.js'),
|
|
|
|
Util = new util.Util();
|
2011-11-29 15:11:51 +00:00
|
|
|
|
2011-12-14 15:15:41 +00:00
|
|
|
// Quick HACK: local copy of the DOM nodeType constants used in this file.
// Only the three node kinds this module distinguishes are listed.
// https://developer.mozilla.org/en/nodeType
var Node = {
	// Element nodes, e.g. <b> or <p>
	ELEMENT_NODE: 1,
	// Text nodes
	TEXT_NODE: 3,
	// Comment nodes (<!-- ... -->)
	COMMENT_NODE: 8
};
|
|
|
|
|
2012-02-11 16:43:25 +00:00
|
|
|
/**
 * Tell whether a character-data node consists solely of whitespace.
 *
 * @param {Object} e Node whose `data` string is inspected.
 * @returns {boolean} true when `e.data` is empty or contains only spaces,
 *   carriage returns, newlines and tabs.
 */
var isElementContentWhitespace = function ( e ) {
	// The pattern is anchored at both ends, so it can only match at index 0.
	var whitespaceOnly = /^[ \r\n\t]*$/;
	return e.data.search( whitespaceOnly ) === 0;
};
|
|
|
|
|
First pass implementing a general tag minimization routine
* This routine attempts to rewrite the DOM to maximize tag overlap
and thus minimize tag uses.
* This takes as input a set of tags which participate in the
minimization.
* Tested on the following example
<b><i><u><s>BIUS</s></u></i></b><b><i><s>BIS</s></i></b><b><u><s>BUS</s></u></b><u><i>UI</i></u>
with multiple combinations of the 2^4 possible variations of i,b,u,s
tags: [], ['i','b','u','s'], ['i'], ['b','s'], ['i','b','u']
- But, I am not fully sure if this implements the right behavior when
only a subset of inline tags are provided. Needs discussion and tweaking
as necessary.
* Also tested on few others:
<b>B</b><b><i>BI</i></b><b><i><u>BIU</u></i></b><b><i><u><s>BIUS</s></u></i></b>
<s><i><b>SIB</s></i></b><s><i><u>SIU</u></i></s><i><u>IU</u></i><i>I</i>
* The previous pairwise tag rewriting version fails on several of these
examples, so this new version is a definite improvement.
* No change in parserTests run (203 passing before and after).
* Possible improvements that could/should be undertaken:
- get rid of useless/idempotent add/remove of nodes that don't change
the DOM.
- ensure that node attributes post-restructuring are correct.
Change-Id: Ib4a8b39583fa96a2be880a77021ca81cefa06484
2012-05-31 03:54:23 +00:00
|
|
|
/**
 * Rewrite the DOM below `root` in place so that sibling chains of nested
 * tags share as many wrapper elements as possible, e.g.
 * <b><i>x</i></b><b>y</b> becomes <b><i>x</i>y</b>.
 *
 * Any error raised while rewriting is logged to the console and swallowed
 * (best effort), so the DOM may be left partially rewritten in that case.
 *
 * @param {Object} root DOM node whose subtree is restructured.
 * @param {string[]} rewriteable_nodes Tag names (any case) that are allowed
 *   to be merged across neighbouring paths.
 */
function minimize_inline_tags(root, rewriteable_nodes) {
	// Lower-cased tag name -> true. Prototype-less so that inherited names
	// such as 'constructor' are never mistaken for rewriteable tags.
	var rewriteable_node_map = Object.create(null);
	for (var r = 0, num_tags = rewriteable_nodes.length; r < num_tags; r++) {
		rewriteable_node_map[rewriteable_nodes[r].toLowerCase()] = true;
	}

	// Last element of an array (undefined for an empty array).
	function tail(a) {
		return a[a.length - 1];
	}

	function remove_all_children(node) {
		while (node.hasChildNodes()) node.removeChild(node.firstChild);
	}

	function add_children(node, children) {
		for (var i = 0, n = children.length; i < n; i++) node.appendChild(children[i]);
	}

	// node_name must already be lower-cased.
	function is_rewriteable_node(node_name) {
		return rewriteable_node_map[node_name] === true;
	}

	// Main routine
	function rewrite(node) {
		var children = node.childNodes;
		var n = children.length;

		// If we have a single node, no restructuring is possible at this level.
		// Descend ...
		if (n === 1) {
			var sole_node = children[0];
			if (sole_node.nodeType === Node.ELEMENT_NODE) rewrite(sole_node);
			return;
		}

		// * Collect longest linear paths for all children
		// * Process subtrees attached to the end of those paths
		// * Restructure the list of linear paths (and reattach processed
		//   subtrees at the tips).
		var P = [];
		for (var i = 0; i < n; i++) {
			var s = children[i];
			if (s.nodeType !== Node.ELEMENT_NODE) {
				// Non-element child: carried along as an empty path.
				P.push({path: [], orig_parent: node, children: [s]});
				continue;
			}

			var p = longest_linear_path(s);
			if (p.length === 0) {
				// Defensive only: longest_linear_path always includes its
				// starting element, so this is not expected to trigger.
				rewrite(s);
				P.push({path: [], orig_parent: node, children: [s]});
				continue;
			}

			// Process the subtree hanging off the end of the path (depth-first)
			var p_tail = tail(p);
			rewrite(p_tail);

			// Snapshot the restructured children of the path tip; the
			// childNodes list itself changes as nodes are moved around later.
			var child_nodes = p_tail.childNodes;
			var new_children = [];
			for (var j = 0, n2 = child_nodes.length; j < n2; j++) {
				new_children.push(child_nodes[j]);
			}

			P.push({path: p, orig_parent: p_tail, children: new_children});
		}

		// Rewrite paths in 'P'
		if (P.length > 0) rewrite_paths(node, P);
	}

	// Follow single-child element links starting at 'node' and return the
	// chain of elements visited (always starts with 'node' for an element).
	function longest_linear_path(node) {
		var children, path = [];
		while (node.nodeType === Node.ELEMENT_NODE) {
			path.push(node);
			children = node.childNodes;
			if (children.length !== 1) return path;
			node = children[0];
		}

		return path;
	}

	function rewrite_paths(parent_node, P) {
		// 1. Split P into maximal sublists where each sublist has a non-null
		//    path intersection.
		// 2. Process each sublist separately and accumulate the result.
		//
		// lcs = longest common sublist
		var k, lcs_len;

		remove_all_children(parent_node);

		var sublists = split_into_disjoint_sublists(P);
		for (var i = 0, num_sublists = sublists.length; i < num_sublists; i++) {
			var s = sublists[i];
			var lcs = s.lcs;

			if (lcs.length > 0) {
				// Connect up the LCS elements into a single chain
				var prev = lcs[0];
				for (k = 1, lcs_len = lcs.length; k < lcs_len; k++) {
					var curr = lcs[k];
					// SSS FIXME: this add/remove can be optimized
					remove_all_children(prev);
					prev.appendChild(curr);
					prev = curr;
				}

				// Lastly, attach lcs to the incoming parent
				parent_node.appendChild(lcs[0]);
			}

			var paths = s.paths;
			if (paths.length === 1) {
				// Nothing more to do! Stitch things up.
				// Two possible scenarios:
				// (a) empty path     ==> attach the children to parent_node
				// (b) non-empty path ==> attach the children to the end of the path
				var p = paths[0].path;
				var children = paths[0].children;
				if (p.length > 0) {
					var p_tail = tail(p);
					remove_all_children(p_tail);
					add_children(p_tail, children);
				} else {
					add_children(parent_node, children);
				}
			} else {
				// Several paths share the lcs: hang what remains of each
				// path below the tip of the lcs chain.
				rewrite_paths(tail(lcs), strip_lcs(paths, lcs));
			}
		}
	}

	// Elements of 'old' (in order) whose tag name is rewriteable and also
	// occurs in 'new_path'. Elements are always taken from 'old', which
	// effectively picks elements from the leftmost path.
	function common_path(old, new_path) {
		var i, n, hash = Object.create(null);
		for (i = 0, n = new_path.length; i < n; i++) {
			var e = new_path[i].nodeName.toLowerCase();
			if (is_rewriteable_node(e)) hash[e] = new_path[i];
		}

		var cp = [];
		for (i = 0, n = old.length; i < n; i++) {
			if (hash[old[i].nodeName.toLowerCase()]) cp.push(old[i]);
		}

		return cp;
	}

	// For each 'p' in 'paths', eliminate 'lcs' from 'p' (in place).
	function strip_lcs(paths, lcs) {
		// SSS FIXME: Implicit assumption: there are no duplicate elements in
		// lcs or path! Ex: <b><i><b>BIB</b></i></b> will have both <b>
		// elements stripped. Fix this to be more robust.
		//
		// Elements are matched by lower-cased tag name, the same criterion
		// common_path() uses. Keying the map by the node object itself would
		// depend on the DOM library's toString() implementation.
		var i, n, lcs_map = Object.create(null);
		for (i = 0, n = lcs.length; i < n; i++) {
			lcs_map[lcs[i].nodeName.toLowerCase()] = true;
		}

		for (i = 0, n = paths.length; i < n; i++) {
			var p = paths[i].path;
			// Walk backwards so splicing does not disturb unvisited indexes.
			for (var j = p.length - 1; j >= 0; j--) {
				if (lcs_map[p[j].nodeName.toLowerCase()]) p.splice(j, 1);
			}
		}

		return paths;
	}

	// Split 'P' into sublists where each sublist has the property that
	// the elements of the sublist have an intersection that is non-zero
	// Ex: [BIUS, SB, BUS, IU, I, U, US, B, I] will get split into 5 sublists
	// - (lcs: BS, paths: [BIUS, SB, BUS])
	// - (lcs: I,  paths: [IU, I])
	// - (lcs: U,  paths: [U, US])
	// - (lcs: B,  paths: [B])
	// - (lcs: I,  paths: [I])
	// Note: consumes 'P' (entries are shifted off the front).
	function split_into_disjoint_sublists(P) {
		var p = P.shift();
		var lcs = p.path;
		var curr = [p];
		var new_lcs;

		for (var i = 0, n = P.length; i < n; i++) {
			p = P.shift();
			new_lcs = common_path(lcs, p.path);
			if (new_lcs.length === 0) {
				// 'p' shares nothing with the current group: close the group
				// and start a new one with 'p' at its head.
				P.unshift(p);
				return [{lcs: lcs, paths: curr}].concat(split_into_disjoint_sublists(P));
			}
			lcs = new_lcs;
			curr.push(p);
		}

		return [{lcs: lcs, paths: curr}];
	}

	// Kick it off
	try {
		rewrite(root);
	} catch (e) {
		console.log("------- error errrror errrrrror! ----------");
		console.log(e.stack);
	}
}
|
|
|
|
|
First attempt implementing rewriting rules on the DOM
- This is implemented as a post-processing pass.
- Might require additional checks to verify rewriteability.
- Implemented as a pair-wise tag DOM minimization strategy,
i.e. it takes tag pairs (B, I) for ex, and attempts to
normalize the tree just for those tag pairs. Normalizing
across multiple tags is implemented as pairwise rewriting
across all pairs: Ex:(b,i), (b,u),(i,u) for (b,i,u)
- Copied over attributes as part of rewriting, but some of the
attributes lose their meaning on rewriting since tags are
reordered (ex: sourcePosn, sourceTagPosn). How do we handle this?
Output examples and possible issues to fix:
<i><b><u>biu</u></b></i><b><u>bu</u></b><u>u</u>
gets rewritten to:
<u><b><i>biu</i>bu</b>u</u>
But, the equivalent wikitext form:
'''''<u>biu</u>''''''''<u>bu</u>'''<u>u</u>
does not get rewritten because of parsing differences.
This wikitext gets parsed into:
<i><b><u>biu</u>'''</b></i><u>bu<b>u</b></u>
The extra ''' token in the middle thwarts DOM rewriting.
However, a slightly different version:
"'''''<u>biu</u>''<u>bu</u>'''<u>u</u>"
gets properly normalized to:
<u>'''''biu''bu'''u</u>
An alternative, but fun strategy to play with is to use the following
two normalization primitives: S(wap) and M(erge).
- S rewrites T1(T2(x)) into T2(T1(x))
(ex: <b><i>foo</i></b> ==> <i><b>foo</b></i>)
- M rewrites (T(x),T(y)) into (T(x,y)).
(ex: <b>foo</b><b>bar</b> ==> <b>foobar</b>)
The current rewriting strategy could possibly be re-implemented as S-M
rewriting. The problem to solve there would be to find an efficient
rewriting strategy that is guaranteed to lead to a normal form. I may
not play with it now, but just documenting it for later (to play with
in my spare time).
This commit is just as a record of fun/experimental code where I get to
learn details of JS, wikitext, parsing, and DOM manipulation. Next
version of this code will attempt to introduce minimal DOM restructuring
across multiple tags at once which can be more efficient.
gwicke: Removed now passing test from whitelist, and updated another whitelist
entry which is now improved.
Change-Id: Ie97bcb164eb62c34ba61aa76ba2f4c232aa713d8
2012-05-25 01:10:47 +00:00
|
|
|
// [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
|
|
|
|
// [..., T2[*x], T1[T2[*y]], ...] ==> [..., T2[*x, T1[*y]], ...]
|
|
|
|
// where T1 and T2 are different and can be one of [i, b]
|
|
|
|
//
|
|
|
|
// Strictly speaking, we can rewrite even when T1 has more than one child as long as all of them are T2's
|
|
|
|
// but for now, we are going to restrict ourselves to a single child.
|
|
|
|
// Lookup table: tag name -> list of tag names it may be swapped with.
// Cached between calls; tag_pair_map_source remembers which tag_pairs array
// the table was built from so that a different array triggers a rebuild.
var tag_pair_map = null;
var tag_pair_map_source = null;

/**
 * Apply the pairwise rewrite rule to the direct children of `node`:
 *
 *   [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
 *   [..., T1[*x], T2[T1[*y]], ...] ==> [..., T1[*x, T2[*y]], ...]
 *
 * Only wrappers with exactly one child are considered. The two original
 * siblings are replaced by freshly created elements; the attributes of the
 * two siblings are copied onto the new elements of the same tag name.
 *
 * @param {Object} document Document used to create the replacement elements.
 * @param {Array} tag_pairs List of two-element arrays of lower-case tag
 *   names, e.g. [['b','i'], ['b','u']].
 * @param {Object} node Node whose child list is scanned and rewritten in place.
 */
var rewrite_nested_tag_pairs = function(document, tag_pairs, node) {
	// Record 'partner' as a valid swap partner of 'tag'.
	function add_partner(tag, partner) {
		if (tag_pair_map[tag] === undefined) tag_pair_map[tag] = [];
		tag_pair_map[tag].push(partner);
	}

	function init_tag_pairs(tag_pairs) {
		tag_pair_map = Object.create(null);
		tag_pair_map_source = tag_pairs;
		for (var i = 0, n = tag_pairs.length; i < n; i++) {
			// Ex: t1 = 'b', t2 = 'i'; the relation is symmetric.
			var t1 = tag_pairs[i][0];
			var t2 = tag_pairs[i][1];
			add_partner(t1, t2);
			add_partner(t2, t1);
		}
	}

	function migrate_all_children(src, tgt) {
		// Appending a child to 'tgt' pulls it out of 'src', so always take
		// whatever is currently first until 'src' is empty.
		while (src.hasChildNodes()) tgt.appendChild(src.firstChild);
	}

	function copy_attributes(src, tgt) {
		var attrs = src.attributes;
		for (var i = 0, n = attrs.length; i < n; i++) {
			var a = attrs.item(i);
			tgt.setAttribute(a.nodeName, a.nodeValue);
		}
	}

	function rewriteable_tag_pair(t1, t2) {
		// SSS FIXME: I dont think this check is sufficient. We also need to
		// verify that nodes having these tags t1 and t2 are deletable as well.
		var partners = tag_pair_map[t1];
		return partners !== undefined && partners.indexOf(t2) !== -1;
	}

	// True when 'wrapper' has exactly one child and that child has tag 'tag'.
	function wraps_only(wrapper, tag) {
		var kids = wrapper.childNodes;
		return kids.length === 1 && kids[0].nodeName.toLowerCase() === tag;
	}

	// Initialization. Rebuild when called with a different pair list so a
	// table built for an earlier caller is never reused by mistake.
	if (tag_pair_map === null || tag_pair_map_source !== tag_pairs) {
		init_tag_pairs(tag_pairs);
	}

	var children = node.childNodes;

	// no re-writing possible
	if (children.length < 2) return;

	// look for the pattern in each pair of adjacent children
	for (var i = 0, n = children.length - 1; i < n; i++) {
		var child = children[i];
		var sibling = children[i + 1];
		var t1 = child.nodeName.toLowerCase();
		var t2 = sibling.nodeName.toLowerCase();
		if (!rewriteable_tag_pair(t1, t2)) continue;

		var newChild = null, newGrandChild = null;
		if (wraps_only(child, t2)) {
			// T1[T2[*x]], T2[*y] ==> T2[T1[*x], *y]
			newChild = document.createElement(t2);
			newGrandChild = document.createElement(t1);

			newChild.appendChild(newGrandChild);
			migrate_all_children(sibling, newChild);
			migrate_all_children(child.childNodes[0], newGrandChild);

			copy_attributes(child, newGrandChild);
			copy_attributes(sibling, newChild);
		} else if (wraps_only(sibling, t1)) {
			// T1[*x], T2[T1[*y]] ==> T1[*x, T2[*y]]
			newChild = document.createElement(t1);
			newGrandChild = document.createElement(t2);

			migrate_all_children(child, newChild);
			migrate_all_children(sibling.childNodes[0], newGrandChild);
			newChild.appendChild(newGrandChild);

			copy_attributes(child, newChild);
			copy_attributes(sibling, newGrandChild);
		}

		if (newChild === null) continue;

		// modify DOM here: two siblings collapse into one node
		node.insertBefore(newChild, child);
		node.removeChild(child);
		node.removeChild(sibling);

		// one child fewer to scan
		n--;

		// Reprocess newChild
		rewrite_nested_tag_pairs(document, tag_pairs, newChild);

		// Revisit the new node and see if it will merge with its sibling.
		// Hence the decrement.
		i--;
	}
};
|
|
|
|
|
|
|
|
// Rewrite rules currently implemented:
|
|
|
|
// 1. [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
|
|
|
|
//
|
|
|
|
// Other possible rewrite rules:
|
|
|
|
// 2. [..., T1[T2[*x]], ...] ==> [..., T2[T1[*x], ...]
|
|
|
|
// 3. [..., T[*x], T[*y], ...] ==> [..., T[*x,*y], ...]
|
|
|
|
//
|
|
|
|
// Note: Rewriting 2. and 3. in that sequence will effectively yield 1.
|
|
|
|
// but implementing 1. directly is faster
|
|
|
|
//
|
|
|
|
// Not sure we want to implement all kinds of rewriting rules yet.
|
|
|
|
// We have to figure out what rules are worth implementing.
|
|
|
|
/**
 * Apply every rewrite rule to each node of a subtree, bottom-up: all
 * children of a node are normalized before the rules run on the node itself.
 *
 * @param {Object} node Root of the subtree; its `childNodes` are traversed.
 * @param {Function[]} rewrite_rules Functions invoked as rule(node), in order.
 *   With an empty list the subtree is not traversed at all.
 */
var normalize_subtree = function(node, rewrite_rules) {
	var rule_count = rewrite_rules.length;
	if (rule_count === 0) return;

	// Children first. The child count is captured once, before descending.
	var kids = node.childNodes;
	var kid_count = kids.length;
	var c = 0;
	while (c < kid_count) {
		normalize_subtree(kids[c], rewrite_rules);
		c += 1;
	}

	// Then the node itself, one rule after the other.
	var r = 0;
	while (r < rule_count) {
		rewrite_rules[r](node);
		r += 1;
	}
};
|
|
|
|
|
|
|
|
/**
 * Post-processing pass that minimizes the nesting of the inline formatting
 * tags b, u, i and s below the document body.
 *
 * @param {Object} document Document whose `body` subtree is rewritten in place.
 */
var normalize_document = function(document) {
	// Previous pairwise strategy, kept for reference:
	// normalize_subtree(document.body, [rewrite_nested_tag_pairs.bind(null, document, [['b','i'], ['b','u'], ['i','u']])]);
	var inline_tags = ['b','u','i','s'];
	minimize_inline_tags(document.body, inline_tags);
};
|
|
|
|
|
2011-11-30 12:28:45 +00:00
|
|
|
// Wrap all top-level inline elements in paragraphs. This should also be
|
|
|
|
// applied inside block-level elements, but in that case the first paragraph
|
|
|
|
// usually remains plain inline.
|
2011-11-29 15:11:51 +00:00
|
|
|
/**
 * Wrap runs of top-level inline content of the document body in <p>
 * elements. A run starts at the first non-whitespace text node or non-block
 * element and ends at the next node that does not belong in a paragraph.
 *
 * @param {Object} document Document whose `body` children are regrouped in place.
 */
var process_inlines_in_p = function ( document ) {
	var body = document.body;
	var paragraph = document.createElement('p');
	var nodes = body.childNodes;
	var inParagraph = false;
	// Number of nodes moved out of body so far (net of paragraphs inserted);
	// used to translate the loop counter into an index in the live list.
	var shift = 0;

	// Decide whether 'n' should go into the paragraph being built.
	function belongsInParagraph( n ) {
		var type = n.nodeType;
		if (type === Node.TEXT_NODE) {
			// Whitespace-only text does not open a paragraph, but is kept
			// once one is open.
			return inParagraph || !isElementContentWhitespace( n );
		}
		if (type === Node.COMMENT_NODE) {
			// Comments never open a paragraph either.
			return inParagraph;
		}
		return !Util.isBlockTag(n.nodeName.toLowerCase());
	}

	var total = nodes.length;
	var idx = 0;
	while (idx < total) {
		var current = nodes[idx - shift];
		//console.warn(child + ctype);
		if (belongsInParagraph( current )) {
			// wrap in paragraph; this removes the node from body
			paragraph.appendChild(current);
			inParagraph = true;
			shift += 1;
		} else if (inParagraph) {
			// Close the open paragraph in front of the current node and
			// start collecting into a fresh one.
			body.insertBefore(paragraph, current);
			shift -= 1;
			paragraph = document.createElement('p');
			inParagraph = false;
		}
		idx += 1;
	}

	// Flush a paragraph that runs to the end of the body.
	if (inParagraph) {
		body.appendChild(paragraph);
	}
};
|
|
|
|
|
|
|
|
/**
 * Runs a fixed list of post-processing passes over a built DOM document and
 * then re-emits the document as a 'document' event.
 *
 * @constructor
 */
function DOMPostProcessor () {
	// Give every instance its own listener table. Without this call all
	// instances would share the one stored on the prototype object, and a
	// listener registered on one post-processor would fire for all of them.
	events.EventEmitter.call( this );

	// Passes are applied in this order by doPostProcess().
	this.processors = [process_inlines_in_p, normalize_document];
}

// Inherit from EventEmitter
DOMPostProcessor.prototype = Object.create( events.EventEmitter.prototype );
DOMPostProcessor.prototype.constructor = DOMPostProcessor;

/**
 * Apply all registered processors to `document`, then emit it.
 *
 * @param {Object} document Document handed to each processor in turn and
 *   finally emitted with the 'document' event.
 */
DOMPostProcessor.prototype.doPostProcess = function ( document ) {
	for (var i = 0; i < this.processors.length; i++) {
		this.processors[i](document);
	}
	this.emit( 'document', document );
};

/**
 * Register for the 'document' event, normally emitted from the HTML5 tree
 * builder.
 *
 * @param {Object} emitter Event emitter whose 'document' events should be
 *   post-processed by this instance.
 */
DOMPostProcessor.prototype.addListenersOn = function ( emitter ) {
	emitter.addListener( 'document', this.doPostProcess.bind( this ) );
};

if (typeof module === "object") {
	module.exports.DOMPostProcessor = DOMPostProcessor;
}
|