/* Perform post-processing steps on an already-built HTML DOM. */

var events = require('events'),
    Util = require('./mediawiki.Util.js').Util;

// Quick HACK: define Node constants
// https://developer.mozilla.org/en/nodeType
var Node = {
    ELEMENT_NODE: 1,
    TEXT_NODE: 3,
    COMMENT_NODE: 8
};

/**
 * Check whether the character data of a node consists solely of spaces,
 * carriage returns, line feeds and tabs (an empty string also matches).
 *
 * @param {Object} e Node exposing its text in `data`.
 * @returns {boolean}
 */
var isElementContentWhitespace = function ( e ) {
    return (e.data.match(/^[ \r\n\t]*$/) !== null);
};

/**
 * Restructure the subtree below `root` so that inline tags shared by
 * adjacent sibling chains are hoisted into a single common wrapper.
 *
 * Any exception raised while rewriting is caught and logged to the console;
 * it is not propagated to the caller.
 *
 * @param {Object} root DOM node whose subtree is rewritten in place.
 * @param {string[]} rewriteable_nodes Tag names that may be merged
 *   (compared case-insensitively).
 */
function minimize_inline_tags(root, rewriteable_nodes) {
    var rewriteable_node_map = null;

    // Last element of an array (undefined for an empty array).
    function tail(a) {
        return a[a.length - 1];
    }

    // Detach every child of 'node'.
    function remove_all_children(node) {
        while (node.hasChildNodes()) {
            node.removeChild(node.firstChild);
        }
    }

    // Append every entry of the (static) array 'children' to 'node'.
    function add_children(node, children) {
        for (var i = 0, n = children.length; i < n; i++) {
            node.appendChild(children[i]);
        }
    }

    // Build the lookup table of lower-cased rewriteable tag names.
    function init() {
        // Prototype-less object: names such as 'constructor' must not hit
        // inherited Object.prototype properties.
        rewriteable_node_map = Object.create(null);
        for (var i = 0, n = rewriteable_nodes.length; i < n; i++) {
            rewriteable_node_map[rewriteable_nodes[i].toLowerCase()] = true;
        }
    }

    // 'node_name' is expected to be lower-cased already.
    function is_rewriteable_node(node_name) {
        return rewriteable_node_map[node_name];
    }

    // Main routine
    function rewrite(node) {
        var children = node.childNodes;
        var n = children.length;

        // If we have a single node, no restructuring is possible at this level
        // Descend ...
        if (n === 1) {
            var sole_node = children[0];
            if (sole_node.nodeType === Node.ELEMENT_NODE) {
                rewrite(sole_node);
            }
            return;
        }

        // * Collect longest linear paths for all children
        // * Process subtrees attached to the end of those paths
        // * Restructure the list of linear paths (and reattach processed subtrees at the tips).
        //
        // Each entry of P is {path, orig_parent, children}: the linear chain of
        // elements, the node the children hung off originally, and a static
        // snapshot of those children.
        var P = [];
        for (var i = 0; i < n; i++) {
            var s = children[i];
            if (s.nodeType === Node.ELEMENT_NODE) {
                var p = longest_linear_path(s);
                if (p.length === 0) {
                    rewrite(s);
                    P.push({path: [], orig_parent: node, children: [s]});
                } else {
                    var p_tail = tail(p);

                    // process subtree (depth-first)
                    rewrite(p_tail);

                    // collect the restructured p_tail subtree (children);
                    // copied into an array because childNodes is emptied
                    // when the tree is re-stitched later on.
                    var child_nodes = p_tail.childNodes;
                    var new_children = [];
                    for (var j = 0, n2 = child_nodes.length; j < n2; j++) {
                        new_children.push(child_nodes[j]);
                    }

                    P.push({path: p, orig_parent: p_tail, children: new_children});
                }
            } else {
                P.push({path: [], orig_parent: node, children: [s]});
            }
        }

        // Rewrite paths in 'P'
        if (P.length > 0) {
            rewrite_paths(node, P);
        }
    }

    // Follow 'node' downwards for as long as each element has exactly one
    // child that is itself an element; returns the elements visited.
    function longest_linear_path(node) {
        var children, path = [];
        while (node.nodeType === Node.ELEMENT_NODE) {
            path.push(node);
            children = node.childNodes;
            if ((children.length === 0) || (children.length > 1)) {
                return path;
            }
            node = children[0];
        }

        return path;
    }

    function rewrite_paths(parent_node, P) {
        // 1. Split P into maximal sublists where each sublist has a non-null path intersection.
        // 2. Process each sublist separately and accumulate the result.
        //
        // lcs = longest common sublist

        remove_all_children(parent_node);

        var sublists = split_into_disjoint_sublists(P);
        for (var i = 0, num_sublists = sublists.length; i < num_sublists; i++) {
            var s = sublists[i];
            var lcs = s.lcs;

            if (lcs.length > 0) {
                // Connect up LCS
                var prev = lcs[0];
                // 'k' and 'lcs_len' used to be implicit globals.
                for (var k = 1, lcs_len = lcs.length; k < lcs_len; k++) {
                    var curr = lcs[k];
                    // SSS FIXME: this add/remove can be optimized
                    remove_all_children(prev);
                    prev.appendChild(curr);
                    prev = curr;
                }

                // Lastly, attach lcs to the incoming parent
                parent_node.appendChild(lcs[0]);
            }

            var paths = s.paths;
            var num_paths = paths.length;
            if (num_paths === 1) {
                // Nothing more to do!  Stitch things up
                // two possible scenarios:
                // (a) we have an empty path    ==> attach the children to parent_node
                // (b) we have a non-empty path ==> attach the children to the end of the path
                var p = paths[0].path;
                var children = paths[0].children;
                if (p.length > 0) {
                    var p_tail = tail(p);
                    remove_all_children(p_tail);
                    add_children(p_tail, children);
                } else {
                    add_children(parent_node, children);
                }
            } else {
                // Process the sublist
                rewrite_paths(tail(lcs), strip_lcs(paths, lcs));
            }
        }
    }

    // Elements of 'old' whose (rewriteable) tag name also occurs in 'new_path'.
    function common_path(old, new_path) {
        var i, n;
        var hash = Object.create(null);
        for (i = 0, n = new_path.length; i < n; i++) {
            var e = new_path[i].nodeName.toLowerCase();
            if (is_rewriteable_node(e)) {
                hash[e] = new_path[i];
            }
        }

        var cp = [];
        for (i = 0, n = old.length; i < n; i++) {
            var hit = hash[old[i].nodeName.toLowerCase()];
            // Add old path element always.  This effectively picks elements from the leftmost path.
            if (hit) {
                cp.push(old[i]);
            }
        }

        return cp;
    }

    // For each 'p' in 'paths', eliminate 'lcs' from 'p' (in place).
    function strip_lcs(paths, lcs) {
        // SSS FIXME: Implicit assumption: there are no duplicate tags in lcs
        // or path (e.g. a path like B-I-B).  Fix this to be more robust.
        //
        // Membership is decided by lower-cased tag name, the same criterion
        // common_path() uses.  Keying the map by the node object itself would
        // key it by the node's string conversion, which on standard DOM
        // implementations does not distinguish between different tags.
        var i, n, lcs_map = Object.create(null);
        for (i = 0, n = lcs.length; i < n; i++) {
            lcs_map[lcs[i].nodeName.toLowerCase()] = true;
        }

        for (i = 0, n = paths.length; i < n; i++) {
            var p = paths[i].path;
            // Walk backwards so splicing does not disturb pending indexes.
            for (var j = p.length - 1; j >= 0; j--) {
                if (lcs_map[p[j].nodeName.toLowerCase()]) {
                    p.splice(j, 1);
                }
            }
        }
        return paths;
    }

    // Split 'P' into sublists where each sublist has the property that
    // the elements of the sublist have an intersection that is non-zero
    // Ex: [BIUS, SB, BUS, IU, I, U, US, B, I] will get split into 5 sublists
    // - (lcs: BS, paths: [BIUS, SB, BUS])
    // - (lcs: I,  paths: [IU, I])
    // - (lcs: U,  paths: [U, US])
    // - (lcs: B,  paths: [B])
    // - (lcs: I,  paths: [I])
    //
    // Note: 'P' is consumed (emptied) by this function.
    function split_into_disjoint_sublists(P) {
        if (P.length === 0) {
            return [];
        }

        var p = P.shift();
        var lcs = p.path;
        var curr = [p];

        for (var i = 0, n = P.length; i < n; i++) {
            p = P.shift();
            // 'new_lcs' used to be an implicit global.
            var new_lcs = common_path(lcs, p.path);
            if (new_lcs.length === 0) {
                // No overlap: 'p' starts the next sublist.
                P.unshift(p);
                return [{lcs: lcs, paths: curr}].concat(split_into_disjoint_sublists(P));
            }
            lcs = new_lcs;
            curr.push(p);
        }

        return [{lcs: lcs, paths: curr}];
    }

    // Init
    init();

    // Kick it off
    try {
        rewrite(root);
    } catch (e) {
        console.log("------- error errrror errrrrror! ----------");
        console.log(e.stack);
    }
}

// [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
// [..., T2[*x], T1[T2[*y]], ...] ==> [..., T2[*x, T1[*y]], ...]
// where T1 and T2 are different and can be one of [i, b]
//
// Strictly speaking, we can rewrite even when T1 has more than one child as long as all of them are T2's
// but for now, we are going to restrict ourselves to a single child.
// Lookup table {tag: [tags it may be swapped with]}, built lazily by
// rewrite_nested_tag_pairs from the tag_pairs it is called with.
var tag_pair_map = null;
// The tag_pairs array tag_pair_map was built from.  Compared by identity so
// that a call with a different array rebuilds the table instead of silently
// reusing the one from the very first call.
var tag_pair_map_source = null;

/**
 * Rewrite adjacent children of `node` that match one of these patterns
 * (T1/T2 being a configured tag pair):
 *
 *   [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
 *   [..., T1[*x], T2[T1[*y]], ...] ==> [..., T1[*x, T2[*y]], ...]
 *
 * The two matched siblings are replaced by freshly created elements; the
 * attributes of the two outer elements are copied over, those of the
 * collapsed inner element are not.
 *
 * @param {Object} document Used to create the replacement elements.
 * @param {Array} tag_pairs List of [t1, t2] lower-case tag name pairs.
 * @param {Object} node Parent whose direct children are examined.
 */
var rewrite_nested_tag_pairs = function(document, tag_pairs, node) {
    // Build the symmetric lookup table for 'pairs'.
    function init_tag_pairs(pairs) {
        // Prototype-less object so tag names never hit inherited properties.
        var map = Object.create(null);

        function link(from, to) {
            if (map[from] === undefined) {
                map[from] = [];
            }
            map[from].push(to);
        }

        for (var i = 0, n = pairs.length; i < n; i++) {
            // Ex: t1 = 'b', t2 = 'i'
            var t1 = pairs[i][0];
            var t2 = pairs[i][1];
            link(t1, t2); // Update map with {t1: t2}
            link(t2, t1); // Update map with {t2: t1}
        }

        tag_pair_map = map;
        tag_pair_map_source = pairs;
    }

    // Move every child of 'src' to the end of 'tgt'.
    function migrate_all_children(src, tgt) {
        // IMPORTANT: always take the first remaining child; appending a child
        // to 'tgt' also pulls it out of 'src'.
        while (src.firstChild) {
            tgt.appendChild(src.firstChild);
        }
    }

    // Copy every attribute of 'src' onto 'tgt'.
    function copy_attributes(src, tgt) {
        var attrs = src.attributes;
        for (var i = 0, n = attrs.length; i < n; i++) {
            var a = attrs.item(i);
            tgt.setAttribute(a.nodeName, a.nodeValue);
        }
    }

    function rewriteable_tag_pair(t1, t2) {
        // SSS FIXME: I dont think this check is sufficient.  We also need to
        // verify that nodes having these tags t1 and t2 are deletable as well.
        var t1_tags = tag_pair_map[t1];
        if (t1_tags !== undefined) {
            for (var i = 0, n = t1_tags.length; i < n; i++) {
                if (t1_tags[i] === t2) {
                    return true;
                }
            }
        }
        return false;
    }

    // Initialization (recursive calls pass the same array, so no rebuild)
    if (tag_pair_map === null || tag_pair_map_source !== tag_pairs) {
        init_tag_pairs(tag_pairs);
    }

    var children = node.childNodes;
    var num_children = children.length;

    // no re-writing possible
    if (num_children < 2) {
        return;
    }

    // look for the pattern
    for (var i = 0, n = num_children - 1; i < n; i++) {
        var child = children[i];
        var sibling = children[i + 1];
        var t1 = child.nodeName.toLowerCase();
        var t2 = sibling.nodeName.toLowerCase();

        if (rewriteable_tag_pair(t1, t2)) {
            var newChild = null, newGrandChild = null;
            if (child.childNodes.length === 1 &&
                child.childNodes[0].nodeName.toLowerCase() === t2) {
                // rewritable: T1[T2[*x]], T2[*y] ==> T2[T1[*x], *y]
                newChild = document.createElement(t2);
                newGrandChild = document.createElement(t1);

                newChild.appendChild(newGrandChild);
                migrate_all_children(sibling, newChild);
                migrate_all_children(child.childNodes[0], newGrandChild);

                copy_attributes(child, newGrandChild);
                copy_attributes(sibling, newChild);
            } else if (sibling.childNodes.length === 1 &&
                sibling.childNodes[0].nodeName.toLowerCase() === t1) {
                // rewritable: T1[*x], T2[T1[*y]] ==> T1[*x, T2[*y]]
                newChild = document.createElement(t1);
                newGrandChild = document.createElement(t2);

                migrate_all_children(child, newChild);
                migrate_all_children(sibling.childNodes[0], newGrandChild);
                newChild.appendChild(newGrandChild);

                copy_attributes(child, newChild);
                copy_attributes(sibling, newGrandChild);
            }

            // modify DOM here
            if (newChild !== null) {
                node.insertBefore(newChild, child);
                node.removeChild(child);
                node.removeChild(sibling);

                // modify indexes: two children were replaced by one
                n--;

                // Reprocess newChild
                rewrite_nested_tag_pairs(document, tag_pairs, newChild);

                // Revisit the new node and see if it will merge with its sibling.
                // Hence the decrement.
                i--;
            }
        }
    }
};

// Rewrite rules currently implemented:
// 1. [..., T1[T2[*x]], T2[*y], ...] ==> [..., T2[T1[*x], *y], ...]
//
// Other possible rewrite rules:
// 2. [..., T1[T2[*x]], ...] ==> [..., T2[T1[*x], ...]
// 3.
//    [..., T[*x], T[*y], ...] ==> [..., T[*x,*y], ...]
//
// Note: Rewriting 2. and 3. in that sequence will effectively yield 1.
// but implementing 1. directly is faster
//
// Not sure we want to implement all kinds of rewriting rules yet.
// We have to figure out what rules are worth implementing.

/**
 * Apply every rewrite rule to each node of the subtree rooted at `node`,
 * children first (post-order), so a node is rewritten only after all of
 * its descendants have been.
 *
 * @param {Object} node Root of the subtree; must expose `childNodes`.
 * @param {Function[]} rewrite_rules Functions called as `rule(node)`.
 *   An empty list makes this a no-op.
 */
var normalize_subtree = function(node, rewrite_rules) {
    if (rewrite_rules.length === 0) {
        return;
    }

    // Descend first ...
    var children = node.childNodes;
    for (var i = 0, length = children.length; i < length; i++) {
        normalize_subtree(children[i], rewrite_rules);
    }

    // ... then rewrite this node, rules in list order.
    for (var j = 0, num_rules = rewrite_rules.length; j < num_rules; j++) {
        rewrite_rules[j](node);
    }
};

/**
 * Normalize inline formatting in the document body by minimizing the
 * nesting of b/u/i/s tags.
 *
 * @param {Object} document Document whose `body` is rewritten in place.
 */
var normalize_document = function(document) {
    // Alternative (currently disabled) pairwise approach:
    // normalize_subtree(document.body, [rewrite_nested_tag_pairs.bind(null, document, [['b','i'], ['b','u'], ['i','u']])]);
    minimize_inline_tags(document.body, ['b','u','i','s']);
};

// Wrap all top-level inline elements in paragraphs. This should also be
// applied inside block-level elements, but in that case the first paragraph
// usually remains plain inline.
/**
 * Wrap runs of top-level inline content of the document body in <p>
 * elements.
 *
 * A run starts at the first non-whitespace text node or non-block element;
 * once a run is open, text nodes (including whitespace-only ones) and
 * comments join it.  A run ends at the next block-level element, or at a
 * comment / whitespace-only text node encountered while no run is open.
 *
 * @param {Object} document Document whose `body` is modified in place.
 */
var process_inlines_in_p = function ( document ) {
    var body = document.body,
        newP = document.createElement('p'),
        cnodes = body.childNodes,
        inParagraph = false,
        // Net number of nodes removed from the (live) child list so far;
        // used to translate the original index 'i' into a current index.
        deleted = 0;

    for (var i = 0, length = cnodes.length; i < length; i++) {
        var child = cnodes[i - deleted],
            ctype = child.nodeType,
            isText = (ctype === Node.TEXT_NODE),
            isComment = (ctype === Node.COMMENT_NODE);

        var belongsInParagraph =
            (isText && (inParagraph || !isElementContentWhitespace( child ))) ||
            (isComment && inParagraph) ||
            (!isText && !isComment &&
                !Util.isBlockTag(child.nodeName.toLowerCase()));

        if (belongsInParagraph) {
            // wrap in paragraph (moving the node shrinks the child list)
            newP.appendChild(child);
            inParagraph = true;
            deleted++;
        } else if (inParagraph) {
            // Close the open run: the finished paragraph goes in front of
            // the node that ended it (which grows the child list by one).
            body.insertBefore(newP, child);
            deleted--;
            newP = document.createElement('p');
            inParagraph = false;
        }
    }

    // A run that reaches the end of the body still has to be attached.
    if (inParagraph) {
        body.appendChild(newP);
    }
};

/**
 * Runs a fixed list of DOM post-processing passes over a document and
 * re-emits it as a 'document' event.
 */
function DOMPostProcessor () {
    // Initialise per-instance emitter state; without this, instances would
    // fall back to listener storage living on the shared prototype.
    events.EventEmitter.call(this);
    this.processors = [process_inlines_in_p, normalize_document];
}

// Inherit from EventEmitter
DOMPostProcessor.prototype = Object.create(events.EventEmitter.prototype);
DOMPostProcessor.prototype.constructor = DOMPostProcessor;

/**
 * Run every registered processor on the document, in order, then emit
 * 'document' with the processed document.
 */
DOMPostProcessor.prototype.doPostProcess = function ( document ) {
    for (var i = 0; i < this.processors.length; i++) {
        this.processors[i](document);
    }
    this.emit( 'document', document );
};

/**
 * Register for the 'document' event, normally emitted from the HTML5 tree
 * builder.
 */
DOMPostProcessor.prototype.addListenersOn = function ( emitter ) {
    emitter.addListener( 'document', this.doPostProcess.bind( this ) );
};

if (typeof module === "object") {
    module.exports.DOMPostProcessor = DOMPostProcessor;
}