From a09aa4d5994bf1c37d5d82b808fd8c33083d0bb3 Mon Sep 17 00:00:00 2001
From: Gabriel Wicke
Date: Wed, 14 Dec 2011 15:15:41 +0000
Subject: [PATCH] Add rough HTML DOM to WikiDom conversion. You can see
 serialized WikiDom of parser tests using 'node parserTests.js --wikidom'.

---
Reviewer amendments folded into this revision of the patch (the commit id
above and the blob ids on the "index" lines are those of the original
submission; indentation was reconstructed and may need adjusting):

 * DOMConverter.getHTMLHandlerInfo: the listItem group listed 'dl' a second
   time where 'dt' was meant, so <dl> was converted as a leaf listItem
   instead of a list and <dt> fell through to the unsupported default.
 * DOMConverter._HTMLPropertiesToWikiAttributes: dropped the leftover
   console.log('key: ...') debug output that polluted --wikidom output.
 * DOMConverter: removed unreachable 'break' after 'return' and added doc
   comments to the previously undocumented methods.
 * DOMPostProcessor: use Node.TEXT_NODE for the remaining 'ctype === 3'.
 * parserTests.js: import 'mediawiki.DOMConverter.js' with its extension,
   like the neighbouring _import calls.

 modules/parser/ext.Cite.js                    |   3 +-
 modules/parser/mediawiki.DOMConverter.js      | 340 ++++++++++++++++++
 modules/parser/mediawiki.DOMPostProcessor.js  |  16 +-
 .../parser/mediawiki.HTML5TreeBuilder.node.js |   2 +-
 tests/parser/parserTests.js                   |  33 +-
 5 files changed, 384 insertions(+), 10 deletions(-)
 create mode 100644 modules/parser/mediawiki.DOMConverter.js

diff --git a/modules/parser/ext.Cite.js b/modules/parser/ext.Cite.js
index 263e50f267..b73dbe0679 100644
--- a/modules/parser/ext.Cite.js
+++ b/modules/parser/ext.Cite.js
@@ -257,7 +257,8 @@ Cite.prototype.onReferences = function ( tokenCTX ) {
 			type: 'TAG',
 			name: 'ol',
 			attribs: [
-				['class', 'references']
+				['class', 'references'],
+				['data-object', 'references'] // Object type
 			]
 		}
 	].concat( listItems, { type: 'ENDTAG', name: 'ol' } );
diff --git a/modules/parser/mediawiki.DOMConverter.js b/modules/parser/mediawiki.DOMConverter.js
new file mode 100644
index 0000000000..2611b2289a
--- /dev/null
+++ b/modules/parser/mediawiki.DOMConverter.js
@@ -0,0 +1,340 @@
+/**
+ * Conversions between HTML DOM and WikiDom
+ *
+ * @class
+ * @constructor
+ */
+function DOMConverter () {
+}
+
+// Quick HACK: define Node constants
+// https://developer.mozilla.org/en/nodeType
+var Node = {
+	ELEMENT_NODE: 1,
+	ATTRIBUTE_NODE: 2,
+	TEXT_NODE: 3,
+	CDATA_SECTION_NODE: 4,
+	ENTITY_REFERENCE_NODE: 5,
+	ENTITY_NODE: 6,
+	PROCESSING_INSTRUCTION_NODE: 7,
+	COMMENT_NODE: 8,
+	DOCUMENT_NODE: 9,
+	DOCUMENT_TYPE_NODE: 10,
+	DOCUMENT_FRAGMENT_NODE: 11,
+	NOTATION_NODE: 12
+};
+
+/**
+ * Look up the WikiDom node type and conversion handler for an HTML element.
+ * Unknown elements are logged and converted as branch nodes.
+ *
+ * @method
+ * @param {String} nodeName HTML element name, matched case-insensitively
+ * @returns {Object} handler function and WikiDom type name
+ */
+DOMConverter.prototype.getHTMLHandlerInfo = function ( nodeName ) {
+	switch ( nodeName.toLowerCase() ) {
+		case 'p':
+			return {
+				handler: this._convertHTMLLeaf,
+				type: 'paragraph'
+			};
+		case 'li':
+		case 'dt':
+		case 'dd':
+			return {
+				handler: this._convertHTMLLeaf,
+				type: 'listItem'
+			};
+		case 'pre':
+			return {
+				handler: this._convertHTMLLeaf,
+				type: 'pre'
+			};
+		case 'ul':
+		case 'ol':
+		case 'dl':
+			return {
+				handler: this._convertHTMLBranch,
+				type: 'list'
+			};
+		default:
+			console.log( 'HTML to Wiki DOM conversion error. Unsupported node name ' +
+					nodeName );
+			return {
+				handler: this._convertHTMLBranch,
+				type: nodeName.toLowerCase()
+			};
+	}
+};
+
+/**
+ * Map an inline HTML element name to a WikiDom annotation type.
+ *
+ * @method
+ * @param {String} nodeName HTML element name, matched case-insensitively
+ * @returns {String|undefined} annotation type, undefined if unsupported
+ */
+DOMConverter.prototype.getHTMLAnnotationType = function ( nodeName ) {
+	switch ( nodeName.toLowerCase() ) {
+		case 'i':
+			return 'textStyle/italic';
+		case 'b':
+			return 'textStyle/bold';
+		case 'span':
+			return 'textStyle/span';
+		case 'a':
+			return 'link/unknown'; // XXX: distinguish internal / external etc
+		default:
+			console.log( 'HTML to Wiki DOM conversion error. Unsupported html annotation ' +
+					nodeName );
+			return undefined;
+	}
+};
+
+/**
+ * Convert a HTML DOM to WikiDom
+ *
+ * @method
+ * @param {Object} root of HTML DOM (usually the body element)
+ * @returns {Object} WikiDom version
+ */
+DOMConverter.prototype.HTMLtoWiki = function ( node ) {
+	var children = node.childNodes,
+		out = {
+			type: 'document',
+			children: []
+		};
+	for ( var i = 0, l = children.length; i < l; i++ ) {
+		var cnode = children[i];
+		switch ( cnode.nodeType ) {
+			case Node.ELEMENT_NODE:
+				// Call a handler for the particular node type
+				var hi = this.getHTMLHandlerInfo( cnode.nodeName );
+				var res = hi.handler.call(this, cnode, 0, hi.type );
+				out.children.push( res.node );
+				break;
+			case Node.TEXT_NODE:
+				// Add text as content, and increment offset
+				// BUT: Should not appear at toplevel!
+				break;
+			case Node.COMMENT_NODE:
+				// Add a comment annotation to which text? Not clear how this
+				// can be represented in WikiDom.
+				break;
+			default:
+				console.log( "HTML to Wiki DOM conversion error. Unhandled node type " +
+						cnode.innerHTML );
+				break;
+		}
+	}
+	return out;
+};
+
+/**
+ * Private HTML branch node handler
+ *
+ * @param {Object} HTML DOM element
+ * @param {Int} WikiDom offset within a block
+ * @returns {Object} WikiDom object
+ */
+DOMConverter.prototype._convertHTMLBranch = function ( node, offset, type ) {
+	var children = node.childNodes,
+		wnode = {
+			type: type,
+			attributes: this._HTMLPropertiesToWikiAttributes( node ),
+			children: []
+		};
+	for ( var i = 0, l = children.length; i < l; i++ ) {
+		var cnode = children[i];
+		switch ( cnode.nodeType ) {
+			case Node.ELEMENT_NODE:
+				// Call a handler for the particular node type
+				var hi = this.getHTMLHandlerInfo( cnode.nodeName );
+				var res = hi.handler.call(this, cnode, offset + 1, hi.type );
+				wnode.children.push( res.node );
+				offset = res.offset;
+				break;
+			case Node.TEXT_NODE:
+				// Create a paragraph and add it to children?
+				break;
+			case Node.COMMENT_NODE:
+				// add a comment node.
+				break;
+			default:
+				console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
+						cnode.innerHTML );
+				break;
+		}
+	}
+	return {
+		offset: offset,
+		node: wnode
+	};
+};
+
+/**
+ * Private HTML leaf node handler
+ *
+ * @param {Object} HTML DOM element
+ * @param {Int} WikiDom offset within a block
+ * @returns {Object} WikiDom object
+ */
+DOMConverter.prototype._convertHTMLLeaf = function ( node, offset, type ) {
+	var children = node.childNodes,
+		wnode = {
+			type: type,
+			attributes: this._HTMLPropertiesToWikiAttributes( node ),
+			content: {
+				text: '',
+				annotations: []
+			}
+		};
+	//console.log( 'res wnode: ' + JSON.stringify(wnode, null, 2));
+	for ( var i = 0, l = children.length; i < l; i++ ) {
+		var cnode = children[i];
+		switch ( cnode.nodeType ) {
+			case Node.ELEMENT_NODE:
+				// Call a handler for the particular annotation node type
+				var annotationtype = this.getHTMLAnnotationType( cnode.nodeName );
+				if ( annotationtype ) {
+					var res = this._convertHTMLAnnotation( cnode, offset, annotationtype );
+					//console.log( 'res leaf: ' + JSON.stringify(res, null, 2));
+					offset += res.text.length;
+					wnode.content.text += res.text;
+					//console.log( 'res annotations: ' + JSON.stringify(res, null, 2));
+					wnode.content.annotations = wnode.content.annotations
+						.concat( res.annotations );
+				}
+				break;
+			case Node.TEXT_NODE:
+				// Add text as content, and increment offset
+				wnode.content.text += cnode.data;
+				offset += cnode.data.length;
+				break;
+			case Node.COMMENT_NODE:
+				// add a comment annotation?
+				break;
+			default:
+				console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
+						cnode.innerHTML );
+				break;
+		}
+	}
+	return {
+		offset: offset,
+		node: wnode
+	};
+};
+
+/**
+ * Private HTML annotation (inline element) handler
+ *
+ * @param {Object} HTML DOM element
+ * @param {Int} WikiDom offset at which the annotation starts
+ * @param {String} WikiDom annotation type
+ * @returns {Object} collected text and list of annotations
+ */
+DOMConverter.prototype._convertHTMLAnnotation = function ( node, offset, type ) {
+	var children = node.childNodes,
+		text = '',
+		annotations = [
+			{
+				type: type,
+				data: this._HTMLPropertiesToWikiData( node ),
+				range: {
+					start: offset,
+					end: offset
+				}
+			}
+		];
+	for ( var i = 0, l = children.length; i < l; i++ ) {
+		var cnode = children[i];
+		switch ( cnode.nodeType ) {
+			case Node.ELEMENT_NODE:
+				// Call a handler for the particular annotation node type
+				var annotationtype = this.getHTMLAnnotationType(cnode.nodeName);
+				if ( annotationtype ) {
+					var res = this._convertHTMLAnnotation( cnode, offset, annotationtype );
+					//console.log( 'res annotations 2: ' + JSON.stringify(res, null, 2));
+					text += res.text;
+					offset += res.text.length;
+					annotations = annotations.concat( res.annotations );
+				}
+				break;
+			case Node.TEXT_NODE:
+				// Add text as content, and increment offset
+				text += cnode.data;
+				offset += cnode.data.length;
+				break;
+			case Node.COMMENT_NODE:
+				// add a comment annotation?
+				break;
+			default:
+				console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
+						cnode.innerHTML );
+				break;
+		}
+	}
+	annotations[0].range.end = offset;
+	return {
+		text: text,
+		annotations: annotations
+	};
+};
+
+/**
+ * Convert HTML element attributes to WikiDom node attributes. data-*
+ * attributes lose their prefix, all others are prefixed with html/.
+ *
+ * @param {Object} HTML DOM element
+ * @returns {Object} WikiDom attributes
+ */
+DOMConverter.prototype._HTMLPropertiesToWikiAttributes = function ( elem ) {
+	var attribs = elem.attributes,
+		out = {};
+	for ( var i = 0, l = attribs.length; i < l; i++ ) {
+		var attrib = attribs.item(i),
+			key = attrib.name;
+		if ( key.match( /^data-/ ) ) {
+			// strip data- prefix from data-*
+			out[key.replace( /^data-/, '' )] = attrib.value;
+		} else {
+			// prefix html properties with html/
+			out['html/' + key] = attrib.value;
+		}
+	}
+	return out;
+};
+
+/**
+ * Convert HTML element attributes to WikiDom annotation data. data-*
+ * attributes lose their prefix, only whitelisted others are kept.
+ *
+ * @param {Object} HTML DOM element
+ * @returns {Object} WikiDom annotation data
+ */
+DOMConverter.prototype._HTMLPropertiesToWikiData = function ( elem ) {
+	var attribs = elem.attributes,
+		out = {};
+	for ( var i = 0, l = attribs.length; i < l; i++ ) {
+		var attrib = attribs.item(i),
+			key = attrib.name;
+		if ( key.match( /^data-/ ) ) {
+			// strip data- prefix from data-*
+			out[key.replace( /^data-/, '' )] = attrib.value;
+		} else {
+			// pass through a few whitelisted keys
+			// XXX: This subsets html DOM
+			if ( ['title'].indexOf(key) != -1 ) {
+				out[key] = attrib.value;
+			}
+		}
+	}
+	return out;
+};
+
+
+if (typeof module == "object") {
+	module.exports.DOMConverter = DOMConverter;
+}
diff --git a/modules/parser/mediawiki.DOMPostProcessor.js b/modules/parser/mediawiki.DOMPostProcessor.js
index 64733f7551..737d95b616 100644
--- a/modules/parser/mediawiki.DOMPostProcessor.js
+++ b/modules/parser/mediawiki.DOMPostProcessor.js
@@ -30,12 +30,18 @@ var isBlock = function isBlock (name) {
 	}
 };
 
+// Quick HACK: define Node constants
+// https://developer.mozilla.org/en/nodeType
+var Node = {
+	TEXT_NODE: 3,
+	COMMENT_NODE: 8
+};
+
 // Wrap all top-level inline elements in paragraphs. This should also be
 // applied inside block-level elements, but in that case the first paragraph
 // usually remains plain inline.
 var process_inlines_in_p = function ( document ) {
-	// document.body does not always work in jsdom, so work around it.
-	var body = document.getElementsByTagName('body')[0],
+	var body = document.body,
 		newP = document.createElement('p'),
 		cnodes = body.childNodes,
 		haveInlines = false,
@@ -50,8 +56,8 @@ var process_inlines_in_p = function ( document ) {
 			ctype = child.nodeType;
 		//console.log(child + ctype);
-		if ((ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) ||
-			(ctype !== 3 && // text
-			 ctype !== 8 && // comment
+		if ((ctype === Node.TEXT_NODE && (haveInlines || !isElementContentWhitespace(child))) ||
+			(ctype !== Node.TEXT_NODE &&
+			 ctype !== Node.COMMENT_NODE &&
 			 !isBlock(child.nodeName))) {
 			// text node
 			newP.appendChild(child);
diff --git a/modules/parser/mediawiki.HTML5TreeBuilder.node.js b/modules/parser/mediawiki.HTML5TreeBuilder.node.js
index 14bfffe073..bdde3f815e 100644
--- a/modules/parser/mediawiki.HTML5TreeBuilder.node.js
+++ b/modules/parser/mediawiki.HTML5TreeBuilder.node.js
@@ -62,8 +62,8 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
 			break;
 		case "END":
 			this.emit('end');
-			// HACK: This should not be needed really.
 			this.document = this.parser.document;
+			// HACK: This should not be needed really.
 			this.document.body = this.document.getElementsByTagName('body')[0];
 			break;
 		case "NEWLINE":
diff --git a/tests/parser/parserTests.js b/tests/parser/parserTests.js
index ec27a00f9e..216fa19c78 100644
--- a/tests/parser/parserTests.js
+++ b/tests/parser/parserTests.js
@@ -63,6 +63,8 @@ _import(pj('parser', 'ext.cite.taghook.ref.js'), ['MWRefTagHook']);
 _import(pj('parser', 'mediawiki.HTML5TreeBuilder.node.js'), ['FauxHTML5']);
 _import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']);
 
+_import(pj('parser', 'mediawiki.DOMConverter.js'), ['DOMConverter']);
+
 _import(pj('parser', 'ext.core.QuoteTransformer.js'), ['QuoteTransformer']);
 
 _import(pj('parser', 'ext.Cite.js'), ['Cite']);
@@ -121,6 +123,11 @@ function ParserTests () {
 			description: 'Print out a whitelist entry for failing tests. Default false.',
 			default: false,
 			boolean: true
+		},
+		'wikidom': {
+			description: 'Print out a WikiDom conversion of the HTML DOM',
+			default: false,
+			boolean: true
 		}
 	}
 	).check( function(argv) {
@@ -181,6 +188,8 @@ function ParserTests () {
 
 	this.postProcessor = new DOMPostProcessor();
 
+	this.DOMConverter = new DOMConverter();
+
 	var pt = this;
 
 	// Set up the TokenTransformDispatcher with a callback for the remaining
@@ -196,13 +205,18 @@ function ParserTests () {
 		pt.buildTree( tokens, treeBuilder );
 
 		// Perform post-processing on DOM.
-		pt.postProcessor.doPostProcess(treeBuilder.parser.document);
+		pt.postProcessor.doPostProcess(treeBuilder.document);
 
 		// And serialize the result.
 		var out = treeBuilder.document.body.innerHTML;
 
 		// Finally, check the result vs. the expected result.
 		pt.checkResult( pt.currentItem, out );
+
+		if ( pt.argv.wikidom ) {
+			// Test HTML DOM -> WikiDOM conversion
+			pt.printWikiDom( treeBuilder.document.body );
+		}
 	});
 
 	// Add token transformations..
@@ -225,8 +239,6 @@ function ParserTests () {
 }
 
 
-
-
 /**
  * Get an object holding our tests cases. Eventually from a cache file
  */
@@ -520,6 +532,21 @@ ParserTests.prototype.checkResult = function ( item, out ) {
 };
 
 
+/**
+ * Print out a WikiDom conversion of the HTML DOM
+ */
+ParserTests.prototype.printWikiDom = function ( body ) {
+	console.log('WikiDom'.cyan + ':');
+	console.log(
+			JSON.stringify(
+				this.DOMConverter.HTMLtoWiki(body),
+				null,
+				2
+			) + "\n"
+	);
+};
+
+
 ParserTests.prototype.buildTree = function ( tokens, treeBuilder ) {
 	// push a body element, just to be sure to have one
 	treeBuilder.processToken({type: 'TAG', name: 'body'});