Add rough HTML DOM to WikiDom conversion. You can see serialized WikiDom of

parser tests using 'node parserTests.js --wikidom'.
2024-11-24 06:24:08 +00:00 · 2011-12-14 15:15:41 +00:00 · 2011-12-14 15:15:41 +00:00 · a09aa4d599
parent 5f80d30428
commit a09aa4d599
5 changed files with 349 additions and 9 deletions
--- a/modules/parser/ext.Cite.js
+++ b/modules/parser/ext.Cite.js
@ -257,7 +257,8 @@ Cite.prototype.onReferences = function ( tokenCTX ) {
 				type: 'TAG',
 				name: 'ol',
 				attribs: [
-					['class', 'references']
+					['class', 'references'],
 					['data-object', 'references'] // Object type
 				]
 			}
 		].concat( listItems, { type: 'ENDTAG', name: 'ol' } );
--- a/modules/parser/mediawiki.DOMConverter.js
+++ b/modules/parser/mediawiki.DOMConverter.js
@ -0,0 +1,306 @@
 /**
 * Converter between an HTML DOM tree and the WikiDom structure.
 *
 * The constructor takes no arguments and sets up no state; the conversion
 * logic is attached to the prototype.
 *
 * @class
 * @constructor
 */
 function DOMConverter() {}
 // Quick HACK: locally define the DOM nodeType constants that the converter
 // switches on.
 // https://developer.mozilla.org/en/nodeType
 var Node = {
 	ELEMENT_NODE: 1,
 	ATTRIBUTE_NODE: 2,
 	TEXT_NODE: 3,
 	CDATA_SECTION_NODE: 4,
 	ENTITY_REFERENCE_NODE: 5,
 	ENTITY_NODE: 6,
 	PROCESSING_INSTRUCTION_NODE: 7,
 	COMMENT_NODE: 8,
 	DOCUMENT_NODE: 9,
 	DOCUMENT_TYPE_NODE: 10,
 	DOCUMENT_FRAGMENT_NODE: 11,
 	NOTATION_NODE: 12
 };
 /**
 * Look up the conversion handler and WikiDom node type for an HTML element.
 *
 * @method
 * @param {String} nodeName HTML element name (matched case-insensitively)
 * @returns {Object} { handler: Function, type: String }. Unsupported element
 * names are logged and fall back to the branch handler, with the lowercased
 * element name as type.
 */
 DOMConverter.prototype.getHTMLHandlerInfo = function ( nodeName ) {
 	var name = nodeName.toLowerCase();
 	switch ( name ) {
 		case 'p':
 			return {
 				handler: this._convertHTMLLeaf,
 				type: 'paragraph'
 			};
 		// List items, including definition terms (dt) and descriptions (dd).
 		// The containing dl element is a list and is handled further down.
 		case 'li':
 		case 'dt':
 		case 'dd':
 			return {
 				handler: this._convertHTMLLeaf,
 				type: 'listItem'
 			};
 		case 'pre':
 			return {
 				handler: this._convertHTMLLeaf,
 				type: 'pre'
 			};
 		case 'ul':
 		case 'ol':
 		case 'dl':
 			return {
 				handler: this._convertHTMLBranch,
 				type: 'list'
 			};
 		default:
 			console.log( 'HTML to Wiki DOM conversion error. Unsupported node name ' +
 					nodeName );
 			return {
 				handler: this._convertHTMLBranch,
 				type: name
 			};
 	}
 };
 /**
 * Map an inline HTML element name to its WikiDom annotation type.
 *
 * @method
 * @param {String} nodeName HTML element name (matched case-insensitively)
 * @returns {String|undefined} Annotation type, or undefined (after logging)
 * when the element is not a supported annotation
 */
 DOMConverter.prototype.getHTMLAnnotationType = function ( nodeName ) {
 	var annotationTypes = {
 			i: 'textStyle/italic',
 			b: 'textStyle/bold',
 			span: 'textStyle/span',
 			a: 'link/unknown' // XXX: distinguish internal / external etc
 		},
 		name = nodeName.toLowerCase();
 	// Own-property check so inherited Object members never count as a match
 	if ( Object.prototype.hasOwnProperty.call( annotationTypes, name ) ) {
 		return annotationTypes[name];
 	}
 	console.log( 'HTML to Wiki DOM conversion error. Unsupported html annotation ' +
 			nodeName );
 	return undefined;
 };
 /**
 * Convert a HTML DOM to WikiDom
 *
 * Only element children of the root are converted. Top-level text and
 * comment nodes are currently skipped.
 *
 * @method
 * @param {Object} node Root of the HTML DOM (usually the body element)
 * @returns {Object} WikiDom document: { type: 'document', children: [...] }
 */
 DOMConverter.prototype.HTMLtoWiki = function ( node ) {
 	var children = node.childNodes,
 		out = {
 			type: 'document',
 			children: []
 		},
 		cnode, hi, res;
 	for ( var i = 0, l = children.length; i < l; i++ ) {
 		cnode = children[i];
 		switch ( cnode.nodeType ) {
 			case Node.ELEMENT_NODE:
 				// Call a handler for the particular node type; every
 				// top-level block starts at offset 0
 				hi = this.getHTMLHandlerInfo( cnode.nodeName );
 				res = hi.handler.call( this, cnode, 0, hi.type );
 				out.children.push( res.node );
 				break;
 			case Node.TEXT_NODE:
 				// Text should not appear at toplevel, so it is dropped for now
 				break;
 			case Node.COMMENT_NODE:
 				// Not clear how comments can be represented in WikiDom
 				break;
 			default:
 				// Report the nodeType itself, which is what the message announces
 				console.log( "HTML to Wiki DOM conversion error. Unhandled node type " +
 						cnode.nodeType );
 				break;
 		}
 	}
 	return out;
 };
 /**
 * Private handler for HTML branch nodes (elements that contain other
 * block-level elements).
 *
 * Element children are converted with the handler returned by
 * getHTMLHandlerInfo. Text and comment children are currently ignored.
 *
 * @param {Object} node HTML DOM element
 * @param {Int} offset WikiDom offset within a block
 * @param {String} type WikiDom type to assign to the converted node
 * @returns {Object} { offset: updated offset, node: WikiDom branch node }
 */
 DOMConverter.prototype._convertHTMLBranch = function ( node, offset, type ) {
 	var kids = node.childNodes,
 		branch = {
 			type: type,
 			attributes: this._HTMLPropertiesToWikiAttributes( node ),
 			children: []
 		},
 		total = kids.length,
 		idx, kid, info, converted;
 	for ( idx = 0; idx < total; idx++ ) {
 		kid = kids[idx];
 		if ( kid.nodeType === Node.ELEMENT_NODE ) {
 			// Each child element starts one position after the current offset
 			// and reports back where it ended.
 			info = this.getHTMLHandlerInfo( kid.nodeName );
 			converted = info.handler.call( this, kid, offset + 1, info.type );
 			branch.children.push( converted.node );
 			offset = converted.offset;
 		} else if ( kid.nodeType !== Node.TEXT_NODE &&
 				kid.nodeType !== Node.COMMENT_NODE ) {
 			// Anything but text (wrap in a paragraph?) and comments (add a
 			// comment node?) is reported.
 			console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
 					kid.innerHTML );
 		}
 	}
 	return {
 		offset: offset,
 		node: branch
 	};
 };
 /**
 * Private handler for HTML leaf nodes (elements holding text and inline
 * markup only).
 *
 * Text children are appended to the content text. Element children with a
 * known annotation type contribute their text and annotations; other
 * elements are skipped after getHTMLAnnotationType has logged them.
 *
 * @param {Object} node HTML DOM element
 * @param {Int} offset WikiDom offset within a block
 * @param {String} type WikiDom type to assign to the converted node
 * @returns {Object} { offset: updated offset, node: WikiDom leaf node }
 */
 DOMConverter.prototype._convertHTMLLeaf = function ( node, offset, type ) {
 	var kids = node.childNodes,
 		content = {
 			text: '',
 			annotations: []
 		},
 		leaf = {
 			type: type,
 			attributes: this._HTMLPropertiesToWikiAttributes( node ),
 			content: content
 		},
 		total = kids.length,
 		idx, kid, annotationType, converted;
 	for ( idx = 0; idx < total; idx++ ) {
 		kid = kids[idx];
 		if ( kid.nodeType === Node.ELEMENT_NODE ) {
 			// Inline element: convert it to annotations over its text
 			annotationType = this.getHTMLAnnotationType( kid.nodeName );
 			if ( annotationType ) {
 				converted = this._convertHTMLAnnotation( kid, offset, annotationType );
 				offset += converted.text.length;
 				content.text += converted.text;
 				content.annotations = content.annotations.concat( converted.annotations );
 			}
 		} else if ( kid.nodeType === Node.TEXT_NODE ) {
 			// Plain text: append and advance the offset by its length
 			content.text += kid.data;
 			offset += kid.data.length;
 		} else if ( kid.nodeType !== Node.COMMENT_NODE ) {
 			// Comments are ignored for now (comment annotation?); every
 			// other node type is reported.
 			console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
 					kid.innerHTML );
 		}
 	}
 	return {
 		offset: offset,
 		node: leaf
 	};
 };
 /**
 * Private handler for inline (annotation) elements.
 *
 * Collects the text below the element and builds a flat list of
 * annotations: first the one for this element, then those of nested
 * annotation elements in document order.
 *
 * @param {Object} node HTML DOM element
 * @param {Int} offset WikiDom offset at which the element's text starts
 * @param {String} type Annotation type for this element
 * @returns {Object} { text: collected text, annotations: Array }
 */
 DOMConverter.prototype._convertHTMLAnnotation = function ( node, offset, type ) {
 	var kids = node.childNodes,
 		text = '',
 		// Annotation for this element; its range end is fixed up once all
 		// children have been consumed.
 		own = {
 			type: type,
 			data: this._HTMLPropertiesToWikiData( node ),
 			range: {
 				start: offset,
 				end: offset
 			}
 		},
 		annotations = [ own ],
 		total = kids.length,
 		idx, kid, nestedType, nested;
 	for ( idx = 0; idx < total; idx++ ) {
 		kid = kids[idx];
 		if ( kid.nodeType === Node.ELEMENT_NODE ) {
 			// Nested inline element: recurse and merge its results
 			nestedType = this.getHTMLAnnotationType( kid.nodeName );
 			if ( nestedType ) {
 				nested = this._convertHTMLAnnotation( kid, offset, nestedType );
 				text += nested.text;
 				offset += nested.text.length;
 				annotations = annotations.concat( nested.annotations );
 			}
 		} else if ( kid.nodeType === Node.TEXT_NODE ) {
 			// Plain text: append and advance the offset by its length
 			text += kid.data;
 			offset += kid.data.length;
 		} else if ( kid.nodeType !== Node.COMMENT_NODE ) {
 			// Comments are ignored for now (comment annotation?); every
 			// other node type is reported.
 			console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
 					kid.innerHTML );
 		}
 	}
 	own.range.end = offset;
 	return {
 		text: text,
 		annotations: annotations
 	};
 };
 /**
 * Convert the attributes of an HTML element to WikiDom node attributes.
 *
 * data-* attributes are stored under their name without the data- prefix;
 * all other attributes are stored under an html/ prefix.
 *
 * @param {Object} elem HTML DOM element
 * @returns {Object} Map of WikiDom attribute name to attribute value
 */
 DOMConverter.prototype._HTMLPropertiesToWikiAttributes = function ( elem ) {
 	var attribs = elem.attributes,
 		dataPrefix = /^data-/,
 		out = {};
 	for ( var i = 0, l = attribs.length; i < l; i++ ) {
 		var attrib = attribs.item(i),
 			key = attrib.name;
 		if ( dataPrefix.test( key ) ) {
 			// strip data- prefix from data-*
 			out[key.replace( dataPrefix, '' )] = attrib.value;
 		} else {
 			// prefix html properties with html/
 			out['html/' + key] = attrib.value;
 		}
 	}
 	return out;
 };
 /**
 * Convert the attributes of an inline HTML element to annotation data.
 *
 * data-* attributes are stored under their name without the data- prefix.
 * Of the remaining attributes only a small whitelist is copied; everything
 * else is dropped.
 *
 * @param {Object} elem HTML DOM element
 * @returns {Object} Map of data key to attribute value
 */
 DOMConverter.prototype._HTMLPropertiesToWikiData = function ( elem ) {
 	var DATA_PREFIX = 'data-',
 		// XXX: This subsets html DOM
 		passThrough = ['title'],
 		attrs = elem.attributes,
 		total = attrs.length,
 		data = {},
 		idx, attr, name;
 	for ( idx = 0; idx < total; idx++ ) {
 		attr = attrs.item( idx );
 		name = attr.name;
 		if ( name.indexOf( DATA_PREFIX ) === 0 ) {
 			// strip data- prefix from data-*
 			data[name.slice( DATA_PREFIX.length )] = attr.value;
 		} else if ( passThrough.indexOf( name ) !== -1 ) {
 			// pass through a few whitelisted keys
 			data[name] = attr.value;
 		}
 	}
 	return data;
 };
 // Export the constructor when a CommonJS-style module object is available
 if ( typeof module === 'object' ) {
 	module.exports.DOMConverter = DOMConverter;
 }
--- a/modules/parser/mediawiki.DOMPostProcessor.js
+++ b/modules/parser/mediawiki.DOMPostProcessor.js
@ -30,12 +30,18 @@ var isBlock = function isBlock (name) {
 	}
 };
 // Quick HACK: locally define the two DOM nodeType constants needed here
 // (text and comment nodes).
 // https://developer.mozilla.org/en/nodeType
 var Node = { TEXT_NODE: 3, COMMENT_NODE: 8 };
 // Wrap all top-level inline elements in paragraphs. This should also be
 // applied inside block-level elements, but in that case the first paragraph
 // usually remains plain inline.
 var process_inlines_in_p = function ( document ) {
-		// document.body does not always work in jsdom, so work around it.
+	var body = document.body,
 	var body = document.getElementsByTagName('body')[0],
 		newP = document.createElement('p'),
 		cnodes = body.childNodes,
 		haveInlines = false,
@ -50,8 +56,8 @@ var process_inlines_in_p = function ( document ) {
 			ctype = child.nodeType;
 		//console.log(child + ctype);
 		if ((ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) || 
-				(ctype !== 3 && // text
+				(ctype !== Node.TEXT_NODE &&
-				 ctype !== 8 && // comment
+				 ctype !== Node.COMMENT_NODE &&
 				 !isBlock(child.nodeName))) {
 			// text node
 			newP.appendChild(child);
--- a/modules/parser/mediawiki.HTML5TreeBuilder.node.js
+++ b/modules/parser/mediawiki.HTML5TreeBuilder.node.js
@ -62,8 +62,8 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
 			break;
 		case "END":
 			this.emit('end');
 			// HACK: This should not be needed really.
 			this.document = this.parser.document;
 			// HACK: This should not be needed really.
 			this.document.body = this.document.getElementsByTagName('body')[0];
 			break;
 		case "NEWLINE":
--- a/tests/parser/parserTests.js
+++ b/tests/parser/parserTests.js
@ -63,6 +63,8 @@ _import(pj('parser', 'ext.cite.taghook.ref.js'), ['MWRefTagHook']);
 _import(pj('parser', 'mediawiki.HTML5TreeBuilder.node.js'), ['FauxHTML5']);
 _import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']);
 _import(pj('parser', 'mediawiki.DOMConverter'), ['DOMConverter']);
 _import(pj('parser', 'ext.core.QuoteTransformer.js'), ['QuoteTransformer']);
 _import(pj('parser', 'ext.Cite.js'), ['Cite']);
@ -121,6 +123,11 @@ function ParserTests () {
 			description: 'Print out a whitelist entry for failing tests. Default false.',
 			default: false,
 			boolean: true
 		},
 		'wikidom': {
 			description: 'Print out a WikiDom conversion of the HTML DOM',
 			default: false,
 			boolean: true
 		}
 	}
 	).check( function(argv) {
@ -181,6 +188,8 @@ function ParserTests () {
 	this.postProcessor = new DOMPostProcessor();
 	this.DOMConverter = new DOMConverter();
 	var pt = this;
 	// Set up the TokenTransformDispatcher with a callback for the remaining
@ -196,13 +205,18 @@ function ParserTests () {
 		pt.buildTree( tokens, treeBuilder );
 		// Perform post-processing on DOM.
-		pt.postProcessor.doPostProcess(treeBuilder.parser.document);
+		pt.postProcessor.doPostProcess(treeBuilder.document);
 		// And serialize the result.
 		var out = treeBuilder.document.body.innerHTML;
 		// Finally, check the result vs. the expected result.
 		pt.checkResult( pt.currentItem, out );
 		if ( pt.argv.wikidom ) {
 			// Test HTML DOM -> WikiDOM conversion
 			pt.printWikiDom( treeBuilder.document.body );
 		}
 	});
 	// Add token transformations..
@ -225,8 +239,6 @@ function ParserTests () {
 }
 /**
 * Get an object holding our tests cases. Eventually from a cache file
 */
@ -520,6 +532,21 @@ ParserTests.prototype.checkResult = function ( item, out ) {
 };
 /**
 * Convert the given HTML DOM body to WikiDom and print the result as
 * indented JSON, preceded by a 'WikiDom:' header line.
 *
 * @param {Object} body HTML DOM node handed to DOMConverter.HTMLtoWiki
 */
 ParserTests.prototype.printWikiDom = function ( body ) {
 	var wikiDom, serialized;
 	// Header goes out first so that any console output produced during the
 	// conversion appears below it. (.cyan is presumably supplied by a
 	// terminal colour library loaded elsewhere -- confirm.)
 	console.log( 'WikiDom'.cyan + ':' );
 	wikiDom = this.DOMConverter.HTMLtoWiki( body );
 	serialized = JSON.stringify( wikiDom, null, 2 );
 	console.log( serialized + "\n" );
 };
 ParserTests.prototype.buildTree = function ( tokens, treeBuilder ) {
 	// push a body element, just to be sure to have one
 	treeBuilder.processToken({type: 'TAG', name: 'body'});