Add rough HTML DOM to WikiDom conversion. You can see the serialized WikiDom of
parser tests using 'node parserTests.js --wikidom'.
This commit is contained in:
Gabriel Wicke 2011-12-14 15:15:41 +00:00
parent 5f80d30428
commit a09aa4d599
5 changed files with 349 additions and 9 deletions

View file

@ -257,7 +257,8 @@ Cite.prototype.onReferences = function ( tokenCTX ) {
type: 'TAG', type: 'TAG',
name: 'ol', name: 'ol',
attribs: [ attribs: [
['class', 'references'] ['class', 'references'],
['data-object', 'references'] // Object type
] ]
} }
].concat( listItems, { type: 'ENDTAG', name: 'ol' } ); ].concat( listItems, { type: 'ENDTAG', name: 'ol' } );

View file

@ -0,0 +1,306 @@
/**
 * Converter between an HTML DOM tree and the WikiDom representation.
 *
 * The constructor takes no arguments and sets up no instance state; the
 * conversion methods are attached to the prototype.
 *
 * @class
 * @constructor
 */
function DOMConverter() {}
// Quick HACK: provide the DOM nodeType constants locally so the converter
// can compare `nodeType` values by name.
// Reference: https://developer.mozilla.org/en/nodeType
var Node = {
    // Node kinds the converter actually distinguishes
    ELEMENT_NODE: 1,
    ATTRIBUTE_NODE: 2,
    TEXT_NODE: 3,
    // Remaining constants, listed for completeness
    CDATA_SECTION_NODE: 4,
    ENTITY_REFERENCE_NODE: 5,
    ENTITY_NODE: 6,
    PROCESSING_INSTRUCTION_NODE: 7,
    COMMENT_NODE: 8,
    DOCUMENT_NODE: 9,
    DOCUMENT_TYPE_NODE: 10,
    DOCUMENT_FRAGMENT_NODE: 11,
    NOTATION_NODE: 12
};
/**
 * Look up how an HTML element should be converted to a WikiDom node.
 *
 * @method
 * @param {String} nodeName HTML element name; matched case-insensitively
 * @returns {Object} Object with a `handler` (the leaf or branch conversion
 * method; callers invoke it with the converter as `this`) and the WikiDom
 * `type` to assign. Unsupported names are logged and converted as a branch
 * whose type is the lower-cased element name.
 */
DOMConverter.prototype.getHTMLHandlerInfo = function ( nodeName ) {
    var name = nodeName.toLowerCase();
    switch ( name ) {
        case 'p':
            return {
                handler: this._convertHTMLLeaf,
                type: 'paragraph'
            };
        case 'li':
        case 'dt':
        case 'dd':
            // Items of ul/ol lists and the term/description items of a dl.
            // ('dt' was previously misspelled as a second 'dl', which made
            // <dl> a leaf list item and left <dt> unsupported.)
            return {
                handler: this._convertHTMLLeaf,
                type: 'listItem'
            };
        case 'pre':
            return {
                handler: this._convertHTMLLeaf,
                type: 'pre'
            };
        case 'ul':
        case 'ol':
        case 'dl':
            // List containers hold further block-level children
            return {
                handler: this._convertHTMLBranch,
                type: 'list'
            };
        default:
            console.log( 'HTML to Wiki DOM conversion error. Unsupported node name ' +
                    nodeName );
            // Fall back to a generic branch so the children are still visited
            return {
                handler: this._convertHTMLBranch,
                type: name
            };
    }
};
/**
 * Map an inline HTML element name to a WikiDom annotation type.
 *
 * @method
 * @param {String} nodeName HTML element name; matched case-insensitively
 * @returns {String|undefined} Annotation type, or undefined (after logging
 * a message) when the element has no known annotation equivalent
 */
DOMConverter.prototype.getHTMLAnnotationType = function ( nodeName ) {
    var annotationTypes = {
            i: 'textStyle/italic',
            b: 'textStyle/bold',
            span: 'textStyle/span',
            // XXX: distinguish internal / external etc
            a: 'link/unknown'
        },
        tagName = nodeName.toLowerCase();

    // Own-property check so inherited names (e.g. 'constructor') do not match
    if ( Object.prototype.hasOwnProperty.call( annotationTypes, tagName ) ) {
        return annotationTypes[tagName];
    }

    console.log( 'HTML to Wiki DOM conversion error. Unsupported html annotation ' +
            nodeName );
    return undefined;
};
/**
 * Convert a HTML DOM to WikiDom.
 *
 * Only element children of the given root are converted. Text and comment
 * nodes directly under the root are skipped; any other node type is logged.
 *
 * @method
 * @param {Object} node Root of the HTML DOM (usually the body element)
 * @returns {Object} WikiDom document: `{ type: 'document', children: [...] }`
 */
DOMConverter.prototype.HTMLtoWiki = function ( node ) {
    var kids = node.childNodes,
        doc = {
            type: 'document',
            children: []
        },
        count = kids.length,
        idx, child, kind, info, converted;

    for ( idx = 0; idx < count; idx++ ) {
        child = kids[idx];
        kind = child.nodeType;

        if ( kind === Node.ELEMENT_NODE ) {
            // Dispatch to the handler registered for this element name;
            // every top-level block starts at offset 0
            info = this.getHTMLHandlerInfo( child.nodeName );
            converted = info.handler.call( this, child, 0, info.type );
            doc.children.push( converted.node );
        } else if ( kind === Node.TEXT_NODE ) {
            // Text should not appear at toplevel, so it is dropped here
            continue;
        } else if ( kind === Node.COMMENT_NODE ) {
            // Not clear how a comment can be represented in WikiDom
            continue;
        } else {
            console.log( "HTML to Wiki DOM conversion error. Unhandled node type " +
                    child.innerHTML );
        }
    }

    return doc;
};
/**
 * Private HTML branch node handler.
 *
 * Builds a WikiDom node with `children` from the element children of the
 * given HTML element. Text and comment children are currently skipped.
 *
 * @param {Object} node HTML DOM element
 * @param {Int} offset WikiDom offset within a block
 * @param {String} type WikiDom type to assign to the resulting node
 * @returns {Object} `{ offset, node }`: the offset reported by the last
 * converted child (or the input offset if none) and the WikiDom node
 */
DOMConverter.prototype._convertHTMLBranch = function ( node, offset, type ) {
    var kids = node.childNodes,
        branch = {
            type: type,
            attributes: this._HTMLPropertiesToWikiAttributes( node ),
            children: []
        },
        count = kids.length,
        idx, child, kind, info, converted;

    for ( idx = 0; idx < count; idx++ ) {
        child = kids[idx];
        kind = child.nodeType;

        if ( kind === Node.ELEMENT_NODE ) {
            // Dispatch on the element name; the child starts one past the
            // current offset and reports back where it ended
            info = this.getHTMLHandlerInfo( child.nodeName );
            converted = info.handler.call( this, child, offset + 1, info.type );
            branch.children.push( converted.node );
            offset = converted.offset;
        } else if ( kind === Node.TEXT_NODE ) {
            // Create a paragraph and add it to children?
            continue;
        } else if ( kind === Node.COMMENT_NODE ) {
            // A comment node could be added here
            continue;
        } else {
            console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
                    child.innerHTML );
        }
    }

    return {
        offset: offset,
        node: branch
    };
};
/**
 * Private HTML leaf node handler.
 *
 * Flattens the children of the given element into a single text string
 * plus a list of annotations. Child elements without a known annotation
 * type contribute nothing; comment children are skipped.
 *
 * @param {Object} node HTML DOM element
 * @param {Int} offset WikiDom offset within a block
 * @param {String} type WikiDom type to assign to the resulting node
 * @returns {Object} `{ offset, node }`: the input offset advanced by the
 * length of the collected text, and the WikiDom leaf node
 */
DOMConverter.prototype._convertHTMLLeaf = function ( node, offset, type ) {
    var kids = node.childNodes,
        leaf = {
            type: type,
            attributes: this._HTMLPropertiesToWikiAttributes( node ),
            content: {
                text: '',
                annotations: []
            }
        },
        content = leaf.content,
        count = kids.length,
        idx, child, kind, annotationType, converted;

    for ( idx = 0; idx < count; idx++ ) {
        child = kids[idx];
        kind = child.nodeType;

        if ( kind === Node.ELEMENT_NODE ) {
            // Inline element: convert to annotation(s) covering its text
            annotationType = this.getHTMLAnnotationType( child.nodeName );
            if ( annotationType ) {
                converted = this._convertHTMLAnnotation( child, offset, annotationType );
                offset += converted.text.length;
                content.text += converted.text;
                content.annotations = content.annotations.concat( converted.annotations );
            }
        } else if ( kind === Node.TEXT_NODE ) {
            // Add text as content, and increment offset
            content.text += child.data;
            offset += child.data.length;
        } else if ( kind === Node.COMMENT_NODE ) {
            // A comment annotation could be added here
            continue;
        } else {
            console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
                    child.innerHTML );
        }
    }

    return {
        offset: offset,
        node: leaf
    };
};
/**
 * Private HTML inline element handler.
 *
 * Collects the text below an inline element and produces one annotation
 * for the element itself, followed by the annotations of any nested inline
 * elements with a known annotation type. Nested elements without a known
 * annotation type contribute nothing; comments are skipped.
 *
 * @param {Object} node HTML DOM element
 * @param {Int} offset Offset at which this element's text starts
 * @param {String} type WikiDom annotation type for this element
 * @returns {Object} `{ text, annotations }`; the first annotation belongs
 * to this element and spans from `offset` to the end of the collected text
 */
DOMConverter.prototype._convertHTMLAnnotation = function ( node, offset, type ) {
    var kids = node.childNodes,
        text = '',
        own = {
            type: type,
            data: this._HTMLPropertiesToWikiData( node ),
            range: {
                start: offset,
                end: offset
            }
        },
        collected = [ own ],
        count = kids.length,
        idx, child, kind, nestedType, nested;

    for ( idx = 0; idx < count; idx++ ) {
        child = kids[idx];
        kind = child.nodeType;

        if ( kind === Node.ELEMENT_NODE ) {
            // Nested inline element: recurse and merge its results
            nestedType = this.getHTMLAnnotationType( child.nodeName );
            if ( nestedType ) {
                nested = this._convertHTMLAnnotation( child, offset, nestedType );
                text += nested.text;
                offset += nested.text.length;
                collected = collected.concat( nested.annotations );
            }
        } else if ( kind === Node.TEXT_NODE ) {
            // Add text as content, and increment offset
            text += child.data;
            offset += child.data.length;
        } else if ( kind === Node.COMMENT_NODE ) {
            // A comment annotation could be added here
            continue;
        } else {
            console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
                    child.innerHTML );
        }
    }

    // The element's own range ends where its collected text ends
    own.range.end = offset;

    return {
        text: text,
        annotations: collected
    };
};
/**
 * Convert the attributes of an HTML element to WikiDom node attributes.
 *
 * `data-*` attributes are stored under their name with the `data-` prefix
 * removed; every other attribute is stored under `html/<name>`.
 *
 * @param {Object} elem HTML DOM element; its `attributes` collection must
 * provide `length` and `item( index )`
 * @returns {Object} Map of WikiDom attribute names to attribute values
 */
DOMConverter.prototype._HTMLPropertiesToWikiAttributes = function ( elem ) {
    var attribs = elem.attributes,
        out = {};
    for ( var i = 0, l = attribs.length; i < l; i++ ) {
        var attrib = attribs.item(i),
            key = attrib.name;
        // No per-attribute debug logging here: it would be interleaved with
        // the serialized WikiDom output.
        if ( key.match( /^data-/ ) ) {
            // strip data- prefix from data-*
            out[key.replace( /^data-/, '' )] = attrib.value;
        } else {
            // prefix html properties with html/
            out['html/' + key] = attrib.value;
        }
    }
    return out;
};
/**
 * Convert the attributes of an inline HTML element to annotation data.
 *
 * `data-*` attributes are stored under their name with the `data-` prefix
 * removed. Of the remaining attributes only a small whitelist (currently
 * just `title`) is passed through; all others are dropped.
 *
 * @param {Object} elem HTML DOM element; its `attributes` collection must
 * provide `length` and `item( index )`
 * @returns {Object} Map of data keys to attribute values
 */
DOMConverter.prototype._HTMLPropertiesToWikiData = function ( elem ) {
    var dataPrefix = 'data-',
        // XXX: This subsets html DOM
        passThrough = [ 'title' ],
        attrList = elem.attributes,
        data = {},
        total = attrList.length,
        idx, attr, name;

    for ( idx = 0; idx < total; idx++ ) {
        attr = attrList.item( idx );
        name = attr.name;

        if ( name.indexOf( dataPrefix ) === 0 ) {
            // strip data- prefix from data-*
            data[name.slice( dataPrefix.length )] = attr.value;
        } else if ( passThrough.indexOf( name ) !== -1 ) {
            // pass through a few whitelisted keys
            data[name] = attr.value;
        }
    }

    return data;
};
// Export the converter when a `module` object is present
// (presumably the CommonJS / node.js case — confirm against the loader).
if ( typeof module === 'object' ) {
    module.exports.DOMConverter = DOMConverter;
}

View file

@ -30,12 +30,18 @@ var isBlock = function isBlock (name) {
} }
}; };
// Quick HACK: provide the two DOM nodeType constants used below, so the
// comparisons can be written by name instead of with bare numbers.
// Reference: https://developer.mozilla.org/en/nodeType
var Node = {
    TEXT_NODE: 3,
    COMMENT_NODE: 8
};
// Wrap all top-level inline elements in paragraphs. This should also be // Wrap all top-level inline elements in paragraphs. This should also be
// applied inside block-level elements, but in that case the first paragraph // applied inside block-level elements, but in that case the first paragraph
// usually remains plain inline. // usually remains plain inline.
var process_inlines_in_p = function ( document ) { var process_inlines_in_p = function ( document ) {
// document.body does not always work in jsdom, so work around it. var body = document.body,
var body = document.getElementsByTagName('body')[0],
newP = document.createElement('p'), newP = document.createElement('p'),
cnodes = body.childNodes, cnodes = body.childNodes,
haveInlines = false, haveInlines = false,
@ -50,8 +56,8 @@ var process_inlines_in_p = function ( document ) {
ctype = child.nodeType; ctype = child.nodeType;
//console.log(child + ctype); //console.log(child + ctype);
if ((ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) || if ((ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) ||
(ctype !== 3 && // text (ctype !== Node.TEXT_NODE &&
ctype !== 8 && // comment ctype !== Node.COMMENT_NODE &&
!isBlock(child.nodeName))) { !isBlock(child.nodeName))) {
// text node // text node
newP.appendChild(child); newP.appendChild(child);

View file

@ -62,8 +62,8 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
break; break;
case "END": case "END":
this.emit('end'); this.emit('end');
// HACK: This should not be needed really.
this.document = this.parser.document; this.document = this.parser.document;
// HACK: This should not be needed really.
this.document.body = this.document.getElementsByTagName('body')[0]; this.document.body = this.document.getElementsByTagName('body')[0];
break; break;
case "NEWLINE": case "NEWLINE":

View file

@ -63,6 +63,8 @@ _import(pj('parser', 'ext.cite.taghook.ref.js'), ['MWRefTagHook']);
_import(pj('parser', 'mediawiki.HTML5TreeBuilder.node.js'), ['FauxHTML5']); _import(pj('parser', 'mediawiki.HTML5TreeBuilder.node.js'), ['FauxHTML5']);
_import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']); _import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']);
_import(pj('parser', 'mediawiki.DOMConverter'), ['DOMConverter']);
_import(pj('parser', 'ext.core.QuoteTransformer.js'), ['QuoteTransformer']); _import(pj('parser', 'ext.core.QuoteTransformer.js'), ['QuoteTransformer']);
_import(pj('parser', 'ext.Cite.js'), ['Cite']); _import(pj('parser', 'ext.Cite.js'), ['Cite']);
@ -121,6 +123,11 @@ function ParserTests () {
description: 'Print out a whitelist entry for failing tests. Default false.', description: 'Print out a whitelist entry for failing tests. Default false.',
default: false, default: false,
boolean: true boolean: true
},
'wikidom': {
description: 'Print out a WikiDom conversion of the HTML DOM',
default: false,
boolean: true
} }
} }
).check( function(argv) { ).check( function(argv) {
@ -181,6 +188,8 @@ function ParserTests () {
this.postProcessor = new DOMPostProcessor(); this.postProcessor = new DOMPostProcessor();
this.DOMConverter = new DOMConverter();
var pt = this; var pt = this;
// Set up the TokenTransformDispatcher with a callback for the remaining // Set up the TokenTransformDispatcher with a callback for the remaining
@ -196,13 +205,18 @@ function ParserTests () {
pt.buildTree( tokens, treeBuilder ); pt.buildTree( tokens, treeBuilder );
// Perform post-processing on DOM. // Perform post-processing on DOM.
pt.postProcessor.doPostProcess(treeBuilder.parser.document); pt.postProcessor.doPostProcess(treeBuilder.document);
// And serialize the result. // And serialize the result.
var out = treeBuilder.document.body.innerHTML; var out = treeBuilder.document.body.innerHTML;
// Finally, check the result vs. the expected result. // Finally, check the result vs. the expected result.
pt.checkResult( pt.currentItem, out ); pt.checkResult( pt.currentItem, out );
if ( pt.argv.wikidom ) {
// Test HTML DOM -> WikiDOM conversion
pt.printWikiDom( treeBuilder.document.body );
}
}); });
// Add token transformations.. // Add token transformations..
@ -225,8 +239,6 @@ function ParserTests () {
} }
/** /**
* Get an object holding our tests cases. Eventually from a cache file * Get an object holding our tests cases. Eventually from a cache file
*/ */
@ -520,6 +532,21 @@ ParserTests.prototype.checkResult = function ( item, out ) {
}; };
/**
 * Print out a WikiDom conversion of the HTML DOM.
 *
 * Writes a heading line, then the result of converting the given element
 * with this.DOMConverter.HTMLtoWiki, serialized as JSON indented by two
 * spaces and followed by an extra newline.
 *
 * @param {Object} body HTML DOM element to convert (the callers pass the
 * document body)
 */
ParserTests.prototype.printWikiDom = function ( body ) {
    var wikiDom, serialized;

    // NOTE(review): `.cyan` is presumably added to strings by a terminal
    // coloring library loaded elsewhere — confirm.
    console.log( 'WikiDom'.cyan + ':' );

    wikiDom = this.DOMConverter.HTMLtoWiki( body );
    serialized = JSON.stringify( wikiDom, null, 2 );
    console.log( serialized + "\n" );
};
ParserTests.prototype.buildTree = function ( tokens, treeBuilder ) { ParserTests.prototype.buildTree = function ( tokens, treeBuilder ) {
// push a body element, just to be sure to have one // push a body element, just to be sure to have one
treeBuilder.processToken({type: 'TAG', name: 'body'}); treeBuilder.processToken({type: 'TAG', name: 'body'});