mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-28 00:00:49 +00:00
Add rough HTML DOM to WikiDom conversion. You can see serialized WikiDom of
parser tests using 'node parserTests.js --wikidom'.
This commit is contained in:
parent
5f80d30428
commit
a09aa4d599
|
@ -257,7 +257,8 @@ Cite.prototype.onReferences = function ( tokenCTX ) {
|
||||||
type: 'TAG',
|
type: 'TAG',
|
||||||
name: 'ol',
|
name: 'ol',
|
||||||
attribs: [
|
attribs: [
|
||||||
['class', 'references']
|
['class', 'references'],
|
||||||
|
['data-object', 'references'] // Object type
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
].concat( listItems, { type: 'ENDTAG', name: 'ol' } );
|
].concat( listItems, { type: 'ENDTAG', name: 'ol' } );
|
||||||
|
|
306
modules/parser/mediawiki.DOMConverter.js
Normal file
306
modules/parser/mediawiki.DOMConverter.js
Normal file
|
@ -0,0 +1,306 @@
|
||||||
|
/**
 * Conversions between HTML DOM and WikiDom
 *
 * The converter keeps no state of its own; every method works purely on the
 * DOM nodes it is handed.
 *
 * @class
 * @constructor
 */
function DOMConverter () {
}

// Quick HACK: define the Node type constants locally, so this file does not
// depend on a global Node object.
// https://developer.mozilla.org/en/nodeType
var Node = {
	ELEMENT_NODE: 1,
	ATTRIBUTE_NODE: 2,
	TEXT_NODE: 3,
	CDATA_SECTION_NODE: 4,
	ENTITY_REFERENCE_NODE: 5,
	ENTITY_NODE: 6,
	PROCESSING_INSTRUCTION_NODE: 7,
	COMMENT_NODE: 8,
	DOCUMENT_NODE: 9,
	DOCUMENT_TYPE_NODE: 10,
	DOCUMENT_FRAGMENT_NODE: 11,
	NOTATION_NODE: 12
};

/**
 * Look up the handler and WikiDom type for a block-level HTML element.
 *
 * Unknown element names are logged and converted as a branch whose type is
 * the lower-cased element name.
 *
 * @method
 * @param {String} nodeName HTML element name, matched case-insensitively
 * @returns {Object} Object with `handler` (to be called as
 * handler.call( converter, node, offset, type )) and `type` (WikiDom type)
 */
DOMConverter.prototype.getHTMLHandlerInfo = function ( nodeName ) {
	switch ( nodeName.toLowerCase() ) {
		case 'p':
			return {
				handler: this._convertHTMLLeaf,
				type: 'paragraph'
			};
		case 'li':
		case 'dt': // was a duplicated 'dl', which shadowed the list case below
		case 'dd':
			return {
				handler: this._convertHTMLLeaf,
				type: 'listItem'
			};
		case 'pre':
			return {
				handler: this._convertHTMLLeaf,
				type: 'pre'
			};
		case 'ul':
		case 'ol':
		case 'dl':
			return {
				handler: this._convertHTMLBranch,
				type: 'list'
			};
		default:
			console.log( 'HTML to Wiki DOM conversion error. Unsupported node name ' +
					nodeName );
			return {
				handler: this._convertHTMLBranch,
				type: nodeName.toLowerCase()
			};
	}
};

/**
 * Map an inline HTML element name to a WikiDom annotation type.
 *
 * @method
 * @param {String} nodeName HTML element name, matched case-insensitively
 * @returns {String|undefined} Annotation type, or undefined (after logging)
 * when the element is not supported as an annotation
 */
DOMConverter.prototype.getHTMLAnnotationType = function ( nodeName ) {
	switch ( nodeName.toLowerCase() ) {
		case 'i':
			return 'textStyle/italic';
		case 'b':
			return 'textStyle/bold';
		case 'span':
			return 'textStyle/span';
		case 'a':
			return 'link/unknown'; // XXX: distinguish internal / external etc
		default:
			console.log( 'HTML to Wiki DOM conversion error. Unsupported html annotation ' +
					nodeName );
			return undefined;
	}
};

/**
 * Convert a HTML DOM to WikiDom
 *
 * Only element children of the root are converted; top-level text and
 * comment nodes are skipped, other node types are logged and skipped.
 *
 * @method
 * @param {Object} root of HTML DOM (usually the body element)
 * @returns {Object} WikiDom version
 */
DOMConverter.prototype.HTMLtoWiki = function ( node ) {
	var children = node.childNodes,
		out = {
			type: 'document',
			children: []
		},
		i, l, cnode, hi, res;
	for ( i = 0, l = children.length; i < l; i++ ) {
		cnode = children[i];
		switch ( cnode.nodeType ) {
			case Node.ELEMENT_NODE:
				// Call a handler for the particular node type; every
				// top-level element starts at offset 0.
				hi = this.getHTMLHandlerInfo( cnode.nodeName );
				res = hi.handler.call( this, cnode, 0, hi.type );
				out.children.push( res.node );
				break;
			case Node.TEXT_NODE:
				// Text should not appear at toplevel, so it is dropped.
				break;
			case Node.COMMENT_NODE:
				// Add a comment annotation to which text? Not clear how this
				// can be represented in WikiDom.
				break;
			default:
				console.log( "HTML to Wiki DOM conversion error. Unhandled node type " +
						cnode.innerHTML );
				break;
		}
	}
	return out;
};

/**
 * Private HTML branch node handler
 *
 * Converts an element whose children are themselves block-level elements.
 * Text and comment children are currently ignored.
 *
 * @param {Object} HTML DOM element
 * @param {Int} WikiDom offset within a block
 * @param {String} WikiDom type to assign to the resulting node
 * @returns {Object} Object with `node` (the WikiDom branch node) and
 * `offset` (the offset reported by the last converted child, or the incoming
 * offset if there was none)
 */
DOMConverter.prototype._convertHTMLBranch = function ( node, offset, type ) {
	var children = node.childNodes,
		wnode = {
			type: type,
			attributes: this._HTMLPropertiesToWikiAttributes( node ),
			children: []
		},
		i, l, cnode, hi, res;
	for ( i = 0, l = children.length; i < l; i++ ) {
		cnode = children[i];
		switch ( cnode.nodeType ) {
			case Node.ELEMENT_NODE:
				// Call a handler for the particular node type
				hi = this.getHTMLHandlerInfo( cnode.nodeName );
				res = hi.handler.call( this, cnode, offset + 1, hi.type );
				wnode.children.push( res.node );
				offset = res.offset;
				break;
			case Node.TEXT_NODE:
				// Create a paragraph and add it to children?
				break;
			case Node.COMMENT_NODE:
				// add a comment node.
				break;
			default:
				console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
						cnode.innerHTML );
				break;
		}
	}
	return {
		offset: offset,
		node: wnode
	};
};

/**
 * Private HTML leaf node handler
 *
 * Flattens the element's inline content into a text string plus a list of
 * annotations. Child elements without a known annotation type are dropped
 * together with their text (getHTMLAnnotationType logs them).
 *
 * NOTE(review): annotation ranges are counted from the incoming offset, not
 * from the start of this leaf's own text, so leaves nested in a branch get
 * ranges that do not index into content.text directly. Confirm which
 * convention WikiDom consumers expect.
 *
 * @param {Object} HTML DOM element
 * @param {Int} WikiDom offset within a block
 * @param {String} WikiDom type to assign to the resulting node
 * @returns {Object} Object with `node` (the WikiDom leaf node) and `offset`
 * (the incoming offset advanced by the length of the collected text)
 */
DOMConverter.prototype._convertHTMLLeaf = function ( node, offset, type ) {
	var children = node.childNodes,
		wnode = {
			type: type,
			attributes: this._HTMLPropertiesToWikiAttributes( node ),
			content: {
				text: '',
				annotations: []
			}
		},
		i, l, cnode, annotationtype, res;
	for ( i = 0, l = children.length; i < l; i++ ) {
		cnode = children[i];
		switch ( cnode.nodeType ) {
			case Node.ELEMENT_NODE:
				// Call a handler for the particular annotation node type
				annotationtype = this.getHTMLAnnotationType( cnode.nodeName );
				if ( annotationtype ) {
					res = this._convertHTMLAnnotation( cnode, offset, annotationtype );
					offset += res.text.length;
					wnode.content.text += res.text;
					wnode.content.annotations = wnode.content.annotations
						.concat( res.annotations );
				}
				break;
			case Node.TEXT_NODE:
				// Add text as content, and increment offset
				wnode.content.text += cnode.data;
				offset += cnode.data.length;
				break;
			case Node.COMMENT_NODE:
				// add a comment annotation?
				break;
			default:
				console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
						cnode.innerHTML );
				break;
		}
	}
	return {
		offset: offset,
		node: wnode
	};
};

/**
 * Private HTML annotation (inline element) handler
 *
 * Collects the text below an inline element and builds the annotation list:
 * the element's own annotation comes first, followed by the annotations of
 * nested inline elements in document order.
 *
 * @param {Object} HTML DOM element
 * @param {Int} Offset at which the element's text starts
 * @param {String} WikiDom annotation type for this element
 * @returns {Object} Object with `text` (concatenated text content) and
 * `annotations` (array of { type, data, range: { start, end } })
 */
DOMConverter.prototype._convertHTMLAnnotation = function ( node, offset, type ) {
	var children = node.childNodes,
		text = '',
		annotations = [
			{
				type: type,
				data: this._HTMLPropertiesToWikiData( node ),
				range: {
					start: offset,
					end: offset
				}
			}
		],
		i, l, cnode, annotationtype, res;
	for ( i = 0, l = children.length; i < l; i++ ) {
		cnode = children[i];
		switch ( cnode.nodeType ) {
			case Node.ELEMENT_NODE:
				// Call a handler for the particular annotation node type
				annotationtype = this.getHTMLAnnotationType( cnode.nodeName );
				if ( annotationtype ) {
					res = this._convertHTMLAnnotation( cnode, offset, annotationtype );
					text += res.text;
					offset += res.text.length;
					annotations = annotations.concat( res.annotations );
				}
				break;
			case Node.TEXT_NODE:
				// Add text as content, and increment offset
				text += cnode.data;
				offset += cnode.data.length;
				break;
			case Node.COMMENT_NODE:
				// add a comment annotation?
				break;
			default:
				console.log( "HTML to Wiki DOM conversion error. Unhandled node " +
						cnode.innerHTML );
				break;
		}
	}
	// The element's own annotation spans everything collected above.
	annotations[0].range.end = offset;
	return {
		text: text,
		annotations: annotations
	};
};

/**
 * Build the WikiDom attributes object for a block-level element.
 *
 * data-* attributes are stored under their name without the data- prefix;
 * every other attribute is stored under 'html/' + name.
 *
 * @param {Object} elem HTML DOM element
 * @returns {Object} Map of WikiDom attribute name to attribute value
 */
DOMConverter.prototype._HTMLPropertiesToWikiAttributes = function ( elem ) {
	var attribs = elem.attributes,
		out = {},
		i, l, attrib, key;
	for ( i = 0, l = attribs.length; i < l; i++ ) {
		attrib = attribs.item(i);
		key = attrib.name;
		if ( key.match( /^data-/ ) ) {
			// strip data- prefix from data-*
			out[key.replace( /^data-/, '' )] = attrib.value;
		} else {
			// prefix html properties with html/
			out['html/' + key] = attrib.value;
		}
	}
	return out;
};

/**
 * Build the WikiDom data object for an annotation (inline) element.
 *
 * data-* attributes are stored under their name without the data- prefix;
 * of the remaining attributes only whitelisted ones (currently 'title') are
 * kept.
 *
 * @param {Object} elem HTML DOM element
 * @returns {Object} Map of data key to attribute value
 */
DOMConverter.prototype._HTMLPropertiesToWikiData = function ( elem ) {
	var attribs = elem.attributes,
		out = {},
		i, l, attrib, key;
	for ( i = 0, l = attribs.length; i < l; i++ ) {
		attrib = attribs.item(i);
		key = attrib.name;
		if ( key.match( /^data-/ ) ) {
			// strip data- prefix from data-*
			out[key.replace( /^data-/, '' )] = attrib.value;
		} else {
			// pass through a few whitelisted keys
			// XXX: This subsets html DOM
			if ( ['title'].indexOf(key) !== -1 ) {
				out[key] = attrib.value;
			}
		}
	}
	return out;
};

// Export for CommonJS consumers; skipped where no module object exists.
if (typeof module === "object") {
	module.exports.DOMConverter = DOMConverter;
}
|
|
@ -30,12 +30,18 @@ var isBlock = function isBlock (name) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Quick HACK: define only the Node type constants used below (text, comment)
|
||||||
|
// Values follow the DOM nodeType numbering: https://developer.mozilla.org/en/nodeType
|
||||||
|
var Node = {
|
||||||
|
TEXT_NODE: 3,
|
||||||
|
COMMENT_NODE: 8
|
||||||
|
};
|
||||||
|
|
||||||
// Wrap all top-level inline elements in paragraphs. This should also be
|
// Wrap all top-level inline elements in paragraphs. This should also be
|
||||||
// applied inside block-level elements, but in that case the first paragraph
|
// applied inside block-level elements, but in that case the first paragraph
|
||||||
// usually remains plain inline.
|
// usually remains plain inline.
|
||||||
var process_inlines_in_p = function ( document ) {
|
var process_inlines_in_p = function ( document ) {
|
||||||
// document.body does not always work in jsdom, so work around it.
|
var body = document.body,
|
||||||
var body = document.getElementsByTagName('body')[0],
|
|
||||||
newP = document.createElement('p'),
|
newP = document.createElement('p'),
|
||||||
cnodes = body.childNodes,
|
cnodes = body.childNodes,
|
||||||
haveInlines = false,
|
haveInlines = false,
|
||||||
|
@ -50,8 +56,8 @@ var process_inlines_in_p = function ( document ) {
|
||||||
ctype = child.nodeType;
|
ctype = child.nodeType;
|
||||||
//console.log(child + ctype);
|
//console.log(child + ctype);
|
||||||
if ((ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) ||
|
if ((ctype === 3 && (haveInlines || !isElementContentWhitespace(child))) ||
|
||||||
(ctype !== 3 && // text
|
(ctype !== Node.TEXT_NODE &&
|
||||||
ctype !== 8 && // comment
|
ctype !== Node.COMMENT_NODE &&
|
||||||
!isBlock(child.nodeName))) {
|
!isBlock(child.nodeName))) {
|
||||||
// text node
|
// text node
|
||||||
newP.appendChild(child);
|
newP.appendChild(child);
|
||||||
|
|
|
@ -62,8 +62,8 @@ FauxHTML5.TreeBuilder.prototype.processToken = function (token) {
|
||||||
break;
|
break;
|
||||||
case "END":
|
case "END":
|
||||||
this.emit('end');
|
this.emit('end');
|
||||||
// HACK: This should not be needed really.
|
|
||||||
this.document = this.parser.document;
|
this.document = this.parser.document;
|
||||||
|
// HACK: This should not be needed really.
|
||||||
this.document.body = this.document.getElementsByTagName('body')[0];
|
this.document.body = this.document.getElementsByTagName('body')[0];
|
||||||
break;
|
break;
|
||||||
case "NEWLINE":
|
case "NEWLINE":
|
||||||
|
|
|
@ -63,6 +63,8 @@ _import(pj('parser', 'ext.cite.taghook.ref.js'), ['MWRefTagHook']);
|
||||||
_import(pj('parser', 'mediawiki.HTML5TreeBuilder.node.js'), ['FauxHTML5']);
|
_import(pj('parser', 'mediawiki.HTML5TreeBuilder.node.js'), ['FauxHTML5']);
|
||||||
_import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']);
|
_import(pj('parser', 'mediawiki.DOMPostProcessor.js'), ['DOMPostProcessor']);
|
||||||
|
|
||||||
|
_import(pj('parser', 'mediawiki.DOMConverter'), ['DOMConverter']);
|
||||||
|
|
||||||
_import(pj('parser', 'ext.core.QuoteTransformer.js'), ['QuoteTransformer']);
|
_import(pj('parser', 'ext.core.QuoteTransformer.js'), ['QuoteTransformer']);
|
||||||
|
|
||||||
_import(pj('parser', 'ext.Cite.js'), ['Cite']);
|
_import(pj('parser', 'ext.Cite.js'), ['Cite']);
|
||||||
|
@ -121,6 +123,11 @@ function ParserTests () {
|
||||||
description: 'Print out a whitelist entry for failing tests. Default false.',
|
description: 'Print out a whitelist entry for failing tests. Default false.',
|
||||||
default: false,
|
default: false,
|
||||||
boolean: true
|
boolean: true
|
||||||
|
},
|
||||||
|
'wikidom': {
|
||||||
|
description: 'Print out a WikiDom conversion of the HTML DOM',
|
||||||
|
default: false,
|
||||||
|
boolean: true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
).check( function(argv) {
|
).check( function(argv) {
|
||||||
|
@ -181,6 +188,8 @@ function ParserTests () {
|
||||||
|
|
||||||
this.postProcessor = new DOMPostProcessor();
|
this.postProcessor = new DOMPostProcessor();
|
||||||
|
|
||||||
|
this.DOMConverter = new DOMConverter();
|
||||||
|
|
||||||
var pt = this;
|
var pt = this;
|
||||||
|
|
||||||
// Set up the TokenTransformDispatcher with a callback for the remaining
|
// Set up the TokenTransformDispatcher with a callback for the remaining
|
||||||
|
@ -196,13 +205,18 @@ function ParserTests () {
|
||||||
pt.buildTree( tokens, treeBuilder );
|
pt.buildTree( tokens, treeBuilder );
|
||||||
|
|
||||||
// Perform post-processing on DOM.
|
// Perform post-processing on DOM.
|
||||||
pt.postProcessor.doPostProcess(treeBuilder.parser.document);
|
pt.postProcessor.doPostProcess(treeBuilder.document);
|
||||||
|
|
||||||
// And serialize the result.
|
// And serialize the result.
|
||||||
var out = treeBuilder.document.body.innerHTML;
|
var out = treeBuilder.document.body.innerHTML;
|
||||||
|
|
||||||
// Finally, check the result vs. the expected result.
|
// Finally, check the result vs. the expected result.
|
||||||
pt.checkResult( pt.currentItem, out );
|
pt.checkResult( pt.currentItem, out );
|
||||||
|
|
||||||
|
if ( pt.argv.wikidom ) {
|
||||||
|
// Test HTML DOM -> WikiDOM conversion
|
||||||
|
pt.printWikiDom( treeBuilder.document.body );
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Add token transformations..
|
// Add token transformations..
|
||||||
|
@ -225,8 +239,6 @@ function ParserTests () {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get an object holding our tests cases. Eventually from a cache file
|
* Get an object holding our tests cases. Eventually from a cache file
|
||||||
*/
|
*/
|
||||||
|
@ -520,6 +532,21 @@ ParserTests.prototype.checkResult = function ( item, out ) {
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Dump the WikiDom conversion of an HTML DOM subtree to the console.
 *
 * Prints a 'WikiDom:' heading first, then the result of
 * this.DOMConverter.HTMLtoWiki serialized as JSON with two-space
 * indentation, followed by an extra newline.
 *
 * @param {Object} body HTML DOM node to convert
 */
ParserTests.prototype.printWikiDom = function ( body ) {
	var serialized;
	// The heading goes out before the conversion runs, so anything the
	// converter logs appears underneath it.
	console.log( 'WikiDom'.cyan + ':' );
	serialized = JSON.stringify( this.DOMConverter.HTMLtoWiki( body ), null, 2 );
	console.log( serialized + "\n" );
};
|
||||||
|
|
||||||
|
|
||||||
ParserTests.prototype.buildTree = function ( tokens, treeBuilder ) {
|
ParserTests.prototype.buildTree = function ( tokens, treeBuilder ) {
|
||||||
// push a body element, just to be sure to have one
|
// push a body element, just to be sure to have one
|
||||||
treeBuilder.processToken({type: 'TAG', name: 'body'});
|
treeBuilder.processToken({type: 'TAG', name: 'body'});
|
||||||
|
|
Loading…
Reference in a new issue