mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-30 00:55:00 +00:00
8dc994f037
Also, in ParserPipeline: * Import the LM converter and expose it through getLinearModel() * Fix getWikiDom() to actually work (still unused) In parse.js: * Add --help option that prints usage information (was unreachable) * Add --linearmodel option to output linear model JSON instead of HTML Change-Id: Ic534e03ff40a7c9117bb63f0c635a4213d5e3406
305 lines
10 KiB
JavaScript
305 lines
10 KiB
JavaScript
/**
|
|
* HTML DOM -> linear model converter
|
|
*
|
|
* @author Roan Kattouw
|
|
*/
|
|
|
|
/**
 * Public entry point: convert an HTML DOM tree to linear model data.
 *
 * Delegates to convertHTMLNode(), passing only the node so that the
 * recursion-only parameters of that function take their defaults.
 *
 * @param {Object} node Root node of the HTML DOM
 * @returns {Array} Linear model
 */
function ConvertDOMToLM( node ) {
	var linearModel = convertHTMLNode( node );
	return linearModel;
}
|
|
|
|
// When a CommonJS-style `module` object is in scope, expose the converter on its exports
if ( typeof module === 'object' ) {
	module.exports.ConvertDOMToLM = ConvertDOMToLM;
}
|
|
|
|
/*** Private functions and variables ***/
|
|
|
|
// Quick HACK: define Node constants locally
// https://developer.mozilla.org/en/nodeType
var Node = ( function () {
	// Constant names in nodeType order: the value of each one is its position + 1
	var names = [
			'ELEMENT_NODE',
			'ATTRIBUTE_NODE',
			'TEXT_NODE',
			'CDATA_SECTION_NODE',
			'ENTITY_REFERENCE_NODE',
			'ENTITY_NODE',
			'PROCESSING_INSTRUCTION_NODE',
			'COMMENT_NODE',
			'DOCUMENT_NODE',
			'DOCUMENT_TYPE_NODE',
			'DOCUMENT_FRAGMENT_NODE',
			'NOTATION_NODE'
		],
		constants = {},
		i;
	for ( i = 0; i < names.length; i++ ) {
		constants[names[i]] = i + 1;
	}
	return constants;
}() );
|
|
|
|
/**
 * Object mapping HTML DOM node names to linear model element types.
 *
 * Each value is a typeData object as consumed by convertHTMLNode():
 *   leafNode: If true, the linear model element is a leaf node: its text content is
 *             output directly, and child elements that are not annotations are
 *             skipped with a warning.
 *             If false, it is a branch node: child elements are converted recursively
 *             and loose text is wrapped in a paragraph.
 *   type: Element type
 *   attributes: Additional attributes to set for this element (optional)
 */
var elementTypes = ( function () {
	// Build the typeData for a leaf node, optionally with extra attributes
	function leaf( type, attributes ) {
		var typeData = { leafNode: true, type: type };
		if ( attributes ) {
			typeData.attributes = attributes;
		}
		return typeData;
	}

	// Build the typeData for a branch node
	function branch( type ) {
		return { leafNode: false, type: type };
	}

	// Build the typeData for a heading of the given level
	function heading( level ) {
		return leaf( 'heading', { level: level } );
	}

	return {
		'p': leaf( 'paragraph' ),
		'h1': heading( 1 ),
		'h2': heading( 2 ),
		'h3': heading( 3 ),
		'h4': heading( 4 ),
		'h5': heading( 5 ),
		'h6': heading( 6 ),
		'li': branch( 'listItem' ),
		'dt': branch( 'listItem' ),
		'dd': branch( 'listItem' ),
		'pre': leaf( 'pre' ),
		'table': branch( 'table' ),
		'tbody': branch( 'tbody' ), // not in linear model!
		'tr': branch( 'tableRow' ),
		'th': branch( 'tableHeading' ),
		'td': branch( 'tableCell' ),
		'caption': branch( 'caption' ),
		'hr': leaf( 'horizontalRule' ),
		'ul': branch( 'list' ), // TODO add attribute for 'bullet'
		'ol': branch( 'list' ), // TODO add attribute for 'number'
		'dl': branch( 'list' ), // TODO add attribute for ???
		//XXX: center is block-level in HTML, not sure what it should be in the linear model...
		'center': branch( 'center' ),
		//XXX: blockquote is block-level in HTML, not sure what it should be in the linear model...
		'blockquote': branch( 'blockquote' ),
		//XXX: what should 'div' be in the linear model?
		'div': branch( 'div' )
		// Missing types:
		// br
		// img
	};
}() );
|
|
|
|
/**
 * Lookup table from HTML DOM node names to linear model annotation types.
 * Each entry is either the annotation type as a string, or a function that is
 * given the relevant HTML DOM node and returns the annotation type string.
 */
var annotationTypes = {
	'i': 'textStyle/italic',
	'b': 'textStyle/bold',
	'small': 'textStyle/small',
	'span': 'textStyle/span',
	'a': function( node ) {
		// FIXME the parser currently doesn't output this data this way
		// Internal links get linkType:'internal' in the data-mw-rt attrib,
		// external links get nothing
		// The link subtype comes from data-type; a missing or empty value yields 'unknown'
		return 'link/' + ( node.getAttribute( 'data-type' ) || 'unknown' );
	},
	'template': 'object/template',
	'ref': 'object/hook',
	'includeonly': 'object/includeonly'
};
|
|
|
|
/**
 * Names of HTML DOM attributes that convertAttributes() copies through under
 * their own name instead of prefixing them with 'html/'
 */
var attributeWhitelist = [
	'title'
	//XXX: add href?
];
|
|
|
|
/**
 * Translate the attributes of an HTML DOM node into linear model attributes.
 *
 * Each attribute is handled by the first rule that matches its name:
 * - Names starting with data-json- lose that prefix and have their value
 *   JSON-decoded (a malformed value makes JSON.parse throw).
 * - Names starting with data- lose that prefix.
 * - Names listed in attributeWhitelist are passed through unchanged.
 * - All other attributes are prefixed with html/
 *
 * @param {Object} node HTML DOM node
 * @returns {Object} Converted attribute map; empty if the node has no attribute list
 */
function convertAttributes( node ) {
	var JSON_PREFIX = 'data-json-',
		DATA_PREFIX = 'data-',
		source = node.attributes,
		converted = {},
		index, attr, key, value;

	if ( source ) {
		for ( index = 0; index < source.length; index++ ) {
			attr = source.item( index );
			key = attr.name;
			if ( key.indexOf( JSON_PREFIX ) === 0 ) {
				// Strip data-json- prefix and decode
				key = key.slice( JSON_PREFIX.length );
				value = JSON.parse( attr.value );
			} else if ( key.indexOf( DATA_PREFIX ) === 0 ) {
				// Strip data- prefix
				key = key.slice( DATA_PREFIX.length );
				value = attr.value;
			} else {
				// Whitelisted keys keep their name, everything else gets 'html/' prepended
				if ( attributeWhitelist.indexOf( key ) === -1 ) {
					key = 'html/' + key;
				}
				value = attr.value;
			}
			converted[key] = value;
		}
	}
	return converted;
}
|
|
|
|
/**
 * Check if an HTML DOM node represents an annotation, and if so, build an
 * annotation object for it.
 *
 * The annotation object looks like {type: 'type', data: {'attrKey': 'attrValue', ...}}
 * where the type comes from annotationTypes (calling the entry with the node if it
 * is a function) and the data is the result of convertAttributes() for the node.
 *
 * @param {Object} node HTML DOM node
 * @returns {Object|false} Annotation object, or false if this node is not an annotation
 */
function getAnnotation( node ) {
	var name = node.nodeName.toLowerCase(),
		type;

	// Only consult annotationTypes' own keys: a plain property lookup would also
	// find inherited Object.prototype members for node names such as
	// 'constructor' or 'toString' and wrongly treat them as annotations
	if ( !Object.prototype.hasOwnProperty.call( annotationTypes, name ) ) {
		return false;
	}
	type = annotationTypes[name];
	if ( !type ) {
		// Not an annotation
		return false;
	}
	if ( typeof type === 'function' ) {
		type = type( node );
	}
	return {
		type: type,
		data: convertAttributes( node )
	};
}
|
|
|
|
/**
 * Turn a string into linear model data, attaching annotations if there are any.
 *
 * Without annotations every element of the result is a bare one-character
 * string; with annotations every element is an array holding the character
 * followed by all of the annotation objects.
 *
 * @param {String} content String to convert
 * @param {Array} annotations Array of annotation objects to apply
 * @returns {Array} Linear model data, one element per character
 */
function generateAnnotatedContent( content, annotations ) {
	var characters = content.split( '' );

	// Unannotated text is represented by the bare characters
	if ( !annotations || annotations.length === 0 ) {
		return characters;
	}
	return characters.map( function ( character ) {
		return [ character ].concat( annotations );
	} );
}
|
|
|
|
/**
 * Recursively convert an HTML DOM node to a linear model array. This is the
 * main function.
 *
 * Child elements recognized by getAnnotation() are descended into with their
 * annotation added to the annotation list. Other child elements are looked up in
 * elementTypes and converted recursively; unknown ones are skipped with a warning.
 * Text is output one character at a time via generateAnnotatedContent(), and
 * comments are ignored.
 *
 * @param {Object} node HTML DOM node
 * @param {Array} [annotations] Annotations to apply. Only used for recursion.
 * @param {Object} [typeData] Information about the linear model element type corresponding to this node.
 *  Only used for recursion. This data usually comes from elementTypes. All keys are optional. Keys are:
 *   type: If set, add an opening and a closing element with this type and with the DOM node's attributes
 *   attributes: If set and type is set, these attributes will be added to the element's attributes
 *               (overriding attributes of the same name converted from the DOM node)
 *   leafNode: If set and set to true, this element is supposed to be a leaf node in the linear model.
 *             This means that any content in this node will be put in the output directly rather than
 *             being wrapped in paragraphs, and that any child nodes that are elements (other than
 *             annotations) will not be descended into.
 * @returns {Array} Linear model data
 */
function convertHTMLNode( node, annotations, typeData ) {
	var data = [], i, child, annotation, key,
		paragraphOpened = false, element,
		attributes, childName, childTypeData;

	annotations = annotations || [];
	typeData = typeData || {};

	if ( typeData.type ) {
		element = { 'type': typeData.type };
		// convertAttributes() takes the node itself and reads its attribute list
		attributes = convertAttributes( node );
		if ( typeData.attributes ) {
			// Copy into the freshly converted map so the shared typeData is never modified
			for ( key in typeData.attributes ) {
				if ( Object.prototype.hasOwnProperty.call( typeData.attributes, key ) ) {
					attributes[key] = typeData.attributes[key];
				}
			}
		}
		// Only add the attributes key if there is something to put in it
		if ( Object.keys( attributes ).length > 0 ) {
			element.attributes = attributes;
		}
		data.push( element );
	}

	for ( i = 0; i < node.childNodes.length; i++ ) {
		child = node.childNodes[i];
		switch ( child.nodeType ) {
			case Node.ELEMENT_NODE:
				// Check if this is an annotation
				annotation = getAnnotation( child );
				if ( annotation ) {
					// If we have annotated text within a branch node, open a paragraph
					// Leaf nodes don't need this because they're allowed to contain content
					if ( !paragraphOpened && !typeData.leafNode ) {
						data.push( { 'type': 'paragraph' } );
						paragraphOpened = true;
					}
					// Recurse into this node
					data = data.concat( convertHTMLNode( child,
						annotations.concat( [ annotation ] ),
						{ leafNode: true }
					) );
				} else {
					if ( typeData.leafNode ) {
						// We've found an element node *inside* a leaf node.
						// This is illegal, so warn and skip it
						console.warn( 'HTML DOM to linear model conversion error: ' +
							'found element node (' + child.nodeName + ') inside ' +
							'leaf node (' + node.nodeName + ')' );
						break;
					}

					// Close the last paragraph, if still open
					if ( paragraphOpened ) {
						data.push( { 'type': '/paragraph' } );
						paragraphOpened = false;
					}

					// Get the typeData for this node. Only own keys of elementTypes count,
					// so node names like 'constructor' don't match Object.prototype members
					childName = child.nodeName.toLowerCase();
					childTypeData = Object.prototype.hasOwnProperty.call( elementTypes, childName ) ?
						elementTypes[childName] : undefined;
					if ( childTypeData ) {
						// Recurse into this node
						// Don't pass annotations through; we should never have those here
						// anyway because only leaves can have them.
						data = data.concat( convertHTMLNode( child, [], childTypeData ) );
					} else {
						console.warn( 'HTML DOM to linear model conversion error: unknown node with name ' +
							child.nodeName + ' and content ' + child.innerHTML );
					}
				}
				break;
			case Node.TEXT_NODE:
				// If we have text within a branch node, open a paragraph
				// Leaf nodes don't need this because they're allowed to contain content
				if ( !paragraphOpened && !typeData.leafNode ) {
					data.push( { 'type': 'paragraph' } );
					paragraphOpened = true;
				}
				// Annotate the text and output it
				data = data.concat( generateAnnotatedContent( child.data, annotations ) );
				break;
			case Node.COMMENT_NODE:
				// Comment, do nothing
				break;
			default:
				console.warn( 'HTML DOM to linear model conversion error: unknown node of type ' +
					child.nodeType + ' with content ' + child.innerHTML );
		}
	}

	// Close the last paragraph, if still open
	if ( paragraphOpened ) {
		data.push( { 'type': '/paragraph' } );
	}

	if ( typeData.type ) {
		data.push( { 'type': '/' + typeData.type } );
	}
	return data;
}
|