diff --git a/modules/ve2/dm/ve.dm.HTMLConverter.js b/modules/ve2/dm/ve.dm.HTMLConverter.js index 43145a887e..eb02acb5f7 100644 --- a/modules/ve2/dm/ve.dm.HTMLConverter.js +++ b/modules/ve2/dm/ve.dm.HTMLConverter.js @@ -1,41 +1,236 @@ /** - * HTML DOM -> linear model converter - * + * Converter between HTML DOM and VisualEditor linear data. + * + * @author Gabriel Wicke * @author Roan Kattouw * @author Christian Williams * @author Inez Korczynski + * @author Trevor Parscal + * + * @class + * @constructor + * @param {Object} options Conversion options */ +ve.dm.HTMLConverter = function( options ) { + this.options = options || {}; +}; -ve.dm.HTMLConverter = {}; +/* Static Members */ /** - * Recursively convert an HTML DOM node to a linear model array. This is the - * main function. + * HTML DOM node types. + * + * If this will be used more than once in a method, it should be aliased locally to avoid excessive + * lookups. + * + * @see https://developer.mozilla.org/en/nodeType + */ +ve.dm.HTMLConverter.Node = { + 'ELEMENT_NODE': 1, + 'ATTRIBUTE_NODE': 2, + 'TEXT_NODE': 3, + 'CDATA_SECTION_NODE': 4, + 'ENTITY_REFERENCE_NODE': 5, + 'ENTITY_NODE': 6, + 'PROCESSING_INSTRUCTION_NODE': 7, + 'COMMENT_NODE': 8, + 'DOCUMENT_NODE': 9, + 'DOCUMENT_TYPE_NODE': 10, + 'DOCUMENT_FRAGMENT_NODE': 11, + 'NOTATION_NODE': 12 +}; + +/** + * Object mapping HTML DOM node names to linear model element names. + * + * 'leafNode': Type of model tree node used to represent this element + * - If true, it's a leaf node (can not contain children) + * - If false, it's a branch node (can contain children) + * 'type': Symbolic name of element in VisualEditor linear data. + * 'attributes': Additional attributes to set for this element (optional) + */ +ve.dm.HTMLConverter.elementTypes = { + 'p': { 'leafNode': true, 'type': 'paragraph' }, + 'h1': { 'leafNode': true, 'type': 'heading', 'attributes': { 'level': 1 } }, + 'h2': { 'leafNode': true, 'type': 'heading', 'attributes': { 'level': 2 } }, + 'h3': { 'leafNode': true, 'type': 'heading', 'attributes': { 'level': 3 } }, + 'h4': { 'leafNode': true, 'type': 'heading', 'attributes': { 'level': 4 } }, + 'h5': { 'leafNode': true, 'type': 'heading', 'attributes': { 'level': 5 } }, + 'h6': { 'leafNode': true, 'type': 'heading', 'attributes': { 'level': 6 } }, + 'li': { 'leafNode': false, 'type': 'listItem' }, + 'dt': { 'leafNode': false, 'type': 'listItem', 'attributes': { 'style': 'term' } }, + 'dd': { 'leafNode': false, 'type': 'listItem', 'attributes': { 'style': 'definition' } }, + 'pre': { 'leafNode': true, 'type': 'preformatted' }, + 'table': { 'leafNode': false, 'type': 'table' }, + 'tr': { 'leafNode': false, 'type': 'tableRow' }, + 'th': { 'leafNode': false, 'type': 'tableHeading' }, + 'td': { 'leafNode': false, 'type': 'tableCell' }, + 'ul': { 'leafNode': false, 'type': 'list', 'attributes': { 'style': 'bullet' } }, + 'ol': { 'leafNode': false, 'type': 'list', 'attributes': { 'style': 'number' } }, + 'dl': { 'leafNode': false, 'type': 'definitionList' } + // Missing types that will end up being alien nodes (not a complete list): + // div, center, blockquote, caption, tbody, thead, tfoot, horizontalRule, br, img, video, audio +}; + +/** + * Object mapping HTML DOM node names to linear model annotation types. + * + * The value can either be a string, or a function that takes the relevant HTML DOM node and returns + * a string. + */ +ve.dm.HTMLConverter.annotationTypes = { + 'i': 'textStyle/italic', + 'b': 'textStyle/bold', + 'small': 'textStyle/small', + 'span': 'textStyle/span', + 'a': function( node ) { + // FIXME: the parser currently doesn't output this data this way + // Internal links get 'linkType': 'internal' in the data-mw-rt attrib, while external links + // currently get nothing + var atype = node.getAttribute( 'data-type' ); + if ( atype ) { + return 'link/' + atype; + } else { + return 'link/unknown'; + } + }, + 'template': 'object/template', + 'ref': 'object/hook', + 'includeonly': 'object/includeonly' +}; + +/** + * List of HTML DOM attributes that should be passed through verbatim by convertAttributes() rather + * than being prefixed with 'html/' + * + * TODO: Add href to this list? + */ +ve.dm.HTMLConverter.attributeWhitelist = [ 'title' ]; + +/* Static Methods */ + +/** + * Convert the attributes of an HTML DOM node to linear model attributes. + * + * - Attributes prefixed with data-json- will have the prefix removed and their + * value JSON-decoded. + * - Attributes prefixed with data- will have the prefix removed. + * - Attributes in attributeWhitelist are passed through unchanged + * - All other attribuetes are prefixed with html/ + * + * @param {Object} node HTML DOM node + * @returns {Object} Converted attribute map + */ +ve.dm.HTMLConverter.convertAttributes = function( node ) { + var original = node.attributes, + converted = {}, + attrib, + name, + i; + if ( !original ) { + return {}; + } + for ( i = 0; i < original.length; i++ ) { + attrib = original.item( i ); + name = attrib.name; + if ( name.substr( 0, 10 ) == 'data-json-' ) { + // Strip data-json- prefix and decode + converted[name.substr( 10 )] = JSON.parse( attrib.value ); + } else if ( name.substr( 0, 5 ) == 'data-' ) { + // Strip data- prefix + converted[name.substr( 5 )] = attrib.value; + } else if ( ve.dm.HTMLConverter.attributeWhitelist.indexOf( name ) != -1 ) { + // Pass through a few whitelisted keys + converted[name] = attrib.value; + } else { + // Prefix key with 'html/' + converted['html/' + name] = attrib.value; + } + } + return converted; +}; + +/** + * Check if an HTML DOM node represents an annotation, and if so, build an + * annotation object for it. + * + * The annotation object looks like {'type': 'type', data: {'attrKey': 'attrValue', ...}} + * + * @param {node} HTML DOM node + * @returns {Object|false} Annotation object, or false if this node is not an annotation + */ +ve.dm.HTMLConverter.getAnnotation = function( node ) { + var name = node.nodeName.toLowerCase(), + type = ve.dm.HTMLConverter.annotationTypes[name]; + if ( !type ) { + // Not an annotation + return false; + } + if ( typeof type == 'function' ) { + type = type( node ); + } + return { + 'type': type, + 'data': ve.dm.HTMLConverter.convertAttributes( node ) + }; +}; + +/** + * Convert a string to (possibly annotated) linear model data + * + * @param {String} content String to convert + * @param {Array} annotations Array of annotation objects to apply + * @returns {Array} Linear model data, one element per character + */ +ve.dm.HTMLConverter.generateAnnotatedContent = function( content, annotations ) { + var characters = content.split( '' ), + annoationMap = {}, + i; + if ( !annotations || annotations.length === 0 ) { + return characters; + } + for ( i = 0; i < annotations.length; i++ ) { + annoationMap[JSON.stringify( annotations[i] )] = annotations[i]; + } + for ( i = 0; i < characters.length; i++ ) { + characters[i] = [characters[i], annoationMap]; + } + return characters; +}; + +/** + * Recursively convert an HTML DOM node to a linear model array. + * + * This is the main function. * * @param {Object} node HTML DOM node * @param {Array} [annotations] Annotations to apply. Only used for recursion. - * @param {Object} [typeData] Information about the linear model element type corresponding to this node. - * Only used for recursion. This data usually comes from elementTypes. All keys are optional. Keys are: - * type: If set, add an opening and a closing element with this type and with the DOM node's attributes - * attributes: If set and type is set, these attributes will be added to the element's attributes - * leafNode: If set and set to true, this element is supposed to be a leaf node in the linear model. - * This means that any content in this node will be put in the output directly rather than - * being wrapped in paragraphs, and that any child nodes that are elements will not be - * descended into. + * @param {Object} [typeData] Information about the linear model element type corresponding to this + * node. Only used for recursion. This data usually comes from elementTypes. All keys are optional. + * Keys are: + * 'type': If set, add an opening and a closing element with this type and with the DOM + * node's attributes. + * 'attributes': If set and type is set, these attributes will be added to the element's + * attributes. + * 'leafNode': If set and set to true, this element is supposed to be a leaf node in the + * linear model. This means that any content in this node will be put in the output + * directly rather than being wrapped in paragraphs, and that any child nodes that are + * elements will not be descended into. * @returns {Array} Linear model data */ -ve.dm.HTMLConverter.convert = function( node, annotations, typeData ) { - var data = [], - i, - child, +ve.dm.HTMLConverter.prototype.convert = function( node, annotations, typeData ) { + var data = [], + types = ve.dm.HTMLConverter.Node, + i, + child, annotation, - paragraphOpened = false, + paragraphOpened = false, element, - attributes, - typeData, - childTypeData, - annotations = annotations || [], - typeData = typeData || {}; + attributes, + childTypeData; + + annotations = annotations || []; + typeData = typeData || {}; if ( typeData.type ) { element = { 'type': typeData.type }; @@ -52,7 +247,7 @@ ve.dm.HTMLConverter.convert = function( node, annotations, typeData ) { for ( i = 0; i < node.childNodes.length; i++ ) { child = node.childNodes[i]; switch ( child.nodeType ) { - case ve.dm.HTMLConverter.Node.ELEMENT_NODE: + case types.ELEMENT_NODE: // Check if this is an annotation annotation = ve.dm.HTMLConverter.getAnnotation( child ); if ( annotation ) { @@ -65,7 +260,7 @@ ve.dm.HTMLConverter.convert = function( node, annotations, typeData ) { // Recurse into this node data = data.concat( ve.dm.HTMLConverter.convert( child, annotations.concat( [ annotation ] ), - { leafNode: true } + { 'leafNode': true } ) ); } else { if ( typeData.leafNode ) { @@ -89,14 +284,18 @@ ve.dm.HTMLConverter.convert = function( node, annotations, typeData ) { // Recurse into this node // Don't pass annotations through; we should never have those here // anyway because only leaves can have them. - data = data.concat( ve.dm.HTMLConverter.convert( child, [], childTypeData ) ); + data = data.concat( + ve.dm.HTMLConverter.convert( child, [], childTypeData ) + ); } else { - console.warn( 'HTML DOM to linear model conversion error: unknown node with name ' + - child.nodeName + ' and content ' + child.innerHTML ); + console.warn( + 'HTML DOM to linear model conversion error: unknown node with name ' + + child.nodeName + ' and content ' + child.innerHTML + ); } } break; - case ve.dm.HTMLConverter.Node.TEXT_NODE: + case types.TEXT_NODE: // If we have annotated text within a branch node, open a paragraph // Leaf nodes don't need this because they're allowed to contain content if ( !paragraphOpened && typeData && !typeData.leafNode ) { @@ -104,14 +303,18 @@ ve.dm.HTMLConverter.convert = function( node, annotations, typeData ) { paragraphOpened = true; } // Annotate the text and output it - data = data.concat( ve.dm.HTMLConverter.generateAnnotatedContent( child.data, annotations ) ); + data = data.concat( + ve.dm.HTMLConverter.generateAnnotatedContent( child.data, annotations ) + ); break; - case ve.dm.HTMLConverter.Node.COMMENT_NODE: + case types.COMMENT_NODE: // Comment, do nothing break; default: - console.warn( 'HTML DOM to linear model conversion error: unknown node of type ' + - child.nodeType + ' with content ' + child.innerHTML ); + console.warn( + 'HTML DOM to linear model conversion error: unknown node of type ' + + child.nodeType + ' with content ' + child.innerHTML + ); } } // Close the last paragraph, if still open @@ -123,192 +326,12 @@ ve.dm.HTMLConverter.convert = function( node, annotations, typeData ) { data.push( { 'type': '/' + typeData.type } ); } return data; -} +}; /** * Recursively convert a linear model array to HTML DOM. * This function is not yet written. */ -ve.dm.HTMLConverter.unconvert = function() { -} - - -/*** Helper functions and variables ***/ - -// Quick HACK: define Node constants locally -// https://developer.mozilla.org/en/nodeType -ve.dm.HTMLConverter.Node = { - ELEMENT_NODE: 1, - ATTRIBUTE_NODE: 2, - TEXT_NODE: 3, - CDATA_SECTION_NODE: 4, - ENTITY_REFERENCE_NODE: 5, - ENTITY_NODE: 6, - PROCESSING_INSTRUCTION_NODE: 7, - COMMENT_NODE: 8, - DOCUMENT_NODE: 9, - DOCUMENT_TYPE_NODE: 10, - DOCUMENT_FRAGMENT_NODE: 11, - NOTATION_NODE: 12 +ve.dm.HTMLConverter.prototype.unconvert = function() { + // TODO: Implement }; - -/** - * Object mapping HTML DOM node names to linear model element names. - * - * leafNode: If true, the linear model element is a leaf node (can have children but no content) - * if false, it is a branch node (can contain content but no children), - * type: Element type - * attributes: Additional attributes to set for this element (optional) - * - */ -ve.dm.HTMLConverter.elementTypes = { - 'p': { leafNode: true, type: 'paragraph' }, - 'h1': { leafNode: true, type: 'heading', attributes: { level: 1 } }, - 'h2': { leafNode: true, type: 'heading', attributes: { level: 2 } }, - 'h3': { leafNode: true, type: 'heading', attributes: { level: 3 } }, - 'h4': { leafNode: true, type: 'heading', attributes: { level: 4 } }, - 'h5': { leafNode: true, type: 'heading', attributes: { level: 5 } }, - 'h6': { leafNode: true, type: 'heading', attributes: { level: 6 } }, - 'li': { leafNode: false, type: 'listItem' }, - 'dt': { leafNode: false, type: 'listItem' }, - 'dd': { leafNode: false, type: 'listItem' }, - 'pre': { leafNode: true, type: 'pre' }, - 'table': { leafNode: false, type: 'table' }, - 'tbody': { leafNode: false, type: 'tbody' }, // not in linear model! - 'tr': { leafNode: false, type: 'tableRow' }, - 'th': { leafNode: false, type: 'tableHeading' }, - 'td': { leafNode: false, type: 'tableCell' }, - 'caption': { leafNode: false, type: 'caption' }, - 'hr': { leafNode: true, type: 'horizontalRule' }, - 'ul': { leafNode: false, type: 'list' }, // TODO add attribute for 'bullet' - 'ol': { leafNode: false, type: 'list' }, // TODO add attribute for 'number' - 'dl': { leafNode: false, type: 'list' }, // TODO add attribute for ??? - //XXX: center is block-level in HTML, not sure what it should be in the linear model... - 'center': { leafNode: false, type: 'center' }, - //XXX: blockquote is block-level in HTML, not sure what it should be in the linear model... - 'blockquote': { leafNode: false, type: 'blockquote' }, - //XXX: what should 'div' be in the linear model? - 'div': { leafNode: false, type: 'div' } - // Missing types: - // br - // img -}; - -/** - * Object mapping HTML DOM node names to linear model annotation types. - * The value can either be a string, or a function that takes the relevant - * HTML DOM node and returns a string. - */ -ve.dm.HTMLConverter.annotationTypes = { - 'i': 'textStyle/italic', - 'b': 'textStyle/bold', - 'small': 'textStyle/small', - 'span': 'textStyle/span', - 'a': function( node ) { - // FIXME the parser currently doesn't output this data this way - // Internal links get linkType:'internal' in the data-mw-rt attrib, - // external links get nothing - var atype = node.getAttribute( 'data-type' ); - if ( atype ) { - return 'link/' + atype; - } else { - return 'link/unknown'; - } - }, - 'template': 'object/template', - 'ref': 'object/hook', - 'includeonly': 'object/includeonly' -}; - -/** - * List of HTML DOM attributes that should be passed through verbatim - * by convertAttributes() rather than being prefixed with 'html/' - */ -ve.dm.HTMLConverter.attributeWhitelist = [ 'title' ]; //XXX: add href? - -/** - * Convert the attributes of an HTML DOM node to linear model attributes. - * - * - Attributes prefixed with data-json- will have the prefix removed and their - * value JSON-decoded. - * - Attributes prefixed with data- will have the prefix removed. - * - Attributes in attributeWhitelist are passed through unchanged - * - All other attribuetes are prefixed with html/ - * - * @param {Object} node HTML DOM node - * @returns {Object} Converted attribute map - */ -ve.dm.HTMLConverter.convertAttributes = function( node ) { - var attribs = node.attributes, retval = {}, - i, attrib, name; - if ( !attribs ) { - return {}; - } - for ( i = 0; i < attribs.length; i++ ) { - attrib = attribs.item( i ); - name = attrib.name; - if ( name.substr( 0, 10 ) == 'data-json-' ) { - // Strip data-json- prefix and decode - retval[name.substr( 10 )] = JSON.parse( attrib.value ); - } else if ( name.substr( 0, 5 ) == 'data-' ) { - // Strip data- prefix - retval[name.substr( 5 )] = attrib.value; - } else if ( ve.dm.HTMLConverter.attributeWhitelist.indexOf( name ) != -1 ) { - // Pass through a few whitelisted keys - retval[name] = attrib.value; - } else { - // Prefix key with 'html/' - retval['html/' + name] = attrib.value; - } - } - return retval; -} - -/** - * Check if an HTML DOM node represents an annotation, and if so, build an - * annotation object for it. - * - * The annotation object looks like {type: 'type', data: {'attrKey': 'attrValue', ...}} - * - * @param {node} HTML DOM node - * @returns {Object|false} Annotation object, or false if this node is not an annotation - */ -ve.dm.HTMLConverter.getAnnotation = function( node ) { - var name = node.nodeName.toLowerCase(), - type = ve.dm.HTMLConverter.annotationTypes[name], - annotation = {}; - if ( !type ) { - // Not an annotation - return false; - } - if ( typeof type == 'function' ) { - type = type( node ); - } - annotation.type = type; - annotation.data = ve.dm.HTMLConverter.convertAttributes( node ); - return annotation; -} - -/** - * Convert a string to (possibly annotated) linear model data - * - * @param {String} content String to convert - * @param {Array} annotations Array of annotation objects to apply - * @returns {Array} Linear model data, one element per character - */ -ve.dm.HTMLConverter.generateAnnotatedContent = function( content, annotations ) { - var split = content.split( '' ); - if ( !annotations || annotations.length === 0 ) { - return split; - } - var annotationsCollection = {}; - for ( var i = 0; i < annotations.length; i++ ) { - var v = annotations[i]; - var k = JSON.stringify( v ); - annotationsCollection[k] = v; - } - for ( i = 0; i < split.length; i++ ) { - split[i] = [ split[i], annotationsCollection ]; - } - return split; -}