2012-05-04 22:47:41 +00:00
|
|
|
/**
 * Converter between HTML DOM and VisualEditor linear data.
 *
 * @author Gabriel Wicke
 * @author Roan Kattouw
 * @author Christian Williams
 * @author Inez Korczynski
 * @author Trevor Parscal
 *
 * @class
 * @constructor
 * @param {Object} [options] Conversion options; an empty object is stored when none are given
 */
ve.dm.HTMLConverter = function( options ) {
	// Keep whatever the caller supplied, falling back to a fresh empty options object
	this.options = options ? options : {};
};
|
2012-05-04 22:47:41 +00:00
|
|
|
|
2012-05-08 02:43:03 +00:00
|
|
|
/* Static Members */
|
2012-05-04 22:47:41 +00:00
|
|
|
|
|
|
|
/**
 * Object mapping HTML DOM node names to linear model element names.
 *
 * Each value is a descriptor object whose keys are all optional:
 *
 * 'leafNode': Type of model tree node used to represent this element
 *     - If true, it's a leaf node (can not contain children)
 *     - If false, it's a branch node (can contain children)
 * 'canContainContent': If true, this element is allowed to contain annotated text and inline
 *     elements directly
 * 'type': Symbolic name of element in VisualEditor linear data.
 * 'attributes': Additional attributes to set for this element (optional)
 *
 * An empty descriptor (see 'tbody') has no 'type', so convert() emits no opening or closing
 * element for it and only processes its children.
 */
ve.dm.HTMLConverter.elementTypes = {
	'p': { 'leafNode': false, 'canContainContent': true, 'type': 'paragraph' },
	'h1': { 'leafNode': false, 'canContainContent': true, 'type': 'heading', 'attributes': { 'level': 1 } },
	'h2': { 'leafNode': false, 'canContainContent': true, 'type': 'heading', 'attributes': { 'level': 2 } },
	'h3': { 'leafNode': false, 'canContainContent': true, 'type': 'heading', 'attributes': { 'level': 3 } },
	'h4': { 'leafNode': false, 'canContainContent': true, 'type': 'heading', 'attributes': { 'level': 4 } },
	'h5': { 'leafNode': false, 'canContainContent': true, 'type': 'heading', 'attributes': { 'level': 5 } },
	'h6': { 'leafNode': false, 'canContainContent': true, 'type': 'heading', 'attributes': { 'level': 6 } },
	// The key used to be misspelled 'leafnode', which left 'leafNode' undefined for images
	'img': { 'leafNode': true, 'canContainContent': false, 'type': 'image' },
	'li': { 'leafNode': false, 'canContainContent': false, 'type': 'listItem' },
	'dt': { 'leafNode': false, 'canContainContent': false, 'type': 'listItem', 'attributes': { 'style': 'term' } },
	'dd': { 'leafNode': false, 'canContainContent': false, 'type': 'listItem', 'attributes': { 'style': 'definition' } },
	'pre': { 'leafNode': false, 'canContainContent': true, 'type': 'preformatted' },
	'table': { 'leafNode': false, 'canContainContent': false, 'type': 'table' },
	'tbody': { }, // Ignore tbody for now, we might want it later for header/footer support though
	'tr': { 'leafNode': false, 'canContainContent': false, 'type': 'tableRow' },
	'th': { 'leafNode': false, 'canContainContent': false, 'type': 'tableHeading' },
	'td': { 'leafNode': false, 'canContainContent': false, 'type': 'tableCell' },
	'ul': { 'leafNode': false, 'canContainContent': false, 'type': 'list', 'attributes': { 'style': 'bullet' } },
	'ol': { 'leafNode': false, 'canContainContent': false, 'type': 'list', 'attributes': { 'style': 'number' } },
	'dl': { 'leafNode': false, 'canContainContent': false, 'type': 'definitionList' }

	// Missing types that will end up being alien nodes (not a complete list):
	// div, center, blockquote, caption, thead, tfoot, hr, br, video, audio
};
|
|
|
|
|
|
|
|
/**
 * Object mapping HTML DOM node names to linear model annotation types.
 *
 * Each value is either the annotation type as a string, or a function that is given the
 * relevant HTML DOM node and returns the annotation type as a string.
 */
ve.dm.HTMLConverter.annotationTypes = {
	'i': 'textStyle/italic',
	'b': 'textStyle/bold',
	'u': 'textStyle/underline',
	'small': 'textStyle/small',
	'span': 'textStyle/span',
	'a': function( node ) {
		// FIXME: the parser currently doesn't output this data this way
		// Internal links get 'linkType': 'internal' in the data-mw-rt attrib, while external links
		// currently get nothing
		var linkType = node.getAttribute( 'data-type' );
		// A missing or empty data-type attribute yields the 'unknown' link type
		return linkType ? 'link/' + linkType : 'link/unknown';
	},
	'template': 'object/template',
	'ref': 'object/hook',
	'includeonly': 'object/includeonly'
};
|
|
|
|
|
|
|
|
/**
 * Names of HTML DOM attributes that convertAttributes() copies under their own name instead of
 * prefixing them with 'html/'.
 *
 * TODO: Add href to this list?
 */
ve.dm.HTMLConverter.attributeWhitelist = [
	'title'
];
|
|
|
|
|
|
|
|
/* Static Methods */
|
2012-05-04 22:47:41 +00:00
|
|
|
|
|
|
|
/**
 * Convert the attributes of an HTML DOM node to linear model attributes.
 *
 * - Attributes prefixed with data-json- will have the prefix removed and their
 *   value JSON-decoded.
 * - Attributes prefixed with data- will have the prefix removed.
 * - Attributes in attributeWhitelist are passed through unchanged
 * - All other attributes are prefixed with html/
 *
 * The argument may be either an HTML DOM element or the element's attribute collection
 * (an array-like list of objects with 'name' and 'value' properties). Both forms are accepted
 * because callers in this file use both.
 *
 * @param {Object} node HTML DOM element, or its attribute list (e.g. element.attributes)
 * @returns {Object} Converted attribute map; empty if there are no attributes
 * @throws {SyntaxError} If a data-json- attribute does not contain valid JSON
 */
ve.dm.HTMLConverter.convertAttributes = function( node ) {
	// nodeType 1 is an element node. An attribute list has no numeric nodeType, so this check
	// reliably tells the two accepted argument forms apart.
	var attribs = ( node && node.nodeType === 1 ) ? node.attributes : node,
		whitelist = ve.dm.HTMLConverter.attributeWhitelist,
		converted = {},
		name,
		value,
		i;
	if ( !attribs || !attribs.length ) {
		return converted;
	}
	for ( i = 0; i < attribs.length; i++ ) {
		name = attribs[i].name;
		value = attribs[i].value;
		// The data-json- test must come first: those names also start with data-
		if ( name.slice( 0, 10 ) === 'data-json-' ) {
			// Strip data-json- prefix and decode
			converted[name.slice( 10 )] = JSON.parse( value );
		} else if ( name.slice( 0, 5 ) === 'data-' ) {
			// Strip data- prefix
			converted[name.slice( 5 )] = value;
		} else if ( whitelist.indexOf( name ) !== -1 ) {
			// Pass through a few whitelisted keys
			converted[name] = value;
		} else {
			// Prefix key with 'html/'
			converted['html/' + name] = value;
		}
	}
	return converted;
};
|
2012-05-04 22:47:41 +00:00
|
|
|
|
|
|
|
/**
 * Check if an HTML DOM node represents an annotation, and if so, build an
 * annotation object for it.
 *
 * The annotation object looks like {'type': 'type', data: {'attrKey': 'attrValue', ...}}
 *
 * @param {Object} node HTML DOM node
 * @returns {Object|false} Annotation object, or false if this node is not an annotation
 */
ve.dm.HTMLConverter.getAnnotation = function( node ) {
	var types = ve.dm.HTMLConverter.annotationTypes,
		name = node.nodeName.toLowerCase(),
		type;
	// Only consider the map's own entries, so that a node name such as 'constructor' does not
	// pick up a property inherited from Object.prototype
	if ( !Object.prototype.hasOwnProperty.call( types, name ) || !types[name] ) {
		// Not an annotation
		return false;
	}
	type = types[name];
	if ( typeof type === 'function' ) {
		// The type depends on the node itself (e.g. links)
		type = type( node );
	}
	return {
		'type': type,
		// convertAttributes() iterates an attribute list, so hand it the node's attribute
		// collection; passing the node itself produced an empty data object
		'data': ve.dm.HTMLConverter.convertAttributes( node.attributes )
	};
};
|
2012-05-04 22:47:41 +00:00
|
|
|
|
|
|
|
/**
 * Convert a string to (possibly annotated) linear model data
 *
 * Without annotations the result is a plain array of one-character strings. With annotations
 * each item is a [character, annotationMap] pair, where annotationMap maps the ve.getHash() of
 * each annotation to the annotation object. Every character gets its own map object, so
 * changing the annotations of one character later does not affect its neighbours; the
 * annotation objects themselves are shared between the maps.
 *
 * Side effect: an annotation whose 'data' is an empty object has that property deleted before
 * it is hashed, so the objects passed in by the caller may be modified.
 *
 * @param {String} content String to convert
 * @param {Array} annotations Array of annotation objects to apply
 * @returns {Array} Linear model data, one element per character
 */
ve.dm.HTMLConverter.generateAnnotatedContent = function( content, annotations ) {
	var characters = content.split( '' ),
		annotationMap = {},
		hashes,
		ownMap,
		annotation,
		i,
		j;
	if ( !annotations || annotations.length === 0 ) {
		return characters;
	}
	for ( i = 0; i < annotations.length; i++ ) {
		annotation = annotations[i];
		// Drop empty data so annotations with and without an empty data object hash the same
		if ( annotation.data !== undefined && Object.keys( annotation.data ).length === 0 ) {
			delete annotation.data;
		}
		annotationMap[ve.getHash( annotation )] = annotation;
	}
	hashes = Object.keys( annotationMap );
	for ( i = 0; i < characters.length; i++ ) {
		// Build a separate map per character instead of sharing one object between all of them
		ownMap = {};
		for ( j = 0; j < hashes.length; j++ ) {
			ownMap[hashes[j]] = annotationMap[hashes[j]];
		}
		characters[i] = [characters[i], ownMap];
	}
	return characters;
};
|
|
|
|
|
2012-05-10 01:35:05 +00:00
|
|
|
/**
 * Get linear model from HTML DOM
 *
 * Convenience wrapper: creates a converter with the given options and runs convert() on the
 * node.
 *
 * @static
 * @method
 * @param {Object} node HTML node to recursively convert
 * @param {Object} options Options to use
 * @returns {Array} Linear model data
 */
ve.dm.HTMLConverter.getLinearModel = function( node, options ) {
	var converter = new ve.dm.HTMLConverter( options );
	return converter.convert( node );
};
|
2012-05-10 01:35:05 +00:00
|
|
|
|
|
|
|
|
2012-05-08 02:43:03 +00:00
|
|
|
/**
 * Recursively convert an HTML DOM node to a linear model array.
 *
 * This is the main function.
 *
 * Child nodes are handled by kind:
 * - Annotation elements (see annotationTypes) are descended into with the annotation added.
 * - Elements not listed in elementTypes become an empty alien element pair carrying the
 *   child's innerHTML in its 'html' attribute.
 * - Elements listed in elementTypes are converted recursively, without annotations.
 * - Text nodes become (possibly annotated) characters.
 * - Comments are ignored; any other node type is reported with console.warn and skipped.
 *
 * When text or an annotation appears inside an element that can not contain content, a
 * wrapper paragraph is opened around it and closed again before the next known element or at
 * the end of the node.
 *
 * @param {Object} node HTML DOM node
 * @param {Array} [annotations] Annotations to apply. Only used for recursion.
 * @param {Object} [typeData] Information about the linear model element type corresponding to this
 *  node. Only used for recursion. This data usually comes from elementTypes. All keys are optional.
 *  Keys are:
 *      'type': If set, add an opening and a closing element with this type and with the DOM
 *          node's attributes.
 *      'attributes': If set and type is set, these attributes will be added to the element's
 *          attributes.
 *      'leafNode': If set and set to true, this element is supposed to be a leaf node in the
 *          linear model. NOTE(review): this function does not read the key itself.
 *      'canContainContent': If set and set to true, this element is allowed to contain annotated
 *          text and inline elements.
 * @returns {Array} Linear model data
 */
ve.dm.HTMLConverter.prototype.convert = function( node, annotations, typeData ) {
	var data = [],
		elementTypes = ve.dm.HTMLConverter.elementTypes,
		hasOwn = Object.prototype.hasOwnProperty,
		paragraphOpened = false,
		i,
		child,
		childName,
		annotation,
		alienType,
		element,
		attributes,
		childTypeData;

	annotations = annotations || [];
	typeData = typeData || {};

	// Opening element, with the DOM attributes plus any attributes fixed by the type
	if ( typeData.type ) {
		element = { 'type': typeData.type };
		attributes = ve.dm.HTMLConverter.convertAttributes( node.attributes );
		if ( typeData.attributes ) {
			attributes = $.extend( attributes, typeData.attributes );
		}
		if ( !$.isEmptyObject( attributes ) ) {
			element.attributes = attributes;
		}
		data.push( element );
	}

	for ( i = 0; i < node.childNodes.length; i++ ) {
		child = node.childNodes[i];
		switch ( child.nodeType ) {
			case Node.ELEMENT_NODE:
				// Check if this is an annotation
				annotation = ve.dm.HTMLConverter.getAnnotation( child );
				if ( annotation ) {
					// If we have annotated text within a node that can't hold content, open a
					// wrapper paragraph first
					if ( !paragraphOpened && !typeData.canContainContent ) {
						data.push( { 'type': 'paragraph' } );
						paragraphOpened = true;
					}
					// Recurse into this node
					data = data.concat( ve.dm.HTMLConverter.prototype.convert(
						child,
						annotations.concat( [ annotation ] ),
						{ 'leafNode': true, 'canContainContent': true }
					) );
					break;
				}

				childName = child.nodeName.toLowerCase();

				// Skip over unknown nodes, keeping their innerHTML as an attribute.
				// Own-property check so that names inherited from Object.prototype (such as
				// 'constructor') are not mistaken for known element types.
				if ( !hasOwn.call( elementTypes, childName ) ) {
					// Use inline or block depending on whether we are currently inside
					// something that holds content: either the parent itself, or a wrapper
					// paragraph that is still open. A block alien inside an open paragraph
					// would be structurally invalid.
					alienType = ( typeData.canContainContent || paragraphOpened ) ?
						'alienInline' : 'alienBlock';
					// NOTE(review): only the inner HTML is kept, so the alien's own tag and
					// attributes are not recorded here
					data.push( { 'type': alienType, 'attributes': { 'html': child.innerHTML } } );
					data.push( { 'type': '/' + alienType } );
					break;
				}

				// Close the last paragraph, if still open
				if ( paragraphOpened ) {
					data.push( { 'type': '/paragraph' } );
					paragraphOpened = false;
				}

				// Get the typeData for this node
				childTypeData = elementTypes[childName];
				if ( childTypeData ) {
					// Recurse into this node
					// Don't pass annotations through; we should never have those here
					// anyway because only leaves can have them.
					data = data.concat(
						ve.dm.HTMLConverter.prototype.convert( child, [], childTypeData )
					);
				} else {
					// Only reachable if elementTypes holds a falsy value for this name
					console.warn(
						'HTML DOM to linear model conversion error: unknown node with name ' +
						child.nodeName + ' and content ' + child.innerHTML
					);
				}
				break;
			case Node.TEXT_NODE:
				// If we have text within a node that can't hold content, open a paragraph
				if ( !paragraphOpened && !typeData.canContainContent ) {
					data.push( { 'type': 'paragraph' } );
					paragraphOpened = true;
				}
				// Annotate the text and output it
				data = data.concat(
					ve.dm.HTMLConverter.generateAnnotatedContent( child.data, annotations )
				);
				break;
			case Node.COMMENT_NODE:
				// Comment, do nothing
				break;
			default:
				console.warn(
					'HTML DOM to linear model conversion error: unknown node of type ' +
					child.nodeType + ' with content ' + child.innerHTML
				);
		}
	}

	// Close the last paragraph, if still open
	if ( paragraphOpened ) {
		data.push( { 'type': '/paragraph' } );
	}

	// Closing element matching the opening element pushed at the top
	if ( typeData.type ) {
		data.push( { 'type': '/' + typeData.type } );
	}
	return data;
};
|
|
|
|
|
|
|
|
/**
 * Recursively convert a linear model array to HTML DOM.
 *
 * Placeholder for the reverse of convert(). It is not yet written: the function takes no
 * arguments, does nothing and returns undefined.
 */
ve.dm.HTMLConverter.prototype.unconvert = function() {
	// TODO: Implement
};
|