mediawiki-extensions-Visual.../modules/parser/mediawiki.LinearModelConverter.js
Gabriel Wicke 72c5efedb8 Rename data-rt to data-parsoid
* Quite unique according to google, and more obvious
* Also adjust parserTests to ignore mw:Nowiki and mw:Placeholder spans

Change-Id: I340e85092b60a65b4053a40bf8c238e26cb49c96
2012-07-26 15:27:32 -07:00

305 lines
10 KiB
JavaScript

/**
* HTML DOM -> linear model converter
*
* @author Roan Kattouw
*/
/**
 * Public entry point: convert an HTML DOM tree to a linear model.
 *
 * Delegates to convertHTMLNode() with the root node only, so the conversion
 * starts with no annotations and no element type information.
 *
 * @param {Object} node Root node of the HTML DOM
 * @returns {Array} Linear model
 */
function ConvertDOMToLM( node ) {
    var linearModel = convertHTMLNode( node );
    return linearModel;
}

// Expose the converter when a CommonJS-style `module` object is present.
if ( typeof module === 'object' ) {
    module.exports.ConvertDOMToLM = ConvertDOMToLM;
}
/*** Private functions and variables ***/
// Quick HACK: define the DOM nodeType constants locally.
// https://developer.mozilla.org/en/nodeType
// The names are listed in nodeType order; each constant's value is its
// 1-based position in the list (ELEMENT_NODE is 1, NOTATION_NODE is 12).
var Node = ( function () {
    var orderedNames = [
            'ELEMENT_NODE',
            'ATTRIBUTE_NODE',
            'TEXT_NODE',
            'CDATA_SECTION_NODE',
            'ENTITY_REFERENCE_NODE',
            'ENTITY_NODE',
            'PROCESSING_INSTRUCTION_NODE',
            'COMMENT_NODE',
            'DOCUMENT_NODE',
            'DOCUMENT_TYPE_NODE',
            'DOCUMENT_FRAGMENT_NODE',
            'NOTATION_NODE'
        ],
        constants = {},
        position;

    for ( position = 0; position < orderedNames.length; position += 1 ) {
        constants[ orderedNames[position] ] = position + 1;
    }
    return constants;
}() );
/**
 * Table mapping lower-cased HTML DOM node names to linear model element
 * descriptors.
 *
 * Each descriptor has:
 *   leafNode: if true, the linear model element is a leaf node (it holds
 *             content directly; element children are not descended into),
 *             if false, it is a branch node (it holds child elements, and
 *             loose content is wrapped in a paragraph)
 *   type: Element type
 *   attributes: Additional attributes to set for this element (optional)
 */
var elementTypes = ( function () {
    // Build one descriptor; the attributes key is only present when given.
    function describe( isLeaf, type, attributes ) {
        var descriptor = { leafNode: isLeaf, type: type };
        if ( attributes ) {
            descriptor.attributes = attributes;
        }
        return descriptor;
    }

    function leaf( type, attributes ) {
        return describe( true, type, attributes );
    }

    function branch( type ) {
        return describe( false, type );
    }

    function heading( level ) {
        return leaf( 'heading', { level: level } );
    }

    return {
        'p': leaf( 'paragraph' ),
        'h1': heading( 1 ),
        'h2': heading( 2 ),
        'h3': heading( 3 ),
        'h4': heading( 4 ),
        'h5': heading( 5 ),
        'h6': heading( 6 ),
        'li': branch( 'listItem' ),
        'dt': branch( 'listItem' ),
        'dd': branch( 'listItem' ),
        'pre': leaf( 'pre' ),
        'table': branch( 'table' ),
        'tbody': branch( 'tbody' ), // not in linear model!
        'tr': branch( 'tableRow' ),
        'th': branch( 'tableHeading' ),
        'td': branch( 'tableCell' ),
        'caption': branch( 'caption' ),
        'hr': leaf( 'horizontalRule' ),
        'ul': branch( 'list' ), // TODO add attribute for 'bullet'
        'ol': branch( 'list' ), // TODO add attribute for 'number'
        'dl': branch( 'list' ), // TODO add attribute for ???
        //XXX: center is block-level in HTML, not sure what it should be in the linear model...
        'center': branch( 'center' ),
        //XXX: blockquote is block-level in HTML, not sure what it should be in the linear model...
        'blockquote': branch( 'blockquote' ),
        //XXX: what should 'div' be in the linear model?
        'div': branch( 'div' )
        // Missing types:
        // br
        // img
    };
}() );
/**
 * Table mapping lower-cased HTML DOM node names to linear model annotation
 * types. An entry is either the annotation type string itself, or a function
 * that receives the HTML DOM node and returns the type string.
 */
var annotationTypes = {
    'i': 'textStyle/italic',
    'b': 'textStyle/bold',
    'small': 'textStyle/small',
    'span': 'textStyle/span',
    'a': function ( node ) {
        // FIXME the parser currently doesn't output this data this way
        // Internal links get linkType:'internal' in the data-parsoid attrib,
        // external links get nothing
        var linkType = node.getAttribute( 'data-type' );
        // A missing or empty data-type falls back to 'unknown'
        return 'link/' + ( linkType || 'unknown' );
    },
    'template': 'object/template',
    'ref': 'object/hook',
    'includeonly': 'object/includeonly'
};
/**
 * Names of HTML DOM attributes that convertAttributes() copies into the
 * linear model under their own name, rather than prefixing them with 'html/'.
 *
 * convertAttributes() only consults this list for attributes whose name does
 * not start with 'data-'.
 */
var attributeWhitelist = [ 'title' ]; //XXX: add href?
/**
 * Translate the attributes of an HTML DOM node into a linear model attribute map.
 *
 * Every attribute is handled by the first rule that matches its name:
 * - 'data-json-<key>' is stored as '<key>', with its value JSON-decoded
 * - 'data-<key>' is stored as '<key>', with its value unchanged
 * - a name listed in attributeWhitelist is stored under that same name
 * - any other name is stored as 'html/<name>'
 *
 * @param {Object} node HTML DOM node
 * @returns {Object} Converted attribute map; empty when the node has no
 *  attributes collection
 * @throws {SyntaxError} If a data-json- attribute does not hold valid JSON
 */
function convertAttributes( node ) {
    var jsonPrefix = 'data-json-',
        dataPrefix = 'data-',
        converted = {},
        list = node.attributes,
        index, current, key;

    if ( !list ) {
        return converted;
    }

    for ( index = 0; index < list.length; index += 1 ) {
        current = list.item( index );
        key = current.name;

        // The longer prefix must be tested first: every 'data-json-' name
        // also starts with 'data-'.
        if ( key.slice( 0, jsonPrefix.length ) === jsonPrefix ) {
            converted[ key.slice( jsonPrefix.length ) ] = JSON.parse( current.value );
        } else if ( key.slice( 0, dataPrefix.length ) === dataPrefix ) {
            converted[ key.slice( dataPrefix.length ) ] = current.value;
        } else if ( attributeWhitelist.indexOf( key ) !== -1 ) {
            converted[ key ] = current.value;
        } else {
            converted[ 'html/' + key ] = current.value;
        }
    }

    return converted;
}
/**
 * Check if an HTML DOM node represents an annotation, and if so, build an
 * annotation object for it.
 *
 * The annotation object looks like {type: 'type', data: {'attrKey': 'attrValue', ...}}
 * where type comes from annotationTypes and data from convertAttributes().
 *
 * @param {Object} node HTML DOM node
 * @returns {Object|false} Annotation object, or false if this node is not an annotation
 */
function getAnnotation( node ) {
    var name = node.nodeName.toLowerCase(),
        type;

    // Only consult the table's own keys. A plain lookup would also find
    // properties inherited from Object.prototype (e.g. 'constructor'), which
    // would turn an element with such a name into a bogus annotation.
    if ( !Object.prototype.hasOwnProperty.call( annotationTypes, name ) ) {
        return false;
    }

    type = annotationTypes[name];
    if ( !type ) {
        // Not an annotation
        return false;
    }
    if ( typeof type === 'function' ) {
        // The table entry computes the type from the node itself
        type = type( node );
    }

    return {
        type: type,
        data: convertAttributes( node )
    };
}
/**
 * Turn a string into linear model data, attaching annotations when given.
 *
 * Without annotations the result is the array of single-character strings.
 * With annotations, every character becomes an array holding the character
 * followed by the annotation objects; the same annotation objects are shared
 * by all characters. The string is split with split( '' ), i.e. per UTF-16
 * code unit.
 *
 * @param {String} content String to convert
 * @param {Array} annotations Array of annotation objects to apply
 * @returns {Array} Linear model data, one element per character
 */
function generateAnnotatedContent( content, annotations ) {
    var characters = content.split( '' );

    if ( annotations && annotations.length !== 0 ) {
        return characters.map( function ( character ) {
            return [ character ].concat( annotations );
        } );
    }
    // Nothing to attach: plain characters are already valid linear model data
    return characters;
}
/**
 * Recursively convert an HTML DOM node to a linear model array. This is the
 * main function.
 *
 * Child nodes are handled by node type:
 * - annotation elements (see getAnnotation()) are recursed into as leaf
 *   nodes, with their annotation added to the ones already in effect
 * - other elements known to elementTypes are recursed into with that type
 *   data; unknown elements are skipped with a warning
 * - text nodes are emitted as (possibly annotated) characters
 * - comments are ignored; any other node type is skipped with a warning
 *
 * @param {Object} node HTML DOM node
 * @param {Array} [annotations] Annotations to apply. Only used for recursion.
 * @param {Object} [typeData] Information about the linear model element type corresponding to this node.
 *  Only used for recursion. This data usually comes from elementTypes. All keys are optional. Keys are:
 *   type: If set, add an opening and a closing element with this type and with the DOM node's attributes
 *   attributes: If set and type is set, these attributes will be added to the element's attributes
 *   leafNode: If set and set to true, this element is supposed to be a leaf node in the linear model.
 *             This means that any content in this node will be put in the output directly rather than
 *             being wrapped in paragraphs, and that any child nodes that are elements will not be
 *             descended into.
 * @returns {Array} Linear model data
 */
function convertHTMLNode( node, annotations, typeData ) {
    var data = [],
        paragraphOpened = false,
        hasOwn = Object.prototype.hasOwnProperty,
        i, key, child, childName, annotation, element, attributes, childTypeData;

    annotations = annotations || [];
    typeData = typeData || {};

    if ( typeData.type ) {
        element = { 'type': typeData.type };
        // convertAttributes() expects the node and reads node.attributes
        // itself; handing it the attribute collection would yield an empty
        // map and drop every DOM attribute of the element.
        attributes = convertAttributes( node );
        if ( typeData.attributes ) {
            // Copy into the fresh map so the shared type data is never modified
            for ( key in typeData.attributes ) {
                if ( hasOwn.call( typeData.attributes, key ) ) {
                    attributes[key] = typeData.attributes[key];
                }
            }
        }
        if ( Object.keys( attributes ).length > 0 ) {
            element.attributes = attributes;
        }
        data.push( element );
    }

    for ( i = 0; i < node.childNodes.length; i++ ) {
        child = node.childNodes[i];
        switch ( child.nodeType ) {
            case Node.ELEMENT_NODE:
                // Check if this is an annotation
                annotation = getAnnotation( child );
                if ( annotation ) {
                    // If we have annotated text within a branch node, open a paragraph
                    // Leaf nodes don't need this because they're allowed to contain content
                    if ( !paragraphOpened && !typeData.leafNode ) {
                        data.push( { 'type': 'paragraph' } );
                        paragraphOpened = true;
                    }
                    // Recurse into this node
                    data = data.concat( convertHTMLNode( child,
                        annotations.concat( [ annotation ] ),
                        { leafNode: true }
                    ) );
                    break;
                }
                if ( typeData.leafNode ) {
                    // We've found an element node *inside* a leaf node.
                    // This is illegal, so warn and skip it
                    console.warn( 'HTML DOM to linear model conversion error: ' +
                        'found element node (' + child.nodeName + ') inside ' +
                        'leaf node (' + node.nodeName + ')' );
                    break;
                }
                // Close the last paragraph, if still open
                if ( paragraphOpened ) {
                    data.push( { 'type': '/paragraph' } );
                    paragraphOpened = false;
                }
                // Get the typeData for this node. Only own keys count, so a
                // tag named like an Object.prototype member (e.g.
                // 'constructor') is reported as unknown.
                childName = child.nodeName.toLowerCase();
                childTypeData = hasOwn.call( elementTypes, childName ) ?
                    elementTypes[childName] : undefined;
                if ( childTypeData ) {
                    // Recurse into this node
                    // Don't pass annotations through; we should never have those here
                    // anyway because only leaves can have them.
                    data = data.concat( convertHTMLNode( child, [], childTypeData ) );
                } else {
                    console.warn( 'HTML DOM to linear model conversion error: unknown node with name ' +
                        child.nodeName + ' and content ' + child.innerHTML );
                }
                break;
            case Node.TEXT_NODE:
                // If we have text within a branch node, open a paragraph
                // Leaf nodes don't need this because they're allowed to contain content
                if ( !paragraphOpened && !typeData.leafNode ) {
                    data.push( { 'type': 'paragraph' } );
                    paragraphOpened = true;
                }
                // Annotate the text and output it
                data = data.concat( generateAnnotatedContent( child.data, annotations ) );
                break;
            case Node.COMMENT_NODE:
                // Comment, do nothing
                break;
            default:
                console.warn( 'HTML DOM to linear model conversion error: unknown node of type ' +
                    child.nodeType + ' with content ' + child.innerHTML );
        }
    }

    // Close the last paragraph, if still open
    if ( paragraphOpened ) {
        data.push( { 'type': '/paragraph' } );
    }
    if ( typeData.type ) {
        data.push( { 'type': '/' + typeData.type } );
    }
    return data;
}