mediawiki-extensions-Visual.../modules/ve/dm/ve.dm.Converter.js
Catrope 7319038ed6 Strip generated <p> tags in dataToDom
domToData wraps bare content in paragraph elements, which were then
converted to <p> tags by dataToDom. With this fix, HTML with "missing"
<p> tags actually round-trips through the editor correctly now, rather
than having <p> tags added wherever VE believes they should exist.

* Mark generated paragraph elements with .internal.generated = 'wrapper'
** This signifies the wrapper was generated but its contents were not,
   so the right thing to do when converting back to HTML is to remove
   the wrapper and keep the contents. We might want to use other values
   of generated in the future.
* Unwrap nodes with generated=wrapper when converting to HTML

Tests:
* Add 'generated': 'wrapper' as appropriate. Only affects 1 test
* Remove 'normalizedHtml' for this test because it is no longer needed
** Need to keep 'normalizedHtml' for now because we normalize hrefs
* Eventually the main example should test bare content, but that
  requires touching a lot of stuff. The main example could use some
  beefing up anyway.

Change-Id: I277ad5fe3f64e07c1bbf49007d6bbaecc90b7466
2012-08-16 16:09:28 -07:00

628 lines
21 KiB
JavaScript

/**
* VisualEditor data model Converter class.
*
* @copyright 2011-2012 VisualEditor Team and others; see AUTHORS.txt
* @license The MIT License (MIT); see LICENSE.txt
*/
/**
 * Converter between an HTML DOM and VisualEditor linear data.
 *
 * The converter keeps lookup tables of conversion functions and fills them in as node and
 * annotation types are registered with the given factories.
 *
 * @class
 * @constructor
 * @param {Object} nodeFactory Factory whose 'register' events are handled by onNodeRegister
 * @param {Object} annotationFactory Factory whose 'register' events are handled by
 *  onAnnotationRegister
 */
ve.dm.Converter = function ( nodeFactory, annotationFactory ) {
	var converter = this;

	// Factories
	converter.nodeFactory = nodeFactory;
	converter.annotationFactory = annotationFactory;

	// Conversion lookup tables, populated by the register handlers
	converter.elements = {
		'toDomElement': {},
		'toDataElement': {},
		'dataElementTypes': {}
	};
	converter.annotations = {
		'toDomElement': {},
		'toDataAnnotation': {}
	};

	// Listen for registrations on both factories
	converter.nodeFactory.addListenerMethod( converter, 'register', 'onNodeRegister' );
	converter.annotationFactory.addListenerMethod( converter, 'register', 'onAnnotationRegister' );
};
/* Static Methods */
/**
 * Turn a string into linear model content, optionally annotating every character.
 *
 * Without annotations the result is a plain array of one-character strings. With annotations
 * each item is a [character, annotationMap] pair, where annotationMap is keyed by the hash of
 * each annotation.
 *
 * Note that an empty 'data' property is deleted from the passed annotation objects before they
 * are hashed.
 *
 * @param {String} text Plain text to convert
 * @param {Array} [annotations] Array of annotation objects to apply
 * @returns {Array} Linear model data, one element per character
 */
ve.dm.Converter.getDataContentFromText = function ( text, annotations ) {
	var index, annotation,
		result = text.split( '' ),
		sharedMap = {};

	// Nothing to apply: the bare characters are the content
	if ( !annotations || annotations.length === 0 ) {
		return result;
	}

	// Index the annotations by hash
	for ( index = 0; index < annotations.length; index++ ) {
		annotation = annotations[index];
		if ( annotation.data && ve.isEmptyObject( annotation.data ) ) {
			// Drop an empty data property so it does not take part in the hash
			delete annotation.data;
		}
		sharedMap[ve.getHash( annotation )] = annotation;
	}

	// Pair every character with its own shallow copy of the map. Annotation objects are treated
	// as immutable and may be shared, but the maps themselves are mutable, so sharing a single
	// map would make a change to one character's annotations show up on all the others.
	for ( index = 0; index < result.length; index++ ) {
		result[index] = [result[index], ve.extendObject( {}, sharedMap )];
	}
	return result;
};
/* Methods */
/**
 * Handle a 'register' event from the node factory by recording the node's converters.
 *
 * Special nodes (such as document, alienInline, alienBlock and text) must set their static
 * {converters} property to null; this distinguishes them from a new node type whose author has
 * simply forgotten to implement converters, which is reported as an error.
 *
 * @method
 * @param {String} dataElementType Node type
 * @param {Function} constructor Node constructor
 * @throws 'Missing conversion data in node implementation of {type}'
 */
ve.dm.Converter.prototype.onNodeRegister = function ( dataElementType, constructor ) {
	var index, tagNames, toDom, toData, tagName;

	if ( constructor.converters === undefined ) {
		throw new Error( 'Missing conversion data in node implementation of ' + dataElementType );
	}
	if ( constructor.converters === null ) {
		// Special node, nothing to register
		return;
	}

	tagNames = constructor.converters.domElementTypes;
	toDom = constructor.converters.toDomElement;
	toData = constructor.converters.toDataElement;

	// Data element type -> DOM converter
	this.elements.toDomElement[dataElementType] = toDom;
	// DOM element type -> data converter and data element type
	for ( index = 0; index < tagNames.length; index++ ) {
		tagName = tagNames[index];
		this.elements.toDataElement[tagName] = toData;
		this.elements.dataElementTypes[tagName] = dataElementType;
	}
};
/**
 * Handle a 'register' event from the annotation factory by recording the annotation's
 * converters.
 *
 * A constructor whose static {converters} property is null is skipped; one that leaves it
 * undefined is reported as an error.
 *
 * @method
 * @param {String} dataElementType Base annotation type
 * @param {Function} constructor Annotation constructor
 * @throws 'Missing conversion data in annotation implementation of {type}'
 */
ve.dm.Converter.prototype.onAnnotationRegister = function ( dataElementType, constructor ) {
	var index, tagNames, toDom, toAnnotation;

	if ( constructor.converters === undefined ) {
		throw new Error( 'Missing conversion data in annotation implementation of ' + dataElementType );
	}
	if ( constructor.converters === null ) {
		// Nothing to register
		return;
	}

	tagNames = constructor.converters.domElementTypes;
	toDom = constructor.converters.toDomElement;
	toAnnotation = constructor.converters.toDataAnnotation;

	// Base annotation type -> DOM converter
	this.annotations.toDomElement[dataElementType] = toDom;
	// DOM element type -> annotation converter
	for ( index = 0; index < tagNames.length; index++ ) {
		this.annotations.toDataAnnotation[tagNames[index]] = toAnnotation;
	}
};
/**
 * Build the DOM element that corresponds to a linear model element.
 *
 * The toDomElement function registered for the element's type does the conversion; afterwards
 * every 'html/*' attribute of the data element is copied onto the DOM element with the 'html/'
 * prefix removed.
 *
 * NOTE: alienBlock and alienInline elements are not supported; passing one makes this function
 * return false, as does any type without a registered converter.
 *
 * @method
 * @param {Object} dataElement Linear model element
 * @returns {HTMLElement|false} DOM element, or false if this element cannot be converted
 */
ve.dm.Converter.prototype.getDomElementFromDataElement = function ( dataElement ) {
	var name, element, attributes,
		type = dataElement.type;

	// Aliens are never converted here
	if ( type === 'alienInline' || type === 'alienBlock' ) {
		return false;
	}
	// Neither are types nobody registered a converter for
	if ( !( type in this.elements.toDomElement ) ) {
		return false;
	}

	element = this.elements.toDomElement[type]( type, dataElement );
	attributes = dataElement.attributes;
	if ( !attributes ) {
		return element;
	}
	for ( name in attributes ) {
		if ( name.indexOf( 'html/' ) !== 0 ) {
			// Not a preserved HTML attribute
			continue;
		}
		// Strip the 'html/' prefix to recover the original attribute name
		element.setAttribute( name.substr( 5 ), attributes[name] );
	}
	return element;
};
/**
 * Build the linear model element that corresponds to a DOM element.
 *
 * Elements carrying a data-mw-gc attribute, and elements whose tag name has no registered
 * toDataElement function, are not converted. Otherwise the registered function builds the data
 * element, and every attribute of the DOM element is stored on it under 'html/' + name.
 *
 * @method
 * @param {HTMLElement} domElement DOM element
 * @returns {Object|false} Linear model element, or false if this node cannot be converted
 */
ve.dm.Converter.prototype.getDataElementFromDomElement = function ( domElement ) {
	var result, index, attribute, sourceAttributes, targetAttributes,
		tagName = domElement.nodeName.toLowerCase();

	// Generated elements
	if ( domElement.hasAttribute( 'data-mw-gc' ) ) {
		return false;
	}
	// Unsupported elements
	if ( !( tagName in this.elements.toDataElement ) ) {
		return false;
	}

	result = this.elements.toDataElement[tagName]( tagName, domElement );
	sourceAttributes = domElement.attributes;
	if ( sourceAttributes.length ) {
		if ( !result.attributes ) {
			result.attributes = {};
		}
		targetAttributes = result.attributes;
		// Include all attributes and prepend 'html/' to each attribute name
		for ( index = 0; index < sourceAttributes.length; index++ ) {
			attribute = sourceAttributes[index];
			targetAttributes['html/' + attribute.name] = attribute.value;
		}
	}
	return result;
};
/**
 * Build an annotation object for an HTML DOM node, if the node represents an annotation.
 *
 * A node represents an annotation when a toDataAnnotation function is registered for its
 * lower-cased node name; that function's return value is passed through unchanged.
 *
 * @example Annotation Object
 * { 'type': 'type', data: { 'key': 'value', ... } }
 *
 * @param {HTMLElement} domElement HTML DOM node
 * @returns {Object|false} Annotation object, or false if this node is not an annotation
 */
ve.dm.Converter.prototype.getDataAnnotationFromDomElement = function ( domElement ) {
	var tagName = domElement.nodeName.toLowerCase(),
		convert = this.annotations.toDataAnnotation[tagName];

	return typeof convert === 'function' ? convert( tagName, domElement ) : false;
};
/**
 * Build an HTML DOM node for a linear model annotation.
 *
 * The annotation type is split at the first '/': the part before it selects the registered
 * toDomElement function, and the remainder (possibly empty) is handed to that function as the
 * sub-type.
 *
 * @method
 * @param {Object} dataAnnotation Annotation object
 * @returns {HTMLElement|false} HTML DOM node, or false if this annotation is not known
 */
ve.dm.Converter.prototype.getDomElementFromDataAnnotation = function ( dataAnnotation ) {
	var convert,
		segments = dataAnnotation.type.split( '/' ),
		baseType = segments.shift(),
		subType = segments.join( '/' );

	convert = this.annotations.toDomElement[baseType];
	if ( typeof convert !== 'function' ) {
		return false;
	}
	return convert( subType, dataAnnotation );
};
/**
 * Convert an HTML DOM tree to a linear model.
 *
 * Bare content (text or annotated content sitting directly in a branch that cannot contain
 * content) is wrapped in a paragraph marked with internal.generated = 'wrapper', so the wrapper
 * can be stripped again when converting back to HTML. Elements with a data-mw-gc attribute and
 * elements that cannot be converted become alien nodes holding their serialized HTML.
 *
 * Do not use the annotations, dataElement, path and alreadyWrapped parameters, they're used for
 * internal recursion only.
 *
 * @method
 * @param {HTMLElement} domElement Wrapper div containing the HTML to convert
 * @param {Array} [annotations] Array of annotations (objects) to apply to the generated data
 * @param {Object} [dataElement] Data element to wrap the returned data in
 * @param {Array} [path] Array of linear model element types
 * @param {Boolean} [alreadyWrapped] Whether a calling level has already opened a generated
 *  wrapper paragraph around the content being converted
 * @returns {Array} Linear model data
 */
ve.dm.Converter.prototype.getDataFromDom = function ( domElement, annotations, dataElement, path, alreadyWrapped ) {
	// Build the open/close pair of an alien node that stores the element's outer HTML
	function createAlien( domElement, isInline ) {
		var type = isInline ? 'alienInline' : 'alienBlock';
		return [
			{
				'type': type,
				'attributes': {
					'html': $( '<div>' ).append( $( domElement ).clone() ).html()
				}
			},
			{ 'type': '/' + type }
		];
	}

	// Build an opening paragraph marked as generated by us, so it can be stripped on the way out
	function createGeneratedParagraph() {
		return {
			'type': 'paragraph',
			'internal': { 'generated': 'wrapper' }
		};
	}

	// Record stripped whitespace in element.internal.whitespace[index]
	function storeWhitespace( element, index, whitespace ) {
		if ( !element.internal ) {
			element.internal = {};
		}
		if ( !element.internal.whitespace ) {
			element.internal.whitespace = [];
		}
		element.internal.whitespace[index] = whitespace;
	}

	var i, childDomElement, annotation, childDataElement, text, childTypes, matches, paragraph,
		branchType, branchIsContent,
		wrapperElement = dataElement,
		data = [],
		wrapping = false;

	// Fallback to defaults
	annotations = annotations || [];
	path = path || ['document'];
	branchType = path[path.length - 1];
	branchIsContent = ve.dm.nodeFactory.canNodeContainContent( branchType );

	// Open element
	if ( dataElement ) {
		data.push( dataElement );
	}
	// Add contents
	for ( i = 0; i < domElement.childNodes.length; i++ ) {
		childDomElement = domElement.childNodes[i];
		switch ( childDomElement.nodeType ) {
			case Node.ELEMENT_NODE:
				// Detect generated content and wrap it in an alien node
				if ( childDomElement.hasAttribute( 'data-mw-gc' ) ) {
					// FIXME Parsoid outputs RDFa now, address this in API rewrite
					data = data.concat( createAlien( childDomElement, branchIsContent ) );
					break;
				}
				// Detect and handle annotated content
				annotation = this.getDataAnnotationFromDomElement( childDomElement );
				if ( annotation ) {
					// Start auto-wrapping of bare content
					if ( !wrapping && !alreadyWrapped && !branchIsContent ) {
						data.push( createGeneratedParagraph() );
						wrapping = true;
					}
					// Append child element data; the path is passed on unchanged because
					// annotations do not add a level to the linear model
					data = data.concat(
						this.getDataFromDom(
							childDomElement,
							annotations.concat( annotation ),
							undefined,
							path,
							wrapping || alreadyWrapped
						)
					);
					break;
				}
				// End auto-wrapping of bare content
				if ( wrapping ) {
					data.push( { 'type': '/paragraph' } );
					wrapping = false;
				}
				// Append child element data
				childDataElement = this.getDataElementFromDomElement( childDomElement );
				if ( childDataElement ) {
					data = data.concat(
						this.getDataFromDom(
							childDomElement,
							[],
							childDataElement,
							path.concat( childDataElement.type ),
							wrapping || alreadyWrapped
						)
					);
					break;
				}
				// We don't know what this is, fall back to alien
				data = data.concat( createAlien( childDomElement, branchIsContent ) );
				break;
			case Node.TEXT_NODE:
				// HACK: strip trailing newline in <li> tags. Workaround for a Parsoid bug
				// TODO kill this in favor of the fringe whitespace preservation
				// code below
				text = childDomElement.data;
				if ( domElement.nodeName.toLowerCase() === 'li' ) {
					text = text.replace( /\n$/, '' );
				}
				if ( !branchIsContent ) {
					// If it's bare content, strip leading and trailing newlines
					// FIXME these newlines should be preserved somehow
					text = text.replace( /^\n+/, '' ).replace( /\n+$/, '' );
				}
				if ( text === '' ) {
					// Don't produce an empty text node or an empty paragraph
					break;
				}
				// Start auto-wrapping of bare content
				if ( !wrapping && !alreadyWrapped && !branchIsContent ) {
					paragraph = createGeneratedParagraph();
					data.push( paragraph );
					wrapping = true;
					// Whitespace stripped below belongs to the new wrapper
					wrapperElement = paragraph;
				}
				// Strip leading and trailing whitespace
				// (but only in non-annotation nodes)
				// and store it so it can be restored later.
				// whitespace = [ outerPre, innerPre, innerPost, outerPost ]
				//          <tag>        text         </tag>          <nextTag>
				// ^^^^^^^^^^    ^^^^^^^^^    ^^^^^^^^^^^     ^^^^^^^^^^
				//  outerPre      innerPre     innerPost       outerPost
				if ( annotations.length === 0 && i === 0 && wrapperElement ) {
					// Strip leading whitespace from the first child
					matches = text.match( /^\s+/ );
					if ( matches && matches[0] !== '' ) {
						storeWhitespace( wrapperElement, 1, matches[0] );
						text = text.substring( matches[0].length );
					}
				}
				if (
					annotations.length === 0 &&
					i === domElement.childNodes.length - 1 &&
					wrapperElement
				) {
					// Strip trailing whitespace from the last child
					matches = text.match( /\s+$/ );
					if ( matches && matches[0] !== '' ) {
						storeWhitespace( wrapperElement, 2, matches[0] );
						text = text.substring( 0, text.length - matches[0].length );
					}
				}
				// Annotate the text and output it
				data = data.concat(
					ve.dm.Converter.getDataContentFromText( text, annotations )
				);
				break;
			case Node.COMMENT_NODE:
				// TODO: Preserve comments by inserting them into the linear model too
				// Could use placeholders for this too, although they'd need to be
				// inline in certain cases
				break;
		}
	}
	// End auto-wrapping of bare content
	if ( wrapping ) {
		data.push( { 'type': '/paragraph' } );
	}
	// If we're closing a node that doesn't have any children, but could contain a paragraph,
	// add a paragraph. This prevents things like empty list items
	childTypes = ve.dm.nodeFactory.getChildNodeTypes( branchType );
	if ( branchType !== 'paragraph' && dataElement && data[data.length - 1] === dataElement &&
		!wrapping && !ve.dm.nodeFactory.canNodeContainContent( branchType ) &&
		!ve.dm.nodeFactory.isNodeContent( branchType ) &&
		( childTypes === null || ve.indexOf( 'paragraph', childTypes ) !== -1 )
	) {
		data.push( createGeneratedParagraph() );
		data.push( { 'type': '/paragraph' } );
	}
	// Close element
	if ( dataElement ) {
		data.push( { 'type': '/' + dataElement.type } );
	}
	// Don't return an empty document. This must not fire for recursive calls made for an empty
	// annotation element directly under the document: those also see branchType 'document'
	// (the path is not extended for annotations) but the caller has already opened a generated
	// wrapper, so adding another paragraph here would nest a paragraph inside a paragraph.
	if ( branchType === 'document' && data.length === 0 && !alreadyWrapped ) {
		return [
			createGeneratedParagraph(),
			{ 'type': '/paragraph' }
		];
	}
	return data;
};
/**
 * Convert linear model data to an HTML DOM
 *
 * Plain text becomes text nodes, annotated content is nested inside the DOM elements of its
 * annotations, alien elements are re-created from their stored HTML, and every other element is
 * converted with getDomElementFromDataElement. When an element is closed, any inner whitespace
 * stored in its internal data is restored, and paragraphs marked with
 * internal.generated = 'wrapper' are replaced by their contents.
 *
 * @method
 * @param {Array} data Linear model data
 * @returns {HTMLElement} Wrapper div containing the resulting HTML
 */
ve.dm.Converter.prototype.getDomFromData = function ( data ) {
	var text, i, annotations, hash, annotationElement, done, dataElement, wrapper,
		childDomElement, pre, post, parentDomElement,
		container = document.createElement( 'div' ),
		domElement = container,
		annotationStack = {}; // { hash: DOMnode }

	// Whether an item is an annotated character or an annotated content element. Items past the
	// end of the data (undefined) are not, so annotated content may be the last thing in data.
	function isAnnotatedContent( item ) {
		return item !== undefined && (
			ve.isArray( item ) ||
			(
				item.annotations !== undefined &&
				ve.dm.nodeFactory.isNodeContent( item.type )
			)
		);
	}

	// Append the text gathered so far to the current element, if there is any
	function flushText() {
		if ( text.length > 0 ) {
			domElement.appendChild( document.createTextNode( text ) );
			text = '';
		}
	}

	for ( i = 0; i < data.length; i++ ) {
		if ( typeof data[i] === 'string' ) {
			// Text
			text = '';
			// Continue forward as far as the plain text goes
			while ( typeof data[i] === 'string' ) {
				text += data[i];
				i++;
			}
			// i points to the first non-text thing, go back one so we don't skip this later
			i--;
			// Add text
			domElement.appendChild( document.createTextNode( text ) );
		} else if ( isAnnotatedContent( data[i] ) ) {
			// Annotated text
			text = '';
			while ( isAnnotatedContent( data[i] ) ) {
				annotations = data[i].annotations || data[i][1];
				// Check for closed annotations
				for ( hash in annotationStack ) {
					if ( !( hash in annotations ) ) {
						// It's closed
						// Traverse up until we hit the node we need to close, and then
						// traverse up one more time to close that node
						done = false;
						while ( !done ) {
							done = domElement === annotationStack[hash];
							// Remove the annotation from the stack
							delete annotationStack[domElement.veAnnotationHash];
							// Remove the temporary veAnnotationHash property
							delete domElement.veAnnotationHash;
							// Add text if needed
							flushText();
							// Traverse up
							domElement = domElement.parentNode;
						}
					}
				}
				// Check for opened annotations
				for ( hash in annotations ) {
					if ( !( hash in annotationStack ) ) {
						// It's opened
						annotationElement = this.getDomElementFromDataAnnotation( annotations[hash] );
						// Temporary property, will remove this when closing the annotation
						annotationElement.veAnnotationHash = hash;
						// Add to the annotation stack
						annotationStack[hash] = annotationElement;
						// Add text if needed
						flushText();
						// Attach new node and descend into it
						domElement.appendChild( annotationElement );
						domElement = annotationElement;
					}
				}
				if ( data[i].annotations === undefined ) {
					text += data[i][0];
				} else {
					// Add text if needed
					flushText();
					// Insert the element
					domElement.appendChild( this.getDomElementFromDataElement( data[i] ) );
					// Increment i once more so we skip over the closing as well
					i++;
				}
				i++;
			}
			// We're now at the first non-annotated thing, go back one so we don't skip this later
			i--;
			// Add any gathered text
			flushText();
			// Close any remaining annotation nodes
			while ( domElement.veAnnotationHash !== undefined ) {
				delete annotationStack[domElement.veAnnotationHash];
				delete domElement.veAnnotationHash;
				domElement = domElement.parentNode;
			}
		} else if ( data[i].type !== undefined ) {
			dataElement = data[i];
			// Element
			if ( dataElement.type === 'alienBlock' || dataElement.type === 'alienInline' ) {
				// Create nodes from source
				wrapper = document.createElement( 'div' );
				wrapper.innerHTML = dataElement.attributes.html;
				// Add element - adds all child elements, but there really should only be 1
				while ( wrapper.firstChild ) {
					domElement.appendChild( wrapper.firstChild );
				}
				// Make sure the alien closing is skipped
				i++;
			} else if ( dataElement.type.charAt( 0 ) === '/' ) {
				// Process inner whitespace
				// whitespace = [ outerPre, innerPre, innerPost, outerPost ]
				if ( domElement.veInternal && domElement.veInternal.whitespace ) {
					pre = domElement.veInternal.whitespace[1];
					if ( pre ) {
						// firstChild is null when the element is empty (its content consisted
						// of nothing but the stripped whitespace), so check it before use
						if ( domElement.firstChild && domElement.firstChild.nodeType === 3 ) {
							// First child is a TextNode, prepend to it
							domElement.firstChild.insertData( 0, pre );
						} else {
							// Prepend a TextNode; a null reference node makes this an append
							domElement.insertBefore(
								document.createTextNode( pre ),
								domElement.firstChild
							);
						}
					}
					post = domElement.veInternal.whitespace[2];
					if ( post ) {
						if ( domElement.lastChild && domElement.lastChild.nodeType === 3 ) {
							// Last child is a TextNode, append to it
							domElement.lastChild.appendData( post );
						} else {
							// Append a TextNode
							domElement.appendChild(
								document.createTextNode( post )
							);
						}
					}
				}
				// If closing a generated wrapper node, unwrap it
				// It would be nicer if we could avoid generating in the first
				// place, but then remembering where we have to skip ascending
				// to the parent would be tricky.
				parentDomElement = domElement.parentNode;
				if ( domElement.veInternal && domElement.veInternal.generated === 'wrapper' ) {
					// childNodes is a live list that shrinks as nodes are moved out, so
					// always take the first remaining child rather than stepping an index
					// (which would skip, and then lose, every other child)
					while ( domElement.firstChild ) {
						parentDomElement.insertBefore(
							domElement.firstChild,
							domElement
						);
					}
					parentDomElement.removeChild( domElement );
				}
				// Clean up the internal data
				delete domElement.veInternal;
				// Ascend to parent node
				domElement = parentDomElement;
			} else {
				// Create node from data
				childDomElement = this.getDomElementFromDataElement( dataElement );
				// Add reference to internal data
				if ( dataElement.internal ) {
					childDomElement.veInternal = dataElement.internal;
				}
				// Add element
				domElement.appendChild( childDomElement );
				// Descend into child node
				domElement = childDomElement;
			}
		}
	}
	return container;
};
/* Initialization */
// Shared converter instance, constructed with the global node and annotation factories; the
// constructor subscribes it to their 'register' events.
ve.dm.converter = new ve.dm.Converter( ve.dm.nodeFactory, ve.dm.annotationFactory );