mediawiki-extensions-Visual.../modules/parser/mediawiki.WikitextSerializer.js
Gabriel Wicke 5fbc80321b Improve newline handling for comments and nowiki/noinclude tags
* Added a newlineTransparent flag to handlers that prevents changes to the
  onNewline status, so that content following it is still considered to be in
  start-of-line context. This fixes a few rt tests where a comment or nowiki
  tag is at the start of the line, and following content should end up on the
  same line.
* 283 rt parser tests are now passing.

Change-Id: Ie58dcb9e5e9af9000fff61c2e1db5d8649ffc3f6
2012-06-18 22:56:41 +02:00

761 lines
19 KiB
JavaScript

/**
 * Serializes a chunk of tokens or an HTML DOM to MediaWiki's wikitext flavor.
 *
 * @class
 * @constructor
 * @param options {Object} Optional serialization options; they are copied
 *   into a fresh object stored as this.options.
 */
WikitextSerializer = function( options ) {
	// There are no constructor-level defaults yet, so this starts out empty.
	var constructorDefaults = {};
	this.options = $.extend( constructorDefaults, options || {} );
};
// Shorthand for the prototype; the serializer's methods are attached to it.
var WSP = WikitextSerializer.prototype;

// Initial serializer state. serializeTokens / serializeDOM merge this with the
// instance options to build the state object for a run.
WSP.defaultOptions = {
	// Counter raised by start handlers with a positive singleLine value and
	// lowered again by the matching end handlers.
	singleLineMode: 0,
	// Newlines that have been seen but not yet written to the output.
	availableNewlineCount: 0,
	lastHandler: null,
	// One { itemCount, bullets } entry per currently open list.
	listStack: [],
	// Whether the output position is at the start of a line.
	onNewline: true
};
/**
 * Build a handler that ignores the serializer state and always returns the
 * value it was created with. Used for tags that map to a fixed wikitext string.
 */
var id = function( fixedValue ) {
	return function( ignoredState ) {
		return fixedValue;
	};
};
/**
 * Start handler for list containers (bound to '*', '#' or '' in tagHandlers).
 *
 * Pushes a new { itemCount, bullets } entry onto state.listStack and returns
 * the bullet text to emit for the list start.
 *
 * @param bullet {String} Bullet character contributed by this list level
 * @param state {Object} Serializer state; listStack and prevToken are used
 * @param token {Object} The list start token
 * @returns {String} The full bullet prefix or just this level's bullet
 */
WSP._listHandler = function( bullet, state, token ) {
	// True for the start tag of a list item (li, dt or dd).
	function isListItemStart( tk ) {
		return tk.constructor === TagTk &&
			( tk.name === 'li' || tk.name === 'dt' || tk.name === 'dd' );
	}

	var openLists = state.listStack,
		parentList = openLists.length ? openLists[openLists.length - 1] : null,
		prefix = bullet,
		output = bullet;

	if ( parentList ) {
		prefix = parentList.bullets + bullet;
		// The nested list counts as an item of its parent.
		parentList.itemCount++;

		var needsFullPrefix =
			// deeply nested list
			parentList.itemCount > 2 ||
			// a nested list, not directly after a list item
			( parentList.itemCount > 1 && !isListItemStart( state.prevToken ) );

		if ( needsFullPrefix ) {
			output = prefix;
		} else if ( bullet !== '' ) {
			// NOTE(review): this flag is set on the token, while the visible
			// serializer code reads startsNewline from the handler -- confirm
			// whether anything consumes it.
			token.startsNewline = false;
		}
	}

	openLists.push( { itemCount: 0, bullets: prefix } );
	return output;
};
/**
 * End handler for list containers: drops the innermost entry from
 * state.listStack. List end tags produce no wikitext of their own.
 */
WSP._listEndHandler = function( state, token ) {
	var openLists = state.listStack;
	openLists.pop();
	return '';
};
/**
 * Start handler for list items (bound to '', ';' or ':' in tagHandlers).
 *
 * Counts the item on the innermost open list. The first item of a list gets
 * only its own bullet; later items are prefixed with the list's accumulated
 * bullets.
 *
 * @param bullet {String} Item-specific bullet
 * @param state {Object} Serializer state; the top of listStack is updated
 * @param token {Object} The list item start token (unused)
 * @returns {String} Bullet text for this item
 */
WSP._listItemHandler = function ( bullet, state, token ) {
	var openLists = state.listStack,
		enclosing = openLists[openLists.length - 1];
	enclosing.itemCount++;
	if ( enclosing.itemCount > 1 ) {
		return enclosing.bullets + bullet;
	}
	return bullet;
};
/**
 * Serialize a table-related start tag.
 *
 * @param symbol {String} Wikitext marker for the tag (e.g. "{|" or "|-")
 * @param optionEndSymbol {String} Text appended after the attributes; only
 *   used when the token has attributes
 * @param state {Object} Serializer state (unused)
 * @param token {Object} Tag token whose attribs are serialized
 * @returns {String} The marker, followed by attributes and end symbol if any
 */
WSP._serializeTableTag = function ( symbol, optionEndSymbol, state, token ) {
	if ( !token.attribs.length ) {
		return symbol;
	}
	var attributeText = WSP._serializeAttributes( token.attribs );
	return symbol + ' ' + attributeText + optionEndSymbol;
};
// Tags that are written in self-closing form and get no separate end tag.
WSP._emptyTags = { br: true, meta: true };

/**
 * Serialize a start tag as literal HTML.
 *
 * @param state {Object} Serializer state (unused)
 * @param token {Object} Tag token providing name and attribs
 * @returns {String} "<name attrs>", or the "<name attrs/>" form for the tags
 *   listed in _emptyTags
 */
WSP._serializeHTMLTag = function ( state, token ) {
	var selfClose = WSP._emptyTags[ token.name ] ? '/' : '',
		attributeText = '';
	if ( token.attribs.length ) {
		attributeText = ' ' + WSP._serializeAttributes( token.attribs );
	}
	return '<' + token.name + attributeText + selfClose + '>';
};
/**
 * Serialize an end tag as literal HTML.
 *
 * @param state {Object} Serializer state (unused)
 * @param token {Object} End tag token providing the name
 * @returns {String} "</name>", or '' for the tags listed in _emptyTags
 */
WSP._serializeHTMLEndTag = function ( state, token ) {
	if ( WSP._emptyTags[ token.name ] ) {
		// Already written in self-closing form by the start tag.
		return '';
	}
	return '</' + token.name + '>';
};
/**
 * Start handler for <a> tags.
 *
 * Anchors with rel="mw:wikiLink" or rel="mw:extLink" and an href are turned
 * into wikitext link openers; anything else is serialized as a plain HTML
 * tag. Sets state.dropContent when the link text is to be suppressed and
 * state.dropTail when a link tail has to be stripped from the content.
 *
 * @param state {Object} Serializer state; env, dropContent and dropTail are used
 * @param token {Object} The <a> start token (attribs + dataAttribs)
 * @returns {String} Wikitext (or HTML) for the start of the link
 */
WSP._linkHandler = function( state, token ) {
	// TODO: handle internal/external links etc using RDFa and dataAttribs
	// Also convert unannotated html links to external wiki links for html
	// import. Might want to consider converting relative links without path
	// component and file extension to wiki links.
	var env = state.env;
	var attribDict = env.KVtoHash( token.attribs );

	if ( !attribDict.rel || attribDict.href === undefined ) {
		return WSP._serializeHTMLTag( state, token );
	}

	var tokenData = token.dataAttribs;

	if ( attribDict.rel === 'mw:wikiLink' ) {
		// Strip the script path prefix from the href to get the link target.
		var base = env.wgScriptPath;
		var href = attribDict.href;
		var prefix = href.substr( 0, base.length );
		var target = ( prefix === base ) ? href.substr( base.length ) : href;
		target = decodeURIComponent( target );

		// Round-trip data may be absent (e.g. an empty data-mw attribute), so
		// never index into sHref without checking that it exists.
		// SSS: Why is sHref an array instead of a string?
		var sHref = tokenData.sHref;
		var tail = tokenData.tail;

		if ( tail && tail.length ) {
			state.dropTail = tail;
			target = ( tokenData.gc && sHref ) ? sHref[0] : target.replace( /_/g, ' ' );
		} else {
			var origLinkTgt = sHref ? sHref[0] : undefined;
			if ( origLinkTgt ) {
				// SSS FIXME: Why was resolveTitle wrapping this? Also, why do we require normalizeTitle here?
				var normalizedOrigLinkTgt = env.normalizeTitle( env.tokensToString( origLinkTgt ) );
				if ( normalizedOrigLinkTgt === target ) {
					// Non-standard capitalization
					target = origLinkTgt;
				}
			} else {
				target = target.replace( /_/g, ' ' );
			}
		}

		// FIXME: Properly handle something like [[{{Foo}}]]s
		target = env.tokensToString( target );

		if ( tokenData.gc ) {
			state.dropContent = true;
			return '[[' + target;
		}
		return '[[' + target + '|';
	}

	if ( attribDict.rel === 'mw:extLink' ) {
		if ( tokenData.stx === 'urllink' ) {
			// Bare URL: the href itself is the wikitext.
			state.dropContent = true;
			return attribDict.href;
		}
		if ( tokenData.gc ) {
			state.dropContent = true;
			return '[' + attribDict.href;
		}
		return '[' + attribDict.href + ' ';
	}

	return WSP._serializeHTMLTag( state, token );
};
/**
 * End handler for <a> tags; counterpart of _linkHandler.
 *
 * Closes wiki links with "]]" plus the recorded tail, closes bracketed
 * external links with "]", emits nothing for bare URL links, and falls back
 * to a literal HTML end tag for everything else. Clears the content-dropping
 * flags set by the start handler.
 *
 * @param state {Object} Serializer state; env, dropContent and dropTail are used
 * @param token {Object} The </a> end token (attribs + dataAttribs)
 * @returns {String} Wikitext (or HTML) closing the link
 */
WSP._linkEndHandler = function( state, token ) {
	var attrs = state.env.KVtoHash( token.attribs ),
		isAnnotated = attrs.rel && attrs.href !== undefined;

	if ( isAnnotated && attrs.rel === 'mw:wikiLink' ) {
		state.dropContent = false;
		state.dropTail = false;
		return "]]" + ( token.dataAttribs.tail || "" );
	}

	if ( isAnnotated && attrs.rel === 'mw:extLink' ) {
		state.dropContent = false;
		if ( token.dataAttribs.stx === 'urllink' ) {
			return '';
		}
		return ']';
	}

	return WSP._serializeHTMLEndTag( state, token );
};
/**
 * Per-tag serialization handlers, keyed by lower-case tag name.
 *
 * Each entry may provide a `start` and an `end` handler object, plus an
 * optional `make( state, token )` that selects the handler pair at run time.
 * Handler objects are read by _getTokenHandler / _serializeToken and may carry:
 *   handle( state, token )  returns the wikitext for the tag
 *   startsNewline           output has to begin on a new line
 *   endsLine                the next non-blank output starts a new line
 *   pairSepNLCount          minimum newlines between </x> and a following <x>
 *   newlineTransparent      output does not clear the start-of-line status
 *   singleLine              +1 / -1 adjustment of state.singleLineMode
 *   ignore                  skip the tag entirely
 * Some handle functions update flags on their own handler object (`this`)
 * just before the serializer reads them.
 */
WSP.tagHandlers = (function () {

	// ul / ol / dl: container start pushes a list level, end pops it.
	function listContainer( bullet ) {
		return {
			start: {
				startsNewline: true,
				handle: WSP._listHandler.bind( null, bullet ),
				pairSepNLCount: 2,
				newlineTransparent: true
			},
			end: {
				endsLine: true,
				handle: WSP._listEndHandler
			}
		};
	}

	// h1 .. h6: the same run of '=' opens and closes the heading line.
	function heading( marker ) {
		return {
			start: { startsNewline: true, handle: id( marker ) },
			end: { endsLine: true, handle: id( marker ) }
		};
	}

	// b / i: identical quote marker on both sides, no line constraints.
	function quoteTag( marker ) {
		return {
			start: { handle: id( marker ) },
			end: { handle: id( marker ) }
		};
	}

	// th / td: cells flagged with stx_v 'row' continue the current line using
	// the doubled symbol; all other cells start a new line.
	function tableCell( sameLineSymbol, newLineSymbol ) {
		return {
			start: {
				handle: function ( state, token ) {
					var continuesRow = token.dataAttribs.stx_v === 'row';
					this.startsNewline = !continuesRow;
					return WSP._serializeTableTag(
						continuesRow ? sameLineSymbol : newLineSymbol,
						' |', state, token );
				}
			}
		};
	}

	return {
		body: {
			start: {
				handle: function ( state, token ) {
					// swallow trailing new line
					state.emitNewlineOnNextToken = false;
					return '';
				}
			}
		},

		ul: listContainer( '*' ),
		ol: listContainer( '#' ),
		dl: listContainer( '' ),

		li: {
			start: {
				handle: WSP._listItemHandler.bind( null, '' ),
				singleLine: 1,
				pairSepNLCount: 1
			},
			end: {
				singleLine: -1
			}
		},
		// XXX: handle single-line vs. multi-line dls etc
		dt: {
			start: {
				startsNewline: true,
				singleLine: 1,
				handle: WSP._listItemHandler.bind( null, ';' )
			},
			end: {
				singleLine: -1
			}
		},
		dd: {
			start: {
				singleLine: 1,
				handle: WSP._listItemHandler.bind( null, ':' )
			},
			end: {
				endsLine: true,
				singleLine: -1
			}
		},

		// XXX: handle options
		table: {
			start: {
				handle: WSP._serializeTableTag.bind( null, '{|', '' )
			},
			end: {
				handle: function ( state, token ) {
					// Only break the line before "|}" when the previous tag
					// token was a table row tag.
					var lastTag = state.prevTagToken;
					this.startsNewline = Boolean( lastTag && lastTag.name === 'tr' );
					return '|}';
				}
			}
		},
		tbody: { start: { ignore: true }, end: { ignore: true } },
		th: tableCell( '!!', '!' ),
		tr: {
			start: {
				startsNewline: true,
				handle: function ( state, token ) {
					var previous = state.prevToken;
					if ( previous.constructor === TagTk && previous.name === 'tbody' ) {
						// Omit for first row in a table. XXX: support optional trs
						// for first line (in source wikitext) too using some flag in
						// data-mw (stx: 'wikitext' ?)
						return '';
					}
					return WSP._serializeTableTag( '|-', '', state, token );
				}
			}
		},
		td: tableCell( '||', '|' ),
		caption: {
			start: {
				startsNewline: true,
				handle: WSP._serializeTableTag.bind( null, '|+', ' |' )
			}
		},

		p: {
			make: function ( state, token ) {
				// Special case handling in a list context
				// VE embeds list content in paragraph tags
				if ( state.singleLineMode ) {
					return WSP.defaultHTMLTagHandler;
				}
				return this;
			},
			start: {
				startsNewline: true,
				pairSepNLCount: 2
			},
			end: {
				endsLine: true
			}
		},

		// XXX: support indent variant instead by registering a newline handler?
		pre: {
			start: {
				startsNewline: true,
				handle: function ( state, token ) {
					// Indent every continuation line while inside the pre.
					state.textHandler = function ( text ) {
						return text.replace( /\n/g, '\n ' );
					};
					return ' ';
				}
			},
			end: {
				endsLine: true,
				handle: function ( state, token ) {
					state.textHandler = null;
					return '';
				}
			}
		},

		meta: {
			start: {
				handle: function ( state, token ) {
					var attrs = state.env.KVtoHash( token.attribs ),
						isTagPlaceholder = attrs['typeof'] === 'mw:tag';
					this.newlineTransparent = isTagPlaceholder;
					if ( isTagPlaceholder ) {
						// we use this currently for nowiki and noinclude & co
						return '<' + attrs.content + '>';
					}
					return WSP._serializeHTMLTag( state, token );
				}
			}
		},

		hr: {
			start: { startsNewline: true, handle: id( '----' ) },
			end: { endsLine: true }
		},

		h1: heading( '=' ),
		h2: heading( '==' ),
		h3: heading( '===' ),
		h4: heading( '====' ),
		h5: heading( '=====' ),
		h6: heading( '======' ),

		br: {
			start: { startsNewline: true, handle: id( '' ) },
			end: { endsLine: true }
		},

		b: quoteTag( "'''" ),
		i: quoteTag( "''" ),

		a: {
			start: { handle: WSP._linkHandler },
			end: { handle: WSP._linkEndHandler }
		}
	};
}());
/**
 * Serialize a list of attribute key/value pairs to `k="v"` text.
 *
 * Pairs with an empty value are written as the bare key, pairs with an empty
 * key as the bare value, and pairs where both are empty are skipped. Every
 * double quote inside a value is escaped as &quot; so the quoted value cannot
 * be terminated early.
 *
 * @param attribs {Array} List of { k, v } objects with string keys and values
 * @returns {String} Space-separated attribute text ('' for an empty list)
 */
WSP._serializeAttributes = function ( attribs ) {
	var out = [];
	for ( var i = 0, l = attribs.length; i < l; i++ ) {
		var kv = attribs[i];
		if ( kv.k.length ) {
			if ( kv.v.length ) {
				// A global regexp is needed here: a plain string pattern
				// would only replace the first quote.
				out.push( kv.k + '=' +
						'"' + kv.v.replace( /"/g, '&quot;' ) + '"' );
			} else {
				out.push( kv.k );
			}
		} else if ( kv.v.length ) {
			// not very likely..
			out.push( kv.v );
		}
	}
	// XXX: round-trip optional whitespace / line breaks etc
	return out.join( ' ' );
};
/**
 * Serialize a chunk of tokens.
 *
 * Builds a fresh state from defaultOptions and the instance options and feeds
 * every token through _serializeToken.
 *
 * @param tokens {Array} Tokens to serialize, in order
 * @param chunkCB {Function} Optional callback receiving each output chunk
 * @returns {Array|undefined} The collected output chunks when no chunkCB was
 *   passed; undefined otherwise
 */
WSP.serializeTokens = function( tokens, chunkCB ) {
	var state = $.extend({}, this.defaultOptions, this.options),
		out, i, l;

	// $.extend copies shallowly, so without this every run would push onto
	// and pop from the one listStack array stored in the shared defaults.
	state.listStack = ( state.listStack || [] ).slice();

	if ( chunkCB === undefined ) {
		// No callback given: collect the chunks and hand them back.
		out = [];
		state.chunkCB = out.push.bind( out );
	} else {
		state.chunkCB = chunkCB;
	}

	for ( i = 0, l = tokens.length; i < l; i++ ) {
		this._serializeToken( state, tokens[i] );
	}

	// Still undefined when the caller supplied a callback.
	return out;
};
// Fallback handler pair: writes the tag back out as literal HTML. Used for
// tags without a dedicated entry in tagHandlers and for tokens marked as
// HTML syntax.
WSP.defaultHTMLTagHandler = {
	end: {
		handle: WSP._serializeHTMLEndTag
	},
	start: {
		handle: WSP._serializeHTMLTag
	}
};
/**
 * Look up the handler object for a tag token.
 *
 * Tokens whose dataAttribs.stx is 'html', and tags without an entry in
 * tagHandlers, use defaultHTMLTagHandler. An entry with a make() method is
 * asked to pick the handler pair itself.
 *
 * @param state {Object} Serializer state, passed on to make()
 * @param token {Object} Start, self-closing or end tag token
 * @returns {Object} The start handler for TagTk / SelfclosingTagTk tokens,
 *   the end handler otherwise; {} when the pair lacks that side
 */
WSP._getTokenHandler = function(state, token) {
	var pair = null;

	if ( token.dataAttribs.stx !== 'html' ) {
		pair = this.tagHandlers[ token.name ];
		if ( pair && pair.make ) {
			pair = pair.make( state, token );
		}
	}
	pair = pair || this.defaultHTMLTagHandler;

	var isStartLike = token.constructor === TagTk ||
			token.constructor === SelfclosingTagTk;
	return ( isStartLike ? pair.start : pair.end ) || {};
};
/**
* Serialize a token.
*
* Tag tokens are dispatched to the handler object returned by
* _getTokenHandler; strings, comments, newline tokens and EOF are handled
* inline. The resulting text then has its leading and trailing newlines
* folded into state.availableNewlineCount, the number of newlines required in
* front of it is worked out from the handler flags, and the final chunk is
* passed to state.chunkCB.
*
* @param state {Object} Mutable serializer state (chunkCB, env, newline
* counters, list stack, ...)
* @param token {Object|String} Token to serialize; plain strings are text
*/
WSP._serializeToken = function ( state, token ) {
var handler = {},
res = '',
// Remember the flag as it was before the handler ran; see the check below.
dropContent = state.dropContent;
state.prevToken = state.curToken;
state.curToken = token;
switch( token.constructor ) {
case TagTk:
case SelfclosingTagTk:
handler = WSP._getTokenHandler( state, token );
if ( ! handler.ignore ) {
// Ignored tags (e.g. tbody) do not count as tag tokens for prevTagToken.
state.prevTagToken = state.currTagToken;
state.currTagToken = token;
res = handler.handle ? handler.handle( state, token ) : '';
}
break;
case EndTagTk:
handler = WSP._getTokenHandler( state, token );
if ( ! handler.ignore ) {
state.prevTagToken = state.currTagToken;
state.currTagToken = token;
// Leave single-line mode before the end handler's output is processed;
// the matching increment for start tags happens at the very end of
// this function.
if ( handler.singleLine < 0 ) {
state.singleLineMode--;
}
res = handler.handle ? handler.handle( state, token ) : '';
}
break;
case String:
// Plain text; an active textHandler (set inside pre) may rewrite it.
res = state.textHandler ? state.textHandler( token ) : token;
break;
case CommentTk:
res = '<!--' + token.value + '-->';
// don't consider comments for changes of the onNewline status
// XXX: convert all non-tag handlers to a similar handler
// structure as tags?
handler = { newlineTransparent: true };
break;
case NlTk:
res = '\n';
res = state.textHandler ? state.textHandler( res ) : res;
break;
case EOFTk:
// Flush the newlines that are still pending at the end of the input.
res = '';
for ( var i = 0, l = state.availableNewlineCount; i < l; i++ ) {
res += '\n';
}
state.chunkCB(res);
break;
default:
res = '';
console.warn( 'Unhandled token type ' + JSON.stringify( token ) );
break;
}
// Output is suppressed only while dropContent was set both before and after
// handling this token, so the tokens that switch it on or off still emit.
if (! dropContent || ! state.dropContent ) {
var requiredNLCount = state.availableNewlineCount;
if (res !== '') {
// Deal with leading or trailing new lines
// match[1]: leading newlines, match[2]: body, match[3]: trailing newlines
var match = res.match( /^((?:\r?\n)*)((?:.*?|[\r\n]+[^\r\n])*?)((?:\r?\n)*)$/ ),
leadingNLs = match[1],
trailingNLs = match[3];
if (leadingNLs === res) {
// all newlines, accumulate count, and clear output
state.availableNewlineCount += leadingNLs.replace(/\r\n/g, '\n').length;
res = "";
} else {
// Trailing newlines are held back for the next token; leading ones
// are added to what has to be emitted before this output.
state.availableNewlineCount = trailingNLs.replace(/\r\n/g, '\n').length;
if ( leadingNLs !== '' ) {
requiredNLCount += leadingNLs.replace(/\r\n/g, '\n').length;
}
// strip newlines
res = match[2];
}
}
// Check if we have a pair of identical tag tokens </p><p>; </ul><ul>; etc.
// that have to be separated by extra newlines and add those in.
if (handler.pairSepNLCount && state.prevTagToken &&
state.prevTagToken.constructor === EndTagTk &&
state.prevTagToken.name == token.name ) {
if ( requiredNLCount < handler.pairSepNLCount) {
requiredNLCount = handler.pairSepNLCount;
}
} /*else if (state.singleLineMode) {
// Swallow newlines
// XXX: also swallow newlines in the middle of text content
requiredNLCount = 0;
state.availableNewlineCount = 0;
}*/
if ( state.env.debug ) {
console.warn("tok: " + token + ", res: <" + res + ">" +
", onnl: " + state.onNewline + ", # nls: " +
state.availableNewlineCount + ', reqNLs: ' + requiredNLCount);
}
if (res !== '') {
var out = '';
// Prev token's new line token
// Outside single-line mode, force at least one newline when a previous
// handler ended its line and this output is not blank, or when this
// handler must start a line and the output is not at a line start.
if ( !state.singleLineMode &&
( ( !res.match(/^\s*$/) && state.emitNewlineOnNextToken ) ||
( handler.startsNewline && !state.onNewline ) ) )
{
// Emit new line, if necessary
if ( ! requiredNLCount ) {
requiredNLCount++;
}
state.emitNewlineOnNextToken = false;
}
// Add required # of new lines in the beginning
for (i = 0; i < requiredNLCount; i++) {
out += '\n';
}
if ( state.availableNewlineCount > requiredNLCount ) {
// availableNewlineCount was not reset, and requiredNLCount
// can only be incremented.
state.availableNewlineCount -= requiredNLCount;
}
if ( requiredNLCount ) {
state.onNewline = true;
}
// FIXME: This might modify not just the last content token in a
// link, which would be wrong. We'll likely have to collect tokens
// between a tags instead, and strip only the last content token.
if (state.dropTail && res.substr(- state.dropTail.length) === state.dropTail) {
res = res.substr(0, res.length - state.dropTail.length);
}
// In single-line mode embedded newlines are flattened to spaces.
if ( state.singleLineMode ) {
res = res.replace(/\n/g, ' ');
}
out += res;
// Newline-transparent output (comments, some meta tags, list starts)
// leaves the start-of-line status untouched.
if ( res !== '' && !handler.newlineTransparent ) {
state.onNewline = false;
}
state.chunkCB( out );
} else if ( requiredNLCount > state.availableNewlineCount ) {
// Nothing to emit now: carry the required newlines over as pending.
state.availableNewlineCount = requiredNLCount;
}
/* else {
console.warn("SILENT: tok: " + token + ", res: <" + res + ">" + ", onnl: " + state.onNewline + ", # nls: " + state.availableNewlineCount);
}
*/
if (handler.endsLine) {
// Record end of line
state.emitNewlineOnNextToken = true;
}
// Enter single-line mode only after the start tag's own output was handled.
if ( handler.singleLine > 0 ) {
state.singleLineMode += handler.singleLine;
}
}
};
/**
 * Serialize an HTML DOM document.
 *
 * Builds a fresh state from defaultOptions and the instance options, walks
 * the subtree with _serializeDOM and finishes with an EOF token.
 *
 * @param node {Object} Root DOM node of the subtree to serialize
 * @param chunkCB {Function} Optional callback receiving each output chunk
 * @returns {String|undefined} The complete wikitext when no chunkCB was
 *   passed; undefined otherwise
 */
WSP.serializeDOM = function( node, chunkCB ) {
	var state = $.extend({}, this.defaultOptions, this.options),
		out;

	// $.extend copies shallowly, so without this every run would push onto
	// and pop from the one listStack array stored in the shared defaults.
	state.listStack = ( state.listStack || [] ).slice();

	if ( ! chunkCB ) {
		// No callback given: collect the chunks and return them joined.
		out = [];
		state.chunkCB = out.push.bind( out );
	} else {
		state.chunkCB = chunkCB;
	}

	this._serializeDOM( node, state );
	this._serializeToken( state, new EOFTk() );

	return out ? out.join('') : undefined;
};
/**
 * Internal worker. Walks a DOM subtree depth-first, converts each node to
 * tokens and passes them to _serializeToken.
 *
 * Elements yield a start token, their serialized children, then an end token
 * carrying the same attribute and round-trip objects. Text nodes are passed
 * on as plain strings, comments as CommentTk.
 *
 * @param node {Object} DOM node to serialize
 * @param state {Object} Serializer state shared across the whole run
 */
WSP._serializeDOM = function( node, state ) {
	var kind = node.nodeType;

	if ( kind === Node.ELEMENT_NODE ) {
		var kids = node.childNodes,
			tagName = node.nodeName.toLowerCase(),
			attribs = this._getDOMAttribs( node.attributes ),
			rtInfo = this._getDOMRTInfo( node.attributes );

		this._serializeToken( state, new TagTk( tagName, attribs, rtInfo ) );
		for ( var idx = 0, count = kids.length; idx < count; idx++ ) {
			this._serializeDOM( kids[idx], state );
		}
		this._serializeToken( state, new EndTagTk( tagName, attribs, rtInfo ) );

	} else if ( kind === Node.TEXT_NODE ) {
		this._serializeToken( state, node.data );

	} else if ( kind === Node.COMMENT_NODE ) {
		// Delay a pending newline until after the comment: hide the flag
		// while the comment is serialized, then put it back.
		var pendingNewline = state.emitNewlineOnNextToken;
		state.emitNewlineOnNextToken = false;
		this._serializeToken( state, new CommentTk( node.data ) );
		state.emitNewlineOnNextToken = pendingNewline;

	} else {
		console.warn( "Unhandled node type: " +
				node.outerHTML );
	}
};
/**
 * Convert a DOM attribute collection to a list of { k, v } pairs, leaving
 * out the data-mw attribute (it is read separately by _getDOMRTInfo).
 *
 * @param attribs {Object} Attribute collection exposing length and item( i )
 * @returns {Array} Key-value pairs in collection order
 */
WSP._getDOMAttribs = function( attribs ) {
	var pairs = [];
	for ( var idx = 0, count = attribs.length; idx < count; idx++ ) {
		var attr = attribs.item( idx );
		if ( attr.name === 'data-mw' ) {
			continue;
		}
		pairs.push( { k: attr.name, v: attr.value } );
	}
	return pairs;
};
/**
 * Extract the round-trip information stored as JSON in the data-mw attribute.
 *
 * @param attribs {Object} Attribute collection, indexable by attribute name
 * @returns {Object} The parsed data-mw value; {} when the attribute is
 *   missing or its value is empty
 */
WSP._getDOMRTInfo = function( attribs ) {
	var rtAttr = attribs['data-mw'];
	if ( !rtAttr ) {
		return {};
	}
	return JSON.parse( rtAttr.value || '{}' );
};
// Quick HACK: define the DOM nodeType constants locally so they can be
// referenced as Node.* by the serializer code in this file.
// https://developer.mozilla.org/en/nodeType
var Node = {};
Node.ELEMENT_NODE = 1;
Node.ATTRIBUTE_NODE = 2;
Node.TEXT_NODE = 3;
Node.CDATA_SECTION_NODE = 4;
Node.ENTITY_REFERENCE_NODE = 5;
Node.ENTITY_NODE = 6;
Node.PROCESSING_INSTRUCTION_NODE = 7;
Node.COMMENT_NODE = 8;
Node.DOCUMENT_NODE = 9;
Node.DOCUMENT_TYPE_NODE = 10;
Node.DOCUMENT_FRAGMENT_NODE = 11;
Node.NOTATION_NODE = 12;
// Export the serializer when a CommonJS-style `module` object is available.
if ( typeof module === 'object' ) {
	module.exports.WikitextSerializer = WikitextSerializer;
}