mediawiki-extensions-Visual.../modules/parser/mediawiki.WikitextSerializer.js
Gabriel Wicke b344e140aa Start to use the tokenCollector for links
Now that we have access to the contents, we can more easily compare the content
with link targets. This is still to do; this commit only converts the link
handler to work on the collected tokens.

* Start to implement latest RDFa spec from
  http://www.mediawiki.org/wiki/Parsoid/RDFa_vocabulary
* Capitalize types, add mw:Entity type for html entities
* Detect changes to entities using tokenCollector and srcContent

Change-Id: I45429f4b930858a16e166ef8377c8f6f5114c414
2012-07-19 14:56:32 -07:00

1397 lines
38 KiB
JavaScript

/**
 * Serializes a chunk of tokens or an HTML DOM to MediaWiki's wikitext flavor.
 *
 * The caller-supplied options are copied onto a fresh object and stored as
 * `this.options`; there are no built-in defaults yet.
 *
 * @class
 * @constructor
 * @param options {Object} List of options for serialization
 */
WikitextSerializer = function( options ) {
	// Placeholder for future default option values.
	var defaults = {};
	this.options = $.extend( defaults, options || {} );
};
require('./core-upgrade.js');
var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer;
var WikitextConstants = require('./mediawiki.wikitext.constants.js').WikitextConstants;
// Shorthand for the serializer prototype; the methods below are attached to it.
var WSP = WikitextSerializer.prototype;
/* *********************************************************************
* Here is what the state attributes mean:
*
* listStack
* Stack of list contexts to let us emit wikitext for nested lists.
* Each context keeps track of 3 values:
* - itemBullet: the wikitext bullet char for this list
* - itemCount : # of list items encountered so far for the list
* - bullets : cumulative bullet prefix based on all the lists
* that enclose the current list
*
* onNewline
* true on start of file or after a new line has been emitted.
*
* onStartOfLine
* true when onNewline is true, and also in other start-of-line contexts
* Ex: after a comment has been emitted, or after include/noinclude tokens.
*
* singleLineMode
* - if (> 0), we cannot emit any newlines.
* - this value changes as we enter/exit dom subtrees that require
* single-line wikitext output. WSP._tagHandlers specify single-line
* mode for individual tags.
*
* availableNewlineCount
* # of newlines that have been encountered so far but not emitted yet.
* Newlines are buffered till they need to be output. This lets us
* swallow newlines in contexts where they shouldn't be emitted for
* ensuring equivalent wikitext output. (ex dom: ..</li>\n\n</li>..)
* ********************************************************************* */
// Template for the per-run serializer state. It is copied shallowly when a
// serialization starts, so the two arrays below are shared by reference
// between copies unless the copying code replaces them.
WSP.initialState = {
	// Newline / start-of-line tracking
	onNewline: true,
	onStartOfLine : true,
	availableNewlineCount: 0,

	// Counter: while non-zero, no newlines may be emitted
	singleLineMode: 0,

	// List nesting contexts
	listStack: [],

	tokens: []
};
/**
 * Escape a chunk of plain text so that it is not picked up as wiki markup.
 *
 * The text is run through the PEG tokenizer. String and newline tokens are
 * passed through; every run of other tokens is replaced by the corresponding
 * source range (taken from the tokens' `dataAttribs.tsr` offsets) wrapped in
 * <nowiki> tags. Tags carrying `data-gen="both"` and a recorded
 * `dataAttribs.src` are emitted as that source instead, and the tokens up to
 * the matching end tag are skipped.
 *
 * @param state {Object} Serializer state; reads `env`, `onNewline` and
 *   `inIndentPre`.
 * @param text {String} The text to escape.
 * @returns {String} The escaped text. If an error is thrown while walking the
 *   tokens it is logged with console.warn and the output built so far is
 *   returned.
 */
WSP.escapeWikiText = function ( state, text ) {
	// tokenize the text
	var p = new PegTokenizer( state.env ),
		tokens = [];
	p.on('chunk', function ( chunk ) {
		tokens.push.apply( tokens, chunk );
	});
	p.on('end', function(){
	});
	// this is synchronous for now, will still need sync version later, or
	// alternatively make text processing in the serializer async
	var prefixedText = text;
	// The state attribute is spelled `onNewline`. Reading `onNewLine` always
	// yielded undefined, so start-of-line syntax was never considered.
	var inNewlineContext = state.onNewline;
	if ( ! inNewlineContext ) {
		// Prefix '_' so that no start-of-line wiki syntax matches. Strip it from
		// the result.
		prefixedText = '_' + text;
	}
	if ( state.inIndentPre ) {
		prefixedText = prefixedText.replace(/(\r?\n)/g, '$1_');
	}
	// FIXME: parse using
	p.process( prefixedText );
	if ( ! inNewlineContext && tokens.length ) {
		// now strip the leading underscore.
		if ( tokens[0] === '_' ) {
			tokens.shift();
		} else if ( tokens[0].constructor === String ) {
			tokens[0] = tokens[0].substr(1);
		}
	}
	// state.inIndentPre is handled on the complete output
	//
	// wrap any run of non-text tokens into <nowiki> tags using the source
	// offsets of top-level productions
	// return the updated text
	var outTexts = [],
		nonTextTokenAccum = [],
		cursor = 0;
	// Flush the accumulated non-text tokens as one <nowiki>-wrapped source range.
	function wrapNonTextTokens () {
		if ( nonTextTokenAccum.length ) {
			var missingRangeEnd = false;
			// TODO: make sure the source positions are always set!
			// The start range
			var startRange = nonTextTokenAccum[0].dataAttribs.tsr,
				rangeStart, rangeEnd;
			if ( ! startRange ) {
				console.warn( 'No tsr on ' + nonTextTokenAccum[0] );
				rangeStart = cursor;
			} else {
				rangeStart = startRange[0];
				if ( ! inNewlineContext ) {
					// compensate for underscore.
					rangeStart--;
				}
				cursor = rangeStart;
			}
			var endRange = nonTextTokenAccum.last().dataAttribs.tsr;
			if ( ! endRange ) {
				// FIXME: improve this!
				//rangeEnd = state.env.tokensToString( tokens ).length;
				// Be conservative and extend the range to the end for now.
				// Alternatives: only extend it to the next token with range
				// info on it.
				missingRangeEnd = true;
				rangeEnd = text.length;
			} else {
				rangeEnd = endRange[1];
				if ( ! inNewlineContext ) {
					// compensate for underscore.
					rangeEnd--;
				}
			}
			// Neutralize nowiki tags inside the range so they cannot close
			// the wrapper early.
			var escapedSource = text.substr( rangeStart, rangeEnd - rangeStart )
				.replace( /<(\/?nowiki)>/g, '&lt;$1&gt;' );
			outTexts.push( '<nowiki>' );
			outTexts.push( escapedSource );
			outTexts.push( '</nowiki>' );
			cursor += 17 + escapedSource.length;
			if ( missingRangeEnd ) {
				// Caught by the try/catch around the token loop below: the
				// rest of the text is already inside the nowiki wrapper.
				throw 'No tsr on end token: ' + nonTextTokenAccum.last();
			}
			nonTextTokenAccum = [];
		}
	}
	try {
		for ( var i = 0, l = tokens.length; i < l; i++ ) {
			var token = tokens[i];
			switch ( token.constructor ) {
				case String:
					wrapNonTextTokens();
					// Angle brackets forming HTML tags are picked up as
					// tags and escaped with nowiki. Remaining angle
					// brackets can remain unescaped in the wikitext. They
					// are entity-escaped by the HTML5 DOM serializer when
					// outputting the HTML DOM.
					outTexts.push( token );
					cursor += token.length;
					break;
				case NlTk:
					wrapNonTextTokens();
					outTexts.push( '\n' );
					cursor++;
					break;
				case EOFTk:
					wrapNonTextTokens();
					break;
				case TagTk:
				case SelfclosingTagTk:
					var argDict = state.env.KVtoHash( token.attribs );
					if ( argDict['data-gen'] === 'both' &&
							// XXX: move the decision whether to escape or not
							// into individual handlers!
							token.dataAttribs.src )
					{
						wrapNonTextTokens();
						// push out the original source
						// XXX: This assumes the content was not
						// modified for now.
						outTexts.push( token.dataAttribs.src
								// escape ampersands in entity text
								.replace(/&(#?[0-9a-zA-Z]{2,20};)/, '&amp;$1') );
						// skip generated tokens
						for ( ; i < l; i ++) {
							var tk = tokens[i];
							if ( tk.constructor === EndTagTk &&
									tk.name === token.name ) {
								break;
							}
						}
					} else {
						nonTextTokenAccum.push(token);
					}
					break;
				default:
					nonTextTokenAccum.push(token);
					break;
			}
		}
	} catch ( e ) {
		console.warn( e );
	}
	var res = outTexts.join('');
	if ( state.inIndentPre ) {
		// Remove the underscores that were inserted after each newline above.
		return res.replace(/\n_/g, '\n');
	} else {
		return res;
	}
};
// Build a handler that ignores its arguments and always returns `v`.
var id = function(v) {
	var constantHandler = function( state ) {
		return v;
	};
	return constantHandler;
};
// Create a token collector for `token` and register it on the state.
// Returns the empty string so that it can be used directly as the result of
// a tag handler.
var installCollector = function ( collectorConstructor, cb, handler, state, token ) {
	var collector = new collectorConstructor( token, cb, handler );
	state.tokenCollector = collector;
	return '';
};
/**
 * Create a collector that buffers tokens, starting with `tk`, until an end
 * tag with the same name as `tk` is seen.
 *
 * @param tk The start token; it becomes the first collected token.
 * @param cb {Function} Optional. Called as cb( state, tokens ) once the
 *   matching end tag has been collected; its return value is returned from
 *   collect().
 * @param handler {Object} Optional. Stored as `handler` on the collector when
 *   given. installCollector() passes this argument; previously it was
 *   dropped.
 * @returns {Object} Collector with `cb`, `collect`, `tokens` and, when a
 *   handler was given, `handler`.
 */
var endTagMatchTokenCollector = function ( tk, cb, handler ) {
	var tokens = [tk];
	var collector = {
		cb: cb,
		// Returns true to keep collecting, the callback result once the
		// matching end tag arrived, or false if there is no callback.
		collect: function ( state, token ) {
			tokens.push( token );
			if ( token.constructor === EndTagTk &&
					token.name === tk.name ) {
				// finish collection
				if ( this.cb ) {
					// abort further token processing since the cb handled it
					return this.cb( state, tokens );
				} else {
					// let a handler deal with token processing
					return false;
				}
			} else {
				// continue collection
				return true;
			}
		},
		tokens: tokens
	};
	if ( handler ) {
		collector.handler = handler;
	}
	return collector;
};
// Build the end-tag handler for a heading whose closing markup is `v`.
// An empty heading (the previous token is the start tag of the same name,
// ex: <h1></h1>) gets an empty nowiki pair in front of the closing markup.
var closeHeading = function(v) {
	return function(state, token) {
		var prev = state.prevToken,
			isEmptyHeading = prev.constructor === TagTk && prev.name === token.name;
		return isEmptyHeading ? "<nowiki></nowiki>" + v : v;
	};
};
// True if the token is a start tag (TagTk) named li, dt or dd.
function isListItem(token) {
	if (token.constructor !== TagTk) {
		return false;
	}
	switch (token.name) {
		case 'li':
		case 'dt':
		case 'dd':
			return true;
		default:
			return false;
	}
}
// Start-tag handler shared by ul/ol/dl. Pushes a new list context onto
// state.listStack and returns the wikitext to emit for the list start: the
// full cumulative bullet prefix when the list starts a new line, otherwise
// just this list's own bullet. Also sets handler.startsNewline accordingly.
WSP._listHandler = function( handler, bullet, state, token ) {
	if ( state.singleLineMode ) {
		state.singleLineMode -= 1;
	}
	var listStack = state.listStack,
		parentList = listStack.length === 0 ? null : listStack.last(),
		prefix = bullet,
		needsFullPrefix = true;
	if ( parentList ) {
		prefix = parentList.bullets + parentList.itemBullet + bullet;
		parentList.itemCount++;
		// A nested list, not directly after a list item
		needsFullPrefix = parentList.itemCount > 1 && !isListItem( state.prevToken );
	}
	handler.startsNewline = needsFullPrefix;
	var out = needsFullPrefix ? prefix : bullet;
	listStack.push( { itemCount: 0, bullets: prefix, itemBullet: '' } );
	state.env.dp( 'lh res', prefix, out, handler );
	return out;
};
// End-tag handler shared by ul/ol/dl: drops the innermost list context and
// emits nothing.
WSP._listEndHandler = function( state, token ) {
	var stack = state.listStack;
	stack.pop();
	return '';
};
// Start-tag handler shared by li/dt/dd. Updates the innermost list context
// and returns the bullet text for the item; also sets handler.startsNewline.
WSP._listItemHandler = function ( handler, bullet, state, token ) {
	var listStack = state.listStack;

	// A bare list item without an enclosing list can show up when the DOM is
	// not well-formed (currently needed because of parser bugs in complex
	// cases). Whether the serializer should expect a cleaned-up DOM, support
	// a few exceptional cases like this one, or cope with any DOM is yet to
	// be resolved (FIXME). For now, synthesize a list context.
	if ( !listStack.length ) {
		listStack.push( { itemCount: 0, bullets: bullet, itemBullet: bullet } );
	}

	var list = listStack[listStack.length - 1];
	list.itemCount += 1;
	list.itemBullet = bullet;

	// The previous token is the end tag of the same kind (..</li><li>..)
	var followsSameEndTag = function () {
		var prev = state.prevToken;
		return prev.constructor === EndTagTk && prev.name === token.name;
	};

	// A dd not written in 'row' syntax whose previous tag token is </dt>
	var isMultiLineDdAfterDt = function () {
		if ( token.name !== 'dd' || token.dataAttribs.stx === 'row' ) {
			return false;
		}
		var prevTag = state.prevTagToken;
		return prevTag.constructor === EndTagTk && prevTag.name === 'dt';
	};

	// Default: only this item's own bullet, glued to what came before.
	var out = bullet;
	handler.startsNewline = false;

	// The full bullet prefix is output only if this is not the first item of
	// the list AND we are either
	// - in a start-of-line context,
	// - directly after the end tag of an identical item, or
	// - on the dd part of a multi-line dt-dd pair (the dd of a single-line
	//   pair sticks to the dt and must not repeat the dt's bullets).
	//
	// SSS FIXME: This condition could be rephrased as:
	//
	// if (isRepeatToken(state, token) ||
	//     (curList.itemCount > 1 && (inStartOfLineContext(state) || isMultiLineDtDdPair(state, token))))
	if ( list.itemCount > 1 &&
			( state.onStartOfLine || followsSameEndTag() || isMultiLineDdAfterDt() ) )
	{
		handler.startsNewline = true;
		out = list.bullets + bullet;
	}

	state.env.dp( 'lih', token, out, handler );
	return out;
};
/**
 * Token-collector callback for <figure>: turn the collected tokens into an
 * image link of the form [[target|option|...|caption]].
 *
 * @param state {Object} Serializer state; uses `env` and `serializer`.
 * @param figTokens {Array} The collected tokens; figTokens[0] is the figure
 *   start tag, whose `dataAttribs.optionList` holds the image options.
 * @returns {String} The wikitext for the figure.
 */
WSP._figureHandler = function ( state, figTokens ) {
	// skip tokens looking for the image tag
	var img;
	var i = 1, n = figTokens.length;
	while (i < n) {
		if (figTokens[i].name === "img") {
			img = figTokens[i];
			break;
		}
		i++;
	}
	// skip tokens looking for the start and end caption tags
	var fcStartIndex = 0, fcEndIndex = 0;
	while (i < n) {
		if (figTokens[i].name === "figcaption") {
			if (fcStartIndex > 0) {
				fcEndIndex = i;
				break;
			} else {
				fcStartIndex = i;
			}
		}
		i++;
	}
	// Call the serializer to build the caption
	var caption = state.serializer.serializeTokens(figTokens.slice(fcStartIndex+1, fcEndIndex)).join('');
	// Get the image resource name
	// FIXME: file name has been capitalized -- need some fix in the parser
	var argDict = state.env.KVtoHash( img.attribs );
	var imgR = argDict.resource.replace(/(^\[:)|(\]$)/g, '');
	// Now, build the complete wikitext for the figure
	var outBits = [imgR];
	var figToken = figTokens[0];
	// Tolerate a figure without recorded options.
	var figAttrs = figToken.dataAttribs.optionList || [];
	var simpleImgOptions = WikitextConstants.Image.SimpleOptions;
	var prefixImgOptions = WikitextConstants.Image.PrefixOptions;
	var sizeOptions = { "width": 1, "height": 1};
	var size = {};
	// Emit a collected size as WIDTH[xHEIGHT]px, at most once per width.
	var flushSize = function () {
		var w = size.width;
		if (w) {
			outBits.push(w + (size.height ? "x" + size.height : '') + "px");
			size.width = null;
		}
	};
	for (i = 0, n = figAttrs.length; i < n; i++) {
		var a = figAttrs[i];
		var k = a.k, v = a.v;
		if (sizeOptions[k]) {
			size[k] = v;
		} else {
			// Output size first and clear it
			flushSize();
			if (k === "aspect") {
				// SSS: Bad Hack! Need a better solution
				// One solution is to search through prefix options hash but seems ugly.
				// Another is to flip prefix options hash and use it to search.
				if (v) {
					outBits.push("upright=" + v);
				} else {
					outBits.push("upright");
				}
			} else if (simpleImgOptions[v.trim()] === k) {
				// The values and keys in the parser attributes are a flip
				// of how they are in the wikitext constants image hash
				// Hence the indexing by 'v' instead of 'k'
				outBits.push(v);
			} else if (prefixImgOptions[k.trim()]) {
				outBits.push(k + "=" + v);
			} else {
				console.warn("Unknown image option encountered: " + JSON.stringify(a));
			}
		}
	}
	// A size given by the last options has no later option to trigger the
	// flush inside the loop; without this it would be dropped.
	flushSize();
	if (caption) {
		outBits.push(caption);
	}
	return "[[" + outBits.join('|') + "]]";
};
// Serialize a table construct: the bare wikitext symbol when the token has
// no attributes, otherwise the symbol, a space, the serialized attributes
// and optionEndSymbol.
WSP._serializeTableTag = function ( symbol, optionEndSymbol, state, token ) {
	if ( !token.attribs.length ) {
		return symbol;
	}
	var attrText = WSP._serializeAttributes( token.attribs );
	return symbol + ' ' + attrText + optionEndSymbol;
};
// Tags serialized in self-closing form (<br/>) and without an end tag.
WSP._emptyTags = {
	br: true,
	meta: true
};
// Serialize a start tag in HTML syntax. Tags listed in WSP._emptyTags are
// written self-closing. Seeing a <pre> sets state.inHTMLPre.
WSP._serializeHTMLTag = function ( state, token ) {
	var tagName = token.name,
		selfClose = WSP._emptyTags[ tagName ] ? '/' : '',
		open = '<' + tagName;
	if ( tagName === 'pre' ) {
		// html-syntax pre is very similar to nowiki
		state.inHTMLPre = true;
	}
	if ( token.attribs.length ) {
		open += ' ' + WSP._serializeAttributes( token.attribs );
	}
	return open + selfClose + '>';
};
// Serialize an end tag in HTML syntax. Tags listed in WSP._emptyTags have no
// end tag and yield the empty string. Seeing a </pre> clears state.inHTMLPre.
WSP._serializeHTMLEndTag = function ( state, token ) {
	var tagName = token.name;
	if ( tagName === 'pre' ) {
		state.inHTMLPre = false;
	}
	return WSP._emptyTags[ tagName ] ? '' : '</' + tagName + '>';
};
/**
 * Token-collector callback for <a>: serialize the collected link tokens.
 *
 * Links with rel="mw:WikiLink" become [[target]] / [[target|content]]tail,
 * links with rel="mw:ExtLink" become a bare URL, [url] or [url content].
 * Everything else is written in HTML syntax: start tag, serialized content
 * and end tag.
 *
 * @param state {Object} Serializer state; uses `env` and `serializer`.
 * @param tokens {Array} Collected tokens from the start tag to the matching
 *   end tag. The array is modified: both tags are removed from it.
 * @returns {String} The wikitext for the link.
 */
WSP._linkHandler = function( state, tokens ) {
	// TODO: handle internal/external links etc using RDFa and dataAttribs
	// Also convert unannotated html links without advanced attributes to
	// external wiki links for html import. Might want to consider converting
	// relative links without path component and file extension to wiki links.
	var env = state.env,
		token = tokens.shift(),
		endToken = tokens.pop();
	// HTML-syntax fallback. serializeTokens() returns an array of chunks, so
	// it has to be joined before concatenation; otherwise the chunks end up
	// separated by commas.
	var serializeAsHTML = function () {
		return WSP._serializeHTMLTag( state, token ) +
			state.serializer.serializeTokens( tokens ).join('') +
			WSP._serializeHTMLEndTag( state, endToken );
	};
	var attribDict = env.KVtoHash( token.attribs );
	if ( attribDict.rel && attribDict.href !== undefined ) {
		var tokenData = token.dataAttribs;
		if ( attribDict.rel === 'mw:WikiLink' ) {
			// Strip the script path prefix from the href, if present
			var base = env.wgScriptPath;
			var href = attribDict.href;
			var prefix = href.substr(0, base.length);
			var target = (prefix === base) ? href.substr(base.length) : href;
			target = decodeURIComponent(target);
			var tail = tokenData.tail;
			if ( tail && tail.length ) {
				target = tokenData.gc ? tokenData.sHref : target.replace( /_/g, ' ' );
			} else {
				tail = '';
				var origLinkTgt = tokenData.sHref;
				if (origLinkTgt) {
					// Normalize the source target so that we can compare it
					// with href.
					var normalizedOrigLinkTgt = env.normalizeTitle( env.tokensToString(origLinkTgt) );
					if ( normalizedOrigLinkTgt === target ) {
						// Non-standard capitalization
						target = origLinkTgt;
					}
				} else {
					target = target.replace( /_/g, ' ' );
				}
			}
			// FIXME: Properly handle something like [[{{Foo}}]]s
			target = env.tokensToString( target );
			if ( tokenData.gc ) {
				return '[[' + target + ']]' + tail;
			} else {
				var content = state.serializer.serializeTokens( tokens ).join('');
				// The tail is emitted after the closing brackets, so remove
				// it from the end of the link text.
				if (tail && content.substr(- tail.length) === tail) {
					content = content.substr(0, content.length - tail.length);
				}
				return '[[' + target + '|' + content + ']]' + tail;
			}
		} else if ( attribDict.rel === 'mw:ExtLink' ) {
			// TODO: use data-{gen,sem,special} instead!
			if ( tokenData.stx === 'urllink' ) {
				return attribDict.href;
			} else if ( tokenData.gc ) {
				return '[' + attribDict.href + ']';
			} else {
				return '[' + attribDict.href + ' ' +
					state.serializer.serializeTokens( tokens ).join('') +
					']';
			}
		} else {
			// Unknown rel was set. The collector already consumed the content
			// and the end tag, so they have to be emitted here as well.
			return serializeAsHTML();
		}
	} else {
		// TODO: default to extlink for simple links with unknown rel set
		// switch to html only when needed to support attributes
		var isComplexLink = function ( attribDict ) {
			for ( var name in attribDict ) {
				if ( name && ! ( name in { href: 1 } ) ) {
					return true;
				}
			}
			return false;
		};
		if ( true || isComplexLink ( attribDict ) ) {
			// Complex attributes we can't support in wiki syntax
			return serializeAsHTML();
		} else {
			// TODO: serialize as external wikilink
			return '';
		}
	}
};
// `typeof` values of spans that get special treatment in the span handlers.
WSP.genContentSpanTypes = {
	'mw:Nowiki': 1,
	'mw:Entity': 1
};
/**
 * Token-collector callback: compare the collected content with the content
 * recorded on the start token (dataAttribs.srcContent).
 *
 * - If the content is unchanged, return the original source
 *   (dataAttribs.src).
 * - If the content changed but is plain text, return the new text.
 * - If the content cannot be flattened to a string, return its regular
 *   serialization.
 *
 * The start and end tokens are removed from `tokens`.
 */
WSP.compareSourceHandler = function ( state, tokens ) {
	var startToken = tokens.shift();
	// Drop the end token; only the content in between is of interest.
	tokens.pop();
	var content = state.env.tokensToString( tokens, true );
	if ( content.constructor !== String ) {
		return state.serializer.serializeTokens( tokens ).join('');
	}
	var recorded = startToken.dataAttribs;
	return content === recorded.srcContent ? recorded.src : content;
};
/* *********************************************************************
* startsNewline
* if true, the wikitext for the dom subtree rooted
* at this html tag requires a new line context.
*
* endsLine
* if true, the wikitext for the dom subtree rooted
* at this html tag ends the line.
*
* pairSepNLCount
* # of new lines required between wikitext for dom siblings
* of the same tag type (..</p><p>.., etc.)
*
* newlineTransparent
* if true, this token does not change the newline status
* after it is emitted.
*
* singleLine
* if 1, the wikitext for the dom subtree rooted at this html tag
* requires all content to be emitted on the same line without
* any line breaks. +1 sets the single-line mode (on descending
* the dom subtree), -1 clears the single-line mode (on exiting
* the dom subtree).
*
* ignore
* if true, the serializer pretends as if it never saw this token.
* ********************************************************************* */
// Per-tag serialization handlers, keyed by tag name. Each entry may provide
// a `start` and an `end` handler object (flags plus an optional `handle`
// function that returns the wikitext for the token), or a `make` function
// that picks the handler pair for a given token.
WSP.tagHandlers = {
	body: {
		end: {
			handle: function(state, token) {
				// swallow trailing new line
				state.emitNewlineOnNextToken = false;
				return '';
			}
		}
	},
	ul: {
		start: {
			startsNewline : true,
			handle: function ( state, token ) {
				return WSP._listHandler( this, '*', state, token );
			},
			pairSepNLCount: 2,
			newlineTransparent: true
		},
		end: {
			endsLine: true,
			handle: WSP._listEndHandler
		}
	},
	ol: {
		start: {
			startsNewline : true,
			handle: function ( state, token ) {
				return WSP._listHandler( this, '#', state, token );
			},
			pairSepNLCount: 2,
			newlineTransparent: true
		},
		end: {
			endsLine : true,
			handle: WSP._listEndHandler
		}
	},
	dl: {
		start: {
			startsNewline : true,
			handle: function ( state, token ) {
				return WSP._listHandler( this, '', state, token );
			},
			pairSepNLCount: 2
		},
		end: {
			endsLine: true,
			handle: WSP._listEndHandler
		}
	},
	li: {
		start: {
			handle: function ( state, token ) {
				return WSP._listItemHandler( this, '', state, token );
			},
			singleLine: 1,
			pairSepNLCount: 1
		},
		end: {
			singleLine: -1
		}
	},
	// XXX: handle single-line vs. multi-line dls etc
	dt: {
		start: {
			singleLine: 1,
			handle: function ( state, token ) {
				return WSP._listItemHandler( this, ';', state, token );
			},
			pairSepNLCount: 1,
			newlineTransparent: true
		},
		end: {
			singleLine: -1
		}
	},
	dd: {
		start: {
			singleLine: 1,
			handle: function ( state, token ) {
				return WSP._listItemHandler( this, ':', state, token );
			},
			pairSepNLCount: 1,
			newlineTransparent: true
		},
		end: {
			endsLine: true,
			singleLine: -1
		}
	},
	// XXX: handle options
	table: {
		start: {
			handle: WSP._serializeTableTag.bind(null, "{|", '')
		},
		end: {
			handle: function(state, token) {
				if ( state.prevTagToken && state.prevTagToken.name === 'tr' ) {
					this.startsNewline = true;
				} else {
					this.startsNewline = false;
				}
				return "|}";
			}
		}
	},
	tbody: { start: { ignore: true }, end: { ignore: true } },
	th: {
		start: {
			handle: function ( state, token ) {
				if ( token.dataAttribs.stx_v === 'row' ) {
					this.startsNewline = false;
					return WSP._serializeTableTag("!!", ' |', state, token);
				} else {
					this.startsNewline = true;
					return WSP._serializeTableTag( "!", ' |', state, token);
				}
			}
		}
	},
	tr: {
		start: {
			handle: function ( state, token ) {
				if ( state.prevToken.constructor === TagTk && state.prevToken.name === 'tbody' ) {
					// Omit for first row in a table. XXX: support optional trs
					// for first line (in source wikitext) too using some flag in
					// data-rt (stx: 'wikitext' ?)
					return '';
				} else {
					return WSP._serializeTableTag("|-", '', state, token );
				}
			},
			startsNewline: true
		}
	},
	td: {
		start: {
			handle: function ( state, token ) {
				if ( token.dataAttribs.stx_v === 'row' ) {
					this.startsNewline = false;
					return WSP._serializeTableTag("||", ' |', state, token);
				} else {
					this.startsNewline = true;
					return WSP._serializeTableTag("|", ' |', state, token);
				}
			}
		}
	},
	caption: {
		start: {
			startsNewline: true,
			handle: WSP._serializeTableTag.bind(null, "|+", ' |')
		}
	},
	p: {
		make: function(state, token) {
			// "stx": "html" tags never get here
			// Special case handling in a list context
			// VE embeds list content in paragraph tags.
			//
			// SSS FIXME: This will *NOT* work if the list item has nested paragraph tags!
			var prevToken = state.prevToken;
			if ( token.attribs.length === 0 &&
					( (state.listStack.length > 0 && isListItem(prevToken)) ||
					(prevToken.constructor === TagTk && prevToken.name === 'td') ||
					(state.ignorePTag && token.constructor === EndTagTk)))
			{
				state.ignorePTag = !state.ignorePTag;
				return { start: { ignore: true }, end: { ignore: true } };
			} else {
				return state.singleLineMode ? WSP.defaultHTMLTagHandler : this;
			}
		},
		start: {
			startsNewline : true,
			pairSepNLCount: 2
		},
		end: {
			endsLine: true
		}
	},
	// XXX: support indent variant instead by registering a newline handler?
	pre: {
		start: {
			startsNewline: true,
			pairSepNLCount: 2,
			handle: function( state, token ) {
				state.inIndentPre = true;
				// Indent every continuation line by one space
				state.textHandler = function( t ) {
					return t.replace(/\n/g, '\n ' );
				};
				return ' ';
			}
		},
		end: {
			endsLine: true,
			handle: function( state, token) {
				state.inIndentPre = false;
				state.textHandler = null;
				return '';
			}
		}
	},
	meta: {
		start: {
			handle: function ( state, token ) {
				var argDict = state.env.KVtoHash( token.attribs );
				if ( argDict['typeof'] === 'mw:tag' ) {
					// we use this currently for nowiki and noinclude & co
					this.newlineTransparent = true;
					if ( argDict.content === 'nowiki' ) {
						state.inNoWiki = true;
					} else if ( argDict.content === '/nowiki' ) {
						state.inNoWiki = false;
					} else {
						console.warn( JSON.stringify( argDict ) );
					}
					return '<' + argDict.content + '>';
				} else if ( argDict['typeof'] === 'mw:noinclude' ) {
					this.newlineTransparent = true;
					if ( token.dataAttribs.src === '<noinclude>' ) {
						return '<noinclude>';
					} else {
						return '</noinclude>';
					}
				} else {
					this.newlineTransparent = false;
					return WSP._serializeHTMLTag( state, token );
				}
			}
		}
	},
	span: {
		start: {
			handle: function( state, token ) {
				var argDict = state.env.KVtoHash( token.attribs );
				if ( argDict['typeof'] in WSP.genContentSpanTypes ) {
					if ( argDict['typeof'] === 'mw:Nowiki' ) {
						state.inNoWiki = true;
						return '<nowiki>';
					} else if ( token.dataAttribs.src ) {
						// FIXME: compare content with original content
						return installCollector(
							endTagMatchTokenCollector,
							WSP.compareSourceHandler,
							this,
							state, token
						);
					}
					// No stored source to compare with: emit nothing for the
					// tag itself. Returning undefined here would make the
					// caller fail when it treats the result as a string.
					return '';
				} else {
					// Fall back to plain HTML serialization for spans created
					// by the editor
					return WSP._serializeHTMLTag( state, token );
				}
			}
		},
		end: {
			handle: function ( state, token ) {
				var argDict = state.env.KVtoHash( token.attribs );
				if ( argDict['typeof'] in WSP.genContentSpanTypes ) {
					if ( argDict['typeof'] === 'mw:Nowiki' ) {
						state.inNoWiki = false;
						return '</nowiki>';
					}
					// Counterpart of the start handler: nothing to emit.
					return '';
				} else {
					// Fall back to plain HTML serialization for spans created
					// by the editor
					return WSP._serializeHTMLEndTag( state, token );
				}
			}
		}
	},
	figure: {
		start: {
			handle: function ( state, token ) {
				state.tokenCollector = endTagMatchTokenCollector( token, WSP._figureHandler );
				// Set the handler- not terribly useful since this one doesn't
				// have any flags, but still useful for general testing
				state.tokenCollector.handler = this;
				return '';
			}
		}
	},
	hr: {
		start: {
			startsNewline: true,
			endsLine: true,
			handle: function(state, token) {
				var extra_dashes = token.dataAttribs.extra_dashes;
				if (extra_dashes && (extra_dashes > 0)) {
					var buf = ["----"];
					for (var i = 0; i < extra_dashes; i++) {
						buf.push("-");
					}
					return buf.join('');
				} else {
					// num_dashes undefined OR exactly 4
					return "----";
				}
			}
		}
	},
	h1: {
		start: { startsNewline: true, handle: id("="), defaultStartNewlineCount: 2 },
		end: { endsLine: true, handle: closeHeading("=") }
	},
	h2: {
		start: { startsNewline: true, handle: id("=="), defaultStartNewlineCount: 2 },
		end: { endsLine: true, handle: closeHeading("==") }
	},
	h3: {
		start: { startsNewline: true, handle: id("==="), defaultStartNewlineCount: 2 },
		end: { endsLine: true, handle: closeHeading("===") }
	},
	h4: {
		start: { startsNewline: true, handle: id("===="), defaultStartNewlineCount: 2 },
		end: { endsLine: true, handle: closeHeading("====") }
	},
	h5: {
		start: { startsNewline: true, handle: id("====="), defaultStartNewlineCount: 2 },
		end: { endsLine: true, handle: closeHeading("=====") }
	},
	h6: {
		start: { startsNewline: true, handle: id("======"), defaultStartNewlineCount: 2 },
		end: { endsLine: true, handle: closeHeading("======") }
	},
	br: {
		start: {
			startsNewline: true,
			endsLine: true,
			handle: id("")
		}
	},
	b: {
		start: { handle: id("'''") },
		end: { handle: id("'''") }
	},
	i: {
		start: { handle: id("''") },
		end: { handle: id("''") }
	},
	a: {
		start: {
			// A plain function instead of installCollector.bind( ..., this ):
			// inside the object literal `this` is not the handler object, so
			// the bound version passed the wrong value as the handler.
			handle: function ( state, token ) {
				return installCollector(
					endTagMatchTokenCollector,
					WSP._linkHandler,
					this,
					state, token
				);
			}
		}
	}
};
/**
 * Serialize a list of attributes to `key="value"` pairs separated by single
 * spaces.
 *
 * - A key with an empty value is written as the bare key.
 * - An entry with an empty key but a value is written as the bare value.
 * - Entries with neither are skipped.
 *
 * @param attribs {Array} List of { k: key, v: value } pairs.
 * @returns {String}
 */
WSP._serializeAttributes = function ( attribs ) {
	var out = [];
	for ( var i = 0, l = attribs.length; i < l; i++ ) {
		var kv = attribs[i];
		if (kv.k.length) {
			if ( kv.v.length ) {
				// Escape every double quote. A string pattern passed to
				// replace() only replaces the first occurrence, which left
				// later quotes free to terminate the attribute value.
				out.push( kv.k + '=' +
						'"' + kv.v.replace( /"/g, '&quot;' ) + '"');
			} else {
				out.push( kv.k );
			}
		} else if ( kv.v.length ) {
			// not very likely..
			out.push( kv.v );
		}
	}
	// XXX: round-trip optional whitespace / line breaks etc
	return out.join(' ');
};
/**
 * Serialize a chunk of tokens
 *
 * @param tokens {Array} The tokens to serialize.
 * @param chunkCB {Function} Optional. Called with each emitted chunk of
 *   wikitext.
 * @returns {Array|undefined} The list of emitted chunks if no chunkCB was
 *   passed, undefined otherwise.
 */
WSP.serializeTokens = function( tokens, chunkCB ) {
	var state = $.extend({}, this.initialState, this.options),
		out, i, l;
	// The copy above is shallow: without this, every run would push to and
	// pop from the very same arrays held by initialState, so list contexts
	// would be shared with enclosing or earlier runs.
	if ( state.listStack === this.initialState.listStack ) {
		state.listStack = [];
	}
	if ( state.tokens === this.initialState.tokens ) {
		state.tokens = [];
	}
	state.serializer = this;
	if ( chunkCB === undefined ) {
		out = [];
		state.chunkCB = out.push.bind(out);
	} else {
		state.chunkCB = chunkCB;
	}
	for ( i = 0, l = tokens.length; i < l; i++ ) {
		this._serializeToken( state, tokens[i] );
	}
	return out;
};
// Fallback handler pair: write the tag in plain HTML syntax. Both handlers
// are flagged as newline-equivalent.
WSP.defaultHTMLTagHandler = {
	start: {
		isNewlineEquivalent: true,
		handle: WSP._serializeHTMLTag
	},
	end: {
		isNewlineEquivalent: true,
		handle: WSP._serializeHTMLEndTag
	}
};
// Pick the handler object for a tag token.
//
// Tokens with recorded source (dataAttribs.src) and data-gen="both" are
// round-tripped generically: the start (or self-closing) tag yields the
// recorded source and, for a start tag, a collector is installed that
// swallows the generated content; the end tag yields nothing. Otherwise the
// handler comes from the default HTML handler (stx: 'html' or unknown tag)
// or from this.tagHandlers. An empty object is returned if the chosen entry
// has no handler for the token's side (start / end).
WSP._getTokenHandler = function(state, token) {
	var srcText = token.dataAttribs.src,
		tagHandler;

	if ( srcText !== undefined &&
			state.env.lookup( token.attribs, 'data-gen' ) === 'both' ) {
		// implement generic src round-tripping:
		// return src, and drop the generated content
		switch ( token.constructor ) {
			case TagTk:
				state.tokenCollector = endTagMatchTokenCollector( token );
				return { handle: id( srcText ) };
			case SelfclosingTagTk:
				return { handle: id( srcText ) };
			default: // EndTagTk
				state.tokenCollector = null;
				return { handle: id('') };
		}
	}

	if ( token.dataAttribs.stx === 'html' ) {
		tagHandler = this.defaultHTMLTagHandler;
	} else {
		tagHandler = this.tagHandlers[token.name];
		if ( tagHandler && tagHandler.make ) {
			tagHandler = tagHandler.make( state, token );
		}
	}
	tagHandler = tagHandler || this.defaultHTMLTagHandler;

	var isOpening = token.constructor === TagTk ||
			token.constructor === SelfclosingTagTk;
	return ( isOpening ? tagHandler.start : tagHandler.end ) || {};
};
/**
 * Serialize a token.
 *
 * If a token collector is installed on the state, the token is handed to it
 * first: the collector either keeps collecting (nothing is emitted), finishes
 * with a result (used as the wikitext for the whole collected range), or
 * returns false to request regular processing of this token.
 *
 * Regular processing picks the wikitext for the token by token type and then
 * runs the shared newline bookkeeping: leading and trailing newlines of the
 * result are stripped and counted instead of being emitted right away, and
 * the handler flags (startsNewline, endsLine, pairSepNLCount, singleLine,
 * newlineTransparent, isNewlineEquivalent) decide how many newlines go in
 * front of the next emitted chunk. Output is passed to state.chunkCB.
 *
 * @param state {Object} The serializer state; updated in place.
 * @param token The token to serialize (a String for text).
 */
WSP._serializeToken = function ( state, token ) {
	var res = '',
		collectorResult = false,
		handler = {},
		dropContent = state.dropContent;
	if (state.tokenCollector) {
		collectorResult = state.tokenCollector.collect( state, token );
		if ( collectorResult === true ) {
			// continue collecting
			return;
		} else if ( collectorResult !== false ) {
			res = collectorResult;
			if ( state.tokenCollector.handler ) {
				handler = state.tokenCollector.handler;
			}
			state.tokenCollector = null;
		}
	}
	if ( collectorResult === false ) {
		state.prevToken = state.curToken;
		state.curToken = token;
		// The serializer is logically in a new line context if a new line is pending
		if (state.emitNewlineOnNextToken || (state.availableNewlineCount > 0)) {
			state.onNewline = true;
			state.onStartOfLine = true;
		}
		switch( token.constructor ) {
			case TagTk:
			case SelfclosingTagTk:
				handler = WSP._getTokenHandler( state, token );
				if ( ! handler.ignore ) {
					state.prevTagToken = state.currTagToken;
					state.currTagToken = token;
					res = handler.handle ? handler.handle( state, token ) : '';
				}
				break;
			case EndTagTk:
				handler = WSP._getTokenHandler( state, token );
				if ( ! handler.ignore ) {
					state.prevTagToken = state.currTagToken;
					state.currTagToken = token;
					if ( handler.singleLine < 0 && state.singleLineMode ) {
						state.singleLineMode--;
					}
					res = handler.handle ? handler.handle( state, token ) : '';
				}
				break;
			case String:
				// Text inside nowiki or html-syntax pre is emitted verbatim
				res = ( state.inNoWiki || state.inHTMLPre ) ? token
					: this.escapeWikiText( state, token );
				res = state.textHandler ? state.textHandler( res ) : res;
				break;
			case CommentTk:
				res = '<!--' + token.value + '-->';
				// don't consider comments for changes of the onStartOfLine status
				// XXX: convert all non-tag handlers to a similar handler
				// structure as tags?
				handler = { newlineTransparent: true };
				break;
			case NlTk:
				res = '\n';
				res = state.textHandler ? state.textHandler( res ) : res;
				break;
			case EOFTk:
				// Flush the newlines that are still pending
				res = '';
				for ( var i = 0, l = state.availableNewlineCount; i < l; i++ ) {
					res += '\n';
				}
				state.chunkCB(res);
				break;
			default:
				res = '';
				console.warn( 'Unhandled token type ' + JSON.stringify( token ) );
				break;
		}
	}
	// A handler or collector callback without an explicit return value
	// yields undefined. Treat that as "nothing to emit" instead of failing
	// on the string operations below.
	if ( res === undefined || res === null ) {
		res = '';
	}
	if (! dropContent || ! state.dropContent ) {
		var newTrailingNLCount = 0;
		if (res !== '') {
			// Strip leading or trailing newlines from the returned string.
			// The pattern does not match every string (a lone trailing \r,
			// for example); in that case keep the text as it is.
			var match = res.match( /^((?:\r?\n)*)((?:.*?|[\r\n]+[^\r\n])*?)((?:\r?\n)*)$/ ) ||
					[ res, '', res, '' ],
				leadingNLs = match[1],
				trailingNLs = match[3];
			if (leadingNLs === res) {
				// all newlines, accumulate count, and clear output
				state.availableNewlineCount += leadingNLs.replace(/\r\n/g, '\n').length;
				res = "";
			} else {
				newTrailingNLCount = trailingNLs.replace(/\r\n/g, '\n').length;
				if ( leadingNLs !== '' ) {
					state.availableNewlineCount += leadingNLs.replace(/\r\n/g, '\n').length;
				}
				// strip newlines
				res = match[2];
			}
		}
		// Check if we have a pair of identical tag tokens </p><p>; </ul><ul>; etc.
		// that have to be separated by extra newlines and add those in.
		if (handler.pairSepNLCount && state.prevTagToken &&
				state.prevTagToken.constructor === EndTagTk &&
				state.prevTagToken.name === token.name )
		{
			if ( state.availableNewlineCount < handler.pairSepNLCount) {
				state.availableNewlineCount = handler.pairSepNLCount;
			}
		}
		if (state.env.debug) {
			console.warn(token +
					", res: " + JSON.stringify( res ) +
					", nl: " + state.onNewline +
					", sol: " + state.onStartOfLine +
					', eon:' + state.emitNewlineOnNextToken +
					", #nl: " + state.availableNewlineCount +
					', #new:' + newTrailingNLCount);
		}
		if (res !== '') {
			var out = '';
			// If this is not a html tag and the serializer is not in single-line mode,
			// allocate a newline if
			// - prev token needs a single line,
			// - handler starts a new line and we aren't on a new line,
			//
			// Newline-equivalent tokens (HTML tags for example) don't get
			// implicit newlines.
			if (!handler.isNewlineEquivalent &&
					!state.singleLineMode &&
					!state.availableNewlineCount &&
					((!res.match(/^\s*$/) && state.emitNewlineOnNextToken) ||
					(!state.onStartOfLine && handler.startsNewline)))
			{
				state.availableNewlineCount = handler.defaultStartNewlineCount || 1;
			}
			// Add required # of new lines in the beginning
			for (; state.availableNewlineCount; state.availableNewlineCount--) {
				out += '\n';
			}
			if ( state.singleLineMode ) {
				res = res.replace(/\n/g, ' ');
			}
			out += res;
			state.env.dp(' =>', out);
			state.chunkCB( out );
			// Update new line state
			// 1. If this token generated new trailing new lines, we are in a newline state again.
			// If not, we are not! But, handle onStartOfLine specially.
			if (newTrailingNLCount > 0) {
				state.availableNewlineCount = newTrailingNLCount;
				state.onNewline = true;
				state.onStartOfLine = true;
			} else {
				state.availableNewlineCount = 0;
				state.onNewline = false;
				if (!handler.newlineTransparent) {
					state.onStartOfLine = false;
				}
			}
			// 2. Previous token nl state is no longer relevant
			state.emitNewlineOnNextToken = false;
		} else if ( handler.startsNewline && !state.onStartOfLine ) {
			state.emitNewlineOnNextToken = true;
		}
		if (handler.endsLine) {
			// Record end of line
			state.emitNewlineOnNextToken = true;
		}
		if ( handler.singleLine > 0 ) {
			state.singleLineMode += handler.singleLine;
		}
	}
};
/**
 * Serialize an HTML DOM document.
 *
 * @param node The root DOM node to serialize.
 * @param chunkCB {Function} Optional. Called with each emitted chunk of
 *   wikitext.
 * @returns {String|undefined} The complete wikitext if no chunkCB was
 *   passed, undefined otherwise. If serialization throws, the stack trace is
 *   logged with console.warn and undefined is returned.
 */
WSP.serializeDOM = function( node, chunkCB ) {
	try {
		var state = $.extend({}, this.initialState, this.options),
			out;
		// The copy above is shallow: without this, all runs would share the
		// arrays held by initialState, and list contexts left behind by an
		// aborted run would leak into the next one.
		if ( state.listStack === this.initialState.listStack ) {
			state.listStack = [];
		}
		if ( state.tokens === this.initialState.tokens ) {
			state.tokens = [];
		}
		state.serializer = this;
		if ( ! chunkCB ) {
			out = [];
			state.chunkCB = out.push.bind( out );
		} else {
			state.chunkCB = chunkCB;
		}
		this._serializeDOM( node, state );
		// Flush pending output
		this._serializeToken( state, new EOFTk() );
		if ( out ) {
			return out.join('');
		}
	} catch (e) {
		console.warn(e.stack);
	}
};
/**
 * Internal worker. Walks a DOM subtree depth-first, turns each node into
 * tokens (start tag, children, end tag for elements; the text for text
 * nodes; a comment token for comments) and feeds them to _serializeToken.
 * Other node types are reported with console.warn and skipped.
 */
WSP._serializeDOM = function( node, state ) {
	var type = node.nodeType;
	if ( type === Node.ELEMENT_NODE ) {
		var kids = node.childNodes,
			tagName = node.nodeName.toLowerCase(),
			attribs = this._getDOMAttribs( node.attributes ),
			rtInfo = this._getDOMRTInfo( node.attributes );
		// start token, then the children, then the end token
		this._serializeToken( state, new TagTk( tagName, attribs, rtInfo ) );
		for ( var idx = 0, count = kids.length; idx < count; idx++ ) {
			this._serializeDOM( kids[idx], state );
		}
		this._serializeToken( state, new EndTagTk( tagName, attribs, rtInfo ) );
	} else if ( type === Node.TEXT_NODE ) {
		this._serializeToken( state, node.data );
	} else if ( type === Node.COMMENT_NODE ) {
		// delay the newline creation until after the comment
		var pendingNewline = state.emitNewlineOnNextToken;
		state.emitNewlineOnNextToken = false;
		this._serializeToken( state, new CommentTk( node.data ) );
		state.emitNewlineOnNextToken = pendingNewline;
	} else {
		console.warn( "Unhandled node type: " +
				node.outerHTML );
	}
};
// Convert a DOM attribute collection to a list of { k, v } pairs, leaving
// out the data-rt attribute.
WSP._getDOMAttribs = function( attribs ) {
	var kvs = [],
		idx = 0,
		count = attribs.length,
		attr;
	while ( idx < count ) {
		attr = attribs.item( idx++ );
		if ( attr.name === 'data-rt' ) {
			continue;
		}
		kvs.push( { k: attr.name, v: attr.value } );
	}
	return kvs;
};
// Parse the JSON stored in the data-rt attribute. Yields an empty object if
// the attribute is missing or its value is empty.
WSP._getDOMRTInfo = function( attribs ) {
	var rtAttr = attribs['data-rt'];
	if ( !rtAttr ) {
		return {};
	}
	return JSON.parse( rtAttr.value || '{}' );
};
// Quick HACK: define Node constants locally
// https://developer.mozilla.org/en/nodeType
//
// The names are listed in the order of their numeric values, starting at 1.
var Node = {};
[
	'ELEMENT_NODE',
	'ATTRIBUTE_NODE',
	'TEXT_NODE',
	'CDATA_SECTION_NODE',
	'ENTITY_REFERENCE_NODE',
	'ENTITY_NODE',
	'PROCESSING_INSTRUCTION_NODE',
	'COMMENT_NODE',
	'DOCUMENT_NODE',
	'DOCUMENT_TYPE_NODE',
	'DOCUMENT_FRAGMENT_NODE',
	'NOTATION_NODE'
].forEach( function ( name, index ) {
	Node[name] = index + 1;
} );
// Export the constructor when a `module` object is available.
if (typeof module === "object") {
	module.exports.WikitextSerializer = WikitextSerializer;
}