mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-29 08:34:54 +00:00
6054a4aa14
Queued newlines and new trailing newlines were not cleanly separated so far, which caused some trailing newlines to be consumed for needed leading newlines. This change fixes several newline bugs, taking the number of passing round-trip tests from 276 back up to 282. Change-Id: Idb4706e15ce71e63085033e3f3f29557915c11a8
891 lines
23 KiB
JavaScript
891 lines
23 KiB
JavaScript
/**
|
|
* Serializes a chunk of tokens or an HTML DOM to MediaWiki's wikitext flavor.
|
|
*
|
|
* @class
|
|
* @constructor
|
|
* @param options {Object} List of options for serialization
|
|
*/
|
|
WikitextSerializer = function( options ) {
|
|
this.options = $.extend( {
|
|
// defaults
|
|
}, options || {} );
|
|
};
|
|
|
|
require('./core-upgrade.js');
|
|
var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer;
|
|
|
|
var WSP = WikitextSerializer.prototype;
|
|
|
|
WSP.defaultOptions = {
|
|
onNewline: true, // actually following start of file or a real newline
|
|
onStartOfLine : true, // in start-of-line context, not necessarily after newline
|
|
listStack: [],
|
|
lastHandler: null,
|
|
availableNewlineCount: 0, // collected (and stripped) newlines from the input
|
|
singleLineMode: 0 // single-line syntactical context: list items, headings etc
|
|
};
|
|
|
|
WSP.escapeWikiText = function ( state, text ) {
|
|
// tokenize the text
|
|
var p = new PegTokenizer( state.env ),
|
|
tokens = [];
|
|
p.on('chunk', function ( chunk ) {
|
|
//console.warn( JSON.stringify(chunk));
|
|
tokens.push.apply( tokens, chunk );
|
|
});
|
|
p.on('end', function(){
|
|
//console.warn( JSON.stringify('end'));
|
|
});
|
|
// this is synchronous for now, will still need sync version later, or
|
|
// alternatively make text processing in the serializer async
|
|
if ( ! state.onNewline ) {
|
|
// Prefix '_' so that no start-of-line wiki syntax matches. Strip it from
|
|
// the result.
|
|
p.process( '_' + text );
|
|
// now strip the leading underscore.
|
|
if ( tokens[0] === '_' ) {
|
|
tokens.shift();
|
|
} else {
|
|
tokens[0] = tokens[0].substr(1);
|
|
}
|
|
} else {
|
|
p.process( text );
|
|
}
|
|
//
|
|
// wrap any run of non-text tokens into <nowiki> tags using the source
|
|
// offsets of top-level productions
|
|
// return the updated text
|
|
var outTexts = [],
|
|
nonTextTokenAccum = [],
|
|
cursor = 0;
|
|
function wrapNonTextTokens () {
|
|
if ( nonTextTokenAccum.length ) {
|
|
var missingRangeEnd = false;
|
|
// TODO: make sure the source positions are always set!
|
|
// The start range
|
|
var startRange = nonTextTokenAccum[0].dataAttribs.tsr,
|
|
rangeStart, rangeEnd;
|
|
if ( ! startRange ) {
|
|
console.warn( 'No tsr on ' + nonTextTokenAccum[0] );
|
|
rangeStart = cursor;
|
|
} else {
|
|
rangeStart = startRange[0];
|
|
if ( ! state.onNewline ) {
|
|
// compensate for underscore.
|
|
rangeStart--;
|
|
}
|
|
cursor = rangeStart;
|
|
}
|
|
|
|
var endRange = nonTextTokenAccum.last().dataAttribs.tsr;
|
|
if ( ! endRange ) {
|
|
// FIXME: improve this!
|
|
//rangeEnd = state.env.tokensToString( tokens ).length;
|
|
// Be conservative and extend the range to the end for now.
|
|
// Alternatives: only extend it to the next token with range
|
|
// info on it.
|
|
missingRangeEnd = true;
|
|
rangeEnd = text.length;
|
|
} else {
|
|
rangeEnd = endRange[1];
|
|
if ( ! state.onNewline ) {
|
|
// compensate for underscore.
|
|
rangeEnd--;
|
|
}
|
|
}
|
|
|
|
var escapedSource = text.substr( rangeStart, rangeEnd - rangeStart )
|
|
.replace( /<(\/?nowiki)>/g, '<$1>' );
|
|
outTexts.push( '<nowiki>' );
|
|
outTexts.push( escapedSource );
|
|
outTexts.push( '</nowiki>' );
|
|
cursor += 17 + escapedSource.length;
|
|
if ( missingRangeEnd ) {
|
|
throw 'No tsr on end token: ' + nonTextTokenAccum.last();
|
|
}
|
|
nonTextTokenAccum = [];
|
|
}
|
|
}
|
|
try {
|
|
for ( var i = 0, l = tokens.length; i < l; i++ ) {
|
|
var token = tokens[i];
|
|
switch ( token.constructor ) {
|
|
case String:
|
|
wrapNonTextTokens();
|
|
outTexts.push( token );
|
|
cursor += token.length;
|
|
break;
|
|
case NlTk:
|
|
wrapNonTextTokens();
|
|
outTexts.push( '\n' );
|
|
cursor++;
|
|
break;
|
|
case EOFTk:
|
|
wrapNonTextTokens();
|
|
break;
|
|
default:
|
|
//console.warn('pushing ' + token);
|
|
nonTextTokenAccum.push(token);
|
|
break;
|
|
}
|
|
}
|
|
} catch ( e ) {
|
|
console.warn( e );
|
|
}
|
|
//console.warn( 'escaped wikiText: ' + outTexts.join('') );
|
|
return outTexts.join('');
|
|
};
|
|
|
|
var id = function(v) {
|
|
return function( state ) {
|
|
return v;
|
|
};
|
|
};
|
|
|
|
WSP._listHandler = function( bullet, state, token ) {
|
|
function isListItem(token) {
|
|
if (token.constructor !== TagTk) return false;
|
|
|
|
var tokenName = token.name;
|
|
return (tokenName === 'li' || tokenName === 'dt' || tokenName === 'dd');
|
|
}
|
|
if ( state.singleLineMode ) {
|
|
state.singleLineMode--;
|
|
}
|
|
|
|
var bullets, res;
|
|
var stack = state.listStack;
|
|
if (stack.length === 0) {
|
|
bullets = bullet;
|
|
res = bullets;
|
|
} else {
|
|
var curList = stack[stack.length - 1];
|
|
bullets = curList.bullets + bullet;
|
|
curList.itemCount++;
|
|
if ( // deeply nested list
|
|
curList.itemCount > 2 ||
|
|
// A nested list, not directly after a list item
|
|
(curList.itemCount > 1 && !isListItem(state.prevToken))) {
|
|
res = bullets;
|
|
} else {
|
|
res = bullet;
|
|
}
|
|
}
|
|
stack.push({ itemCount: 0, bullets: bullets});
|
|
return res;
|
|
};
|
|
|
|
WSP._listEndHandler = function( state, token ) {
|
|
state.listStack.pop();
|
|
return '';
|
|
};
|
|
|
|
WSP._listItemHandler = function ( bullet, state, token ) {
|
|
var stack = state.listStack;
|
|
var curList = stack[stack.length - 1];
|
|
curList.itemCount++;
|
|
return ((curList.itemCount > 1 ) ? curList.bullets + bullet : bullet);
|
|
};
|
|
|
|
WSP._serializeTableTag = function ( symbol, optionEndSymbol, state, token ) {
|
|
if ( token.attribs.length ) {
|
|
return symbol + ' ' + WSP._serializeAttributes( token.attribs ) + optionEndSymbol;
|
|
} else {
|
|
return symbol;
|
|
}
|
|
};
|
|
|
|
WSP._emptyTags = { br: true, meta: true };
|
|
|
|
WSP._serializeHTMLTag = function ( state, token ) {
|
|
var close = '';
|
|
if ( WSP._emptyTags[ token.name ] ) {
|
|
close = '/';
|
|
}
|
|
|
|
// Swallow required newline from previous token on encountering a HTML tag
|
|
//state.emitNewlineOnNextToken = false;
|
|
|
|
if ( token.attribs.length ) {
|
|
return '<' + token.name + ' ' +
|
|
WSP._serializeAttributes( token.attribs ) + close + '>';
|
|
} else {
|
|
return '<' + token.name + close + '>';
|
|
}
|
|
};
|
|
|
|
WSP._serializeHTMLEndTag = function ( state, token ) {
|
|
if ( ! WSP._emptyTags[ token.name ] ) {
|
|
return '</' + token.name + '>';
|
|
} else {
|
|
return '';
|
|
}
|
|
};
|
|
|
|
WSP._linkHandler = function( state, token ) {
|
|
//return '[[';
|
|
// TODO: handle internal/external links etc using RDFa and dataAttribs
|
|
// Also convert unannotated html links to external wiki links for html
|
|
// import. Might want to consider converting relative links without path
|
|
// component and file extension to wiki links.
|
|
|
|
var env = state.env;
|
|
var attribDict = env.KVtoHash( token.attribs );
|
|
if ( attribDict.rel && attribDict.href !== undefined ) {
|
|
var tokenData = token.dataAttribs;
|
|
if ( attribDict.rel === 'mw:wikiLink' ) {
|
|
var base = env.wgScriptPath;
|
|
var href = attribDict.href;
|
|
var prefix = href.substr(0, base.length);
|
|
var target = (prefix === base) ? href.substr(base.length) : href;
|
|
target = decodeURIComponent(target);
|
|
|
|
var tail = tokenData.tail;
|
|
if ( tail && tail.length ) {
|
|
state.dropTail = tail;
|
|
target = tokenData.gc ? tokenData.sHref : target.replace( /_/g, ' ' );
|
|
} else {
|
|
var origLinkTgt = tokenData.sHref;
|
|
if (origLinkTgt) {
|
|
// Normalize the source target so that we can compare it
|
|
// with href.
|
|
var normalizedOrigLinkTgt = env.normalizeTitle( env.tokensToString(origLinkTgt) );
|
|
if ( normalizedOrigLinkTgt === target ) {
|
|
// Non-standard capitalization
|
|
target = origLinkTgt;
|
|
}
|
|
} else {
|
|
target = target.replace( /_/g, ' ' );
|
|
}
|
|
}
|
|
|
|
// FIXME: Properly handle something like [[{{Foo}}]]s
|
|
target = env.tokensToString( target );
|
|
|
|
if ( tokenData.gc ) {
|
|
state.dropContent = true;
|
|
return '[[' + target;
|
|
} else {
|
|
return '[[' + target + '|';
|
|
}
|
|
} else if ( attribDict.rel === 'mw:extLink' ) {
|
|
if ( tokenData.stx === 'urllink' ) {
|
|
state.dropContent = true;
|
|
return attribDict.href;
|
|
} else if ( tokenData.gc ) {
|
|
state.dropContent = true;
|
|
return '[' + attribDict.href;
|
|
} else {
|
|
return '[' + attribDict.href + ' ';
|
|
}
|
|
} else {
|
|
return WSP._serializeHTMLTag( state, token );
|
|
}
|
|
} else {
|
|
return WSP._serializeHTMLTag( state, token );
|
|
}
|
|
|
|
//if ( rtinfo.type === 'wikilink' ) {
|
|
// return '[[' + rtinfo.target + ']]';
|
|
//} else {
|
|
// // external link
|
|
// return '[' + rtinfo.
|
|
};
|
|
WSP._linkEndHandler = function( state, token ) {
|
|
var attribDict = state.env.KVtoHash( token.attribs );
|
|
if ( attribDict.rel && attribDict.href !== undefined ) {
|
|
if ( attribDict.rel === 'mw:wikiLink' ) {
|
|
state.dropContent = false;
|
|
state.dropTail = false;
|
|
return "]]" + (token.dataAttribs.tail ? token.dataAttribs.tail : "");
|
|
} else if ( attribDict.rel === 'mw:extLink' ) {
|
|
state.dropContent = false;
|
|
return (token.dataAttribs.stx === 'urllink') ? '' : ']';
|
|
} else {
|
|
return WSP._serializeHTMLEndTag( state, token );
|
|
}
|
|
} else {
|
|
return WSP._serializeHTMLEndTag( state, token );
|
|
}
|
|
};
|
|
|
|
WSP.tagHandlers = {
|
|
body: {
|
|
start: {
|
|
handle: function(state, token) {
|
|
// swallow trailing new line
|
|
state.emitNewlineOnNextToken = false;
|
|
return '';
|
|
}
|
|
}
|
|
},
|
|
ul: {
|
|
start: {
|
|
startsNewline : true,
|
|
handle: WSP._listHandler.bind( null, '*' ),
|
|
pairSepNLCount: 2,
|
|
newlineTransparent: true
|
|
},
|
|
end: {
|
|
endsLine: true,
|
|
handle: WSP._listEndHandler
|
|
}
|
|
},
|
|
ol: {
|
|
start: {
|
|
startsNewline : true,
|
|
handle: WSP._listHandler.bind( null, '#' ),
|
|
pairSepNLCount: 2,
|
|
newlineTransparent: true
|
|
},
|
|
end: {
|
|
endsLine : true,
|
|
handle: WSP._listEndHandler
|
|
}
|
|
},
|
|
dl: {
|
|
start: {
|
|
startsNewline : true,
|
|
handle: WSP._listHandler.bind( null, ''),
|
|
pairSepNLCount: 2
|
|
},
|
|
end: {
|
|
endsLine: true,
|
|
handle: WSP._listEndHandler
|
|
}
|
|
},
|
|
li: {
|
|
start: {
|
|
handle: WSP._listItemHandler.bind( null, '' ),
|
|
singleLine: 1,
|
|
pairSepNLCount: 1
|
|
},
|
|
end: {
|
|
singleLine: -1
|
|
}
|
|
},
|
|
// XXX: handle single-line vs. multi-line dls etc
|
|
dt: {
|
|
start: {
|
|
singleLine: 1,
|
|
handle: WSP._listItemHandler.bind( null, ';' ),
|
|
pairSepNLCount: 1,
|
|
newlineTransparent: true
|
|
},
|
|
end: {
|
|
singleLine: -1
|
|
}
|
|
},
|
|
dd: {
|
|
start: {
|
|
singleLine: 1,
|
|
handle: WSP._listItemHandler.bind( null, ":" ),
|
|
pairSepNLCount: 1,
|
|
newlineTransparent: true
|
|
},
|
|
end: {
|
|
endsLine: true,
|
|
singleLine: -1
|
|
}
|
|
},
|
|
// XXX: handle options
|
|
table: {
|
|
start: {
|
|
handle: WSP._serializeTableTag.bind(null, "{|", '')
|
|
},
|
|
end: {
|
|
handle: function(state, token) {
|
|
if ( state.prevTagToken && state.prevTagToken.name === 'tr' ) {
|
|
this.startsNewline = true;
|
|
} else {
|
|
this.startsNewline = false;
|
|
}
|
|
return "|}";
|
|
}
|
|
}
|
|
},
|
|
tbody: { start: { ignore: true }, end: { ignore: true } },
|
|
th: {
|
|
start: {
|
|
handle: function ( state, token ) {
|
|
if ( token.dataAttribs.stx_v === 'row' ) {
|
|
this.startsNewline = false;
|
|
return WSP._serializeTableTag("!!", ' |', state, token);
|
|
} else {
|
|
this.startsNewline = true;
|
|
return WSP._serializeTableTag( "!", ' |', state, token);
|
|
}
|
|
}
|
|
}
|
|
},
|
|
tr: {
|
|
start: {
|
|
handle: function ( state, token ) {
|
|
if ( state.prevToken.constructor === TagTk && state.prevToken.name === 'tbody' ) {
|
|
// Omit for first row in a table. XXX: support optional trs
|
|
// for first line (in source wikitext) too using some flag in
|
|
// data-mw (stx: 'wikitext' ?)
|
|
return '';
|
|
} else {
|
|
return WSP._serializeTableTag("|-", '', state, token );
|
|
}
|
|
},
|
|
startsNewline: true
|
|
}
|
|
},
|
|
td: {
|
|
start: {
|
|
handle: function ( state, token ) {
|
|
if ( token.dataAttribs.stx_v === 'row' ) {
|
|
this.startsNewline = false;
|
|
return WSP._serializeTableTag("||", ' |', state, token);
|
|
} else {
|
|
this.startsNewline = true;
|
|
return WSP._serializeTableTag("|", ' |', state, token);
|
|
}
|
|
}
|
|
}
|
|
},
|
|
caption: {
|
|
start: {
|
|
startsNewline: true,
|
|
handle: WSP._serializeTableTag.bind(null, "|+", ' |')
|
|
}
|
|
},
|
|
p: {
|
|
make: function(state, token) {
|
|
// Special case handling in a list context
|
|
// VE embeds list content in paragraph tags
|
|
return state.singleLineMode ? WSP.defaultHTMLTagHandler : this;
|
|
},
|
|
start: {
|
|
startsNewline : true,
|
|
pairSepNLCount: 2
|
|
},
|
|
end: {
|
|
endsLine: true
|
|
}
|
|
},
|
|
// XXX: support indent variant instead by registering a newline handler?
|
|
pre: {
|
|
start: {
|
|
startsNewline: true,
|
|
handle: function( state, token ) {
|
|
state.textHandler = function( t ) {
|
|
return t.replace(/\n/g, '\n ' );
|
|
};
|
|
return ' ';
|
|
}
|
|
},
|
|
end: {
|
|
endsLine: true,
|
|
handle: function( state, token) { state.textHandler = null; return ''; }
|
|
}
|
|
},
|
|
meta: {
|
|
start: {
|
|
handle: function ( state, token ) {
|
|
var argDict = state.env.KVtoHash( token.attribs );
|
|
if ( argDict['typeof'] === 'mw:tag' ) {
|
|
// we use this currently for nowiki and noinclude & co
|
|
this.newlineTransparent = true;
|
|
if ( argDict.content === 'nowiki' ) {
|
|
state.inNoWiki = true;
|
|
} else if ( argDict.content === '/nowiki' ) {
|
|
state.inNoWiki = false;
|
|
} else {
|
|
console.warn( JSON.stringify( argDict ) );
|
|
}
|
|
return '<' + argDict.content + '>';
|
|
} else {
|
|
this.newlineTransparent = false;
|
|
return WSP._serializeHTMLTag( state, token );
|
|
}
|
|
}
|
|
}
|
|
},
|
|
hr: {
|
|
start: { startsNewline: true, handle: id("----") },
|
|
end: { endsLine: true }
|
|
},
|
|
h1: {
|
|
start: { startsNewline: true, handle: id("=") },
|
|
end: { endsLine: true, handle: id("=") }
|
|
},
|
|
h2: {
|
|
start: { startsNewline: true, handle: id("==") },
|
|
end: { endsLine: true, handle: id("==") }
|
|
},
|
|
h3: {
|
|
start: { startsNewline: true, handle: id("===") },
|
|
end: { endsLine: true, handle: id("===") }
|
|
},
|
|
h4: {
|
|
start: { startsNewline: true, handle: id("====") },
|
|
end: { endsLine: true, handle: id("====") }
|
|
},
|
|
h5: {
|
|
start: { startsNewline: true, handle: id("=====") },
|
|
end: { endsLine: true, handle: id("=====") }
|
|
},
|
|
h6: {
|
|
start: { startsNewline: true, handle: id("======") },
|
|
end: { endsLine: true, handle: id("======") }
|
|
},
|
|
br: {
|
|
start: { startsNewline: true, handle: id("") },
|
|
end: { endsLine: true }
|
|
},
|
|
b: {
|
|
start: { handle: id("'''") },
|
|
end: { handle: id("'''") }
|
|
},
|
|
i: {
|
|
start: { handle: id("''") },
|
|
end: { handle: id("''") }
|
|
},
|
|
a: {
|
|
start: { handle: WSP._linkHandler },
|
|
end: { handle: WSP._linkEndHandler }
|
|
}
|
|
};
|
|
|
|
|
|
WSP._serializeAttributes = function ( attribs ) {
|
|
var out = [];
|
|
for ( var i = 0, l = attribs.length; i < l; i++ ) {
|
|
var kv = attribs[i];
|
|
if (kv.k.length) {
|
|
if ( kv.v.length ) {
|
|
out.push( kv.k + '=' +
|
|
'"' + kv.v.replace( '"', '"' ) + '"');
|
|
} else {
|
|
out.push( kv.k );
|
|
}
|
|
} else if ( kv.v.length ) {
|
|
// not very likely..
|
|
out.push( kv.v );
|
|
}
|
|
}
|
|
// XXX: round-trip optional whitespace / line breaks etc
|
|
return out.join(' ');
|
|
};
|
|
|
|
/**
|
|
* Serialize a chunk of tokens
|
|
*/
|
|
WSP.serializeTokens = function( tokens, chunkCB ) {
|
|
var state = $.extend({}, this.defaultOptions, this.options),
|
|
i, l;
|
|
if ( chunkCB === undefined ) {
|
|
var out = [];
|
|
state.chunkCB = out.push.bind(out);
|
|
for ( i = 0, l = tokens.length; i < l; i++ ) {
|
|
this._serializeToken( state, tokens[i] );
|
|
}
|
|
return out;
|
|
} else {
|
|
state.chunkCB = chunkCB;
|
|
for ( i = 0, l = tokens.length; i < l; i++ ) {
|
|
this._serializeToken( state, tokens[i] );
|
|
}
|
|
}
|
|
};
|
|
|
|
WSP.defaultHTMLTagHandler = {
|
|
start: { handle: WSP._serializeHTMLTag },
|
|
end : { handle: WSP._serializeHTMLEndTag }
|
|
};
|
|
|
|
WSP._getTokenHandler = function(state, token) {
|
|
var handler;
|
|
if (token.dataAttribs.stx === 'html') {
|
|
handler = this.defaultHTMLTagHandler;
|
|
} else {
|
|
var tname = token.name;
|
|
handler = this.tagHandlers[tname];
|
|
if ( handler && handler.make ) {
|
|
handler = handler.make(state, token);
|
|
}
|
|
}
|
|
|
|
if ( ! handler ) {
|
|
handler = this.defaultHTMLTagHandler;
|
|
}
|
|
if ( token.constructor === TagTk || token.constructor === SelfclosingTagTk ) {
|
|
return handler.start || {};
|
|
} else {
|
|
return handler.end || {};
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Serialize a token.
|
|
*/
|
|
WSP._serializeToken = function ( state, token ) {
|
|
var handler = {},
|
|
res = '',
|
|
dropContent = state.dropContent;
|
|
|
|
state.prevToken = state.curToken;
|
|
state.curToken = token;
|
|
|
|
|
|
switch( token.constructor ) {
|
|
case TagTk:
|
|
case SelfclosingTagTk:
|
|
handler = WSP._getTokenHandler( state, token );
|
|
if ( ! handler.ignore ) {
|
|
state.prevTagToken = state.currTagToken;
|
|
state.currTagToken = token;
|
|
res = handler.handle ? handler.handle( state, token ) : '';
|
|
}
|
|
break;
|
|
case EndTagTk:
|
|
handler = WSP._getTokenHandler( state, token );
|
|
if ( ! handler.ignore ) {
|
|
state.prevTagToken = state.currTagToken;
|
|
state.currTagToken = token;
|
|
if ( handler.singleLine < 0 && state.singleLineMode ) {
|
|
state.singleLineMode--;
|
|
}
|
|
res = handler.handle ? handler.handle( state, token ) : '';
|
|
}
|
|
break;
|
|
case String:
|
|
res = state.inNoWiki? token : this.escapeWikiText( state, token );
|
|
res = state.textHandler ? state.textHandler( res ) : res;
|
|
break;
|
|
case CommentTk:
|
|
res = '<!--' + token.value + '-->';
|
|
// don't consider comments for changes of the onStartOfLine status
|
|
// XXX: convert all non-tag handlers to a similar handler
|
|
// structure as tags?
|
|
handler = { newlineTransparent: true };
|
|
break;
|
|
case NlTk:
|
|
res = '\n';
|
|
res = state.textHandler ? state.textHandler( res ) : res;
|
|
break;
|
|
case EOFTk:
|
|
res = '';
|
|
for ( var i = 0, l = state.availableNewlineCount; i < l; i++ ) {
|
|
res += '\n';
|
|
}
|
|
state.chunkCB(res);
|
|
break;
|
|
default:
|
|
res = '';
|
|
console.warn( 'Unhandled token type ' + JSON.stringify( token ) );
|
|
break;
|
|
}
|
|
|
|
|
|
if (! dropContent || ! state.dropContent ) {
|
|
|
|
var newNLCount = 0;
|
|
if (res !== '') {
|
|
// Strip leading or trailing newlines from the returned string
|
|
var match = res.match( /^((?:\r?\n)*)((?:.*?|[\r\n]+[^\r\n])*?)((?:\r?\n)*)$/ ),
|
|
leadingNLs = match[1],
|
|
trailingNLs = match[3];
|
|
|
|
if (leadingNLs === res) {
|
|
// all newlines, accumulate count, and clear output
|
|
state.availableNewlineCount += leadingNLs.replace(/\r\n/g, '\n').length;
|
|
res = "";
|
|
} else {
|
|
newNLCount = trailingNLs.replace(/\r\n/g, '\n').length;
|
|
if ( leadingNLs !== '' ) {
|
|
state.availableNewlineCount += leadingNLs.replace(/\r\n/g, '\n').length;
|
|
}
|
|
// strip newlines
|
|
res = match[2];
|
|
}
|
|
}
|
|
|
|
// Check if we have a pair of identical tag tokens </p><p>; </ul><ul>; etc.
|
|
// that have to be separated by extra newlines and add those in.
|
|
if (handler.pairSepNLCount && state.prevTagToken &&
|
|
state.prevTagToken.constructor === EndTagTk &&
|
|
state.prevTagToken.name == token.name )
|
|
{
|
|
if ( state.availableNewlineCount < handler.pairSepNLCount) {
|
|
state.availableNewlineCount = handler.pairSepNLCount;
|
|
}
|
|
}
|
|
|
|
if ( state.env.debug ) {
|
|
console.warn(token + " -> " + res + ", onnl: " + state.onNewline + ", #nls " +
|
|
state.availableNewlineCount + ', new ' + newNLCount);
|
|
}
|
|
if (res !== '' ) {
|
|
var out = '';
|
|
// Prev token's new line token
|
|
if ( !state.singleLineMode &&
|
|
( ( !res.match(/^\s*$/) && state.emitNewlineOnNextToken ) ||
|
|
( handler.startsNewline && !state.onStartOfLine ) ) )
|
|
{
|
|
// Emit new line, if necessary
|
|
if ( ! state.availableNewlineCount ) {
|
|
state.availableNewlineCount++;
|
|
}
|
|
state.emitNewlineOnNextToken = false;
|
|
}
|
|
|
|
if ( state.availableNewlineCount ) {
|
|
state.onNewline = true;
|
|
state.onStartOfLine = true;
|
|
}
|
|
|
|
// Add required # of new lines in the beginning
|
|
for (; state.availableNewlineCount; state.availableNewlineCount--) {
|
|
out += '\n';
|
|
}
|
|
|
|
state.availableNewlineCount = newNLCount;
|
|
|
|
// FIXME: This might modify not just the last content token in a
|
|
// link, which would be wrong. We'll likely have to collect tokens
|
|
// between a tags instead, and strip only the last content token.
|
|
if (state.dropTail && res.substr(- state.dropTail.length) === state.dropTail) {
|
|
res = res.substr(0, res.length - state.dropTail.length);
|
|
}
|
|
|
|
if ( state.singleLineMode ) {
|
|
res = res.replace(/\n/g, ' ');
|
|
}
|
|
out += res;
|
|
if ( res !== '' ) {
|
|
state.onNewline = false;
|
|
if ( !handler.newlineTransparent ) {
|
|
state.onStartOfLine = false;
|
|
}
|
|
}
|
|
state.env.dp(' =>', out);
|
|
state.chunkCB( out );
|
|
} else {
|
|
state.availableNewlineCount += newNLCount;
|
|
if ( handler.startsNewline && ! state.onStartOfLine ) {
|
|
state.emitNewlineOnNextToken = true;
|
|
}
|
|
}
|
|
/* else {
|
|
console.warn("SILENT: tok: " + token + ", res: <" + res + ">" + ", onnl: " + state.onNewline + ", # nls: " + state.availableNewlineCount);
|
|
}
|
|
*/
|
|
|
|
if (handler.endsLine) {
|
|
// Record end of line
|
|
state.emitNewlineOnNextToken = true;
|
|
}
|
|
if ( handler.singleLine > 0 ) {
|
|
state.singleLineMode += handler.singleLine;
|
|
}
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Serialize an HTML DOM document.
|
|
*/
|
|
WSP.serializeDOM = function( node, chunkCB ) {
|
|
var state = $.extend({}, this.defaultOptions, this.options);
|
|
//console.warn( node.innerHTML );
|
|
if ( ! chunkCB ) {
|
|
var out = [];
|
|
state.chunkCB = out.push.bind( out );
|
|
this._serializeDOM( node, state );
|
|
this._serializeToken( state, new EOFTk() );
|
|
return out.join('');
|
|
} else {
|
|
state.chunkCB = chunkCB;
|
|
this._serializeDOM( node, state );
|
|
this._serializeToken( state, new EOFTk() );
|
|
}
|
|
};
|
|
|
|
/**
|
|
* Internal worker. Recursively serialize a DOM subtree by creating tokens and
|
|
* calling _serializeToken on each of these.
|
|
*/
|
|
WSP._serializeDOM = function( node, state ) {
|
|
// serialize this node
|
|
switch( node.nodeType ) {
|
|
case Node.ELEMENT_NODE:
|
|
//console.warn( node.nodeName.toLowerCase() );
|
|
var children = node.childNodes,
|
|
name = node.nodeName.toLowerCase(),
|
|
tkAttribs = this._getDOMAttribs(node.attributes),
|
|
tkRTInfo = this._getDOMRTInfo(node.attributes);
|
|
|
|
// Serialize the start token
|
|
this._serializeToken(state, new TagTk(name, tkAttribs, tkRTInfo));
|
|
|
|
// then children
|
|
for ( var i = 0, l = children.length; i < l; i++ ) {
|
|
this._serializeDOM( children[i], state );
|
|
}
|
|
|
|
// then the end token
|
|
this._serializeToken(state, new EndTagTk(name, tkAttribs, tkRTInfo));
|
|
break;
|
|
case Node.TEXT_NODE:
|
|
this._serializeToken( state, node.data );
|
|
break;
|
|
case Node.COMMENT_NODE:
|
|
// delay the newline creation until after the comment
|
|
var savedEmitNewlineOnNextToken = state.emitNewlineOnNextToken;
|
|
state.emitNewlineOnNextToken = false;
|
|
this._serializeToken( state, new CommentTk( node.data ) );
|
|
state.emitNewlineOnNextToken = savedEmitNewlineOnNextToken;
|
|
break;
|
|
default:
|
|
console.warn( "Unhandled node type: " +
|
|
node.outerHTML );
|
|
break;
|
|
}
|
|
};
|
|
|
|
WSP._getDOMAttribs = function( attribs ) {
|
|
// convert to list fo key-value pairs
|
|
var out = [];
|
|
for ( var i = 0, l = attribs.length; i < l; i++ ) {
|
|
var attrib = attribs.item(i);
|
|
if ( attrib.name !== 'data-mw' ) {
|
|
out.push( { k: attrib.name, v: attrib.value } );
|
|
}
|
|
}
|
|
return out;
|
|
};
|
|
|
|
WSP._getDOMRTInfo = function( attribs ) {
|
|
if ( attribs['data-mw'] ) {
|
|
return JSON.parse( attribs['data-mw'].value || '{}' );
|
|
} else {
|
|
return {};
|
|
}
|
|
};
|
|
|
|
|
|
// Quick HACK: define Node constants locally
|
|
// https://developer.mozilla.org/en/nodeType
|
|
var Node = {
|
|
ELEMENT_NODE: 1,
|
|
ATTRIBUTE_NODE: 2,
|
|
TEXT_NODE: 3,
|
|
CDATA_SECTION_NODE: 4,
|
|
ENTITY_REFERENCE_NODE: 5,
|
|
ENTITY_NODE: 6,
|
|
PROCESSING_INSTRUCTION_NODE: 7,
|
|
COMMENT_NODE: 8,
|
|
DOCUMENT_NODE: 9,
|
|
DOCUMENT_TYPE_NODE: 10,
|
|
DOCUMENT_FRAGMENT_NODE: 11,
|
|
NOTATION_NODE: 12
|
|
};
|
|
|
|
|
|
if (typeof module == "object") {
|
|
module.exports.WikitextSerializer = WikitextSerializer;
|
|
}
|