mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-29 08:34:54 +00:00
1294 lines
38 KiB
JavaScript
1294 lines
38 KiB
JavaScript
/* Combined Wiki (MediaWiki) and HTML tokenizer. Produces a token stream
|
|
** (actually a list of tokens for now) suitable for a HTML5TreeBuilder. */
|
|
{
|
|
/* Fixme: use static functions to separate module! Unfortunately, this
|
|
* does not work:
|
|
* var tu = require('./mediawiki.tokenizer.utils.js');
|
|
* console.log(tu.flatten([]));
|
|
* Using exports in the module gets a bit further, but accesses to
|
|
* tu.flatten in productions still fail. Thus, I just moved the functions
|
|
* here until a solution is found:
|
|
*/
|
|
|
|
/* Static utilities */
|
|
|
|
// Flatten a list of lists.
|
|
var flatten = function ( e ) {
|
|
// Fast single-level flatten:
|
|
//return [].concat.apply([], e);
|
|
|
|
var es = [];
|
|
// flatten sub-arrays
|
|
for(var i = 0, length = e.length; i < length; i++) {
|
|
var ei = e[i];
|
|
if ($.isArray(ei))
|
|
es = es.concat(flatten(ei));
|
|
else
|
|
es.push(ei);
|
|
};
|
|
return es;
|
|
};
|
|
|
|
// Remove escaped quotes from attributes etc
|
|
// This was in the original PEG parser, but could not find anything in
|
|
// MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke
|
|
var unquote = function (quotec, text) {
|
|
return text.replace('\\' + quotec, quotec);
|
|
};
|
|
|
|
// Decode html entities. In a browser, this should only be fed the entity,
|
|
// not untrusted html! XXX: replace with safer version.
|
|
var unentity = function ( entity ) {
|
|
return $("<div/>").html(entity).text();
|
|
};
|
|
|
|
// Debug print with global switch
|
|
var dp = function ( msg ) {
|
|
if ( false ) {
|
|
console.log(msg);
|
|
}
|
|
};
|
|
|
|
var pp = function ( s ) { return JSON.stringify(s, null, 2); }
|
|
|
|
/*
|
|
* Annotate a token stream with list items with appropriate list tokens
|
|
*
|
|
* @static
|
|
* @method
|
|
* @param {[tokens]} Token stream with li tokens
|
|
* @returns {[tokens]} Token stream, possibly with additional list tokens
|
|
* */
|
|
var annotateList = function ( tokens ) {
|
|
var out = [], // List of tokens
|
|
bstack = [], // Bullet stack, previous element's listStyle
|
|
bnext = [], // Next element's listStyle
|
|
endtags = []; // Stack of end tags
|
|
|
|
var commonPrefixLength = function (x, y) {
|
|
var minLength = Math.min(x.length, y.length);
|
|
for(var i = 0; i < minLength; i++) {
|
|
if (x[i] != y[i])
|
|
break;
|
|
}
|
|
return i;
|
|
};
|
|
|
|
var pushList = function ( listName, itemName ) {
|
|
out.push({type: 'TAG', name: listName});
|
|
out.push({type: 'TAG', name: itemName});
|
|
endtags.push({type: 'ENDTAG', name: listName});
|
|
endtags.push({type: 'ENDTAG', name: itemName});
|
|
};
|
|
|
|
var popTags = function ( n ) {
|
|
for(;n > 0; n--) {
|
|
// push list item..
|
|
out.push(endtags.pop());
|
|
// and the list end tag
|
|
out.push(endtags.pop());
|
|
}
|
|
};
|
|
|
|
var isDlDd = function (a, b) {
|
|
var ab = [a,b].sort();
|
|
return (ab[0] === ':' && ab[1] === ';');
|
|
};
|
|
|
|
var doListItem = function ( bs, bn ) {
|
|
var prefixLen = commonPrefixLength (bs, bn);
|
|
var changeLen = Math.max(bs.length, bn.length) - prefixLen;
|
|
var prefix = bn.slice(0, prefixLen);
|
|
// emit close tag tokens for closed lists
|
|
if (changeLen === 0) {
|
|
var itemToken = endtags.pop();
|
|
out.push(itemToken);
|
|
out.push({type: 'TAG', name: itemToken.name});
|
|
endtags.push({type: 'ENDTAG', name: itemToken.name});
|
|
} else if ( bs.length == bn.length
|
|
&& changeLen == 1
|
|
&& isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
|
|
// handle dd/dt transitions
|
|
out.push(endtags.pop());
|
|
if( bn[prefixLen] == ';') {
|
|
var newName = 'dt';
|
|
} else {
|
|
var newName = 'dd';
|
|
}
|
|
out.push({type: 'TAG', name: newName});
|
|
endtags.push({type: 'ENDTAG', name: newName});
|
|
} else {
|
|
popTags(bs.length - prefixLen);
|
|
|
|
if (prefixLen > 0 && bn.length == prefixLen ) {
|
|
var itemToken = endtags.pop();
|
|
out.push(itemToken);
|
|
out.push({type: 'TAG', name: itemToken.name});
|
|
endtags.push({type: 'ENDTAG', name: itemToken.name});
|
|
}
|
|
|
|
for(var i = prefixLen; i < bn.length; i++) {
|
|
switch (bn[i]) {
|
|
case '*':
|
|
pushList('ul', 'li');
|
|
break;
|
|
case '#':
|
|
pushList('ol', 'li');
|
|
break;
|
|
case ';':
|
|
pushList('dl', 'dt');
|
|
break;
|
|
case ':':
|
|
pushList('dl', 'dd');
|
|
break;
|
|
default:
|
|
throw("Unknown node prefix " + prefix[i]);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
for (var i = 0, length = tokens.length; i < length; i++) {
|
|
var token = tokens[i];
|
|
switch ( token.type ) {
|
|
case 'TAG':
|
|
switch (token.name) {
|
|
case 'list':
|
|
// ignore token
|
|
break;
|
|
case 'listItem':
|
|
// convert listItem to list and list item tokens
|
|
bnext = token.bullets;
|
|
doListItem( bstack, bnext );
|
|
bstack = bnext;
|
|
break;
|
|
default:
|
|
// pass through all remaining start tags
|
|
out.push(token);
|
|
break;
|
|
}
|
|
break;
|
|
case 'ENDTAG':
|
|
if ( token.name == 'list' ) {
|
|
// pop all open list item tokens
|
|
popTags(bstack.length);
|
|
bstack = [];
|
|
} else {
|
|
out.push(token);
|
|
}
|
|
break;
|
|
default:
|
|
out.push(token);
|
|
break;
|
|
}
|
|
}
|
|
return out;
|
|
};
|
|
|
|
|
|
/* End static utilities */
|
|
|
|
/*
|
|
* Flags for specific parse environments (inside tables, links etc). Flags
|
|
* trigger syntactic stops in the inline_breaks production, which
|
|
* terminates inline and attribute matches. Flags merely reduce the number
|
|
* of productions needed: The grammar is still context-free as the
|
|
* productions can just be unrolled for all combinations of environments
|
|
* at the cost of a much larger grammar.
|
|
*/
|
|
var syntaxFlags = {};
|
|
var setFlag = function(flag) {
|
|
if (syntaxFlags[flag] !== undefined) {
|
|
syntaxFlags[flag]++;
|
|
} else {
|
|
syntaxFlags[flag] = 1;
|
|
}
|
|
return true;
|
|
};
|
|
var clearFlag = function(flag) {
|
|
syntaxFlags[flag]--;
|
|
return false;
|
|
};
|
|
|
|
// Start position of top-level block
|
|
// Could also provide positions for lower-level blocks using a stack.
|
|
var blockStart = 0;
|
|
|
|
// Start position of generic tag production
|
|
var tagStartPos = 0;
|
|
|
|
// cache the input length
|
|
var inputLength = input.length;
|
|
|
|
// pseudo-production that matches at end of input
|
|
var isEOF = function (pos) {
|
|
return pos === inputLength;
|
|
};
|
|
|
|
// text start position
|
|
var textStart = 0;
|
|
|
|
// hack to support numbered external links ([http://example.com]).
|
|
// XXX: Move to token stream transform after templates are expanded!
|
|
var linkCount = 1;
|
|
|
|
// Define block-level tags in JS, so we can use toLowerCase to match tags
|
|
// case-independently. This would be quite ugly (and possibly slower) if
|
|
// done manually in the grammar.
|
|
var block_names = (function () {
|
|
var names = [ "p", "table", "td", "tr", "ul", "ol"
|
|
, "li", "dl", "dt", "dd", "div", "center"
|
|
, "blockquote" ];
|
|
var bnames = {};
|
|
for(var i = 0, l = names.length; i < l; i++) {
|
|
bnames[names[i]] = true;
|
|
}
|
|
return bnames;
|
|
})();
|
|
|
|
var self = this;
|
|
|
|
|
|
}
|
|
|
|
start
|
|
= e:toplevelblock* newline* {
|
|
// end is passed inline as a token, as well as a separate event for now.
|
|
|
|
// this does not work yet.
|
|
//console.log('about to emit' + pp(self));
|
|
//self._tokenizer.emit('chunk', [ { type: 'END' } ] );
|
|
//self._tokenizer.emit('end');
|
|
// Append the end (for obvious reasons this should not
|
|
// be part of a stream, only when tokenizing complete
|
|
// texts)
|
|
//console.log( pp( flatten ( e ) ) );
|
|
return flatten(e);
|
|
}
|
|
|
|
|
|
/* All chars that cannot start syntactic structures in the middle of a line
|
|
* XXX: ] and other end delimiters should probably only be activated inside
|
|
* structures to avoid unnecessarily leaving the text production on plain
|
|
* content. */
|
|
|
|
text_char = [^'<~[{\n\r:\]}|!=]
|
|
|
|
text = t:text_char+ { return t.join(''); }
|
|
|
|
/* Explanation of chars
|
|
* ' quotes (italic/bold)
|
|
* < start of xmlish_tag
|
|
* ~ signatures/dates
|
|
* [ start of links
|
|
* { start of parser functions, transclusion and template args
|
|
* \n all sort of block-level markup at start of line
|
|
* \r ditto
|
|
* h http(s) urls
|
|
* n nntp(s) urls
|
|
* m mailto urls
|
|
*
|
|
* ! and | table cell delimiters, might be better to specialize those
|
|
* = headings - also specialize those!
|
|
*
|
|
* The following chars are also included for now, but only apply in some
|
|
* contexts and should probably be enabled only in those:
|
|
* : separate definition in ; term : definition
|
|
* ] end of link
|
|
* } end of parser func/transclusion/template arg
|
|
*/
|
|
|
|
urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
|
|
/ htmlentity
|
|
/ urllink
|
|
// Convert trailing space into
|
|
// XXX: This should be moved to a serializer
|
|
/ ' ' & ':' { return "\u00a0"; }
|
|
/ t:text_char )+
|
|
|
|
/*
|
|
'//', // for protocol-relative URLs, but not in text!
|
|
'ftp://',
|
|
'git://',
|
|
'gopher://',
|
|
'http://',
|
|
'https://',
|
|
'irc://',
|
|
'ircs://', // @bug 28503
|
|
'mailto:',
|
|
'mms://',
|
|
'news:',
|
|
'nntp://', // @bug 3808 RFC 1738
|
|
'svn://',
|
|
'telnet://', // Well if we're going to support the above.. -ævar
|
|
'worldwind://',
|
|
*/
|
|
|
|
// Old version
|
|
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
|
|
|
|
// Experimental tweaked version: avoid expensive single-char substrings
|
|
// This did not bring the expected performance boost, however.
|
|
//text = [A-Za-z0-9,._ -] {
|
|
// textStart = pos;
|
|
//
|
|
// var res = input.substr(textStart - 1, inputLength)
|
|
// .match(/[A-Za-z0-9,._ -]+/)[0];
|
|
// pos = pos + (res.length - 1);
|
|
// return res
|
|
// }
|
|
|
|
htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
|
|
return unentity("&" + c.join('') + ";")
|
|
}
|
|
|
|
space
|
|
= s:[ \t]+ { return s.join(''); }
|
|
|
|
optionalSpaceToken
|
|
= s:space* {
|
|
if ( s.length ) {
|
|
return [{type: 'TEXT', value: s.join('')}];
|
|
} else {
|
|
return [];
|
|
}
|
|
}
|
|
|
|
|
|
// Start of line
|
|
sol = nl:(newlineToken / & { return pos === 0; } { return [] })
|
|
// Eat multi-line comments, so that syntax after still matches as if it
|
|
// was actually preceded by a newline
|
|
cn:( c:comment n:newline? {
|
|
if ( n !== '' ) {
|
|
return [c, {type: 'TEXT', value: n}];
|
|
} else {
|
|
return [c];
|
|
}
|
|
}
|
|
)*
|
|
// Eat includeonly/noinclude at start of line, so that start-of-line
|
|
// syntax after it still matches
|
|
ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
|
|
{
|
|
var niToken = [];
|
|
if ( ni !== '') {
|
|
if ( ni[0] === '/' ) {
|
|
niToken = [{type: 'ENDTAG', name: ni[1]}];
|
|
} else {
|
|
niToken = [{type: 'TAG', name: ni[1]}];
|
|
}
|
|
}
|
|
|
|
return nl.concat(cn, niToken);
|
|
}
|
|
|
|
eof = & { return isEOF(pos); } { return true; }
|
|
|
|
|
|
newline
|
|
= '\n' / '\r\n'
|
|
|
|
newlineToken = newline { return [{type: 'NEWLINE'}] }
|
|
|
|
eolf = newline / eof
|
|
|
|
toplevelblock
|
|
= & { blockStart = pos; return true; } b:block {
|
|
b = flatten(b);
|
|
var bs = b[0];
|
|
//dp('toplevelblock:' + pp(b));
|
|
if (bs.attribs === undefined) {
|
|
bs.attribs = [];
|
|
}
|
|
bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
|
|
// XXX: only run this for lines that actually need it!
|
|
//b.push({type: 'NEWLINE'});
|
|
// Move this to a token stream transform!
|
|
//console.log('about to emit' + pp(self));
|
|
//console.log( pp( result._tokenizer ));
|
|
//console.log('emitted chunk' + pp(b));
|
|
//return [];
|
|
return b;
|
|
}
|
|
|
|
block
|
|
= block_lines
|
|
/ pre
|
|
/ comment &eolf
|
|
/ nowiki
|
|
/ pre
|
|
/ bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
|
|
/ para
|
|
/ inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
|
|
/ s:sol /*{
|
|
if (s) {
|
|
return [s, {type: 'NEWLINE'}];
|
|
} else {
|
|
return [{type: 'NEWLINE'}];
|
|
}
|
|
}*/
|
|
|
|
block_lines
|
|
= s:sol
|
|
// eat an empty line before the block
|
|
s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
|
|
bl:block_line {
|
|
var s2_ = (s2 !== '') ? s2 : [];
|
|
return s.concat(s2_, bl);
|
|
}
|
|
|
|
// Block structures with start-of-line wiki syntax
|
|
block_line
|
|
= h
|
|
/// table
|
|
/ & [{}|] tl:table_lines { return tl; }
|
|
/ lists
|
|
// tag-only lines should not trigger pre
|
|
/ st:optionalSpaceToken
|
|
bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
|
|
&eolf {
|
|
return st.concat(bt);
|
|
}
|
|
/ pre_indent
|
|
/ pre
|
|
|
|
|
|
para
|
|
= s1:sol s2:sol c:inlineline {
|
|
return s1.concat(s2, /* [{type: 'TAG', name: 'p'}],*/ c);
|
|
}
|
|
|
|
br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
|
|
|
|
// Syntax stops to limit inline expansion defending on syntactic context
|
|
inline_breaks
|
|
=
|
|
& { // Important hack: disable caching for this production, as the default
|
|
// cache key does not take into account flag states!
|
|
cacheKey = '';
|
|
return true;
|
|
}
|
|
& { return syntaxFlags['table']; }
|
|
( a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a) + pos); return true; }
|
|
/ & { return syntaxFlags['tableCellArg'] }
|
|
"|" { return true }
|
|
)
|
|
/ & { return (syntaxFlags['colon'] &&
|
|
! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
|
|
! syntaxFlags.linkdesc); } ":" { return true; }
|
|
/ & { return syntaxFlags['extlink']; } "]" { return true; }
|
|
/ & { return syntaxFlags['linkdesc']; } link_end { return true; }
|
|
/ & { return syntaxFlags['h']; } '='+ space* newline { return true; }
|
|
/ & { return syntaxFlags['template']; } ('|' / '}}') { return true; }
|
|
|
|
inline
|
|
= c:(urltext / (! inline_breaks (inline_element / . )))+ {
|
|
var out = [];
|
|
var text = [];
|
|
c = flatten(c);
|
|
for (var i = 0, l = c.length; i < l; i++) {
|
|
var ci = c[i];
|
|
if (typeof ci === 'string') {
|
|
if(ci !== '') {
|
|
text.push(ci);
|
|
}
|
|
} else {
|
|
if (text.length) {
|
|
out.push({ type: "TEXT", value: text.join('') });
|
|
text = [];
|
|
}
|
|
out.push(ci);
|
|
}
|
|
}
|
|
if (text.length) {
|
|
out.push({ type: 'TEXT', value: text.join('') });
|
|
}
|
|
//dp('inline out:' + pp(out));
|
|
return out;
|
|
}
|
|
|
|
|
|
inlineline
|
|
= c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
|
|
var out = [];
|
|
var text = [];
|
|
c = flatten(c);
|
|
for (var i = 0; i < c.length; i++) {
|
|
var ci = c[i]
|
|
if (typeof ci == 'string') {
|
|
if(ci !== '') {
|
|
text.push(ci);
|
|
}
|
|
} else {
|
|
if (text.length) {
|
|
out.push({type: 'TEXT', value: text.join('')});
|
|
text = [];
|
|
}
|
|
out.push(ci);
|
|
}
|
|
}
|
|
if (text.length) {
|
|
out.push({type: 'TEXT', value: text.join('')});
|
|
}
|
|
//dp('inlineline out:' + pp(out));
|
|
return out;
|
|
}
|
|
|
|
inline_element
|
|
= //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
|
|
& '<' ( comment / xmlish_tag )
|
|
/ & '{' ( & '{{{{{' template / tplarg / template )
|
|
// Eat three opening brackets as text.
|
|
/ '[[[' { return { type: 'TEXT', value: '[[[' } }
|
|
/ & '[' ( wikilink / extlink )
|
|
/ & "'" quote
|
|
|
|
/* Headings */
|
|
|
|
h = & "=" // guard, to make sure '='+ will match.
|
|
// XXX: Also check to end to avoid inline parsing?
|
|
r:(
|
|
s:'='+ // moved in here to make s accessible to inner action
|
|
& { return setFlag('h'); }
|
|
c:inlineline
|
|
e:'='+
|
|
spc:(sp:space+ { return {type: 'TEXT', value: sp.join('') } } / comment)*
|
|
&eolf
|
|
{
|
|
clearFlag('h');
|
|
var level = Math.min(s.length, e.length);
|
|
// convert surplus equals into text
|
|
if(s.length > level) {
|
|
var extras = s.substr(0, s.length - level);
|
|
if(c[0].type == 'TEXT') {
|
|
c[0].value = extras + c[0].value;
|
|
} else {
|
|
c.unshift({type: 'TEXT', value: extras});
|
|
}
|
|
}
|
|
if(e.length > level) {
|
|
var extras = e.substr(0, e.length - level),
|
|
lastElem = c[c.length - 1];
|
|
if(lastElem.type == 'TEXT') {
|
|
lastElem.value = lastElem.value + extras;
|
|
} else {
|
|
c.push({type: 'TEXT', value: extras});
|
|
}
|
|
}
|
|
|
|
return [{type: 'TAG', name: 'h' + level}]
|
|
.concat(c, [{type: 'ENDTAG', name: 'h' + level}, spc]);
|
|
}
|
|
/ & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
|
|
) { return r }
|
|
|
|
|
|
pre_indent
|
|
= l:pre_indent_line ls:(sol pre_indent_line)* {
|
|
return [{type: 'TAG', name: 'pre'}]
|
|
.concat( [l], ls
|
|
, [{type: 'ENDTAG', name: 'pre'}]);
|
|
}
|
|
pre_indent_line = space l:inlineline {
|
|
return [{type: 'TEXT', value: '\n'}].concat(l);
|
|
}
|
|
|
|
|
|
comment
|
|
= '<!--' c:comment_chars* ('-->' / eof)
|
|
cs:(space* newline space* cn:comment { return cn })* {
|
|
return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
|
|
}
|
|
|
|
comment_chars
|
|
= c:[^-] { return c; }
|
|
/ c:'-' !'->' { return c; }
|
|
|
|
|
|
urllink
|
|
= target:url {
|
|
return [ { type: 'TAG',
|
|
name: 'a',
|
|
attribs: [['href', target]] }
|
|
, {type: 'TEXT', value: target}
|
|
, {type: 'ENDTAG', name: 'a'}
|
|
];
|
|
}
|
|
|
|
extlink
|
|
= "["
|
|
& { return setFlag('extlink'); }
|
|
target:url
|
|
space*
|
|
text:inlineline?
|
|
"]" {
|
|
clearFlag('extlink');
|
|
if ( text == '' ) {
|
|
// XXX: Link numbering should be implemented in post-processor.
|
|
text = [{type: 'TEXT', value: "[" + linkCount + "]"}];
|
|
linkCount++;
|
|
}
|
|
return [
|
|
{
|
|
type: 'TAG',
|
|
name: 'a',
|
|
attribs: [
|
|
['href', target],
|
|
['data-type', 'external']
|
|
],
|
|
}
|
|
].concat( text
|
|
, [{type: 'ENDTAG', name: 'a'}]);
|
|
}
|
|
/ "[" & { clearFlag('extlink'); return false; }
|
|
|
|
/* Defaul URL protocols in MediaWiki (see DefaultSettings). Normally these can
|
|
* be configured dynamically. */
|
|
url_protocol
|
|
= '//' // for protocol-relative URLs
|
|
/ 'ftp://'
|
|
/ 'git://'
|
|
/ 'gopher://'
|
|
/ 'http://'
|
|
/ 'https://'
|
|
/ 'irc://'
|
|
/ 'ircs://' // @bug 28503
|
|
/ 'mailto:'
|
|
/ 'mms://'
|
|
/ 'news:'
|
|
/ 'nntp://' // @bug 3808 RFC 1738
|
|
/ 'svn://'
|
|
/ 'telnet://' // Well if we're going to support the above.. -ævar
|
|
/ 'worldwind://'
|
|
|
|
// javascript does not support unicode features..
|
|
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
|
|
|
|
|
|
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
|
|
return decodeURI("%" + c0 + c1)
|
|
}
|
|
|
|
//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
|
|
|
|
no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
|
|
|
|
url
|
|
= proto:url_protocol
|
|
addr:( ipv6_address / ipv4_address )?
|
|
rest:( ( !inline_breaks
|
|
c:no_punctuation_char
|
|
{ return c }
|
|
)
|
|
/ s:[.:,] !(space / eolf) { return s }
|
|
/ htmlentity
|
|
/ urlencoded_char
|
|
/ [&%] )+
|
|
{
|
|
return proto + addr + rest.join('');
|
|
}
|
|
|
|
ipv4_address
|
|
= a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*)
|
|
{
|
|
return flatten( a ).join('');
|
|
}
|
|
|
|
ipv6_address
|
|
= a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']')
|
|
{
|
|
return flatten( a ).join('');
|
|
}
|
|
|
|
template
|
|
= "{{" target:template_param_text
|
|
params:(newline? "|" newline? p:template_param { return p })*
|
|
"}}" {
|
|
target = flatten( target );
|
|
var obj = {
|
|
type: 'SELFCLOSINGTAG',
|
|
name: 'template',
|
|
attribs: [['data-target', JSON.stringify(target)]],
|
|
orderedArgs: params,
|
|
args: {},
|
|
target: target
|
|
};
|
|
if (params && params.length) {
|
|
var position = 1;
|
|
for ( var i = 0, l = params.length; i < l; i++ ) {
|
|
var param = params[i];
|
|
if ( param[0] === null ) {
|
|
obj.args[position] = param[1];
|
|
position++;
|
|
} else {
|
|
// Last value wins for duplicates.
|
|
obj.args[param[0]] = param[1];
|
|
}
|
|
}
|
|
// HACK: temporarily also push the args into an attribute
|
|
// (just for debugging)
|
|
obj.attribs.push(['data-json-args', JSON.stringify(obj.args)]);
|
|
}
|
|
// Should actually use a self-closing tag here, but the Node HTML5
|
|
// parser only recognizes known self-closing tags for now, so use an
|
|
// explicit end tag for now.
|
|
//console.log(pp(obj));
|
|
return obj;
|
|
}
|
|
|
|
// XXX: support template and args in target!
|
|
template_target
|
|
= h:( !"}}" x:([^|\n]) { return x } )* { return { type: 'TEXT', value: h.join('') } }
|
|
|
|
template_param
|
|
= name:template_param_name space* "=" space* c:template_param_text {
|
|
return [[{ type: 'TEXT', value: name }], flatten( c )];
|
|
} / c:template_param_text {
|
|
return [[], flatten( c ) ];
|
|
}
|
|
|
|
tplarg
|
|
= "{{{" name:template_param_text params:("|" p:template_param { return p })* "}}}" {
|
|
name = flatten( name );
|
|
var obj = {
|
|
type: 'SELFCLOSINGTAG',
|
|
name: 'templatearg',
|
|
attribs: [],
|
|
argname: name,
|
|
defaultvalue: []
|
|
};
|
|
if (params && params.length) {
|
|
// HACK, not final.
|
|
obj.attribs.push(['data-defaultvalue', params[0][1]]);
|
|
obj.attribs.push(['data-json-args', JSON.stringify(params)]);
|
|
obj.defaultvalue = params[0][1];
|
|
}
|
|
//console.log( 'tokenizer templatearg ' + JSON.stringify( obj ));
|
|
return obj;
|
|
}
|
|
|
|
// FIXME: handle template args and templates in key! (or even parser functions?)
|
|
template_param_name
|
|
= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
|
|
|
|
template_param_text
|
|
= & { return setFlag('template') }
|
|
il:inline+ {
|
|
clearFlag('template');
|
|
return il;
|
|
}
|
|
/ & { clearFlag('template'); return false; }
|
|
|
|
// TODO: handle link prefixes as in al[[Razi]]
|
|
wikilink
|
|
= "[["
|
|
! url
|
|
target:link_target
|
|
lcontent:( "|" lt:link_text { return lt } )*
|
|
"]]"
|
|
// XXX In real MediaWiki, this is a language-dependent positive character
|
|
// class. Can we work out a static negative class instead?
|
|
// XXX: Exclude uppercase chars from non-latin languages too!
|
|
trail:(! [A-Z \t(),.:-] tc:text_char { return tc })* {
|
|
var obj = {
|
|
type: 'TAG',
|
|
name: 'a',
|
|
attribs: [
|
|
['data-type', 'internal']
|
|
]
|
|
},
|
|
textTokens = [];
|
|
obj.attribs.push(['href', target]);
|
|
if (lcontent && lcontent.length) {
|
|
textTokens = lcontent;
|
|
if (trail) {
|
|
textTokens.push( { type: 'TEXT', value: trail.join('') } );
|
|
}
|
|
} else {
|
|
if (trail) {
|
|
target += trail.join('');
|
|
}
|
|
textTokens = [{type: 'TEXT', value: target}];
|
|
}
|
|
return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
|
|
}
|
|
|
|
link_target
|
|
= h:( c:[^|%\n\]]+ { return c.join('') } // quickly eat anything unsuspicious
|
|
/ !"]]"
|
|
hi:(
|
|
[^|%\n]
|
|
/ urlencoded_char
|
|
/ '%'
|
|
) { return hi }
|
|
)* { return h.join(''); }
|
|
|
|
link_text
|
|
= & { return setFlag('linkdesc'); }
|
|
h:inlineline
|
|
{
|
|
clearFlag('linkdesc');
|
|
return h;
|
|
}
|
|
/ & { clearFlag('linkdesc'); return false }
|
|
|
|
link_end = "]]"
|
|
|
|
/* Generic quote production for italic and bold, further processed in a token
|
|
* stream transformation in doQuotes. Relies on NEWLINE tokens being emitted
|
|
* for each line of text to balance quotes per line.
|
|
*
|
|
* We are not using a simple pair rule here as we need to support mis-nested
|
|
* bolds/italics and MediaWiki's special heuristics for apostrophes, which are
|
|
* all not context free. */
|
|
quote = "''" x:"'"* {
|
|
return {
|
|
type: 'TAG',
|
|
name : 'mw-quote', // Will be consumed in token transforms
|
|
value: "''" + x.join('')
|
|
}
|
|
}
|
|
|
|
/* XXX: Extension tags can require a change in the tokenizer mode, which
|
|
* returns any text between extension tags verbatim. For now, we simply
|
|
* continue to parse the contained text and return the tokens. The original
|
|
* input source can be recovered from the source positions added on tag
|
|
* tokens. This won't however work in all cases. For example, a comment start
|
|
* (<!--) between extension tags would cause the remaining text to be consumed
|
|
* as a comment. To avoid this, we might need to look ahead for the end tag
|
|
* and limit the content parsing to this section. */
|
|
|
|
xmlish_tag = nowiki / generic_tag
|
|
|
|
pre
|
|
= "<pre"
|
|
attribs:generic_attribute*
|
|
">"
|
|
ts:(t1:[^<]+ { return {type:'TEXT',value:t1.join('')} }
|
|
/ nowiki
|
|
/ !"</pre>" t2:. {return {type:'TEXT',value:t2}})+
|
|
("</pre>" / eof) {
|
|
// return nowiki tags as well?
|
|
//console.log('inpre');
|
|
return [ {type: 'TAG', name: 'pre', attribs: attribs} ]
|
|
.concat(ts, [{type: 'ENDTAG', name: 'pre'}]);
|
|
}
|
|
/ "</pre>" { return {type: 'TEXT', value: "</pre>"}; }
|
|
|
|
nowiki
|
|
= "<nowiki>" nc:nowiki_content "</nowiki>" {
|
|
// console.log(pp(nc));
|
|
return nc;
|
|
}
|
|
/ "<nowiki>" {
|
|
//console.log('nowiki fallback');
|
|
return [{type: 'TEXT', value: '<nowiki>'}];
|
|
}
|
|
/ "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; }
|
|
|
|
nowiki_content
|
|
= ts:( t:[^<]+ { return t.join('') }
|
|
/ "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
|
|
//console.log('nested pre in nowiki');
|
|
return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');
|
|
}
|
|
/ (!("</nowiki>" / "</pre>") c:. {return c})
|
|
)* {
|
|
// return nowiki tags as well?
|
|
return [{type: 'TEXT', value: ts.join('')}];
|
|
}
|
|
|
|
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
|
|
// following paragraphs
|
|
block_tag
|
|
= "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
|
|
attribs:generic_attribute*
|
|
selfclose:"/"?
|
|
">" {
|
|
if (block_names[name.toLowerCase()] !== true) {
|
|
// abort match if tag is not block-level
|
|
return null;
|
|
}
|
|
var res = {name: name, attribs: attribs};
|
|
if ( end != '' ) {
|
|
res.type = 'ENDTAG';
|
|
} else if ( selfclose != '' ) {
|
|
res.type = 'SELFCLOSINGTAG';
|
|
} else {
|
|
res.type = 'TAG';
|
|
}
|
|
return [res];
|
|
}
|
|
|
|
/* Generic XML-like tags
|
|
*
|
|
* These also cover extensions (including Cite), which will hook into the
|
|
* token stream for further processing. The content of extension tags is
|
|
* parsed as regular inline, but the source positions of the tag are added
|
|
* to allow reconstructing the unparsed text from the input. */
|
|
|
|
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
|
|
// following paragraphs
|
|
generic_tag
|
|
= "<"
|
|
& { tagStartPos = pos; return true; } // remember the start position of this tag
|
|
end:"/"? name:[0-9a-zA-Z]+
|
|
attribs:generic_attribute*
|
|
space*
|
|
selfclose:"/"?
|
|
">" {
|
|
var res = {name: name.join(''), attribs: attribs};
|
|
if ( end != '' ) {
|
|
res.type = 'ENDTAG';
|
|
} else if ( selfclose != '' ) {
|
|
res.type = 'SELFCLOSINGTAG';
|
|
} else {
|
|
res.type = 'TAG';
|
|
}
|
|
res.attribs.push(['data-sourceTagPos', (tagStartPos - 1) + ":" + pos]);
|
|
return res;
|
|
}
|
|
|
|
generic_attribute
|
|
= s:space*
|
|
name:generic_attribute_name
|
|
value:(space*
|
|
v:generic_attribute_value { return v })?
|
|
{
|
|
if ( value !== '' ) {
|
|
return [name, value];
|
|
} else {
|
|
return [name,''];
|
|
}
|
|
}
|
|
|
|
// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
|
|
// disallow newlines, | and {.
|
|
generic_attribute_name
|
|
= n:[^ \t\0/"'>=\n|{]+ {
|
|
return n.join('');
|
|
}
|
|
|
|
generic_attribute_value
|
|
= "=" space* v:att_value {
|
|
return v;
|
|
}
|
|
|
|
// XXX: attributes can contain templates and template args!!
|
|
att_value
|
|
= t:(!inline_breaks c:[^ \t'"<>='\n] { return c } )+ {
|
|
return t.join('');
|
|
}
|
|
// XXX: is "\"" also valid html? or just Wikitext?
|
|
/ "'" t:[^'>]* "'" { return unquote("'", t.join('')); }
|
|
/ '"' t:[^">]* '"' { return unquote('"', t.join('')); }
|
|
|
|
|
|
/* Lists */
|
|
lists = e:(dtdd / li) es:(sol (dtdd / li))*
|
|
{
|
|
return annotateList( [ { type: 'TAG', name: 'list'} ]
|
|
.concat(flatten([e].concat(es))
|
|
,[{ type: 'ENDTAG', name: 'list' }]));
|
|
}
|
|
|
|
li = bullets:list_char+
|
|
c:inlineline?
|
|
&eolf
|
|
{
|
|
if ( c == '' )
|
|
c = [];
|
|
return [ { type: 'TAG',
|
|
name: 'listItem',
|
|
bullets: bullets }
|
|
, c ];
|
|
}
|
|
|
|
dtdd
|
|
= bullets:(!(";" !list_char) list_char)*
|
|
";"
|
|
& {return setFlag('colon');}
|
|
c:inlineline
|
|
":"
|
|
// Fortunately dtdds cannot be nested, so we can simply set the flag
|
|
// back to 0 to disable it.
|
|
& {syntaxFlags['colon'] = 0; return true;}
|
|
d:inlineline
|
|
&eolf {
|
|
// Convert trailing space into
|
|
// XXX: This should be moved to a serializer
|
|
//var clen = c.length;
|
|
//if (clen && c[clen - 1].type === 'TEXT') {
|
|
// var val = c[clen - 1].value;
|
|
// if(val.length && val[val.length - 1] == ' ') {
|
|
// c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
|
|
// }
|
|
//}
|
|
|
|
return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ]
|
|
.concat( c
|
|
,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ]
|
|
, d );
|
|
}
|
|
// Fall-back case to clear the colon flag
|
|
/ & { return true; } { syntaxFlags['colon'] = 0; return null; }
|
|
|
|
|
|
list_char = [*#:;]
|
|
|
|
/**
|
|
* Tables
|
|
*
|
|
* Table productions are geared to support independent parsing of fragments in
|
|
* templates (the common table start / row / table end use case). The tokens
|
|
* produced by these fragments then match up to a table while building the
|
|
* DOM tree. For similar reasons, table rows do not emit explicit end tag
|
|
* tokens.
|
|
*
|
|
* The separate table_lines production is faster than moving those productions
|
|
* directly to block_lines.
|
|
* */
|
|
|
|
table_lines
|
|
= & { return setFlag('table'); }
|
|
tl:table_line
|
|
tls:( s:sol tl2:table_line { return s.concat(tl2); } )* {
|
|
clearFlag('table');
|
|
//console.log('table_lines: ' + pp(tl.concat(tls)));
|
|
return tl.concat( tls );
|
|
}
|
|
/ & { return clearFlag('table'); }
|
|
|
|
// This production assumes start-of-line position!
|
|
table_line
|
|
= table_start_tag
|
|
/ table_row_tag
|
|
/ table_data_tags
|
|
/ table_heading_tags
|
|
/ table_caption_tag
|
|
/ table_end_tag
|
|
|
|
table_start_tag
|
|
= "{|"
|
|
ta:generic_attribute*
|
|
space*
|
|
te:table_end_tag? // can occur somewhere in the middle of the line too
|
|
{
|
|
var tok = [{type: 'TAG', name: 'table'}];
|
|
if ( ta )
|
|
tok[0].attribs = ta;
|
|
if ( te )
|
|
tok = tok.concat(te);
|
|
return tok;
|
|
}
|
|
|
|
table_caption_tag
|
|
= "|+"
|
|
c:inline* {
|
|
return [{type: 'TAG', name: 'caption'}]
|
|
.concat( c, [{type: 'ENDTAG', name: 'caption'}]);
|
|
}
|
|
|
|
|
|
table_row_tag
|
|
= //& { console.log("table row enter"); return true; }
|
|
"|-"
|
|
a:generic_attribute*
|
|
space*
|
|
// handle tables with missing table cells after a row
|
|
td:( s:sol ![|!] tdt:table_data_tag { return s.concat(tdt); } )?
|
|
{
|
|
// We rely on our tree builder to close the row as needed. This is
|
|
// needed to support building tables from fragment templates with
|
|
// individual cells or rows.
|
|
var trToken = [{type: 'TAG', name: 'tr', attribs: a}];
|
|
if ( !td ) {
|
|
return trToken;
|
|
} else {
|
|
return trToken.concat(td);
|
|
}
|
|
}
|
|
|
|
table_data_tags
|
|
= "|"
|
|
td:table_data_tag
|
|
tds:( "||" tdt:table_data_tag { return tdt } )* {
|
|
return td.concat(tds);
|
|
}
|
|
|
|
table_data_tag
|
|
= //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
|
|
! [}+-]
|
|
a:table_cell_args?
|
|
//& { console.log("past attrib, pos=" + pos + input.substr(pos,10)); return true; }
|
|
// use inline_breaks to break on tr etc
|
|
td:( !inline_breaks
|
|
//& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
|
|
b:block { return b } )*
|
|
{
|
|
if ( a == '' ) {
|
|
a = [];
|
|
}
|
|
//dp("table data result: " + pp(td) + ", attribts: " + pp(a));
|
|
return [{ type: 'TAG', name: 'td', attribs: a}]
|
|
.concat( td, [{type: 'ENDTAG', name: 'td'}] );
|
|
}
|
|
|
|
table_heading_tags
|
|
= "!"
|
|
th:table_heading_tag
|
|
ths:( "!!" tht:table_heading_tag { return tht } )* {
|
|
return th.concat(ths);
|
|
}
|
|
|
|
table_heading_tag
|
|
= a:table_cell_args?
|
|
c:inline {
|
|
if ( a == '' ) {
|
|
a = [];
|
|
}
|
|
return [{type: 'TAG', name: 'th', attribs: a}]
|
|
.concat( c, [{type: 'ENDTAG', name: 'th'}] );
|
|
}
|
|
|
|
table_end_tag
|
|
= "|}" {
|
|
var tok = [{type: 'ENDTAG', name: 'table'}];
|
|
return tok;
|
|
}
|
|
|
|
table_cell_args
|
|
= & { return setFlag('tableCellArg'); }
|
|
as:generic_attribute+ space* "|" !"|" {
|
|
clearFlag('tableCellArg');
|
|
return as;
|
|
}
|
|
/ & { return clearFlag('tableCellArg'); }
|
|
|
|
|
|
|
|
/* Tables, full-table version (no support for table fragments from templates)
|
|
* Only left here for comparison purposes with the above productions. Remove
|
|
* when those work as intended! */
|
|
table
|
|
= tas:table_start space* c:table_caption? b:table_body? te:table_end {
|
|
var res = {type: 'TAG', name: 'table'}
|
|
var body = b !== '' ? b : [];
|
|
dp("body: " + pp(body));
|
|
if (tas.length > 0) {
|
|
// FIXME: actually parse and build structure
|
|
//res.attribs = [['data-unparsed', tas.join('')]];
|
|
res.attribs = tas;
|
|
}
|
|
|
|
if (c != '') {
|
|
var caption = [{type: 'TAG', name: 'caption'}]
|
|
.concat(c, [{type: 'ENDTAG', name: 'caption'}], te);
|
|
} else {
|
|
var caption = [];
|
|
}
|
|
//dp(pp(res));
|
|
|
|
return [res].concat(caption, body,
|
|
[{type: 'ENDTAG', name: 'table'}]);
|
|
}
|
|
|
|
table_start
|
|
= "{|"
|
|
res:(
|
|
& { setFlag('table'); return true; }
|
|
ta:generic_attribute*
|
|
{
|
|
dp("table_start " + pp(ta) + ", pos:" + pos);
|
|
return ta;
|
|
}
|
|
/ & { clearFlag('table'); return false; } { return null; }
|
|
) { return res }
|
|
|
|
table_caption
|
|
= n:newlineToken
|
|
"|+" c:inline* {
|
|
return n.concat(c);
|
|
}
|
|
|
|
table_body
|
|
= //& { dp("table_body enter"); return true; }
|
|
firstrow:table_firstrow otherrows:table_row* {
|
|
/* dp('table first and otherrows: '
|
|
* + pp([firstrow].concat(otherrows))); */
|
|
return [firstrow].concat(otherrows);
|
|
}
|
|
/ otherrows:table_row* {
|
|
//dp('table otherrows: ' + pp(otherrows));
|
|
return otherrows;
|
|
}
|
|
|
|
table_firstrow
|
|
= td:(table_data / table_heading)+ {
|
|
//dp('firstrow: ' + pp(td));
|
|
return [{ type: 'TAG', name: 'tr' }]
|
|
.concat(td, [{type: 'ENDTAG', name: 'tr'}]);
|
|
}
|
|
|
|
table_row
|
|
= //& { dp("table row enter"); return true; }
|
|
n:newlineToken
|
|
"|-"
|
|
a:(as:generic_attribute+ space* "|" !"|" { return as } )?
|
|
space*
|
|
td:(table_data / table_heading)* {
|
|
return n.concat([{type: 'TAG', name: 'tr'}]
|
|
, td, [{type: 'ENDTAG', name: 'tr'}]);
|
|
}
|
|
|
|
table_data
|
|
= //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
|
|
n:("||" { return [] } / nt:newlineToken ("|" / !'!') { return nt })
|
|
! [}+-]
|
|
//& { dp('before attrib, pos=' + pos); return true; }
|
|
a:(as:generic_attribute+ space* "|" !"|" { return as } )?
|
|
//& { dp('past attrib, pos=' + pos); return true; }
|
|
// use inline_breaks to break on tr etc
|
|
td:(!inline_breaks
|
|
//& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
|
|
b:block { return b })* {
|
|
if ( a == '' ) {
|
|
a = [];
|
|
}
|
|
//dp("table data result: " + pp(td) + ", attribts: " + pp(a));
|
|
return n.concat( [{ type: 'TAG', name: 'td', attribs: a}]
|
|
, td, [{type: 'ENDTAG', name: 'td'}] );
|
|
}
|
|
|
|
table_heading
|
|
= n:("!!" { return [] } / nl:newlineToken "!" { return nl })
|
|
a:(as:generic_attribute+ "|" !"|" { return as } )?
|
|
c:inline {
|
|
if ( a == '' ) {
|
|
a = [];
|
|
}
|
|
return n.concat( [{type: 'TAG', name: 'th', attribs: a}]
|
|
, c, [{type: 'ENDTAG', name: 'th'}]);
|
|
}
|
|
|
|
thtd_attribs
|
|
// In particular, do not match [|\n]
|
|
= a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ "|" ! "|" {
|
|
return a;
|
|
}
|
|
|
|
|
|
table_end
|
|
= nt:newlineToken? ( "|}" / eof ) {
|
|
clearFlag('table');
|
|
if(nt)
|
|
return nt;
|
|
else
|
|
return [];
|
|
}
|
|
|
|
|
|
/* Tabs do not mix well with the hybrid production syntax */
|
|
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent : */
|