mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-12-01 17:36:35 +00:00
a8fa9433c4
token stream. This is the first token transformation exercising the TokenTransformer class as its dispatcher. Template expansions, wiki link formatting, tag sanitation and extensions should be able to use the same dispatcher by registering for specific token types. The parser performance is very slightly improved as the token stream is only traversed once.
1068 lines
31 KiB
JavaScript
1068 lines
31 KiB
JavaScript
/* Combined Wiki (MediaWiki) and HTML tokenizer. Produces a token stream
|
|
** (actually a list of tokens for now) suitable for a HTML5TreeBuilder. */
|
|
{
|
|
/* Fixme: use static functions to separate module! Unfortunately, this
|
|
* does not work:
|
|
* var tu = require('./mediawiki.tokenizer.utils.js');
|
|
* console.log(tu.flatten([]));
|
|
* Using exports in the module gets a bit further, but accesses to
|
|
* tu.flatten in productions still fail. Thus, I just moved the functions
|
|
* here until a solution is found:
|
|
*/
|
|
|
|
/* Static utilities */
|
|
|
|
// Flatten a list of lists.
|
|
var flatten = function ( e ) {
|
|
// Fast single-level flatten:
|
|
//return [].concat.apply([], e);
|
|
|
|
var es = [];
|
|
// flatten sub-arrays
|
|
for(var i = 0, length = e.length; i < length; i++) {
|
|
var ei = e[i];
|
|
if ($.isArray(ei))
|
|
es = es.concat(flatten(ei));
|
|
else
|
|
es.push(ei);
|
|
};
|
|
return es;
|
|
};
|
|
|
|
// Remove escaped quotes from attributes etc
|
|
var unquote = function (quotec, text) {
|
|
return text.replace('\\' + quotec, quotec);
|
|
};
|
|
|
|
// Decode html entities. In a browser, this should only be fed the entity,
|
|
// not untrusted html! XXX: replace with safer version.
|
|
var unentity = function ( entity ) {
|
|
return $("<div/>").html(entity).text();
|
|
};
|
|
|
|
// Debug print with global switch
|
|
var dp = function ( msg ) {
|
|
if ( false ) {
|
|
console.log(msg);
|
|
}
|
|
};
|
|
|
|
var pp = function ( s ) { return JSON.stringify(s, null, 2); }
|
|
|
|
/*
|
|
* Annotate a token stream with list items with appropriate list tokens
|
|
*
|
|
* @static
|
|
* @method
|
|
* @param {[tokens]} Token stream with li tokens
|
|
* @returns {[tokens]} Token stream, possibly with additional list tokens
|
|
* */
|
|
var annotateList = function ( tokens ) {
|
|
var out = [], // List of tokens
|
|
bstack = [], // Bullet stack, previous element's listStyle
|
|
bnext = [], // Next element's listStyle
|
|
endtags = []; // Stack of end tags
|
|
|
|
var commonPrefixLength = function (x, y) {
|
|
var minLength = Math.min(x.length, y.length);
|
|
for(var i = 0; i < minLength; i++) {
|
|
if (x[i] != y[i])
|
|
break;
|
|
}
|
|
return i;
|
|
};
|
|
|
|
var pushList = function ( listName, itemName ) {
|
|
out.push({type: 'TAG', name: listName});
|
|
out.push({type: 'TAG', name: itemName});
|
|
endtags.push({type: 'ENDTAG', name: listName});
|
|
endtags.push({type: 'ENDTAG', name: itemName});
|
|
};
|
|
|
|
var popTags = function ( n ) {
|
|
for(;n > 0; n--) {
|
|
// push list item..
|
|
out.push(endtags.pop());
|
|
// and the list end tag
|
|
out.push(endtags.pop());
|
|
}
|
|
};
|
|
|
|
var isDlDd = function (a, b) {
|
|
var ab = [a,b].sort();
|
|
return (ab[0] === ':' && ab[1] === ';');
|
|
};
|
|
|
|
var doListItem = function ( bs, bn ) {
|
|
var prefixLen = commonPrefixLength (bs, bn);
|
|
var changeLen = Math.max(bs.length, bn.length) - prefixLen;
|
|
var prefix = bn.slice(0, prefixLen);
|
|
// emit close tag tokens for closed lists
|
|
if (changeLen === 0) {
|
|
var itemToken = endtags.pop();
|
|
out.push(itemToken);
|
|
out.push({type: 'TAG', name: itemToken.name});
|
|
endtags.push({type: 'ENDTAG', name: itemToken.name});
|
|
} else if ( bs.length == bn.length
|
|
&& changeLen == 1
|
|
&& isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
|
|
// handle dd/dt transitions
|
|
out.push(endtags.pop());
|
|
if( bn[prefixLen] == ';') {
|
|
var newName = 'dt';
|
|
} else {
|
|
var newName = 'dd';
|
|
}
|
|
out.push({type: 'TAG', name: newName});
|
|
endtags.push({type: 'ENDTAG', name: newName});
|
|
} else {
|
|
popTags(bs.length - prefixLen);
|
|
|
|
if (prefixLen > 0 && bn.length == prefixLen ) {
|
|
var itemToken = endtags.pop();
|
|
out.push(itemToken);
|
|
out.push({type: 'TAG', name: itemToken.name});
|
|
endtags.push({type: 'ENDTAG', name: itemToken.name});
|
|
}
|
|
|
|
for(var i = prefixLen; i < bn.length; i++) {
|
|
switch (bn[i]) {
|
|
case '*':
|
|
pushList('ul', 'li');
|
|
break;
|
|
case '#':
|
|
pushList('ol', 'li');
|
|
break;
|
|
case ';':
|
|
pushList('dl', 'dt');
|
|
break;
|
|
case ':':
|
|
pushList('dl', 'dd');
|
|
break;
|
|
default:
|
|
throw("Unknown node prefix " + prefix[i]);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
for (var i = 0, length = tokens.length; i < length; i++) {
|
|
var token = tokens[i];
|
|
switch ( token.type ) {
|
|
case 'TAG':
|
|
switch (token.name) {
|
|
case 'list':
|
|
// ignore token
|
|
break;
|
|
case 'listItem':
|
|
// convert listItem to list and list item tokens
|
|
bnext = token.bullets;
|
|
doListItem( bstack, bnext );
|
|
bstack = bnext;
|
|
break;
|
|
default:
|
|
// pass through all remaining start tags
|
|
out.push(token);
|
|
break;
|
|
}
|
|
break;
|
|
case 'ENDTAG':
|
|
if ( token.name == 'list' ) {
|
|
// pop all open list item tokens
|
|
popTags(bstack.length);
|
|
bstack = [];
|
|
} else {
|
|
out.push(token);
|
|
}
|
|
break;
|
|
default:
|
|
out.push(token);
|
|
break;
|
|
}
|
|
}
|
|
return out;
|
|
};
|
|
|
|
|
|
/* End static utilities */
|
|
|
|
/*
|
|
* Flags for specific parse environments (inside tables, links etc). Flags
|
|
* trigger syntactic stops in the inline_breaks production, which
|
|
* terminates inline and attribute matches. Flags merely reduce the number
|
|
* of productions needed: The grammar is still context-free as the
|
|
* productions can just be unrolled for all combinations of environments
|
|
* at the cost of a much larger grammar.
|
|
*/
|
|
var syntaxFlags = {};
|
|
var setFlag = function(flag) {
|
|
if (syntaxFlags[flag] !== undefined) {
|
|
syntaxFlags[flag]++;
|
|
} else {
|
|
syntaxFlags[flag] = 1;
|
|
}
|
|
return true;
|
|
};
|
|
var clearFlag = function(flag) {
|
|
syntaxFlags[flag]--;
|
|
};
|
|
|
|
// Start position of top-level block
|
|
// Could also provide positions for lower-level blocks using a stack.
|
|
var blockStart = 0;
|
|
|
|
// Start position of generic tag production
|
|
var tagStartPos = 0;
|
|
|
|
// cache the input length
|
|
var inputLength = input.length;
|
|
|
|
// pseudo-production that matches at end of input
|
|
var isEOF = function (pos) {
|
|
return pos === inputLength;
|
|
};
|
|
|
|
// text start position
|
|
var textStart = 0;
|
|
|
|
// hack to support numbered external links ([http://example.com]).
|
|
// XXX: Move to token stream transform after templates are expanded!
|
|
var linkCount = 1;
|
|
|
|
// Define block-level tags in JS, so we can use toLowerCase to match tags
|
|
// case-independently. This would be quite ugly (and possibly slower) if
|
|
// done manually in the grammar.
|
|
var block_names = (function () {
|
|
var names = [ "p", "table", "td", "tr", "ul", "ol"
|
|
, "li", "dl", "dt", "dd", "div", "center"
|
|
, "blockquote" ];
|
|
var bnames = {};
|
|
for(var i = 0, l = names.length; i < l; i++) {
|
|
bnames[names[i]] = true;
|
|
}
|
|
return bnames;
|
|
})();
|
|
|
|
|
|
}
|
|
|
|
start
|
|
= e:toplevelblock* newline* {
|
|
return flatten(e);
|
|
}
|
|
|
|
|
|
/* All chars that cannot start syntactic structures in the middle of a line
|
|
* XXX: ] and other end delimiters should probably only be activated inside
|
|
* structures to avoid unnecessarily leaving the text production on plain
|
|
* content. */
|
|
|
|
text_char = [^'<~[{\n\r:\]}|!=]
|
|
|
|
text = t:text_char+ { return t.join(''); }
|
|
|
|
/* Explanation of chars
|
|
* ' quotes (italic/bold)
|
|
* < start of xmlish_tag
|
|
* ~ signatures/dates
|
|
* [ start of links
|
|
* { start of parser functions, transclusion and template args
|
|
* \n all sort of block-level markup at start of line
|
|
* \r ditto
|
|
* h http(s) urls
|
|
* n nntp(s) urls
|
|
* m mailto urls
|
|
*
|
|
* ! and | table cell delimiters, might be better to specialize those
|
|
* = headings - also specialize those!
|
|
*
|
|
* The following chars are also included for now, but only apply in some
|
|
* contexts and should probably be enabled only in those:
|
|
* : separate definition in ; term : definition
|
|
* ] end of link
|
|
* } end of parser func/transclusion/template arg
|
|
*/
|
|
|
|
urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
|
|
/ htmlentity
|
|
/ urllink
|
|
// Convert trailing space into
|
|
// XXX: This should be moved to a serializer
|
|
/ ' ' & ':' { return "\u00a0"; }
|
|
/ t:text_char )+
|
|
|
|
/*
|
|
'//', // for protocol-relative URLs, but not in text!
|
|
'ftp://',
|
|
'git://',
|
|
'gopher://',
|
|
'http://',
|
|
'https://',
|
|
'irc://',
|
|
'ircs://', // @bug 28503
|
|
'mailto:',
|
|
'mms://',
|
|
'news:',
|
|
'nntp://', // @bug 3808 RFC 1738
|
|
'svn://',
|
|
'telnet://', // Well if we're going to support the above.. -ævar
|
|
'worldwind://',
|
|
*/
|
|
|
|
// Old version
|
|
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
|
|
|
|
// Experimental tweaked version: avoid expensive single-char substrings
|
|
// This did not bring the expected performance boost, however.
|
|
//text = [A-Za-z0-9,._ -] {
|
|
// textStart = pos;
|
|
//
|
|
// var res = input.substr(textStart - 1, inputLength)
|
|
// .match(/[A-Za-z0-9,._ -]+/)[0];
|
|
// pos = pos + (res.length - 1);
|
|
// return res
|
|
// }
|
|
|
|
htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
|
|
return unentity("&" + c.join('') + ";")
|
|
}
|
|
|
|
space
|
|
= s:[ \t]+ { return s.join(''); }
|
|
|
|
optionalSpaceToken
|
|
= s:space* {
|
|
if ( s.length ) {
|
|
return [{type: 'TEXT', value: s.join('')}];
|
|
} else {
|
|
return [];
|
|
}
|
|
}
|
|
|
|
|
|
// Start of line
|
|
sol = (newline / & { return pos === 0; } { return true; })
|
|
cn:(c:comment n:newline? {
|
|
if ( n !== '' ) {
|
|
return [c, {type: 'TEXT', value: n}];
|
|
} else {
|
|
return [c];
|
|
}
|
|
}
|
|
)* {
|
|
return [{type: 'NEWLINE'}].concat(cn);
|
|
}
|
|
|
|
eof = & { return isEOF(pos); } { return true; }
|
|
|
|
|
|
newline
|
|
= '\n' / '\r\n'
|
|
|
|
newlineToken = newline { return [{type: 'NEWLINE'}] }
|
|
|
|
eolf = newline / eof
|
|
|
|
toplevelblock
|
|
= & { blockStart = pos; return true; } b:block {
|
|
b = flatten(b);
|
|
var bs = b[0];
|
|
//dp('toplevelblock:' + pp(b));
|
|
if (bs.attribs === undefined) {
|
|
bs.attribs = [];
|
|
}
|
|
bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
|
|
// XXX: only run this for lines that actually need it!
|
|
//b.push({type: 'NEWLINE'});
|
|
// Move this to a token stream transform!
|
|
return b;
|
|
}
|
|
|
|
block
|
|
= block_lines
|
|
/ pre
|
|
/ comment &eolf
|
|
/ nowiki
|
|
/ pre
|
|
/ bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
|
|
/ para
|
|
/ inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
|
|
/ s:sol {
|
|
if (s) {
|
|
return [s, {type: 'NEWLINE'}];
|
|
} else {
|
|
return [{type: 'NEWLINE'}];
|
|
}
|
|
}
|
|
|
|
block_lines
|
|
= s:sol
|
|
// eat an empty line before the block
|
|
s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
|
|
bl:block_line {
|
|
var s2_ = (s2 !== '') ? s2 : [];
|
|
return s.concat(s2_, bl);
|
|
}
|
|
|
|
// Block structures with start-of-line wiki syntax
|
|
block_line
|
|
= h
|
|
/ table
|
|
/ lists
|
|
// tag-only lines should not trigger pre
|
|
/ st:optionalSpaceToken
|
|
bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
|
|
&eolf {
|
|
return st.concat(bt);
|
|
}
|
|
/ pre_indent
|
|
/ pre
|
|
|
|
|
|
|
|
|
|
// TODO: convert inline content to annotations!
|
|
para
|
|
= s1:sol s2:sol c:inlineline {
|
|
return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);
|
|
}
|
|
|
|
br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
|
|
|
|
// Syntax stops to limit inline expansion defending on syntactic context
|
|
inline_breaks
|
|
=
|
|
& { // Important hack: disable caching for this production, as the default
|
|
// cache key does not take into account flag states!
|
|
cacheKey = '';
|
|
return true;
|
|
}
|
|
& { return syntaxFlags['table']; }
|
|
a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a) + pos); return true; }
|
|
/ & { return (syntaxFlags['colon'] &&
|
|
! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
|
|
! syntaxFlags.linkdesc); } ":" { return true; }
|
|
/ & { return syntaxFlags['extlink']; } "]" { return true; }
|
|
/ & { return syntaxFlags['linkdesc']; } link_end { return true; }
|
|
/ & { return syntaxFlags['h']; } '='+ space* newline { return true; }
|
|
/ & { return syntaxFlags['template']; } ('|' / '}}') { return true; }
|
|
|
|
inline
|
|
= c:(urltext / (! inline_breaks (inline_element / . )))+ {
|
|
var out = [];
|
|
var text = [];
|
|
c = flatten(c);
|
|
for (var i = 0, l = c.length; i < l; i++) {
|
|
var ci = c[i];
|
|
if (typeof ci == 'string') {
|
|
if(ci !== '') {
|
|
text.push(ci);
|
|
}
|
|
} else {
|
|
if (text.length) {
|
|
out.push({ type: "TEXT", value: text.join('') });
|
|
text = [];
|
|
}
|
|
out.push(ci);
|
|
}
|
|
}
|
|
if (text.length) {
|
|
out.push({ type: 'TEXT', value: text.join('') });
|
|
}
|
|
//dp('inline out:' + pp(out));
|
|
return out;
|
|
}
|
|
|
|
|
|
inlineline
|
|
= c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
|
|
var out = [];
|
|
var text = [];
|
|
c = flatten(c);
|
|
for (var i = 0; i < c.length; i++) {
|
|
var ci = c[i]
|
|
if (typeof ci == 'string') {
|
|
if(ci !== '') {
|
|
text.push(ci);
|
|
}
|
|
} else {
|
|
if (text.length) {
|
|
out.push({type: 'TEXT', value: text.join('')});
|
|
text = [];
|
|
}
|
|
out.push(ci);
|
|
}
|
|
}
|
|
if (text.length) {
|
|
out.push({type: 'TEXT', value: text.join('')});
|
|
}
|
|
//dp('inlineline out:' + pp(out));
|
|
return out;
|
|
}
|
|
|
|
inline_element
|
|
= //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
|
|
& '<' ( comment / xmlish_tag )
|
|
/ & '{' ( template / tplarg )
|
|
/ & '[' ( wikilink / extlink )
|
|
/ & "'" quote
|
|
|
|
/* Headings */
|
|
|
|
h = & "=" // guard, to make sure '='+ will match.
|
|
// XXX: Also check to end to avoid inline parsing?
|
|
r:(
|
|
s:'='+ // moved in here to make s accessible to inner action
|
|
& { return setFlag('h'); }
|
|
c:inlineline
|
|
e:'='+
|
|
spc:(sp:space+ { return {type: 'TEXT', value: sp.join('') } } / comment)*
|
|
&eolf
|
|
{
|
|
clearFlag('h');
|
|
var level = Math.min(s.length, e.length);
|
|
// convert surplus equals into text
|
|
if(s.length > level) {
|
|
var extras = s.substr(0, s.length - level);
|
|
if(c[0].type == 'TEXT') {
|
|
c[0].value = extras + c[0].value;
|
|
} else {
|
|
c.unshift({type: 'TEXT', value: extras});
|
|
}
|
|
}
|
|
if(e.length > level) {
|
|
var extras = e.substr(0, e.length - level),
|
|
lastElem = c[c.length - 1];
|
|
if(lastElem.type == 'TEXT') {
|
|
lastElem.value = lastElem.value + extras;
|
|
} else {
|
|
c.push({type: 'TEXT', value: extras});
|
|
}
|
|
}
|
|
|
|
return [{type: 'TAG', name: 'h' + level}]
|
|
.concat(c, [{type: 'ENDTAG', name: 'h' + level}, spc]);
|
|
}
|
|
/ & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
|
|
) { return r }
|
|
|
|
|
|
pre_indent
|
|
= l:pre_indent_line ls:(sol pre_indent_line)* {
|
|
return [{type: 'TAG', name: 'pre'}]
|
|
.concat( [l], ls
|
|
, [{type: 'ENDTAG', name: 'pre'}]);
|
|
}
|
|
pre_indent_line = space l:inlineline {
|
|
return [{type: 'TEXT', value: '\n'}].concat(l);
|
|
}
|
|
|
|
|
|
comment
|
|
= '<!--' c:comment_chars* ('-->' / eof)
|
|
cs:(space* newline space* cn:comment { return cn })* {
|
|
return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
|
|
}
|
|
|
|
comment_chars
|
|
= c:[^-] { return c; }
|
|
/ c:'-' !'->' { return c; }
|
|
|
|
|
|
urllink
|
|
= target:url {
|
|
return [ { type: 'TAG',
|
|
name: 'a',
|
|
attribs: [['href', target]] }
|
|
, {type: 'TEXT', value: target}
|
|
, {type: 'ENDTAG', name: 'a'}
|
|
];
|
|
}
|
|
|
|
extlink
|
|
= "["
|
|
& { return setFlag('extlink'); }
|
|
target:url
|
|
space*
|
|
text:inlineline?
|
|
"]" {
|
|
clearFlag('extlink');
|
|
if ( text == '' ) {
|
|
// XXX: Link numbering should be implemented in post-processor.
|
|
text = [{type: 'TEXT', value: "[" + linkCount + "]"}];
|
|
linkCount++;
|
|
}
|
|
return [ { type: 'TAG',
|
|
name: 'a',
|
|
attribs: [['href', target]] } ]
|
|
.concat( text
|
|
, [{type: 'ENDTAG', name: 'a'}]);
|
|
}
|
|
/ "[" & { clearFlag('extlink'); return false; }
|
|
|
|
/* Defaul URL protocols in MediaWiki (see DefaultSettings). Normally these can
|
|
* be configured dynamically. */
|
|
url_protocol
|
|
= '//' // for protocol-relative URLs
|
|
/ 'ftp://'
|
|
/ 'git://'
|
|
/ 'gopher://'
|
|
/ 'http://'
|
|
/ 'https://'
|
|
/ 'irc://'
|
|
/ 'ircs://' // @bug 28503
|
|
/ 'mailto:'
|
|
/ 'mms://'
|
|
/ 'news:'
|
|
/ 'nntp://' // @bug 3808 RFC 1738
|
|
/ 'svn://'
|
|
/ 'telnet://' // Well if we're going to support the above.. -ævar
|
|
/ 'worldwind://'
|
|
|
|
// javascript does not support unicode features..
|
|
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
|
|
|
|
|
|
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
|
|
return decodeURI("%" + c0 + c1)
|
|
}
|
|
|
|
//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
|
|
url
|
|
= proto:url_protocol
|
|
rest:( [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
|
|
/ s:[.:,] !(space / eolf) { return s }
|
|
/ htmlentity
|
|
/ urlencoded_char
|
|
/ [&%] )+
|
|
{
|
|
return proto + rest.join('');
|
|
}
|
|
|
|
template
|
|
= "{{" target:template_target
|
|
params:(newline? "|" newline? p:template_param { return p })*
|
|
"}}" {
|
|
var obj = { type: 'TAG', name: 'template',
|
|
attribs: [['target', target]],
|
|
args: {}}
|
|
if (params && params.length) {
|
|
var position = 1;
|
|
for ( var i = 0, l = params.length; i < l; i++ ) {
|
|
var param = params[i];
|
|
if ( param[0] === null ) {
|
|
obj.args[position] = param[1];
|
|
position++;
|
|
} else {
|
|
obj.args[param[0]] = param[1];
|
|
}
|
|
}
|
|
// HACK: temporarily also push the args into an attribute
|
|
// (just for debugging)
|
|
obj.attribs.push(['data-args', JSON.stringify(obj.args)]);
|
|
}
|
|
// Should actually use a self-closing tag here, but the Node HTML5
|
|
// parser only recognizes known self-closing tags for now, so use an
|
|
// explicit end tag for now.
|
|
//console.log(pp(obj));
|
|
return obj;
|
|
}
|
|
|
|
template_target
|
|
= h:( !"}}" x:([^|\n]) { return x } )* { return h.join(''); }
|
|
|
|
template_param
|
|
= name:template_param_name space* "=" space* c:template_param_text {
|
|
return [name, c];
|
|
} / c:template_param_text {
|
|
return [null, c];
|
|
}
|
|
|
|
tplarg
|
|
= "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
|
|
var obj = {
|
|
type: 'SELFCLOSINGTAG',
|
|
name: 'templatearg',
|
|
attribs: [['argname', name]]
|
|
};
|
|
if (params && params.length) {
|
|
// HACK, not final.
|
|
obj.attribs.push(['data-args', JSON.stringify(params)]);
|
|
}
|
|
return obj;
|
|
}
|
|
|
|
template_param_name
|
|
= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
|
|
|
|
template_param_text
|
|
= & { return setFlag('template') }
|
|
il:inline+ {
|
|
clearFlag('template');
|
|
return il;
|
|
}
|
|
/ & { clearFlag('template'); return false; }
|
|
|
|
wikilink
|
|
= "[["
|
|
! url
|
|
target:link_target text:("|" lt:link_text { return lt })* "]]" suffix:text? {
|
|
var obj = {
|
|
type: 'TAG',
|
|
name: 'a',
|
|
attribs: [['data-type', 'internal']]
|
|
};
|
|
obj.attribs.push(['href', target]);
|
|
if (text && text.length) {
|
|
var textTokens = text;
|
|
} else {
|
|
if (suffix !== '') {
|
|
target += suffix;
|
|
}
|
|
var textTokens = [{type: 'TEXT', value: target}];
|
|
}
|
|
return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
|
|
}
|
|
|
|
link_target
|
|
= h:( c:[^|%\n\]]+ { return c.join('') } // quickly eat anything unsuspicious
|
|
/ !"]]"
|
|
hi:(
|
|
[^|%\n]
|
|
/ urlencoded_char
|
|
/ '%'
|
|
) { return hi }
|
|
)* { return h.join(''); }
|
|
|
|
link_text
|
|
= h:( & { return setFlag('linkdesc'); }
|
|
x:inlineline { return x }
|
|
)* {
|
|
clearFlag('linkdesc')
|
|
return h;
|
|
}
|
|
/ & { clearFlag('linkdesc') } { return null; }
|
|
|
|
link_end = "]]"
|
|
|
|
/* Generic quote production for italic and bold, further processed in a token
|
|
* stream transformation in doQuotes. Relies on NEWLINE tokens being emitted
|
|
* for each line of text to balance quotes per line.
|
|
*
|
|
* We are not using a simple pair rule here as we need to support mis-nested
|
|
* bolds/italics and MediaWiki's special heuristics for apostrophes, which are
|
|
* all not context free. */
|
|
quote = "''" x:"'"* {
|
|
return {
|
|
type: 'TAG',
|
|
name : 'QUOTE',
|
|
value: "''" + x.join('')
|
|
}
|
|
}
|
|
|
|
/* XXX: Extension tags can require a change in the tokenizer mode, which
|
|
* returns any text between extension tags verbatim. For now, we simply
|
|
* continue to parse the contained text and return the tokens. The original
|
|
* input source can be recovered from the source positions added on tag
|
|
* tokens. This won't however work in all cases. For example, a comment start
|
|
* (<!--) between extension tags would cause the remaining text to be consumed
|
|
* as a comment. To avoid this, we might need to look ahead for the end tag
|
|
* and limit the content parsing to this section. */
|
|
|
|
xmlish_tag = nowiki / generic_tag
|
|
|
|
pre
|
|
= "<pre"
|
|
attribs:generic_attribute*
|
|
">"
|
|
ts:(t1:[^<]+ { return {type:'TEXT',value:t1.join('')} }
|
|
/ nowiki
|
|
/ !"</pre>" t2:. {return {type:'TEXT',value:t2}})+
|
|
("</pre>" / eof) {
|
|
// return nowiki tags as well?
|
|
//console.log('inpre');
|
|
return [ {type: 'TAG', name: 'pre', attribs: attribs} ]
|
|
.concat(ts, [{type: 'ENDTAG', name: 'pre'}]);
|
|
}
|
|
/ "</pre>" { return {type: 'TEXT', value: "</pre>"}; }
|
|
|
|
nowiki
|
|
= "<nowiki>" nc:nowiki_content "</nowiki>" {
|
|
// console.log(pp(nc));
|
|
return nc;
|
|
}
|
|
/ "<nowiki>" {
|
|
//console.log('nowiki fallback');
|
|
return [{type: 'TEXT', value: '<nowiki>'}];
|
|
}
|
|
/ "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; }
|
|
|
|
nowiki_content
|
|
= ts:( t:[^<]+ { return t.join('') }
|
|
/ "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
|
|
//console.log('nested pre in nowiki');
|
|
return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');
|
|
}
|
|
/ (!("</nowiki>" / "</pre>") c:. {return c})
|
|
)* {
|
|
// return nowiki tags as well?
|
|
return [{type: 'TEXT', value: ts.join('')}];
|
|
}
|
|
|
|
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
|
|
// following paragraphs
|
|
block_tag
|
|
= "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
|
|
attribs:generic_attribute*
|
|
selfclose:"/"?
|
|
">" {
|
|
if (block_names[name.toLowerCase()] !== true) {
|
|
// abort match if tag is not block-level
|
|
return null;
|
|
}
|
|
var res = {name: name, attribs: attribs};
|
|
if ( end != '' ) {
|
|
res.type = 'ENDTAG';
|
|
} else if ( selfclose != '' ) {
|
|
res.type = 'SELFCLOSINGTAG';
|
|
} else {
|
|
res.type = 'TAG';
|
|
}
|
|
return [res];
|
|
}
|
|
|
|
/* Generic XML-like tags
|
|
*
|
|
* These also cover extensions (including Cite), which will hook into the
|
|
* token stream for further processing. The content of extension tags is
|
|
* parsed as regular inline, but the source positions of the tag are added
|
|
* to allow reconstructing the unparsed text from the input. */
|
|
|
|
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
|
|
// following paragraphs
|
|
generic_tag
|
|
= "<"
|
|
& { tagStartPos = pos; return true; } // remember the start position of this tag
|
|
end:"/"? name:[0-9a-zA-Z]+
|
|
attribs:generic_attribute*
|
|
space*
|
|
selfclose:"/"?
|
|
">" {
|
|
var res = {name: name.join(''), attribs: attribs};
|
|
if ( end != '' ) {
|
|
res.type = 'ENDTAG';
|
|
} else if ( selfclose != '' ) {
|
|
res.type = 'SELFCLOSINGTAG';
|
|
} else {
|
|
res.type = 'TAG';
|
|
}
|
|
res.attribs.push(['data-sourceTagPos', (tagStartPos - 1) + ":" + pos]);
|
|
return res;
|
|
}
|
|
|
|
generic_attribute
|
|
= s:space*
|
|
name:generic_attribute_name
|
|
value:(space*
|
|
v:generic_attribute_value { return v })?
|
|
{
|
|
if ( value !== '' ) {
|
|
return [name, value];
|
|
} else {
|
|
return [name,''];
|
|
}
|
|
}
|
|
|
|
// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
|
|
// disallow newlines, | and {.
|
|
generic_attribute_name
|
|
= n:[^ \t\0/"'>=\n|{]+ {
|
|
return n.join('');
|
|
}
|
|
|
|
generic_attribute_value
|
|
= "=" space* v:att_value {return v}
|
|
|
|
// XXX: attributes can contain templates and template args!!
|
|
att_value
|
|
= t:[^ \t'"<>='\n]+ { return t.join(''); }
|
|
// XXX: is "\"" also valid html? or just Wikitext?
|
|
/ "'" t:[^'>]* "'" { return unquote("'", t.join('')); }
|
|
/ '"' t:[^">]* '"' { return unquote('"', t.join('')); }
|
|
|
|
|
|
/* Lists */
|
|
lists = e:(dtdd / li) es:(sol (dtdd / li))*
|
|
{
|
|
return annotateList( [ { type: 'TAG', name: 'list'} ]
|
|
.concat(flatten([e].concat(es))
|
|
,[{ type: 'ENDTAG', name: 'list' }]));
|
|
}
|
|
|
|
li = bullets:list_char+
|
|
c:inlineline?
|
|
&eolf
|
|
{
|
|
if ( c == '' )
|
|
c = [];
|
|
return [ { type: 'TAG',
|
|
name: 'listItem',
|
|
bullets: bullets }
|
|
, c ];
|
|
}
|
|
|
|
dtdd
|
|
= bullets:(!(";" !list_char) list_char)*
|
|
";"
|
|
& {return setFlag('colon');}
|
|
c:inlineline
|
|
":"
|
|
// Fortunately dtdds cannot be nested, so we can simply set the flag
|
|
// back to 0 to disable it.
|
|
& {syntaxFlags['colon'] = 0; return true;}
|
|
d:inlineline
|
|
&eolf {
|
|
// Convert trailing space into
|
|
// XXX: This should be moved to a serializer
|
|
//var clen = c.length;
|
|
//if (clen && c[clen - 1].type === 'TEXT') {
|
|
// var val = c[clen - 1].value;
|
|
// if(val.length && val[val.length - 1] == ' ') {
|
|
// c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
|
|
// }
|
|
//}
|
|
|
|
return [ { type: 'TAG', name: 'listItem', bullets: bullets + ";" } ]
|
|
.concat( c
|
|
,[{ type: 'TAG', name: 'listItem', bullets: bullets + ":" } ]
|
|
, d );
|
|
}
|
|
// Fall-back case to clear the colon flag
|
|
/ & { return true; } { syntaxFlags['colon'] = 0; return null; }
|
|
|
|
|
|
list_char = [*#:;]
|
|
|
|
|
|
/* Tables */
|
|
table
|
|
= tas:table_start space* c:table_caption? b:table_body? te:table_end {
|
|
var res = {type: 'TAG', name: 'table'}
|
|
var body = b !== '' ? b : [];
|
|
dp("body: " + pp(body));
|
|
if (tas.length > 0) {
|
|
// FIXME: actually parse and build structure
|
|
//res.attribs = [['data-unparsed', tas.join('')]];
|
|
res.attribs = tas;
|
|
}
|
|
|
|
if (c != '') {
|
|
var caption = [{type: 'TAG', name: 'caption'}]
|
|
.concat(c, [{type: 'ENDTAG', name: 'caption'}], te);
|
|
} else {
|
|
var caption = [];
|
|
}
|
|
//dp(pp(res));
|
|
|
|
return [res].concat(caption, body,
|
|
[{type: 'ENDTAG', name: 'table'}]);
|
|
}
|
|
|
|
table_start
|
|
= "{|"
|
|
res:(
|
|
& { setFlag('table'); return true; }
|
|
ta:generic_attribute*
|
|
{
|
|
dp("table_start " + pp(ta) + ", pos:" + pos);
|
|
return ta;
|
|
}
|
|
/ & { clearFlag('table'); return false; } { return null; }
|
|
) { return res }
|
|
|
|
table_attribs
|
|
= text / ! inline_breaks !newline ![|] c:. { return c }
|
|
|
|
table_caption
|
|
= n:newlineToken
|
|
"|+" c:inline* {
|
|
return n.concat(c);
|
|
}
|
|
|
|
table_body
|
|
= //& { dp("table_body enter"); return true; }
|
|
firstrow:table_firstrow otherrows:table_row* {
|
|
/* dp('table first and otherrows: '
|
|
* + pp([firstrow].concat(otherrows))); */
|
|
return [firstrow].concat(otherrows);
|
|
}
|
|
/ otherrows:table_row* {
|
|
//dp('table otherrows: ' + pp(otherrows));
|
|
return otherrows;
|
|
}
|
|
|
|
table_firstrow
|
|
= td:table_data+ {
|
|
//dp('firstrow: ' + pp(td));
|
|
return [{ type: 'TAG', name: 'tr' }]
|
|
.concat(td, [{type: 'ENDTAG', name: 'tr'}]);
|
|
}
|
|
|
|
table_row
|
|
= //& { dp("table row enter"); return true; }
|
|
n:newlineToken
|
|
"|-" thtd_attribs? space* td:(table_data / table_header)* {
|
|
return n.concat([{type: 'TAG', name: 'tr'}]
|
|
, td, [{type: 'ENDTAG', name: 'tr'}]);
|
|
}
|
|
|
|
table_data
|
|
= //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
|
|
n:("||" { return [] } / nt:newlineToken "|" { return nt })
|
|
! [}+-]
|
|
//& { dp('before attrib, pos=' + pos); return true; }
|
|
a:(as:generic_attribute+ space* "|" !"|" { return as } )?
|
|
//& { dp('past attrib, pos=' + pos); return true; }
|
|
// use inline_breaks to break on tr etc
|
|
td:(!inline_breaks
|
|
//& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
|
|
b:block { return b })* {
|
|
if ( a == '' ) {
|
|
a = [];
|
|
}
|
|
//dp("table data result: " + pp(td) + ", attribts: " + pp(a));
|
|
return n.concat( [{ type: 'TAG', name: 'td', attribs: a}]
|
|
, td, [{type: 'ENDTAG', name: 'td'}] );
|
|
}
|
|
|
|
table_header
|
|
= n:("!!" { return [] } / nl:newlineToken "!" { return nl })
|
|
a:(as:generic_attribute+ "!" !"!" { return as } )?
|
|
c:inline {
|
|
if ( a == '' ) {
|
|
a = [];
|
|
}
|
|
return n.concat( [{type: 'TAG', name: 'th', attribs: a}]
|
|
, c, [{type: 'ENDTAG', name: 'th'}]);
|
|
}
|
|
|
|
thtd_attribs
|
|
// In particular, do not match [|\n]
|
|
= a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ "|" ! "|" {
|
|
return a;
|
|
}
|
|
|
|
|
|
table_end
|
|
= nt:newlineToken? ( "|}" / eof ) {
|
|
clearFlag('table');
|
|
if(nt)
|
|
return nt;
|
|
else
|
|
return [];
|
|
}
|
|
|
|
|
|
/* Tabs do not mix well with the hybrid production syntax */
|
|
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent : */
|