mediawiki-extensions-Visual.../modules/parser/pegTokenizer.pegjs.txt
2012-02-09 22:27:45 +00:00

1424 lines
41 KiB
JavaScript

/* Combined Wiki (MediaWiki) and HTML tokenizer. Produces a token stream
** (actually a list of tokens for now) suitable for a HTML5TreeBuilder. */
{
/* Fixme: use static functions to separate module! Unfortunately, this
* does not work:
* var tu = require('./mediawiki.tokenizer.utils.js');
* console.log(tu.flatten([]));
* Using exports in the module gets a bit further, but accesses to
* tu.flatten in productions still fail. Thus, I just moved the functions
* here until a solution is found:
*/
/* Static utilities */
/**
 * Recursively flatten arbitrarily nested arrays into one flat array.
 *
 * Members that are not arrays (strings, tokens, ...) are copied over
 * unchanged and in their original order. The input is not modified.
 *
 * @param {Array} e Possibly nested list
 * @returns {Array} New list that contains no nested arrays
 */
var flatten = function ( e ) {
    var es = [];
    // Append into one accumulator instead of building a fresh array with
    // concat() for every nested list.
    var collect = function ( list ) {
        for (var i = 0, length = list.length; i < length; i++) {
            var ei = list[i];
            // Array.isArray is built in; no need to go through jQuery here
            if (Array.isArray(ei)) {
                collect(ei);
            } else {
                es.push(ei);
            }
        }
    };
    collect(e);
    return es;
};
/*
 * Flatten a nested list and merge every run of adjacent strings into one
 * string. Empty strings are dropped; members that are not strings are kept
 * in place and separate the runs.
 *
 * @param {Array} c Possibly nested list of strings and tokens
 * @returns {String|Array} The bare string if the result consists of exactly
 * one string, otherwise the list of merged strings and tokens
 */
var flatten_string = function ( c ) {
    var merged = [],
        pending = [],
        items = flatten(c);
    // Emit the strings collected so far as a single chunk
    var flushPending = function () {
        if (pending.length > 0) {
            merged.push( pending.join('') );
            pending = [];
        }
    };
    items.forEach(function ( item ) {
        if (item.constructor !== String) {
            flushPending();
            merged.push(item);
        } else if (item !== '') {
            pending.push(item);
        }
    });
    flushPending();
    var isSingleString = merged.length === 1 && merged[0].constructor === String;
    return isSingleString ? merged[0] : merged;
};
// Remove escaped quotes from attributes etc
// This was in the original PEG parser, but could not find anything in
// MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke
/**
 * Replace every backslash-escaped occurrence of quotec in text with the bare
 * quote character.
 *
 * @param {String} quotec Quote character, ' or "
 * @param {String} text Attribute text
 * @returns {String} text without the escaping backslashes
 */
var unquote = function (quotec, text) {
    // String.replace with a string pattern would only replace the first
    // match; split/join handles all of them without building a regexp.
    return text.split('\\' + quotec).join(quotec);
};
// Decode an HTML entity by letting jQuery parse it into a detached <div>
// and reading the text back. In a browser this must only ever be given the
// entity itself, never untrusted html! XXX: replace with safer version.
var unentity = function ( entity ) {
    var scratch = $("<div/>");
    scratch.html(entity);
    return scratch.text();
};
// Debug print. Logging is compiled out by the hard-coded switch below; flip
// it to true to get the messages on the console.
var dp = function ( msg ) {
    var debugEnabled = false;
    if ( !debugEnabled ) {
        return;
    }
    console.log(msg);
};
// Pretty-print a value as JSON with two-space indentation (debugging aid)
var pp = function ( s ) {
    var indent = 2;
    return JSON.stringify(s, null, indent);
};
/*
 * Annotate a token stream with list items with appropriate list tokens
 *
 * 'list' start tags are dropped, every 'listItem' start tag is replaced by
 * the start/end tags needed to get from the previous item's bullets to this
 * item's bullets, and the 'list' end tag closes everything that is still
 * open. All other tokens are passed through unchanged.
 *
 * @static
 * @method
 * @param {[tokens]} Token stream with li tokens
 * @returns {[tokens]} Token stream, possibly with additional list tokens
 * @throws {Error} If a list item carries a bullet other than * # ; :
 * */
var annotateList = function ( tokens ) {
    var out = [],      // List of tokens
        bstack = [],   // Bullet stack, previous element's listStyle
        bnext = [],    // Next element's listStyle
        endtags = [];  // Stack of end tags

    // Number of leading bullets that x and y have in common
    var commonPrefixLength = function (x, y) {
        var minLength = Math.min(x.length, y.length),
            i = 0;
        while (i < minLength && x[i] === y[i]) {
            i++;
        }
        return i;
    };

    // Open a new list together with its first item
    var pushList = function ( listName, itemName ) {
        out.push( new TagTk( listName ));
        out.push( new TagTk( itemName ));
        endtags.push( new EndTagTk( listName ));
        endtags.push( new EndTagTk( itemName ));
    };

    // Close the n innermost lists
    var popTags = function ( n ) {
        for (; n > 0; n--) {
            // push list item..
            out.push(endtags.pop());
            // and the list end tag
            out.push(endtags.pop());
        }
    };

    // True if one bullet is ':' and the other one is ';'
    var isDlDd = function (a, b) {
        return (a === ':' && b === ';') || (a === ';' && b === ':');
    };

    // Close the innermost open item and open a sibling item in the same
    // list. The sibling gets the given name, or the name of the closed item.
    var replaceItem = function ( newName ) {
        var itemToken = endtags.pop();
        out.push(itemToken);
        var name = newName || itemToken.name;
        out.push(new TagTk( name ));
        endtags.push(new EndTagTk( name ));
    };

    // Emit the tokens for moving from bullets bs to bullets bn
    var doListItem = function ( bs, bn ) {
        var prefixLen = commonPrefixLength (bs, bn);
        var changeLen = Math.max(bs.length, bn.length) - prefixLen;

        if (changeLen === 0) {
            // identical bullets: next item of the same list
            replaceItem();
        } else if ( bs.length === bn.length
                && changeLen === 1
                && isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
            // handle dd/dt transitions
            replaceItem( bn[prefixLen] === ';' ? 'dt' : 'dd' );
        } else {
            // emit close tag tokens for closed lists
            popTags(bs.length - prefixLen);
            if (prefixLen > 0 && bn.length === prefixLen ) {
                // back on an outer level: continue with a new item there
                replaceItem();
            }
            for (var i = prefixLen; i < bn.length; i++) {
                switch (bn[i]) {
                    case '*':
                        pushList('ul', 'li');
                        break;
                    case '#':
                        pushList('ol', 'li');
                        break;
                    case ';':
                        pushList('dl', 'dt');
                        break;
                    case ':':
                        pushList('dl', 'dd');
                        break;
                    default:
                        // Report the offending bullet itself; the common
                        // prefix never contains index i.
                        throw new Error("Unknown node prefix " + bn[i]);
                }
            }
        }
    };

    for (var i = 0, length = tokens.length; i < length; i++) {
        var token = tokens[i];
        switch ( token.constructor ) {
            case TagTk:
                switch (token.name) {
                    case 'list':
                        // ignore token
                        break;
                    case 'listItem':
                        // convert listItem to list and list item tokens
                        bnext = token.bullets;
                        doListItem( bstack, bnext );
                        bstack = bnext;
                        break;
                    default:
                        // pass through all remaining start tags
                        out.push(token);
                        break;
                }
                break;
            case EndTagTk:
                if ( token.name === 'list' ) {
                    // pop all open list item tokens
                    popTags(bstack.length);
                    bstack = [];
                } else {
                    out.push(token);
                }
                break;
            default:
                out.push(token);
                break;
        }
    }
    return out;
};
/* End static utilities */
/*
 * Flags for specific parse environments (inside tables, links etc). Flags
 * trigger syntactic stops in the inline_breaks production, which
 * terminates inline and attribute matches. Flags merely reduce the number
 * of productions needed: The grammar is still context-free as the
 * productions can just be unrolled for all combinations of environments
 * at the cost of a much larger grammar.
 *
 * Each flag is a nesting counter: a flag is active while its count is
 * greater than zero.
 */
var syntaxFlags = {};

/**
 * Enter the parse environment named by flag.
 *
 * @param {String} flag Flag name
 * @returns {Boolean} Always true, so the call can be used as the result of
 * a semantic predicate that must succeed
 */
var setFlag = function(flag) {
    // `|| 0` covers flags that were never set (undefined)
    syntaxFlags[flag] = (syntaxFlags[flag] || 0) + 1;
    return true;
};

/**
 * Leave the parse environment named by flag.
 *
 * @param {String} flag Flag name
 * @returns {Boolean} Always false, so the call can be used as the result of
 * a semantic predicate that must fail
 */
var clearFlag = function(flag) {
    // Decrementing an unset flag would store NaN, and an extra clear would
    // leave a negative (truthy) count that reads as "flag set". Clamp at 0.
    syntaxFlags[flag] = Math.max((syntaxFlags[flag] || 0) - 1, 0);
    return false;
};
// Input offset at which the current top-level block starts.
// (Positions of lower-level blocks could be tracked too by using a stack.)
var blockStart = 0;

// Input offset remembered by the generic tag production
var tagStartPos = 0;

// Length of the input, looked up once
var inputLength = input.length;

// Helper for the pseudo-production that matches at the end of the input
var isEOF = function (pos) {
    return inputLength === pos;
};

// Start offset of a text run
var textStart = 0;

// Hack to support numbered external links ([http://example.com]): number
// handed out to the next unlabeled link.
// XXX: Move to token stream transform after templates are expanded!
var linkCount = 1;
// Block-level tag names, kept as a lookup table in JS so that tags can be
// matched case-independently via toLowerCase. Doing the same in the grammar
// itself would be quite ugly (and possibly slower).
var block_names = (function () {
    var lookup = {};
    [ "p", "table", "td", "tr", "ul", "ol"
    , "li", "dl", "dt", "dd", "div", "center"
    , "blockquote" ].forEach(function ( tagName ) {
        lookup[tagName] = true;
    });
    return lookup;
})();

// Keep a handle on the context this initializer runs in
var self = this;
}
// Start production: matches the whole input as a sequence of toplevelblocks
// followed by optional trailing newlines. The matched blocks are not
// returned; the action hands a final chunk holding an EOFTk to
// __parseArgs[2] and returns an empty list.
start
= e:toplevelblock* newline* {
// end is passed inline as a token, as well as a separate event for now.
// this does not work yet.
//console.log('about to emit' + pp(self));
//self._tokenizer.emit('chunk', [ { type: 'END' } ] );
//self._tokenizer.emit('end');
// Append the end (for obvious reasons this should not
// be part of a stream, only when tokenizing complete
// texts)
//console.log( pp( flatten ( e ) ) );
// NOTE(review): 'cache' is not declared in this grammar - presumably this
// resets the generated parser's result cache; confirm against the parser
// template in use.
cache = {};
__parseArgs[2]( [ new EOFTk( ) ] );
return []; //flatten(e);
}
/* All chars that cannot start syntactic structures in the middle of a line
* XXX: ] and other end delimiters should probably only be activated inside
* structures to avoid unnecessarily leaving the text production on plain
* content. */
text_char = [^'<~[{\n\r:\]}|!=]
// A run of one or more text_chars, returned as a single string
text = t:text_char+ { return t.join(''); }
/* Explanation of chars excluded from the fast path of urltext
* ' quotes (italic/bold)
* < start of xmlish_tag
* ~ signatures/dates
* [ start of links
* { start of parser functions, transclusion and template args
* \n all sort of block-level markup at start of line
* \r ditto
* f g h i m n s t w first letters of the protocols listed in url_protocol
* (ftp, git, gopher, http(s), irc(s), mailto, mms, news, nntp, svn,
* telnet, worldwind)
* space may have to be converted to a non-breaking space before ':'
* & start of an htmlentity
*
* ! and | table cell delimiters, might be better to specialize those
* = headings - also specialize those!
*
* The following chars are also included for now, but only apply in some
* contexts and should probably be enabled only in those:
* : separate definition in ; term : definition
* ] end of link
* } end of parser func/transclusion/template arg
*
* urltext itself matches, in order of preference: a run of unsuspicious
* chars, a bare URL (urllink), an html entity, a space directly before a
* colon (returned as U+00A0), or a single text_char.
*/
urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
/ & url_chars urllink
/ htmlentity
// Convert trailing space into &nbsp;
// XXX: This should be moved to a serializer
/ ' ' & ':' { return "\u00a0"; }
/ t:text_char )+
/*
'//', // for protocol-relative URLs, but not in text!
'ftp://',
'git://',
'gopher://',
'http://',
'https://',
'irc://',
'ircs://', // @bug 28503
'mailto:',
'mms://',
'news:',
'nntp://', // @bug 3808 RFC 1738
'svn://',
'telnet://', // Well if we're going to support the above.. -ævar
'worldwind://',
*/
// Old version
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
// Experimental tweaked version: avoid expensive single-char substrings
// This did not bring the expected performance boost, however.
//text = [A-Za-z0-9,._ -] {
// textStart = pos;
//
// var res = input.substr(textStart - 1, inputLength)
// .match(/[A-Za-z0-9,._ -]+/)[0];
// pos = pos + (res.length - 1);
// return res
// }
// An HTML entity: '&', one or more chars out of # 0-9 a-z A-Z, then ';'.
// Returns whatever unentity decodes the entity to.
htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
return unentity("&" + c.join('') + ";")
}
// One or more spaces or tabs, returned as a string
space
= s:[ \t]+ { return s.join(''); }
// Optional whitespace as a token list: a one-element list holding the
// whitespace string, or an empty list if there was none
optionalSpaceToken
= s:space* {
if ( s.length ) {
return [s.join('')];
} else {
return [];
}
}
// Start of line: a newline (as NlTk) or the very start of the input,
// optionally followed by comments and by one includeonly/noinclude tag.
// Returns the list of tokens for everything that was consumed.
sol = nl:(newlineToken / & { return pos === 0; } { return [] })
// Eat multi-line comments, so that syntax after still matches as if it
// was actually preceded by a newline
cn:( c:comment n:newline? {
if ( n !== '' ) {
return [c, n];
} else {
return [c];
}
}
)*
// Eat includeonly/noinclude at start of line, so that start-of-line
// syntax after it still matches
ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
{
var niToken = [];
if ( ni !== '') {
// ni is [slash, tag name]; a slash marks an end tag
if ( ni[0] === '/' ) {
niToken = [new EndTagTk( ni[1] )];
} else {
niToken = [new TagTk( ni[1] )];
}
}
return nl.concat(cn, niToken);
}
// End of input: consumes nothing, succeeds only when pos is at the input end
eof = & { return isEOF(pos); } { return true; }
// A line break, unix or windows style
newline
= '\n' / '\r\n'
// A line break as a one-element token list holding an NlTk
newlineToken = newline { return [new NlTk()] }
// End of line or end of file
eolf = newline / eof
// 'Preprocessor' directive- higher-level things that can occur in otherwise
// plain-text content: comments, template arguments / templates and html
// entities.
directive
= comment
/ tplarg_or_template
/ htmlentity
// Plain text, but can contain templates, template arguments, comments etc-
// all stuff that is normally handled by the preprocessor
// Returns a flat list of strings and tokens (the result of flatten); the
// strings are not merged, and a single string is not unwrapped.
// Stops at inline_breaks.
preprocessor_text
= r:( t:[^<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
/ directive
/ !inline_breaks text_char )+ {
return flatten ( r );
}
// Like preprocessor_text, but never matches a space and additionally stops
// at quotes in its fast path. Returns the result of flatten_string: a plain
// string if nothing but text matched, otherwise a list of strings and tokens.
spaceless_preprocessor_text
= r:( t:[^'<~[{\n\r|!\]}\t &=]+ { return t.join(''); }
/ directive
/ !inline_breaks !' ' text_char )+ {
return flatten_string ( r );
}
// Preprocessor text for link targets (used as the target of extlink).
// On top of plain chars and directives it accepts percent-encoded chars,
// punctuation (. : ,) that is not followed by whitespace or the end of the
// line, and bare & and % characters. Returns the result of flatten_string.
link_preprocessor_text
= r:( t:[^'<~[{\n\r|!\]}\t &="']+ { return t.join(''); }
/ directive
/ urlencoded_char
/ !inline_breaks no_punctuation_char
/ s:[.:,] !(space / eolf) { return s }
/ [&%] )+ {
return flatten_string ( r );
}
// Attribute values with preprocessor support
// Unquoted variant: stops at whitespace, quotes, =, <, > and at
// inline_breaks. May match nothing at all. Returns the result of
// flatten_string.
attribute_preprocessor_text
= r:( ts:(!inline_breaks t:[^=<>{\n\r&'"\t ] {return t})+ { return ts.join(''); }
/ directive
/ !inline_breaks [&%] )* {
//console.log('prep');
return flatten_string ( r );
}
// Content of a single-quoted attribute value: anything up to the next ',
// with directives tokenized. May match nothing at all. Returns the result of
// flatten_string.
attribute_preprocessor_text_single
= r:( t:[^{&']+ { return t.join(''); }
/ directive
/ !inline_breaks [{&] )* {
return flatten_string ( r );
}
// Content of a double-quoted attribute value: anything up to the next ",
// with directives tokenized. May match nothing at all. Returns the result of
// flatten_string.
attribute_preprocessor_text_double
= r:( t:[^{&"]+ { return t.join(''); }
/ directive
/ !inline_breaks [{&] )* {
//console.log( 'double:' + pp(r) );
return flatten_string ( r );
}
// A document (start production) is a sequence of toplevelblocks. Tokens are
// emitted in chunks per toplevelblock to avoid buffering the full document.
// The first token of each chunk gets a 'data-sourcePos' attribute holding the
// start and end offset of the block in the input.
toplevelblock
= & { blockStart = pos; return true; } b:block {
b = flatten(b);
if ( b.length ) {
var bs = b[0];
// A primitive string cannot carry properties: wrap it into a String object
// so that the attribs can be attached to it.
if ( bs.constructor === String && bs.attribs === undefined ) {
b[0] = new String( bs );
bs = b[0];
}
//dp('toplevelblock:' + pp(b));
if (bs.attribs === undefined) {
bs.attribs = [];
}
bs.attribs.push(new KV('data-sourcePos', blockStart + ':' + pos));
//console.log( 'toplevelblock: ' + pp( bs ));
}
// Emit tokens for this toplevelblock. This feeds a chunk to the parser
// pipeline.
// (b is flat already at this point, the second flatten only copies it)
__parseArgs[2]( flatten( b ) );
// We don't return any tokens to the start production to save memory. We
// just emitted them already to our consumers.
return true;
}
// A single block. The alternatives are tried in order:
// start-of-line structures (headings, tables, lists, indent-pre), <pre>
// blocks, a comment that ends its line, nowiki, a block-level html tag,
// a paragraph, a line of inline content, and finally a bare start of line.
// ('pre' used to be listed a second time after nowiki; that alternative could
// never match, since the first attempt at the same position already failed.)
block
= block_lines
/ pre
/ comment &eolf
/ nowiki
/ bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
/ para
/ inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
/ s:sol /*{
if (s) {
return [s, {type: 'NEWLINE'}];
} else {
return [{type: 'NEWLINE'}];
}
}*/
// A block_line at the start of a line, optionally preceded by one line that
// is empty or holds only whitespace. Returns the start-of-line tokens
// followed by the tokens of the block_line.
block_lines
= s:sol
// eat an empty line before the block
s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
bl:block_line {
// an unmatched optional yields '', which cannot be concatenated as a list
var s2_ = (s2 !== '') ? s2 : [];
return s.concat(s2_, bl);
}
// Block structures with start-of-line wiki syntax: headings, table lines
// (only tried if the line starts with {, } or |), lists, lines that consist
// of block-level tags only, indent-pre and <pre>.
block_line
= h
/// table
/ & [{}|] tl:table_lines { return tl; }
/ lists
// tag-only lines should not trigger pre
/ st:optionalSpaceToken
bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
&eolf {
return st.concat(bt);
}
/ pre_indent
/ pre
// A paragraph. We don't emit 'p' tokens to avoid issues with template
// transclusions, <p> tags in the source and the like. Instead, we perform
// some paragraph wrapping on the DOM.
// Matches two starts of line (i.e. an empty line) followed by a line of
// inline content, and returns the tokens of all three.
para
= s1:sol s2:sol c:inlineline {
return s1.concat(s2, /* [new TagTk('p')],*/ c);
}
// Optional whitespace directly before a newline, returned as a
// self-closing 'br' tag token. The newline itself is not consumed.
br = space* &newline { return new SelfclosingTagTk( 'br' ) }
// Syntax stops to limit inline expansion depending on syntactic context.
// Succeeds if the input at the current position ends the construct that one
// of the active syntaxFlags stands for:
//   table        - newline followed by ! or |, '||', '!!' or '|}'
//                  (plus a single '|' while tableCellArg is set as well)
//   colon        - ':' (unless inside an extlink or a link description)
//   extlink      - ']'
//   linkdesc     - link_end
//   h            - a run of '=' followed by optional spaces and a newline
//   template     - '|' or '}}'
//   equal        - '='
// Productions use it as '!inline_breaks' in front of each item they consume.
inline_breaks
=
& { // Important hack: disable caching for this production, as the default
// cache key does not take into account flag states!
cacheKey = '';
return true;
}
& { return syntaxFlags['table']; }
( a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a) + pos); return true; }
/ & { return syntaxFlags['tableCellArg'] }
"|" { return true }
)
/ & { return (syntaxFlags['colon'] &&
! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
! syntaxFlags.linkdesc); } ":" { return true; }
/ & { return syntaxFlags['extlink']; } "]" { return true; }
/ & { return syntaxFlags['linkdesc']; } link_end { return true; }
/ & { return syntaxFlags['h']; } '='+ space* newline { return true; }
/ & { return syntaxFlags['template']; } ('|' / '}}' ) { return true; }
/ & { return syntaxFlags['equal']; } '=' {
//console.log( 'equal stop!' );
return true;
}
// Inline content that may span several lines: url text, inline elements and
// any other single char (newlines included), up to the next inline_breaks.
// Returns a flat list in which adjacent strings are merged into one string
// and empty strings are dropped. Unlike flatten_string, the result is always
// a list, and only primitive strings count as text.
inline
= c:(urltext / (! inline_breaks (inline_element / . )))+ {
var out = [];
var text = [];
c = flatten(c);
for (var i = 0, l = c.length; i < l; i++) {
var ci = c[i];
if (typeof ci === 'string') {
if(ci !== '') {
text.push(ci);
}
} else {
// a token ends the current run of text
if (text.length) {
out.push( text.join('') );
text = [];
}
out.push(ci);
}
}
if (text.length) {
out.push( text.join('') );
}
//dp('inline out:' + pp(out));
return out;
}
// Inline content on a single line: the same as inline, except that the
// fallback char is [^\n], so a match never extends across a newline.
// Returns a flat list with adjacent strings merged and empty strings dropped.
inlineline
= c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
var out = [];
var text = [];
c = flatten(c);
for (var i = 0; i < c.length; i++) {
var ci = c[i]
if (typeof ci == 'string') {
if(ci !== '') {
text.push(ci);
}
} else {
// a token ends the current run of text
if (text.length) {
out.push( text.join('') );
text = [];
}
out.push(ci);
}
}
if (text.length) {
out.push( text.join('') );
}
//dp('inlineline out:' + pp(out));
return out;
}
// An inline construct, selected by a lookahead on its first char:
//   <  comment or xmlish tag
//   {  template argument or template
//   [  wikilink or external link ('[[[' is returned as plain text)
//   '  quote (italic / bold)
// NOTE(review): the second '{' alternative tries the same productions as
// the first one (see tplarg_or_template), so it looks redundant - confirm.
inline_element
= //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
& '<' ( comment / xmlish_tag )
/ & '{' ( & '{{{{{' template / tplarg / template )
/ & '{' tplarg_or_template
/// & '{' ( tplarg / template )
// Eat three opening brackets as text.
/ '[[[' { return '[[[' }
/ & '[' ( wikilink / extlink )
/ & "'" quote
/* Headings */
// A heading line: '=' markers, inline content, '=' markers, then only
// whitespace and comments up to the end of the line. The level is the
// smaller of the two marker counts, capped at 6 since HTML defines no
// heading beyond h6; surplus '=' on either side become part of the heading
// text. Returns [TagTk hN, ...content, EndTagTk hN, trailing space/comments].
h = & "=" // guard, to make sure '='+ will match.
    // XXX: Also check to end to avoid inline parsing?
    r:(
        s:'='+ // moved in here to make s accessible to inner action
        & { return setFlag('h'); }
        c:inlineline
        e:'='+
        spc:(sp:space+ { return sp.join('') } / comment)*
        &eolf
        {
            clearFlag('h');
            var level = Math.min(s.length, e.length, 6);
            // convert surplus equals into text
            // s and e are lists of '=' chars (results of '='+), so the
            // surplus has to be joined into a string first
            if(s.length > level) {
                var leading = s.slice(level).join('');
                if(c.length && c[0].constructor === String) {
                    c[0] = leading + c[0];
                } else {
                    c.unshift( leading );
                }
            }
            if(e.length > level) {
                var trailing = e.slice(level).join(''),
                    lastIndex = c.length - 1;
                if(lastIndex >= 0 && c[lastIndex].constructor === String) {
                    // strings are immutable: store the result back into c
                    c[lastIndex] = c[lastIndex] + trailing;
                } else {
                    c.push( trailing );
                }
            }
            return [new TagTk( 'h' + level )]
                .concat(c, [new EndTagTk( 'h' + level ), spc]);
        }
        // not a heading after all: reset the flag and fail
        / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
    ) { return r }
// Indent-pre: one or more consecutive lines that start with whitespace,
// wrapped into 'pre' start and end tag tokens.
pre_indent
= l:pre_indent_line ls:(sol pre_indent_line)* {
return [new TagTk( 'pre' )]
.concat( [l], ls
, [new EndTagTk( 'pre' )]);
}
// A single line of an indent-pre block. The leading whitespace is dropped
// and the line's content is returned with a '\n' in front of it.
pre_indent_line = space l:inlineline {
return [ '\n' ].concat(l);
}
// An html comment, closed by '-->' or by the end of the input. Further
// comments that follow after a newline (with optional whitespace around it)
// are swallowed as well. Returns a list that starts with a
// { type: 'COMMENT', value: ... } object, followed by the results of the
// swallowed comments.
comment
= '<!--' c:comment_chars* ('-->' / eof)
cs:(space* newline space* cn:comment { return cn })* {
return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
}
// A char inside of a comment: anything except for the '-' that starts '-->'
comment_chars
= c:[^-] { return c; }
/ c:'-' !'->' { return c; }
// A bare URL in text (not matched inside of an extlink). Returns an 'a'
// start tag with the URL as href, the URL as link text, and the end tag.
urllink
= ! { return syntaxFlags['extlink'] }
target:url {
return [ new TagTk( 'a', [new KV('href', target)] )
, target
, new EndTagTk( 'a' )
];
}
// External link: [target] or [target link text]. Links without text get a
// running number ("[1]", "[2]", ...) as text. Returns an 'a' start tag with
// href and data-type=external attributes, the text tokens and the end tag.
extlink
= ! { return syntaxFlags['extlink'] } // extlink cannot be nested
(
"["
& { return setFlag('extlink'); }
//target:urllink
target:link_preprocessor_text
text:(space* t:inlineline { return t } )?
"]" {
clearFlag('extlink');
if ( text === '' ) {
// XXX: Link numbering should be implemented in post-processor.
text = [ "[" + linkCount + "]" ];
linkCount++;
}
var res = [
new TagTk( 'a', [
new KV('href', target),
new KV('data-type', 'external')
] ),
].concat( text
, [ new EndTagTk( 'a' )]);
//console.log( JSON.stringify( res, null, 2 ) );
return res;
}
// The first alternative failed after setting the flag: reset it and fail
/ "[" & { clearFlag('extlink'); return false; }
)
/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
* be configured dynamically. */
// url_chars holds the chars that one of the protocols below can start
// with; it serves as a cheap lookahead before url matching is attempted.
url_chars = [/fghimnstw]
url_protocol
= '//' // for protocol-relative URLs
/ 'ftp://'
/ 'git://'
/ 'gopher://'
/ 'http://'
/ 'https://'
/ 'irc://'
/ 'ircs://' // @bug 28503
/ 'mailto:'
/ 'mms://'
/ 'news:'
/ 'nntp://' // @bug 3808 RFC 1738
/ 'svn://'
/ 'telnet://' // Well if we're going to support the above.. -ævar
/ 'worldwind://'
// javascript does not support unicode features..
// ..so the space separator chars are listed explicitly.
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
// A single percent-encoded byte (%XX), decoded where decodeURI can decode it
// on its own. Escapes that decodeURI leaves alone or rejects are returned
// verbatim.
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
    var encoded = "%" + c0 + c1;
    try {
        return decodeURI(encoded);
    } catch ( e ) {
        // decodeURI throws a URIError for a byte that is not a complete
        // UTF-8 sequence by itself (e.g. each half of %C3%A4). Keep the
        // escape as it is instead of aborting the whole parse.
        return encoded;
    }
}
// A char that can appear anywhere in a URL: no whitespace or control chars,
// no brackets, quotes or angle brackets, none of the separator spaces, and
// none of the chars that the url production handles separately (: , . & %).
// Character class this was derived from:
//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
// A URL: protocol, optional IP address literal, and at least one further
// char. Punctuation (. : ,) only counts as part of the URL if it is not
// followed by whitespace or the end of the line. Returns the URL as a
// string, with html entities and percent-encoded chars decoded.
url
= proto:url_protocol
addr:( ipv6_address / ipv4_address )?
rest:( ( !inline_breaks
c:no_punctuation_char
{ return c }
)
/ s:[.:,] !(space / eolf) { return s }
/ htmlentity
/ urlencoded_char
/ [&%] )+
{
return proto + addr + rest.join('');
}
// Dotted quad, returned as a string. Each of the four groups may be empty,
// and the number of digits per group is not limited.
ipv4_address
= a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*)
{
return flatten( a ).join('');
}
// IPv6 address literal in square brackets, e.g. [2001:db8::1], returned as a
// string including the brackets. Accepts hex digits, ':' and '.' (for an
// embedded IPv4 part) between the brackets; the address is not validated any
// further. The previous version only accepted decimal digits in exactly
// seven groups, so real IPv6 literals never matched; everything it did
// accept is still accepted.
ipv6_address
= a:('[' [0-9a-fA-F:.]+ ']')
{
    return flatten( a ).join('');
}
// Template argument or template. Five opening braces are tried as a template
// first (a template whose target starts with a template argument).
tplarg_or_template = & '{{{{{' template / tplarg / template
// Template transclusion: {{target|param|...}}. Returns a self-closing
// 'template' tag token whose attributes are the target (under the empty key)
// followed by the parameters.
template
= "{{" target:template_param_text
params:(newline? "|" newline? p:template_param { return p })*
"}}" {
// Insert target as first positional attribute, so that it can be
// generically expanded. The TemplateHandler then needs to shift it out
// again.
params.unshift( { k: '', v: flatten( target ) } );
var obj = new SelfclosingTagTk( 'template', params );
//console.log( 'tokenizer template ' + JSON.stringify( target ));
return obj;
}
// XXX: support template and args in target!
//template_target
// = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }
// Template argument: {{{name|default|...}}}. Returns a self-closing
// 'templatearg' tag token whose attributes are the name (under the empty key)
// followed by the remaining parameters.
tplarg
= "{{{"
name:template_param_text
params:( newline? "|" newline? p:template_param { return p })*
"}}}" {
name = flatten( name );
params.unshift( { k: '', v: name } );
var obj = new SelfclosingTagTk( 'templatearg', params );
//console.log( 'tokenizer tplarg ' + JSON.stringify( obj, null, 2 ));
return obj;
}
// A single template parameter, returned as a KV:
//   name = value   named parameter (the value may be empty)
//   value          positional parameter, the key is an empty list
//   (nothing)      empty positional parameter, directly before | or }
template_param
= name:template_param_name space* "=" space* c:template_param_text? {
//console.log( 'named template_param matched' + pp([name, flatten( c )]) );
if ( c !== '' ) {
return new KV(name, flatten( c ));
} else {
return new KV(name, []);
}
} / c:template_param_text {
return new KV([], flatten( c ) );
}
/ & [|}] { return new KV([], []); }
// FIXME: handle template args and templates in key! (or even parser functions?)
// Name of a named template parameter: template_param_text matched while the
// 'equal' flag is set, so that matching stops at '='. The second alternative
// resets the flag and fails if the first one did not match.
template_param_name
= & { return setFlag( 'equal' ) }
tpt:template_param_text
& { clearFlag( 'equal' ); return true; }
{
//console.log( 'template param name matched: ' + pp( tpt ) );
return tpt;
}
/ & { return clearFlag( 'equal' ) }
//= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
// Text inside of a template or template argument: inline content matched
// while the 'template' flag is set, so that matching stops at '|' and '}}'.
// The second alternative resets the flag and fails if inline did not match.
template_param_text
= & { return setFlag('template') }
il:inline {
clearFlag('template');
return il;
}
/ & { return clearFlag('template'); }
// TODO: handle link prefixes as in al[[Razi]]
// Internal link: [[target]] or [[target|text]], plus a trail of text chars
// that directly follow the closing brackets. Returns an 'a' start tag with
// data-type=internal and href attributes, the text tokens (the link text, or
// a copy of the target if there is none, followed by the trail) and the end
// tag. A target that is a URL is not matched.
wikilink
= "[["
! url
//target:link_target
target:preprocessor_text
lcontent:( "|" lt:link_text { return lt } )*
"]]"
// XXX In real MediaWiki, this is a language-dependent positive character
// class. Can we work out a static negative class instead?
// XXX: Exclude uppercase chars from non-latin languages too!
trail:(! [A-Z \t(),.:-] tc:text_char { return tc })* {
var obj = new TagTk( 'a',
[
new KV('data-type', 'internal')
] ),
textTokens = [];
obj.attribs.push( new KV('href', target) );
// NOTE(review): trail is the result of a '*' repetition, i.e. always a
// list - the 'if (trail)' checks below presumably never take their else
// branch; confirm.
if (lcontent && lcontent.length) {
textTokens = lcontent;
if (trail) {
textTokens.push( trail.join('') );
}
} else {
if (trail) {
textTokens = $.extend(true, [], target).concat( [ trail.join('') ] );
} else {
// copy list
textTokens = $.extend(true, [], target);
}
}
//console.log( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a' )])) );
return [obj].concat(textTokens, [new EndTagTk( 'a' )]);
}
// Plain-string link target: everything up to '|', a newline or the closing
// ']]', with percent-encoded chars decoded. Not referenced by wikilink at
// the moment (see the commented-out 'target:link_target' there).
link_target
= h:( c:[^|%\n\]]+ { return c.join('') } // quickly eat anything unsuspicious
/ !"]]"
hi:(
[^|%\n]
/ urlencoded_char
/ '%'
) { return hi }
)* { return h.join(''); }
// Description part of a wikilink: a line of inline content matched while the
// 'linkdesc' flag is set, so that matching stops at link_end. The second
// alternative resets the flag and fails if inlineline did not match.
link_text
= & { return setFlag('linkdesc'); }
h:inlineline
{
clearFlag('linkdesc');
return h;
}
/ & { clearFlag('linkdesc'); return false }
// Closing brackets of a wikilink
link_end = "]]"
/* Generic quote production for italic and bold, further processed in a token
* stream transformation in doQuotes. Relies on NlTk tokens being emitted
* for each line of text to balance quotes per line.
*
* We are not using a simple pair rule here as we need to support mis-nested
* bolds/italics and MediaWiki's special heuristics for apostrophes, which are
* all not context free.
*
* Matches a run of two or more apostrophes and returns it as the value of
* an 'mw-quote' tag token. */
quote = "''" x:"'"* {
var res = new TagTk( 'mw-quote' ); // Will be consumed in token transforms
res.value = "''" + x.join('');
return res;
}
/* XXX: Extension tags can require a change in the tokenizer mode, which
* returns any text between extension tags verbatim. For now, we simply
* continue to parse the contained text and return the tokens. The original
* input source can be recovered from the source positions added on tag
* tokens. This won't however work in all cases. For example, a comment start
* (<!--) between extension tags would cause the remaining text to be consumed
* as a comment. To avoid this, we might need to look ahead for the end tag
* and limit the content parsing to this section. */
// Any tag in angle brackets; nowiki gets a chance before the generic rule
xmlish_tag = nowiki / generic_tag
// A <pre> block, closed by </pre> or by the end of the input. The content
// is returned as text and is not parsed any further, except for nowiki
// sections, which are unwrapped. The result is wrapped into 'pre' start and
// end tag tokens. A stray </pre> is returned as plain text.
pre
= "<pre"
attribs:generic_attribute*
">"
ts:(t1:[^<]+ { return t1.join('') }
/ nowiki
/ !"</pre>" t2:. { return t2 })+
("</pre>" / eof) {
// return nowiki tags as well?
//console.log('inpre');
return [ new TagTk( 'pre', attribs ) ]
.concat(ts, [ new EndTagTk( 'pre' ) ]);
}
/ "</pre>" { return "</pre>"; }
// <nowiki>...</nowiki>: returns the enclosed text as a one-element list,
// without the tags. An unbalanced start or end tag is returned as plain
// text.
nowiki
= "<nowiki>" nc:nowiki_content "</nowiki>" {
//console.log( 'full nowiki return: ' + pp(nc));
return nc;
}
/ "<nowiki>" {
//console.log('nowiki fallback');
return ['<nowiki>'];
}
/ "</nowiki>" {
//console.log('nowiki end fallback');
return ['</nowiki>'];
}
// Text inside of a nowiki section, returned as a one-element list holding a
// single string. Matching stops in front of '</nowiki>' and '</pre>'; a
// complete nested <pre>...</pre> section is kept in the text verbatim.
nowiki_content
= ts:( t:[^<]+ { return t.join('') }
/ "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
//console.log('nested pre in nowiki');
return ["<pre"].concat(p0, p1, [">"], p2, ["</pre>"]).join('');
}
/ (!("</"( "nowiki>" / "pre>")) c:. {
//console.log('nowiki: single char' + c);
return c;
})
)* {
// return nowiki tags as well?
//console.log('nowiki_content: return' + pp(ts));
return [ts.join('')];
}
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
// A start, end or self-closing html tag whose name is listed in block_names
// (compared in lower case). Returns a one-element list holding the tag
// token; tags with any other name do not match.
block_tag
= "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
attribs:generic_attribute*
selfclose:"/"?
">" {
if (block_names[name.toLowerCase()] !== true) {
// abort match if tag is not block-level
return null;
}
var res;
// unmatched optionals are '', matched ones hold the slash
if ( end != '' ) {
res = new EndTagTk( name, attribs );
} else if ( selfclose != '' ) {
res = new SelfclosingTagTk( name, attribs );
} else {
res = new TagTk( name, attribs );
}
return [res];
}
/* Generic XML-like tags
*
* These also cover extensions (including Cite), which will hook into the
* token stream for further processing. The content of extension tags is
* parsed as regular inline, but the source positions of the tag are added
* to allow reconstructing the unparsed text from the input. */
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
// Returns a start, end or self-closing tag token. Its last attribute is
// 'data-sourceTagPos', holding the offsets of the tag's '<' and of the end
// of the tag in the input, separated by a colon.
generic_tag
= "<"
& { tagStartPos = pos; return true; } // remember the start position of this tag
end:"/"? name:[0-9a-zA-Z]+
attribs:generic_newline_attribute*
( space / newline ) *
selfclose:"/"?
">" {
name = name.join('');
var res;
if ( end != '' ) {
res = new EndTagTk( name, attribs );
} else if ( selfclose != '' ) {
res = new SelfclosingTagTk( name, attribs );
} else {
res = new TagTk( name, attribs );
}
// tagStartPos was taken after the '<', hence the - 1
res.attribs.push(new KV('data-sourceTagPos', (tagStartPos - 1) + ":" + pos));
return res;
}
// A tag attribute that may be preceded by and contain newlines. Returns a
// KV of name and value; the value is '' if the attribute has none.
generic_newline_attribute
= s:( space / newline )*
name:generic_attribute_name
value:(( space / newline )*
v:generic_attribute_newline_value { return v })?
{
if ( value !== '' ) {
return new KV( name, value );
} else {
return new KV( name, '' );
}
}
// A tag attribute on a single line. Returns a KV of name and value; the
// value is '' if the attribute has none.
generic_attribute
= s:space*
name:generic_attribute_name
value:(space*
v:generic_attribute_value { return v })?
{
if ( value !== '' ) {
return new KV( name, value );
} else {
return new KV( name, '' );
}
}
// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
// disallow newlines, | and {.
// Returns the attribute name as a string.
generic_attribute_name
= n:[^ \t\0/"'>=\n|{]+ {
return n.join('');
}
// '=' followed by an attribute value; whitespace and newlines are allowed
// after the '='. The value can contain preprocessor directives.
generic_attribute_newline_value
= "=" (space / newline )* v:xml_att_value {
return v;
}
// '=' followed by an attribute value on the same line. The value is plain
// text.
generic_attribute_value
= "=" space* v:att_value {
return v;
}
// XXX: attributes can contain templates and template args!!
// Attribute value with preprocessor support: single-quoted, double-quoted
// or unquoted. The quotes are not part of the result.
xml_att_value
= "'" t:attribute_preprocessor_text_single "'" { return t; }
/ '"' t:attribute_preprocessor_text_double '"' { return t; }
/ attribute_preprocessor_text
// XXX: attributes can contain templates and template args!!
// Plain-text attribute value, returned as a string: unquoted (up to
// whitespace, a quote, <, >, = or inline_breaks), single-quoted or
// double-quoted. The quotes are not part of the result, and the quoted forms
// are passed through unquote.
att_value
= t:(!inline_breaks c:[^ \t'"<>='\n] { return c } )+ {
return t.join('');
}
// XXX: is "\"" also valid html? or just Wikitext?
/ "'" t:[^'>]* "'" { return unquote("'", t.join('')); }
/ '"' t:[^">]* '"' { return unquote('"', t.join('')); }
/* Lists */
// One or more consecutive list item lines. The listItem tokens of the lines
// are wrapped into 'list' start and end tokens and handed to annotateList,
// whose result is returned.
lists = e:(dtdd / li) es:(sol (dtdd / li))*
{
return annotateList( [ new TagTk( 'list' ) ]
.concat(flatten([e].concat(es))
,[ new EndTagTk( 'list' ) ]));
}
// A list item line: one or more bullets and optional inline content up to
// the end of the line. Returns a 'listItem' tag token that carries the list
// of bullet chars in its bullets property, followed by the content.
li = bullets:list_char+
c:inlineline?
&eolf
{
// an unmatched optional yields ''
if ( c == '' )
c = [];
var li = new TagTk( 'listItem' );
li.bullets = bullets;
return [ li, c ];
}
// Definition term and definition on a single line: ';term:definition',
// optionally nested inside other lists ('*;term:definition'). Returns two
// 'listItem' tag tokens - one whose bullets end in ';' for the term and one
// whose bullets end in ':' for the definition - each followed by its
// content. The bullets are lists of single chars, the same shape as the ones
// produced by li.
dtdd
= // The label unwraps each bullet char: without it every element would be
  // the [predicate result, char] pair of the sequence, and the bullets of
  // nested items would contain more than the bullet chars.
  bullets:(!(";" !list_char) lc:list_char { return lc })*
  ";"
  & {return setFlag('colon');}
  c:inlineline
  ":"
  // Fortunately dtdds cannot be nested, so we can simply set the flag
  // back to 0 to disable it.
  & {syntaxFlags['colon'] = 0; return true;}
  d:inlineline
  &eolf {
      // Convert trailing space into &nbsp;
      // XXX: This should be moved to a serializer
      //var clen = c.length;
      //if (clen && c[clen - 1].constructor === String) {
      //    var val = c[clen - 1].value;
      //    if(val.length && val[val.length - 1] == ' ') {
      //        c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
      //    }
      //}
      var li = new TagTk( 'listItem' );
      li.bullets = bullets.concat( [ ";" ] );
      var li2 = new TagTk( 'listItem' );
      li2.bullets = bullets.concat( [ ":" ] );
      return [ li ].concat( c, [ li2 ], d );
  }
  // Fall-back case to clear the colon flag
  / & { return true; } { syntaxFlags['colon'] = 0; return null; }
// A list bullet: * (unordered), # (ordered), ; (term) or : (definition)
list_char = [*#:;]
/**
* Tables
*
* Table productions are geared to support independent parsing of fragments in
* templates (the common table start / row / table end use case). The tokens
* produced by these fragments then match up to a table while building the
* DOM tree. For similar reasons, table rows do not emit explicit end tag
* tokens.
*
* The separate table_lines production is faster than moving those productions
* directly to block_lines.
* */
// One or more consecutive table lines, matched while the 'table' flag is
// set. Returns the tokens of all lines, including the start-of-line tokens
// between them. The second alternative resets the flag and fails if the
// first one did not match.
table_lines
= & { return setFlag('table'); }
tl:table_line
tls:( s:sol tl2:table_line { return s.concat(tl2); } )* {
clearFlag('table');
//console.log('table_lines: ' + pp(tl.concat(tls)));
return tl.concat( tls );
}
/ & { return clearFlag('table'); }
// This production assumes start-of-line position!
// A single line of table markup: table start, row, cells, heading cells,
// caption or table end.
table_line
= table_start_tag
/ table_row_tag
/ table_data_tags
/ table_heading_tags
/ table_caption_tag
/ table_end_tag
// Table start: '{|' followed by optional attributes. Returns a 'table' start
// tag token, followed by the table end tag token if the table is closed on
// the same line.
table_start_tag
= "{|"
ta:generic_attribute*
space*
te:table_end_tag? // can occur somewhere in the middle of the line too
{
var toks = [ new TagTk( 'table' ) ];
if ( ta )
toks[0].attribs = ta;
// te is '' if there was no table end on this line
if ( te )
toks = toks.concat( te );
return toks;
}
// Table caption: '|+' followed by inline content, wrapped into 'caption'
// start and end tag tokens.
table_caption_tag
= "|+"
c:inline* {
return [ new TagTk( 'caption' )]
.concat( c, [ new EndTagTk( 'caption' ) ]);
}
// Table row: '|-' followed by optional attributes. Returns a 'tr' start tag
// token, followed by the tokens of an implicit first cell if the next line
// does not start with a cell marker (| or !).
table_row_tag
= //& { console.log("table row enter"); return true; }
"|-"
a:generic_attribute*
space*
// handle tables with missing table cells after a row
td:( s:sol ![|!] tdt:table_data_tag { return s.concat(tdt); } )?
{
// We rely on our tree builder to close the row as needed. This is
// needed to support building tables from fragment templates with
// individual cells or rows.
var trToken = [ new TagTk( 'tr', a ) ];
if ( !td ) {
return trToken;
} else {
return trToken.concat(td);
}
}
// A line of table cells: '|' followed by one or more cells that are
// separated by '||'. Returns the tokens of the first cell followed by the
// token lists of the other cells.
table_data_tags
= "|"
td:table_data_tag
tds:( "||" tdt:table_data_tag { return tdt } )* {
return td.concat(tds);
}
// A single table cell (the part after its '|' or '||' marker): optional
// attributes terminated by '|', then any number of blocks up to the next
// inline_breaks. The content must not start with }, + or -, which would
// make the line a table end, caption or row instead. Returns the content
// wrapped into 'td' start and end tag tokens.
table_data_tag
= //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
! [}+-]
a:table_cell_args?
//& { console.log("past attrib, pos=" + pos + input.substr(pos,10)); return true; }
// use inline_breaks to break on tr etc
td:( !inline_breaks
//& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
b:block { return b } )*
{
// an unmatched optional yields ''
if ( a == '' ) {
a = [];
}
//dp("table data result: " + pp(td) + ", attribts: " + pp(a));
return [ new TagTk( 'td', a )]
.concat( td, [ new EndTagTk ( 'td' ) ] );
}
// A line of table heading cells: '!' followed by one or more heading cells
// that are separated by '!!'. Returns the tokens of the first cell followed
// by the token lists of the other cells.
table_heading_tags
= "!"
th:table_heading_tag
ths:( "!!" tht:table_heading_tag { return tht } )* {
return th.concat(ths);
}
// A single table heading cell (the part after its '!' or '!!' marker):
// optional attributes terminated by '|', then inline content. Returns the
// content wrapped into 'th' start and end tag tokens.
table_heading_tag
= a:table_cell_args?
c:inline {
// an unmatched optional yields ''
if ( a == '' ) {
a = [];
}
return [ new TagTk( 'th', a ) ]
.concat( c, [new EndTagTk( 'th' )] );
}
// Table end: '|}'. Returns a one-element list holding the 'table' end tag
// token.
table_end_tag
= "|}" {
var tok = [new EndTagTk( 'table' )];
return tok;
}
// Attributes of a table cell: one or more attributes followed by a single
// '|' (not '||', which separates cells). Matched while the 'tableCellArg'
// flag is set. Returns the list of attributes. The second alternative resets
// the flag and fails if the first one did not match.
table_cell_args
= & { return setFlag('tableCellArg'); }
as:generic_attribute+ space* "|" !"|" {
clearFlag('tableCellArg');
return as;
}
/ & { return clearFlag('tableCellArg'); }
/* Tables, full-table version (no support for table fragments from templates)
* Only left here for comparison purposes with the above productions. Remove
* when those work as intended! */
// A complete table from '{|' up to '|}' or the end of the input. Returns the
// 'table' start tag token (carrying the table attributes), the caption
// tokens if there is a caption, the body tokens and the 'table' end tag
// token.
table
= tas:table_start space* c:table_caption? b:table_body? te:table_end {
var res = new TagTk( 'table' );
// unmatched optionals yield ''
var body = b !== '' ? b : [];
dp("body: " + pp(body));
if (tas.length > 0) {
// FIXME: actually parse and build structure
//res.attribs = [new KV('data-unparsed', tas.join(''))];
res.attribs = tas;
}
if (c != '') {
var caption = [ new TagTk( 'caption' ) ]
.concat(c, [new EndTagTk( 'caption' )], te);
} else {
var caption = [];
}
//dp(pp(res));
return [res].concat(caption, body, [new EndTagTk( 'table' )]);
}
// Legacy table start: '{|' followed by optional attributes. Sets the 'table'
// flag, which is left set on success (table_end resets it), and returns the
// list of attributes. The second alternative resets the flag and fails.
table_start
= "{|"
res:(
& { setFlag('table'); return true; }
ta:generic_attribute*
{
dp("table_start " + pp(ta) + ", pos:" + pos);
return ta;
}
/ & { clearFlag('table'); return false; } { return null; }
) { return res }
// Legacy table caption: a newline, '|+' and inline content. Returns the
// newline token followed by the content.
table_caption
= n:newlineToken
"|+" c:inline* {
return n.concat(c);
}
// Legacy table body: an optional first row that is not introduced by '|-',
// followed by any number of regular rows. Returns the list of rows.
table_body
= //& { dp("table_body enter"); return true; }
firstrow:table_firstrow otherrows:table_row* {
/* dp('table first and otherrows: '
* + pp([firstrow].concat(otherrows))); */
return [firstrow].concat(otherrows);
}
/ otherrows:table_row* {
//dp('table otherrows: ' + pp(otherrows));
return otherrows;
}
// Legacy first table row: one or more cells or heading cells that are not
// preceded by a '|-' line, wrapped into 'tr' start and end tag tokens.
table_firstrow
= td:(table_data / table_heading)+ {
//dp('firstrow: ' + pp(td));
return [ new TagTk( 'tr' )]
.concat(td, [ new EndTagTk( 'tr' )]);
}
// Legacy table row: a newline, '|-', optional attributes and any number of
// cells or heading cells. Returns the newline token followed by the cells
// wrapped into 'tr' start and end tag tokens. The matched attributes (a) are
// not attached to the 'tr' token.
table_row
= //& { dp("table row enter"); return true; }
n:newlineToken
"|-"
a:(as:generic_attribute+ space* "|" !"|" { return as } )?
space*
td:(table_data / table_heading)* {
return n.concat([ new TagTk( 'tr' )]
, td, [ new EndTagTk( 'tr' )]);
}
// Legacy table cell: introduced by '||' or by a newline (optionally followed
// by '|'), then optional attributes terminated by '|' and any number of
// blocks up to the next inline_breaks. Returns the newline token (if any)
// followed by the content wrapped into 'td' start and end tag tokens.
table_data
= //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
n:("||" { return [] } / nt:newlineToken ("|" / !'!') { return nt })
! [}+-]
//& { dp('before attrib, pos=' + pos); return true; }
a:(as:generic_attribute+ space* "|" !"|" { return as } )?
//& { dp('past attrib, pos=' + pos); return true; }
// use inline_breaks to break on tr etc
td:(!inline_breaks
//& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
b:block { return b })* {
// an unmatched optional yields ''
if ( a == '' ) {
a = [];
}
//dp("table data result: " + pp(td) + ", attribts: " + pp(a));
return n.concat( [ new TagTk( 'td', a ) ]
, td, [ new EndTagTk( 'td' ) ] );
}
// Legacy table heading cell: introduced by '!!' or by a newline followed by
// '!', then optional attributes terminated by '|' and inline content.
// Returns the newline token (if any) followed by the content wrapped into
// 'th' start and end tag tokens.
table_heading
= n:("!!" { return [] } / nl:newlineToken "!" { return nl })
a:(as:generic_attribute+ "|" !"|" { return as } )?
c:inline {
// an unmatched optional yields ''
if ( a == '' ) {
a = [];
}
return n.concat( [ new TagTk( 'th', a )]
, c, [ new EndTagTk( 'th' ) ]);
}
// Unparsed cell attributes: text and a few punctuation chars, terminated by
// a single '|' (not '||'). Returns the list of matched strings. Not
// referenced by any other production visible in this file.
thtd_attribs
// In particular, do not match [|\n]
= a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ "|" ! "|" {
return a;
}
// Legacy table end: an optional newline followed by '|}' or the end of the
// input. Resets the 'table' flag set by table_start and returns the newline
// token, or an empty list if there was no newline.
table_end
= nt:newlineToken? ( "|}" / eof ) {
clearFlag('table');
if(nt)
return nt;
else
return [];
}
/* Tabs do not mix well with the hybrid production syntax */
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent : */