mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-15 18:39:52 +00:00
2efcd3cd57
construction' part of the HTML5 spec: http://www.whatwg.org/specs/web-apps/current-work/multipage/urls.html#url-manipulation-and-creation Removed a few whitelisted test cases that are now passing directly. The encoding canonicalization could also be moved to the Sanitizer. Doing this early in token stream processing however has the advantage of providing further transformations uniform data to work with. We could even consider to move this even further into the tokenizer.
1745 lines
52 KiB
JavaScript
1745 lines
52 KiB
JavaScript
/**
 * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
 * chunks of tokens (one chunk per top-level block matched) and eventually an
 * end event. Tokens map to HTML tags as far as possible, with custom tokens
 * used where further processing on the token stream is needed.
 *
 * @author Gabriel Wicke <gwicke@wikimedia.org>
 * @author Brion Vibber <brion@wikimedia.org>
 */
|
|
{
|
|
/* FIXME: move these static functions into a separate module! Unfortunately,
 * this does not work:
 *     var tu = require('./mediawiki.tokenizer.utils.js');
 *     console.warn(tu.flatten([]));
 * Using exports in the module gets a bit further, but accesses to
 * tu.flatten in productions still fail. Thus, I just moved the functions
 * here until a solution is found:
 */
|
|
|
|
/* Static utilities */
|
|
|
|
// Flatten a list of lists.
|
|
//var simple_flatten = function ( e ) {
|
|
// // Fast single-level flatten:
|
|
// //return [].concat.apply([], e);
|
|
//
|
|
// var es = [];
|
|
// // flatten sub-arrays
|
|
// for(var i = 0, length = e.length; i < length; i++) {
|
|
// var ei = e[i];
|
|
// if ($.isArray(ei))
|
|
// es = es.concat(flatten(ei));
|
|
// else
|
|
// es.push(ei);
|
|
// };
|
|
// return es;
|
|
//};
|
|
|
|
/**
 * Recursively flatten nested arrays into a single-level array.
 *
 * Runs of consecutive non-array items are copied in one slice rather than
 * item by item.
 *
 * @param {Array} e List that may contain nested lists at any depth.
 * @returns {Array} A new flat array, or `e` itself (same reference) when it
 *   contains no nested arrays.
 */
var flatten = function ( e ) {
    var es = [],
        // Index where the current run of non-array items started, or null
        // when no run is pending.
        chunkStart = null,
        modified = false;

    for ( var i = 0, l = e.length; i < l; i++ ) {
        // Array.isArray (unlike a `.constructor === Array` test) does not
        // throw on null / undefined items.
        if ( Array.isArray( e[i] ) ) {
            if ( chunkStart !== null ) {
                // Flush the pending run of plain items, then the sub-array.
                es = es.concat( e.slice( chunkStart, i ), flatten( e[i] ) );
                chunkStart = null;
            } else {
                es = es.concat( flatten( e[i] ) );
            }
            modified = true;
        } else if ( chunkStart === null ) {
            chunkStart = i;
        }
    }

    if ( !modified ) {
        // Nothing was nested: hand back the input untouched.
        return e;
    }
    if ( chunkStart !== null ) {
        es = es.concat( e.slice( chunkStart, e.length ) );
    }
    return es;
};
|
|
|
|
|
|
/**
 * Flatten a nested list of strings and tokens via flatten_stringlist, and
 * unwrap the result when it consists of exactly one string.
 *
 * @param {Array} c Nested list of strings and tokens.
 * @returns {String|Array} The single string, or the flattened list.
 */
var flatten_string = function ( c ) {
    var out = flatten_stringlist( c );
    // The .constructor test accepts primitive strings as well as String
    // objects.
    if ( out.length === 1 && out[0].constructor === String ) {
        return out[0];
    }
    return out;
};
|
|
|
|
/**
 * Flatten a nested list and merge each run of adjacent strings into a single
 * string. Non-string items are kept in place and separate the runs; empty
 * strings are dropped.
 *
 * @param {Array} c Nested list of strings and tokens.
 * @returns {Array} Flat list of merged strings and the original tokens.
 */
var flatten_stringlist = function ( c ) {
    var out = [],
        text = '';
    c = flatten( c );
    for ( var i = 0, l = c.length; i < l; i++ ) {
        var ci = c[i];
        if ( ci.constructor === String ) {
            if ( ci !== '' ) {
                text += ci;
            }
        } else {
            // A token ends the current text run.
            if ( text !== '' ) {
                out.push( text );
                text = '';
            }
            out.push( ci );
        }
    }
    if ( text !== '' ) {
        out.push( text );
    }
    return out;
};
|
|
|
|
// Remove escaped quotes from attributes etc
|
|
// This was in the original PEG parser, but could not find anything in
|
|
// MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke
|
|
/**
 * Remove the backslash from every backslash-escaped quote character in text.
 *
 * @param {String} quotec The quote character, e.g. "'" or '"'.
 * @param {String} text Text possibly containing `\` + quotec sequences.
 * @returns {String} Text with each `\` + quotec replaced by quotec.
 */
var unquote = function ( quotec, text ) {
    // split/join replaces every occurrence; String.replace with a string
    // pattern would only replace the first one.
    return text.split( '\\' + quotec ).join( quotec );
};
|
|
|
|
// Decode html entities. In a browser, this should only be fed the entity,
|
|
// not untrusted html! XXX: replace with safer version.
|
|
/**
 * Decode an HTML entity by letting jQuery parse it into a detached div and
 * reading back the text content.
 *
 * SECURITY: the argument is parsed as HTML. Only pass a bare entity, never
 * untrusted markup (see the XXX note above about a safer replacement).
 *
 * @param {String} entity Entity source such as '&amp;'.
 * @returns {String} The text content jQuery reports for the parsed entity.
 */
var unentity = function ( entity ) {
    return $( "<div/>" ).html( entity ).text();
};
|
|
|
|
// Debug print with global switch
|
|
/**
 * Debug print. Output is compiled out by the hard-coded `false` switch;
 * flip it to `true` to get console.warn output.
 *
 * @param {*} msg Value to print.
 */
var dp = function ( msg ) {
    if ( false ) {
        console.warn( msg );
    }
};
|
|
|
|
// Pretty-print helper: serialize a value as JSON indented by two spaces.
var pp = function ( s ) {
    var indentWidth = 2;
    return JSON.stringify( s, null, indentWidth );
};
|
|
|
|
/*
 * Annotate a token stream with list items with appropriate list tokens
 *
 * XXX: Move this to a token handler in phase sync23! That way we can
 * support list items from templates too.
 *
 * Start tags named 'list' are dropped. Each 'listItem' start tag is replaced
 * by the list / item start and end tags implied by the difference between
 * its `bullets` and those of the previous item. An end tag named 'list'
 * closes every list that is still open. All other tokens pass through
 * unchanged.
 *
 * @static
 * @method
 * @param {[tokens]} Token stream with li tokens
 * @returns {[tokens]} Token stream, possibly with additional list tokens
 * @throws {String} When a bullet is not one of '*', '#', ';' or ':'
 * */
var annotateList = function ( tokens ) {
    var out = [],      // List of tokens
        bstack = [],   // Bullet stack, previous element's listStyle
        bnext = [],    // Next element's listStyle
        endtags = [];  // Stack of end tags

    // Number of leading bullets shared by x and y.
    var commonPrefixLength = function ( x, y ) {
        var minLength = Math.min( x.length, y.length ),
            i = 0;
        while ( i < minLength && x[i] === y[i] ) {
            i++;
        }
        return i;
    };

    // Open a list and its first item; remember the matching end tags.
    var pushList = function ( listName, itemName ) {
        out.push( new TagTk( listName ) );
        out.push( new TagTk( itemName ) );
        endtags.push( new EndTagTk( listName ) );
        endtags.push( new EndTagTk( itemName ) );
    };

    // Close the n innermost lists (item end tag first, then list end tag).
    var popTags = function ( n ) {
        for ( ; n > 0; n-- ) {
            // push list item..
            out.push( endtags.pop() );
            // and the list end tag
            out.push( endtags.pop() );
        }
    };

    // True if a and b are one ':' and one ';', in either order.
    var isDlDd = function ( a, b ) {
        var ab = [ a, b ].sort();
        return ( ab[0] === ':' && ab[1] === ';' );
    };

    // Emit the tags needed to get from bullet stack bs to bullet stack bn.
    var doListItem = function ( bs, bn ) {
        var prefixLen = commonPrefixLength( bs, bn ),
            changeLen = Math.max( bs.length, bn.length ) - prefixLen,
            itemToken,
            newName,
            i;

        if ( changeLen === 0 ) {
            // Same bullets: close the previous item and open a new one of
            // the same kind.
            itemToken = endtags.pop();
            out.push( itemToken );
            out.push( new TagTk( itemToken.name ) );
            endtags.push( new EndTagTk( itemToken.name ) );
        } else if ( bs.length === bn.length &&
                changeLen === 1 &&
                isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
            // handle dd/dt transitions
            out.push( endtags.pop() );
            newName = ( bn[prefixLen] === ';' ) ? 'dt' : 'dd';
            out.push( new TagTk( newName ) );
            endtags.push( new EndTagTk( newName ) );
        } else {
            // emit close tag tokens for closed lists
            popTags( bs.length - prefixLen );

            if ( prefixLen > 0 && bn.length === prefixLen ) {
                // Back at an outer level: start the next item there.
                itemToken = endtags.pop();
                out.push( itemToken );
                out.push( new TagTk( itemToken.name ) );
                endtags.push( new EndTagTk( itemToken.name ) );
            }

            for ( i = prefixLen; i < bn.length; i++ ) {
                switch ( bn[i] ) {
                    case '*':
                        pushList( 'ul', 'li' );
                        break;
                    case '#':
                        pushList( 'ol', 'li' );
                        break;
                    case ';':
                        pushList( 'dl', 'dt' );
                        break;
                    case ':':
                        pushList( 'dl', 'dd' );
                        break;
                    default:
                        // Report the offending bullet itself. The shared
                        // prefix only covers indexes below prefixLen, so
                        // indexing it with i always produced undefined.
                        throw( "Unknown node prefix " + bn[i] );
                }
            }
        }
    };

    for ( var i = 0, length = tokens.length; i < length; i++ ) {
        var token = tokens[i];
        switch ( token.constructor ) {
            case TagTk:
                switch ( token.name ) {
                    case 'list':
                        // ignore token
                        break;
                    case 'listItem':
                        // convert listItem to list and list item tokens
                        bnext = token.bullets;
                        doListItem( bstack, bnext );
                        bstack = bnext;
                        break;
                    default:
                        // pass through all remaining start tags
                        out.push( token );
                        break;
                }
                break;
            case EndTagTk:
                if ( token.name === 'list' ) {
                    // pop all open list item tokens
                    popTags( bstack.length );
                    bstack = [];
                } else {
                    out.push( token );
                }
                break;
            default:
                out.push( token );
                break;
        }
    }
    return out;
};
|
|
|
|
|
|
/* End static utilities */
|
|
|
|
/*
|
|
* Flags for specific parse environments (inside tables, links etc). Flags
|
|
* trigger syntactic stops in the inline_breaks production, which
|
|
* terminates inline and attribute matches. Flags merely reduce the number
|
|
* of productions needed: The grammar is still context-free as the
|
|
* productions can just be unrolled for all combinations of environments
|
|
* at the cost of a much larger grammar.
|
|
*/
|
|
// Per-flag nesting counters; a truthy count means the flag is active.
var syntaxFlags = {};

/**
 * Increment the counter for a flag, creating it at 1 if it is unset.
 *
 * @param {String} flag Flag name.
 * @returns {Boolean} Always true, so the call can end a `& { ... }`
 *   predicate that must succeed.
 */
var setFlag = function ( flag ) {
    if ( syntaxFlags[flag] !== undefined ) {
        syntaxFlags[flag]++;
    } else {
        syntaxFlags[flag] = 1;
    }
    return true;
};

/**
 * Decrement the counter for a flag.
 *
 * A flag that was never set is left untouched. Decrementing `undefined`
 * would store NaN, and setFlag could never raise that counter again
 * (NaN++ stays NaN).
 *
 * @param {String} flag Flag name.
 * @returns {Boolean} Always false, so the call can end a `& { ... }`
 *   predicate that must fail.
 */
var clearFlag = function ( flag ) {
    if ( syntaxFlags[flag] !== undefined ) {
        syntaxFlags[flag]--;
    }
    return false;
};
|
|
|
|
// Start position of top-level block
|
|
// Could also provide positions for lower-level blocks using a stack.
|
|
var blockStart = 0;
|
|
|
|
// Start position of generic tag production
|
|
var tagStartPos = 0;
|
|
|
|
// Stack of source positions
|
|
/**
 * Named stacks of source positions.
 *
 * push() records a start position under a key; pop() pairs the most recently
 * pushed position for that key with an end position.
 */
var posStack = {
    // key -> array of pushed positions
    positions: {},

    /**
     * @param {String} key Stack name.
     * @param {Number} pos Position to remember.
     * @returns {Boolean} Always true, so the call can serve as a succeeding
     *   predicate.
     */
    push: function ( key, pos ) {
        if ( this.positions[key] === undefined ) {
            this.positions[key] = [ pos ];
        } else {
            this.positions[key].push( pos );
        }
        return true;
    },

    /**
     * @param {String} key Stack name.
     * @param {Number} pos End position.
     * @returns {Array} [ startPos, pos ]
     * @throws {String} When nothing is on the stack for key.
     */
    pop: function ( key, pos ) {
        var pk = this.positions[key];
        if ( pk === undefined || !pk.length ) {
            throw "Tried to pop unknown position for " + key;
        }
        return [ pk.pop(), pos ];
    }
};
|
|
|
|
// cache the input length
|
|
var inputLength = input.length;
|
|
|
|
// pseudo-production that matches at end of input
|
|
/**
 * Pseudo-production that matches at end of input.
 *
 * @param {Number} pos Position to test.
 * @returns {Boolean} True if pos equals the cached input length.
 */
var isEOF = function ( pos ) {
    return pos === inputLength;
};
|
|
|
|
// text start position
|
|
var textStart = 0;
|
|
|
|
// hack to support numbered external links ([http://example.com]).
|
|
// XXX: Move to token stream transform after templates are expanded!
|
|
var linkCount = 1;
|
|
|
|
// Define block-level tags in JS, so we can use toLowerCase to match tags
|
|
// case-independently. This would be quite ugly (and possibly slower) if
|
|
// done manually in the grammar.
|
|
// Lookup table of block-level tag names: each listed (lower-case) name maps
// to true.
var block_names = (function () {
    var names = [
            "p", "table", "td", "tr", "ul", "ol",
            "li", "dl", "dt", "dd", "div", "center",
            "blockquote"
        ],
        bnames = {};
    for ( var i = 0, l = names.length; i < l; i++ ) {
        bnames[names[i]] = true;
    }
    return bnames;
})();
|
|
|
|
var self = this;
|
|
|
|
|
|
}
|
|
|
|
/*********************************************************
|
|
* The top-level production
|
|
*********************************************************/
|
|
|
|
// Entry point: match all top-level blocks (each emits its own token chunk),
// then emit a final chunk holding the EOF token through the callback in
// __parseArgs[2]. The production itself returns an empty list.
start
  = e:toplevelblock* newline* {
        // end is passed inline as a token, as well as a separate event for now.

        // this does not work yet.
        //console.warn('about to emit' + pp(self));
        //self._tokenizer.emit('chunk', [ { type: 'END' } ] );
        //self._tokenizer.emit('end');
        // Append the end (for obvious reasons this should not
        // be part of a stream, only when tokenizing complete
        // texts)
        //console.warn( pp( flatten ( e ) ) );
        cache = {};
        __parseArgs[2]( [ new EOFTk( ) ] );
        return []; //flatten(e);
    }
|
|
|
|
|
|
/*
|
|
* A document (start production) is a sequence of toplevelblocks. Tokens are
|
|
* emitted in chunks per toplevelblock to avoid buffering the full document.
|
|
*/
|
|
// One top-level block. Records the block's start offset, flattens its tokens,
// tags the first token with the [start, end] source range and hands the chunk
// to the callback in __parseArgs[2].
toplevelblock
  = & { blockStart = pos; return true; } b:block {
        b = flatten(b);

        // Add source offsets for round-tripping. XXX: Add these not just for
        // toplevelblocks!
        if ( b.length ) {
            var bs = b[0];
            if ( bs.constructor === String && bs.attribs === undefined ) {
                // Wrap the string in a String object so that properties can
                // be attached to it.
                b[0] = new String( bs );
                bs = b[0];
            }
            if (bs.dataAttribs === undefined) {
                bs.dataAttribs = {};
            }
            bs.dataAttribs.sourcePos = [blockStart, pos];
        }

        // Emit tokens for this toplevelblock. This feeds a chunk to the parser
        // pipeline. b is already flat at this point, so it is passed as-is.
        __parseArgs[2]( b );

        // We don't return any tokens to the start production to save memory. We
        // just emitted them already to our consumers.
        return true;
    }
|
|
|
|
/*
|
|
* The actual contents of each block.
|
|
*/
|
|
block
  = block_lines
  / & '<' r:( pre // tag variant can start anywhere
            / comment &eolf
            / nowiki
            // avoid a paragraph if we know that the line starts with a block tag
            / bt:block_tag { return [bt] }
            ) { return r; }
  / para
  // Inlineline includes generic tags; wrapped into paragraphs in token
  // transform and DOM postprocessor
  / inlineline
  / sol
|
|
|
|
/*
|
|
* A block nested in other constructs. Avoid eating end delimiters for other
|
|
* constructs by checking against inline_breaks first.
|
|
*/
|
|
nested_block = !inline_breaks b:block { return b }
|
|
|
|
/*
|
|
* Line-based block constructs.
|
|
*/
|
|
block_lines
  = s:sol
    // eat an empty line before the block
    s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
    bl:block_line {
        // An unmatched optional group is reported as ''.
        var s2_ = (s2 !== '') ? s2 : [];
        return s.concat(s2_, bl);
    }
|
|
|
|
/*
|
|
* Block structures with start-of-line wiki syntax
|
|
*/
|
|
block_line
  = h
  / & [{}|] tl:table_lines { return tl; }
  / lists
  // tag-only lines should not trigger pre
  / st:optionalSpaceToken
    bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
    &eolf {
        return st.concat(bt);
    }
  / pre_indent
  / pre
|
|
|
|
|
|
/*
|
|
* A paragraph. We don't emit 'p' tokens to avoid issues with template
|
|
* transclusions, <p> tags in the source and the like. Instead, we perform
|
|
* some paragraph wrapping on the DOM.
|
|
*/
|
|
para
  = s1:sol s2:sol c:inlineline {
        // No 'p' token is emitted here (see the comment above).
        return s1.concat(s2, /* [new TagTk('p')],*/ c);
    }
|
|
|
|
br = space* &newline { return new SelfclosingTagTk( 'br' ) }
|
|
|
|
/*
|
|
* Syntax stops: Avoid eating significant tokens for higher-level productions
|
|
* in nested inline productions.
|
|
*
|
|
* Repeated testing of flags is not terribly efficient. See new and faster
|
|
* version below.
|
|
*/
|
|
|
|
/*
|
|
* Syntax stops: Avoid eating significant tokens for higher-level productions
|
|
* in nested inline productions.
|
|
*/
|
|
// Succeeds (consuming nothing) when the next character is one of the
// candidate stop characters and the external inline_breaks function in
// __parseArgs[3] reports a match for the current syntaxFlags.
inline_breaks
  = & [=|!}:\r\n\]<]
    & { // Important hack: disable caching for this production, as the default
        // cache key does not take into account flag states!
        cacheKey = '';
        //console.warn('ilbf: ' + input.substr(pos, 5) );
        return null !== __parseArgs[3].inline_breaks( input, pos, syntaxFlags )
    }
|
|
|
|
// Inline content that may span lines: url text, inline elements, or any
// single character, stopping at an inline break.
inline
  = c:(urltext / (! inline_breaks (inline_element / . )))+ {
        //console.warn('inline out:' + pp(out));
        return flatten_stringlist( c );
    }
|
|
|
|
|
|
// Like inline, but plain characters exclude "\n", so a match stays on one
// line.
inlineline
  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
        return flatten_stringlist( c );
    }
|
|
|
|
// Each alternative is guarded by a one-character lookahead so that the
// expensive sub-productions are only tried when they can possibly match.
inline_element
  = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
    & '<' ( comment / xmlish_tag )
  /// & '{' ( & '{{{{{' template / tplarg / template )
  / & '{' tplarg_or_template
  /// & '{' ( tplarg / template )
  // Eat three opening brackets as text.
  / '[[[' { return '[[[' }
  / & '[' ( wikilink / extlink )
  / & "'" quote
|
|
|
|
/* Headings */
|
|
|
|
// Heading: a run of '=', inline content, a run of '=', then optional spaces
// and comments up to the end of the line. The heading level is the shorter
// of the two runs, capped at 6; surplus '=' on either side become part of
// the heading text.
h = & "=" // guard, to make sure '='+ will match.
    // XXX: Also check to end to avoid inline parsing?
    r:(
        s:'='+ // moved in here to make s accessible to inner action
        & { return setFlag('h'); }
        c:inlineline
        e:'='+
        spc:(sp:space+ { return sp.join('') } / comment)*
        &eolf
        {
            clearFlag('h');
            // HTML only defines h1-h6, so deeper nesting is clamped and the
            // extra '=' are kept as text.
            var level = Math.min( s.length, e.length, 6 ),
                extras;
            // convert surplus equals into text
            if ( s.length > level ) {
                // s and e are lists of matched '=' characters, not strings,
                // so they have to be sliced and joined.
                extras = s.slice( 0, s.length - level ).join( '' );
                if ( c[0].constructor === String ) {
                    c[0] = extras + c[0];
                } else {
                    c.unshift( extras );
                }
            }
            if ( e.length > level ) {
                extras = e.slice( 0, e.length - level ).join( '' );
                if ( c[c.length - 1].constructor === String ) {
                    // Write back into the list; appending to a local copy
                    // of the last element would be lost.
                    c[c.length - 1] += extras;
                } else {
                    c.push( extras );
                }
            }

            return [new TagTk( 'h' + level )]
                    .concat(c, [new EndTagTk( 'h' + level ), spc]);
        }
      / & { /* dp('nomatch exit h'); */ clearFlag('h'); return false } { return null }
    ) { return r }
|
|
|
|
// An HTML comment, closed by '-->' or by the end of input. Further comments
// that follow, separated only by spaces and a newline, are collected into
// the same result list.
comment
  = '<!--' c:comment_chars* ('-->' / eof)
    cs:(space* newline space* cn:comment { return cn })* {
        return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
    }
|
|
|
|
// One character of comment content: anything but '-', or a '-' that does not
// start the closing '-->'.
comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }
|
|
|
|
|
|
|
|
/**************************************************************
|
|
* External (bracketed and autolinked) links
|
|
**************************************************************/
|
|
|
|
// A bare URL in text. Not matched while the 'extlink' flag is active.
urllink
  = ! { return syntaxFlags['extlink'] }
    target:url {
        return [ new TagTk( 'urllink', [new KV('href', target)] ) ];
    }
|
|
|
|
// Bracketed external link: [target optional text]. The 'extlink' flag is set
// while the link is parsed and cleared on both the success and failure
// paths. A link without text gets a running number as its content.
extlink
  = ! { return syntaxFlags['extlink'] } // extlink cannot be nested
    (
        "["
        & { return setFlag('extlink'); }
        //target:urllink
        target:extlink_preprocessor_text
        text:(( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
              t:inlineline { return t } )?
        "]" {
            clearFlag('extlink');
            if ( text === '' ) {
                // XXX: Link numbering should be implemented in post-processor.
                text = [ "[" + linkCount + "]" ];
                linkCount++;
            }
            //console.warn( 'extlink text: ' + pp( text ) );
            return [
                new SelfclosingTagTk( 'extlink', [
                    new KV('href', target),
                    new KV('content', text)
                ] )
            ];
        }
      // Failure path: undo the setFlag above, then fail.
      / "[" & { clearFlag('extlink'); return false; }
    )
|
|
|
|
/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
 * be configured dynamically. */
|
|
|
|
url_chars = [/fghimnstw]
|
|
|
|
url_protocol
  = '//' // for protocol-relative URLs
  / 'ftp://'
  / 'git://'
  / 'gopher://'
  / 'http://'
  / 'https://'
  / 'irc://'
  / 'ircs://' // @bug 28503
  / 'mailto:'
  / 'mms://'
  / 'news:'
  / 'nntp://' // @bug 3808 RFC 1738
  / 'svn://'
  / 'telnet://' // Well if we're going to support the above.. -ævar
  / 'worldwind://'
|
|
|
|
// javascript does not support unicode features..
|
|
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
|
|
|
|
|
|
// A single %XX escape, decoded with decodeURI.
// NOTE(review): decodeURI leaves escapes of reserved URI characters (such as
// %2F) encoded, and this production is currently only referenced from a
// commented-out alternative in `url` — confirm the intended decoding before
// enabling it.
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
    try {
        return decodeURI("%" + c0 + c1);
    } catch ( e ) {
        // Reject the match, and allow other fall-back productions to have a
        // go at it.
        return null;
    }
}
|
|
|
|
//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
|
|
|
|
no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
|
|
|
|
// A URL: protocol, optional literal IP address, then one or more URL
// characters. '.', ':' and ',' are only accepted when not directly followed
// by a space or the end of the line. Returns the URL as a single string.
url
  = proto:url_protocol
    addr:( ipv6_address / ipv4_address )?
    rest:( ( !inline_breaks
             c:no_punctuation_char
             { return c }
           )
         / s:[.:,] !(space / eolf) { return s }
         / htmlentity
         /// urlencoded_char
         / [&%] )+
    {
        return proto + addr + rest.join('');
    }
|
|
|
|
// Dotted-quad address, returned as a string.
// NOTE(review): every digit group is `*`, so empty groups (e.g. "...") are
// accepted too — confirm whether `+` was intended.
ipv4_address
  = a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*)
    {
        return flatten( a ).join('');
    }
|
|
|
|
// Bracketed address made of seven ':'-separated digit groups, returned as a
// string.
// NOTE(review): groups only accept decimal digits (no a-f) and the group
// count is fixed — verify against the IPv6 literals that should be supported.
ipv6_address
  = a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']')
    {
        return flatten( a ).join('');
    }
|
|
|
|
|
|
/**************************************************************
|
|
* Templates, -arguments and wikilinks
|
|
**************************************************************/
|
|
|
|
/*
|
|
* Precedence: template arguments win over templates. See
|
|
* http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
|
|
* 4: {{{{·}}}} → {·{{{·}}}·}
|
|
* 5: {{{{{·}}}}} → {{·{{{·}}}·}}
|
|
* 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
|
|
* 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
|
|
*/
|
|
tplarg_or_template = & '{{{{{' template / tplarg / template
|
|
|
|
// {{target|param|...}} — emitted as a self-closing 'template' token whose
// first attribute (empty key) carries the target.
template
  = "{{" target:template_param_text
    params:(newline? "|" newline? p:template_param { return p })*
    "}}" {
        // Insert target as first positional attribute, so that it can be
        // generically expanded. The TemplateHandler then needs to shift it out
        // again.
        params.unshift( { k: '', v: flatten( target ) } );
        var obj = new SelfclosingTagTk( 'template', params );
        //console.warn( 'tokenizer template ' + JSON.stringify( target ));
        return obj;
    }
|
|
|
|
// XXX: support template and args in target!
|
|
//template_target
|
|
// = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }
|
|
|
|
// {{{name|default|...}}} — emitted as a self-closing 'templatearg' token
// whose first attribute (empty key) carries the argument name.
tplarg
  = "{{{"
    name:template_param_text
    params:( newline? "|" newline? p:template_param { return p })*
    "}}}" {
        name = flatten( name );
        params.unshift( { k: '', v: name } );
        var obj = new SelfclosingTagTk( 'templatearg', params );
        //console.warn( 'tokenizer tplarg ' + JSON.stringify( obj, null, 2 ));
        return obj;
    }
|
|
|
|
// One template parameter, returned as a KV:
//   name = value  -> KV( name, value )
//   name =        -> KV( name, [] )
//   value         -> KV( [], value )   (positional)
//   (nothing)     -> KV( [], [] )      (empty parameter)
template_param
  = name:template_param_name
    s0:space*
    eq:"="?
    s1:space*
    value:template_param_text?

    {
        //console.warn( 'named template_param matched' + pp([name, value ]) );
        if ( value !== '' ) {
            return new KV( name, flatten( value ) );
        } else if ( eq !== '' ) {
            return new KV(flatten( name ), []);
        } else {
            return new KV([], flatten(name));
        }
    }
  // empty parameter
  / & [|}] { return new KV([], []); }
|
|
|
|
|
|
// FIXME: handle template args and templates in key! (or even parser functions?)
|
|
// Parameter name: template text parsed with the 'equal' flag set. The flag is
// cleared again on both the success and the failure path.
template_param_name
  = & { return setFlag( 'equal' ) }
    tpt:template_param_text
    {
        clearFlag( 'equal' );
        //console.warn( 'template param name matched: ' + pp( tpt ) );
        return tpt;
    }

  / & { return clearFlag( 'equal' ) }
|
|
//= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
|
|
|
|
// Inline content parsed with the 'template' flag set. The flag is cleared
// again on both the success and the failure path.
template_param_text
  = & { return setFlag('template') }
    il:inline {
        clearFlag('template');
        //console.warn( 'tpt match: ' + pp (il));
        return il;
    }
  / & { return clearFlag('template'); }
|
|
|
|
// TODO: handle link prefixes as in al[[Razi]]
|
|
// [[target|content...]]tail — emitted as a self-closing 'wikilink' token.
// Source positions of the whole link and of its content are tracked on
// posStack; every push is matched by a pop on both the success and the
// failure path.
wikilink
  = & { return posStack.push('wikilink' , pos); }
    "[["
    ! url
    //target:link_target
    // XXX: disallow pipe!
    target:wikilink_preprocessor_text
    lcontent:(
        & { return posStack.push('lcontent' , pos); }
        lcs:( "|" lt:link_text { return new KV( '', lt ); } )+ {
            return { pos: posStack.pop('lcontent' , pos), content: lcs };
        }

      / {
            return { pos: posStack.pop('lcontent' , pos), content: [] };
        }
    )
    "]]"
    // XXX In real MediaWiki, this is a language-dependent positive character
    // class. Can we work out a static negative class instead?
    // XXX: Exclude uppercase chars from non-latin languages too!
    tail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
        var obj = new SelfclosingTagTk( 'wikilink' );
        obj.attribs.push( new KV('', target) );
        obj.dataAttribs = {
            sourcePos: posStack.pop( 'wikilink', pos ),
            contentPos: lcontent.pos
        };
        // XXX: Point to object with path, revision and input information
        obj.source = input;

        //console.warn('lcontent: ' + JSON.stringify( lcontent, null, 2 ) );
        // Deal with content. XXX: Properly support pipe-trick etc
        //lcontent.tail = tail && tail.join('') || '';

        obj.attribs = obj.attribs.concat( lcontent.content );
        obj.attribs.push( new KV( 'tail', tail && tail.join('') || '' ) );
        return [obj];
    }
  // Failure path: drop the position pushed above. pop() returns an array, so
  // the negated predicate fails and the production does not match.
  / ! { return posStack.pop( 'wikilink', pos ); }
|
|
|
|
// Link description: inline content parsed with the 'linkdesc' flag set. The
// flag is cleared again on both the success and the failure path.
link_text
  = & { return setFlag('linkdesc'); }
    h:inline
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline)?
    {
        //console.warn('link_text' + pp(h) + pp(hs));
        clearFlag('linkdesc');
        if( hs !== '' ) {
            return h.concat(hs);
        } else {
            return h;
        }
    }
  / & { return clearFlag('linkdesc'); }
|
|
|
|
// Same as link_text, but additionally sets the 'pipe' flag while matching.
// Both flags are cleared again on the success and the failure path.
link_option
  = & { setFlag('pipe'); return setFlag('linkdesc'); }
    h:inline
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline)?
    {
        //console.warn('link_option' + pp(h) + pp(hs));
        clearFlag('pipe');
        clearFlag('linkdesc');
        if( hs !== '' ) {
            return h.concat(hs);
        } else {
            return h;
        }
    }
  / & { clearFlag('pipe'); return clearFlag('linkdesc'); }
|
|
|
|
link_end = "]]"
|
|
|
|
/* Generic quote production for italic and bold, further processed in a token
|
|
* stream transformation in doQuotes. Relies on NlTk tokens being emitted
|
|
* for each line of text to balance quotes per line.
|
|
*
|
|
* We are not using a simple pair rule here as we need to support mis-nested
|
|
* bolds/italics and MediaWiki's special heuristics for apostrophes, which are
|
|
* all not context free. */
|
|
// Two or more apostrophes, emitted as an 'mw-quote' start tag whose `value`
// holds the matched apostrophes.
quote = "''" x:"'"* {
    var res = new TagTk( 'mw-quote' ); // Will be consumed in token transforms
    res.value = "''" + x.join('');
    return res;
}
|
|
|
|
|
|
/**
|
|
* Image option productions, only called from the LinkHandler token stream
|
|
* transformer, and only for images.
|
|
*/
|
|
// All image options, returned as an object mapping option key to value. The
// flat list of option KVs is also kept under `_options`. The 'pipe' flag is
// set while the options are parsed.
img_options =
    & { return setFlag( 'pipe' ); }
    os:img_option* {
        clearFlag( 'pipe' );
        var options = {};
        os = flatten( os );
        for ( var i = 0, l = os.length; i < l; i++ ) {
            var o = os[i];
            options[o.k] = o.v;
        }
        options._options = os;
        return options;
    }
  / & { return clearFlag( 'pipe' ); }
|
|
|
|
// A single '|'-prefixed image option. Anything that is not a recognised
// option is treated as the caption.
img_option
  = "|" space*
    o:(
        img_attribute
      / img_format
      / img_dimensions
      / img_halign
      / img_valign
      / img_link
      / lt:link_text { return new KV('caption', lt) }
    )
    space* {
        return o
    };
|
|
|
|
// Longer keywords are listed before their prefixes ('frameless' before
// 'frame', 'thumbnail' before 'thumb') because alternatives are tried in
// order.
img_format
  = f:( 'border' / 'frameless' / 'frame' / 'thumbnail' / 'thumb' ) {
        return new KV( 'format', f );
    }
|
|
|
|
// '<width>px', 'x<height>px', '<width>x<height>px' or 'upright'.
img_dimensions
  = x:(n:[0-9]+ { return n.join('') })? y:('x' n:[0-9]+ { return n.join('') })? 'px' {
        if ( x === '' && y ) {
            return new KV( 'height', y );
        } else if ( y === '' && x ) {
            return new KV( 'width', x );
        } else {
            return [ new KV( 'width', x ), new KV( 'height', y ) ];
        }
    }
  / 'upright' { return [ new KV( 'width', 'upright' ) ] }
|
|
|
|
// Horizontal alignment keyword.
img_halign
  = a:( 'left' / 'right' / 'center' / 'none' ) {
        return new KV( 'halign', a );
    }
|
|
|
|
// Vertical alignment keyword.
img_valign
  = a:( 'baseline' / 'sub' / 'super' / 'top' / 'text-top'
      / 'middle' / 'bottom' / 'text-bottom' ) {
        return new KV( 'valign', a );
    }
|
|
|
|
// External link targets are already parsed as link tokens. If we can make
|
|
// sure that those cleanly convert back to the original text, then we could
|
|
// re-parse them here.
|
|
// 'link=<url>' image option. The 'pipe' flag is cleared once the url has
// been tried, whether or not it matched.
img_link
  = 'link=' space*
    u:(
        t:url {
            clearFlag( 'pipe' );
            return t;
        }
      / & { return clearFlag( 'pipe' ); }
    )
    {
        return new KV( 'link', u );
    }
|
|
|
|
// 'key=value' image option for the keys listed below; the value is optional.
img_attribute
  = k:( 'page' / 'alt' / 'thumbnail' / 'thumb' ) '=' t:preprocessor_text? {
        return new KV( k, t );
    }
|
|
|
|
|
|
|
|
/***********************************************************
|
|
* Pre and xmlish tags
|
|
***********************************************************/
|
|
|
|
// Indented pre blocks differ from their non-indented (purely tag-based)
|
|
// cousins by having their contents parsed.
|
|
// Either an indented block wrapped in explicit pre tags, or one or more
// indented lines that get wrapped in pre tokens here.
pre_indent
  = pre_indent_in_tags
  / l:pre_indent_line ls:(sol pre_indent_line)* {
        return [new TagTk( 'pre' )]
            .concat( [l], ls
                   , [new EndTagTk( 'pre' )]);
    }
|
|
|
|
// An indented pre block that is surrounded with pre tags. The pre tags are
|
|
// used directly.
|
|
// The 'pre' flag is set while the content is parsed and cleared again on
// both the success and the failure path.
pre_indent_in_tags
  = space+ // XXX: capture space for round-tripping
    "<pre"
    attribs:generic_attribute*
    ">"
    & { return setFlag('pre'); }
    l:inlineline
    ls:(sol pre_indent_line)*
    "</pre>"
    {
        clearFlag('pre');
        return [ new TagTk( 'pre', attribs ) ]
            .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );
    }
  / & { return clearFlag('pre'); }
|
|
|
|
// One indented line: the leading space is dropped and a newline is put in
// front of the line's inline content.
pre_indent_line = space l:inlineline {
    return [ '\n' ].concat(l);
}
|
|
|
|
/*
|
|
* Pre blocks defined using non-indented HTML tags only parse nowiki tags
|
|
* inside them, and convert other content to verbatim text. Nowiki inside pre
|
|
* is not functionally needed, but supported for backwards compatibility.
|
|
*/
|
|
// Tag-based pre block, closed by '</pre>' or by the end of input. A stray
// '</pre>' on its own is returned as text.
pre
  = "<pre"
    attribs:generic_attribute*
    ">"
    // MediaWiki <pre> is special in that it converts all pre content to plain
    // text.
    ts:(t1:[^<]+ { return t1.join('') }
        / nowiki
        / !"</pre>" t2:. { return t2 })+
    ("</pre>" / eof) {
        // return nowiki tags as well?
        //console.warn('inpre');
        return [ new TagTk( 'pre', attribs ) ]
            .concat(ts, [ new EndTagTk( 'pre' ) ]);
    }
  / "</pre>" { return "</pre>"; }
|
|
|
|
/* XXX: Extension tags can require a change in the tokenizer mode, which
|
|
* returns any text between extension tags verbatim. For now, we simply
|
|
* continue to parse the contained text and return the tokens. The original
|
|
* input source can be recovered from the source positions added on tag
|
|
* tokens. This won't however work in all cases. For example, a comment start
|
|
* (<!--) between extension tags would cause the remaining text to be consumed
|
|
* as a comment. To avoid this, we might need to look ahead for the end tag
|
|
* and limit the content parsing to this section. */
|
|
|
|
xmlish_tag = nowiki / generic_tag
|
|
|
|
/*
|
|
* Nowiki treats anything inside it as plain text. It could thus also be
|
|
* defined as an extension that returns its raw input text, possibly wrapped
|
|
* in a span for round-trip information. The special treatment for nowiki in
|
|
* pre blocks would still remain in the grammar though, so overall handling it
|
|
* all here is cleaner.
|
|
*/
|
|
// A balanced <nowiki>...</nowiki> pair yields its content as text. An
// unbalanced opening or closing tag is returned literally.
nowiki
  = "<nowiki>" nc:nowiki_content "</nowiki>" {
        //console.warn( 'full nowiki return: ' + pp(nc));
        return nc;
    }
  / "<nowiki>" {
        //console.warn('nowiki fallback');
        return ['<nowiki>'];
    }
  / "</nowiki>" {
        //console.warn('nowiki end fallback');
        return ['</nowiki>'];
    }
|
|
|
|
// Raw text up to (not including) the next '</nowiki>' or '</pre>'. A nested
// <pre ...>...</pre> is copied through as text. Returns a one-element list
// holding the joined string.
nowiki_content
  = ts:(   t:[^<]+ { return t.join('') }
         / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
                //console.warn('nested pre in nowiki');
                return ["<pre"].concat(p0, p1, [">"], p2, ["</pre>"]).join('');
            }
         / (!("</"( "nowiki>" / "pre>")) c:. {
                //console.warn('nowiki: single char' + c);
                return c;
            })
    )* {
        // return nowiki tags as well?
        //console.warn('nowiki_content: return' + pp(ts));
        return [ts.join('')];
    }
|
|
|
|
|
|
// The list of HTML5 tags, mainly used for the identification of non-html
|
|
// tags. These terminate otherwise tag-eating productions (see list below) in
|
|
// order to support potential extension tags:
|
|
// * comment
|
|
// * pre
|
|
// * nowiki
|
|
// NOTE(review): alternatives are tried in order and nothing here checks for
// the end of the name, so a short name matches as a prefix of a longer one
// (e.g. "a" succeeds on input starting with "abbr", and "s" on any name
// starting with "s"). Confirm that callers only need a prefix test.
html5_tagnames
  = "a" / "abbr" / "address" / "area" / "article"
  / "aside" / "audio" / "b" / "base" / "bdi" / "bdo" / "blockquote"
  / "body" / "br" / "button" / "canvas" / "caption" / "cite" / "code"
  / "col" / "colgroup" / "command" / "data" / "datalist" / "dd" / "del"
  / "details" / "dfn" / "div" / "dl" / "dt" / "em" / "embed" / "fieldset"
  / "figcaption" / "figure" / "footer" / "form"
  / "h1" / "h2" / "h3" / "h4" / "h5" / "h6" / "head" / "header" / "hgroup"
  / "hr" / "html" / "i" / "iframe" / "img" / "input" / "ins" / "kbd" / "keygen"
  / "label" / "legend" / "li" / "link" / "map" / "mark" / "menu" / "meta"
  / "meter" / "nav" / "noscript" / "object" / "ol" / "optgroup" / "option"
  / "output" / "p" / "param" / "pre" / "progress" / "q" / "rp" / "rt"
  / "ruby" / "s" / "samp" / "script" / "section" / "select" / "small"
  / "source" / "span" / "strong" / "style" / "sub" / "summary" / "sup"
  / "table" / "tbody" / "td" / "textarea" / "tfoot" / "th" / "thead" / "time"
  / "title" / "tr" / "track" / "u" / "ul" / "var" / "video" / "wbr"
|
|
|
|
html_oldnames = "center" / "font" / "tt"
|
|
|
|
// Simple match on non-html endtags, for use in inline_breaks
|
|
// An end tag whose name does not start with one of the known HTML tag names.
nonhtml_endtag
  = '</'
    ! ( html5_tagnames / html_oldnames )
    [^ >]+
    ( space / newline ) *
    [^>]*
    '>'
|
|
|
|
/* Generic XML-like tags
|
|
*
|
|
* These also cover extensions (including Cite), which will hook into the
|
|
* token stream for further processing. The content of extension tags is
|
|
* parsed as regular inline, but the source positions of the tag are added
|
|
* to allow reconstructing the unparsed text from the input. */
|
|
|
|
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
|
|
// following paragraphs
|
|
generic_tag
  = "<"
    // Remember the position just past the "<" so that the source range of
    // the tag can be recorded below.
    & { tagStartPos = pos; return true; }
    end:"/"? name:[0-9a-zA-Z]+
    attribs:generic_newline_attribute*
    ( space / newline )*
    selfclose:"/"?
    ">" {
        // Pick the token type: a leading "/" wins over a trailing "/".
        var TokenCtor = TagTk;
        if ( end != '' ) {
            TokenCtor = EndTagTk;
        } else if ( selfclose != '' ) {
            TokenCtor = SelfclosingTagTk;
        }
        var tok = new TokenCtor( name.join(''), attribs );

        if ( tok.dataAttribs === undefined ) {
            tok.dataAttribs = {};
        }
        // Source range of the tag: from the "<" up to the current position.
        tok.dataAttribs.sourceTagPos = [tagStartPos - 1, pos];
        return tok;
    }
|
|
|
|
// A generic attribute that can span multiple lines: optional whitespace or
// newlines, an attribute name, and an optional "=value" part. Yields a KV
// pair; the value is '' when no value part matched.
generic_newline_attribute
  = s:( space / newline )*
    name:generic_attribute_name
    value:( ( space / newline )* v:generic_attribute_newline_value { return v } )?
    {
        // An unmatched optional value is simply passed through.
        return new KV( name, value );
    }
|
|
|
|
// A single-line attribute: optional spaces, an attribute name, and an
// optional "=value" part. Yields a KV pair; the value is '' when no value
// part matched.
generic_attribute
  = s:space*
    name:generic_attribute_name
    value:( space* v:generic_attribute_value { return v } )?
    {
        // FIXME: name might just be a template, which can expand to a
        // key-value pair later. We'll need to handle that in the
        // AttributeTransformManager.
        return new KV( name, value );
    }
|
|
|
|
// ( Replaced by generic_attribute_name for template / parameter support. )
|
|
//// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
|
|
//// disallow newlines, | and {.
|
|
//generic_attribute_plain_name
|
|
// = n:[^ \t\0/"'>=\n|{]+ {
|
|
// return n.join('');
|
|
// }
|
|
|
|
// Attribute name with template / parameter support. The 'equal' flag is set
// while the name is parsed and cleared again afterwards, both on success and
// (through the second alternative) on failure.
generic_attribute_name
  = & { return setFlag( 'equal' ) }
    // Do not mistake the end of a self-closing tag for a name.
    ! '/>'
    name:attribute_preprocessor_text_line
    { clearFlag( 'equal' ); return name; }
  / & { return clearFlag( 'equal' ) }
|
|
|
|
// The "=value" part of a generic attribute; whitespace and newlines are
// allowed between the "=" and the value.
generic_attribute_newline_value
  = "=" ( space / newline )* v:xml_att_value { return v; }
|
|
// The "=value" part of a single-line attribute; only spaces are allowed
// between the "=" and the value.
generic_attribute_value
  = "=" space* v:att_value { return v; }
|
|
|
|
// Attribute value: single-quoted, double-quoted or unquoted. The quoted
// variants can span multiple lines; the quotes are not part of the result.
xml_att_value
  = "'" sq:attribute_preprocessor_text_single "'" { return sq; }
  / '"' dq:attribute_preprocessor_text_double '"' { return dq; }
  / attribute_preprocessor_text
|
|
|
|
// Attribute value restricted to a single line: single-quoted, double-quoted
// or unquoted. The quotes are not part of the result.
att_value
  = "'" sq:attribute_preprocessor_text_single_line "'" { return sq; }
  / '"' dq:attribute_preprocessor_text_double_line '"' { return dq; }
  / attribute_preprocessor_text_line
|
|
|
|
/*
|
|
* A variant of generic_tag, but also checks if the tag name is a block-level
|
|
* tag as defined in
|
|
* http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and following
|
|
* paragraphs.
|
|
*/
|
|
block_tag
  = "<" end:"/"?
    name:( cs:[a-zA-Z]+ { return cs.join('') } )
    attribs:generic_newline_attribute*
    ( space / newline )*
    selfclose:"/"?
    ">" {
        // Abort the match unless the (case-insensitive) name is block-level.
        if ( block_names[name.toLowerCase()] !== true ) {
            return null;
        }

        // A leading "/" wins over a trailing "/".
        var TokenCtor = TagTk;
        if ( end != '' ) {
            TokenCtor = EndTagTk;
        } else if ( selfclose != '' ) {
            TokenCtor = SelfclosingTagTk;
        }
        return [ new TokenCtor( name, attribs ) ];
    }
|
|
|
|
|
|
/*********************************************************
|
|
* Lists
|
|
*********************************************************/
|
|
// One or more consecutive list lines. The flattened item tokens are wrapped
// in a 'list' start / end token pair and handed to annotateList.
lists = e:(dtdd / li) es:(sol (dtdd / li))*
{
    var listStart = [ new TagTk( 'list' ) ],
        items = flatten( [e].concat(es) );
    return annotateList( listStart.concat( items, [ new EndTagTk( 'list' ) ] ) );
}
|
|
|
|
// A single list line: one or more bullet characters, optional inline
// content, then end of line / input (not consumed). Yields the 'listItem'
// start token, with the bullet characters attached, followed by the content.
li = bullets:list_char+
     c:inlineline?
     &eolf
{
    var item = new TagTk( 'listItem' );
    item.bullets = bullets;
    // An unmatched optional content is replaced by an empty list.
    return [ item, ( c === '' ) ? [] : c ];
}
|
|
|
|
// Single-line definition list item of the form
//   <bullets>;term:definition
// Yields a 'listItem' token for the term (bullets plus ";"), the term
// content, a 'listItem' token for the definition (bullets plus ":") and the
// definition content.
dtdd
  // Collect only the bullet characters. Without the inner action each
  // element would be a [predicate result, char] pair, and string
  // concatenation on that nested array produces comma-joined garbage.
  = bullets:(!(";" !list_char) lc:list_char { return lc; })*
    ";"
    & {return setFlag('colon');}
    c:inlineline
    ":"
    // Fortunately dtdds cannot be nested, so we can simply set the flag
    // back to 0 to disable it.
    & {syntaxFlags['colon'] = 0; return true;}
    d:inlineline
    &eolf {
        // Convert trailing space into a non-breaking space (U+00A0)
        // XXX: This should be moved to a serializer
        //var clen = c.length;
        //if (clen && c[clen - 1].constructor === String) {
        //    var val = c[clen - 1].value;
        //    if(val.length && val[val.length - 1] == ' ') {
        //        c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
        //    }
        //}

        // Keep bullets as an array of single characters, as the li
        // production does; concat() leaves the shared prefix untouched.
        var li = new TagTk( 'listItem' );
        li.bullets = bullets.concat( [ ';' ] );
        var li2 = new TagTk( 'listItem' );
        li2.bullets = bullets.concat( [ ':' ] );
        return [ li ].concat( c, [ li2 ], d );
    }
  // Fall-back case to clear the colon flag
  / & { return true; } { syntaxFlags['colon'] = 0; return null; }
|
|
|
|
|
|
// A single list bullet character, as used by the li and dtdd productions.
list_char = [*#:;]
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
* Tables
|
|
*
|
|
* Table productions are geared to support independent parsing of fragments in
|
|
* templates (the common table start / row / table end use case). The tokens
|
|
* produced by these fragments then match up to a table while building the
|
|
* DOM tree. For similar reasons, table rows do not emit explicit end tag
|
|
* tokens.
|
|
*
|
|
* The separate table_lines production is faster than moving those productions
|
|
* directly to block_lines.
|
|
*********************************************************************/
|
|
|
|
// One or more consecutive table lines. The 'table' flag is set while the
// lines are parsed and cleared afterwards, both on success and (through the
// second alternative) on failure.
table_lines
  = & { return setFlag('table'); }
    first:table_line
    rest:( s:sol tl2:table_line { return s.concat(tl2); } )* {
        clearFlag('table');
        return first.concat( rest );
    }
  / & { return clearFlag('table'); }
|
|
|
|
// A single line of table markup.
// This production assumes start-of-line position!
table_line
  = table_start_tag / table_row_tag
  / table_data_tags / table_heading_tags
  / table_caption_tag / table_end_tag
|
|
|
|
// Table start "{|" with optional attributes. A table end may follow on the
// same line.
table_start_tag
  = "{|"
    ta:generic_attribute*
    space*
    te:table_end_tag? // can occur somewhere in the middle of the line too
    {
        var tableTk = new TagTk( 'table' );
        if ( ta ) {
            tableTk.attribs = ta;
        }
        var toks = [ tableTk ];
        // Append the end token(s) when a table end matched on this line.
        return te ? toks.concat( te ) : toks;
    }
|
|
|
|
// Table caption line "|+ ...": the inline content wrapped in a 'caption'
// start / end token pair.
table_caption_tag
  = "|+" c:inline* {
        var open = [ new TagTk( 'caption' ) ],
            close = [ new EndTagTk( 'caption' ) ];
        return open.concat( c, close );
    }
|
|
|
|
|
|
// Table row line "|-" with optional attributes. No explicit end token is
// emitted for the row.
table_row_tag
  = "|-"
    a:generic_attribute*
    space*
    // handle tables with missing table cells after a row
    td:( s:sol ![|!] tdt:table_data_tag { return s.concat(tdt); } )?
    {
        // We rely on our tree builder to close the row as needed. This is
        // needed to support building tables from fragment templates with
        // individual cells or rows.
        var rowToks = [ new TagTk( 'tr', a ) ];
        return td ? rowToks.concat( td ) : rowToks;
    }
|
|
|
|
// A line of table cells: a pipe and a first cell, followed by any number of
// "||"-separated cells on the same line.
table_data_tags
  = pipe
    first:table_data_tag
    more:( "||" cell:table_data_tag { return cell } )* {
        return first.concat( more );
    }
|
|
|
|
// A single table cell: optional cell attributes followed by nested block
// content, wrapped in a 'td' start / end token pair.
table_data_tag
  = // "}", "+" and "-" directly after the pipe belong to other table syntax
    ! [}+-]
    a:table_cell_args?
    // use inline_breaks to break on tr etc
    td:nested_block*
    {
        // An unmatched optional attribute list becomes an empty list.
        var attribs = ( a == '' ) ? [] : a;
        var open = [ new TagTk( 'td', attribs ) ];
        return open.concat( td, [ new EndTagTk ( 'td' ) ] );
    }
|
|
|
|
// A line of table heading cells: "!" and a first heading, followed by any
// number of "!!"-separated headings on the same line.
table_heading_tags
  = "!"
    first:table_heading_tag
    more:( "!!" heading:table_heading_tag { return heading } )* {
        return first.concat( more );
    }
|
|
|
|
// A single table heading cell: optional cell attributes followed by inline
// content, wrapped in a 'th' start / end token pair.
table_heading_tag
  = a:table_cell_args?
    c:inline {
        // An unmatched optional attribute list becomes an empty list.
        var attribs = ( a == '' ) ? [] : a;
        var open = [ new TagTk( 'th', attribs ) ];
        return open.concat( c, [new EndTagTk( 'th' )] );
    }
|
|
|
|
// Table end "|}": yields a one-element list holding the 'table' end token.
table_end_tag
  = "|}" { return [ new EndTagTk( 'table' ) ]; }
|
|
|
|
// Cell attributes: one or more attributes terminated by a single "|" (a
// double "||" starts the next cell instead). The 'tableCellArg' flag is set
// while the attributes are parsed and cleared afterwards, both on success
// and (through the second alternative) on failure.
table_cell_args
  = & { return setFlag('tableCellArg'); }
    as:generic_attribute+ space* "|" !"|"
    { clearFlag('tableCellArg'); return as; }
  / & { return clearFlag('tableCellArg'); }
|
|
|
|
|
|
|
|
/* Tables, full-table version (no support for table fragments from templates)
|
|
* Only left here for comparison purposes with the above productions. Remove
|
|
* when those work as intended! */
|
|
table
  = tas:table_start space* c:table_caption? b:table_body? te:table_end {
        var tableTk = new TagTk( 'table' ),
            // An unmatched optional body becomes an empty list.
            rows = ( b !== '' ) ? b : [],
            caption = [];

        if ( tas.length > 0 ) {
            // FIXME: actually parse and build structure
            //tableTk.attribs = [new KV('data-unparsed', tas.join(''))];
            tableTk.attribs = tas;
        }

        if ( c != '' ) {
            // NOTE(review): the tokens returned by table_end (te) are
            // appended right after the caption, and are dropped entirely
            // when there is no caption -- confirm this is intended.
            caption = [ new TagTk( 'caption' ) ]
                .concat( c, [ new EndTagTk( 'caption' ) ], te );
        }

        return [ tableTk ].concat( caption, rows, [ new EndTagTk( 'table' ) ] );
    }
|
|
|
|
// Start of a full table: "{" plus a pipe, followed by optional attributes.
// The 'table' flag is set before the attributes are parsed; the second
// alternative clears it again and then fails.
table_start
  = "{" pipe
    res:(
        & { setFlag('table'); return true; }
        ta:generic_attribute*
        { return ta; }
      / & { clearFlag('table'); return false; } { return null; }
    ) { return res }
|
|
|
|
// Caption of a full table: a newline, a pipe, "+" and inline content. The
// newline token(s) are followed by the caption content in the result.
table_caption
  = n:newlineToken pipe "+" c:inline* { return n.concat( c ); }
|
|
|
|
// Body of a full table: either an implicit first row followed by explicit
// rows, or explicit rows only.
table_body
  = firstrow:table_firstrow otherrows:table_row* {
        return [ firstrow ].concat( otherrows );
    }
  / otherrows:table_row* { return otherrows; }
|
|
|
|
// Implicit first row of a full table: one or more cells or headings wrapped
// in a 'tr' start / end token pair.
table_firstrow
  = td:(table_data / table_heading)+ {
        var open = [ new TagTk( 'tr' ) ],
            close = [ new EndTagTk( 'tr' ) ];
        return open.concat( td, close );
    }
|
|
|
|
// Explicit row of a full table: a newline, a pipe and "-", optional row
// attributes terminated by a single pipe, then any number of cells or
// headings. Yields the newline token(s) followed by the row wrapped in a
// 'tr' start / end token pair.
table_row
  = n:newlineToken
    pipe "-"
    a:(as:generic_attribute+ space* pipe !pipe { return as } )?
    space*
    td:(table_data / table_heading)* {
        // Attach the parsed row attributes to the tr token instead of
        // discarding them; an unmatched optional becomes an empty list.
        var attribs = a ? a : [];
        return n.concat([ new TagTk( 'tr', attribs )]
                        , td, [ new EndTagTk( 'tr' )]);
    }
|
|
|
|
// Cell of a full table: introduced either by "||" on the same line or by a
// newline (optionally followed by a pipe), then optional attributes
// terminated by a single pipe, then nested block content. Yields any newline
// token(s) followed by the content wrapped in a 'td' start / end token pair.
table_data
  = n:( "||" { return [] } / nt:newlineToken (pipe / !'!') { return nt } )
    ! [}+-]
    a:( as:generic_attribute+ space* pipe !pipe { return as } )?
    // use inline_breaks to break on tr etc
    td:nested_block* {
        // An unmatched optional attribute list becomes an empty list.
        var attribs = ( a == '' ) ? [] : a;
        return n.concat( [ new TagTk( 'td', attribs ) ]
                        , td, [ new EndTagTk( 'td' ) ] );
    }
|
|
|
|
// Heading cell of a full table: introduced either by "!!" on the same line
// or by a newline followed by "!", then optional attributes terminated by a
// single pipe, then inline content. Yields any newline token(s) followed by
// the content wrapped in a 'th' start / end token pair.
table_heading
  = n:("!!" { return [] } / nl:newlineToken "!" { return nl })
    // Allow spaces before the terminating pipe, as table_data, table_row
    // and table_cell_args do.
    a:(as:generic_attribute+ space* pipe !pipe { return as } )?
    c:inline {
        // An unmatched optional attribute list becomes an empty list.
        var attribs = ( a == '' ) ? [] : a;
        return n.concat( [ new TagTk( 'th', attribs )]
                        , c, [ new EndTagTk( 'th' ) ]);
    }
|
|
|
|
// Raw cell attribute text: runs of text or selected punctuation characters
// (outside of inline breaks), terminated by a single pipe.
// In particular, do not match [|\n]
thtd_attribs
  = a:( text
      / ! inline_breaks c:[="':;/,. -] { return c }
      )+
    pipe ! pipe { return a; }
|
|
|
|
|
|
// End of a full table: an optional newline followed by a pipe and "}", or by
// the end of input. Clears the 'table' flag and yields the newline token(s),
// or an empty list when there was no newline.
table_end
  = nt:newlineToken? ( pipe "}" / eof ) {
        clearFlag('table');
        return nt ? nt : [];
    }
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
* Text variants and other general productions
|
|
*******************************************************************/
|
|
|
|
/* All chars that cannot start syntactic structures in the middle of a line
|
|
* XXX: ] and other end delimiters should probably only be activated inside
|
|
* structures to avoid unnecessarily leaving the text production on plain
|
|
* content. */
|
|
|
|
// A single character other than the excluded ones; the legend below
// explains each excluded character.
text_char = [^'<~[{\n\r:\]}|!=]
|
|
|
|
// One or more text_char characters, joined into a single string.
text = t:text_char+ { return t.join(''); }
|
|
|
|
/* Legend
|
|
* ' quotes (italic/bold)
|
|
* < start of xmlish_tag
|
|
* ~ signatures/dates
|
|
* [ start of links
|
|
* { start of parser functions, transclusion and template args
|
|
* \n all sort of block-level markup at start of line
|
|
* \r ditto
|
|
* h http(s) urls
|
|
* n nntp(s) urls
|
|
* m mailto urls
|
|
*
|
|
* ! and | table cell delimiters, might be better to specialize those
|
|
* = headings - also specialize those!
|
|
*
|
|
* The following chars are also included for now, but only apply in some
|
|
* contexts and should probably be enabled only in those:
|
|
* : separate definition in ; term : definition
|
|
* ] end of link
|
|
* } end of parser func/transclusion/template arg
|
|
*/
|
|
|
|
// Text that may contain plain URLs and HTML entities. Alternatives, in
// order: a run of characters that can start neither syntax nor a URL
// protocol, a URL link, an HTML entity, a space directly before ":", and
// finally any single text_char.
urltext = ( cs:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return cs.join(''); }
          / & url_chars urllink
          / htmlentity
          // Convert a space directly before ':' into a non-breaking space
          // (U+00A0)
          // XXX: This should be moved to a serializer
          / ' ' & ':' { return "\u00a0"; }
          / t:text_char )+
|
|
|
|
/*
|
|
'//', // for protocol-relative URLs, but not in text!
|
|
'ftp://',
|
|
'git://',
|
|
'gopher://',
|
|
'http://',
|
|
'https://',
|
|
'irc://',
|
|
'ircs://', // @bug 28503
|
|
'mailto:',
|
|
'mms://',
|
|
'news:',
|
|
'nntp://', // @bug 3808 RFC 1738
|
|
'svn://',
|
|
'telnet://', // Well if we're going to support the above.. -ævar
|
|
'worldwind://',
|
|
*/
|
|
|
|
// Old version
|
|
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
|
|
|
|
// Experimental tweaked version: avoid expensive single-char substrings
|
|
// This did not bring the expected performance boost, however.
|
|
//text = [A-Za-z0-9,._ -] {
|
|
// textStart = pos;
|
|
//
|
|
// var res = input.substr(textStart - 1, inputLength)
|
|
// .match(/[A-Za-z0-9,._ -]+/)[0];
|
|
// pos = pos + (res.length - 1);
|
|
// return res
|
|
// }
|
|
|
|
// An HTML entity reference such as "&amp;" or "&#160;". The complete source
// text of the entity is passed to unentity and its result is returned.
htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
    var entity = "&" + c.join('') + ";";
    return unentity( entity );
}
|
|
|
|
// One or more spaces or tabs, joined into a single string.
space = s:[ \t]+ { return s.join(''); }
|
|
|
|
// Optional whitespace as a token list: a one-element list holding the
// whitespace string, or an empty list when there is none.
optionalSpaceToken
  = s:space* {
        return s.length ? [ s.join('') ] : [];
    }
|
|
|
|
|
|
// Start of line: a newline, or the very start of the input. Yields the
// newline token(s) followed by any comments and an includeonly / noinclude
// tag token that were consumed along the way.
sol = nl:(newlineToken / & { return pos === 0; } { return [] })
      // Eat multi-line comments, so that syntax after still matches as if it
      // was actually preceded by a newline
      cn:( c:comment n:newline? { return ( n !== '' ) ? [c, n] : [c]; } )*
      // Eat includeonly/noinclude at start of line, so that start-of-line
      // syntax after it still matches
      ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
      {
          var niToken = [];
          if ( ni !== '' ) {
              // ni is [ optional slash, tag name ]; a slash marks an end tag.
              var TokenCtor = ( ni[0] === '/' ) ? EndTagTk : TagTk;
              niToken = [ new TokenCtor( ni[1] ) ];
          }
          return nl.concat(cn, niToken);
      }
|
|
|
|
// End of input: succeeds without consuming anything when isEOF(pos) is true.
eof = & { return isEOF(pos); } { return true; }
|
|
|
|
|
|
// A line break: LF or CRLF.
newline = '\n' / '\r\n'
|
|
|
|
// A line break as a one-element token list holding a NlTk.
newlineToken = newline { return [new NlTk()] }
|
|
|
|
// End of line or end of input.
eolf = newline / eof
|
|
|
|
|
|
// 'Preprocessor' directive: higher-level constructs that can occur in
// otherwise plain-text content.
directive
  = comment / nowiki / tplarg_or_template / htmlentity
|
|
|
|
// Plain text that can also contain templates, template arguments, comments
// and similar constructs that are normally handled by the preprocessor. The
// collected pieces are passed through flatten.
preprocessor_text
  = pieces:( cs:[^<~[{\n\r\t|!\]} &=]+ { return cs.join(''); }
           / directive
           / !inline_breaks text_char
           )+ {
        return flatten ( pieces );
    }
|
|
|
|
// Variant of preprocessor_text that does not match spaces; the collected
// pieces are passed through flatten_string.
spaceless_preprocessor_text
  = pieces:( cs:[^'<~[{\n\r|!\]}\t &=]+ { return cs.join(''); }
           / directive
           / !inline_breaks !' ' text_char
           )+ {
        return flatten_string ( pieces );
    }
|
|
|
|
|
|
// Variant of preprocessor_text for use inside wikilinks: the single
// character fallback stops at "|" and "]]". The collected pieces are passed
// through flatten_stringlist.
wikilink_preprocessor_text
  = pieces:( cs:[^<~[{\n\r\t|!\]} &=]+ { return cs.join(''); }
           // (disabled alternative: urlencoded_char)
           / directive
           / !inline_breaks !"|" !"]]" text_char
           )+ {
        return flatten_stringlist ( pieces );
    }
|
|
|
|
// Variant of preprocessor_text for the URL part of external links. The
// collected pieces are passed through flatten_string.
extlink_preprocessor_text
  // added special separator character class inline: separates url from
  // description / text
  = pieces:( cs:[^'<~[{\n\r|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return cs.join(''); }
           / directive
           // (disabled alternative: urlencoded_char)
           / !inline_breaks no_punctuation_char
           // ".", ":" and "," only when not followed by space or end of line
           / s:[.:,] !(space / eolf) { return s }
           / [&%]
           )+ {
        return flatten_string ( pieces );
    }
|
|
|
|
// Unquoted attribute value with preprocessor support. The collected pieces
// are passed through flatten_string.
attribute_preprocessor_text
  = pieces:( cs:( !inline_breaks ch:[^=<>{\n\r&'"\t ] { return ch } )+ { return cs.join(''); }
           / directive
           / !inline_breaks [&%]
           )+ {
        return flatten_string ( pieces );
    }
|
|
// Content of a single-quoted attribute value with preprocessor support; may
// be empty. The collected pieces are passed through flatten_string.
attribute_preprocessor_text_single
  = pieces:( cs:[^{&']+ { return cs.join(''); }
           / directive
           / !inline_breaks [{&]
           )* {
        return flatten_string ( pieces );
    }
|
|
// Content of a double-quoted attribute value with preprocessor support; may
// be empty. The collected pieces are passed through flatten_string.
attribute_preprocessor_text_double
  = pieces:( cs:[^{&"]+ { return cs.join(''); }
           / directive
           / !inline_breaks [{&]
           )* {
        return flatten_string ( pieces );
    }
|
|
|
|
// Variants with the entire attribute on a single line.
// Unquoted variant; the collected pieces are passed through flatten_string.
attribute_preprocessor_text_line
  = pieces:( cs:( !inline_breaks ch:[^=<>{\n\r&'"\t ] { return ch } )+ { return cs.join(''); }
           / directive
           / !inline_breaks !'\n' [&%]
           )+ {
        return flatten_string ( pieces );
    }
|
|
// Single-line variant of the single-quoted attribute value content; may be
// empty. The collected pieces are passed through flatten_string.
attribute_preprocessor_text_single_line
  = pieces:( cs:[^{&']+ { return cs.join(''); }
           / directive
           / !inline_breaks !'\n' [{&]
           )* {
        return flatten_string ( pieces );
    }
|
|
// Single-line variant of the double-quoted attribute value content; may be
// empty. The collected pieces are passed through flatten_string.
attribute_preprocessor_text_double_line
  = pieces:( cs:[^{&"]+ { return cs.join(''); }
           / directive
           / !inline_breaks !'\n' [{&]
           )* {
        return flatten_string ( pieces );
    }
|
|
|
|
// Pipe delimiter: a literal "|", or the literal text "{{!}}" (special-case
// support for the pipe templates).
pipe = "|" / "{{!}}"
|
|
|
|
/* Tabs do not mix well with the hybrid production syntax */
|
|
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent : */
|