mediawiki-extensions-Visual.../modules/parser/pegTokenizer.pegjs.txt
Subramanya Sastry b095db4303 Simpler implementation of flatten.
* Possibly more efficient under heavy GC load -- untested.
* No change in time and memory use for single file parsing.

Change-Id: Id2f3f65cc0e5f38ed968bbda60b97e46523e700e
2012-06-05 10:47:46 -05:00

2021 lines
58 KiB
JavaScript

/**
* Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
* chunks of tokens (one chunk per top-level block matched) and eventually an
* end event. Tokens map to HTML tags as far as possible, with custom tokens
* used where further processing on the token stream is needed.
*
* @author Gabriel Wicke <gwicke@wikimedia.org>
* @author Brion Vibber <brion@wikimedia.org>
*/
{
/* FIXME: move these static functions into a separate module! Unfortunately,
* the following does not work:
* var tu = require('./mediawiki.tokenizer.utils.js');
* console.warn(tu.flatten([]));
* Using exports in the module gets a bit further, but accesses to
* tu.flatten in productions still fail. Thus, the functions were moved
* here until a solution is found:
*/
/* Static utilities */
// Flatten a list of lists.
//var simple_flatten = function ( e ) {
// // Fast single-level flatten:
// //return [].concat.apply([], e);
//
// var es = [];
// // flatten sub-arrays
// for(var i = 0, length = e.length; i < length; i++) {
// var ei = e[i];
// if ($.isArray(ei))
// es = es.concat(flatten(ei));
// else
// es.push(ei);
// };
// return es;
//};
/**
var old_flatten = function ( e ) {
// Fast single-level flatten:
//return [].concat.apply([], e);
var es = [];
// flatten sub-arrays
var chunkStart = null,
modified = false;
for(var i = 0, l = e.length; i < l; i++) {
if ( e[i].constructor === Array ) {
if ( chunkStart !== null ) {
es = es.concat( e.slice( chunkStart, i ), flatten( e[i] ) );
chunkStart = null;
} else {
es = es.concat(flatten(e[i]));
}
modified = true;
} else if ( chunkStart === null ) {
chunkStart = i;
}
};
if ( modified ) {
if ( chunkStart !== null ) {
es = es.concat( e.slice( chunkStart, e.length ) );
}
return es;
} else {
return e;
}
};
**/
/**
 * Flatten an arbitrarily nested list into a single-level array.
 *
 * A result array is only allocated once the first nested array is found;
 * an input without nested arrays is returned as-is (the same object), so
 * callers must not assume they always receive a copy.
 *
 * @param {Array|String} e The (possibly nested) list to flatten.
 * @returns {Array|String} A flat array, or `e` itself if nothing was nested.
 */
var flatten = function(e) {
    function internal_flatten(list, res) {
        for (var i = 0, l = list.length; i < l; i++) {
            var v = list[i];
            // Array.isArray rather than `v.constructor === Array`: the
            // property access throws on null / undefined elements.
            if (Array.isArray(v)) {
                // Change in assumption from a shallow array to a nested
                // array: start collecting, seeded with the items seen so far.
                if (res === null) {
                    res = list.slice(0, i);
                }
                internal_flatten(v, res);
            } else if (res !== null) {
                res.push(v);
            }
        }
        return res === null ? list : res;
    }
    return internal_flatten(e, null);
};
// Run `c` through flatten_stringlist and, when the outcome is exactly one
// String item, hand back that item on its own; otherwise return the list.
var flatten_string = function ( c ) {
    var merged = flatten_stringlist( c );
    var isLoneString = merged.length === 1 && merged[0].constructor === String;
    return isLoneString ? merged[0] : merged;
};
// Flatten `c` and merge every run of adjacent String items into one string.
// Non-String items are kept in place and act as separators between runs;
// empty strings contribute nothing.
var flatten_stringlist = function ( c ) {
    var result = [],
        pending = '';

    // Move the text collected so far (if any) into the result.
    function flushPending() {
        if ( pending !== '' ) {
            result.push( pending );
            pending = '';
        }
    }

    var items = flatten( c );
    for ( var idx = 0; idx < items.length; idx++ ) {
        var item = items[idx];
        if ( item.constructor !== String ) {
            flushPending();
            result.push( item );
            continue;
        }
        pending += item;
    }
    flushPending();
    return result;
};
// Remove escaped quotes from attributes etc: every occurrence of a
// backslash followed by `quotec` in `text` is replaced by `quotec` alone.
// This was in the original PEG parser, but could not find anything in
// MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke
var unquote = function (quotec, text) {
    // String#replace with a string pattern only touches the first match,
    // so split/join is used to unescape all of them.
    return text.split('\\' + quotec).join(quotec);
};
// Decode html entities by letting a detached <div> parse the entity and
// reading its text back. In a browser, this should only be fed the entity,
// not untrusted html! XXX: replace with safer version.
var unentity = function ( entity ) {
    var scratch = $("<div/>").html(entity);
    var decoded = scratch.text();
    return decoded;
};
// Debug print helper. Output is switched off by the hard-coded flag below;
// set it to true to have messages forwarded to console.warn.
var dp = function ( msg ) {
    var debugEnabled = false;
    if ( !debugEnabled ) {
        return;
    }
    console.warn(msg);
};
// Pretty-print a value as JSON, indented by two spaces per level.
var pp = function ( s ) {
    var indentWidth = 2;
    return JSON.stringify( s, null, indentWidth );
};
// Simple string formatting using '%s': each '%s' in `format` is replaced, in
// order, by the next extra argument. Once the arguments run out, remaining
// placeholders are replaced by the empty string.
var sprintf = function ( format ) {
    var values = [];
    for ( var i = 1; i < arguments.length; i++ ) {
        values.push( arguments[i] );
    }
    var next = 0;
    return format.replace(/%s/g, function () {
        if ( next < values.length ) {
            return values[next++];
        }
        return '';
    });
};
/**
 * Determine if a string has the shape of an ISBN-10 or ISBN-13 identifier.
 *
 * Only the length is checked: after upper-casing and dropping every
 * character that is neither a digit nor 'X', the remainder must be exactly
 * 10 or 13 characters long. Checksums (and the 978/979 prefix of ISBN-13)
 * are deliberately NOT verified, because that would be stricter than the
 * standard parser.
 *
 * @static
 * @method
 * @param {String} isbn: The string to check
 * @returns {Boolean}: True if the normalized string is 10 or 13 characters
 * long, false otherwise.
 */
var isValidISBN = function ( isbn ) {
    var normalized = isbn.toUpperCase().replace(/[^\dX]/g, '');
    return normalized.length === 10 || normalized.length === 13;
};
/* End static utilities */
/*
 * Flags for specific parse environments (inside tables, links etc). Flags
 * trigger syntactic stops in the inline_breaks production, which
 * terminates inline and attribute matches. Flags merely reduce the number
 * of productions needed: The grammar is still context-free as the
 * productions can just be unrolled for all combinations of environments
 * at the cost of a much larger grammar.
 *
 * Two kinds of flags are tracked:
 *  - counters: cumulative flags, active while their count is > 0
 *  - stacks:   nested flags where only the top-most value is in effect
 *
 * `key` is a string summary of all currently active flags; it is refreshed
 * after every change.
 */
function SyntaxStops () {
    this.counters = {};
    this.stacks = {};
    this.key = '';
    this._counterKey = '';
    this._stackKey = '';
}

/**
 * Increment the counter for `flag` (starting at 1 for a new flag).
 * @returns {Boolean} always true, so the call can be the result of a
 * semantic predicate that must succeed.
 */
SyntaxStops.prototype.inc = function(flag) {
    if (this.counters[flag] !== undefined) {
        this.counters[flag]++;
    } else {
        this.counters[flag] = 1;
    }
    this._updateCounterKey();
    return true;
};

/**
 * Decrement the counter for `flag`.
 * @returns {Boolean} always false, so the call can be the result of a
 * clean-up predicate that must fail.
 */
SyntaxStops.prototype.dec = function(flag) {
    // A flag that was never incremented counts as 0. Decrementing
    // `undefined` would store NaN, and NaN stays NaN through every later
    // inc(), which would disable the flag for the rest of the parse.
    var current = this.counters[flag];
    this.counters[flag] = ( current === undefined ? 0 : current ) - 1;
    this._updateCounterKey();
    return false;
};

/**
 * @returns {Number|undefined} the current count for `name`, or undefined if
 * the counter was never touched.
 */
SyntaxStops.prototype.onCount = function ( name ) {
    return this.counters[name];
};

/**
 * A stack for nested, but not cumulative syntactic stops.
 * Example: '=' is allowed in values of template arguments, even if those
 * are nested in attribute names.
 *
 * @returns {Boolean} always true.
 */
SyntaxStops.prototype.push = function ( name, value ) {
    if( this.stacks[name] === undefined ) {
        this.stacks[name] = [value];
    } else {
        this.stacks[name].push( value );
    }
    this._updateStackKey();
    return true;
};

/**
 * Drop the top-most value of the stack `name`.
 * @returns {Boolean} always false.
 * @throws {String} if no value was ever pushed under `name`.
 */
SyntaxStops.prototype.pop = function ( name ) {
    if( this.stacks[name] !== undefined ) {
        this.stacks[name].pop();
    } else {
        throw "SyntaxStops.pop: unknown stop for " + name;
    }
    this._updateStackKey();
    return false;
};

/**
 * @returns the top-most value of the stack `name`, or false if the stack
 * does not exist or is empty.
 */
SyntaxStops.prototype.onStack = function ( name ) {
    var stack = this.stacks[name];
    if ( stack === undefined || stack.length === 0 ) {
        return false;
    } else {
        return stack[stack.length - 1];
    }
};

// Recompute both halves of the key.
SyntaxStops.prototype._updateKey = function ( ) {
    this._updateCounterKey();
    this._updateStackKey();
};

// Rebuild the counter half of the key from all counters that are > 0.
SyntaxStops.prototype._updateCounterKey = function ( ) {
    var counters = [];
    // `var k`: without the declaration the loop variable leaks into the
    // global scope (and throws in strict mode).
    for ( var k in this.counters ) {
        if ( this.counters[k] > 0 ) {
            counters.push(k);
        }
    }
    this._counterKey = JSON.stringify(counters);
    this.key = this._counterKey + this._stackKey;
};

// Rebuild the stack half of the key from all stacks whose top value is truthy.
SyntaxStops.prototype._updateStackKey = function ( ) {
    var stackStops = [];
    for ( var k in this.stacks ) {
        if ( this.onStack( k ) ) {
            stackStops.push(k);
        }
    }
    this._stackKey = JSON.stringify(stackStops);
    this.key = this._counterKey + this._stackKey;
};

// The stop-flag state shared by all productions.
var stops = new SyntaxStops();
// Source offsets remembered across productions, both initially 0:
//  - blockStart:  start position of the current top-level block (positions
//                 for lower-level blocks could also be provided using a stack)
//  - tagStartPos: start position of the generic tag production
var blockStart = 0,
    tagStartPos = 0;
// Per-key stacks of source positions. push() records a start position under
// a key; pop() removes the most recent one for that key and pairs it with
// the supplied end position.
var posStack = {
    positions: {},
    // Remember `pos` under `key`. Always returns true.
    push: function( key, pos ) {
        var stack = this.positions[key];
        if ( stack !== undefined ) {
            stack.push( pos );
        } else {
            this.positions[key] = [pos];
        }
        return true;
    },
    // Return [ lastPushedPos, pos ] for `key`; throws a string if nothing
    // is currently recorded under `key`.
    pop: function( key, pos ) {
        var stack = this.positions[key];
        if ( stack !== undefined && stack.length ) {
            return [ stack.pop(), pos ];
        }
        throw "Tried to pop unknown position for " + key;
    }
};
// Length of the input, read once up front.
var inputLength = input.length;
// pseudo-production that matches at end of input: true exactly when `pos`
// equals the cached input length.
var isEOF = function (pos) {
    if ( pos === inputLength ) {
        return true;
    }
    return false;
};
// text start position
var textStart = 0;
// Lookup table of block-level tag names (name -> true). Kept in JS so that
// tags can be matched case-independently via toLowerCase; doing this by hand
// in the grammar would be quite ugly (and possibly slower).
var block_names = (function () {
    var lookup = {};
    [ "p", "table", "td", "tr", "ul", "ol",
      "li", "dl", "dt", "dd", "div", "center",
      "blockquote" ].forEach(function ( tagName ) {
        lookup[tagName] = true;
    });
    return lookup;
})();
// Keep the initializer's `this` value reachable under a fixed name.
// NOTE(review): no use of `self` is visible in this part of the file --
// presumably it is read by actions further down; confirm before removing.
var self = this;
}
/*********************************************************
* The top-level production
*
* Matches the input as a sequence of toplevelblocks followed by any trailing
* newlines. Each toplevelblock hands its own tokens to the chunk callback
* (__parseArgs[2]), so the action here only emits the EOF token and returns
* an empty list.
*********************************************************/
start
= e:toplevelblock* newline* {
// end is passed inline as a token, as well as a separate event for now.
__parseArgs[2]( [ new EOFTk( ) ] );
return []; //flatten(e);
}
/*
* A document (start production) is a sequence of toplevelblocks. Tokens are
* emitted in chunks per toplevelblock to avoid buffering the full document.
*
* The leading predicate records the current position in blockStart and
* always succeeds; !eof requires that the eof rule does not match here.
*/
toplevelblock
= & { blockStart = pos; return true; } !eof b:block {
// Clear the tokenizer's backtracking cache after matching each
// toplevelblock. There won't be any backtracking as a document is just a
// sequence of toplevelblocks, so the cache for previous toplevelblocks
// will never be needed. This frees up a lot of memory: In the case of
// [[:en:Barack Obama]], the difference is about 360M vs. 110M.
cache = {};
b = flatten(b);
// Add source offsets for round-tripping. XXX: Add these not just for
// toplevelblocks!
if ( b.length ) {
var bs = b[0];
// A primitive string cannot carry extra properties, so it is replaced by a
// String object before dataAttribs is attached below.
if ( bs.constructor === String && bs.attribs === undefined ) {
b[0] = new String( bs );
bs = b[0];
}
if (bs.dataAttribs === undefined) {
bs.dataAttribs = {};
}
// [start, end) offsets of this block in the input
bs.dataAttribs.sourcePos = [blockStart, pos];
}
// Emit tokens for this toplevelblock. This feeds a chunk to the parser
// pipeline.
__parseArgs[2]( flatten( b ) );
// We don't return any tokens to the start production to save memory. We
// just emitted them already to our consumers.
return true;
}
/*
* The actual contents of each block.
*
* Alternatives are tried in order and the first match wins: line-based
* constructs, then constructs starting with '<', then paragraph, a plain
* inlineline and finally a bare start-of-line.
*/
block
= block_lines
/ & '<' r:( pre // tag variant can start anywhere
/ comment &eolf
/ nowiki
// avoid a paragraph if we know that the line starts with a block tag
/ bt:block_tag { return [bt] }
) { return r; }
/ paragraph
// Inlineline includes generic tags; wrapped into paragraphs in token
// transform and DOM postprocessor
/ inlineline
/ sol
/*
* A block nested in other constructs. Avoid eating end delimiters for other
* constructs by checking against inline_breaks first: the block is only
* attempted where inline_breaks does not match.
*/
nested_block = !inline_breaks b:block { return b }
/*
* Line-based block constructs.
*
* Result: the start-of-line tokens, the tokens of an optional preceding
* empty line, and the block_line tokens, concatenated into one list.
*/
block_lines
= s:sol
// eat an empty line before the block
s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
bl:block_line {
// The action treats '' as the "optional group did not match" value.
var s2_ = (s2 !== '') ? s2 : [];
return s.concat(s2_, bl);
}
/*
* Block structures with start-of-line wiki syntax
*
* Tried in order: headings, lists, (optionally space-prefixed) table lines or
* lines made up only of block tags, indented pre, and tag-based pre.
*/
block_line
= h
/ lists
/ st:optionalSpaceToken
r:( & [{}|!] tl:table_lines { return tl; }
// tag-only lines should not trigger pre either
/ bts:(bt:block_tag stl:optionalSpaceToken { return bt.concat(stl) })+
&eolf { return bts }
) {
return st.concat(r);
}
// indented pre is only attempted while the 'nopre' counter is falsy
/ ! { return stops.counters.nopre } pre_indent
/ pre
/*
* A paragraph. We don't emit 'p' tokens to avoid issues with template
* transclusions, <p> tags in the source and the like. Instead, we perform
* some paragraph wrapping on the token stream and the DOM.
*
* Matches two start-of-lines followed by an inlineline and returns their
* results concatenated.
*/
paragraph
= s1:sol s2:sol c:inlineline {
return s1.concat(s2, /* [new TagTk('p')],*/ c);
}
// Optional spaces that are directly followed by a newline (which is only
// looked at, not consumed) produce a self-closing 'br' token.
br = space* &newline { return new SelfclosingTagTk( 'br' ) }
/*
* Syntax stops: Avoid eating significant tokens for higher-level productions
* in nested inline productions.
*
* The character-class lookahead is a cheap pre-filter: only when the next
* character is one of the listed candidates is the inline_breaks function
* supplied through __parseArgs[3] consulted, with the input, the current
* position and the stop flags. The production matches (without consuming
* any input) when that function returns something other than null.
*/
inline_breaks
= & [=|!}{:\r\n\]<]
//& {
// // Important hack: disable caching for this production, as the default
// // cache key does not take into account flag states!
// cacheKey += 'foo';
// return true;
//}
& {
//console.warn('ilbf: ' + input.substr(pos, 5) );
//if ( null !== __parseArgs[3].inline_breaks( input, pos, stops ) ) {
// console.warn('ilb break: ' + pp(input.substr(pos, 5)) );
//} else {
// console.warn('ilb no break: ' + pp(input.substr(pos, 5)) );
//}
return null !== __parseArgs[3].inline_breaks( input, pos, stops )
}
// One or more pieces of inline content: urltext, or -- where inline_breaks
// does not match -- an inline_element or any single character. Adjacent
// strings in the result are merged by flatten_stringlist.
inline
= c:(urltext / (! inline_breaks (inline_element / . )))+ {
//console.warn('inline out:' + pp(c));
return flatten_stringlist( c );
}
// Same as inline, except that the single-character fallback is [^\n]
// instead of '.', so a newline is never consumed as plain text.
inlineline
= c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
return flatten_stringlist( c );
}
// Non-text inline constructs. Each alternative is guarded by a lookahead on
// its first character so that the more expensive productions are only
// attempted where they can possibly match.
inline_element
= //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
& '<' ( comment / xmlish_tag )
/// & '{' ( & '{{{{{' template / tplarg / template )
/ & '{' tplarg_or_template
/// & '{' ( tplarg / template )
// Eat three opening brackets as text.
/ '[[[' { return '[[[' }
/ & '[' ( wikilink / extlink / autolink )
/ & "'" quote
/* Headings
 *
 * Matches '='+ content '='+ up to the end of the line. The heading level is
 * the smaller of the two '=' run lengths; surplus '=' characters on either
 * side become part of the heading text. The 'h' stop counter is raised while
 * the content is parsed and lowered again on both success and failure.
 */
h = & "=" // guard, to make sure '='+ will match.
    // XXX: Also check to end to avoid inline parsing?
    r:(
        s:'='+ // moved in here to make s accessible to inner action
        & { return stops.inc('h'); }
        c:inlineline
        e:'='+
        spc:(sp:space+ { return sp.join('') } / comment)*
        &eolf
        {
            stops.dec('h');
            var level = Math.min(s.length, e.length),
                extras,
                lastIndex;
            // convert surplus equals into text.
            // s and e are the match results of '='+, i.e. lists of '='
            // characters without string methods such as substr, so the
            // surplus text is rebuilt from the count alone.
            if(s.length > level) {
                extras = new Array( s.length - level + 1 ).join('=');
                if(c[0].constructor === String) {
                    c[0] = extras + c[0];
                } else {
                    c.unshift( extras );
                }
            }
            if(e.length > level) {
                extras = new Array( e.length - level + 1 ).join('=');
                lastIndex = c.length - 1;
                if(c[lastIndex].constructor === String) {
                    // Write back into c: appending to a local copy of the
                    // string would leave the token list unchanged.
                    c[lastIndex] = c[lastIndex] + extras;
                } else {
                    c.push( extras );
                }
            }
            return [new TagTk( 'h' + level )]
                .concat(c, [new EndTagTk( 'h' + level ), spc]);
        }
      / & { /* dp('nomatch exit h'); */ stops.dec('h'); return false } { return null }
    ) { return r }
// An HTML comment, optionally followed by further comments that are each
// preceded by a newline (with optional spaces around it). A comment that is
// not closed runs until eof. Returns the CommentTk for the first comment
// followed by the results of the following comment matches.
comments
= '<!--' c:comment_chars* ('-->' / eof)
cs:(space* newline space* cn:comment { return cn })* {
return [new CommentTk( c.join('') )].concat(cs);
}
// A single HTML comment, terminated by '-->' or by eof. Returns a
// one-element list holding a CommentTk with the comment's text.
comment
= '<!--' c:comment_chars* ('-->' / eof)
{
return [new CommentTk( c.join('') )];
}
// One character of comment content: anything but '-', or a '-' that is not
// followed by '->' (i.e. does not start the comment terminator).
comment_chars
= c:[^-] { return c; }
/ c:'-' !'->' { return c; }
// Behavior switches: one of the listed words wrapped in double underscores,
// e.g. __NOTOC__. Returns a 'behavior-switch' tag token whose 'word'
// attribute holds the matched word (without the underscores).
behavior_switch
= '__'
behavior:(
'NOTOC'
/ 'FORCETOC'
/ 'TOC'
/ 'NOEDITSECTION'
/ 'NEWSECTIONLINK'
/ 'NONEWSECTIONLINK'
/ 'NOGALLERY'
/ 'HIDDENCAT'
/ 'NOCONTENTCONVERT'
/ 'NOCC'
/ 'NOTITLECONVERT'
/ 'NOTC'
/ 'START'
/ 'END' // XXX: Removed in 19213. Should we keep this?
/ 'INDEX'
/ 'NOINDEX'
/ 'STATICREDIRECT'
)
'__'
{
return [
new TagTk(
'behavior-switch',
[new KV('word', behavior)]
)
];
}
/**************************************************************
* External (bracketed and autolinked) links
**************************************************************/
// Links recognised without brackets: plain URLs, RFC / PMID references and
// ISBNs. Not attempted while the top of the 'extlink' stack is truthy.
autolink
= ! { return stops.onStack('extlink') }
(urllink / autoref / isbn)
// A bare URL, returned as a 'urllink' tag token with the URL in 'href'.
urllink
= target:url {
return [ new TagTk( 'urllink', [new KV('href', target)] ) ];
}
// A bracketed external link: "[" target, optional whitespace plus link text,
// "]". Returns a self-closing 'extlink' token with 'href' and 'content'
// attributes; dataAttribs.gc is set to 1 when the link text compares equal
// to ''.
//
// The 'extlink' stack entry is pushed right after the opening bracket and
// popped either in the success action or -- when the first alternative fails
// after the bracket -- by the second alternative, whose predicate pops and
// then fails (pop returns false).
extlink
= ! { return stops.onStack('extlink') } // extlink cannot be nested
(
"["
& { return stops.push('extlink', true); }
//target:urllink
target:extlink_preprocessor_text
text:(( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
t:inlineline { return t } )?
"]" {
stops.pop('extlink');
//if ( text === '' ) {
// // XXX: Link numbering should be implemented in post-processor.
// text = [ "[" + linkCount + "]" ];
// linkCount++;
//}
//console.warn( 'extlink text: ' + pp( text ) );
var dataAttribs = {};
if ( text === '' ) {
dataAttribs.gc = 1;
}
return [
new SelfclosingTagTk( 'extlink', [
new KV('href', target),
new KV('content', text)
],
dataAttribs)
];
}
/ "[" & { return stops.pop('extlink'); }
)
// Magic RFC / PMID references: the keyword, at least one space or newline,
// and a run of digits. Returns a self-closing 'extlink' token pointing at
// the matching external site, with "<keyword> <number>" as its content.
autoref
  = ref:('RFC' / 'PMID') (newline / space)+ identifier:[0-9]+
    {
        identifier = identifier.join('');
        // Protocol-relative URL templates; %s is replaced by the number.
        var base_urls = {
                'RFC' : '//tools.ietf.org/html/rfc%s',
                'PMID' : '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract'
            },
            url = sprintf(base_urls[ref], identifier);
        return [
            new SelfclosingTagTk( 'extlink', [
                new KV('href', url),
                new KV('content', [ref, identifier].join(' '))
            ] )
        ];
    }
// Magic ISBN links: 'ISBN', at least one space or newline, then digits that
// may be separated by single dashes or spaces, optionally ending in 'X'.
// Returns a 'wikilink' token targeting Special:BookSources, or rejects the
// match (by returning null) when isValidISBN does not accept the number.
isbn
  = 'ISBN' (newline / space)+
    head:[0-9]
    digits:([- ]? [0-9])+
    tail:'X'?
    {
        // digits is a nested list, so join() leaves commas behind that have
        // to be stripped again.
        var isbn = [head, digits, tail].join('').replace(/,/g, '');
        if (!isValidISBN(isbn)) {
            return null;
        }
        return [
            new SelfclosingTagTk( 'wikilink', [
                // Drop the separators for the link target, but keep a
                // trailing 'X': it is the check digit of an ISBN-10 and
                // part of the number.
                new KV('', 'Special:BookSources/' + isbn.replace(/[^\dX]/g, '')),
                new KV('', 'ISBN ' + isbn),
                new KV('tail', '')
            ] )
        ];
    }
/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
* these can be configured dynamically. */
// The set of first characters of the url_protocol alternatives below, plus
// 'I', 'P' and 'R' -- the first letters of the ISBN, PMID and RFC keywords.
// NOTE(review): no use of url_chars is visible in this part of the file;
// presumably a quick first-character check for autolinks -- confirm.
url_chars = [/fghimnstwIPR]
// The protocol prefixes that may start a URL; matching is case-sensitive
// and alternatives are tried in the order listed.
url_protocol
= '//' // for protocol-relative URLs
/ 'ftp://'
/ 'git://'
/ 'gopher://'
/ 'http://'
/ 'https://'
/ 'irc://'
/ 'ircs://' // @bug 28503
/ 'mailto:'
/ 'mms://'
/ 'news:'
/ 'nntp://' // @bug 3808 RFC 1738
/ 'svn://'
/ 'telnet://' // Well if we're going to support the above.. -ævar
/ 'worldwind://'
// javascript does not support unicode features, so the space-like
// characters are listed explicitly: the plain space plus the code points
// below.
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
// A percent-encoded byte ("%" plus two hex digits), passed through
// decodeURI. decodeURI throws on sequences it cannot decode; in that case
// the match is rejected by returning null.
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
try {
return decodeURI("%" + c0 + c1)
} catch ( e ) {
// Reject the match, and allow other fall-back productions to have a
// go at it.
return null;
}
}
//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
// A single character that is neither punctuation, whitespace, a control
// character, a quote, a bracket nor one of '&', '%' and '{'. '{' and '<' are
// excluded so that they can trigger directives.
no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]
// A URL: protocol, optional IP address, and a non-empty path. Path pieces
// are, in order of preference: a no_punctuation_char (where inline_breaks
// does not match); one of '.', ':' or ',' that is not followed by a space or
// end of line; a comment; a template or template argument; or -- unless the
// text is an &lt; / &gt; entity -- an htmlentity or a literal '&', '%', '{'.
// The result is a plain string when everything collapses to one string, and
// a list of strings and tokens otherwise (see flatten_string).
url
= proto:url_protocol
addr:( ipv6_address / ipv4_address )?
path:( ( !inline_breaks
c:no_punctuation_char
{ return c }
)
/ s:[.:,] !(space / eolf) { return s }
/ comment
/ tplarg_or_template
/ ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
r:(
htmlentity
/ [&%{]
) { return r }
)+
{
//console.warn( "path: " + pp( flatten_stringlist( [proto + addr].concat( path ) ) ) );
return flatten_string( [proto + addr].concat( path ) );
}
// Four dot-separated digit groups, returned as one string. Each group uses
// [0-9]*, so empty groups are accepted and no range check is done.
ipv4_address
= a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*)
{
return flatten( a ).join('');
}
// A bracketed sequence of seven colon-separated groups of decimal digits,
// returned as one string (brackets included). Groups may be empty.
// NOTE(review): only the digits 0-9 are accepted, not hex digits a-f --
// confirm whether that is intended.
ipv6_address
= a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']')
{
return flatten( a ).join('');
}
/**************************************************************
* Templates, -arguments and wikilinks
**************************************************************/
/*
* Precedence: template arguments win over templates. See
* http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
* 4: {{{{·}}}} → {·{{{·}}}·}
* 5: {{{{{·}}}}} → {{·{{{·}}}·}}
* 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
* 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
*
* The lookaheads below only decide which production to commit to; they do
* not consume any input themselves.
*/
tplarg_or_template
= & '{{{{{{{' '{' tplarg_or_template '}'
/ & ( '{{{' &'{{{' tplarg ) tplarg
// tplarg in template
/ & ( '{{' &'{{{' tplarg ) template
/ tplarg
/ template
// A template transclusion: "{{" target ("|" parameter)* "}}", with newlines,
// comments and spaces allowed around the pieces. A "|" that is directly
// followed by another "|" yields an empty parameter. Returns a self-closing
// 'template' token whose attribute list starts with the target.
template
= "{{" nl_comment_space*
target:template_param_value
params:(nl_comment_space* "|"
r:( nl_comment_space* &"|" { return new KV( '', '') } // empty argument
/ p:template_param { return p }
) { return r }
)*
nl_comment_space*
"}}" {
// Insert target as first positional attribute, so that it can be
// generically expanded. The TemplateHandler then needs to shift it out
// again.
params.unshift( { k: flatten( target ), v: [] } );
var obj = new SelfclosingTagTk( 'template', params );
//console.warn( 'tokenizer template ' + pos + );
//console.warn('template @' + pos + '::' + input.substr(pos, 40) );
return obj;
}
// XXX: support template and args in target!
//template_target
// = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }
// A template argument: "{{{" optional name ("|" parameter)* "}}}". A "|"
// that is directly followed by the closing braces yields an empty parameter.
// Returns a self-closing 'templatearg' token whose attribute list starts
// with a KV carrying the (flattened) name as key.
tplarg
= "{{{"
name:template_param_value?
params:( nl_comment_space*
'|' nl_comment_space*
r:(
&'}}}' { return new KV( '', '') }
/ p:template_param { return p }
) { return r }
)*
nl_comment_space*
"}}}" {
name = flatten( name );
params.unshift( new KV( name, '' ) );
var obj = new SelfclosingTagTk( 'templatearg', params );
//console.warn( 'tokenizer tplarg ' + JSON.stringify( obj, null, 2 ));
//console.warn('template arg @' + pos + '::' + input.substr(pos, 40) );
return obj;
}
// A single template parameter. Three shapes are produced:
//  - name = value  -> KV( name, value )
//  - name =        -> KV( name, [] )
//  - value only    -> KV( [], value )   (positional parameter)
// The second alternative yields KV( [], [] ) for an empty parameter, i.e.
// when the next character is '|' or '}'.
// The action compares against '' to detect optional parts that did not
// match.
template_param
= name:template_param_name
// MW accepts |foo | = bar | as a single param..
('|' (space / newline)* &'=')?
val:(
s0:space*
"="
s1:space*
value:template_param_value? {
return { s0: s0, s1: s1, value: value };
}
)?
{
//console.warn( 'named template_param matched' + pp([name, value ]) );
if ( val !== '' ) {
if ( val.value !== '' ) {
return new KV( name, flatten( val.value ) );
} else {
return new KV(flatten( name ), []);
}
} else {
return new KV([], flatten(name));
}
}
// empty parameter
/ & [|}] { return new KV([], []); }
// FIXME: handle template args and templates in key! (or even parser functions?)
// The name part of a template parameter. A truthy 'equal' entry is pushed on
// the stop stack while the name is parsed and popped again afterwards; when
// the first alternative fails, the second one pops the entry and then fails
// itself (pop returns false). A name that is directly followed by '=' may be
// empty.
template_param_name
= & { return stops.push( 'equal', true ) }
tpt:(template_param_text / &'=' { return '' })
{
stops.pop( 'equal' );
//console.warn( 'template param name matched: ' + pp( tpt ) );
return tpt;
}
/ & { return stops.pop( 'equal' ) }
//= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
// The value part of a template parameter. While it is parsed the 'nopre'
// counter is raised and a false 'equal' entry is pushed on the stop stack;
// both are undone in the success action, or by the second alternative (which
// then fails) when the first one does not match.
template_param_value
= & { stops.inc( 'nopre' ); return stops.push( 'equal', false ) }
tpt:template_param_text
{
stops.dec( 'nopre' );
stops.pop( 'equal' );
//console.warn( 'template param value matched: ' + pp( tpt ) );
return tpt;
}
/ & { stops.dec( 'nopre' ); return stops.pop( 'equal' ) }
// The text of a template parameter name or value: one or more nested
// blocks. While they are parsed, false entries are pushed for the 'table'
// and 'extlink' stops and the 'template' counter is raised; all three are
// undone on success and, via the second alternative, on failure.
// Returns a plain string if the result collapses to a single string, and a
// token list otherwise.
template_param_text
= & { /*console.warn( 'tpt: ' +
input.substr( pos - 10, 9) +
input[pos].green +
input.substr( pos +1, 9) ); */
// re-enable tables within template parameters
stops.push('table', false );
stops.push('extlink', false);
return stops.inc('template')
}
il:nested_block+ {
stops.pop('table');
stops.pop('extlink');
stops.dec('template');
//console.warn( 'tpt match: ' + pp (il) + " stops: " + pp(stops));
var r = flatten( il ),
firstToken = r[0];
// Strip leading newlines and space-only text tokens.
// XXX: Will need to round-trip these eventually.
while (
firstToken !== undefined &&
( firstToken.constructor === NlTk ||
firstToken.constructor === String && firstToken.match( /^[\t ]+$/ ) )
)
{
r.shift();
firstToken = r[0];
}
//console.warn('tpl param name' + pos + r[0] + '::' + input.substr(pos, 40) );
if ( r.length === 1 && r[0].constructor === String ) {
r = r[0];
}
return r;
}
/ & { stops.pop('table'); stops.pop('extlink'); return stops.dec('template'); }
// TODO: handle link prefixes as in al[[Razi]]
// An internal link: "[[" target ("|" text)* "]]" plus an optional tail of
// text characters glued to the closing brackets. The target must not start
// with a URL. Returns a self-closing 'wikilink' token; dataAttribs carries
// the source range of the whole link (sourcePos) and of its content part
// (contentPos), both taken from posStack.
//
// Every posStack.push is paired with a pop: 'lcontent' is popped by
// whichever of its two alternatives matches, 'wikilink' by the success
// action or by the final alternative. That final alternative always fails,
// because pop returns a non-empty (truthy) array to the negative predicate.
wikilink
= & { return posStack.push('wikilink' , pos); }
"[["
! url
//target:link_target
// XXX: disallow pipe!
target:wikilink_preprocessor_text
lcontent:(
& { return posStack.push('lcontent' , pos); }
lcs:( pipe lt:link_text { return new KV( '', lt ); } )+ {
return { pos: posStack.pop('lcontent' , pos), content: lcs };
}
/ {
return { pos: posStack.pop('lcontent' , pos), content: [] };
}
)
"]]"
// XXX In real MediaWiki, this is a language-dependent positive character
// class. Can we work out a static negative class instead?
// XXX: Exclude uppercase chars from non-latin languages too!
tail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
var obj = new SelfclosingTagTk( 'wikilink' ),
textTokens = [];
obj.attribs.push( new KV('', target) );
obj.dataAttribs = {
sourcePos: posStack.pop( 'wikilink', pos ),
contentPos: lcontent.pos
};
// XXX: Point to object with path, revision and input information
//obj.source = input;
//console.warn('lcontent: ' + JSON.stringify( lcontent, null, 2 ) );
// Deal with content. XXX: Properly support pipe-trick etc
//lcontent.tail = tail && tail.join('') || '';
obj.attribs = obj.attribs.concat( lcontent.content );
obj.attribs.push( new KV( 'tail', tail && tail.join('') || '' ) );
//console.warn( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a' )])) );
return [obj];
}
/ ! { return posStack.pop( 'wikilink', pos ); }
// The text part of a link, parsed as inline content with the 'linkdesc'
// counter raised. The counter is lowered again in the success action, or by
// the second alternative (which then fails) if the first one does not match.
link_text
= & { return stops.inc('linkdesc'); }
h:inline
// 'equal' syntaxFlag is set for links in template parameters. Consume the
// '=' here.
hs:( '=' inline?)?
{
//console.warn('link_text' + pp(h) + pp(hs));
stops.dec('linkdesc');
if( hs !== '' ) {
return h.concat(hs);
} else {
return h;
}
}
/ & { return stops.dec('linkdesc'); }
// Like link_text, but with the 'pipe' counter raised in addition to
// 'linkdesc' while the content is parsed, and with a mandatory inline after
// the '='. Both counters are lowered again on success and on failure.
link_option
= & { stops.inc('pipe'); return stops.inc('linkdesc'); }
h:inline
// 'equal' syntaxFlag is set for links in template parameters. Consume the
// '=' here.
hs:( '=' inline)?
{
//console.warn('link_text' + pp(h) + pp(hs));
stops.dec('pipe');
stops.dec('linkdesc');
if( hs !== '' ) {
return h.concat(hs);
} else {
return h;
}
}
/ & { stops.dec('pipe'); return stops.dec('linkdesc'); }
// The closing delimiter of a wikilink.
link_end = "]]"
/* Generic quote production for italic and bold, further processed in a token
* stream transformation in doQuotes. Relies on NlTk tokens being emitted
* for each line of text to balance quotes per line.
*
* We are not using a simple pair rule here as we need to support mis-nested
* bolds/italics and MediaWiki's special heuristics for apostrophes, which are
* all not context free.
*
* Matches two or more apostrophes and returns an 'mw-quote' tag token whose
* value property holds the complete run that was matched. */
quote = "''" x:"'"* {
var res = new TagTk( 'mw-quote' ); // Will be consumed in token transforms
res.value = "''" + x.join('');
return res;
}
/**
* Image option productions, only called from the LinkHandler token stream
* transformer, and only for images.
*
* Parses any number of img_options with the 'pipe' counter raised and
* returns an object that maps each option key to its value; the flat list of
* parsed options is additionally stored under _options. If a key occurs more
* than once, the last occurrence wins.
*/
img_options =
& { return stops.inc( 'pipe' ); }
os:img_option* {
stops.dec( 'pipe' );
var options = {};
os = flatten( os );
for ( var i = 0, l = os.length; i < l; i++ ) {
var o = os[i];
options[o.k] = o.v;
}
options._options = os;
return options;
}
/ & { return stops.dec( 'pipe' ); }
// A single image option: a pipe, optional spaces, and one of the specific
// option productions. Anything that matches none of them is treated as the
// caption.
img_option
= pipe space*
o:(
img_attribute
/ img_format
/ img_dimensions
/ img_halign
/ img_valign
/ img_link
/ lt:link_text { return new KV('caption', lt) }
)
space* {
return o
};
// Image format keyword, returned as KV( 'format', keyword ). Longer keywords
// are listed before their prefixes ('frameless' before 'frame', 'thumbnail'
// before 'thumb') because the first matching alternative wins.
img_format
= f:( 'border' / 'frameless' / 'frame' / 'thumbnail' / 'thumb' ) {
return new KV( 'format', f );
}
// Image size: "<width>px", "x<height>px" or "<width>x<height>px", or the
// keyword 'upright'. Returns a single KV when only one dimension is given
// and a list with both KVs otherwise; 'upright' is returned as a
// one-element list with a 'width' KV.
// NOTE(review): a bare 'px' with neither number also ends up in the final
// else branch, yielding width and height KVs with empty values -- confirm
// that this is acceptable.
img_dimensions
= x:(n:[0-9]+ { return n.join('') })? y:('x' n:[0-9]+ { return n.join('') })? 'px' {
if ( x === '' && y ) {
return new KV( 'height', y );
} else if ( y === '' && x ) {
return new KV( 'width', x );
} else {
return [ new KV( 'width', x ), new KV( 'height', y ) ];
}
}
/ 'upright' { return [ new KV( 'width', 'upright' ) ] }
// Horizontal alignment keyword, returned as KV( 'halign', keyword ).
img_halign
= a:( 'left' / 'right' / 'center' / 'none' ) {
return new KV( 'halign', a );
}
// Vertical alignment keyword, returned as KV( 'valign', keyword ).
img_valign
= a:( 'baseline' / 'sub' / 'super' / 'top' / 'text-top'
/ 'middle' / 'bottom' / 'text-bottom' ) {
return new KV( 'valign', a );
}
// External link targets are already parsed as link tokens. If we can make
// sure that those cleanly convert back to the original text, then we could
// re-parse them here.
//
// The 'link=' image option, returned as KV( 'link', url ).
// The 'pipe' counter is raised for the duration of the url match so that
// the decrement in the success action and in the fallback alternative is
// always paired with an increment. Without it, each link= option lowered
// the counter that img_options had raised for the whole option list.
img_link
  = 'link=' space*
    u:(
        & { return stops.inc( 'pipe' ); }
        t:url {
            stops.dec( 'pipe' );
            return t;
        }
      / & { return stops.dec( 'pipe' ); }
    )
    {
        return new KV( 'link', u );
    }
// A key=value image option for one of the listed keys, returned as
// KV( key, value ). The value is optional.
img_attribute
= k:( 'page' / 'alt' / 'thumbnail' / 'thumb' ) '=' t:preprocessor_text? {
return new KV( k, t );
}
/***********************************************************
* Pre and xmlish tags
***********************************************************/
// Indented pre blocks differ from their non-indented (purely tag-based)
// cousins by having their contents parsed.
// Either an indented block wrapped in explicit pre tags, or one or more
// consecutive indented lines, which are wrapped in a pair of 'pre' tokens.
pre_indent
= pre_indent_in_tags
/ l:pre_indent_line ls:(sol pre_indent_line)* {
return [new TagTk( 'pre' )]
.concat( [l], ls
, [new EndTagTk( 'pre' )]);
}
// An indented pre block that is surrounded with pre tags. The pre tags are
// used directly.
//
// The 'pre' counter is raised before anything is consumed, so that the
// decrement in the fallback alternative is always paired with an increment,
// no matter which element of the first alternative fails. Raising it only
// after the opening tag meant that every failed attempt before that point
// (e.g. a line without leading space) still ran the fallback decrement.
pre_indent_in_tags
  = & { return stops.inc('pre'); }
    space+ // XXX: capture space for round-tripping
    "<pre"
    attribs:generic_attribute*
    ">"
    l:inlineline
    ls:(sol pre_indent_line)*
    "</pre>"
    {
        stops.dec('pre');
        return [ new TagTk( 'pre', attribs ) ]
            .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );
    }
  / & { return stops.dec('pre'); }
// One line of an indented pre block: a single leading space followed by an
// inlineline. Only the inlineline result is returned.
pre_indent_line = space l:inlineline {
return l;
}
/*
* Pre blocks defined using non-indented HTML tags only parse nowiki tags
* inside them, and convert other content to verbatim text. Nowiki inside pre
* is not functionally needed, but supported for backwards compatibility.
*
* An unclosed pre block runs until eof. The second alternative turns a
* stray closing tag into plain text.
*/
pre
= "<pre"
attribs:generic_attribute*
">"
// MediaWiki <pre> is special in that it converts all pre content to plain
// text.
ts:(t1:[^<]+ { return t1.join('') }
/ nowiki
/ !"</pre>" t2:. { return t2 })+
("</pre>" / eof) {
// return nowiki tags as well?
//console.warn('inpre');
return [ new TagTk( 'pre', attribs, { stx: 'html' } ) ]
.concat(ts, [ new EndTagTk( 'pre' ) ]);
}
/ "</pre>" { return "</pre>"; }
/* XXX: Extension tags can require a change in the tokenizer mode, which
* returns any text between extension tags verbatim. For now, we simply
* continue to parse the contained text and return the tokens. The original
* input source can be recovered from the source positions added on tag
* tokens. This won't however work in all cases. For example, a comment start
* (<!--) between extension tags would cause the remaining text to be consumed
* as a comment. To avoid this, we might need to look ahead for the end tag
* and limit the content parsing to this section.
*
* nowiki is tried before the generic tag production. */
xmlish_tag = nowiki / generic_tag
/*
* Nowiki treats anything inside it as plain text. It could thus also be
* defined as an extension that returns its raw input text, possibly wrapped
* in a span for round-trip information. The special treatment for nowiki in
* pre blocks would still remain in the grammar though, so overall handling it
* all here is cleaner.
*
* A complete nowiki section is returned as its text content, bracketed by
* two 'meta' tokens (typeof mw:tag, content 'nowiki' and '/nowiki'). A lone
* opening or closing tag is returned as plain text.
*/
nowiki
= "<nowiki>" nc:nowiki_content "</nowiki>" {
//console.warn( 'full nowiki return: ' + pp(nc));
return [ new SelfclosingTagTk( 'meta',
[
{k: 'typeof', v: 'mw:tag'},
{k: 'content', v: 'nowiki'}
])].concat( nc, [
new SelfclosingTagTk( 'meta',
[
{k: 'typeof', v: 'mw:tag'},
{k: 'content', v: '/nowiki'}
] )
] );
}
/ "<nowiki>" {
//console.warn('nowiki fallback');
return ['<nowiki>'];
}
/ "</nowiki>" {
//console.warn('nowiki end fallback');
return ['</nowiki>'];
}
// The raw content of a nowiki section, returned as a one-element list
// holding a single string. Pieces are: runs of non-'<' characters; a nested
// pre block, which is reproduced as text including its tags; or any single
// character that does not start a closing nowiki or pre tag.
nowiki_content
= ts:( t:[^<]+ { return t.join('') }
/ "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
//console.warn('nested pre in nowiki');
return ["<pre"].concat(p0, p1, [">"], p2, ["</pre>"]).join('');
}
/ (!("</"( "nowiki>" / "pre>")) c:. {
//console.warn('nowiki: single char' + c);
return c;
})
)* {
// return nowiki tags as well?
//console.warn('nowiki_content: return' + pp(ts));
return [ts.join('')];
}
// The list of HTML5 tags, mainly used for the identification of *non*-html
// tags. Non-html tags terminate otherwise tag-eating productions (see list
// below) in order to support potential extension tags.
// NOTE(review): alternatives are tried in order and nothing here checks for
// the end of the name, so a short name such as "a", "b", "i", "p", "q", "s"
// or "u" also matches the beginning of any longer name starting with that
// letter. Confirm that callers which use this as a lookahead are fine with
// prefix matches.
html5_tagnames
= "a" / "abbr" / "address" / "area" / "article"
/ "aside" / "audio" / "b" / "base" / "bdi" / "bdo" / "blockquote"
/ "body" / "br" / "button" / "canvas" / "caption" / "cite" / "code"
/ "col" / "colgroup" / "command" / "data" / "datalist" / "dd" / "del"
/ "details" / "dfn" / "div" / "dl" / "dt" / "em" / "embed" / "fieldset"
/ "figcaption" / "figure" / "footer" / "form"
/ "h1" / "h2" / "h3" / "h4" / "h5" / "h6" / "head" / "header" / "hgroup"
/ "hr" / "html" / "i" / "iframe" / "img" / "input" / "ins" / "kbd" / "keygen"
/ "label" / "legend" / "li" / "link" / "map" / "mark" / "menu" / "meta"
/ "meter" / "nav" / "noscript" / "object" / "ol" / "optgroup" / "option"
/ "output" / "p" / "param" / "pre" / "progress" / "q" / "rp" / "rt"
/ "ruby" / "s" / "samp" / "script" / "section" / "select" / "small"
/ "source" / "span" / "strong" / "style" / "sub" / "summary" / "sup"
/ "table" / "tbody" / "td" / "textarea" / "tfoot" / "th" / "thead" / "time"
/ "title" / "tr" / "track" / "u" / "ul" / "var" / "video" / "wbr"
// Tag names that are not part of the html5_tagnames list but are accepted
// alongside it (see nonhtml_endtag).
html_oldnames = "center" / "font" / "tt"
// Simple match on non-html endtags, for use in inline_breaks
// Matches '</', a name that does not begin with one of the html5_tagnames /
// html_oldnames alternatives, optional whitespace, and everything up to and
// including the closing '>'.
nonhtml_endtag
= '</'
! ( html5_tagnames / html_oldnames )
[^ >]+
( space / newline ) *
[^>]*
'>'
/* Generic XML-like tags
*
* These also cover extensions (including Cite), which will hook into the
* token stream for further processing. The content of extension tags is
* parsed as regular inline, but the source positions of the tag are added
* to allow reconstructing the unparsed text from the input. */
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
// Returns an EndTagTk for "</name ...>", a SelfclosingTagTk for
// "<name .../>" and a TagTk otherwise. dataAttribs.sourceTagPos holds the
// source range of the tag and dataAttribs.stx is set to 'html'.
generic_tag
= "<"
& { tagStartPos = pos; return true; } // remember the start position of this tag
end:"/"? name:[0-9a-zA-Z]+
attribs:generic_newline_attribute*
( space / newline ) *
selfclose:"/"?
">" {
name = name.join('');
var res;
if ( end != '' ) {
res = new EndTagTk( name, attribs );
} else if ( selfclose != '' ) {
res = new SelfclosingTagTk( name, attribs );
} else {
res = new TagTk( name, attribs );
}
if ( res.dataAttribs === undefined ) {
res.dataAttribs = {};
}
// tagStartPos was recorded after the "<" had been consumed, hence the - 1.
res.dataAttribs.sourceTagPos = [tagStartPos - 1, pos];
res.dataAttribs.stx = 'html';
return res;
}
// A generic attribute that can span multiple lines: optional leading
// whitespace (newlines included), a name, and an optional "=value" part.
// Always yields a KV pair; an attribute without a value gets ''.
generic_newline_attribute
  = s:( space / newline )*
    name:generic_attribute_name
    value:(( space / newline )*
           v:generic_attribute_newline_value { return v })?
    {
        // The value is handed to KV unchanged; the empty-string case needs no
        // special treatment because it maps to '' anyway.
        return new KV( name, value );
    }
// A single-line attribute: optional leading spaces, a name, and an optional
// "=value" part that must stay on the same line.
// Always yields a KV pair; an attribute without a value gets ''.
generic_attribute
  = s:space*
    name:generic_attribute_name
    value:(space*
           v:generic_attribute_value { return v })?
    {
        // FIXME: name might just be a template, which can expand to a key-value
        // pair later. We'll need to handle that in the AttributeTransformManager.
        //
        // The value is handed to KV unchanged; the empty-string case needs no
        // special treatment because it maps to '' anyway.
        return new KV( name, value );
    }
// ( Replaced by generic_attribute_name for template / parameter support. )
//// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
//// disallow newlines, | and {.
//generic_attribute_plain_name
// = n:[^ \t\0/"'>=\n|{]+ {
// return n.join('');
// }
// Attribute name with template / parameter support.
// The 'equal' stop is pushed while the name is parsed and popped again on
// both the success path and the failure path.
// NOTE(review): presumably the stop makes inline_breaks end the name at "=";
// inline_breaks is not visible here -- confirm.
generic_attribute_name
= & { return stops.push( 'equal', true ) }
// Never treat the "/>" that closes a self-closing tag as an attribute name.
! '/>'
name:attribute_preprocessor_text_line
{
stops.pop( 'equal' );
//console.warn( 'generic attribute name: ' + pp( name ) );
return name;
}
// Failure path: undo the push above.
// NOTE(review): this relies on stops.pop returning a falsy value so that the
// alternative itself fails instead of matching the empty string -- confirm.
/ & { return stops.pop( 'equal' ) }
// The "=value" part of a generic attribute, possibly spanning multiple lines:
// "=", optional whitespace (newlines included), then the value.
// Returns whatever xml_att_value produced.
generic_attribute_newline_value
= "=" (space / newline )* v:xml_att_value {
return v;
}
// The "=value" part of a generic but single-line attribute: "=", optional
// spaces (no newlines), then the value.
// Returns whatever att_value produced.
generic_attribute_value
= "=" space* v:att_value {
return v;
}
// Attribute value, quoted variants can span multiple lines.
// Alternatives are tried in order: single-quoted, double-quoted, unquoted.
// The surrounding quotes are consumed but not part of the returned value.
xml_att_value
= "'" t:attribute_preprocessor_text_single "'" { return t; }
/ '"' t:attribute_preprocessor_text_double '"' { return t; }
/ attribute_preprocessor_text
// Attribute value, restricted to a single line.
// Alternatives are tried in order: single-quoted, double-quoted, unquoted.
// The surrounding quotes are consumed but not part of the returned value.
att_value
= "'" t:attribute_preprocessor_text_single_line "'" { return t; }
/ '"' t:attribute_preprocessor_text_double_line '"' { return t; }
/ attribute_preprocessor_text_line
/*
 * A variant of generic_tag, but also checks if the tag name is a block-level
 * tag as defined in
 * http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and following
 * paragraphs.
 *
 * Yields a one-element token list holding an EndTagTk, SelfclosingTagTk or
 * TagTk with dataAttribs.stx = 'html'. The match is aborted (null is
 * returned) when the lower-cased tag name is not flagged in block_names.
 */
block_tag
  = "<" end:"/"?
    name:(cs:[a-zA-Z]+ { return cs.join('') })
    attribs:generic_newline_attribute*
    ( space / newline ) *
    selfclose:"/"?
    ">" {
        if (block_names[name.toLowerCase()] !== true) {
            // abort match if tag is not block-level
            return null;
        }
        var res;
        // A leading slash takes precedence over a trailing one.
        if ( end != '' ) {
            res = new EndTagTk( name, attribs );
        } else if ( selfclose != '' ) {
            res = new SelfclosingTagTk( name, attribs );
        } else {
            res = new TagTk( name, attribs );
        }
        // Same guard as in generic_tag: make sure dataAttribs exists before
        // writing to it, so a token without it cannot cause a TypeError.
        if ( res.dataAttribs === undefined ) {
            res.dataAttribs = {};
        }
        res.dataAttribs.stx = 'html';
        return [res];
    }
/*********************************************************
* Lists
*********************************************************/
// One or more list items. Each item is either a ";term:definition" pair
// (dtdd, tried first) or a plain item (li); every item after the first must
// be preceded by a start-of-line match.
lists = (dtdd / li) (sol (dtdd / li))*
// A single list item: one or more list marker characters followed by
// optional inline content, up to (but not consuming) the end of the line or
// of the input.
// Yields [ listItem start tag, content ]; the matched markers are stored on
// the tag's bullets property.
li = bullets:list_char+
     c:inlineline?
     &eolf
     {
        var item = new TagTk( 'listItem' );
        item.bullets = bullets;
        // No content matched: use an empty token list instead of ''.
        return [ item, c === '' ? [] : c ];
     }
// Definition list entry written on one line: optional leading list markers,
// ";", the term, ":", and an optional definition.
// Yields the term listItem tag, the term tokens, the definition listItem tag
// and then the definition tokens (or a fresh NlTk when there is none).
dtdd
  = bullets:(!(";" !list_char) lc:list_char { return lc })*
    ";"
    // Make the term stop at the colon.
    & {return stops.inc('colon');}
    c:inlineline
    ":"
    // Fortunately dtdds cannot be nested, so we can simply set the flag
    // back to 0 to disable it.
    & { stops.counters['colon'] = 0; return true;}
    d:inlineline?
    &eolf {
        // XXX: Converting a trailing space of the term into &nbsp; used to be
        // sketched here; it should be moved to a serializer.
        var prefix = bullets.join('');

        var termItem = new TagTk( 'listItem' );
        termItem.bullets = prefix + ";";

        var defItem = new TagTk( 'listItem' );
        defItem.bullets = prefix + ":";

        return [ termItem ].concat( c, [ defItem ], d || new NlTk() );
    }
  // Fall-back case to clear the colon flag
  / & { return true; } { stops.counters['colon'] = 0; return null; }
// A single list marker character: one of '*', '#', ':' or ';'.
list_char = [*#:;]
/*********************************************************************
* Tables
*
* Table productions are geared to support independent parsing of fragments in
* templates (the common table start / row / table end use case). The tokens
* produced by these fragments then match up to a table while building the
* DOM tree. For similar reasons, table rows do not emit explicit end tag
* tokens.
*
* The separate table_lines production is faster than moving those productions
* directly to block_lines.
*********************************************************************/
// One or more consecutive table lines, each further line starting at a
// start-of-line match. Returns the concatenated results of the lines.
// The 'table' stop is pushed while the lines are parsed and popped again on
// the success path as well as on the failure path.
table_lines
= //& { console.warn('enter table_lines: ' + input.substr(pos, 20)); return true; }
// Only proceed when not at an inline break, or when looking at '{{!}}'.
(! inline_breaks / & '{{!}}' )
r:(
& { return stops.push('table', true); }
tl:table_line
tls:( s:sol tl2:table_line { return s.concat(tl2); } )* {
// Success path: release the stop before returning the tokens.
stops.pop('table');
//console.warn('table_lines: ' + pp(tl.concat(tls)));
return tl.concat( tls );
}
// Failure path: undo the push above.
// NOTE(review): relies on stops.pop returning a falsy value so that this
// alternative fails -- confirm.
/ & { return stops.pop('table'); }
) { return r }
// A single line of table markup. Alternatives are tried in the order listed;
// the first one that matches wins.
// This production assumes start-of-line position!
table_line
= table_start_tag
/ table_row_tag
/ table_data_tags
/ table_heading_tags
/ table_caption_tag
/ table_end_tag
// Table start: "{" followed by a pipe, optional single-line attributes and
// optionally a table end on the same line.
// Yields the table start tag, followed by the end tag tokens if present.
table_start_tag
  = "{" pipe
    ta:generic_attribute*
    space*
    te:table_end_tag? // can occur somewhere in the middle of the line too
    {
        var tableStart = new TagTk( 'table' );
        if ( ta ) {
            tableStart.attribs = ta;
        }
        return te ? [ tableStart ].concat( te ) : [ tableStart ];
    }
// Table caption: a pipe followed by "+" and any inline content.
// Yields the content wrapped in caption start and end tags.
table_caption_tag
  = pipe "+"
    c:inline* {
        var captionTokens = [ new TagTk( 'caption' ) ];
        return captionTokens.concat( c, [ new EndTagTk( 'caption' ) ] );
    }
// Table row: a pipe followed by "-" and optional single-line attributes.
// Yields a tr start tag carrying the attributes, followed by the tokens of an
// implicit first cell if one follows on the next line.
table_row_tag
  = pipe "-"
    a:generic_attribute*
    space*
    // handle tables with missing table cells after a row
    td:( s:sol !( pipe / "!" ) tdt:table_data_tag { return s.concat(tdt); } )?
    {
        // We rely on our tree builder to close the row as needed. This is
        // needed to support building tables from fragment templates with
        // individual cells or rows.
        var rowTokens = [ new TagTk( 'tr', a ) ];
        return td ? rowTokens.concat( td ) : rowTokens;
    }
// One or more table cells on a single line: a pipe, the first cell, then any
// number of cells each introduced by a double pipe.
table_data_tags
  = pipe
    first:table_data_tag
    rest:( pipe_pipe cell:table_data_tag {
            // Mark cells that were introduced by a double pipe.
            cell[0].dataAttribs.stx_v = 'row';
            return cell;
        } )* {
        return first.concat( rest );
    }
// A single table cell (without its leading pipe): optional cell attributes
// followed by nested block content. Must not start with "}", "+" or "-".
// Yields a td start tag carrying the attributes, followed by the content.
table_data_tag
  = ! [}+-]
    a:table_cell_args?
    // use inline_breaks to break on tr etc
    td:nested_block*
    {
        // No attributes matched: use an empty list instead of ''.
        var cellAttribs = a === '' ? [] : a;
        var cellStart = new TagTk( 'td', cellAttribs );
        return [ cellStart ].concat( td );
    }
// One or more table heading cells on a single line: "!", the first heading,
// then any number of headings each introduced by "!!".
table_heading_tags
  = "!"
    first:table_heading_tag
    rest:( "!!" cell:table_heading_tag {
            // Mark headings that were introduced by a double exclamation mark.
            cell[0].dataAttribs.stx_v = 'row';
            return cell;
        } )* {
        return first.concat( rest );
    }
// A single table heading cell (without its leading "!"): optional cell
// attributes followed by nested block content.
// Yields a th start tag carrying the attributes, followed by the content.
table_heading_tag
  = a:table_cell_args?
    c:nested_block* {
        // No attributes matched: use an empty list instead of ''.
        var headingAttribs = a === '' ? [] : a;
        var headingStart = new TagTk( 'th', headingAttribs );
        return [ headingStart ].concat( c );
    }
// Table end: optional spaces, a pipe and "}".
// Yields a one-element list holding the table end tag.
table_end_tag
  = space* pipe "}" {
        return [ new EndTagTk( 'table' ) ];
    }
// Attributes of a table cell: zero or more single-line attributes, optional
// spaces and a single pipe that is not followed by another pipe.
// Returns the list of attributes. The 'tableCellArg' stop counter is
// incremented while matching and decremented again on both paths.
table_cell_args
= & { return stops.inc('tableCellArg'); }
as:generic_attribute* space* pipe !pipe {
stops.dec('tableCellArg');
//console.warn( 'tcargs' + JSON.stringify( as ) + " stops: " + pp(stops) );
return as;
}
// Failure path: undo the increment above.
// NOTE(review): relies on stops.dec returning a falsy value so that this
// alternative fails -- confirm.
/ & { return stops.dec('tableCellArg'); }
/* Tables, full-table version (no support for table fragments from templates)
 * Only left here for comparison purposes with the table_lines productions.
 * Remove when those work as intended!
 *
 * Yields the table start tag, the optional caption wrapped in caption tags,
 * the body rows and the table end tag.
 *
 * NOTE(review): the tokens returned by table_end (te) are only emitted when a
 * caption is present, and then directly after the caption end tag; without a
 * caption they are dropped. Confirm whether that is intended. */
table
  = tas:table_start space* c:table_caption? b:table_body? te:table_end {
        var tableStart = new TagTk( 'table' );
        if (tas.length > 0) {
            // FIXME: actually parse and build structure
            tableStart.attribs = tas;
        }

        var body = b !== '' ? b : [];

        var caption = [];
        if (c != '') {
            caption = [ new TagTk( 'caption' ) ]
                .concat(c, [new EndTagTk( 'caption' )], te);
        }

        return [tableStart].concat(caption, body, [new EndTagTk( 'table' )]);
    }
// Start of a table in the full-table production: "{", a pipe and optional
// single-line attributes. Returns the list of attributes.
// The 'table' stop is pushed here and stays pushed after a successful match;
// table_end pops it again.
table_start
= "{" pipe
res:(
& { stops.push('table', true); return true; }
ta:generic_attribute*
{
//dp("table_start " + pp(ta) + ", pos:" + pos);
return ta;
}
// Failure path: pop the stop again and fail (the predicate returns false).
// NOTE(review): generic_attribute* cannot fail by itself, so this alternative
// looks unreachable -- confirm.
/ & { stops.pop('table'); return false; } { return null; }
) { return res }
// Caption in the full-table production: a newline, a pipe, "+" and nested
// block content. Returns the newline token(s) followed by the content.
table_caption
= n:newlineToken
pipe "+" c:nested_block* {
return n.concat(c);
}
// Body of the full-table production. The first alternative handles a first
// row without an explicit row marker followed by regular rows; the second
// handles regular rows only. Returns the list of row results.
table_body
= //& { dp("table_body enter"); return true; }
firstrow:table_firstrow otherrows:table_row* {
/* dp('table first and otherrows: '
* + pp([firstrow].concat(otherrows))); */
return [firstrow].concat(otherrows);
}
/ otherrows:table_row* {
//dp('table otherrows: ' + pp(otherrows));
return otherrows;
}
// First row of the full-table production when it has no explicit row marker:
// one or more data or heading cells, wrapped in tr start and end tags.
table_firstrow
= td:(table_data / table_heading)+ {
//dp('firstrow: ' + pp(td));
return [ new TagTk( 'tr' )]
.concat(td, [ new EndTagTk( 'tr' )]);
}
// Row of the full-table production: a newline, a pipe, "-", optional row
// attributes (terminated by a single pipe) and any number of data or heading
// cells.
// Yields the newline token(s), a tr start tag carrying the attributes, the
// cells and a tr end tag.
table_row
  = //& { dp("table row enter"); return true; }
    n:newlineToken
    pipe "-"
    a:(as:generic_attribute+ space* pipe !pipe { return as } )?
    space*
    td:(table_data / table_heading)* {
        // The parsed attributes used to be dropped here. Pass them on to the
        // tr token, using an empty list when none were matched (same
        // convention as in table_data).
        if ( a === '' ) {
            a = [];
        }
        return n.concat([ new TagTk( 'tr', a )]
            , td, [ new EndTagTk( 'tr' )]);
    }
// Data cell of the full-table production. A cell starts either with a double
// pipe (same line) or with a newline followed by a pipe or by anything other
// than "!". It must not continue with "}", "+" or "-".
// Yields the newline token(s) (if any), a td start tag with the optional
// attributes, the cell content and a td end tag.
table_data
= //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
n:(pipe_pipe { return [] } / nt:newlineToken (pipe / !'!') { return nt })
! [}+-]
//& { dp('before attrib, pos=' + pos); return true; }
// Optional attributes, terminated by a single (not doubled) pipe.
a:(as:generic_attribute+ space* end_pipe { return as } )?
//& { dp('past attrib, pos=' + pos); return true; }
// use inline_breaks to break on tr etc
td:(
//& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
b:nested_block+ { return b }
)* {
// No attributes matched: use an empty list instead of ''.
if ( a == '' ) {
a = [];
}
//dp("table data result: " + pp(td) + ", attribs: " + pp(a));
return n.concat( [ new TagTk( 'td', a ) ]
, td, [ new EndTagTk( 'td' ) ] );
}
// Heading cell of the full-table production. A heading starts either with
// "!!" (same line) or at the start of a line with "!".
// Yields the start-of-line tokens (if any), a th start tag with the optional
// attributes, the inline content and a th end tag.
table_heading
= n:("!!" { return [] } / nl:sol "!" { return nl })
// Optional attributes, terminated by a single (not doubled) pipe.
a:(as:generic_attribute+ end_pipe { return as } )?
c:inline {
// No attributes matched: use an empty list instead of ''.
if ( a == '' ) {
a = [];
}
return n.concat( [ new TagTk( 'th', a )]
, c, [ new EndTagTk( 'th' ) ]);
}
// Raw attribute text of a heading or data cell: one or more text runs or
// single punctuation characters (the latter only when not at an inline
// break), terminated by a single pipe. Returns the list of matched pieces.
// NOTE(review): no visible production in this part of the grammar refers to
// this rule -- it may be unused.
thtd_attribs
// In particular, do not match [|\n]
= a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ end_pipe {
return a;
}
// End of a table in the full-table production: an optional newline followed
// by either a pipe and "}" or the end of the input.
// Pops the 'table' stop and yields the newline token(s), or an empty list
// when there was no newline.
table_end
  = nt:newlineToken? ( pipe "}" / eof ) {
        stops.pop('table');
        return nt || [];
    }
/*******************************************************************
* Text variants and other general productions
*******************************************************************/
/* All chars that cannot start syntactic structures in the middle of a line
* XXX: ] and other end delimiters should probably only be activated inside
* structures to avoid unnecessarily leaving the text production on plain
* content. */
// Any single character other than the ones listed in the class below
// (quote, '<', '~', '[', '{', newline characters, ':', ']', '}', '|', '!'
// and '=').
text_char = [^'<~[{\n\r:\]}|!=]
// A run of one or more text_char characters, returned as a single string.
text = t:text_char+ { return t.join(''); }
/* Legend
* ' quotes (italic/bold)
* < start of xmlish_tag
* ~ signatures/dates
* [ start of links
* { start of parser functions, transclusion and template args
* \n all sorts of block-level markup at start of line
* \r ditto
* h http(s) urls
* n nntp(s) urls
* m mailto urls
* I start of ISBN 10/13 auto links
* P start of PMID auto links
* R start of RFC auto links
*
* _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related)
* ! and | table cell delimiters, might be better to specialize those
* = headings - also specialize those!
*
* The following chars are also included for now, but only apply in some
* contexts and should probably be enabled only in those:
* : separate definition in ; term : definition
* ] end of link
* } end of parser func/transclusion/template arg
*/
// Inline text that may contain autolinks, behavior switches and HTML
// entities. One or more of the following, tried in order at each position:
// a run of characters that cannot start any of those structures, an
// autolink, a behavior switch, an HTML entity, a space directly before ':'
// and finally any single text_char.
urltext = ( t:[^'<~[{\n\rIPRfghimnstw_|!:\]} &=]+ { return t.join(''); }
/ & url_chars autolink
/ & ('__') behavior_switch
/ htmlentity
// Convert a space directly before ':' into a non-breaking space (U+00A0).
// XXX: This should be moved to a serializer
/ ' ' & ':' { return "\u00a0"; }
/ t:text_char )+
/*
'//', // for protocol-relative URLs, but not in text!
'ftp://',
'git://',
'gopher://',
'http://',
'https://',
'irc://',
'ircs://', // @bug 28503
'mailto:',
'mms://',
'news:',
'nntp://', // @bug 3808 RFC 1738
'svn://',
'telnet://', // Well if we're going to support the above.. -ævar
'worldwind://',
*/
// Old version
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
// Experimental tweaked version: avoid expensive single-char substrings
// This did not bring the expected performance boost, however.
//text = [A-Za-z0-9,._ -] {
// textStart = pos;
//
// var res = input.substr(textStart - 1, inputLength)
// .match(/[A-Za-z0-9,._ -]+/)[0];
// pos = pos + (res.length - 1);
// return res
// }
// An HTML entity reference: "&", one or more of '#', digits or ASCII letters,
// then ";". The re-assembled entity text is passed to unentity and its result
// is returned.
htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
    var entityText = "&" + c.join('') + ";";
    return unentity( entityText );
}
// One or more spaces or tabs, returned as a single string.
space
= s:[ \t]+ { return s.join(''); }
// Optional horizontal whitespace as a token list: a one-element list holding
// the whitespace string, or an empty list when there is none.
optionalSpaceToken
  = s:space* {
        return s.length ? [ s.join('') ] : [];
    }
// Start of line: either a newline or the very beginning of the input,
// optionally followed by a comment and by an includeonly / noinclude tag.
// Yields the newline token(s), then the comment (plus the raw newline after
// it, if any), then the start or end tag token for the include tag.
sol
  = nl:(newlineToken / & { return pos === 0; } { return [] })
    // Eat multi-line comment, so that syntax after still matches as if it
    // was actually preceded by a newline
    cn:( c:comment n:newline? {
            return n !== '' ? [c, n] : [c];
        }
    )?
    // Eat includeonly/noinclude at start of line, so that start-of-line
    // syntax after it still matches
    ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
    {
        // No comment matched: use an empty list instead of ''.
        var commentTokens = cn === '' ? [] : cn;

        var includeTokens = [];
        if ( ni !== '' ) {
            // ni holds the optional slash and the tag name.
            var IncludeTk = ni[0] === '/' ? EndTagTk : TagTk;
            includeTokens = [ new IncludeTk( ni[1] ) ];
        }

        return nl.concat( commentTokens, includeTokens );
    }
// A single newline, comment or run of spaces / tabs (tried in that order).
nl_comment_space = newline / comment / space
/**
* noinclude / includeonly / onlyinclude productions. These are normally
* handled by the generic_tag production, except where generic tags are not
* allowed -- for example in directives, which may appear in various
* attribute names and values.
*
* Example test case:
* {|
* |-<includeonly>
* foo
* </includeonly>
* |Hello
* |}
*/
// Any of the three include-limiting sections, tried in the order listed.
include_limits = noinclude / includeonly / onlyinclude
// A complete <noinclude>...</noinclude> section with inline content.
// Yields the content wrapped in noinclude start and end tags. The
// 'noinclude' stop counter is incremented while matching and decremented
// again on both the success path and the failure path.
noinclude
  = & { return stops.inc('noinclude'); }
    "<noinclude>"
    i:inline
    "</noinclude>"
    {
        stops.dec('noinclude');
        var startTk = new TagTk('noinclude');
        var endTk = new EndTagTk('noinclude');
        return [ startTk ].concat( i, [ endTk ] );
    }
  / & { return stops.dec('noinclude'); }
// A complete <includeonly>...</includeonly> section with inline content.
// Yields the content wrapped in includeonly start and end tags. The
// 'includeonly' stop counter is incremented while matching and decremented
// again on both the success path and the failure path.
includeonly
  = & { return stops.inc('includeonly'); }
    "<includeonly>"
    i:inline
    "</includeonly>"
    {
        stops.dec('includeonly');
        var startTk = new TagTk('includeonly');
        var endTk = new EndTagTk('includeonly');
        return [ startTk ].concat( i, [ endTk ] );
    }
  / & { return stops.dec('includeonly'); }
// A complete <onlyinclude>...</onlyinclude> section with inline content.
// Yields the content wrapped in onlyinclude start and end tags. The
// 'onlyinclude' stop counter is incremented while matching and decremented
// again on both the success path and the failure path.
onlyinclude
  = & { return stops.inc('onlyinclude'); }
    "<onlyinclude>"
    i:inline
    "</onlyinclude>"
    {
        stops.dec('onlyinclude');
        var startTk = new TagTk('onlyinclude');
        var endTk = new EndTagTk('onlyinclude');
        return [ startTk ].concat( i, [ endTk ] );
    }
  / & { return stops.dec('onlyinclude'); }
// End of input: succeeds without consuming anything when isEOF reports that
// the current position is at the end. Returns true.
eof = & { return isEOF(pos); } { return true; }
// A line break: either "\n" or "\r\n". A lone "\r" is not matched.
newline
= '\n' / '\r\n'
// A line break, returned as a one-element list holding a fresh NlTk.
newlineToken = newline { return [new NlTk()] }
// End of line or end of file: a line break (which is consumed) or the end of
// the input.
eolf = newline / eof
// 'Preprocessor' directive -- higher-level things that can occur in otherwise
// plain-text content. Alternatives are tried in the order listed.
directive
= comment
/ nowiki
/ tplarg_or_template
/ htmlentity
/ include_limits
// Plain text, but can contain templates, template arguments, comments etc-
// all stuff that is normally handled by the preprocessor
// Returns either a list of tokens, or a plain string (if nothing is to be
// processed).
// Each piece is either a run of characters outside the class below or, when
// not at an inline break, a directive or a single text_char. The pieces are
// passed through flatten.
// NOTE(review): '{' is listed twice in the character class; harmless.
preprocessor_text
= r:( t:[^<~[{\n\r\t|!\]}{ &=]+ { return t.join(''); }
/ !inline_breaks (
directive
/ text_char )
)+ {
return flatten ( r );
}
// Variant of preprocessor_text that never matches a space: the fast-path
// class excludes it and the text_char fallback is guarded by !' '.
// The pieces are passed through flatten_string.
spaceless_preprocessor_text
= r:( t:[^'<~[{\n\r|!\]}{\t &=]+ { return t.join(''); }
/ !inline_breaks (
directive
/ !' ' text_char )
)+ {
return flatten_string ( r );
}
// Variant of preprocessor_text for use inside wikilinks: the text_char
// fallback refuses to step over "]]". The pieces are passed through
// flatten_stringlist.
wikilink_preprocessor_text
= r:( t:[^<~[{\n\r\t|!\]}{ &=]+ { return t.join(''); }
/ !inline_breaks ( directive / !"]]" text_char )
)+ {
return flatten_stringlist ( r );
}
// Variant of preprocessor_text for the URL part of external links. The pieces
// are passed through flatten_string.
extlink_preprocessor_text
// added special separator character class inline: separates url from
// description / text
// (the class excludes the space and a range of Unicode space characters)
= r:( t:[^'<~[{\n\r|!\]}{\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); }
/ !inline_breaks ( directive / no_punctuation_char )
/// urlencoded_char
// !inline_breaks no_punctuation_char
// '.', ':' and ',' only count when not followed by whitespace or the end of
// the line / input.
/ s:[.:,] !(space / eolf) { return s }
/ [&%] )+ {
return flatten_string ( r );
}
// Attribute values with preprocessor support
// Unquoted attribute value that may span lines. Each piece is either a run of
// plain characters (checked against inline_breaks one character at a time)
// or, when not at an inline break, a directive or a single '&' / '%'.
// The pieces are passed through flatten_string.
attribute_preprocessor_text
= r:( ts:(!inline_breaks t:[^=<>{}\n\r&'"\t ] {return t})+ { return ts.join(''); }
/ !inline_breaks (
directive
/ !inline_breaks [&%]
)
)+
{
//console.warn('prep');
return flatten_string ( r );
}
// Content of a single-quoted attribute value; may be empty and may span
// lines. Each piece is either a run of characters other than braces, '&' and
// the single quote or, when not at an inline break, a directive or a single
// opening brace / '&'. The pieces are passed through flatten_string.
// NOTE(review): a closing brace is excluded from the fast-path class but has
// no fallback, so a lone closing brace ends the value -- confirm that this is
// intended.
attribute_preprocessor_text_single
= r:( t:[^{}&']+ { return t.join(''); }
/ !inline_breaks (
directive
/ [{&] )
)*
{
return flatten_string ( r );
}
// Content of a double-quoted attribute value; may be empty and may span
// lines. Each piece is either a run of characters other than braces, '&' and
// the double quote or, when not at an inline break, a directive or a single
// opening brace / '&'. The pieces are passed through flatten_string.
// NOTE(review): a closing brace is excluded from the fast-path class but has
// no fallback, so a lone closing brace ends the value -- confirm that this is
// intended.
attribute_preprocessor_text_double
= r:( t:[^{}&"]+ { return t.join(''); }
/ !inline_breaks (
directive
/ [{&] )
)*
{
//console.warn( 'double:' + pp(r) );
return flatten_string ( r );
}
// Variants with the entire attribute on a single line
// Unquoted single-line attribute text (also used for attribute names). Each
// piece is either a run of characters outside the class below or, when not at
// an inline break, a directive or any single character that is not a newline,
// a space / tab, '[' or '>'. The pieces are passed through flatten_string.
// NOTE(review): '{' is listed twice in the character class; harmless.
attribute_preprocessor_text_line
= r:( ts:[^=<>{\n\r&'"\t \[\]|{}]+ { return ts.join(''); }
/ !inline_breaks
t:(
directive
/ !(newline / space / [\[>]) c:. {
//console.warn( 'aptl: ' + pp(c) );
return c
}
) { return t }
)+
{
//console.warn('prep');
return flatten_string ( r );
}
// Content of a single-quoted attribute value for single-line attributes; may
// be empty. Same structure as attribute_preprocessor_text_single, with an
// extra !'\n' guard in front of the opening brace / '&' fallback.
// The pieces are passed through flatten_string.
// NOTE(review): the fast-path class does not exclude newline characters, so a
// value can still run across lines through that alternative -- confirm.
attribute_preprocessor_text_single_line
= r:( t:[^{}&']+ { return t.join(''); }
/ !inline_breaks (
directive
/ !'\n' [{&] )
)* {
return flatten_string ( r );
}
// Content of a double-quoted attribute value for single-line attributes; may
// be empty. Same structure as attribute_preprocessor_text_double, with an
// extra !'\n' guard in front of the opening brace / '&' fallback.
// The pieces are passed through flatten_string.
// NOTE(review): the fast-path class does not exclude newline characters, so a
// value can still run across lines through that alternative -- confirm.
attribute_preprocessor_text_double_line
= r:( t:[^{}&"]+ { return t.join(''); }
/ !inline_breaks (
directive
/ !'\n' [{&] )
)* {
//console.warn( 'double:' + pp(r) );
return flatten_string ( r );
}
// Special-case support for those pipe templates
// A pipe: either the literal "|" or the literal text "{{!}}".
pipe = "|" / "{{!}}"
// A single pipe that is not directly followed by a second pipe of the same
// spelling ("|" not followed by "|", or "{{!}}" not followed by "{{!}}").
end_pipe = "|" ! "|" / "{{!}}" ! "{{!}}"
// A double pipe: either "||" or "{{!}}{{!}}". Mixed spellings do not match.
pipe_pipe = "||" / "{{!}}{{!}}"
// Similar, for tables..
// An exclamation mark: either the literal "!" or the literal text "{{;}}".
// NOTE(review): no visible production in this part of the grammar refers to
// this rule, and the "{{;}}" spelling could not be verified here -- confirm.
exclam = "!" / "{{;}}"
/* Tabs do not mix well with the hybrid production syntax */
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent : */