mediawiki-extensions-Visual.../modules/parser/pegTokenizer.pegjs.txt

1983 lines
58 KiB
Plaintext
Raw Normal View History

/**
* Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
* chunks of tokens (one chunk per top-level block matched) and eventually an
* end event. Tokens map to HTML tags as far as possible, with custom tokens
* used where further processing on the token stream is needed.
*
* @author Gabriel Wicke <gwicke@wikimedia.org>
* @author Brion Vibber <brion@wikimedia.org>
*/
2011-11-02 21:09:19 +00:00
{
2011-11-30 13:42:26 +00:00
/* FIXME: move these static helper functions into a separate module.
 * Unfortunately, the following does not work:
 *   var tu = require('./mediawiki.tokenizer.utils.js');
 *   console.warn(tu.flatten([]));
 * Using exports in the module gets a bit further, but accesses to
 * tu.flatten inside productions still fail. The functions therefore live
 * here until a solution is found:
 */
/* Static utilities */
// Flatten a list of lists.
//var simple_flatten = function ( e ) {
// // Fast single-level flatten:
// //return [].concat.apply([], e);
//
// var es = [];
// // flatten sub-arrays
// for(var i = 0, length = e.length; i < length; i++) {
// var ei = e[i];
// if ($.isArray(ei))
// es = es.concat(flatten(ei));
// else
// es.push(ei);
// };
// return es;
//};
/**
var old_flatten = function ( e ) {
// Fast single-level flatten:
//return [].concat.apply([], e);
2011-11-30 13:42:26 +00:00
var es = [];
// flatten sub-arrays
var chunkStart = null,
modified = false;
for(var i = 0, l = e.length; i < l; i++) {
if ( e[i].constructor === Array ) {
if ( chunkStart !== null ) {
es = es.concat( e.slice( chunkStart, i ), flatten( e[i] ) );
chunkStart = null;
} else {
es = es.concat(flatten(e[i]));
}
modified = true;
} else if ( chunkStart === null ) {
chunkStart = i;
}
2011-11-30 13:42:26 +00:00
};
if ( modified ) {
if ( chunkStart !== null ) {
es = es.concat( e.slice( chunkStart, e.length ) );
}
return es;
} else {
return e;
}
2011-11-30 13:42:26 +00:00
};
**/
/**
 * Flatten an arbitrarily nested list into a single flat list.
 *
 * If no element of `e` is an Array, `e` itself is returned (no copy is
 * made). Otherwise a new array is returned and `e` is left untouched.
 *
 * @param {Array} e: The (possibly nested) list.
 * @returns {Array}: `e` itself when already flat, else a new flat array.
 */
var flatten = function (e) {
    // Append every non-array leaf below `list` to `acc`, depth first.
    function appendLeaves(list, acc) {
        var count = list.length;
        for (var idx = 0; idx < count; idx++) {
            var item = list[idx];
            if (item.constructor === Array) {
                appendLeaves(item, acc);
            } else {
                acc.push(item);
            }
        }
    }

    var total = e.length;
    for (var first = 0; first < total; first++) {
        if (e[first].constructor === Array) {
            // First nested element found: copy the flat prefix once, then
            // flatten the remainder into the copy.
            var flat = e.slice(0, first);
            appendLeaves(e.slice(first), flat);
            return flat;
        }
    }
    // Nothing nested: hand back the input unchanged.
    return e;
};
2011-11-30 13:42:26 +00:00
/**
 * Flatten `c` and merge adjacent strings (see flatten_stringlist). When the
 * result consists of exactly one string, that string is returned directly;
 * otherwise the merged list is returned.
 */
var flatten_string = function ( c ) {
    var parts = flatten_stringlist( c );
    var isLoneString = parts.length === 1 && parts[0].constructor === String;
    return isLoneString ? parts[0] : parts;
};
/**
 * Flatten `c`, then collapse each run of adjacent strings into one string.
 * Empty strings are dropped; non-string items are kept in place and
 * terminate the current run.
 *
 * @param {Array} c: Possibly nested list of strings and other items.
 * @returns {Array}: New flat list with merged strings.
 */
var flatten_stringlist = function ( c ) {
    var merged = [];
    var pending = '';
    // Move the accumulated text (if any) into the result.
    var flushPending = function () {
        if ( pending !== '' ) {
            merged.push( pending );
            pending = '';
        }
    };

    var items = flatten( c );
    var count = items.length;
    for ( var idx = 0; idx < count; idx++ ) {
        var item = items[idx];
        if ( item.constructor !== String ) {
            flushPending();
            merged.push( item );
        } else if ( item !== '' ) {
            pending += item;
        }
    }
    flushPending();
    return merged;
};
2011-11-30 13:42:26 +00:00
// Remove escaped quotes from attributes etc
2011-12-30 09:35:57 +00:00
// This was in the original PEG parser, but could not find anything in
// MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke
2011-11-30 13:42:26 +00:00
/**
 * Remove the backslash from every backslash-escaped quote character.
 *
 * @param {String} quotec: The quote character (for example '"' or "'").
 * @param {String} text: Text that may contain `\` + quotec sequences.
 * @returns {String}: Text with each `\` + quotec replaced by quotec.
 */
var unquote = function (quotec, text) {
    // String.replace with a string pattern only touches the first match, so
    // split/join is used to unescape all occurrences without building a regex.
    return text.split('\\' + quotec).join(quotec);
};
// Decode html entities. In a browser, this should only be fed the entity,
// not untrusted html! XXX: replace with safer version.
// Decode an HTML entity by letting jQuery parse it into a detached <div>
// and reading the resulting text back.
var unentity = function ( entity ) {
    var scratch = $( "<div/>" );
    scratch.html( entity );
    return scratch.text();
};
2011-11-30 13:42:26 +00:00
// Debug print with global switch
2011-11-02 21:09:19 +00:00
/**
 * Debug print. Output is disabled by the hard-coded `false` switch below;
 * flip it to `true` to forward messages to console.warn.
 *
 * @param {Mixed} msg: The message to print.
 */
var dp = function ( msg ) {
    if ( false ) {
        console.warn(msg);
    }
};
// Pretty-print a value as JSON, indented by two spaces per level.
var pp = function ( s ) {
    var indentWidth = 2;
    return JSON.stringify( s, null, indentWidth );
};
// Simple string formatting using '%s'
/**
 * Minimal string formatting: each '%s' in `format` is replaced by the next
 * additional argument, in order. Placeholders without a matching argument
 * are replaced by the empty string.
 */
var sprintf = function ( format ) {
    var values = Array.prototype.slice.call( arguments, 1 );
    var next = 0;
    return format.replace( /%s/g, function () {
        if ( next < values.length ) {
            return values[next++];
        }
        return '';
    } );
};
2011-11-30 13:42:26 +00:00
/**
* Determine if a string represents a valid ISBN-10 or ISBN-13 identifier
*
* @static
* @method
* @param {String} isbn: The string to check
* @returns {Boolean}: True if valid ISBN, false otherwise.
*/
var isValidISBN = function ( isbn ) {
    // Normalize: upper-case, then keep only digits and the 'X' check
    // character (drops dashes, spaces and any other separators).
    var normalized = isbn.toUpperCase().replace(/[^\dX]/g, '');
    // Only the length is checked. Check-digit verification is deliberately
    // not performed because it is stricter than the standard parser; the
    // previously unreachable checksum code after this return was removed.
    return normalized.length === 10 || normalized.length === 13;
};
2011-11-30 13:42:26 +00:00
/* End static utilities */
/*
* Flags for specific parse environments (inside tables, links etc). Flags
* trigger syntactic stops in the inline_breaks production, which
* terminates inline and attribute matches. Flags merely reduce the number
* of productions needed: The grammar is still context-free as the
* productions can just be unrolled for all combinations of environments
* at the cost of a much larger grammar.
*/
/**
 * Bookkeeping for syntactic stop flags.
 *
 * Two kinds of flags are tracked:
 *  - counters: cumulative flags, raised with inc() and lowered with dec();
 *  - stacks: nested, non-cumulative flags where only the top value counts.
 *
 * `key` is a string summarizing which counters are positive and which
 * stacks currently have a truthy top value; it is refreshed on every change.
 */
function SyntaxStops () {
    this.counters = {};
    this.stacks = {};
    this.key = '';
    this._counterKey = '';
    this._stackKey = '';
}

/**
 * Increment the counter for `flag` (starting at 1 when unseen).
 * @returns {Boolean}: Always true, so the call can end a positive predicate.
 */
SyntaxStops.prototype.inc = function(flag) {
    if (this.counters[flag] !== undefined) {
        this.counters[flag]++;
    } else {
        this.counters[flag] = 1;
    }
    this._updateCounterKey();
    return true;
};

/**
 * Decrement the counter for `flag`. The flag is expected to have been
 * raised with inc() before.
 * @returns {Boolean}: Always false, so the call can end a failing predicate.
 */
SyntaxStops.prototype.dec = function(flag) {
    this.counters[flag]--;
    this._updateCounterKey();
    return false;
};

/**
 * @returns {Number|undefined}: Current counter value for `name`, or
 * undefined when the counter was never raised.
 */
SyntaxStops.prototype.onCount = function ( name ) {
    return this.counters[name];
};

/**
 * A stack for nested, but not cumulative syntactic stops.
 * Example: '=' is allowed in values of template arguments, even if those
 * are nested in attribute names.
 *
 * @returns {Boolean}: Always true.
 */
SyntaxStops.prototype.push = function ( name, value ) {
    if( this.stacks[name] === undefined ) {
        this.stacks[name] = [value];
    } else {
        this.stacks[name].push( value );
    }
    this._updateStackKey();
    return true;
};

/**
 * Remove the top value from the stack `name`.
 * @returns {Boolean}: Always false.
 * @throws {String} When no stack was ever created for `name`.
 */
SyntaxStops.prototype.pop = function ( name ) {
    if( this.stacks[name] !== undefined ) {
        this.stacks[name].pop();
    } else {
        throw "SyntaxStops.pop: unknown stop for " + name;
    }
    this._updateStackKey();
    return false;
};

/**
 * @returns {Mixed}: The top value of stack `name`, or false when the stack
 * does not exist or is empty.
 */
SyntaxStops.prototype.onStack = function ( name ) {
    var stack = this.stacks[name];
    if ( stack === undefined || stack.length === 0 ) {
        return false;
    } else {
        return stack[stack.length - 1];
    }
};

// Recompute both halves of the key.
SyntaxStops.prototype._updateKey = function ( ) {
    this._updateCounterKey();
    this._updateStackKey();
};

// Rebuild the counter half of the key from all counters that are > 0.
SyntaxStops.prototype._updateCounterKey = function ( ) {
    var counters = [];
    // `k` is declared locally: the undeclared loop variable used before
    // leaked a global and throws a ReferenceError in strict mode.
    for ( var k in this.counters ) {
        if ( this.counters[k] > 0 ) {
            counters.push(k);
        }
    }
    this._counterKey = JSON.stringify(counters);
    this.key = this._counterKey + this._stackKey;
};

// Rebuild the stack half of the key from all stacks with a truthy top value.
SyntaxStops.prototype._updateStackKey = function ( ) {
    var stackStops = [];
    for ( var k in this.stacks ) {
        if ( this.onStack( k ) ) {
            stackStops.push(k);
        }
    }
    this._stackKey = JSON.stringify(stackStops);
    this.key = this._counterKey + this._stackKey;
};
// The SyntaxStops instance used by the predicates in the productions below
// (stops.inc/dec/push/pop/onStack).
var stops = new SyntaxStops();
// Start position of top-level block
// Could also provide positions for lower-level blocks using a stack.
var blockStart = 0;
// Start position of generic tag production
var tagStartPos = 0;
2012-03-02 13:36:37 +00:00
// Stack of source positions
/**
 * Per-key stacks of source positions.
 *
 * push( key, pos ) records `pos` under `key` and returns true.
 * pop( key, pos ) removes the most recent position recorded under `key` and
 * returns the pair [recorded, pos]; it throws a string when nothing is
 * recorded for `key`.
 */
var posStack = {
    positions: {},
    push: function ( key, pos ) {
        var entries = this.positions[key];
        if ( entries !== undefined ) {
            entries.push( pos );
        } else {
            this.positions[key] = [pos];
        }
        return true;
    },
    pop: function ( key, pos ) {
        var entries = this.positions[key];
        if ( entries !== undefined && entries.length ) {
            return [ entries.pop(), pos ];
        }
        throw "Tried to pop unknown position for " + key;
    }
};
// Length of the input, read once up front.
var inputLength = input.length;
// Pseudo-production: true exactly when `pos` equals the cached input length,
// i.e. when `pos` is the end-of-input offset.
var isEOF = function ( pos ) {
    return inputLength === pos;
};
// text start position
var textStart = 0;
// Define block-level tags in JS, so we can use toLowerCase to match tags
// case-independently. This would be quite ugly (and possibly slower) if
// done manually in the grammar.
// Lookup table of block-level tag names: maps each (lower-case) name to true.
var block_names = (function () {
    var lookup = {};
    [
        "p", "table", "td", "tr", "ul", "ol",
        "li", "dl", "dt", "dd", "div", "center",
        "blockquote"
    ].forEach( function ( tagName ) {
        lookup[tagName] = true;
    } );
    return lookup;
})();
// Keep a reference to the initializer's `this`.
var self = this;
/*
* Emit a chunk of tokens to our consumers. Once this has been done, the
* current expression can return an empty list (true).
*
* The consumer callback is taken from the third parse argument
* (__parseArgs[2]) and is invoked with the token list as its only argument.
*/
var emitChunk = function (tokens) {
__parseArgs[2](tokens);
};
2011-11-02 21:09:19 +00:00
}
/*********************************************************
* The top-level production
*********************************************************/
2011-11-02 21:09:19 +00:00
// Entry rule: matches all toplevelblocks followed by trailing newlines, then
// emits one final chunk containing a single EOFTk. The rule itself returns an
// empty list; the matched blocks (e) are not returned.
start
= e:toplevelblock* newline* {
// end is passed inline as a token, as well as a separate event for now.
emitChunk( [ new EOFTk( ) ] );
return []; //flatten(e);
}
2011-11-02 21:09:19 +00:00
/*
* A document (start production) is a sequence of toplevelblocks. Tokens are
* emitted in chunks per toplevelblock to avoid buffering the full document.
*/
// Matches one block (unless at eof), flattens its tokens, tags the first
// token with the block's source range, optionally traces the tokens, and
// emits them as one chunk. Returns true rather than the tokens.
toplevelblock
= !eof b:block {
// Clear the tokenizer's backtracking cache after matching each
// toplevelblock. There won't be any backtracking as a document is just a
// sequence of toplevelblocks, so the cache for previous toplevelblocks
// will never be needed. This frees up a lot of memory: In the case of
// [[:en:Barack Obama]], the difference is about 360M vs. 110M.
cache = {};
b = flatten(b);
// Add source offsets for round-tripping. XXX: Add these not just for
// toplevelblocks!
if ( b.length ) {
var bs = b[0];
// A string token without an attribs property is replaced by a String
// object so that the dataAttribs property can be attached to it below.
if ( bs.constructor === String && bs.attribs === undefined ) {
b[0] = new String( bs );
bs = b[0];
}
if (bs.dataAttribs === undefined) {
bs.dataAttribs = {};
}
// bsp: [start, end] source offsets of this toplevelblock.
bs.dataAttribs.bsp = [pos0, pos];
}
// Optional debug trace, enabled through the env object of the fourth
// parse argument.
if (__parseArgs[3].env.trace) {
console.warn("---- <chunk:tokenized> ----");
for (var i = 0, n = b.length; i < n; i++) {
console.warn(b[i].toString());
}
console.warn("---- </chunk:tokenized> ----");
}
// Emit tokens for this toplevelblock. This feeds a chunk to the parser
// pipeline.
emitChunk( flatten( b ) );
// We don't return any tokens to the start production to save memory. We
// just emitted them already to our consumers.
return true;
}
/*
* The actual contents of each block.
*/
2011-11-02 21:09:19 +00:00
// One block-level construct. Alternatives are tried in order: line-based
// blocks; constructs that start with '<' (pre, a comment followed by end of
// line/file, nowiki, or a block tag); a paragraph; a bare inlineline; and
// finally a lone start-of-line match.
block
= block_lines
/ & '<' r:( pre // tag variant can start anywhere
/ comment &eolf
/ nowiki
// avoid a paragraph if we know that the line starts with a block tag
/ bt:block_tag { return [bt] }
) { return r; }
/ paragraph
// Inlineline includes generic tags; wrapped into paragraphs in token
// transform and DOM postprocessor
/ inlineline
/ sol
/*
* A block nested in other constructs. Avoid eating end delimiters for other
* constructs by checking against inline_breaks first: the block is only
* attempted where inline_breaks does not match, and its result is returned
* unchanged.
*/
nested_block = !inline_breaks b:block { return b }
/*
* Line-based block constructs.
*/
// A start-of-line match, an optional blank line, then one block_line. The
// results of all three parts are concatenated into one token list.
block_lines
= s:sol
// eat an empty line before the block
s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
bl:block_line {
//console.warn( pp(s));
// s2 is '' when the optional blank line did not match; use an empty list
// in that case so that concat() does not add an empty string.
var s2_ = (s2 !== '') ? s2 : [];
return s.concat(s2_, bl);
}
/*
* Block structures with start-of-line wiki syntax
*/
// A single line-based block: heading, list, table lines or a run of block
// tags (both optionally preceded by whitespace), indent-pre (unless the
// 'nopre' counter is set), pre, or a horizontal rule.
block_line
  = h
  / lists
  / st:optionalSpaceToken
    r:( & [{}|!] tl:table_lines { return tl; }
      // tag-only lines should not trigger pre either
      / bts:(bt:block_tag stl:optionalSpaceToken { return bt.concat(stl) })+
        &eolf { return bts }
      ) {
          return st.concat(r);
      }
  / ! { return stops.counters.nopre } pre_indent
  / pre
  / "----" d:"-"* {
      // Four dashes make a rule; any further dashes are only counted so
      // that they can be reproduced later.
      var dataAttribs = { tsr: [pos0, pos] };
      if (d.length > 0) {
          dataAttribs.extra_dashes = d.length;
      }
      return new SelfclosingTagTk( "hr", [], dataAttribs );
  }
/*
* A paragraph. We don't emit 'p' tokens to avoid issues with template
* transclusions, <p> tags in the source and the like. Instead, we perform
* some paragraph wrapping on the token stream and the DOM.
*/
// Two consecutive start-of-line matches followed by an inlineline. The three
// results are concatenated; no 'p' token is produced here (the TagTk('p') is
// commented out).
paragraph
= s1:sol s2:sol c:inlineline {
return s1.concat(s2, /* [new TagTk('p')],*/ c);
}
// Optional whitespace that is directly followed by a newline (the newline is
// only looked at, not consumed). Yields the whitespace tokens plus a
// self-closing 'br' token.
br = s:optionalSpaceToken &newline {
return s.concat(
[
new SelfclosingTagTk( 'br', [], {tsr: [pos0, pos]} )
]
);
}
/*
* Syntax stops: Avoid eating significant tokens for higher-level productions
* in nested inline productions.
*
* Repeated testing of flags is not terribly efficient. See new and faster
* version below.
*/
/*
* Syntax stops: Avoid eating significant tokens for higher-level productions
* in nested inline productions.
*/
// Lookahead-only rule (consumes nothing). It succeeds when the next
// character is one of the candidates in the character class AND the
// inline_breaks helper supplied through the fourth parse argument returns a
// non-null result for the current input, position and stop flags.
inline_breaks
= & [=|!}{:\r\n\]<]
//& {
// // Important hack: disable caching for this production, as the default
// // cache key does not take into account flag states!
// cacheKey += 'foo';
// return true;
//}
& {
//console.warn('ilbf: ' + input.substr(pos, 5) );
//if ( null !== __parseArgs[3].inline_breaks( input, pos, stops ) ) {
// console.warn('ilb break: ' + pp(input.substr(pos, 5)) );
//} else {
// console.warn('ilb no break: ' + pp(input.substr(pos, 5)) );
//}
return null !== __parseArgs[3].inline_breaks( input, pos, stops )
}
// One or more pieces of inline content: urltext, or (where inline_breaks does
// not match) an inline_element or any single character. The pieces are
// flattened and adjacent strings merged.
inline
= c:(urltext / (! inline_breaks (inline_element / . )))+ {
//console.warn('inline out:' + pp(c));
return flatten_stringlist( c );
}
// Like inline, but the single-character fallback excludes \r and \n, so a
// match never consumes a line break through that fallback.
inlineline
= c:(urltext / !inline_breaks (inline_element / [^\r\n]))+ {
//console.warn('inlineline out:' + pp(c) + input.substr(pos0, pos));
return flatten_stringlist( c );
}
// Dispatch on the first character: '<' starts a comment or xmlish tag, '{' a
// template argument or template, '[' a wikilink, extlink or autolink, and
// "'" a quote. Three opening brackets are returned as plain text.
inline_element
= //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
& '<' ( comment / xmlish_tag )
/// & '{' ( & '{{{{{' template / tplarg / template )
/ & '{' tplarg_or_template
/// & '{' ( tplarg / template )
// Eat three opening brackets as text.
/ '[[[' { return '[[[' }
/ & '[' ( wikilink / extlink / autolink )
/ & "'" quote
/* Headings */
// A heading line: '='+ content '='+ with optional trailing spaces/comments
// up to the end of the line. The heading level is the smaller of the two
// '=' runs; surplus '=' on either side become part of the heading text.
// The 'h' stop counter is raised while the content is parsed and lowered
// again on both the success and the failure path.
h = & "=" // guard, to make sure '='+ will match.
    // XXX: Also check to end to avoid inline parsing?
    r:(
        s:'='+ // moved in here to make s accessible to inner action
        & { return stops.inc('h'); }
        c:inlineline
        endTPos:({ return pos })
        e:'='+
        spc:(sp:space+ { return sp.join('') } / comment)*
        &eolf
        {
            stops.dec('h');
            var level = Math.min(s.length, e.length);
            var extras;
            // convert surplus equals into text
            if(s.length > level) {
                s = s.join('');
                extras = s.substr(0, s.length - level);
                if(c[0].constructor === String) {
                    c[0] = extras + c[0];
                } else {
                    c.unshift( extras );
                }
            }
            if(e.length > level) {
                e = e.join('');
                extras = e.substr(0, e.length - level);
                var lastIdx = c.length - 1;
                if(c[lastIdx].constructor === String) {
                    // Write back into the list: appending to a local copy
                    // of the last element silently dropped the extras.
                    c[lastIdx] = c[lastIdx] + extras;
                } else {
                    c.push( extras );
                }
            }

            return [new TagTk( 'h' + level, [], { tsr: [pos0, pos0+level] } )]
                .concat(c, [
                    new EndTagTk( 'h' + level, [],
                        { tsr: [endTPos, endTPos + level]} ),
                    spc
                ]);
        }
      / & { /* dp('nomatch exit h'); */ stops.dec('h'); return false } { return null }
    ) { return r }
2011-11-02 21:09:19 +00:00
// An HTML comment. The closing '-->' may be missing, in which case the
// comment extends to the end of input (eof alternative). Yields a single
// CommentTk holding the text between the delimiters.
comment
= '<!--' c:comment_chars* ('-->' / eof)
{
return [new CommentTk( c.join(''), { tsr: [pos0, pos] } )];
}
2011-11-02 21:09:19 +00:00
// One character of comment text: any character other than '-', or a '-'
// that is not the start of the closing '-->'.
comment_chars
= c:[^-] { return c; }
/ c:'-' !'->' { return c; }
2011-11-02 21:09:19 +00:00
// Behavior switches: one of the listed words wrapped in double underscores,
// e.g. __NOTOC__. Yields a single 'behavior-switch' TagTk whose 'word'
// attribute is the matched word and whose data attributes carry the source
// range and the original source text.
behavior_switch
= '__'
behavior:(
'NOTOC'
/ 'FORCETOC'
/ 'TOC'
/ 'NOEDITSECTION'
/ 'NEWSECTIONLINK'
/ 'NONEWSECTIONLINK'
/ 'NOGALLERY'
/ 'HIDDENCAT'
/ 'NOCONTENTCONVERT'
/ 'NOCC'
/ 'NOTITLECONVERT'
/ 'NOTC'
/ 'START'
/ 'END' // XXX: Removed in 19213. Should we keep this?
/ 'INDEX'
/ 'NOINDEX'
/ 'STATICREDIRECT'
)
'__'
{
return [
new TagTk(
'behavior-switch',
[new KV('word', behavior)],
{
tsr: [pos0, pos],
src: input.substr(pos0, pos - pos0)
}
)
];
}
/**************************************************************
* External (bracketed and autolinked) links
**************************************************************/
// A bare URL, an RFC/PMID reference or an ISBN. Not attempted while the top
// of the 'extlink' stop stack is truthy.
autolink
= ! { return stops.onStack('extlink') }
(urllink / autoref / isbn)
// A bare URL, wrapped into a 'urllink' TagTk with the URL as its 'href'
// attribute.
urllink
= target:url {
return [ new TagTk( 'urllink', [new KV('href', target)], { tsr: [pos0, pos] } ) ];
}
2011-11-02 21:09:19 +00:00
// A bracketed external link: [target optional-text]. The 'extlink' stop is
// pushed after the opening bracket and popped again on both the success and
// the failure path. Yields one self-closing 'extlink' token with 'href' and
// 'content' attributes; links without text get dataAttribs.gc = 1.
extlink
  = ! { return stops.onStack('extlink') } // extlink cannot be nested
    (
        "["
        & { return stops.push('extlink', true); }
        //target:urllink
        target:extlink_preprocessor_text
        text:(( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
              t:inlineline { return t } )?
        "]" {
            stops.pop('extlink');
            //if ( text === '' ) {
            //    // XXX: Link numbering should be implemented in post-processor.
            //    text = [ "[" + linkCount + "]" ];
            //    linkCount++;
            //}
            //console.warn( 'extlink text: ' + pp( text ) );
            var dataAttribs = {};
            if ( text === '' ) {
                dataAttribs.gc = 1;
            }
            dataAttribs.tsr = [pos0, pos];
            return [
                new SelfclosingTagTk( 'extlink', [
                    new KV('href', target),
                    new KV('content', text)
                ],
                dataAttribs)
            ];
        }
      // Failure path: only reachable after the first alternative matched
      // "[" and pushed the stop. pop() returns false, so this alternative
      // undoes the push and then fails.
      / "[" & { return stops.pop('extlink'); }
    )
// Magic links of the form "RFC 1234" or "PMID 1234". Yields a self-closing
// 'extlink' token whose href is the protocol-relative URL of the referenced
// document and whose content is "<type> <number>".
autoref
  = ref:('RFC' / 'PMID') (newline / space)+ identifier:[0-9]+
    {
        identifier = identifier.join('');

        var base_urls = {
                // Fixed host name: was misspelled as tools.ietfs.org.
                'RFC' : '//tools.ietf.org/html/rfc%s',
                'PMID' : '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract'
            },
            url = sprintf(base_urls[ref], identifier);

        return [
            new SelfclosingTagTk( 'extlink', [
                new KV('href', url),
                new KV('content', [ref, identifier].join(' '))
            ],
            {tsr: [pos0, pos]})
        ];
    }
// Magic link of the form "ISBN 0-306-40615-2": a leading digit, further
// digits each optionally preceded by '-' or ' ', and an optional trailing
// 'X'. Yields a self-closing 'wikilink' token pointing at
// Special:BookSources/<digits only>.
isbn
= 'ISBN' (newline / space)+
head:[0-9]
digits:([- ]? [0-9])+
tail:'X'?
{
// Joining the nested match arrays inserts commas; strip them again.
var isbn = [head, digits, tail].join('').replace(/,/g, '');
if (!isValidISBN(isbn)) {
// NOTE(review): presumably rejects the match, as the comment in
// urlencoded_char describes for a null return -- confirm.
return null;
}
return [
new SelfclosingTagTk( 'wikilink', [
new KV('href', 'Special:BookSources/' + isbn.replace(/[^\d]/g, '')),
new KV('tail', ''),
new KV('content', 'ISBN ' + isbn),
],
{tsr: [pos0, pos]})
];
}
/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
* these can be configured dynamically. */
// Each character in this class is the first character of a url_protocol
// alternative or of one of the literals 'ISBN', 'PMID' and 'RFC'.
// NOTE(review): presumably used as a cheap pre-check before trying
// url/autolink; the use site is not visible here -- confirm.
url_chars = [/fghimnstwIPR]
// The literal URL prefixes accepted at the start of a url, tried in the
// order listed.
url_protocol
= '//' // for protocol-relative URLs
/ 'ftp://'
/ 'git://'
/ 'gopher://'
/ 'http://'
/ 'https://'
/ 'irc://'
/ 'ircs://' // @bug 28503
/ 'mailto:'
/ 'mms://'
/ 'news:'
/ 'nntp://' // @bug 3808 RFC 1738
/ 'svn://'
/ 'telnet://' // Well if we're going to support the above.. -ævar
/ 'worldwind://'
// javascript does not support unicode features, so the space characters are
// listed explicitly: the ASCII space plus a fixed set of Unicode space code
// points.
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
2011-11-02 21:09:19 +00:00
// A percent-encoded byte (%XX). The sequence is passed through decodeURI;
// if decodeURI throws, null is returned to reject the match.
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
try {
return decodeURI("%" + c0 + c1)
} catch ( e ) {
// Reject the match, and allow other fall-back productions to have a
// go at it.
return null;
}
}
2011-12-08 10:33:23 +00:00
//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
// No punctuation, and no '{' or '<' either, so that those can trigger
// directives. Matches any single character not listed in the class below.
no_punctuation_char = [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]
2011-11-02 21:09:19 +00:00
// A URL: a protocol prefix, an optional literal IPv6/IPv4 address, and one
// or more path pieces. A path piece is a non-punctuation character (where
// inline_breaks does not match), a '.', ':' or ',' that is not followed by
// whitespace or end of line, a comment, a template/template argument, or an
// html entity or raw '&', '%', '{' -- except that "&lt;" and "&gt;" end the
// URL. The result is a single string when only text matched, otherwise a
// list of strings and tokens.
url
  = proto:url_protocol
    addr:( ipv6_address / ipv4_address )?
    path:( ( !inline_breaks
             c:no_punctuation_char
             { return c }
           )
         / s:[.:,] !(space / eolf) { return s }
         / comment
         / tplarg_or_template
         / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
           r:(
               htmlentity
             / [&%{]
           ) { return r }
         )+
    {
        //console.warn( "path: " + pp( flatten_stringlist( [proto + addr].concat( path ) ) ) );
        return flatten_string( [proto + addr].concat( path ) );
    }
// Four digit groups separated by dots, returned as one string. Each group is
// [0-9]*, so empty groups are accepted and no value range is checked.
ipv4_address
= a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*)
{
return flatten( a ).join('');
}
// A bracketed sequence of seven decimal-digit groups separated by colons,
// returned as one string. Only the digits 0-9 are accepted in a group (no
// a-f), groups may be empty, and '::' shorthand gets no special treatment.
ipv6_address
= a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']')
{
return flatten( a ).join('');
}
2011-11-02 21:09:19 +00:00
/**************************************************************
* Templates, -arguments and wikilinks
**************************************************************/
/*
* Precedence: template arguments win over templates. See
* http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
* 4: {{{{·}}}} → {·{{{·}}}·}
* 5: {{{{{·}}}}} → {{·{{{·}}}·}}
* 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
* 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
*/
// Chooses between template argument and template for a run of opening
// braces, using lookahead to apply the precedence described above:
//  - seven or more braces: consume one '{', recurse, then expect one '}';
//  - '{{{' followed by a tplarg starting with '{{{': outer tplarg;
//  - '{{' followed by a tplarg: template containing a tplarg;
//  - otherwise tplarg first, then template.
tplarg_or_template
= & '{{{{{{{' '{' tplarg_or_template '}'
/ & ( '{{{' &'{{{' tplarg ) tplarg
// tplarg in template
/ & ( '{{' &'{{{' tplarg ) template
/ tplarg
/ template
2011-11-02 21:09:19 +00:00
// A template transclusion: {{target|param|...}}. Newlines, comments and
// spaces are allowed around the target and before each '|'. Yields one
// self-closing 'template' token whose first attribute holds the target and
// whose remaining attributes are the parameters.
template
  = "{{" nl_comment_space*
    target:template_param_value
    params:(nl_comment_space* "|"
            r:( nl_comment_space* &"|" { return new KV( '', '') } // empty argument
              / p:template_param { return p }
              ) { return r }
           )*
    nl_comment_space*
    "}}" {
        // Insert target as first positional attribute, so that it can be
        // generically expanded. The TemplateHandler then needs to shift it out
        // again.
        params.unshift( { k: flatten( target ), v: [] } );
        var obj = new SelfclosingTagTk( 'template', params, {tsr: [pos0, pos]} );
        //console.warn('template @' + pos + '::' + input.substr(pos, 40) );
        return obj;
    }
2011-11-02 21:09:19 +00:00
// XXX: support template and args in target!
//template_target
// = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }
2011-11-02 21:09:19 +00:00
// A template argument: {{{name|default|...}}}. The name is optional. Yields
// one self-closing 'templatearg' token whose first attribute carries the
// (flattened) name as key and whose remaining attributes are the parameters.
tplarg
  = "{{{"
    name:template_param_value?
    params:( nl_comment_space*
             '|' nl_comment_space*
             r:(
                 &'}}}' { return new KV( '', '') }
               / p:template_param { return p }
               ) { return r }
           )*
    nl_comment_space*
    "}}}" {
        name = flatten( name );
        params.unshift( new KV( name, '' ) );
        var obj = new SelfclosingTagTk( 'templatearg', params, {tsr: [pos0, pos]} );
        //console.warn( 'tokenizer tplarg ' + JSON.stringify( obj, null, 2 ));
        //console.warn('template arg @' + pos + '::' + input.substr(pos, 40) );
        return obj;
    }
2011-11-02 21:09:19 +00:00
// One template parameter. Three shapes are produced:
//  - "name = value": KV( name, value )
//  - "name =" (no value): KV( name, [] )
//  - no '=' at all: positional parameter, KV( [], text )
// The second alternative yields an empty KV when the next character is '|'
// or '}'.
template_param
= name:template_param_name
// MW accepts |foo | = bar | as a single param..
('|' (space / newline)* &'=')?
val:(
s0:optionalSpaceToken
"="
s1:optionalSpaceToken
value:template_param_value? {
return { s0: s0, s1: s1, value: value };
}
)?
{
//console.warn( 'named template_param matched' + pp([name, value ]) );
// val is '' when the optional "= value" part did not match.
if ( val !== '' ) {
if ( val.value !== '' ) {
return new KV( name, flatten( val.value ) );
} else {
return new KV(flatten( name ), []);
}
} else {
return new KV([], flatten(name));
}
}
// empty parameter
/ & [|}] { return new KV([], []); }
// FIXME: handle template args and templates in key! (or even parser functions?)
2011-11-02 21:09:19 +00:00
// Text of a parameter name. The 'equal' stop is pushed as true while the
// text is parsed and popped afterwards. If the next character is '=' the
// name is the empty string. The second alternative runs when the first one
// fails: it pops the stop again and, since pop() returns false, fails too.
template_param_name
= & { return stops.push( 'equal', true ) }
tpt:(template_param_text / &'=' { return '' })
{
stops.pop( 'equal' );
//console.warn( 'template param name matched: ' + pp( tpt ) );
return tpt;
}
/ & { return stops.pop( 'equal' ) }
//= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
2011-11-02 21:09:19 +00:00
// Text of a parameter value. While it is parsed the 'nopre' counter is
// raised and the 'equal' stop is pushed as false; both are undone on
// success. The second alternative undoes them when the first one fails and
// then fails itself, because pop() returns false.
template_param_value
= & { stops.inc( 'nopre' ); return stops.push( 'equal', false ) }
tpt:template_param_text
{
stops.dec( 'nopre' );
stops.pop( 'equal' );
//console.warn( 'template param value matched: ' + pp( tpt ) );
return tpt;
}
/ & { stops.dec( 'nopre' ); return stops.pop( 'equal' ) }
// The content of a template parameter name or value: one or more nested
// blocks. While parsing, the 'table' and 'extlink' stops are pushed as false
// and the 'template' counter is raised; all three are undone on both the
// success and the failure path. Leading newline tokens and whitespace-only
// strings are stripped; a result consisting of one string is returned as
// that string.
template_param_text
  = & { /*console.warn( 'tpt: ' +
            input.substr( pos - 10, 9) +
            input[pos].green +
            input.substr( pos +1, 9) ); */
        // re-enable tables within template parameters
        stops.push('table', false );
        stops.push('extlink', false);
        return stops.inc('template')
    }
    il:nested_block+ {
        stops.pop('table');
        stops.pop('extlink');
        stops.dec('template');
        //console.warn( 'tpt match: ' + pp (il) + " stops: " + pp(stops));
        var r = flatten( il ),
            firstToken = r[0];

        // Strip leading newlines and space-only text tokens.
        // XXX: Will need to round-trip these eventually.
        while (
            firstToken !== undefined &&
            ( firstToken.constructor === NlTk ||
              firstToken.constructor === String && firstToken.match( /^[\t ]+$/ ) )
        )
        {
            r.shift();
            firstToken = r[0];
        }
        //console.warn('tpl param name' + pos + r[0] + '::' + input.substr(pos, 40) );
        if ( r.length === 1 && r[0].constructor === String ) {
            r = r[0];
        }

        return r;
    }
  / & { stops.pop('table'); stops.pop('extlink'); return stops.dec('template'); }
2011-11-02 21:09:19 +00:00
// The pipe-separated parts after a wikilink target. Each part becomes a KV
// with an empty key. Returns the source range together with the list of
// parts; when there is no pipe at all the list is empty.
wikilink_content
= lcs:( pipe lt:link_text { return new KV( '', lt ); } )+ {
return { pos: [pos0, pos], content: lcs };
}
/ {
return { pos: [pos0, pos], content: [] };
}
// TODO: handle link prefixes as in al[[Razi]]
// An internal link: [[target|content...]]tail. The target must not start
// with a URL. Characters directly following the closing brackets are
// collected as the link tail until one of the excluded characters is met.
// Yields one self-closing 'wikilink' token with 'href' and 'tail'
// attributes followed by the content parts.
wikilink
  = "[["
    ! url
    //target:link_target
    // XXX: disallow pipe!
    target:wikilink_preprocessor_text
    lcontent:wikilink_content
    "]]"
    // XXX In real MediaWiki, this is a language-dependent positive character
    // class. Can we work out a static negative class instead?
    // XXX: Exclude uppercase chars from non-latin languages too!
    tail:( ![A-Z \t(),.:\n\r-] tc:text_char { return tc } )*
    {
        var obj = new SelfclosingTagTk( 'wikilink' );
        obj.attribs.push( new KV('href', target) );
        obj.dataAttribs = {
            tsr: [pos0, pos],
            contentPos: lcontent.pos,
            src: input.substr(pos0, pos - pos0)
        };
        // XXX: Point to object with path, revision and input information
        //obj.source = input;

        //console.warn('lcontent: ' + JSON.stringify( lcontent, null, 2 ) );
        // Deal with content. XXX: Properly support pipe-trick etc
        //lcontent.tail = tail && tail.join('') || '';

        // We store the tail and content(s) in attributes, so that it is
        // expanded by the AttributeExpander pass.
        obj.attribs.push( new KV( 'tail', tail && tail.join('') || '') );
        obj.attribs = obj.attribs.concat( lcontent.content );
        return [obj];
    }
2011-11-02 21:09:19 +00:00
// Link description text: an inline run, optionally followed by '=' and more
// inline content. Parsing is bracketed by stops.inc / stops.dec on
// 'linkdesc'.
link_text
  = & { return stops.inc('linkdesc'); }
    h:inline
    // The 'equal' syntax flag is set for links in template parameters, so a
    // '=' that terminated the inline run is consumed here.
    hs:( '=' inline?)?
    {
        stops.dec('linkdesc');
        // hs is the empty string when the optional '=' part did not match.
        return ( hs === '' ) ? h : h.concat(hs);
    }
    // Tried only when the alternative above failed; undoes the inc.
  / & { return stops.dec('linkdesc'); }
// Like link_text, but parsed with both the 'pipe' and the 'linkdesc' stops
// incremented, so that a single option can be picked out of a pipe-separated
// list.
link_option
  = & { stops.inc('pipe'); return stops.inc('linkdesc'); }
    h:inline
    // The 'equal' syntax flag is set for links in template parameters, so a
    // '=' that terminated the inline run is consumed here.
    hs:( '=' inline)?
    {
        stops.dec('pipe');
        stops.dec('linkdesc');
        // hs is the empty string when the optional '=' part did not match.
        return ( hs === '' ) ? h : h.concat(hs);
    }
    // Tried only when the alternative above failed; undoes both incs.
  / & { stops.dec('pipe'); return stops.dec('linkdesc'); }
// Closing delimiter of a link.
link_end = "]]"
2011-11-02 21:09:19 +00:00
/* Generic quote production for italic and bold. The emitted token is
 * processed further by the doQuotes token stream transformation, which relies
 * on NlTk tokens being emitted for each line of text to balance quotes per
 * line.
 *
 * A simple pair rule cannot be used here: mis-nested bolds/italics and
 * MediaWiki's special heuristics for apostrophes have to be supported, and
 * none of these is context free. */
quote = "''" x:"'"* {
    // Placeholder tag; it will be consumed in token transforms.
    var quoteTk = new TagTk( 'mw-quote', [], { tsr: [pos0, pos] } );
    // The full run of apostrophes as found in the source.
    quoteTk.value = "''" + x.join('');
    return quoteTk;
}
2011-11-02 21:09:19 +00:00
/**
* Image option productions, only called from the LinkHandler token stream
* transformer, and only for images.
*/
2012-03-02 13:36:37 +00:00
// Zero or more image options, parsed between stops.inc / stops.dec on 'pipe'.
// Returns an object mapping each option key to its value, with the flattened
// option list itself stored under '_options'.
img_options =
    & { return stops.inc( 'pipe' ); }
    os:img_option* {
        stops.dec( 'pipe' );
        // An img_option yields either a single KV or a list of KVs.
        var optionList = flatten( os ),
            byKey = {},
            idx = 0;
        // A later option overwrites an earlier one that uses the same key.
        while ( idx < optionList.length ) {
            byKey[optionList[idx].k] = optionList[idx].v;
            idx++;
        }
        byKey._options = optionList;
        return byKey;
    }
    // Tried only when the alternative above failed; undoes the inc.
  / & { return stops.dec( 'pipe' ); }
// A single image option: a pipe and optional spaces, then the first of the
// alternatives below that matches, then optional spaces. PEG choice is
// ordered, so the link_text caption is only tried after all specific option
// productions have failed. Returns whatever the matched alternative produced.
img_option
= pipe space*
2012-03-02 13:36:37 +00:00
o:(
img_attribute
/ img_format
/ img_dimensions
/ img_halign
/ img_valign
/ img_link
2012-03-02 13:36:37 +00:00
/ lt:link_text { return new KV('caption', lt) }
)
2012-03-02 13:36:37 +00:00
space* {
return o
};
// Image format keyword, returned as a 'format' KV.
// 'frameless' and 'thumbnail' are listed before their prefixes 'frame' and
// 'thumb' because PEG choice commits to the first alternative that matches.
img_format
2012-03-02 13:36:37 +00:00
= f:( 'border' / 'frameless' / 'frame' / 'thumbnail' / 'thumb' ) {
return new KV( 'format', f );
}
// Image size option: '<width>px', 'x<height>px', '<width>x<height>px', or the
// keyword 'upright'.
// Returns a single KV when only one dimension is given, and a list of KVs when
// both are given (or for 'upright'). img_options flattens the result.
img_dimensions
  = x:(n:[0-9]+ { return n.join('') })? y:('x' n:[0-9]+ { return n.join('') })? 'px' {
        // Unmatched optional parts are reported as the empty string.
        if ( x === '' && y === '' ) {
            // A bare 'px' carries no size. Returning null aborts the match
            // (the same convention block_tag uses), so that the text can be
            // picked up by a later img_option alternative instead of being
            // turned into empty width / height options.
            return null;
        }
        if ( x === '' ) {
            return new KV( 'height', y );
        }
        if ( y === '' ) {
            return new KV( 'width', x );
        }
        return [ new KV( 'width', x ), new KV( 'height', y ) ];
    }
  / 'upright' { return [ new KV( 'width', 'upright' ) ] }
// Horizontal alignment keyword, returned as a 'halign' KV.
img_halign
= a:( 'left' / 'right' / 'center' / 'none' ) {
return new KV( 'halign', a );
}
// Vertical alignment keyword, returned as a 'valign' KV.
img_valign
= a:( 'baseline' / 'sub' / 'super' / 'top' / 'text-top'
/ 'middle' / 'bottom' / 'text-bottom' ) {
return new KV( 'valign', a );
}
// External link targets are already parsed as link tokens. If we can make
// sure that those cleanly convert back to the original text, then we could
// re-parse them here.
// Image 'link=' option: 'link=', optional spaces and a url. Returns a 'link'
// KV holding the result of the group below.
// NOTE(review): both branches of the group call stops.dec( 'pipe' ), but no
// matching stops.inc( 'pipe' ) is visible in this production — confirm that
// the counter stays balanced with the inc / dec pair in img_options.
img_link
2012-03-02 13:36:37 +00:00
= 'link=' space*
u:(
t:url {
stops.dec( 'pipe' );
2012-03-02 13:36:37 +00:00
return t;
}
/ & { return stops.dec( 'pipe' ); }
2012-03-02 13:36:37 +00:00
)
{
return new KV( 'link', u );
}
// Keyed image option of the form 'key=value' for the keys listed below.
// Returns a KV of the key and the optional preprocessor_text value.
// 'thumbnail' precedes 'thumb' because PEG choice takes the first match.
img_attribute
2012-03-02 13:36:37 +00:00
= k:( 'page' / 'alt' / 'thumbnail' / 'thumb' ) '=' t:preprocessor_text? {
return new KV( k, t );
}
/***********************************************************
* Pre and xmlish tags
***********************************************************/
// Indented pre blocks differ from their non-indented (purely tag-based)
// cousins by having their contents parsed.
// An indented pre block: either the variant wrapped in explicit pre tags, or
// one or more indented lines that get wrapped in a synthetic pre tag pair.
pre_indent
  = pre_indent_in_tags
  / l:pre_indent_line ls:(sol pre_indent_line)* {
        // The start tag is given a one-character source range at the start
        // of the block; the end tag a zero-width range at the current
        // position.
        var openTk = new TagTk( 'pre', [], { tsr: [pos0, pos0 + 1] } ),
            closeTk = new EndTagTk( 'pre', [], { tsr: [pos, pos] } );
        return [ openTk ].concat( [l], ls, [ closeTk ] );
    }
// An indented pre block that is surrounded with pre tags. The pre tags are
// used directly.
// XXX gwicke: check if the first line is not indented, and round-trip spaces;
// possibly merge with the regular 'pre' production.
// FIXME: fix tag end position
// Matches leading space, a "<pre ...>" start tag, a first line of content,
// any number of further indented lines and a literal "</pre>". Returns the
// pre start tag (with the parsed attributes), the content and a pre end tag.
pre_indent_in_tags
= space+ // XXX: capture space for round-tripping
"<pre"
attribs:generic_attribute*
">"
// The 'pre' stop is incremented only for the content productions below.
& { return stops.inc('pre'); }
l:inlineline
ls:(sol pre_indent_line)*
"</pre>"
{
// Undo the inc above before emitting the tokens.
stops.dec('pre');
return [ new TagTk( 'pre', attribs, { tsr: [pos0, pos0] } ) ]
.concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );
}
// NOTE(review): this fallback also runs when the alternative above failed
// before reaching stops.inc('pre') (for example when there is no leading
// space) — confirm that stops.dec tolerates a dec without a preceding inc.
/ & { return stops.dec('pre'); }
// One line of an indented pre block: a single leading space followed by the
// line content. The space is not part of the result; only the content is
// returned.
pre_indent_line = space l:inlineline {
//console.warn( JSON.stringify( [s, l] ) );
return l;
}
/*
* Pre blocks defined using non-indented HTML tags only parse nowiki tags and
* html entities inside them, and convert other content to verbatim text.
* Nowiki inside pre is not functionally needed, but supported for backwards
* compatibility.
*
* TODO: add entity support!
*/
// Tag-based pre block. Returns the pre start tag, the verbatim content
// (strings and nowiki results) and a pre end tag. A stray "</pre>" without a
// matching start tag is returned as plain text by the last alternative.
pre
  = "<pre"
    attribs:generic_attribute*
    ">"
    // MediaWiki <pre> is special in that it converts all pre content to plain
    // text.
    ts:(t1:[^<]+ { return t1.join('') }
        / nowiki
        / !"</pre>" t2:. { return t2 })+
    closer:("</pre>" / eof) {
        // return nowiki tags as well?
        // FIXME: end pos of start tag
        // The block may be terminated by the end of the input instead of a
        // closing tag. In that case closer is not the tag text and there is
        // nothing in the source for the end tag to cover, so it gets a
        // zero-width range instead of pointing six characters into the
        // content.
        var closerLength = ( typeof closer === 'string' ) ? closer.length : 0;
        return [ new TagTk( 'pre', attribs, { stx: 'html', tsr: [pos0, pos0 + 5] } ) ]
            .concat(ts, [ new EndTagTk( 'pre', [], { tsr: [pos - closerLength, pos] } ) ]);
    }
  / "</pre>" { return "</pre>"; }
/* XXX: Extension tags can require a change in the tokenizer mode, which
* returns any text between extension tags verbatim. For now, we simply
* continue to parse the contained text and return the tokens. The original
* input source can be recovered from the source positions added on tag
* tokens. This won't however work in all cases. For example, a comment start
* (<!--) between extension tags would cause the remaining text to be consumed
* as a comment. To avoid this, we might need to look ahead for the end tag
* and limit the content parsing to this section. */
// Any xml-like tag: nowiki is tried first, then the generic tag production.
xmlish_tag = nowiki / generic_tag
/*
* Nowiki treats anything inside it as plain text. It could thus also be
* defined as an extension that returns its raw input text, possibly wrapped
* in a span for round-trip information. The special treatment for nowiki in
* pre blocks would still remain in the grammar though, so overall handling it
* all here is cleaner.
*/
// A complete nowiki section is returned as its content wrapped in a span tag
// pair typed 'mw:Nowiki'. A lone start or end tag that is not part of a
// complete section is returned as plain text.
nowiki
  = "<nowiki>" nc:nowiki_content "</nowiki>" {
        // Attributes are built with KV, like everywhere else in this
        // grammar, rather than with ad-hoc {k, v} object literals.
        // "<nowiki>" is 8 characters long, "</nowiki>" 9.
        var openTk = new TagTk( 'span',
                [ new KV( 'typeof', 'mw:Nowiki' ) ],
                { tsr: [pos0, pos0 + 8] } ),
            closeTk = new EndTagTk( 'span',
                [ new KV( 'typeof', 'mw:Nowiki' ) ],
                { tsr: [pos - 9, pos] } );
        return [ openTk ].concat( nc, [ closeTk ] );
    }
  / "<nowiki>" {
        // Unclosed start tag: keep it as text.
        return ['<nowiki>'];
    }
  / "</nowiki>" {
        // Unopened end tag: keep it as text.
        return ['</nowiki>'];
    }
// Content of a nowiki section. All matched pieces are joined, and the result
// is returned as a one-element array holding that single string.
nowiki_content
// Fast path: a run of characters that cannot start a tag.
= ts:( t:[^<]+ { return t.join('') }
// A complete <pre ...>...</pre> pair is kept verbatim, tags included.
/ "<pre" p0:optionalSpaceToken p1:[^>]* ">" p2:nowiki_content "</pre>" {
//console.warn('nested pre in nowiki');
return ["<pre"].concat(p0, p1, [">"], p2, ["</pre>"]).join('');
}
// FIXME: convert the </pre> stop to a syntactic stop, so that it
// only triggers if we are in a pre context.
// Should abort the nowiki match:
// <pre><nowiki></pre></nowiki>
// Should allow the </pre> in nowiki:
// <nowiki></pre></nowiki>
// Any other single character, unless a nowiki or pre end tag starts here.
/ (!("</"( "nowiki>" / "pre>")) c:. {
//console.warn('nowiki: single char' + c);
return c;
})
)* {
// return nowiki tags as well?
//console.warn('nowiki_content: return' + pp(ts));
return [ts.join('')];
}
// The list of HTML5 tags, mainly used for the identification of *non*-html
// tags. Non-html tags terminate otherwise tag-eating productions (see list
// below) in order to support potential extension tags.
//
// PEG choice commits to the first alternative that matches, so every name
// that is a prefix of another one ("a" / "abbr", "col" / "colgroup", "s" /
// "span", ...) is listed *after* the longer names. Keep it that way when
// adding names; otherwise the longer name can never be matched.
html5_tagnames
  = "abbr" / "address" / "area" / "article" / "aside" / "audio" / "a"
  / "base" / "bdi" / "bdo" / "blockquote" / "body" / "br" / "button" / "b"
  / "canvas" / "caption" / "cite" / "code" / "colgroup" / "col" / "command"
  / "datalist" / "data" / "dd" / "del" / "details" / "dfn" / "div" / "dl" / "dt"
  / "embed" / "em" / "fieldset" / "figcaption" / "figure" / "footer" / "form"
  / "h1" / "h2" / "h3" / "h4" / "h5" / "h6" / "header" / "head" / "hgroup"
  / "hr" / "html" / "iframe" / "img" / "input" / "ins" / "i" / "kbd" / "keygen"
  / "label" / "legend" / "link" / "li" / "map" / "mark" / "menu" / "meta"
  / "meter" / "nav" / "noscript" / "object" / "ol" / "optgroup" / "option"
  / "output" / "param" / "pre" / "progress" / "p" / "q" / "rp" / "rt" / "ruby"
  / "samp" / "script" / "section" / "select" / "small" / "source" / "span"
  / "strong" / "style" / "sub" / "summary" / "sup" / "s"
  / "table" / "tbody" / "td" / "textarea" / "tfoot" / "thead" / "th" / "time"
  / "title" / "track" / "tr" / "ul" / "u" / "var" / "video" / "wbr"

// Obsolete HTML tag names that are still treated as HTML.
html_oldnames = "center" / "font" / "tt"

// Simple match on non-html endtags, for use in inline_breaks
nonhtml_endtag
  = '</'
    // Reject only if a *complete* html tag name follows. Without the name
    // boundary check, any tag whose name merely starts with an html tag name
    // (e.g. "</poem>" via "p", "</syntaxhighlight>" via "s") was mistaken
    // for an html tag.
    ! ( ( html5_tagnames / html_oldnames ) ![0-9a-zA-Z] )
    [^ >]+
    ( space / newline ) *
    [^>]*
    '>'
/* Generic XML-like tags
*
* These also cover extensions (including Cite), which will hook into the
* token stream for further processing. The content of extension tags is
* parsed as regular inline, but the source positions of the tag are added
* to allow reconstructing the unparsed text from the input. */
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
// Matches a start, end or self-closing tag with any alphanumeric name and
// returns the corresponding token, annotated with its source range and the
// 'html' syntax marker.
generic_tag
  = "<"
    end:"/"? name:[0-9a-zA-Z]+
    attribs:generic_newline_attribute*
    ( space / newline ) *
    selfclose:"/"?
    ">" {
        var tagName = name.join(''),
            TokenType = TagTk,
            tok;
        // A leading slash takes precedence over a trailing one.
        if ( end != '' ) {
            TokenType = EndTagTk;
        } else if ( selfclose != '' ) {
            TokenType = SelfclosingTagTk;
        }
        tok = new TokenType( tagName, attribs );
        // Make sure there is an object to hang the annotations on.
        if ( tok.dataAttribs === undefined ) {
            tok.dataAttribs = {};
        }
        tok.dataAttribs.tsr = [pos0, pos];
        tok.dataAttribs.stx = 'html';
        return tok;
    }
// An attribute whose surrounding whitespace and value may span several lines.
// Returns a KV of the name and the value; the value is the empty string when
// the attribute has none.
generic_newline_attribute
  = s:( space / newline )*
    name:generic_attribute_name
    value:(( space / newline )*
        v:generic_attribute_newline_value { return v })?
    {
        // An unmatched optional value is already reported as '', so it can
        // be passed on unchanged.
        return new KV( name, value );
    }
// An attribute that has to fit on a single line. Returns a KV of the name and
// the value; the value is the empty string when the attribute has none.
generic_attribute
  = s:optionalSpaceToken
    name:generic_attribute_name
    value:(optionalSpaceToken
        v:generic_attribute_value { return v })?
    {
        // FIXME: name might just be a template, which can expand to a key-value
        // pair later. We'll need to handle that in the AttributeTransformManager.
        // An unmatched optional value is already reported as '', so it can
        // be passed on unchanged.
        return new KV( name, value );
    }
// ( Replaced by generic_attribute_name for template / parameter support. )
//// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
//// disallow newlines, | and {.
//generic_attribute_plain_name
// = n:[^ \t\0/"'>=\n|{]+ {
// return n.join('');
// }
// Attribute name with template / parameter support. The name is parsed by
// attribute_preprocessor_text_line with 'equal' pushed (as true) on the stops
// and popped again afterwards. A position at '/>' is rejected, presumably so
// that the end of a self-closing tag is not taken for an attribute name.
generic_attribute_name
= & { return stops.push( 'equal', true ) }
! '/>'
name:attribute_preprocessor_text_line
{
stops.pop( 'equal' );
//console.warn( 'generic attribute name: ' + pp( name ) );
return name;
}
// Tried only when the alternative above failed; undoes the push.
/ & { return stops.pop( 'equal' ) }
// The value part of a generic attribute, possibly spanning multiple lines:
// '=', optional spaces / newlines and an xml_att_value. Returns the value.
generic_attribute_newline_value
= "=" (space / newline )* v:xml_att_value {
return v;
}
// The value part of a generic but single-line attribute: '=', optional
// spaces (no newlines) and an att_value. Returns the value.
generic_attribute_value
= "=" space* v:att_value {
return v;
}
// Attribute value, quoted variants can span multiple lines.
// For the quoted variants only the text between the quotes is returned.
xml_att_value
= "'" t:attribute_preprocessor_text_single "'" { return t; }
/ '"' t:attribute_preprocessor_text_double '"' { return t; }
/ attribute_preprocessor_text
// Attribute value, restricted to a single line.
// For the quoted variants only the text between the quotes is returned.
att_value
= "'" t:attribute_preprocessor_text_single_line "'" { return t; }
/ '"' t:attribute_preprocessor_text_double_line '"' { return t; }
/ attribute_preprocessor_text_line
/*
* A variant of generic_tag, but also checks if the tag name is a block-level
* tag as defined in
* http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and following
* paragraphs.
*/
// Matches a start, end or self-closing tag whose name is listed in
// block_names (compared in lower case). Returns a one-element array holding
// the token, annotated with its source range and the 'html' syntax marker.
block_tag
  = "<" end:"/"?
    name:(cs:[a-zA-Z]+ { return cs.join('') })
    attribs:generic_newline_attribute*
    ( space / newline ) *
    selfclose:"/"?
    ">" {
        if (block_names[name.toLowerCase()] !== true) {
            // abort match if tag is not block-level
            return null;
        }
        var res;
        if ( end != '' ) {
            res = new EndTagTk( name, attribs );
        } else if ( selfclose != '' ) {
            res = new SelfclosingTagTk( name, attribs );
        } else {
            res = new TagTk( name, attribs );
        }
        // Same guard as in generic_tag: the token is created without data
        // attributes here, so make sure there is an object to write to.
        if ( res.dataAttribs === undefined ) {
            res.dataAttribs = {};
        }
        res.dataAttribs.tsr = [pos0, pos];
        res.dataAttribs.stx = 'html';
        return [res];
    }
/*********************************************************
* Lists
*********************************************************/
// One or more list lines, each separated from the previous one by sol. For
// every line dtdd is tried first, then hacky_dl_uses, then li.
lists = (dtdd / hacky_dl_uses / li) (sol (dtdd / hacky_dl_uses / li))*
2011-11-02 21:09:19 +00:00
// A plain list item: one or more list characters, optional inline content and
// then the end of the line or input (not consumed). Returns the listItem
// token followed by the content.
li = bullets:list_char+
    c:inlineline?
    &eolf
    {
        // The token covers exactly the bullet characters.
        var itemTk = new TagTk( 'listItem', [], { tsr: [pos0, pos0 + bullets.length] } );
        itemTk.bullets = bullets;
        // Missing content is reported as ''; hand out an empty list instead.
        return [ itemTk, ( c === '' ) ? [] : c ];
    }
/*
* This production is required to support wikitext of this form
* ::{|border="1"|foo|bar|baz|}
* where the leading colons are used to indent the entire table.
* This hack was added back in 2006 in commit
* a0746946312b0f1eda30a2c793f5f7052e8e5f3a based on a patch by Carl
* Fürstenberg.
*/
hacky_dl_uses = bullets:":"+
    c:table_lines
    &eolf
    {
        // The token covers exactly the leading colons.
        var itemTk = new TagTk( 'listItem', [], { tsr: [pos0, pos0 + bullets.length] } );
        itemTk.bullets = bullets;
        // Hand out an empty list if the content is the empty string.
        return [ itemTk, ( c === '' ) ? [] : c ];
    }
// Definition list line of the form ';term:definition', optionally preceded by
// further list characters. Returns a listItem token for the term, the term
// content, a second listItem token (stx 'row') for the definition, and the
// definition content.
dtdd
  = bullets:(!(";" !list_char) lc:list_char { return lc })*
    ";"
    & {return stops.inc('colon');}
    c:inlineline
    // pos is sampled after the ':' has been consumed.
    cpos:(":" { return pos })
    // Fortunately dtdds cannot be nested, so we can simply set the flag
    // back to 0 to disable it.
    & { stops.counters['colon'] = 0; return true;}
    d:inlineline?
    &eolf {
        bullets = bullets.join('');
        var li = new TagTk( 'listItem', [],
                {
                    // The term marker consists of the prefix bullets plus
                    // the ';' itself, which is why the range is one longer
                    // than the prefix.
                    tsr: [pos0, pos0 + bullets.length + 1]
                } );
        li.bullets = bullets + ";";
        var li2 = new TagTk( 'listItem', [],
                {
                    // cpos points just behind the ':', so the colon itself
                    // is the character before it.
                    tsr: [cpos - 1, cpos],
                    stx: 'row'
                } );
        li2.bullets = bullets + ":";
        return [ li ].concat( c, [ li2 ], d || '' );
    }
    // Fall-back case to clear the colon flag
  / & { return true; } { stops.counters['colon'] = 0; return null; }
2011-11-02 21:09:19 +00:00
// The characters that can make up a list prefix.
list_char = [*#:;]
/*********************************************************************
2011-12-22 12:37:24 +00:00
* Tables
*
* Table productions are geared to support independent parsing of fragments in
* templates (the common table start / row / table end use case). The tokens
* produced by these fragments then match up to a table while building the
* DOM tree. For similar reasons, table rows do not emit explicit end tag
* tokens.
*
* The separate table_lines production is faster than moving those productions
* directly to block_lines.
*********************************************************************/
// One or more table lines, separated by sol. Only entered when not at an
// inline break, or when a '{{!}}' follows. The lines are parsed with 'table'
// pushed (as true) on the stops, and popped again afterwards.
table_lines
= //& { console.warn('enter table_lines: ' + input.substr(pos, 20)); return true; }
(! inline_breaks / & '{{!}}' )
r:(
& { return stops.push('table', true); }
tl:table_line
tls:( s:sol tl2:table_line { return s.concat(tl2); } )* {
stops.pop('table');
//console.warn('table_lines: ' + pp(tl.concat(tls)));
return tl.concat( tls );
}
// Tried only when the alternative above failed; undoes the push.
/ & { return stops.pop('table'); }
) { return r }
2011-12-22 12:37:24 +00:00
// This production assumes start-of-line position!
// A single line of table markup; the alternatives are tried in the order
// listed.
table_line
= table_start_tag
/ table_row_tag
/ table_data_tags
/ table_heading_tags
/ table_caption_tag
/ table_end_tag
// Table start ('{' followed by a pipe) with optional attributes. Returns the
// table token, followed by the table end token if the table is closed on the
// same line.
table_start_tag
  = "{" pipe
    ta:generic_attribute*
    space*
    tsEndPos:({return pos})
    te:table_end_tag? // can occur somewhere in the middle of the line too
    {
        // The source range ends before an optional same-line table end.
        var tableTk = new TagTk( 'table', [], { tsr: [pos0, tsEndPos] } );
        if ( ta ) {
            tableTk.attribs = ta;
        }
        return te ? [ tableTk ].concat( te ) : [ tableTk ];
    }
// Table caption: a pipe, '+' and inline content. Returns the content wrapped
// in a caption tag pair.
table_caption_tag
  = pipe "+"
    tcStartPos:({return pos;})
    c:inline* {
        // The start tag's range covers the pipe and '+' only.
        var openTk = new TagTk( 'caption', [], {tsr: [pos0, tcStartPos]} ),
            closeTk = new EndTagTk( 'caption' );
        return [ openTk ].concat( c, [ closeTk ] );
    }
// Table row start: a pipe, '-' and optional attributes. Returns the tr token,
// followed by the tokens of an implicit cell if the next line is neither a
// pipe nor a '!' line.
table_row_tag
  = pipe "-"
    a:generic_attribute*
    space*
    tagEndPos:({return pos})
    // handle tables with missing table cells after a row
    td:( s:sol !( pipe / "!" ) tdt:table_data_tag { return s.concat(tdt); } )?
    {
        // We rely on our tree builder to close the row as needed. This is
        // needed to support building tables from fragment templates with
        // individual cells or rows.
        var rowTokens = [ new TagTk( 'tr', a, {tsr: [pos0, tagEndPos]} ) ];
        return td ? rowTokens.concat(td) : rowTokens;
    }
// A line of table cells: a pipe and a cell, then any number of further cells
// each introduced by pipe_pipe. Cells after the first get stx_v = 'row' set
// on their first token. The source range of the first cell is moved back to
// pos0, the start of this production (before the leading pipe).
// tagEndPos is captured but not used in the action.
table_data_tags
= pipe
td:table_data_tag
tagEndPos:({return pos})
tds:( pipe_pipe tdt:table_data_tag { tdt[0].dataAttribs.stx_v = 'row'; return tdt } )* {
td[0].dataAttribs.tsr[0] = pos0;
return td.concat(tds);
}
2011-11-02 21:09:19 +00:00
// A single table cell: optional cell attributes followed by nested block
// content. Not entered at '}', '+' or '-'. Returns the td token followed by
// the content.
table_data_tag
  = ! [}+-]
    a:table_cell_args?
    // use inline_breaks to break on tr etc
    tagEndPos:({return pos})
    td:nested_block*
    {
        // Missing attributes are reported as ''; use an empty list instead.
        var cellAttribs = ( a === '' ) ? [] : a,
            cellTk = new TagTk( 'td', cellAttribs, {tsr: [pos0, tagEndPos]} );
        return [ cellTk ].concat( td );
    }
// A line of table heading cells: '!' and a heading cell, then any number of
// further heading cells each introduced by '!!'. Cells after the first get
// stx_v = 'row' set on their first token.
table_heading_tags
= //& { console.warn( 'th enter @' + input.substr(pos, 10)); return true; }
"!"
th:table_heading_tag
ths:( "!!" tht:table_heading_tag { tht[0].dataAttribs.stx_v = 'row'; return tht } )* {
//console.warn( 'thts: ' + pp([th, ths]));
return th.concat(ths);
}
// A single table heading cell: optional cell attributes followed by nested
// block content. Returns the th token followed by the content.
table_heading_tag
  = a:table_cell_args?
    tagEndPos:({return pos})
    c:nested_block* {
        // Missing attributes are reported as ''; use an empty list instead.
        var headingAttribs = ( a === '' ) ? [] : a,
            headingTk = new TagTk( 'th', headingAttribs, {tsr: [pos0, tagEndPos]} );
        return [ headingTk ].concat( c );
    }
// Table end: optional spaces, a pipe and '}'. Returns a one-element list
// holding the table end token, whose range covers everything matched.
table_end_tag
  = space* pipe "}" {
        return [ new EndTagTk( 'table', [], { tsr: [pos0, pos] } ) ];
    }
// Attributes of a table cell: generic attributes and optional spaces,
// terminated by a pipe that is not followed by another pipe. Parsing is
// bracketed by stops.inc / stops.dec on 'tableCellArg'. Returns the list of
// attributes.
table_cell_args
= & { return stops.inc('tableCellArg'); }
as:generic_attribute* space* pipe !pipe {
stops.dec('tableCellArg');
//console.warn( 'tcargs' + JSON.stringify( as ) + " stops: " + pp(stops) );
return as;
}
// Tried only when the alternative above failed; undoes the inc.
/ & { return stops.dec('tableCellArg'); }
/*******************************************************************
* Text variants and other general productions
*******************************************************************/
/* All chars that cannot start syntactic structures in the middle of a line.
 * Matches exactly one character that is not in the excluded set.
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text production on plain
 * content. */
text_char = [^'<~[{\n\r:\]}|!=]
/* Legend
* ' quotes (italic/bold)
* < start of xmlish_tag
* ~ signatures/dates
* [ start of links
* { start of parser functions, transclusion and template args
* \n all sort of block-level markup at start of line
* \r ditto
* h http(s) urls
* n nntp(s) urls
* m mailto urls
* I start of ISBN 10/13 auto links
* P start of PMID auto links
* R start of RFC auto links
*
* _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related)
* ! and | table cell delimiters, might be better to specialize those
* = headings - also specialize those!
*
* The following chars are also included for now, but only apply in some
* contexts and should probably be enabled only in those:
* : separate definition in ; term : definition
* ] end of link
* } end of parser func/transclusion/template arg
*/
// Plain text that may contain autolinks, behavior switches and html entities.
// The first alternative is the fast path: it consumes runs of characters that
// cannot start any special construct. Besides the markup characters it also
// stops at the letters I, P, R, f, g, h, i, m, n, s, t, w and at '_', so that
// the autolink and behavior switch alternatives get a chance to match there.
// If none of them matches, the character is consumed by text_char.
urltext = ( t:[^'<~[{\n\rIPRfghimnstw_|!:\]} &=]+ { return t.join(''); }
/ & url_chars autolink
/ & ('__') behavior_switch
/ htmlentity
// Convert trailing space into &nbsp;
// XXX: This should be moved to a serializer
/// ' ' & ':' { return "\u00a0"; }
/ t:text_char )+
/*
'//', // for protocol-relative URLs, but not in text!
'ftp://',
'git://',
'gopher://',
'http://',
'https://',
'irc://',
'ircs://', // @bug 28503
'mailto:',
'mms://',
'news:',
'nntp://', // @bug 3808 RFC 1738
'svn://',
'telnet://', // Well if we're going to support the above.. -ævar
'worldwind://',
*/
// Old version
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
// Experimental tweaked version: avoid expensive single-char substrings
// This did not bring the expected performance boost, however.
//text = [A-Za-z0-9,._ -] {
// textStart = pos;
//
// var res = input.substr(textStart - 1, inputLength)
// .match(/[A-Za-z0-9,._ -]+/)[0];
// pos = pos + (res.length - 1);
// return res
// }
// An html entity reference. Returns the decoded text wrapped in a span tag
// pair typed 'mw:Entity'; the start tag records the original source (src) and
// the decoded content (srcContent).
htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
    var entitySource = "&" + c.join('') + ";",
        decoded = unentity( entitySource ),
        openTk = new TagTk( 'span',
            [ new KV( 'typeof', 'mw:Entity' ) ],
            { src: entitySource, srcContent: decoded } );
    return [ openTk, decoded, new EndTagTk( 'span' ) ];
}
// One or more spaces or tabs, returned as a single string.
spaces
= s:[ \t]+ { return s.join(''); }
// A single space or tab.
space = [ \t]
// Zero or more spaces / tabs. Returns a one-element list holding the matched
// whitespace as a string, or an empty list if there was none.
optionalSpaceToken
  = s:space* {
        return s.length ? [ s.join('') ] : [];
    }
// Start of line: a newline token, or the very start of the input. Returns the
// newline token (if any), followed by an optional comment with its trailing
// newline, followed by an optional includeonly / noinclude tag token.
sol
  = nl:(newlineToken / & { return pos === 0; } { return [] })
    // Eat multi-line comment, so that syntax after still matches as if it
    // was actually preceded by a newline
    cn:( c:comment n:newline? {
            if ( n !== '' ) {
                return [c, n];
            } else {
                return [c];
            }
        }
    )?
    // Eat includeonly/noinclude at start of line, so that start-of-line
    // syntax after it still matches
    ni:( niStart:({return pos})
        s:space*
        "<" c:"/"? t:("includeonly" / "noinclude")
        ">" {return [s.join(''), c, t, [niStart, pos]]} )?
    {
        // Unmatched optional parts are reported as ''.
        var tokens = nl.concat( ( cn === '' ) ? [] : cn );
        if ( ni !== '' ) {
            // ni is [leading space, '/' or '', tag name, source range].
            var NiTokenType = ( ni[1] === '/' ) ? EndTagTk : TagTk;
            tokens.push( new NiTokenType( ni[2], [], { tsr: ni[3] } ) );
        }
        return tokens;
    }
// A single newline, comment or space.
nl_comment_space = newline / comment / space
/**
* noinclude / includeonly / onlyinclude productions. These are normally
* handled by the generic_tag production, except where generic tags are not
* allowed- for example in directives, which are allowed in various attribute
* names and -values.
*
* Example test case:
* {|
* |-<includeonly>
* foo
* </includeonly>
* |Hello
* |}
*/
// Any of the three inclusion-control sections; tried in the order listed.
include_limits = noinclude / includeonly / onlyinclude
// A complete <noinclude>...</noinclude> section around inline content.
// Parsing is bracketed by stops.inc / stops.dec on 'noinclude'. Returns the
// content wrapped in a noinclude tag pair with source ranges.
noinclude
  = & { return stops.inc('noinclude'); }
    "<noinclude>"
    i:inline
    "</noinclude>"
    {
        stops.dec('noinclude');
        // "<noinclude>" is 11 characters long and "</noinclude>" 12. The
        // previous 13 / 14 were the lengths of the includeonly tags and
        // made both ranges too long.
        return [new TagTk('noinclude', [], {tsr: [pos0, pos0 + 11]})]
            .concat(i, [new EndTagTk('noinclude', [], {tsr: [pos - 12, pos]})]);
    }
    // Tried only when the alternative above failed; undoes the inc.
  / & { return stops.dec('noinclude'); }
// A complete <includeonly>...</includeonly> section around inline content.
// Parsing is bracketed by stops.inc / stops.dec on 'includeonly'. Returns the
// content wrapped in an includeonly tag pair with source ranges.
includeonly
  = & { return stops.inc('includeonly'); }
    "<includeonly>"
    i:inline
    "</includeonly>"
    {
        stops.dec('includeonly');
        // "<includeonly>" is 13 characters long, "</includeonly>" 14.
        var openTk = new TagTk('includeonly', [], {tsr: [pos0, pos0+13]}),
            closeTk = new EndTagTk('includeonly', [], {tsr: [pos - 14, pos]});
        return [ openTk ].concat( i, [ closeTk ] );
    }
    // Tried only when the alternative above failed; undoes the inc.
  / & { return stops.dec('includeonly'); }
// A complete <onlyinclude>...</onlyinclude> section around inline content.
// Parsing is bracketed by stops.inc / stops.dec on 'onlyinclude'. Returns the
// content wrapped in an onlyinclude tag pair with source ranges.
onlyinclude
  = & { return stops.inc('onlyinclude'); }
    "<onlyinclude>"
    i:inline
    "</onlyinclude>"
    {
        stops.dec('onlyinclude');
        // "<onlyinclude>" is 13 characters long, "</onlyinclude>" 14.
        var openTk = new TagTk('onlyinclude', [], {tsr: [pos0, pos0+13]}),
            closeTk = new EndTagTk('onlyinclude', [], {tsr: [pos - 14, pos]});
        return [ openTk ].concat( i, [ closeTk ] );
    }
    // Tried only when the alternative above failed; undoes the inc.
  / & { return stops.dec('onlyinclude'); }
// End of input: succeeds without consuming anything when isEOF(pos) holds,
// and yields true.
eof = & { return isEOF(pos); } { return true; }
// A line break: either '\n' or '\r\n'. A lone '\r' is not matched.
newline
= '\n' / '\r\n'
// A line break, returned as a one-element list holding a fresh NlTk token.
newlineToken = newline { return [new NlTk()] }
// End of line or end of input.
eolf = newline / eof
// 'Preprocessor' directive- higher-level things that can occur in otherwise
// plain-text content.
// The alternatives are tried in the order listed.
directive
= comment
/ nowiki
/ tplarg_or_template
/ htmlentity
/ include_limits
// Plain text, but can contain templates, template arguments, comments etc-
// all stuff that is normally handled by the preprocessor
// Returns either a list of tokens, or a plain string (if nothing is to be
// processed).
// The first alternative is the fast path for runs of characters that cannot
// start a directive or end the text. Otherwise, unless an inline break
// applies, a directive or a single text_char is consumed. The collected
// pieces are passed through flatten.
preprocessor_text
= r:( t:[^<~[{\n\r\t|!\]}{ &=]+ { return t.join(''); }
/ !inline_breaks (
directive
/ text_char )
)+ {
return flatten ( r );
}
// Variant of preprocessor_text that does not consume a text_char where "]]"
// follows. The collected pieces are passed through flatten_stringlist.
wikilink_preprocessor_text
= r:( t:[^<~[{\n\r\t|!\]}{ &=]+ { return t.join(''); }
/ !inline_breaks ( directive / !"]]" text_char )
)+ {
return flatten_stringlist ( r );
}
// Preprocessor text for external link targets. Besides markup characters the
// fast path also stops at quotes and at the whitespace characters listed in
// its character class. The characters '.', ':' and ',' are only consumed when
// they are not followed by a space or the end of the line / input.
// The collected pieces are passed through flatten_string.
extlink_preprocessor_text
// added special separator character class inline: separates url from
// description / text
= r:( t:[^'<~[{\n\r|!\]}{\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); }
/ !inline_breaks ( directive / no_punctuation_char )
/// urlencoded_char
// !inline_breaks no_punctuation_char
/ s:[.:,] !(space / eolf) { return s }
/ [&%] )+ {
return flatten_string ( r );
}
// Attribute values with preprocessor support
// Unquoted attribute value that may span lines. inline_breaks is checked
// before every single character of the plain-text alternative, and before
// each directive or '&' / '%' character. The collected pieces are passed
// through flatten_string.
attribute_preprocessor_text
= r:( ts:(!inline_breaks t:[^=<>{}\n\r&'"\t ] {return t})+ { return ts.join(''); }
/ !inline_breaks (
directive
/ !inline_breaks [&%]
)
)+
{
//console.warn('prep');
return flatten_string ( r );
}
// Content of a single-quoted attribute value; may be empty and may contain
// newlines. Stops at the closing single quote. The collected pieces are
// passed through flatten_string.
attribute_preprocessor_text_single
= r:( t:[^{}&']+ { return t.join(''); }
/ !inline_breaks (
directive
/ [{&] )
)*
{
return flatten_string ( r );
}
// Content of a double-quoted attribute value; may be empty and may contain
// newlines. Stops at the closing double quote. The collected pieces are
// passed through flatten_string.
attribute_preprocessor_text_double
= r:( t:[^{}&"]+ { return t.join(''); }
/ !inline_breaks (
directive
/ [{&] )
)*
{
//console.warn( 'double:' + pp(r) );
return flatten_string ( r );
}
// Variants with the entire attribute on a single line
// Unquoted attribute text restricted to a single line. The fast path consumes
// runs of plain characters; otherwise, unless an inline break applies, a
// directive or any single character other than a newline, space, '[' or '>'
// is consumed. The collected pieces are passed through flatten_string.
attribute_preprocessor_text_line
= r:( ts:[^=<>{\n\r&'"\t \[\]|{}]+ { return ts.join(''); }
/ !inline_breaks
t:(
directive
/ !(newline / space / [\[>]) c:. {
//console.warn( 'aptl: ' + pp(c) );
return c
}
) { return t }
)+
{
//console.warn('prep');
return flatten_string ( r );
}
// Content of a single-quoted attribute value; may be empty. Stops at the
// closing single quote. The collected pieces are passed through
// flatten_string.
// NOTE(review): the ![\r\n] guard sits in front of [{&], which can never be a
// line break, and the first character class does not exclude \r or \n — so
// this production does not appear to enforce the single-line restriction its
// name suggests; confirm.
attribute_preprocessor_text_single_line
= r:( t:[^{}&']+ { return t.join(''); }
/ !inline_breaks (
directive
/ ![\r\n] [{&] )
)* {
return flatten_string ( r );
}
// Content of a double-quoted attribute value; may be empty. Stops at the
// closing double quote. The collected pieces are passed through
// flatten_string.
// NOTE(review): the ![\r\n] guard sits in front of [{&], which can never be a
// line break, and the first character class does not exclude \r or \n — so
// this production does not appear to enforce the single-line restriction its
// name suggests; confirm.
attribute_preprocessor_text_double_line
= r:( t:[^{}&"]+ { return t.join(''); }
/ !inline_breaks (
directive
/ ![\r\n] [{&] )
)* {
//console.warn( 'double:' + pp(r) );
return flatten_string ( r );
}
// Special-case support for those pipe templates
// A pipe, written either literally or as the '{{!}}' template.
pipe = "|" / "{{!}}"
// A single pipe (in either spelling) that is not followed by a second pipe of
// the same spelling.
end_pipe = "|" ! "|" / "{{!}}" ! "{{!}}"
// A double pipe; both pipes have to use the same spelling.
pipe_pipe = "||" / "{{!}}{{!}}"
// Similar, for tables..
// An exclamation mark, written either literally or as '{{;}}'.
// NOTE(review): '{{;}}' is an unusual spelling next to the '{{!}}' used for
// pipes — confirm that this is the intended template name.
exclam = "!" / "{{;}}"
/* Tabs do not mix well with the hybrid production syntax */
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent : */