mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-27 07:40:48 +00:00
e3a745a024
Change-Id: Id0894ccbedfa47fa3658817ca65119a2af76be3e
1996 lines
58 KiB
JavaScript
1996 lines
58 KiB
JavaScript
/**
|
|
* Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
|
|
* chunks of tokens (one chunk per top-level block matched) and eventually an
|
|
* end event. Tokens map to HTML tags as far as possible, with custom tokens
|
|
* used where further processing on the token stream is needed.
|
|
*
|
|
* @author Gabriel Wicke <gwicke@wikimedia.org>
|
|
* @author Brion Vibber <brion@wikimedia.org>
|
|
*/
|
|
{
|
|
/* Fixme: use static functions to separate module! Unfortunately, this
|
|
* does not work:
|
|
* var tu = require('./mediawiki.tokenizer.utils.js');
|
|
* console.warn(tu.flatten([]));
|
|
* Using exports in the module gets a bit further, but accesses to
|
|
* tu.flatten in productions still fail. Thus, I just moved the functions
|
|
* here until a solution is found:
|
|
*/
|
|
|
|
/* Static utilities */
|
|
|
|
// Flatten a list of lists.
|
|
//var simple_flatten = function ( e ) {
|
|
// // Fast single-level flatten:
|
|
// //return [].concat.apply([], e);
|
|
//
|
|
// var es = [];
|
|
// // flatten sub-arrays
|
|
// for(var i = 0, length = e.length; i < length; i++) {
|
|
// var ei = e[i];
|
|
// if ($.isArray(ei))
|
|
// es = es.concat(flatten(ei));
|
|
// else
|
|
// es.push(ei);
|
|
// };
|
|
// return es;
|
|
//};
|
|
|
|
/**
 * Recursively flatten nested arrays into a single-level array.
 *
 * Runs of consecutive non-array members are copied in one slice. When the
 * input contains no nested array at all, the input itself is returned
 * (no copy is made).
 *
 * @param {Array} e Possibly nested list
 * @returns {Array} Flat list, or `e` itself if it was already flat
 */
var flatten = function ( e ) {
    var result = [],
        runStart = -1,      // start index of the pending run of non-array members, -1 if none
        sawNested = false,  // set once any nested array was found
        idx = 0,
        count = e.length,
        member;

    while ( idx < count ) {
        member = e[idx];
        if ( member.constructor === Array ) {
            sawNested = true;
            if ( runStart >= 0 ) {
                // copy the plain members collected before this nested array
                result = result.concat( e.slice( runStart, idx ) );
                runStart = -1;
            }
            result = result.concat( flatten( member ) );
        } else if ( runStart < 0 ) {
            runStart = idx;
        }
        idx++;
    }

    if ( !sawNested ) {
        return e;
    }
    if ( runStart >= 0 ) {
        // trailing run of plain members
        result = result.concat( e.slice( runStart ) );
    }
    return result;
};
|
|
|
|
|
|
/**
 * Flatten a nested list of strings and tokens via flatten_stringlist. If
 * the result is exactly one string, return that string instead of the
 * one-element list.
 */
var flatten_string = function ( c ) {
    var merged = flatten_stringlist( c ),
        isSingleString = merged.length === 1 && merged[0].constructor === String;
    return isSingleString ? merged[0] : merged;
};
|
|
|
|
/**
 * Flatten a nested list and merge each run of adjacent strings into a
 * single string. Empty strings are dropped; non-string members are passed
 * through in order.
 */
var flatten_stringlist = function ( c ) {
    var items = flatten( c ),
        merged = [],
        pending = '',   // text accumulated since the last non-string member
        n = items.length,
        k, piece;

    // Emit the accumulated text (if any) as one string.
    var flush = function () {
        if ( pending !== '' ) {
            merged.push( pending );
            pending = '';
        }
    };

    for ( k = 0; k < n; k++ ) {
        piece = items[k];
        if ( piece.constructor !== String ) {
            flush();
            merged.push( piece );
        } else if ( piece !== '' ) {
            pending += piece;
        }
    }
    flush();
    return merged;
};
|
|
|
|
// Remove escaped quotes from attributes etc
|
|
// This was in the original PEG parser, but could not find anything in
|
|
// MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke
|
|
/**
 * Remove backslash escapes in front of a quote character.
 *
 * Every occurrence of `\` followed by `quotec` is replaced by `quotec`
 * alone. (A plain string pattern passed to String.replace would only
 * replace the first occurrence, hence the split/join.)
 *
 * @param {String} quotec The quote character, e.g. ' or "
 * @param {String} text Text possibly containing escaped quotes
 * @returns {String} Text with all escaped quotes unescaped
 */
var unquote = function ( quotec, text ) {
    return text.split( '\\' + quotec ).join( quotec );
};
|
|
|
|
// Decode html entities. In a browser, this should only be fed the entity,
|
|
// not untrusted html! XXX: replace with safer version.
|
|
/**
 * Decode an HTML entity by setting it as the HTML content of a freshly
 * created <div/> (through `$`, presumably jQuery) and reading the text back.
 * Must only be given the entity itself, never untrusted HTML.
 */
var unentity = function ( entity ) {
    var scratch = $( "<div/>" ),
        filled = scratch.html( entity );
    return filled.text();
};
|
|
|
|
// Debug print with global switch
|
|
/**
 * Debug print. Output is disabled by the hard-coded switch below; flip it
 * to true to have messages sent to console.warn.
 */
var dp = function ( msg ) {
    var debugEnabled = false;
    if ( !debugEnabled ) {
        return;
    }
    console.warn( msg );
};
|
|
|
|
/** Pretty-print a value as JSON, indented by two spaces. */
var pp = function ( s ) {
    var indentWidth = 2;
    return JSON.stringify( s, null, indentWidth );
};
|
|
|
|
// Simple string formatting using '%s'
|
|
/**
 * Minimal string formatting: each '%s' in `format` is replaced by the next
 * extra argument, in order. Placeholders without a matching argument are
 * replaced by the empty string.
 */
var sprintf = function ( format ) {
    var values = arguments,
        next = 1;   // index of the next unused argument (0 is the format itself)
    return format.replace( /%s/g, function () {
        if ( next < values.length ) {
            return values[next++];
        }
        return '';
    } );
};
|
|
|
|
/*
|
|
* Annotate a token stream with list items with appropriate list tokens
|
|
*
|
|
* XXX: Move this to a token handler in phase sync23! That way we can
|
|
* support list items from templates too.
|
|
*
|
|
* @static
|
|
* @method
|
|
* @param {[tokens]} Token stream with li tokens
|
|
* @returns {[tokens]} Token stream, possibly with additional list tokens
|
|
* */
|
|
/**
 * Convert 'listItem' tokens (carrying a `bullets` sequence) into properly
 * nested ul/ol/dl start and end tag tokens. 'list' start tags are dropped,
 * a 'list' end tag closes every list still open, and all other tokens are
 * passed through unchanged.
 *
 * @param {Array} tokens Token stream with listItem tokens
 * @returns {Array} Token stream with list and list item tag tokens
 * @throws {String} "Unknown node prefix <char>" for a bullet other than * # ; :
 */
var annotateList = function ( tokens ) {
    var out = [],       // Output token stream
        bstack = [],    // Bullets of the previous list item
        bnext = [],     // Bullets of the list item being processed
        endtags = [],   // Pending end tags, innermost last
        i, length, token;

    // Length of the common leading run of two bullet sequences.
    var commonPrefixLength = function ( x, y ) {
        var minLength = Math.min( x.length, y.length ),
            n = 0;
        while ( n < minLength && x[n] == y[n] ) {
            n++;
        }
        return n;
    };

    // Open a list plus its first item and remember the matching end tags.
    var pushList = function ( listName, itemName ) {
        out.push( new TagTk( listName ) );
        out.push( new TagTk( itemName ) );
        endtags.push( new EndTagTk( listName ) );
        endtags.push( new EndTagTk( itemName ) );
    };

    // Close the n innermost lists: item end tag first, then list end tag.
    var popTags = function ( n ) {
        for ( ; n > 0; n-- ) {
            out.push( endtags.pop() );
            out.push( endtags.pop() );
        }
    };

    // Close the current item and open a sibling item. Without an explicit
    // name the sibling gets the same tag name as the item just closed.
    var replaceItem = function ( newName ) {
        var closed = endtags.pop(),
            name = newName || closed.name;
        out.push( closed );
        out.push( new TagTk( name ) );
        endtags.push( new EndTagTk( name ) );
    };

    // True if a and b are one ':' and one ';' (in either order).
    var isDlDd = function ( a, b ) {
        var ab = [ a, b ].sort();
        return ( ab[0] === ':' && ab[1] === ';' );
    };

    // Emit the tags needed to get from the previous item's bullets (bs) to
    // the next item's bullets (bn).
    var doListItem = function ( bs, bn ) {
        var prefixLen = commonPrefixLength( bs, bn ),
            changeLen = Math.max( bs.length, bn.length ) - prefixLen,
            k;

        if ( changeLen === 0 ) {
            // identical bullets: sibling item in the same list
            replaceItem();
        } else if ( bs.length == bn.length &&
                changeLen == 1 &&
                isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
            // dd/dt transition within the same dl
            replaceItem( bn[prefixLen] == ';' ? 'dt' : 'dd' );
        } else {
            // close lists that are not shared with the new item
            popTags( bs.length - prefixLen );

            if ( prefixLen > 0 && bn.length == prefixLen ) {
                // back at an outer level: start a sibling item there
                replaceItem();
            }

            // open the lists that are new in bn
            for ( k = prefixLen; k < bn.length; k++ ) {
                switch ( bn[k] ) {
                    case '*':
                        pushList( 'ul', 'li' );
                        break;
                    case '#':
                        pushList( 'ol', 'li' );
                        break;
                    case ';':
                        pushList( 'dl', 'dt' );
                        break;
                    case ':':
                        pushList( 'dl', 'dd' );
                        break;
                    default:
                        // report the offending bullet of the new item
                        throw( "Unknown node prefix " + bn[k] );
                }
            }
        }
    };

    for ( i = 0, length = tokens.length; i < length; i++ ) {
        token = tokens[i];
        switch ( token.constructor ) {
            case TagTk:
                switch ( token.name ) {
                    case 'list':
                        // ignore token
                        break;
                    case 'listItem':
                        // convert listItem to list and list item tokens
                        bnext = token.bullets;
                        doListItem( bstack, bnext );
                        bstack = bnext;
                        break;
                    default:
                        // pass through all remaining start tags
                        out.push( token );
                        break;
                }
                break;
            case EndTagTk:
                if ( token.name == 'list' ) {
                    // close all list levels that are still open
                    popTags( bstack.length );
                    bstack = [];
                } else {
                    out.push( token );
                }
                break;
            default:
                out.push( token );
                break;
        }
    }
    return out;
};
|
|
|
|
|
|
/**
 * Determine if a string represents a valid ISBN-10 or ISBN-13 identifier
 *
 * By default only the length is checked (10 or 13 characters after removing
 * everything except digits and 'X'), because the checksum validation is
 * stricter than the standard parser. Pass `strict` to additionally verify
 * the check digit (and, for ISBN-13, the 978/979 prefix).
 *
 * @static
 * @method
 * @param {String} isbn: The string to check
 * @param {Boolean} [strict=false]: Also verify the checksum
 * @returns {Boolean}: True if valid ISBN, false otherwise.
 */
var isValidISBN = function ( isbn, strict ) {
    var i, checksum = 0;

    isbn = isbn.toUpperCase().replace( /[^\dX]/g, '' );

    if ( !strict ) {
        return isbn.length === 10 || isbn.length === 13;
    }

    switch ( isbn.length ) {
        case 10:
            // weights 10..2 for the first nine digits, then the check digit
            // ('X' counts as 10); a non-digit yields NaN and thus false
            for ( i = 0; i < 9; i++ ) {
                checksum += parseInt( isbn[i], 10 ) * ( 10 - i );
            }
            checksum += '0123456789X'.indexOf( isbn[9] );
            return ( checksum % 11 === 0 );
        case 13:
            // alternating weights 1 and 3 over all thirteen digits
            for ( i = 0; i < 13; i++ ) {
                checksum += parseInt( isbn[i], 10 ) * ( i & 1 ? 3 : 1 );
            }
            return ( checksum % 10 === 0 ) && ( /^97[89]/.test( isbn ) );
    }
    return false;
};
|
|
|
|
|
|
|
|
/* End static utilities */
|
|
|
|
/* Behavior switches scoped to entire page */
|
|
var behaviors = {};
|
|
|
|
/*
|
|
* Flags for specific parse environments (inside tables, links etc). Flags
|
|
* trigger syntactic stops in the inline_breaks production, which
|
|
* terminates inline and attribute matches. Flags merely reduce the number
|
|
* of productions needed: The grammar is still context-free as the
|
|
* productions can just be unrolled for all combinations of environments
|
|
* at the cost of a much larger grammar.
|
|
*/
|
|
/**
 * Registry of syntactic stop flags: named counters for cumulative flags and
 * named stacks for nested, non-cumulative ones. The boolean return values
 * (inc/push: true, dec/pop: false) let the methods be used directly as the
 * result of grammar predicates.
 */
function SyntaxStops () {
    this.counters = {};
    this.stacks = {};
}

// Increment the counter for a flag (starting at 1). Always returns true.
SyntaxStops.prototype.inc = function ( flag ) {
    var current = this.counters[flag];
    this.counters[flag] = ( current === undefined ) ? 1 : current + 1;
    return true;
};

// Decrement the counter for a flag. Always returns false.
SyntaxStops.prototype.dec = function ( flag ) {
    this.counters[flag] -= 1;
    return false;
};

// Current counter value for a flag (undefined if never incremented).
SyntaxStops.prototype.onCount = function ( name ) {
    return this.counters[name];
};

/**
 * A stack for nested, but not cumulative syntactic stops.
 * Example: '=' is allowed in values of template arguments, even if those
 * are nested in attribute names.
 *
 * Pushes a value on the named stack. Always returns true.
 */
SyntaxStops.prototype.push = function ( name, value ) {
    var stack = this.stacks[name];
    if ( stack === undefined ) {
        stack = this.stacks[name] = [];
    }
    stack.push( value );
    return true;
};

// Drop the top value of the named stack. Returns false; throws a string if
// no stack of that name was ever created.
SyntaxStops.prototype.pop = function ( name ) {
    if ( this.stacks[name] === undefined ) {
        throw "SyntaxStops.pop: unknown stop for " + name;
    }
    this.stacks[name].pop();
    return false;
};

// Top value of the named stack, or false if the stack is missing or empty.
SyntaxStops.prototype.onStack = function ( name ) {
    var stack = this.stacks[name],
        isEmpty = ( stack === undefined || stack.length === 0 );
    return isEmpty ? false : stack[stack.length - 1];
};
|
|
|
|
var stops = new SyntaxStops();
|
|
|
|
// Start position of top-level block
|
|
// Could also provide positions for lower-level blocks using a stack.
|
|
var blockStart = 0;
|
|
|
|
// Start position of generic tag production
|
|
var tagStartPos = 0;
|
|
|
|
// Stack of source positions
|
|
/**
 * Per-key stacks of source positions. push() records a start position under
 * a key; pop() pairs the most recent start position for that key with the
 * given end position.
 */
var posStack = {
    positions: {},

    // Record `pos` under `key`. Always returns true.
    push: function ( key, pos ) {
        var list = this.positions[key];
        if ( list === undefined ) {
            this.positions[key] = [ pos ];
        } else {
            list.push( pos );
        }
        return true;
    },

    // Return [ startPos, pos ] for the latest push on `key`; throws a string
    // if nothing is recorded for that key.
    pop: function ( key, pos ) {
        var list = this.positions[key];
        if ( list !== undefined && list.length ) {
            return [ list.pop(), pos ];
        }
        throw "Tried to pop unknown position for " + key;
    }
};
|
|
|
|
// cache the input length
|
|
var inputLength = input.length;
|
|
|
|
// pseudo-production that matches at end of input
|
|
// True when `pos` equals the cached input length, i.e. at end of input.
var isEOF = function ( pos ) {
    var atEnd = ( inputLength === pos );
    return atEnd;
};
|
|
|
|
// text start position
|
|
var textStart = 0;
|
|
|
|
// hack to support numbered external links ([http://example.com]).
|
|
// XXX: Move to token stream transform after templates are expanded!
|
|
var linkCount = 1;
|
|
|
|
// Define block-level tags in JS, so we can use toLowerCase to match tags
|
|
// case-independently. This would be quite ugly (and possibly slower) if
|
|
// done manually in the grammar.
|
|
// Lookup table of block-level tag names: each listed name maps to true.
var block_names = (function () {
    var lookup = {};
    [
        "p", "table", "td", "tr", "ul", "ol",
        "li", "dl", "dt", "dd", "div", "center",
        "blockquote"
    ].forEach( function ( tagName ) {
        lookup[tagName] = true;
    } );
    return lookup;
})();
|
|
|
|
var self = this;
|
|
|
|
|
|
}
|
|
|
|
/*********************************************************
|
|
* The top-level production
|
|
*********************************************************/
|
|
|
|
// Top-level rule: match all toplevelblocks plus trailing newlines. The
// blocks emit their own tokens, so only the end-of-input token is emitted
// here and an empty list is returned.
start
  = e:toplevelblock* newline* {
        // The end is passed inline as a token (as well as a separate event
        // for now).
        var eofChunk = [ new EOFTk( ) ];
        __parseArgs[2]( eofChunk );
        return []; //flatten(e);
    }
|
|
|
|
|
|
/*
|
|
* A document (start production) is a sequence of toplevelblocks. Tokens are
|
|
* emitted in chunks per toplevelblock to avoid buffering the full document.
|
|
*/
|
|
// One top-level block. Records the start offset, flattens the block's
// tokens, tags the first token with its source range and emits the chunk
// through the callback in __parseArgs[2].
toplevelblock
  = & { blockStart = pos; return true; }
    b:block {
        var chunk = flatten( b ),
            head;

        // Add source offsets for round-tripping. XXX: Add these not just for
        // toplevelblocks!
        if ( chunk.length ) {
            head = chunk[0];
            if ( head.constructor === String && head.attribs === undefined ) {
                // wrap the string in a String object so that it can carry
                // the dataAttribs property
                head = chunk[0] = new String( head );
            }
            if ( head.dataAttribs === undefined ) {
                head.dataAttribs = {};
            }
            head.dataAttribs.sourcePos = [ blockStart, pos ];
        }

        // Emit tokens for this toplevelblock. This feeds a chunk to the
        // parser pipeline.
        __parseArgs[2]( flatten( chunk ) );

        // No tokens are returned to the start production to save memory;
        // they were just emitted to the consumers.
        return true;
    }
|
|
|
|
/*
|
|
* The actual contents of each block.
|
|
*/
|
|
block
|
|
= block_lines
|
|
/ & '<' r:( pre // tag variant can start anywhere
|
|
/ comment &eolf
|
|
/ nowiki
|
|
// avoid a paragraph if we know that the line starts with a block tag
|
|
/ bt:block_tag { return [bt] }
|
|
) { return r; }
|
|
/ paragraph
|
|
// Inlineline includes generic tags; wrapped into paragraphs in token
|
|
// transform and DOM postprocessor
|
|
/ inlineline
|
|
/ sol
|
|
|
|
/*
|
|
* A block nested in other constructs. Avoid eating end delimiters for other
|
|
* constructs by checking against inline_breaks first.
|
|
*/
|
|
nested_block = !inline_breaks b:block { return b }
|
|
|
|
/*
|
|
* Line-based block constructs.
|
|
*/
|
|
// A start-of-line block construct, optionally preceded by one empty
// (whitespace-only) line whose tokens are kept in front of the block.
block_lines
  = s:sol
    // eat an empty line before the block
    s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
    bl:block_line {
        // an unmatched optional group yields ''
        var blankLine = ( s2 === '' ) ? [] : s2;
        return s.concat( blankLine, bl );
    }
|
|
|
|
/*
|
|
* Block structures with start-of-line wiki syntax
|
|
*/
|
|
block_line
|
|
= h
|
|
/ lists
|
|
/ st:optionalSpaceToken
|
|
r:( & [{}|!] tl:table_lines { return tl; }
|
|
// tag-only lines should not trigger pre either
|
|
/ bts:(bt:block_tag stl:optionalSpaceToken { return bt.concat(stl) })+
|
|
&eolf { return bts }
|
|
) {
|
|
return st.concat(r);
|
|
}
|
|
/ pre_indent
|
|
/ pre
|
|
|
|
|
|
/*
|
|
* A paragraph. We don't emit 'p' tokens to avoid issues with template
|
|
* transclusions, <p> tags in the source and the like. Instead, we perform
|
|
* some paragraph wrapping on the token stream and the DOM.
|
|
*/
|
|
paragraph
|
|
= s1:sol s2:sol c:inlineline {
|
|
return s1.concat(s2, /* [new TagTk('p')],*/ c);
|
|
}
|
|
|
|
br = space* &newline { return new SelfclosingTagTk( 'br' ) }
|
|
|
|
/*
|
|
* Syntax stops: Avoid eating significant tokens for higher-level productions
|
|
* in nested inline productions.
|
|
*
|
|
* Repeated testing of flags is not terribly efficient. See new and faster
|
|
* version below.
|
|
*/
|
|
|
|
/*
|
|
* Syntax stops: Avoid eating significant tokens for higher-level productions
|
|
* in nested inline productions.
|
|
*/
|
|
inline_breaks
|
|
= & [=|!}{:\r\n\]<]
|
|
& { // Important hack: disable caching for this production, as the default
|
|
// cache key does not take into account flag states!
|
|
cacheKey = '';
|
|
//console.warn('ilbf: ' + input.substr(pos, 5) );
|
|
return null !== __parseArgs[3].inline_breaks( input, pos, stops )
|
|
}
|
|
|
|
inline
|
|
= c:(urltext / (! inline_breaks (inline_element / . )))+ {
|
|
//console.warn('inline out:' + pp(c));
|
|
return flatten_stringlist( c );
|
|
}
|
|
|
|
|
|
inlineline
|
|
= c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
|
|
return flatten_stringlist( c );
|
|
}
|
|
|
|
inline_element
|
|
= //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
|
|
& '<' ( comment / xmlish_tag )
|
|
/// & '{' ( & '{{{{{' template / tplarg / template )
|
|
/ & '{' tplarg_or_template
|
|
/// & '{' ( tplarg / template )
|
|
// Eat three opening brackets as text.
|
|
/ '[[[' { return '[[[' }
|
|
/ & '[' ( wikilink / extlink / autolink )
|
|
/ & "'" quote
|
|
|
|
/* Headings */
|
|
|
|
// Heading: '='+ content '='+ followed by optional spaces/comments up to the
// end of the line. The heading level is the smaller of the two '=' counts;
// surplus '=' on either side become part of the heading text. The 'h' stop
// counter is held while the content is parsed and released on both the
// match and the no-match path.
h = & "=" // guard, to make sure '='+ will match.
    // XXX: Also check to end to avoid inline parsing?
    r:(
        s:'='+ // moved in here to make s accessible to inner action
        & { return stops.inc('h'); }
        c:inlineline
        e:'='+
        spc:(sp:space+ { return sp.join('') } / comment)*
        &eolf
        {
            stops.dec('h');
            var level = Math.min( s.length, e.length ),
                // s and e consist solely of '=' characters, so the surplus
                // is rebuilt from the counts; this works whether the match
                // result is a string or a list of characters.
                surplusStart = new Array( s.length - level + 1 ).join( '=' ),
                surplusEnd = new Array( e.length - level + 1 ).join( '=' ),
                lastIdx;

            // convert surplus equals into text
            if ( surplusStart !== '' ) {
                if ( c.length && c[0].constructor === String ) {
                    c[0] = surplusStart + c[0];
                } else {
                    c.unshift( surplusStart );
                }
            }
            if ( surplusEnd !== '' ) {
                lastIdx = c.length - 1;
                if ( lastIdx >= 0 && c[lastIdx].constructor === String ) {
                    // write back into the list; appending to a local copy
                    // of the string would lose the surplus
                    c[lastIdx] = c[lastIdx] + surplusEnd;
                } else {
                    c.push( surplusEnd );
                }
            }

            return [ new TagTk( 'h' + level ) ]
                .concat( c, [ new EndTagTk( 'h' + level ), spc ] );
        }
      / & { /* dp('nomatch exit h'); */ stops.dec('h'); return false } { return null }
    ) { return r }
|
|
|
|
// HTML comment, terminated by '-->' or by the end of input. Further
// comments that follow on subsequent lines (separated only by spaces and a
// newline) are appended to the result.
comment
  = '<!--' c:comment_chars* ('-->' / eof)
    cs:(space* newline space* cn:comment { return cn })* {
        var first = new CommentTk( c.join( '' ) );
        return [ first ].concat( cs );
    }
|
|
|
|
comment_chars
|
|
= c:[^-] { return c; }
|
|
/ c:'-' !'->' { return c; }
|
|
|
|
|
|
|
|
// Behavior switches
|
|
behavior_switch
|
|
= '__'
|
|
behavior:(
|
|
'NOTOC'
|
|
/ 'FORCETOC'
|
|
/ 'TOC'
|
|
/ 'NOEDITSECTION'
|
|
/ 'NEWSECTIONLINK'
|
|
/ 'NONEWSECTIONLINK'
|
|
/ 'NOGALLERY'
|
|
/ 'HIDDENCAT'
|
|
/ 'NOCONTENTCONVERT'
|
|
/ 'NOCC'
|
|
/ 'NOTITLECONVERT'
|
|
/ 'NOTC'
|
|
/ 'START'
|
|
/ 'END' // XXX: Removed in 19213. Should we keep this?
|
|
/ 'INDEX'
|
|
/ 'NOINDEX'
|
|
/ 'STATICREDIRECT'
|
|
)
|
|
'__'
|
|
{
|
|
behaviors[behavior] = true;
|
|
return '';
|
|
}
|
|
|
|
/**************************************************************
|
|
* External (bracketed and autolinked) links
|
|
**************************************************************/
|
|
|
|
autolink
|
|
= ! { return stops.onCount('extlink') }
|
|
(urllink / autoref / isbn)
|
|
|
|
urllink
|
|
= target:url {
|
|
return [ new TagTk( 'urllink', [new KV('href', target)] ) ];
|
|
}
|
|
|
|
// Bracketed external link: [target optional-text]. The 'extlink' counter is
// raised while the link is parsed (so links do not nest) and lowered again
// on both the match and the no-match path. Links without text get an
// auto-numbered label "[n]".
extlink
  = ! { return stops.onCount('extlink') } // extlink cannot be nested
    (
        "["
        & { return stops.inc('extlink'); }
        //target:urllink
        target:extlink_preprocessor_text
        text:(( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
              t:inlineline { return t } )?
        "]" {
            var content = text,
                attribs;
            stops.dec('extlink');
            if ( content === '' ) {
                // XXX: Link numbering should be implemented in post-processor.
                content = [ "[" + linkCount + "]" ];
                linkCount += 1;
            }
            //console.warn( 'extlink text: ' + pp( content ) );
            attribs = [
                new KV( 'href', target ),
                new KV( 'content', content )
            ];
            return [ new SelfclosingTagTk( 'extlink', attribs ) ];
        }
      / "[" & { stops.dec('extlink'); return false; }
    )
|
|
|
|
// Magic links "RFC <number>" and "PMID <number>": emitted as an extlink
// whose href is built from the per-scheme URL pattern and whose content is
// the reference text itself.
autoref
  = ref:('RFC' / 'PMID') (newline / space)+ identifier:[0-9]+
    {
        identifier = identifier.join('');

        var base_urls = {
                'RFC'  : '//tools.ietf.org/html/rfc%s',
                'PMID' : '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract'
            },
            url = sprintf( base_urls[ref], identifier );

        return [
            new SelfclosingTagTk( 'extlink', [
                new KV( 'href', url ),
                new KV( 'content', [ ref, identifier ].join( ' ' ) )
            ] )
        ];
    }
|
|
|
|
// Magic link "ISBN <digits>": digits may be separated by single dashes or
// spaces and may end in an 'X' check digit. Emitted as a wikilink to
// Special:BookSources; the match is rejected (null) when isValidISBN
// refuses the number.
isbn
  = 'ISBN' (newline / space)+
    head:[0-9]
    digits:([- ]? [0-9])+
    tail:'X'?
    {
        // joining the nested match lists inserts commas; strip them again
        var code = [ head, digits, tail ].join( '' ).replace( /,/g, '' );

        if ( !isValidISBN( code ) ) {
            return null;
        }

        return [
            new SelfclosingTagTk( 'wikilink', [
                // link target: separators removed, 'X' check digit kept
                new KV( '', 'Special:BookSources/' + code.replace( /[^\dX]/g, '' ) ),
                new KV( '', 'ISBN ' + code ),
                new KV( 'tail', '' )
            ] )
        ];
    }
|
|
|
|
|
|
/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
|
|
* these can be configured dynamically. */
|
|
|
|
url_chars = [/fghimnstwIPR]
|
|
|
|
url_protocol
|
|
= '//' // for protocol-relative URLs
|
|
/ 'ftp://'
|
|
/ 'git://'
|
|
/ 'gopher://'
|
|
/ 'http://'
|
|
/ 'https://'
|
|
/ 'irc://'
|
|
/ 'ircs://' // @bug 28503
|
|
/ 'mailto:'
|
|
/ 'mms://'
|
|
/ 'news:'
|
|
/ 'nntp://' // @bug 3808 RFC 1738
|
|
/ 'svn://'
|
|
/ 'telnet://' // Well if we're going to support the above.. -ævar
|
|
/ 'worldwind://'
|
|
|
|
// javascript does not support unicode features..
|
|
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
|
|
|
|
|
|
// A percent-encoded byte (%XX), decoded with decodeURI. Sequences that
// decodeURI rejects fail the match (null) so that other productions can
// handle the text.
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
    var escapeSeq = "%" + c0 + c1;
    try {
        return decodeURI( escapeSeq );
    } catch ( e ) {
        // Reject the match, and allow other fall-back productions to have a
        // go at it.
        return null;
    }
}
|
|
|
|
//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
|
|
|
|
// no punctuation, and '{<' to trigger directives
|
|
no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]
|
|
|
|
// A URL: protocol, optional bracketed IPv6 / dotted IPv4 address and a
// non-empty path. The path may contain comments, templates and entities;
// '.', ':' and ',' are only part of it when not followed by a space or the
// end of the line. Returns a string, or a list when the path contains
// non-string tokens.
url
  = proto:url_protocol
    addr:( ipv6_address / ipv4_address )?
    path:( ( !inline_breaks
             c:no_punctuation_char
             { return c }
           )
         / s:[.:,] !(space / eolf) { return s }
         / comment
         / tplarg_or_template
         / htmlentity
         / [&%{]
         )+
    {
        // addr is '' when no address matched
        var origin = proto + addr;
        return flatten_string( [ origin ].concat( path ) );
    }
|
|
|
|
ipv4_address
|
|
= a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*)
|
|
{
|
|
return flatten( a ).join('');
|
|
}
|
|
|
|
ipv6_address
|
|
= a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']')
|
|
{
|
|
return flatten( a ).join('');
|
|
}
|
|
|
|
|
|
/**************************************************************
|
|
* Templates, -arguments and wikilinks
|
|
**************************************************************/
|
|
|
|
/*
|
|
* Precedence: template arguments win over templates. See
|
|
* http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
|
|
* 4: {{{{·}}}} → {·{{{·}}}·}
|
|
* 5: {{{{{·}}}}} → {{·{{{·}}}·}}
|
|
* 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
|
|
* 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
|
|
*/
|
|
tplarg_or_template
|
|
=
|
|
! '{{{{{{{' (
|
|
& ( '{{{{{{' (!'}}}' .)* '}}}}}}') tplarg
|
|
/ & ( '{{{{{' (!'}}}' .)* '}}}}}') template
|
|
/ tplarg
|
|
/ template
|
|
)
|
|
|
|
// Template transclusion: {{ name | param | ... }}. Produces a 'template'
// self-closing token whose first attribute (empty key) holds the name.
template
  = "{{" (newline / space)*
    target:template_name
    params:(( newline / space )* "|"
            r:( (newline / space )* &"|" { return new KV( '', '') } // empty argument
              / p:template_param { return p }
              ) { return r }
           )*
    ( newline / space )*
    "}}" {
        // Insert target as first positional attribute, so that it can be
        // generically expanded. The TemplateHandler then needs to shift it
        // out again.
        var nameAttrib = { k: '', v: flatten( target ) };
        params.unshift( nameAttrib );
        //console.warn( 'tokenizer template ' + JSON.stringify( target ));
        return new SelfclosingTagTk( 'template', params );
    }
|
|
|
|
// XXX: support template and args in target!
|
|
//template_target
|
|
// = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }
|
|
|
|
// Template argument reference: {{{ name | default | ... }}}. Produces a
// 'templatearg' self-closing token whose first attribute (empty key) holds
// the flattened argument name.
tplarg
  = "{{{"
    name:template_name?
    params:( ( space / newline )*
             '|' ( space / newline )*
             r:(
                 &'}}' { return new KV( '', '') }
               / p:template_param { return p }
             ) { return r }
           )*
    ( space / newline )*
    "}}}" {
        var nameAttrib = { k: '', v: flatten( name ) };
        params.unshift( nameAttrib );
        //console.warn( 'tokenizer tplarg ' + JSON.stringify( params, null, 2 ));
        return new SelfclosingTagTk( 'templatearg', params );
    }
|
|
|
|
// One template parameter. Three shapes are produced:
//   name = value  ->  KV( name, value )
//   name =        ->  KV( name, [] )
//   value         ->  KV( [], value )      (positional parameter)
// An empty parameter (next char is '|' or '}') yields KV( [], [] ).
template_param
  = name:template_param_name
    // MW accepts |foo | = bar | as a single param..
    ('|' (space / newline)* &'=')?
    val:(
        s0:space*
        "="
        s1:space*
        value:template_param_value? {
            return { s0: s0, s1: s1, value: value };
        }
    )?
    {
        //console.warn( 'named template_param matched' + pp([name, val ]) );
        if ( val === '' ) {
            // no '=' present: positional parameter
            return new KV( [], flatten( name ) );
        }
        if ( val.value === '' ) {
            // '=' present, but no value
            return new KV( flatten( name ), [] );
        }
        return new KV( name, flatten( val.value ) );
    }
  // empty parameter
  / & [|}] { return new KV([], []); }
|
|
|
|
|
|
template_name = template_param_value
|
|
|
|
// FIXME: handle template args and templates in key! (or even parser functions?)
|
|
template_param_name
|
|
= & { return stops.push( 'equal', true ) }
|
|
tpt:(template_param_text / &'=' { return '' })
|
|
{
|
|
stops.pop( 'equal' );
|
|
//console.warn( 'template param name matched: ' + pp( tpt ) );
|
|
return tpt;
|
|
}
|
|
|
|
/ & { return stops.pop( 'equal' ) }
|
|
//= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
|
|
|
|
template_param_value
|
|
= & { return stops.push( 'equal', false ) }
|
|
tpt:template_param_text
|
|
{
|
|
stops.pop( 'equal' );
|
|
//console.warn( 'template param value matched: ' + pp( tpt ) );
|
|
return tpt;
|
|
}
|
|
/ & { return stops.pop( 'equal' ) }
|
|
|
|
// Text of a template name, parameter name or parameter value: one or more
// nested blocks, parsed with the 'template' counter raised and the 'table'
// stop pushed as false. Both are undone on the match and the no-match
// path. Leading newline tokens and whitespace-only strings are stripped
// from the result.
template_param_text
  = & { /*console.warn( 'tpt: ' +
            input.substr( pos - 10, 9) +
            input[pos].green +
            input.substr( pos +1, 9) ); */
        // re-enable tables within template parameters
        stops.push('table', false );
        return stops.inc('template')
    }
    il:nested_block+ {
        stops.pop('table');
        stops.dec('template');
        //console.warn( 'tpt match: ' + pp (il));
        var tokens = flatten( il ),
            // newline token, or a string made up of tabs / spaces only
            isLeadingWhitespace = function ( t ) {
                if ( t === undefined ) {
                    return false;
                }
                if ( t.constructor === NlTk ) {
                    return true;
                }
                return t.constructor === String && !!t.match( /^[\t ]+$/ );
            };

        // Strip leading newlines and space-only text tokens.
        // XXX: Will need to round-trip these eventually.
        while ( isLeadingWhitespace( tokens[0] ) ) {
            tokens.shift();
        }

        return tokens;
    }
  / & { stops.pop('table'); return stops.dec('template'); }
|
|
|
|
|
|
// TODO: handle link prefixes as in al[[Razi]]
|
|
// Internal link: [[target|text...]]tail. Start positions of the link and of
// its content are tracked on posStack so that source ranges can be attached
// to the resulting 'wikilink' token. The final alternative pops the
// 'wikilink' position again when the link did not match.
wikilink
  = & { return posStack.push('wikilink' , pos); }
    "[["
    ! url
    //target:link_target
    // XXX: disallow pipe!
    target:wikilink_preprocessor_text
    lcontent:(
        & { return posStack.push('lcontent' , pos); }
        lcs:( pipe lt:link_text { return new KV( '', lt ); } )+ {
            return { pos: posStack.pop('lcontent' , pos), content: lcs };
        }

      / {
            return { pos: posStack.pop('lcontent' , pos), content: [] };
        }
    )
    "]]"
    // XXX In real MediaWiki, this is a language-dependent positive character
    // class. Can we work out a static negative class instead?
    // XXX: Exclude uppercase chars from non-latin languages too!
    tail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
        var link = new SelfclosingTagTk( 'wikilink' ),
            tailText;

        // first attribute: the link target
        link.attribs.push( new KV( '', target ) );
        link.dataAttribs = {
            sourcePos: posStack.pop( 'wikilink', pos ),
            contentPos: lcontent.pos
        };
        // XXX: Point to object with path, revision and input information
        link.source = input;

        // Deal with content. XXX: Properly support pipe-trick etc
        link.attribs = link.attribs.concat( lcontent.content );

        // last attribute: the text directly following the closing brackets
        tailText = tail && tail.join( '' ) || '';
        link.attribs.push( new KV( 'tail', tailText ) );
        return [ link ];
    }
  / ! { return posStack.pop( 'wikilink', pos ); }
|
|
|
|
// Link description text, parsed with the 'linkdesc' counter raised; the
// counter is lowered again on both the match and the no-match path.
link_text
  = & { return stops.inc('linkdesc'); }
    h:inline
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline)?
    {
        //console.warn('link_text' + pp(h) + pp(hs));
        stops.dec('linkdesc');
        // hs is '' when there was no '=' continuation
        return ( hs !== '' ) ? h.concat( hs ) : h;
    }
  / & { return stops.dec('linkdesc'); }
|
|
|
|
// Like link_text, but additionally raises the 'pipe' counter while the
// text is parsed. Both counters are lowered again on the match and the
// no-match path.
link_option
  = & { stops.inc('pipe'); return stops.inc('linkdesc'); }
    h:inline
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline)?
    {
        //console.warn('link_option' + pp(h) + pp(hs));
        stops.dec('pipe');
        stops.dec('linkdesc');
        // hs is '' when there was no '=' continuation
        return ( hs !== '' ) ? h.concat( hs ) : h;
    }
  / & { stops.dec('pipe'); return stops.dec('linkdesc'); }
|
|
|
|
link_end = "]]"
|
|
|
|
/* Generic quote production for italic and bold, further processed in a token
|
|
* stream transformation in doQuotes. Relies on NlTk tokens being emitted
|
|
* for each line of text to balance quotes per line.
|
|
*
|
|
* We are not using a simple pair rule here as we need to support mis-nested
|
|
* bolds/italics and MediaWiki's special heuristics for apostrophes, which are
|
|
* all not context free. */
|
|
// Two or more apostrophes. Emitted as an 'mw-quote' tag token whose value
// is the full apostrophe run.
quote = "''" x:"'"* {
    var marker = new TagTk( 'mw-quote' ); // Will be consumed in token transforms
    marker.value = "''" + x.join( '' );
    return marker;
}
|
|
|
|
|
|
/**
|
|
* Image option productions, only called from the LinkHandler token stream
|
|
* transformer, and only for images.
|
|
*/
|
|
// All options of an image link, parsed with the 'pipe' counter raised.
// Returns an object mapping option keys to values (later options win);
// the flat option list is also kept under `_options`.
img_options =
    & { return stops.inc( 'pipe' ); }
    os:img_option* {
        var options = {},
            list;
        stops.dec( 'pipe' );
        list = flatten( os );
        list.forEach( function ( opt ) {
            options[opt.k] = opt.v;
        } );
        options._options = list;
        return options;
    }
  / & { return stops.dec( 'pipe' ); }
|
|
|
|
img_option
|
|
= pipe space*
|
|
o:(
|
|
img_attribute
|
|
/ img_format
|
|
/ img_dimensions
|
|
/ img_halign
|
|
/ img_valign
|
|
/ img_link
|
|
/ lt:link_text { return new KV('caption', lt) }
|
|
)
|
|
space* {
|
|
return o
|
|
};
|
|
|
|
img_format
|
|
= f:( 'border' / 'frameless' / 'frame' / 'thumbnail' / 'thumb' ) {
|
|
return new KV( 'format', f );
|
|
}
|
|
|
|
// Image size option: "<w>px", "x<h>px" or "<w>x<h>px", or the keyword
// 'upright'. A single dimension yields one KV; otherwise a list of KVs is
// returned.
img_dimensions
  = x:(n:[0-9]+ { return n.join('') })? y:('x' n:[0-9]+ { return n.join('') })? 'px' {
        // unmatched optional parts are ''
        if ( x === '' && y ) {
            return new KV( 'height', y );
        }
        if ( y === '' && x ) {
            return new KV( 'width', x );
        }
        return [ new KV( 'width', x ), new KV( 'height', y ) ];
    }
  / 'upright' { return [ new KV( 'width', 'upright' ) ] }
|
|
|
|
img_halign
|
|
= a:( 'left' / 'right' / 'center' / 'none' ) {
|
|
return new KV( 'halign', a );
|
|
}
|
|
|
|
img_valign
|
|
= a:( 'baseline' / 'sub' / 'super' / 'top' / 'text-top'
|
|
/ 'middle' / 'bottom' / 'text-bottom' ) {
|
|
return new KV( 'valign', a );
|
|
}
|
|
|
|
// External link targets are already parsed as link tokens. If we can make
|
|
// sure that those cleanly convert back to the original text, then we could
|
|
// re-parse them here.
|
|
img_link
|
|
= 'link=' space*
|
|
u:(
|
|
t:url {
|
|
stops.dec( 'pipe' );
|
|
return t;
|
|
}
|
|
/ & { return stops.dec( 'pipe' ); }
|
|
)
|
|
{
|
|
return new KV( 'link', u );
|
|
}
|
|
|
|
img_attribute
|
|
= k:( 'page' / 'alt' / 'thumbnail' / 'thumb' ) '=' t:preprocessor_text? {
|
|
return new KV( k, t );
|
|
}
|
|
|
|
|
|
|
|
/***********************************************************
|
|
* Pre and xmlish tags
|
|
***********************************************************/
|
|
|
|
// Indented pre blocks differ from their non-indented (purely tag-based)
|
|
// cousins by having their contents parsed.
|
|
// Indented pre block: either an indented block wrapped in explicit <pre>
// tags, or one or more consecutive indented lines wrapped in generated
// pre start / end tag tokens.
pre_indent
  = pre_indent_in_tags
  / l:pre_indent_line ls:(sol pre_indent_line)* {
        var open = new TagTk( 'pre' ),
            close = new EndTagTk( 'pre' );
        return [ open ].concat( [ l ], ls, [ close ] );
    }
|
|
|
|
// An indented pre block that is surrounded with pre tags. The pre tags are
// used directly: the attributes of the opening tag are passed on to the
// emitted pre token. The 'pre' stop counter is raised while the content is
// parsed; the second alternative lowers it again when the first one fails.
pre_indent_in_tags
  = space+ // XXX: capture space for round-tripping
    "<pre"
    preAttribs:generic_attribute*
    ">"
    & { return stops.inc('pre'); }
    firstLine:inlineline
    moreLines:(sol pre_indent_line)*
    "</pre>"
    {
        stops.dec('pre');
        var tokens = [ new TagTk( 'pre', preAttribs ) ];
        tokens = tokens.concat( firstLine, flatten( moreLines ) );
        tokens.push( new EndTagTk( 'pre' ) );
        return tokens;
    }
  / & { return stops.dec('pre'); }
|
|
|
|
// A single indented line; the result is the line content prefixed with a
// newline string.
pre_indent_line
  = space content:inlineline {
        var lineStart = [ '\n' ];
        return lineStart.concat( content );
    }
|
|
|
|
/*
 * Pre blocks defined using non-indented HTML tags only parse nowiki tags
 * inside them, and convert other content to verbatim text. Nowiki inside pre
 * is not functionally needed, but supported for backwards compatibility.
 *
 * The block is closed by "</pre>" or by the end of input. A stray "</pre>"
 * is returned as plain text by the second alternative.
 */
pre
  = "<pre"
    preAttribs:generic_attribute*
    ">"
    // MediaWiki <pre> is special in that it converts all pre content to plain
    // text.
    content:( run:[^<]+ { return run.join('') }
            / nowiki
            / !"</pre>" ch:. { return ch })+
    ("</pre>" / eof) {
        // return nowiki tags as well?
        var tokens = [ new TagTk( 'pre', preAttribs ) ];
        return tokens.concat( content, [ new EndTagTk( 'pre' ) ] );
    }
  / "</pre>" { return "</pre>"; }
|
|
|
|
/* XXX: Extension tags can require a change in the tokenizer mode, which
 * returns any text between extension tags verbatim. For now, we simply
 * continue to parse the contained text and return the tokens. The original
 * input source can be recovered from the source positions added on tag
 * tokens. This will not work in all cases, however. For example, a comment
 * start (<!--) between extension tags would cause the remaining text to be
 * consumed as a comment. To avoid this, we might need to look ahead for the
 * end tag and limit the content parsing to this section. */

// Any XML-like tag: nowiki is tried first, then the generic tag rule.
xmlish_tag
  = nowiki
  / generic_tag
|
|
|
|
/*
 * Nowiki treats anything inside it as plain text. It could thus also be
 * defined as an extension that returns its raw input text, possibly wrapped
 * in a span for round-trip information. The special treatment for nowiki in
 * pre blocks would still remain in the grammar though, so overall handling it
 * all here is cleaner.
 *
 * A complete <nowiki>...</nowiki> pair yields its content; an unmatched
 * start or end tag falls back to the literal tag text.
 */
nowiki
  = "<nowiki>" content:nowiki_content "</nowiki>" {
        return content;
    }
  / "<nowiki>" {
        // fallback: start tag without a matching end tag
        return ['<nowiki>'];
    }
  / "</nowiki>" {
        // fallback: stray end tag
        return ['</nowiki>'];
    }
|
|
|
|
// Raw text inside nowiki. Nested <pre ...>...</pre> sections are re-assembled
// verbatim; scanning stops in front of "</nowiki>" or "</pre>". The result is
// a one-element list holding the whole content as a single string.
nowiki_content
  = pieces:( run:[^<]+ { return run.join('') }
           / "<pre" ws:space* attrText:[^>]* ">" inner:nowiki_content "</pre>" {
                // nested pre in nowiki
                return "<pre" + ws.join('') + attrText.join('') + ">"
                    + inner.join('') + "</pre>";
             }
           / (!("</"( "nowiki>" / "pre>")) ch:. {
                return ch;
             })
    )* {
        // return nowiki tags as well?
        var text = pieces.join('');
        return [ text ];
    }
|
|
|
|
|
|
// The list of HTML5 tags, mainly used for the identification of *non*-html
// tags. Non-html tags terminate otherwise tag-eating productions (see list
// below) in order to support potential extension tags.
//
// The alternatives are kept in their original order (ordered choice); they
// are only laid out one initial letter per line.
html5_tagnames
  = "a" / "abbr" / "address" / "area" / "article" / "aside" / "audio"
  / "b" / "base" / "bdi" / "bdo" / "blockquote" / "body" / "br" / "button"
  / "canvas" / "caption" / "cite" / "code" / "col" / "colgroup" / "command"
  / "data" / "datalist" / "dd" / "del" / "details" / "dfn" / "div" / "dl" / "dt"
  / "em" / "embed"
  / "fieldset" / "figcaption" / "figure" / "footer" / "form"
  / "h1" / "h2" / "h3" / "h4" / "h5" / "h6" / "head" / "header" / "hgroup"
  / "hr" / "html"
  / "i" / "iframe" / "img" / "input" / "ins"
  / "kbd" / "keygen"
  / "label" / "legend" / "li" / "link"
  / "map" / "mark" / "menu" / "meta" / "meter"
  / "nav" / "noscript"
  / "object" / "ol" / "optgroup" / "option" / "output"
  / "p" / "param" / "pre" / "progress"
  / "q"
  / "rp" / "rt" / "ruby"
  / "s" / "samp" / "script" / "section" / "select" / "small" / "source"
  / "span" / "strong" / "style" / "sub" / "summary" / "sup"
  / "table" / "tbody" / "td" / "textarea" / "tfoot" / "th" / "thead" / "time"
  / "title" / "tr" / "track"
  / "u" / "ul"
  / "var" / "video"
  / "wbr"
|
|
|
|
// Tag names that are checked alongside html5_tagnames in nonhtml_endtag.
html_oldnames
  = "center"
  / "font"
  / "tt"
|
|
|
|
// Simple match on non-html endtags, for use in inline_breaks.
//
// NOTE(review): the negative lookahead is not followed by a name boundary,
// so it rejects every end tag whose name merely *starts* with one of the
// listed names (single-letter entries such as "a", "b", "i", "p", "q", "s"
// and "u" are prefixes of many other names) -- confirm this is intended.
nonhtml_endtag
  = '</'
    ! ( html5_tagnames / html_oldnames )
    [^ >]+
    ( space / newline ) *
    [^>]*
    '>'
|
|
|
|
/* Generic XML-like tags
 *
 * These also cover extensions (including Cite), which will hook into the
 * token stream for further processing. The content of extension tags is
 * parsed as regular inline, but the source positions of the tag are added
 * to allow reconstructing the unparsed text from the input. */

// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
//
// Produces an end tag, self-closing tag or start tag token and records the
// source range of the tag in dataAttribs.sourceTagPos.
generic_tag
  = "<"
    & { tagStartPos = pos; return true; } // remember the start position of this tag
    end:"/"? name:[0-9a-zA-Z]+
    attribs:generic_newline_attribute*
    ( space / newline ) *
    selfclose:"/"?
    ">" {
        var tagName = name.join('');

        // A leading slash wins over a trailing one.
        var TokenCtor = TagTk;
        if ( end != '' ) {
            TokenCtor = EndTagTk;
        } else if ( selfclose != '' ) {
            TokenCtor = SelfclosingTagTk;
        }

        var token = new TokenCtor( tagName, attribs );
        if ( token.dataAttribs === undefined ) {
            token.dataAttribs = {};
        }
        // tagStartPos was taken after the "<", hence the - 1.
        token.dataAttribs.sourceTagPos = [tagStartPos - 1, pos];
        return token;
    }
|
|
|
|
// A generic attribute that can span multiple lines. The value is the empty
// string when the attribute has no '=value' part.
generic_newline_attribute
  = ( space / newline )*
    attrName:generic_attribute_name
    attrValue:(( space / newline )*
        v:generic_attribute_newline_value { return v })?
    {
        // A missing optional value is already '', so it can be passed on
        // unchanged.
        return new KV( attrName, attrValue );
    }
|
|
|
|
// A single-line attribute. The value is the empty string when the attribute
// has no '=value' part.
generic_attribute
  = space*
    attrName:generic_attribute_name
    attrValue:(space*
        v:generic_attribute_value { return v })?
    {
        // FIXME: name might just be a template, which can expand to a key-value
        // pair later. We'll need to handle that in the AttributeTransformManager.
        //
        // A missing optional value is already '', so it can be passed on
        // unchanged.
        return new KV( attrName, attrValue );
    }
|
|
|
|
// ( Replaced by generic_attribute_name for template / parameter support. )
|
|
//// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
|
|
//// disallow newlines, | and {.
|
|
//generic_attribute_plain_name
|
|
// = n:[^ \t\0/"'>=\n|{]+ {
|
|
// return n.join('');
|
|
// }
|
|
|
|
// Attribute name with template / parameter support. The 'equal' stop is
// pushed while the name is parsed and popped again on success; the second
// alternative pops it when the first one fails. A name may not start at '/>'.
generic_attribute_name
  = & { return stops.push( 'equal', true ) }
    ! '/>'
    attrName:attribute_preprocessor_text_line
    {
        stops.pop( 'equal' );
        return attrName;
    }
  / & { return stops.pop( 'equal' ) }
|
|
|
|
// The '=value' part of a generic attribute, possibly spanning multiple
// lines. Yields the value only.
generic_attribute_newline_value
  = "=" (space / newline )* attrValue:xml_att_value {
        return attrValue;
    }
|
|
// The '=value' part of a generic, single-line attribute. Yields the value
// only.
generic_attribute_value
  = "=" space* attrValue:att_value {
        return attrValue;
    }
|
|
|
|
// Attribute value, quoted variants can span multiple lines. For the quoted
// forms the surrounding quotes are not part of the result.
xml_att_value
  = "'" quoted:attribute_preprocessor_text_single "'" { return quoted; }
  / '"' quoted:attribute_preprocessor_text_double '"' { return quoted; }
  / attribute_preprocessor_text
|
|
|
|
// Attribute value, restricted to a single line. For the quoted forms the
// surrounding quotes are not part of the result.
att_value
  = "'" quoted:attribute_preprocessor_text_single_line "'" { return quoted; }
  / '"' quoted:attribute_preprocessor_text_double_line '"' { return quoted; }
  / attribute_preprocessor_text_line
|
|
|
|
/*
 * A variant of generic_tag, but also checks if the tag name is a block-level
 * tag as defined in
 * http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and following
 * paragraphs.
 *
 * The result is a one-element token list; the match is aborted for names
 * that are not flagged in block_names.
 *
 * NOTE(review): the name pattern is [a-zA-Z]+ while generic_tag accepts
 * [0-9a-zA-Z]+, so names containing digits cannot be matched here -- confirm
 * whether block_names is expected to cover such names.
 */
block_tag
  = "<" end:"/"?
    name:(cs:[a-zA-Z]+ { return cs.join('') })
    attribs:generic_newline_attribute*
    ( space / newline ) *
    selfclose:"/"?
    ">" {
        var isBlockLevel = block_names[name.toLowerCase()] === true;
        if ( !isBlockLevel ) {
            // abort match if tag is not block-level
            return null;
        }

        // A leading slash wins over a trailing one.
        var TokenCtor = TagTk;
        if ( end != '' ) {
            TokenCtor = EndTagTk;
        } else if ( selfclose != '' ) {
            TokenCtor = SelfclosingTagTk;
        }
        return [ new TokenCtor( name, attribs ) ];
    }
|
|
|
|
|
|
/*********************************************************
|
|
* Lists
|
|
*********************************************************/
|
|
// A run of list items on consecutive lines. The flattened item tokens are
// wrapped in a 'list' start / end token pair and handed to annotateList.
lists
  = firstItem:(dtdd / li) moreItems:(sol (dtdd / li))*
    {
        var tokens = [ new TagTk( 'list' ) ];
        var items = flatten( [ firstItem ].concat( moreItems ) );
        tokens = tokens.concat( items, [ new EndTagTk( 'list' ) ] );
        return annotateList( tokens );
    }
|
|
|
|
// A single list item: one or more list characters followed by optional
// inline content up to the end of the line (or of the input). The listItem
// token carries the matched list characters in its bullets property.
li
  = bullets:list_char+
    content:inlineline?
    &eolf
    {
        var itemToken = new TagTk( 'listItem' );
        itemToken.bullets = bullets;
        // An item without content yields an empty content list.
        return [ itemToken, content === '' ? [] : content ];
    }
|
|
|
|
// Definition-list line of the form ";term:definition", optionally preceded
// by further list characters. Emits one listItem token for the term and one
// for the definition; their bullets are the shared prefix plus ";" and ":"
// respectively (as strings).
//
// The 'colon' stop counter is raised while the term is parsed so that the
// term ends at the ":"; it is reset to 0 afterwards and in the fall-back
// alternative.
dtdd
  // Capture only the list character: the unlabelled sequence would yield
  // [ predicateResult, char ] pairs.
  = bullets:(!(";" !list_char) lc:list_char { return lc; })*
    ";"
    & {return stops.inc('colon');}
    c:inlineline
    ":"
    // Fortunately dtdds cannot be nested, so we can simply set the flag
    // back to 0 to disable it.
    & { stops.counters['colon'] = 0; return true;}
    d:inlineline
    &eolf {
        // The conversion of a trailing space in the term into &nbsp; that
        // used to live here is disabled.
        // XXX: This should be moved to a serializer

        // Join the prefix explicitly. Relying on array-to-string coercion
        // (bullets + ";") inserts commas between the elements, which then
        // end up as bogus characters in the bullet string.
        var prefix = bullets.join('');

        var li = new TagTk( 'listItem' );
        li.bullets = prefix + ";";
        var li2 = new TagTk( 'listItem' );
        li2.bullets = prefix + ":";
        return [ li ].concat( c, [ li2 ], d );
    }
  // Fall-back case to clear the colon flag
  / & { return true; } { stops.counters['colon'] = 0; return null; }
|
|
|
|
|
|
// The characters that can form a list prefix.
list_char
  = [*#:;]
|
|
|
|
|
|
|
|
/*********************************************************************
|
|
* Tables
|
|
*
|
|
* Table productions are geared to support independent parsing of fragments in
|
|
* templates (the common table start / row / table end use case). The tokens
|
|
* produced by these fragments then match up to a table while building the
|
|
* DOM tree. For similar reasons, table rows do not emit explicit end tag
|
|
* tokens.
|
|
*
|
|
* The separate table_lines production is faster than moving those productions
|
|
* directly to block_lines.
|
|
*********************************************************************/
|
|
|
|
// One or more consecutive table lines. The 'table' stop is pushed while the
// lines are parsed and popped on success; the second alternative pops it
// when the first one fails.
table_lines
  = ! inline_breaks
    & { return stops.push('table', true); }
    firstLine:table_line
    moreLines:( lineStart:sol line:table_line { return lineStart.concat(line); } )* {
        stops.pop('table');
        var tokens = firstLine.concat( moreLines );
        return tokens;
    }
  / & { return stops.pop('table'); }
|
|
|
|
// A single line of table markup.
// This production assumes start-of-line position!
table_line
  = table_start_tag
  / table_row_tag
  / table_data_tags
  / table_heading_tags
  / table_caption_tag
  / table_end_tag
|
|
|
|
// Table start line: "{|" followed by optional attributes. The table end
// markup may follow on the same line, in which case its tokens are appended.
table_start_tag
  = "{" pipe
    ta:generic_attribute*
    space*
    te:table_end_tag? // can occur somewhere in the middle of the line too
    {
        var tableToken = new TagTk( 'table' );
        if ( ta ) {
            tableToken.attribs = ta;
        }
        var toks = [ tableToken ];
        return te ? toks.concat( te ) : toks;
    }
|
|
|
|
// Table caption line: "|+" followed by inline content, wrapped in a caption
// start / end token pair.
table_caption_tag
  = pipe "+"
    content:inline* {
        var tokens = [ new TagTk( 'caption' ) ];
        tokens = tokens.concat( content );
        tokens.push( new EndTagTk( 'caption' ) );
        return tokens;
    }
|
|
|
|
|
|
// Table row line: "|-" followed by optional attributes. No end tag token is
// emitted for the row.
table_row_tag
  = pipe "-"
    rowAttribs:generic_attribute*
    space*
    // handle tables with missing table cells after a row
    cell:( lineStart:sol !( pipe / "!" ) cellToks:table_data_tag { return lineStart.concat(cellToks); } )?
    {
        // We rely on our tree builder to close the row as needed. This is
        // needed to support building tables from fragment templates with
        // individual cells or rows.
        var rowTokens = [ new TagTk( 'tr', rowAttribs ) ];
        if ( cell ) {
            return rowTokens.concat( cell );
        }
        return rowTokens;
    }
|
|
|
|
// A line of table cells: "|" starts the first cell, "||" (see pipe_pipe)
// separates further cells on the same line.
table_data_tags
  = pipe
    firstCell:table_data_tag
    moreCells:( pipe_pipe cell:table_data_tag { return cell } )* {
        var tokens = firstCell.concat( moreCells );
        return tokens;
    }
|
|
|
|
// A single table cell: optional cell attributes followed by nested blocks.
// The cell may not start with '}', '+' or '-'. Emits a td start token
// followed by the cell content; no end tag token is emitted.
table_data_tag
  = ! [}+-]
    cellAttribs:table_cell_args?
    // use inline_breaks to break on tr etc
    content:( block:nested_block { return block }
    )*
    {
        if ( cellAttribs == '' ) {
            cellAttribs = [];
        }
        var tokens = [ new TagTk( 'td', cellAttribs ) ];
        return tokens.concat( content );
    }
|
|
|
|
// A line of table heading cells: "!" starts the first cell, "!!" separates
// further cells on the same line.
table_heading_tags
  = "!"
    firstCell:table_heading_tag
    moreCells:( "!!" cell:table_heading_tag { return cell } )* {
        var tokens = firstCell.concat( moreCells );
        return tokens;
    }
|
|
|
|
// A single table heading cell: optional cell attributes followed by nested
// blocks. Emits a th start token followed by the cell content; no end tag
// token is emitted.
table_heading_tag
  = cellAttribs:table_cell_args?
    content:nested_block* {
        if ( cellAttribs === '' ) {
            cellAttribs = [];
        }
        var tokens = [ new TagTk( 'th', cellAttribs ) ];
        return tokens.concat( content );
    }
|
|
|
|
// Table end markup "|}", emitted as a one-element list with the table end
// token.
table_end_tag
  = pipe "}" {
        return [ new EndTagTk( 'table' ) ];
    }
|
|
|
|
// Attributes of a table cell, terminated by a single pipe (a double pipe
// would start the next cell instead). The 'tableCellArg' stop counter is
// raised while the attributes are parsed and lowered on success; the second
// alternative lowers it when the first one fails.
table_cell_args
  = & { return stops.inc('tableCellArg'); }
    cellAttribs:generic_attribute* space* pipe !pipe {
        stops.dec('tableCellArg');
        return cellAttribs;
    }
  / & { return stops.dec('tableCellArg'); }
|
|
|
|
|
|
|
|
/* Tables, full-table version (no support for table fragments from templates)
 * Only left here for comparison purposes with the above productions. Remove
 * when those work as intended! */
table
  = tas:table_start space* c:table_caption? b:table_body? te:table_end {
        var tableToken = new TagTk( 'table' ),
            rows = ( b !== '' ) ? b : [],
            captionTokens = [];

        if ( tas.length > 0 ) {
            // FIXME: actually parse and build structure
            tableToken.attribs = tas;
        }

        if ( c != '' ) {
            // NOTE(review): the table_end result (te) is appended right
            // after the caption, and only when a caption exists; without a
            // caption it is dropped -- confirm this placement is intended.
            captionTokens = [ new TagTk( 'caption' ) ]
                .concat( c, [ new EndTagTk( 'caption' ) ], te );
        }

        return [ tableToken ].concat( captionTokens, rows, [ new EndTagTk( 'table' ) ] );
    }
|
|
|
|
// Start of a full table (legacy variant): "{|" followed by attributes, which
// are the result. The 'table' stop is pushed before the attributes are
// parsed; the inner fall-back alternative pops it again and always fails.
table_start
  = "{" pipe
    startAttribs:(
        & { stops.push('table', true); return true; }
        ta:generic_attribute*
        {
            return ta;
        }
      / & { stops.pop('table'); return false; } { return null; }
    ) { return startAttribs }
|
|
|
|
// Caption of a full table (legacy variant): a newline, "|+" and nested
// blocks. The result is the newline token followed by the caption content.
table_caption
  = lineBreak:newlineToken
    pipe "+" content:nested_block* {
        var tokens = lineBreak.concat( content );
        return tokens;
    }
|
|
|
|
// Body of a full table (legacy variant): an optional first row without
// explicit row markup, followed by regular rows.
table_body
  = first:table_firstrow rest:table_row* {
        var rows = [ first ];
        return rows.concat( rest );
    }
  / rowsOnly:table_row* {
        return rowsOnly;
    }
|
|
|
|
// First row of a full table (legacy variant): cells that are not preceded by
// row markup, wrapped in a tr start / end token pair.
table_firstrow
  = cells:(table_data / table_heading)+ {
        var tokens = [ new TagTk( 'tr' ) ];
        tokens = tokens.concat( cells );
        tokens.push( new EndTagTk( 'tr' ) );
        return tokens;
    }
|
|
|
|
// Row of a full table (legacy variant): a newline, "|-", optional row
// attributes terminated by a single pipe, and the cells of the row. The
// result is the newline token followed by the cells wrapped in a tr start /
// end token pair.
table_row
  = n:newlineToken
    pipe "-"
    a:(as:generic_attribute+ space* pipe !pipe { return as } )?
    space*
    td:(table_data / table_heading)* {
        // Attach the parsed row attributes to the tr token; they used to be
        // parsed and then silently dropped. Rows without attributes get the
        // same token as before.
        var trToken = ( a === '' ) ? new TagTk( 'tr' ) : new TagTk( 'tr', a );
        return n.concat( [ trToken ], td, [ new EndTagTk( 'tr' ) ] );
    }
|
|
|
|
// Cell of a full table (legacy variant). It starts either with "||" on the
// same line or on a new line (with a pipe, or with anything but '!'), and
// may not continue with '}', '+' or '-'. Optional attributes are terminated
// by end_pipe. The content is wrapped in a td start / end token pair.
table_data
  = lead:(pipe_pipe { return [] } / nt:newlineToken (pipe / !'!') { return nt })
    ! [}+-]
    cellAttribs:(as:generic_attribute+ space* end_pipe { return as } )?
    // use inline_breaks to break on tr etc
    content:(
        blocks:nested_block+ { return blocks }
    )* {
        if ( cellAttribs == '' ) {
            cellAttribs = [];
        }
        var startTag = [ new TagTk( 'td', cellAttribs ) ];
        return lead.concat( startTag, content, [ new EndTagTk( 'td' ) ] );
    }
|
|
|
|
// Heading cell of a full table (legacy variant). It starts either with "!!"
// on the same line or with "!" on a new line. Optional attributes are
// terminated by end_pipe. The content is wrapped in a th start / end token
// pair.
table_heading
  = lead:("!!" { return [] } / nl:newlineToken "!" { return nl })
    cellAttribs:(as:generic_attribute+ end_pipe { return as } )?
    content:inline {
        if ( cellAttribs == '' ) {
            cellAttribs = [];
        }
        var startTag = [ new TagTk( 'th', cellAttribs ) ];
        return lead.concat( startTag, content, [ new EndTagTk( 'th' ) ] );
    }
|
|
|
|
// Raw attribute text of a table cell up to the terminating end_pipe.
thtd_attribs
  // In particular, do not match [|\n]
  = parts:(text / ! inline_breaks ch:[="':;/,. -] { return ch } )+ end_pipe {
        return parts;
    }
|
|
|
|
|
|
// End of a full table (legacy variant): an optional newline followed by "|}"
// or the end of input. Pops the 'table' stop and yields the newline token
// list, or an empty list when there was no newline.
table_end
  = lineBreak:newlineToken? ( pipe "}" / eof ) {
        stops.pop('table');
        return lineBreak ? lineBreak : [];
    }
|
|
|
|
|
|
|
|
/*******************************************************************
|
|
* Text variants and other general productions
|
|
*******************************************************************/
|
|
|
|
/* All chars that cannot start syntactic structures in the middle of a line
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text production on plain
 * content. */
text_char
  = [^'<~[{\n\r:\]}|!=]

// A run of text_char, joined into a single string.
text
  = chars:text_char+ {
        var joined = chars.join('');
        return joined;
    }
|
|
|
|
/* Legend
|
|
* ' quotes (italic/bold)
|
|
* < start of xmlish_tag
|
|
* ~ signatures/dates
|
|
* [ start of links
|
|
* { start of parser functions, transclusion and template args
|
|
* \n all sort of block-level markup at start of line
|
|
* \r ditto
|
|
* h http(s) urls
|
|
* n nntp(s) urls
|
|
* m mailto urls
|
|
* I start of ISBN 10/13 auto links
|
|
* P start of PMID auto links
|
|
* R start of RFC auto links
|
|
*
|
|
* _ behavior switches (e.g., '__NOTOC__') (XXX: not URL related)
|
|
* ! and | table cell delimiters, might be better to specialize those
|
|
* = headings - also specialize those!
|
|
*
|
|
* The following chars are also included for now, but only apply in some
|
|
* contexts and should probably be enabled only in those:
|
|
* : separate definition in ; term : definition
|
|
* ] end of link
|
|
* } end of parser func/transclusion/template arg
|
|
*/
|
|
|
|
// Text that may contain autolinks, behavior switches and HTML entities.
// Plain runs are matched in bulk first; the remaining alternatives handle
// the positions where one of the special constructs may start.
urltext
  = ( run:[^'<~[{\n\rIPRfghimnstw_|!:\]} &=]+ { return run.join(''); }
    / & url_chars autolink
    / & ('__') behavior_switch
    / htmlentity
    // Convert trailing space into &nbsp;
    // XXX: This should be moved to a serializer
    / ' ' & ':' { return "\u00a0"; }
    / text_char )+
|
|
|
|
/*
|
|
'//', // for protocol-relative URLs, but not in text!
|
|
'ftp://',
|
|
'git://',
|
|
'gopher://',
|
|
'http://',
|
|
'https://',
|
|
'irc://',
|
|
'ircs://', // @bug 28503
|
|
'mailto:',
|
|
'mms://',
|
|
'news:',
|
|
'nntp://', // @bug 3808 RFC 1738
|
|
'svn://',
|
|
'telnet://', // Well if we're going to support the above.. -ævar
|
|
'worldwind://',
|
|
*/
|
|
|
|
// Old version
|
|
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
|
|
|
|
// Experimental tweaked version: avoid expensive single-char substrings
|
|
// This did not bring the expected performance boost, however.
|
|
//text = [A-Za-z0-9,._ -] {
|
|
// textStart = pos;
|
|
//
|
|
// var res = input.substr(textStart - 1, inputLength)
|
|
// .match(/[A-Za-z0-9,._ -]+/)[0];
|
|
// pos = pos + (res.length - 1);
|
|
// return res
|
|
// }
|
|
|
|
// An HTML entity such as "&amp;" or "&#160;"; the entity text is passed to
// unentity and its result is returned.
htmlentity
  = "&" body:[#0-9a-zA-Z]+ ";" {
        var entity = "&" + body.join('') + ";";
        return unentity(entity)
    }
|
|
|
|
// One or more spaces or tabs, joined into a single string.
space
  = blanks:[ \t]+ {
        var joined = blanks.join('');
        return joined;
    }
|
|
|
|
// Optional whitespace as a token list: a one-element list with the matched
// whitespace, or an empty list when there is none.
optionalSpaceToken
  = runs:space* {
        return runs.length ? [ runs.join('') ] : [];
    }
|
|
|
|
|
|
// Start of line: a newline, or the very start of the input. The result is
// the list of tokens consumed here (newline token, comments with their
// newlines, and an includeonly / noinclude tag token if present).
sol
  = nl:(newlineToken / & { return pos === 0; } { return [] })
    // Eat multi-line comments, so that syntax after still matches as if it
    // was actually preceded by a newline
    cn:( c:comment n:newline? {
            return ( n !== '' ) ? [ c, n ] : [ c ];
        }
    )*
    // Eat includeonly/noinclude at start of line, so that start-of-line
    // syntax after it still matches
    ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
    {
        var niToken = [];
        if ( ni !== '' ) {
            // ni is [ slash, tagName ]; a slash marks an end tag.
            var NiCtor = ( ni[0] === '/' ) ? EndTagTk : TagTk;
            niToken = [ new NiCtor( ni[1] ) ];
        }

        return nl.concat( cn, niToken );
    }
|
|
|
|
// End of input: succeeds without consuming anything when isEOF reports the
// current position as the end.
eof
  = & { return isEOF(pos); } { return true; }
|
|
|
|
|
|
// A line break in either LF or CRLF form.
newline
  = '\n'
  / '\r\n'

// A line break as a one-element token list.
newlineToken
  = newline { return [new NlTk()] }

// End of line or end of input.
eolf
  = newline
  / eof
|
|
|
|
|
|
// 'Preprocessor' directive: higher-level things that can occur in otherwise
// plain-text content.
directive
  = comment
  / nowiki
  / tplarg_or_template
  / htmlentity
|
|
|
|
// Plain text, but can contain templates, template arguments, comments etc:
// all stuff that is normally handled by the preprocessor.
// Returns either a list of tokens, or a plain string (if nothing is to be
// processed).
preprocessor_text
  = parts:( run:[^<~[{\n\r\t|!\]}{ &=]+ { return run.join(''); }
          / !inline_breaks (
              directive
            / text_char )
    )+ {
        return flatten ( parts );
    }
|
|
|
|
// Variant of preprocessor_text that stops at spaces; the collected parts are
// passed through flatten_string.
spaceless_preprocessor_text
  = parts:( run:[^'<~[{\n\r|!\]}{\t &=]+ { return run.join(''); }
          / !inline_breaks (
              directive
            / !' ' text_char )
    )+ {
        return flatten_string ( parts );
    }
|
|
|
|
|
|
// Variant of preprocessor_text for wikilinks: single characters are not
// consumed at "]]"; the collected parts are passed through
// flatten_stringlist.
wikilink_preprocessor_text
  = parts:( run:[^<~[{\n\r\t|!\]}{ &=]+ { return run.join(''); }
          /// urlencoded_char
          / !inline_breaks ( directive / !"]]" text_char )
    )+ {
        return flatten_stringlist ( parts );
    }
|
|
|
|
// Variant of preprocessor_text for external link targets; the collected
// parts are passed through flatten_string.
extlink_preprocessor_text
  // added special separator character class inline: separates url from
  // description / text
  = parts:( run:[^'<~[{\n\r|!\]}{\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return run.join(''); }
          / !inline_breaks ( directive / no_punctuation_char )
          /// urlencoded_char
          // !inline_breaks no_punctuation_char
          // '.', ':' and ',' only count when not followed by space or
          // end of line
          / punct:[.:,] !(space / eolf) { return punct }
          / [&%] )+ {
        return flatten_string ( parts );
    }
|
|
|
|
// Attribute values with preprocessor support.
// Unquoted value; the collected parts are passed through flatten_string.
attribute_preprocessor_text
  = parts:( chars:(!inline_breaks ch:[^=<>{\n\r&'"\t ] {return ch})+ { return chars.join(''); }
          / !inline_breaks (
              directive
            / !inline_breaks [&%]
            )
    )+
    {
        return flatten_string ( parts );
    }
|
|
|
|
// Content of a single-quoted attribute value (may be empty); the collected
// parts are passed through flatten_string.
attribute_preprocessor_text_single
  = parts:( run:[^{&']+ { return run.join(''); }
          / !inline_breaks (
              directive
            / [{&] )
    )*
    {
        return flatten_string ( parts );
    }
|
|
// Content of a double-quoted attribute value (may be empty); the collected
// parts are passed through flatten_string.
attribute_preprocessor_text_double
  = parts:( run:[^{&"]+ { return run.join(''); }
          / !inline_breaks (
              directive
            / [{&] )
    )*
    {
        return flatten_string ( parts );
    }
|
|
|
|
// Variants with the entire attribute on a single line.
// Unquoted value; the collected parts are passed through flatten_string.
attribute_preprocessor_text_line
  = parts:( chars:(!inline_breaks ch:[^=<>{\n\r&'"\t \[] {return ch})+ { return chars.join(''); }
          / !inline_breaks (
              directive
            / !'\n' [&%] )
    )+ {
        return flatten_string ( parts );
    }
|
|
|
|
// Content of a single-quoted attribute value in the single-line variant (may
// be empty); the collected parts are passed through flatten_string.
attribute_preprocessor_text_single_line
  = parts:( run:[^{&']+ { return run.join(''); }
          / !inline_breaks (
              directive
            / !'\n' [{&] )
    )* {
        return flatten_string ( parts );
    }
|
|
// Content of a double-quoted attribute value in the single-line variant (may
// be empty); the collected parts are passed through flatten_string.
attribute_preprocessor_text_double_line
  = parts:( run:[^{&"]+ { return run.join(''); }
          / !inline_breaks (
              directive
            / !'\n' [{&] )
    )* {
        return flatten_string ( parts );
    }
|
|
|
|
// Special-case support for those pipe templates: "{{!}}" is accepted
// wherever a literal pipe is.
pipe
  = "|"
  / "{{!}}"

// A pipe that is not followed by another pipe of the same form.
end_pipe
  = "|" ! "|"
  / "{{!}}" ! "{{!}}"

// Two consecutive pipes of the same form.
pipe_pipe
  = "||"
  / "{{!}}{{!}}"

// Similar, for tables..
exclam
  = "!"
  / "{{;}}"
|
|
|
|
/* Tabs do not mix well with the hybrid production syntax */
|
|
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent : */
|