mediawiki-extensions-Visual.../modules/parser/pegParser.pegjs.txt
Gabriel Wicke 33e19f7275 Recognize block-level elements independent of case; Ignore toc and section
edit links in tests. 148 parser tests passing.
2011-12-05 20:03:24 +00:00

1269 lines
39 KiB
JavaScript

/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
{
/* Fixme: use static functions to separate module! Unfortunately, this
* does not work:
* var tu = require('./mediawiki.tokenizer.utils.js');
* console.log(tu.flatten([]));
* Using exports in the module gets a bit further, but accesses to
* tu.flatten in productions still fail. Thus, I just moved the functions
* here until a solution is found:
*/
/* Static utilities */
// Flatten a list of (arbitrarily nested) lists into one flat list.
//
// @param {Array} e Possibly nested list of values
// @returns {Array} New array with every non-array member, in original order
//
// Uses the native Array.isArray instead of jQuery's $.isArray so that the
// tokenizer utilities do not need a jQuery global, and appends to a single
// accumulator rather than re-copying it with concat() for each sub-array.
var flatten = function ( e ) {
    var es = [];
    var collect = function ( list ) {
        for (var i = 0, length = list.length; i < length; i++) {
            var ei = list[i];
            if (Array.isArray(ei)) {
                collect(ei);
            } else {
                es.push(ei);
            }
        }
    };
    collect(e);
    return es;
};
// Remove escaped quotes from attributes etc.
//
// @param {String} quotec The quote character (' or ")
// @param {String} text Text possibly containing backslash-escaped quotes
// @returns {String} text with every `\` + quotec replaced by quotec
//
// split/join is used because String.replace with a string pattern only
// replaces the first occurrence.
var unquote = function (quotec, text) {
    return text.split('\\' + quotec).join(quotec);
};
// Debug print with global switch: set debugEnabled to true to get the
// messages on the console. Only the single msg argument is printed.
var dp = function ( msg ) {
    var debugEnabled = false;
    if ( !debugEnabled ) {
        return;
    }
    console.log(msg);
};
// Pretty-print a value as JSON indented by two spaces (used for debug output).
var pp = function ( s ) {
    var indent = 2;
    return JSON.stringify(s, null, indent);
};
/*
 * Annotate a token stream with list items with appropriate list tokens
 *
 * 'list' start tags are dropped, each 'listItem' start tag is replaced by the
 * list / item start and end tags needed to get from the previous item's
 * bullets to this item's bullets, and the 'list' end tag closes whatever is
 * still open. All other tokens are passed through unchanged.
 *
 * @static
 * @method
 * @param {[tokens]} Token stream with li tokens
 * @returns {[tokens]} Token stream, possibly with additional list tokens
 * @throws {Error} if a listItem carries a bullet other than * # ; :
 * */
var annotateList = function ( tokens ) {
    var out = [],      // List of tokens
        bstack = [],   // Bullet stack, previous element's listStyle
        bnext = [],    // Next element's listStyle
        endtags = [];  // Stack of end tags

    // Number of leading bullets shared by x and y (strings or char arrays).
    var commonPrefixLength = function (x, y) {
        var minLength = Math.min(x.length, y.length);
        for (var i = 0; i < minLength; i++) {
            if (x[i] !== y[i]) {
                break;
            }
        }
        return i;
    };

    // Open a new list plus its first item and remember both end tags.
    var pushList = function ( listName, itemName ) {
        out.push({type: 'TAG', name: listName});
        out.push({type: 'TAG', name: itemName});
        endtags.push({type: 'ENDTAG', name: listName});
        endtags.push({type: 'ENDTAG', name: itemName});
    };

    // Close the n innermost lists (item end tag first, then list end tag).
    var popTags = function ( n ) {
        for (; n > 0; n--) {
            // push list item..
            out.push(endtags.pop());
            // and the list end tag
            out.push(endtags.pop());
        }
    };

    // Close the innermost open item and open a sibling of the same kind.
    var nextItem = function () {
        var itemToken = endtags.pop();
        out.push(itemToken);
        out.push({type: 'TAG', name: itemToken.name});
        endtags.push({type: 'ENDTAG', name: itemToken.name});
    };

    // True if a and b are one ':' and one ';' (in either order).
    var isDlDd = function (a, b) {
        return (a === ':' && b === ';') || (a === ';' && b === ':');
    };

    var doListItem = function ( bs, bn ) {
        var prefixLen = commonPrefixLength(bs, bn);
        var sameDepth = bs.length === bn.length;

        if (sameDepth && prefixLen === bn.length) {
            // identical bullets: next item in the same list
            nextItem();
        } else if (sameDepth
                && bn.length - prefixLen === 1
                && isDlDd(bs[prefixLen], bn[prefixLen])) {
            // handle dd/dt transitions
            var newName = (bn[prefixLen] === ';') ? 'dt' : 'dd';
            out.push(endtags.pop());
            out.push({type: 'TAG', name: newName});
            endtags.push({type: 'ENDTAG', name: newName});
        } else {
            // emit close tag tokens for closed lists
            popTags(bs.length - prefixLen);
            if (prefixLen > 0 && bn.length === prefixLen) {
                // Nesting decreased without a change of list type: after
                // closing the inner lists, continue the outer list with a
                // new item instead of appending to the old one.
                nextItem();
            }
            for (var i = prefixLen; i < bn.length; i++) {
                switch (bn[i]) {
                    case '*':
                        pushList('ul', 'li');
                        break;
                    case '#':
                        pushList('ol', 'li');
                        break;
                    case ';':
                        pushList('dl', 'dt');
                        break;
                    case ':':
                        pushList('dl', 'dd');
                        break;
                    default:
                        // report the offending bullet itself
                        throw new Error("Unknown node prefix " + bn[i]);
                }
            }
        }
    };

    for (var i = 0, length = tokens.length; i < length; i++) {
        var token = tokens[i];
        switch ( token.type ) {
            case 'TAG':
                switch (token.name) {
                    case 'list':
                        // ignore token
                        break;
                    case 'listItem':
                        // convert listItem to list and list item tokens
                        bnext = token.bullets;
                        doListItem( bstack, bnext );
                        bstack = bnext;
                        break;
                    default:
                        // pass through all remaining start tags
                        out.push(token);
                        break;
                }
                break;
            case 'ENDTAG':
                if ( token.name === 'list' ) {
                    // pop all open list item tokens
                    popTags(bstack.length);
                    bstack = [];
                } else {
                    out.push(token);
                }
                break;
            default:
                out.push(token);
                break;
        }
    }
    return out;
};
/*
 * Italic/Bold handling.
 *
 * - list of tokens
 * - NEWLINE
 * - ticks (2+) -> list with link in line token list?
 * - process on newline
 * - need access to text nodes before/after for conversion back to text
 *
 * QUOTE tokens are collected per line as placeholders; on each NEWLINE token
 * the placeholders are turned into alternating start / end tags ('i' for two
 * ticks, 'b' for three), with an end tag appended for an unclosed one.
 *
 * @param {[tokens]} tokens Token stream containing QUOTE and NEWLINE tokens
 * @returns {[tokens]} Token stream with QUOTE tokens converted to i/b tags
 *          (QUOTE tokens after the last NEWLINE are left unconverted)
 */
var doQuotes = function ( tokens ) {
    var italics = [],   // indexes into out of '' placeholders on this line
        bolds = [],     // indexes into out of ''' placeholders on this line
        out = [],
        inserted = 0;   // out.length surplus relative to the input index

    // Turn the i-th bold placeholder into an italic one, moving its first
    // tick to the end of the preceding text token.
    var convertBold = function ( i ) {
        var index = bolds[i];
        var txt = out[index - 1];
        txt.value += "'";
        // remove exactly this entry and keep all the following ones
        bolds.splice(i, 1);
        italics.push(index);
        // numeric order; the default sort would compare as strings
        italics.sort(function (a, b) { return a - b; });
    };

    // convert italics/bolds into tags
    var quotesToTags = function ( offsets, name ) {
        var toggle = true;
        for (var j = 0; j < offsets.length; j++) {
            var t = out[offsets[j]];
            t.type = toggle ? 'TAG' : 'ENDTAG';
            t.name = name;
            delete t.value;
            toggle = !toggle;
        }
        if (!toggle) {
            // add end tag
            out.push({type: 'ENDTAG', name: name});
            inserted++;
        }
    };

    // Emit surplus ticks as text: appended to a directly preceding text
    // token if there is one, else as a new TEXT token.
    var pushLeadingTicks = function ( i, ticks ) {
        if (i > 0 && tokens[i - 1].type === 'TEXT') {
            tokens[i - 1].value += ticks;
        } else {
            out.push({type: 'TEXT', value: ticks});
            inserted++;
        }
    };

    // Five ticks: an italic placeholder followed by a bold placeholder.
    // Order does not matter here, will be fixed by HTML parser backend.
    var pushItalicBold = function ( i ) {
        italics.push(i + inserted);
        out.push({type: 'QUOTE', value: "''"});
        inserted++;
        bolds.push(i + inserted);
        out.push({type: 'QUOTE', value: "'''"});
    };

    for (var i = 0, length = tokens.length; i < length; i++) {
        var token = tokens[i];
        switch (token.type) {
            case 'QUOTE':
                // depending on length, add starting 's to preceding text node
                // (if any)
                // add token index to italic/bold lists
                // add placeholder for token
                var qlen = token.value.length;
                switch (qlen) {
                    case 2:
                        italics.push(i + inserted);
                        out.push(token);
                        break;
                    case 3:
                        bolds.push(i + inserted);
                        out.push(token);
                        break;
                    case 4:
                        token.value = "'''";
                        pushLeadingTicks(i, "'");
                        bolds.push(i + inserted);
                        out.push(token);
                        break;
                    case 5:
                        pushItalicBold(i);
                        break;
                    default: // longer than 5, only use the last 5 ticks
                        // take the surplus from the original value
                        pushLeadingTicks(i, token.value.substr(0, qlen - 5));
                        pushItalicBold(i);
                        break;
                }
                break;
            case 'NEWLINE':
                // balance out tokens, convert placeholders into tags
                if (italics.length % 2 && bolds.length % 2) {
                    dp("balancing!");
                    var firstsingleletterword = -1,
                        firstmultiletterword = -1,
                        firstspace = -1;
                    for (var j = 0; j < bolds.length; j++) {
                        var ticki = bolds[j];
                        if (ticki > 0 && out[ticki - 1].type === 'TEXT') {
                            var txt = out[ticki - 1],
                                lastchar = txt.value[txt.value.length - 1],
                                secondtolastchar = txt.value[txt.value.length - 2];
                            dp('txt: ' + pp(txt));
                            if (lastchar === ' ') {
                                if (firstspace === -1) {
                                    firstspace = j;
                                }
                            } else if (secondtolastchar === ' ') {
                                if (firstsingleletterword === -1) {
                                    firstsingleletterword = j;
                                }
                            } else if (secondtolastchar &&
                                    firstmultiletterword === -1) {
                                // keep the first candidate, like the others
                                firstmultiletterword = j;
                            }
                        }
                    }
                    // now see if we can convert a bold to an italic and
                    // an apostrophe
                    if (firstsingleletterword > -1) {
                        convertBold(firstsingleletterword);
                    } else if (firstmultiletterword > -1) {
                        convertBold(firstmultiletterword);
                    } else if (firstspace > -1) {
                        convertBold(firstspace);
                    }
                }
                quotesToTags(bolds, 'b');
                quotesToTags(italics, 'i');
                bolds = [];
                italics = [];
                out.push(token);
                break;
            default:
                out.push(token);
        }
    }
    return out;
};
/* End static utilities */
/*
 * Flags for specific parse environments (inside tables, links etc). Flags
 * trigger syntactic stops in the inline_breaks production, which
 * terminates inline and attribute matches. Flags merely reduce the number
 * of productions needed: The grammar is still context-free as the
 * productions can just be unrolled for all combinations of environments
 * at the cost of a much larger grammar.
 *
 * Each flag is a nesting counter: setFlag increments, clearFlag decrements,
 * and a flag is active while its counter is truthy.
 */
var syntaxFlags = {};

// Increment the counter for flag. Always returns true so that it can be
// used directly as the result of a semantic predicate.
var setFlag = function(flag) {
    // `|| 0` also recovers from an unset (or otherwise non-numeric) counter
    syntaxFlags[flag] = (syntaxFlags[flag] || 0) + 1;
    return true;
};

// Decrement the counter for flag. A flag that is not set stays unset: the
// counter never becomes NaN (undefined--) or negative, both of which would
// otherwise break later setFlag calls or read as "flag active".
var clearFlag = function(flag) {
    if (syntaxFlags[flag] > 0) {
        syntaxFlags[flag]--;
    }
};
// Start position of top-level block
// Could also provide positions for lower-level blocks using a stack.
// Written by the toplevelblock production before each block and used there
// to build the block's data-sourcePos attribute.
var blockStart = 0;
// cache the input length
// (`input` is not defined in this file; presumably the parser generator
// provides it to this initializer -- confirm against the PEG.js version used)
var inputLength = input.length;
// pseudo-production that matches at end of input
// @param {Number} pos Current parse position
// @returns {Boolean} true if pos equals the cached input length
var isEOF = function (pos) {
return pos === inputLength;
};
// text start position
// NOTE(review): in this file only the commented-out experimental `text`
// production assigns this; it may be unused.
var textStart = 0;
// hack to support numbered external links ([http://example.com]).
// XXX: Move to token stream transform after templates are expanded!
// Incremented by the extlink production for each link without a description.
var linkCount = 1;
// Define block-level tags in JS, so we can use toLowerCase to match tags
// case-independently. This would be quite ugly (and possibly slower) if
// done manually in the grammar.
//
// block_names maps each lower-case block-level tag name to true.
var block_names = (function () {
    var lookup = {};
    [ "p", "table", "td", "tr", "ul", "ol",
      "li", "dl", "dt", "dd", "div", "center",
      "blockquote" ].forEach(function (tagName) {
        lookup[tagName] = true;
    });
    return lookup;
})();
}
// Grammar entry point: any number of top-level blocks followed by optional
// trailing newlines. Returns the blocks' tokens as one flat list.
start
= e:toplevelblock* newline* {
return flatten(e);
}
/* All chars that cannot start syntactic structures in the middle of a line
* XXX: ] and other end delimiters should probably only be activated inside
* structures to avoid unnecessarily leaving the text production on plain
* content. */
text_char = [^'<~[{\n\r:\]}|!=]
// A run of one or more text_chars, returned as a single string
text = t:text_char+ { return t.join(''); }
/* Explanation of chars
* ' quotes (italic/bold)
* < start of xmlish_tag
* ~ signatures/dates
* [ start of links
* { start of parser functions, transclusion and template args
* \n all sort of block-level markup at start of line
* \r ditto
* h http(s) urls
* n nntp(s) urls
* m mailto urls
*
* ! and | table cell delimiters, might be better to specialize those
* = headings - also specialize those!
*
* The following chars are also included for now, but only apply in some
* contexts and should probably be enabled only in those:
* : separate definition in ; term : definition
* ] end of link
* } end of parser func/transclusion/template arg
*/
// Plain text that may contain bare URLs. The first character class is
// text_char minus space, '&' and the first letters of the protocols listed in
// url_protocol (f g h i m n s t w), so that urllink is tried at positions
// where a URL could start; text_char is the single-char fallback.
// Returns an array of strings and urllink token lists.
urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
// XXX: use general entity decode!
/ "&amp;" { return "&"; } // decode ampersand in text
/ urllink
// Convert trailing space into &nbsp;
// XXX: This should be moved to a serializer
/ ' ' & ':' { return "\u00a0"; }
/ t:text_char )+
/*
'//', // for protocol-relative URLs, but not in text!
'ftp://',
'git://',
'gopher://',
'http://',
'https://',
'irc://',
'ircs://', // @bug 28503
'mailto:',
'mms://',
'news:',
'nntp://', // @bug 3808 RFC 1738
'svn://',
'telnet://', // Well if we're going to support the above.. -ævar
'worldwind://',
*/
// Old version
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
// Experimental tweaked version: avoid expensive single-char substrings
// This did not bring the expected performance boost, however.
//text = [A-Za-z0-9,._ -] {
// textStart = pos;
//
// var res = input.substr(textStart - 1, inputLength)
// .match(/[A-Za-z0-9,._ -]+/)[0];
// pos = pos + (res.length - 1);
// return res
// }
// One or more spaces or tabs, returned as a single string
space
= s:[ \t]+ { return s.join(''); }
// Start of line
// Matches a newline or the very beginning of the input (pos === 0), followed
// by any number of comments, each optionally followed by a newline.
// Returns a NEWLINE token followed by one [comment, TEXT] pair per comment.
sol = (newline / & { return pos === 0; } { return true; })
cn:(c:comment n:newline? { return [c, {type: 'TEXT', value: n}] })* {
return [{type: 'NEWLINE'}].concat(cn);
}
// End of input: succeeds without consuming anything when isEOF(pos) holds
eof = & { return isEOF(pos); } { return true; }
// A Unix or Windows line break
newline
= '\n' / '\r\n'
// End of line or end of file
eolf = newline / eof
// A block at the top level of the document. Remembers where the block starts,
// flattens its tokens, records the source range ("start:end") as a
// data-sourcePos attribute on the first token, appends a NEWLINE token and
// runs the italic/bold conversion (doQuotes) over the result.
toplevelblock
  = & { blockStart = pos; return true; } b:block {
        var blockTokens = flatten(b),
            firstToken = blockTokens[0];
        //dp('toplevelblock:' + pp(b));
        if (firstToken.attribs === undefined) {
            firstToken.attribs = [];
        }
        firstToken.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
        // XXX: only run this for lines that actually need it!
        blockTokens.push({type: 'NEWLINE'});
        return doQuotes(blockTokens);
    }
// A single block. Alternatives are tried in order; the last one matches a
// bare start of line (an empty line).
//
// `pre` used to be listed twice; the second occurrence could never match
// where the first one had already failed at the same position, so it is
// dropped. sol always returns a non-empty token array, so the old `else`
// branch of the last action was unreachable and is dropped as well.
block
  = block_lines
  / pre
  / comment &eolf
  / nowiki
  / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
  / para
  / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
  / s:sol { return [s, {type: 'NEWLINE'}]; }
// A block that starts with start-of-line wiki syntax: a start of line,
// optionally one more line consisting only of spaces, then a block_line.
// Returns the tokens of all three parts as one concatenated array.
block_lines
= s:sol
// eat an empty line before the block
s2:(ss:space* so:sol { return [{type: 'TEXT', value: ss.join('')}].concat(so) })?
bl:block_line {
// the optional part is compared against '' to detect "not matched"
var s2_ = (s2 !== '') ? s2 : [];
return s.concat(s2_, bl);
}
// Block structures with start-of-line wiki syntax
// Alternatives are tried in the listed order: headings, tables, lists, a
// line holding only a block-level tag, space-indented preformatted text and
// finally an explicit <pre> element.
block_line
= h
/ table
/ lists
// tag-only lines should not trigger pre
/ space* bt:block_tag space* &eolf { return bt }
/ pre_indent
/ pre
// TODO: convert inline content to annotations!
// Paragraph: two consecutive starts of line (i.e. an empty line) followed by
// a line of inline content. Emits the tokens of both line starts, an opening
// 'p' tag and the inline content; no closing tag is emitted here.
para
= s1:sol s2:sol c:inlineline {
return s1.concat(s2, [{type: 'TAG', name: 'p'}], c);
}
// Optional spaces directly before a newline (which is not consumed); yields
// a self-closing 'br' tag token.
// NOTE(review): no other rule visible in this file references `br`.
br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
// Syntax stops to limit inline expansion depending on syntactic context
//
// Succeeds (returning true) at positions where inline content has to end
// because of the currently set syntaxFlags:
//  - table:    newline + '!' or '|', '||', '!!' or '|}'
//  - colon:    ':' unless inside an external link or a link description
//  - extlink:  ']'
//  - linkdesc: ']]'
//  - h + hN:   the closing run of N '=' followed by spaces and a newline
//
// The colon stop used to test the misspelled flag `linkdesk`, which is never
// set, so colons inside link descriptions were treated as stops.
inline_breaks
  =
    & { // Important hack: disable caching for this production, as the default
        // cache key does not take into account flag states!
        cacheKey = '';
        return true;
    }
    & { return syntaxFlags['table']; }
    a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a) + pos); return true; }
  / & { return (syntaxFlags['colon'] &&
                ! syntaxFlags.extlink &&
                ! syntaxFlags.linkdesc); } ":" { return true; }
  / & { return syntaxFlags['extlink']; } "]" { return true; }
  / & { return syntaxFlags['linkdesc']; } link_end { return true; }
  / & { return syntaxFlags['h']; }
    ( & { return syntaxFlags['h1'] } '=' space* newline { return true; }
    / & { return syntaxFlags['h2'] } '==' space* newline { return true; }
    / & { return syntaxFlags['h3'] } '===' space* newline { return true; }
    / & { return syntaxFlags['h4'] } '====' space* newline { return true; }
    / & { return syntaxFlags['h5'] } '=====' space* newline { return true; }
    / & { return syntaxFlags['h6'] } '======' space* newline { return true; }
    )
// Inline content that may span several lines: text, inline elements and any
// other single character that is not a syntactic stop (see inline_breaks).
// Consecutive strings are merged into one TEXT token; all other tokens are
// passed through in order.
//
// Non-string tokens used to be handed to out.concat(), whose result was
// discarded, so every inline element was silently dropped from the output.
inline
  = c:(urltext / inline_element / (!inline_breaks ch:. { return ch; }))+ {
        var out = [];
        var text = [];
        c = flatten(c);
        for (var i = 0; i < c.length; i++) {
            if (typeof c[i] == 'string') {
                text.push(c[i]);
            } else {
                // flush pending text before the token to keep the order
                if (text.length) {
                    out.push({ type: "TEXT", value: text.join('') });
                    text = [];
                }
                out.push(c[i]);
            }
        }
        if (text.length) {
            out.push({ type: 'TEXT', value: text.join('') });
        }
        return out;
    }
// Inline content restricted to a single line: text, inline elements and any
// other non-newline character that is not a syntactic stop (see
// inline_breaks). Consecutive strings are merged into one TEXT token; all
// other tokens are passed through in order.
//
// The debug call used to pass the token dump as a second argument, which dp
// ignores; it is now concatenated like in the other dp calls.
inlineline
  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
        var out = [];
        var text = [];
        c = flatten(c);
        for (var i = 0; i < c.length; i++) {
            if (typeof c[i] == 'string') {
                text.push(c[i]);
            } else {
                // flush pending text before the token to keep the order
                if (text.length) {
                    out.push({type: 'TEXT', value: text.join('')});
                    text = [];
                }
                out.push(c[i]);
            }
        }
        if (text.length) {
            out.push({type: 'TEXT', value: text.join('')});
        }
        dp('inlineline out:' + pp(out));
        return out;
    }
/* TODO: convert all these to annotations!
* -> need (start, end) offsets within block
*/
// Any non-text inline construct: comment, XML-like tag, template, internal
// link, external link or quote run. The leading predicate only emits a debug
// message and always succeeds; because of operator precedence it belongs to
// the first alternative (comment) only.
inline_element
= & { dp('inline_element enter' + input.substr(pos, 10)); return true; }
comment
// Can actually also be block-level elements, we don't really try to enforce
// a content model in the tokenizer. The HTML tree builder and DOM
// transformations are better equipped to deal with it.
/ xmlish_tag
/ template
/ wikilink
/ extlink
/ quote
/* Headings
*
* Listed in reverse order on purpose ;)
* (with ordered choice the deepest level is tried first; presumably so that
* h1's single '=' does not claim the start of a longer marker) */
h = h6 / h5 / h4 / h3 / h2 / h1
/* We might want to consider using a single rule for all headings, and
* figuring out the level in the action. This saves quite some backtracking,
* and the conversion of equal signs into text should not be a problem as
* equals are not part of other syntax. */
// Level 1 heading: '=' content '=' followed by optional spaces / comments up
// to the end of the line or input. The 'h' and 'h1' flags are set while the
// content is parsed so that inline_breaks stops at the closing '='; they are
// cleared again on success and, via the second alternative, on failure.
h1 = '='
r:(
& { setFlag('h'); return setFlag('h1') }
c:inlineline '=' (space / comment)*
&eolf
{
clearFlag('h');
clearFlag('h1');
return [{type: 'TAG', name: 'h1'}]
.concat(c, [{type: 'ENDTAG', name: 'h1'}]);
}
// failure path: undo the flags, then fail the match
/ & { dp('nomatch exit h1'); clearFlag('h'); clearFlag('h1'); return false } { return null }
) { return r }
// Level 2 heading: '==' content '==' followed by optional spaces / comments
// and a newline (not consumed). The 'h' and 'h2' flags are set while the
// content is parsed and cleared again on success and on failure.
h2 = '=='
r:(
& { setFlag('h'); return setFlag('h2') }
c:inlineline '==' (space / comment)* &newline {
clearFlag('h');
clearFlag('h2');
return [{type: 'TAG', name: 'h2'}]
.concat(c, [{type: 'ENDTAG', name: 'h2'}]);
}
// failure path: undo the flags, then fail the match
/ & { clearFlag('h'); clearFlag('h2'); return false }
) { return r }
// Level 3 heading: '===' content '===' followed by optional spaces / comments
// and a newline (not consumed). The 'h' and 'h3' flags are set while the
// content is parsed and cleared again on success and on failure.
//
// Trailing spaces after the closing marker are accepted like in h1 / h2;
// inline_breaks already allows them, so rejecting them here made such lines
// fall through to a lower heading level.
h3 = '==='
    r:(
        & { setFlag('h'); return setFlag('h3') }
        c:inlineline '===' (space / comment)* &newline {
            clearFlag('h');
            clearFlag('h3');
            return [{type: 'TAG', name: 'h3'}]
                .concat(c, [{type: 'ENDTAG', name: 'h3'}]);
        }
        // failure path: undo the flags, then fail the match
      / & { clearFlag('h'); clearFlag('h3'); return false }
    ) { return r }
// Level 4 heading: '====' content '====' followed by optional spaces /
// comments and a newline (not consumed). The 'h' and 'h4' flags are set while
// the content is parsed and cleared again on success and on failure.
//
// Trailing spaces after the closing marker are accepted like in h1 / h2;
// inline_breaks already allows them, so rejecting them here made such lines
// fall through to a lower heading level.
h4 = '===='
    r:(
        & { setFlag('h'); return setFlag('h4') }
        c:inlineline '====' (space / comment)* &newline {
            clearFlag('h');
            clearFlag('h4');
            return [{type: 'TAG', name: 'h4'}]
                .concat(c, [{type: 'ENDTAG', name: 'h4'}]);
        }
        // failure path: undo the flags, then fail the match
      / & { clearFlag('h'); clearFlag('h4'); return false }
    ) { return r }
// Level 5 heading: '=====' content '=====' followed by optional spaces /
// comments and a newline (not consumed). The 'h' and 'h5' flags are set while
// the content is parsed and cleared again on success and on failure.
//
// Trailing spaces after the closing marker are accepted like in h1 / h2;
// inline_breaks already allows them, so rejecting them here made such lines
// fall through to a lower heading level.
h5 = '====='
    r:(
        & { setFlag('h'); return setFlag('h5') }
        c:inlineline '=====' (space / comment)* &newline {
            clearFlag('h');
            clearFlag('h5');
            return [{type: 'TAG', name: 'h5'}]
                .concat(c, [{type: 'ENDTAG', name: 'h5'}]);
        }
        // failure path: undo the flags, then fail the match
      / & { clearFlag('h'); clearFlag('h5'); return false }
    ) { return r }
// Level 6 heading: '======' content '======' followed by optional spaces /
// comments and a newline (not consumed). The 'h' and 'h6' flags are set while
// the content is parsed and cleared again on success and on failure.
//
// Trailing spaces after the closing marker are accepted like in h1 / h2;
// inline_breaks already allows them, so rejecting them here made such lines
// fall through to a lower heading level.
h6 = '======'
    r:(
        & { setFlag('h'); return setFlag('h6') }
        c:inlineline '======' (space / comment)* &newline {
            clearFlag('h');
            clearFlag('h6');
            return [{type: 'TAG', name: 'h6'}]
                .concat(c, [{type: 'ENDTAG', name: 'h6'}]);
        }
        // failure path: undo the flags, then fail the match
      / & { clearFlag('h'); clearFlag('h6'); return false }
    ) { return r }
// Preformatted text written as consecutive lines that start with whitespace.
// Wraps the first line and all following (sol, line) pairs in 'pre' start and
// end tags.
pre_indent
= l:pre_indent_line ls:(sol pre_indent_line)* {
return [{type: 'TAG', name: 'pre'}]
.concat( [l], ls
, [{type: 'ENDTAG', name: 'pre'}]);
}
// One line of space-indented preformatted text. The leading whitespace is
// dropped; the result is a "\n" TEXT token followed by the line's tokens.
pre_indent_line = space l:inlineline {
return [{type: 'TEXT', value: '\n'}].concat(l);
}
// HTML comment. An unterminated comment extends to the end of the input.
// Further comments that follow after only spaces and a single newline are
// consumed as well and appended to the result.
// Returns a list starting with a COMMENT token for the first comment.
comment
= '<!--' c:comment_chars* ('-->' / eof)
cs:(space* newline space* cn:comment { return cn })* {
return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
}
// One character of comment content: anything but '-', or a '-' that does not
// start the '-->' terminator.
comment_chars
= c:[^-] { return c; }
/ c:'-' !'->' { return c; }
// A bare URL in text. Produces an 'a' start tag with the URL as href, a TEXT
// token showing the URL and the matching end tag.
urllink
= target:url {
return [ { type: 'TAG',
name: 'a',
attribs: [['href', target]] }
, {type: 'TEXT', value: target}
, {type: 'ENDTAG', name: 'a'}
];
}
// External link: "[" url, optional spaces, optional link text, "]".
// The 'extlink' flag is set while the link is parsed so that inline_breaks
// stops the link text at "]". A link without text is shown as a running
// number ("[1]", "[2]", ...) taken from linkCount.
extlink
= "["
& { return setFlag('extlink'); }
target:url
space*
text:inlineline?
"]" {
clearFlag('extlink');
// the optional link text is compared against '' to detect "not matched"
if ( text == '' ) {
// XXX: Link numbering should be implemented in post-processor.
text = [{type: 'TEXT', value: "[" + linkCount + "]"}];
linkCount++;
}
return [ { type: 'TAG',
name: 'a',
attribs: [['href', target]] } ]
.concat( text
, [{type: 'ENDTAG', name: 'a'}]);
}
// failure path: undo the flag set by the first alternative, then fail
/ "[" & { clearFlag('extlink'); return false; }
/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
* be configured dynamically. */
// Matches one of the listed protocol prefixes literally (case-sensitive).
url_protocol
= '//' // for protocol-relative URLs
/ 'ftp://'
/ 'git://'
/ 'gopher://'
/ 'http://'
/ 'https://'
/ 'irc://'
/ 'ircs://' // @bug 28503
/ 'mailto:'
/ 'mms://'
/ 'news:'
/ 'nntp://' // @bug 3808 RFC 1738
/ 'svn://'
/ 'telnet://' // Well if we're going to support the above.. -ævar
/ 'worldwind://'
// javascript does not support unicode features..
// so the separator space characters are listed explicitly. The same code
// points are also excluded inline in the url production.
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
// A URL: a protocol followed by one or more URL characters. '.', ':' and ','
// are only part of the URL if they are not followed by a space or the end of
// the line, and "&amp;" is decoded to "&". Returns the URL as a string.
url
= proto:url_protocol
rest:( [^ :\]\[\n"'<>\x00-\x20\x7f,.&\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]
/ s:[.:,] !(space / eolf) { return s }
// XXX: use general entity decode!
/ '&amp;' { return '&' }
/ '&' )+
{
return proto + rest.join('');
}
//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
// Template transclusion: "{{" target, any number of "|"-separated
// parameters, "}}". Produces a 'template' start tag whose attribs hold the
// target followed by the parameters, plus the matching end tag.
template
  = "{{" target:template_target params:("|" p:template_param { return p })* "}}" {
        var attribs = [['target', target]];
        if (params && params.length) {
            attribs = attribs.concat(params);
        }
        // Should actually use a self-closing tag here, but the Node HTML5
        // parser only recognizes known self-closing tags for now, so use an
        // explicit end tag for now.
        //console.log(pp(obj));
        return [
            { type: 'TAG', name: 'template', attribs: attribs },
            { type: 'ENDTAG', name: 'template' }
        ];
    }
// Template name: all characters up to the next "|", newline or "}}",
// returned as a string (possibly empty).
template_target
= h:( !"}}" x:([^|\n]) { return x } )* { return h.join(''); }
/* XXX: pass these as convenient js structures to later stages, but serialize
* (to json) before passing remaining args to the tree builder */
// One template parameter as an attribute pair: named parameters become
// ['data-k-' + name, value], positional ones ['data-p-', value].
template_param
= name:template_param_name "=" c:template_param_text {
return ['data-k-' + name, c];
} / c:template_param_text {
return ['data-p-', c];
}
// Template argument: "{{{" name, any number of "|"-separated parameters,
// "}}}". Produces a single self-closing 'templatearg' tag whose attribs hold
// the argument name followed by the parameters.
// NOTE(review): no other rule visible in this file references `tplarg`.
tplarg
= "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
var obj = {
type: 'SELFCLOSINGTAG',
name: 'templatearg',
attribs: [['argname', name]]
};
if (params && params.length) {
obj.attribs = obj.attribs.concat(params);
}
return obj;
}
// Parameter name: all characters up to the next "=", "|", newline or "}}",
// returned as a string (possibly empty).
template_param_name
= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }
// Parameter value: any number of chunks, returned as the JSON serialization
// of the chunk list.
template_param_text
= tcs:template_param_text_chunk* { return JSON.stringify(tcs); }
//= h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }
// One piece of a parameter value: an inline construct, a run of plain
// characters (no "}", "|" or newline) as a TEXT token, or a single character
// that does not start "}}" (e.g. a lone "}") as a TEXT token.
template_param_text_chunk
= comment
/ xmlish_tag
/ extlink
/ template
/ wikilink
/ quote
/ c:[^}|\n]+ {return {type: 'TEXT', value: c.join('')}}
/ !"}}" x:([^|\n]) { return {type: 'TEXT', value: x} }
// Internal link: "[[" target, optional "|"-separated link text, "]]" and an
// optional run of directly following text (the link suffix).
// Produces an 'a' start tag (data-type internal, href = target), the link
// content and the matching end tag.
//
// Without link text the content is the target plus the suffix. With link
// text the suffix is appended after the text; it used to be consumed by the
// rule but left out of the result, which lost the text following the link.
// NOTE(review): the suffix is whatever `text` matches, not only letters.
wikilink
  = "[[" target:link_target text:("|" lt:link_text { return lt })* "]]" suffix:text? {
        var obj = {
            type: 'TAG',
            name: 'a',
            attribs: [['data-type', 'internal'], ['href', target]]
        };
        var textTokens;
        if (text && text.length) {
            textTokens = text; // XXX
            if (suffix !== '') {
                textTokens = textTokens.concat([{type: 'TEXT', value: suffix}]);
            }
        } else {
            // the href keeps the bare target, only the shown text is extended
            if (suffix !== '') {
                target += suffix;
            }
            textTokens = [{type: 'TEXT', value: target}];
        }
        return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
    }
// Link target: all characters up to the next "|", newline or "]]", returned
// as a string (possibly empty).
link_target
= h:( !"]]" x:([^|\n]) { return x } )* { return h.join(''); }
// Description part of an internal link. Returns the list of matched
// inlineline results (empty if there is no description text).
//
// The 'linkdesc' flag makes inline_breaks stop at "]]". It is set exactly
// once before and cleared exactly once after the content. The flag used to be
// set inside the repetition, i.e. also for the final iteration that fails to
// match, but was cleared only once, so every described link left the counter
// one too high. As `inlineline*` cannot fail, no separate failure path is
// needed to clear the flag.
link_text
  = & { return setFlag('linkdesc'); }
    h:inlineline*
    {
        clearFlag('linkdesc');
        return h;
    }
// Closing delimiter of an internal link; used by inline_breaks as the stop
// while the 'linkdesc' flag is set.
link_end = "]]"
/* Generic quote production for italic and bold, further processed in a token
* stream transformation in doQuotes. Relies on NEWLINE tokens being emitted
* for each line of text to balance quotes per line.
*
* We are not using a simple pair rule here as we need to support mis-nested
* bolds/italics and MediaWiki's special heuristics for apostrophes, which are
* all not context free. */
// Matches a run of two or more apostrophes and returns it as one QUOTE token
// whose value is the complete run.
quote = "''" x:"'"* {
return {
type : 'QUOTE',
value: "''" + x.join('')
}
}
/* Will need to check anything xmlish against known/allowed HTML tags and
* registered extensions, otherwise fail the match. Should ref be treated as a
* regular extension? */
// The specific rules (nowiki, ref, references) are tried before generic_tag.
xmlish_tag = nowiki / ref / references / generic_tag
// Explicit <pre> element. The content consists of plain text, nowiki sections
// and single characters up to "</pre>" or the end of the input; it is wrapped
// in 'pre' start / end tags, the start tag carrying the parsed attributes.
// A stray "</pre>" is returned as plain text (second alternative).
pre
= "<pre"
attribs:generic_attribute*
">"
ts:(t1:[^<]+ { return {type:'TEXT',value:t1.join('')} }
/ nowiki
/ !"</pre>" t2:. {return {type:'TEXT',value:t2}})+
("</pre>" / eof) {
// return nowiki tags as well?
//console.log('inpre');
return [ {type: 'TAG', name: 'pre', attribs: attribs} ]
.concat(ts, [{type: 'ENDTAG', name: 'pre'}]);
}
/ "</pre>" { return {type: 'TEXT', value: "</pre>"}; }
// <nowiki> section: returns the enclosed content as a single TEXT token (the
// nowiki tags themselves are dropped). An unclosed "<nowiki>" or a stray
// "</nowiki>" is returned as literal text.
nowiki
= "<nowiki>" nc:nowiki_content "</nowiki>" {
// console.log(pp(nc));
return nc;
}
/ "<nowiki>" {
//console.log('nowiki fallback');
return [{type: 'TEXT', value: '<nowiki>'}];
}
/ "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; }
// Content of a nowiki section: text without "<", complete nested
// <pre ...>...</pre> elements (kept verbatim, including their tags) and any
// other single character that does not start "</nowiki>" or "</pre>".
// Returns a list holding one TEXT token with the concatenated content.
nowiki_content
= ts:( t:[^<]+ { return t.join('') }
/ "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
//console.log('nested pre in nowiki');
// p2 is the single-token list returned by the recursive match
return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');
}
/ (!("</nowiki>" / "</pre>") c:. {return c})
)* {
// return nowiki tags as well?
return [{type: 'TEXT', value: ts.join('')}];
}
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
//
// A start, end or self-closing tag whose lower-cased name is listed in
// block_names. Returns a token with name, attribs and type; returns null (no
// match) for any other tag name.
block_tag
  = "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
    attribs:generic_attribute*
    selfclose:"/"?
    ">" {
        // abort match if tag is not block-level
        if (block_names[name.toLowerCase()] !== true) {
            return null;
        }
        var tokenType = 'TAG';
        if ( end != '' ) {
            tokenType = 'ENDTAG';
        } else if ( selfclose != '' ) {
            tokenType = 'SELFCLOSINGTAG';
        }
        return {name: name, attribs: attribs, type: tokenType};
    }
// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
//
// Any start, end or self-closing tag with an alphanumeric name. Returns a
// token with name, attribs and type.
generic_tag
  = "<" end:"/"? name:[0-9a-zA-Z]+
    attribs:generic_attribute*
    space*
    selfclose:"/"?
    ">" {
        var tokenType = 'TAG';
        if ( end != '' ) {
            tokenType = 'ENDTAG';
        } else if ( selfclose != '' ) {
            tokenType = 'SELFCLOSINGTAG';
        }
        return {name: name.join(''), attribs: attribs, type: tokenType};
    }
// One tag attribute with an optional value, preceded by optional spaces.
// Returns a [name, value] pair; the value is '' if the attribute has none.
generic_attribute
= s:space*
name:generic_attribute_name
value:(space*
generic_attribute_value)?
{
// value is '' if the optional part did not match, else [spaces, value]
if ( value !== '' ) {
return [name, value[1]];
} else {
return [name,''];
}
}
// http://dev.w3.org/html5/spec/Overview.html#attributes-0
// Attribute name: one or more characters other than space, tab, NUL, '/',
// quotes, '>' and '='; returned as a string.
generic_attribute_name
= n:[^ \t\0/"'>=]+ {
return n.join('');
}
// "=" followed by optional spaces and the attribute value; returns the value
// as produced by att_value.
generic_attribute_value
= "=" space* v:att_value {return v}
// Attribute value: unquoted, single-quoted or double-quoted. Always returns
// the value as a string (quoted values without their quotes).
//
// The unquoted form used to return [null, value] (the shape ext_param_val
// uses), so generic_attribute produced [name, [null, value]] for unquoted
// values but [name, value] for quoted ones. The duplicated "'" in the
// character class is dropped; the matched set is unchanged.
att_value
  = t:[^ \t'"<>=\n]+ { return t.join(''); }
  / "'" t:[^'>]+ "'" { return unquote("'", t.join('')); }
  / '"' t:[^">]+ '"' { return unquote('"', t.join('')); }
// <ref> extension tag, either with content or self-closed
ref = ref_full / ref_empty
/* Can we do backreferences to genericize this? */
// <ref ...>content</ref>. Produces an 'ext' start tag (data-extname = ref,
// followed by the tag's parameters and the whitespace before ">"), the
// content and the matching 'ext' end tag.
//
// The end tag used to be named 'ref', which did not match the 'ext' start
// tag (references_full closes with 'ext' as well).
ref_full
  = start:ref_start ">" content:ref_content* close:ref_end {
        return [
            { type: 'TAG',
              name: 'ext',
              attribs: [['data-extname', 'ref']]
                .concat(start.params, [['data-startws', start.ws]])},
            content,
            {type: 'ENDTAG', name: 'ext'}
        ];
    }
// Self-closed <ref ... />. Produces a single self-closing 'ext' tag
// (data-extname = ref, followed by the tag's parameters and the whitespace
// matched by ref_start).
ref_empty
= start:ref_start close:(space* "/>") {
return [{ type: 'SELFCLOSINGTAG',
name: 'ext',
attribs: [['data-extname', 'ref']]
.concat(start.params
,[['data-startws', start.ws]])
}];
}
// Start of a ref tag up to, but not including, ">" or "/>". Returns the
// parsed parameters and the list of trailing whitespace matches.
ref_start
= "<ref" params:ext_param* ws:space* {
return {
params: params,
ws: ws
};
}
// Closing ref tag, with optional spaces before ">". Returns the joined
// pieces of the match.
ref_end
= all:("</ref" space* ">") {
return all.join('');
}
// Inline content inside a ref. The closing tag is only checked at the start
// of each inline match, not within it (see the XXX note).
ref_content
= !ref_end a:inline { // XXX: ineffective syntactic stop
return a;
}
/* fixme probably have to programmatically add these */
// <references> extension tag, either with content or self-closed
references = references_full / references_empty
// <references ...>content</references>. Produces an 'ext' start tag
// (data-extname = references, followed by the tag's parameters and the
// whitespace before ">"), the content and the matching 'ext' end tag.
references_full
= start:references_start ">" content:references_content* close:references_end {
return [
{ type: 'TAG',
name: 'ext',
attribs: [['data-extname', 'references']]
.concat(start.params
,[['data-startws', start.ws]])
},
content,
{ type: 'ENDTAG', name: 'ext' }
];
}
// Self-closed <references ... />. Produces a single self-closing 'ext' tag
// (data-extname = references, followed by the tag's parameters and the
// whitespace matched by references_start).
//
// The returned array has to start on the same line as `return`: with a line
// break in between, automatic semicolon insertion turns the statement into
// `return;` and the rule yields undefined instead of the token.
references_empty
  = start:references_start close:(space* "/>") {
        return [{ type: 'SELFCLOSINGTAG',
                  name: 'ext',
                  attribs: [['data-extname', 'references']]
                    .concat(start.params
                            ,[['data-startws', start.ws]])
               }];
    }
// Start of a references tag up to, but not including, ">" or "/>". Returns
// the parsed parameters and the list of trailing whitespace matches.
references_start
= "<references" params:ext_param* ws:space* {
return {
params: params,
ws: ws
};
}
// Closing references tag, with optional spaces before ">". Returns the
// joined pieces of the match.
references_end
= all:("</references" space* ">") {
return all.join('');
}
// Inline content inside a references element; the closing tag is only
// checked at the start of each inline match.
references_content
= !references_end a:inline {
return a;
}
// One name=value parameter of an extension tag, preceded by optional spaces.
// ext_param_val returns [null, value]; the name is stored in the first slot
// so that the result is a [name, value] pair.
ext_param
= space* name:ext_param_name "=" val:ext_param_val {
val[0] = name;
return val;
}
// Parameter name: one or more letters, digits or '-', returned as a string
ext_param_name
= name:[a-zA-Z0-9-]+ {
return name.join('');
}
// Parameter value: unquoted alphanumerics, or a single- or double-quoted
// string. Returns [null, value]; ext_param fills in the name.
ext_param_val
= t:[0-9A-Za-z]+ { return [null, t.join('')]; }
/ "'" t:[^'>]+ "'" { return [null, unquote("'", t.join(''))]; }
/ '"' t:[^">]+ '"' { return [null, unquote('"', t.join(''))]; }
// A run of consecutive list lines. The items' tokens are flattened, wrapped
// in 'list' start / end tags and converted by annotateList into the actual
// list and list item tags.
lists = e:(dtdd / li) es:(sol (dtdd / li))*
{
return annotateList( [ { type: 'TAG', name: 'list'} ]
.concat(flatten([e].concat(es))
,[{ type: 'ENDTAG', name: 'list' }]));
}
// A list line: one or more bullet characters followed by optional inline
// content up to the end of the line or input. Returns a 'listItem' tag
// carrying the list of bullet characters, followed by the content tokens.
li = bullets:list_char+
c:inlineline?
&eolf
{
// missing content is compared against '' to detect "not matched"
if ( c == '' )
c = [];
return [ { type: 'TAG',
name: 'listItem',
bullets: bullets }
, c ];
}
// Definition term and definition on one line: [bullets] ";" term ":" def.
// Returns a 'listItem' tag with bullets + ";" followed by the term tokens,
// then a 'listItem' tag with bullets + ":" followed by the definition tokens.
//
// The 'colon' flag makes inline_breaks stop the term at ":".
//
// The leading bullets are collected as plain characters and joined. They
// used to be a list of ['', char] pairs from the unlabelled sequence, which
// string concatenation turned into values like ",*;" for nested items.
dtdd
  = bullets:(!(";" !list_char) lc:list_char { return lc })*
    ";"
    & {return setFlag('colon');}
    c:inlineline
    ":"
    // Fortunately dtdds cannot be nested, so we can simply set the flag
    // back to 0 to disable it.
    & {syntaxFlags['colon'] = 0; return true;}
    d:inlineline
    &eolf {
        // The trailing space of the term is converted into &nbsp; by the
        // urltext production.
        // XXX: This should be moved to a serializer
        var prefix = bullets.join('');
        return [ { type: 'TAG', name: 'listItem', bullets: prefix + ";" } ]
            .concat( c
                    ,[{ type: 'TAG', name: 'listItem', bullets: prefix + ":" } ]
                    , d );
    }
    // Fall-back case to clear the colon flag
  / & { return true; } { syntaxFlags['colon'] = 0; return null; }
// Bullet characters; annotateList maps them to ul (*), ol (#), dl/dt (;) and
// dl/dd (:).
list_char = [*#:;]
/* Tables */
// A complete table: start line, optional caption, optional body, end marker.
// Produces a 'table' start tag (with the raw attribute text as data-unparsed
// if there is any), the caption wrapped in 'caption' tags if present, the
// body tokens and the 'table' end tag.
table
  = tas:table_start c:table_caption? b:table_body? table_end {
        var tableTag = {type: 'TAG', name: 'table'},
            // a missing optional body is compared against ''
            rows = (b !== '') ? b : [],
            captionTokens = [];
        dp("body: " + pp(rows));
        if (tas.length > 0) {
            // FIXME: actually parse and build structure
            tableTag.attribs = [['data-unparsed', tas.join('')]];
        }
        if (c != '') {
            captionTokens = [{type: 'TAG', name: 'caption'}]
                .concat(c, [{type: 'ENDTAG', name: 'caption'}]);
        }
        //dp(pp(res));
        return [tableTag].concat(captionTokens, rows,
            [{type: 'ENDTAG', name: 'table'}]);
    }
// Table start marker "{|" followed by the raw table attributes. Sets the
// 'table' flag, which stays set until table_end clears it. Returns the list
// of attribute text pieces.
// NOTE(review): table_attribs* cannot fail, so the second alternative (which
// would clear the flag again) looks unreachable.
table_start
= "{|"
res:(
& { setFlag('table'); return true; }
ta:table_attribs*
{
dp("table_start " + pp(ta) + ", pos:" + pos);
return ta;
}
/ & { clearFlag('table'); return false; } { return null; }
) { return res }
// One piece of raw table attribute text: a text run, or any single character
// that is not a syntactic stop, a newline or "|".
table_attribs
= text / ! inline_breaks !newline ![|] c:. { return c }
// Table caption: a newline, "|+" and any number of inline matches. Returns
// the list of inline results.
table_caption
= newline
"|+" c:inline* {
return c;
}
// Table body: either an implicit first row (cells without a preceding "|-")
// followed by explicit rows, or explicit rows only. The leading predicate
// only emits a debug message. Returns the list of rows.
table_body
= & { dp("table_body enter"); return true; }
firstrow:table_firstrow otherrows:table_row* {
/* dp('table first and otherrows: '
* + pp([firstrow].concat(otherrows))); */
return [firstrow].concat(otherrows);
}
/ otherrows:table_row* {
//dp('table otherrows: ' + pp(otherrows));
return otherrows;
}
// Implicit first table row: one or more data cells without a "|-" row
// marker, wrapped in 'tr' start / end tags.
table_firstrow
= td:table_data+ {
return [{ type: 'TAG', name: 'tr' }]
.concat(td, [{type: 'ENDTAG', name: 'tr'}]);
}
// Explicit table row: a newline, "|-", optional attributes (matched but not
// used in the result), optional spaces and any number of data / header
// cells, wrapped in 'tr' start / end tags. The leading predicate only emits
// a debug message.
table_row
= & { dp("table row enter"); return true; }
newline
"|-" thtd_attribs? space* td:(table_data / table_header)* {
return [{type: 'TAG', name: 'tr'}]
.concat(td, [{type: 'ENDTAG', name: 'tr'}]);
}
// Table data cell, introduced by "||" or by a newline followed by "|" (but
// not "|}", "|+" or "|-"). Optional attributes are stored unparsed in
// data-unparsed; the content is any number of blocks, each only attempted
// where inline_breaks does not match. The predicates containing dp() calls
// only emit debug messages.
table_data
= & { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
("||" / newline "|")
! [}+-]
a:thtd_attribs?
// use inline_breaks to break on tr etc
td:(!inline_breaks
& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
b:block { return b })* {
dp("table data result: " + pp(td) + ", attribts: " + pp(a));
return [{ type: 'TAG', name: 'td', attribs: [['data-unparsed', a]]}]
.concat(td, [{type: 'ENDTAG', name: 'td'}]);
}
// Table header cell, introduced by "!!" or by a newline followed by "!".
// Optional attributes are stored unparsed in data-unparsed; the content is
// one inline match, wrapped in 'th' start / end tags.
table_header
= ("!!" / newline "!")
a:thtd_attribs?
c:inline {
return [{type: 'TAG', name: 'th', attribs: [['data-unparsed', a]]}]
.concat(c, [{type: 'ENDTAG', name: 'th'}]);
}
// Raw attribute text of a table cell or row: text runs and a few punctuation
// characters, terminated by a single "|" that is not followed by another "|".
// Returns the list of matched pieces (the terminating "|" is not included).
thtd_attribs
// In particular, do not match [|\n]
= a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ "|" ! "|" {
return a;
}
// Table end marker "|}", optionally preceded by a newline. Clears the
// 'table' flag set by table_start.
table_end = newline? "|}" { clearFlag('table'); }
/* Tabs do not mix well with the hybrid production syntax */
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent: */