mediawiki-extensions-Visual.../modules/parser/pegParser.pegjs.txt
2011-11-21 09:22:30 +00:00

780 lines
20 KiB
Plaintext

/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
{
// Debug print helper. Forwards msg to console.log only while the hard-coded
// switch below is flipped to true; otherwise it does nothing.
var dp = function ( msg ) {
    var enabled = false;
    if ( !enabled ) {
        return;
    }
    console.log(msg);
};
/*
 * Flags for specific parse environments (inside tables, links etc). Flags
 * trigger syntactic stops in the inline_breaks production, which
 * terminates inline and attribute matches. Flags merely reduce the number
 * of productions needed: The grammar is still context-free as the
 * productions can just be unrolled for all combinations of environments
 * at the cost of a much larger grammar.
 *
 * Each flag is stored as a nesting counter: setFlag increments it and
 * clearFlag decrements it, so a flag stays truthy until every enter has
 * been matched by a leave.
 */
var syntaxFlags = {};

// Enter the environment named by flag. Always returns true so that the call
// can be used directly as the result of a semantic predicate.
var setFlag = function(flag) {
    // Treat a missing (or otherwise unusable) counter as zero.
    syntaxFlags[flag] = (syntaxFlags[flag] || 0) + 1;
    return true;
};

// Leave the environment named by flag. Returns undefined.
var clearFlag = function(flag) {
    // Only decrement a positive counter. Decrementing a flag that was never
    // set used to store NaN (which every later setFlag kept as NaN), and an
    // unbalanced clear used to leave a negative, hence truthy, counter.
    if (syntaxFlags[flag] > 0) {
        syntaxFlags[flag]--;
    }
};
// Pretty-print a value as JSON indented by two spaces (used in debug output).
var pp = function ( s ) {
    var indentWidth = 2;
    var formatted = JSON.stringify(s, null, indentWidth);
    return formatted;
};
// Convert list prefixes to a list of WikiDom list styles.
// Each of the characters * # ; : maps to one style name; any other entry is
// skipped, so the result may be shorter than the input.
var bulletsToTypes = function (bullets) {
    // Style name for one prefix character, or null when it is not a bullet.
    var styleFor = function (ch) {
        switch (ch) {
            case '*': return 'bullet';
            case '#': return 'number';
            case ';': return 'term';
            case ':': return 'description';
            default: return null;
        }
    };
    var styles = [];
    for (var idx = 0; idx < bullets.length; idx += 1) {
        var style = styleFor(bullets[idx]);
        if (style !== null) {
            styles.push(style);
        }
    }
    return styles;
};
/*var extractInline = function ( node ) {
return { text: extractText(node, 0) };
};
// return [text [annotations]]
var extractText = function ( node, offset ) {
dp("extract: " + pp(node));
if (typeof node === 'string') {
return [node, []];
} else if ($.isArray(node)) {
var texts = [],
annotations = [];
for (var i = 0, length = node.length; i < length; i++) {
var res = extractText(node[i], offset);
texts.push(res[0]);
annotations.concat(res[1]);
offset += res[0].length;
}
return [texts.join(''), annotations];
} else if ( 'text' in node ) {
var res = extractText(node, offset);
if ('annotations' in node) {
return [res[0], node.annotations.concat(res[1])];
} else {
return res;
}
} else if ( 'content' in node ) {
return extractText(node.content, offset);
} else if ( 'children' in node ) {
var texts = [];
for (var i = 0, length = node.children.length; i < length; i++) {
texts.push(extractText(node.children[i]));
}
return texts.join('');
} else {
throw ("extract failed: " + pp(node));
}
};
*/
// Start position of top-level block
// Could also provide positions for lower-level blocks using a stack.
// Assigned by the toplevelblock rule before it parses a block and read back
// in that rule's action to build the 'data-sourcePos' attribute.
var blockStart = 0;
// Remove the backslash from every escaped quote character in text.
//   quotec: the quote character (for example ' or ")
//   text:   the string to unescape
// Returns text with each backslash-plus-quotec pair replaced by quotec.
var unquote = function (quotec, text) {
    // split/join replaces every occurrence; String.replace with a string
    // pattern would only replace the first one.
    return text.split('\\' + quotec).join(quotec);
};
// Recursively flatten nested arrays into a single new array, preserving the
// order of all non-array members. The input itself is not modified.
var flatten = function ( e ) {
    var flat = [];
    // Append the leaves of items to flat, descending into sub-arrays.
    var collect = function ( items ) {
        for (var idx = 0, count = items.length; idx < count; idx++) {
            var item = items[idx];
            if ($.isArray(item)) {
                collect(item);
            } else {
                flat.push(item);
            }
        }
    };
    collect(e);
    return flat;
};
}
// Entry rule: any number of top-level blocks followed by optional trailing
// newlines. Returns the block results flattened into one array.
start
= e:toplevelblock* newline* {
return flatten(e);
}
// Either a block or inline content. anyblock uses inline, anyblockline uses
// inlineline (which never consumes a newline character).
anyblock = block / inline
anyblockline = block / inlineline
// All chars that cannot start syntactic structures
// Returns the matched run of characters as one string.
text = t:[A-Za-z0-9,._ -]+ { return t.join('') }
// One or more spaces or tabs, returned as a single string.
space
= s:[ \t]+ { return s.join(''); }
// Start of line
// Succeeds right after consuming a newline, or at input position 0 without
// consuming anything. An optional comment (itself optionally followed by a
// newline) is then consumed as well. The rule's result is that optional
// comment match (cn), not the newline.
sol = (newline / & { return pos === 0; } { return true; })
cn:(c:comment n:newline? { return [c, n] })? {
return cn;
}
// Line terminator: LF or CRLF.
newline
= '\n' / '\r\n'
// A block at the top level of the document. The leading predicate records
// the start offset in blockStart; the action flattens the block's tokens and
// tags the first token object with a 'data-sourcePos' attribute of the form
// start:end. Returns the flattened token array.
toplevelblock
  = & { blockStart = pos; return true; } b:block {
        b = flatten(b);
        dp('toplevelblock:' + pp(b));
        // The flattened result can begin with plain strings (for example the
        // heading rules return the sol result and the marker literal ahead
        // of their TAG token). A string cannot carry attribs, so attach the
        // position to the first real token object instead of blindly using
        // the first element, which used to throw on such input.
        for (var i = 0; i < b.length; i++) {
            var bs = b[i];
            if (bs !== null && typeof bs === 'object') {
                if (bs.attribs === undefined) {
                    bs.attribs = [];
                }
                bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
                break;
            }
        }
        return b;
    }
// A block-level construct. Alternatives are tried in order:
//  1. a start-of-line structure (block_lines), optionally preceded by a line
//     that holds only whitespace;
//  2. a paragraph;
//  3. a comment;
//  4. a bare start of line, which yields a NEWLINE token (preceded by the
//     sol result when that result is truthy).
block
= (sol space* &newline)? bl:block_lines { return bl; }
/ para
/ comment
/ (s:sol {
if (s) {
return [s, {type: 'NEWLINE'}];
} else {
return [{type: 'NEWLINE'}];
}
}
)
// Block structures with start-of-line wiki syntax
// Tried in order: headings, tables, lists, indented preformatted text.
block_lines
= h
/ table
/ lists
/ pre_indent
/* Headings */
// Try the longest marker first. Every line that opens with several '=' also
// opens with a single '=', so with h1 listed first it claimed every balanced
// heading (the surplus markers ended up inside the heading content) and
// h2..h6 could never match. Shorter levels remain as fallbacks when a longer
// level rejects the line.
h = h6 / h5 / h4 / h3 / h2 / h1
// h1 .. h6: one rule per heading level, all with the same shape.
// After sol and the opening marker, the first alternative raises the 'h' and
// 'hN' flags (these make inline_breaks stop inlineline at the closing marker
// when it is followed by a newline), then requires the content, the closing
// marker, an optional comment and a newline lookahead. It lowers both flags
// and returns TAG/ENDTAG tokens named hN around the content.
// The second alternative lowers the flags as well and returns null.
// NOTE(review): returning null from an action presumably rejects the match
// in the PEG.js version this grammar targets - confirm.
h1 = sol '='
(
& { setFlag('h'); return setFlag('h1') }
c:inlineline '=' comment? &newline {
clearFlag('h');
clearFlag('h1');
return [{type: 'TAG', name: 'h1'}]
.concat(c, [{type: 'ENDTAG', name: 'h1'}]);
}
/ { clearFlag('h'); clearFlag('h1'); return null }
)
h2 = sol '=='
(
& { setFlag('h'); return setFlag('h2') }
c:inlineline '==' comment? &newline {
clearFlag('h');
clearFlag('h2');
return [{type: 'TAG', name: 'h2'}]
.concat(c, [{type: 'ENDTAG', name: 'h2'}]);
}
/ { clearFlag('h'); clearFlag('h2'); return null }
)
h3 = sol '==='
(
& { setFlag('h'); return setFlag('h3') }
c:inlineline '===' comment? &newline {
clearFlag('h');
clearFlag('h3');
return [{type: 'TAG', name: 'h3'}]
.concat(c, [{type: 'ENDTAG', name: 'h3'}]);
}
/ { clearFlag('h'); clearFlag('h3'); return null }
)
h4 = sol '===='
(
& { setFlag('h'); return setFlag('h4') }
c:inlineline '====' comment? &newline {
clearFlag('h');
clearFlag('h4');
return [{type: 'TAG', name: 'h4'}]
.concat(c, [{type: 'ENDTAG', name: 'h4'}]);
}
/ { clearFlag('h'); clearFlag('h4'); return null }
)
h5 = sol '====='
(& { setFlag('h'); return setFlag('h5') }
c:inlineline '=====' comment? &newline {
clearFlag('h');
clearFlag('h5');
return [{type: 'TAG', name: 'h5'}]
.concat(c, [{type: 'ENDTAG', name: 'h5'}]);
}
/ { clearFlag('h'); clearFlag('h5'); return null }
)
h6 = sol '======'
(& { setFlag('h'); return setFlag('h6') }
c:inlineline '======' comment? &newline {
clearFlag('h');
clearFlag('h6');
return [{type: 'TAG', name: 'h6'}]
.concat(c, [{type: 'ENDTAG', name: 'h6'}]);
}
/ { clearFlag('h'); clearFlag('h6'); return null }
)
// Generic heading helpers: a run of one or more '=' characters, and heading
// text made of inlineline pieces up to a marker that is followed by a
// newline. Neither rule is referenced by any other rule visible in this
// file.
heading_marker
= '=' '='*
heading_text
= h:( !(heading_marker newline) x:inlineline { return x } )* { return h.join(''); }
// TODO: convert inline content to annotations!
// A paragraph: an optional leading blank line (sol followed by br), then the
// paragraph lines. Only the para_lines result is returned; the br match is
// dropped.
para
= (sol br)? pl:para_lines { return pl; }
// One line of paragraph content followed by any further para_lines that do
// not start a block_lines structure. Returns the content wrapped in p
// TAG/ENDTAG tokens; a non-empty sol result is kept right after the
// opening tag.
para_lines
  = s:sol c:inlineline cs:(!block_lines para_lines)* {
        var openTag = {type: 'TAG', name: 'p'};
        var closeTag = {type: 'ENDTAG', name: 'p'};
        var head = (s !== '') ? [openTag, s] : [openTag];
        return head.concat(c, cs, [closeTag]);
    }
// Optional spaces up to (but not including) a newline; yields a
// self-closing br token.
br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }
// Indented preformatted text: one or more lines that start with whitespace.
// The line contents are wrapped in pre TAG/ENDTAG tokens.
pre_indent
= l:pre_indent_line+ {
return [{type: 'TAG', name: 'pre'}]
.concat( l
, [{type: 'ENDTAG', name: 'pre'}]);
}
// A single indented line; returns only the inline content after the leading
// whitespace.
pre_indent_line = sol space l:inlineline { return l }
// Syntax that stops inline expansion
// Every alternative is guarded by a predicate on a syntaxFlags counter, so a
// stop only applies while the corresponding environment is active:
//   table    - a newline followed by ! or |, or one of || !! |}
//   italic   - the italic marker
//   bold     - the bold marker
//   linkdesc - the link end
//   h        - the closing marker of the active heading level followed by a
//              newline
// inline and inlineline use this rule inside a negative lookahead.
inline_breaks
= //& { console.log(pp(syntaxFlags)); return true; }
& { return syntaxFlags['table']; }
a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a)); return true; }
/ & { return syntaxFlags['italic']; } italic_marker { return true; }
/ & { return syntaxFlags['bold']; } bold_marker { return true; }
/ & { return syntaxFlags['linkdesc']; } link_end { return true; }
/ & { return syntaxFlags['h']; }
( & { return syntaxFlags['h1'] } '=' newline { return true; }
/ & { return syntaxFlags['h2'] } '==' newline { return true; }
/ & { return syntaxFlags['h3'] } '===' newline { return true; }
/ & { return syntaxFlags['h4'] } '====' newline { return true; }
/ & { return syntaxFlags['h5'] } '=====' newline { return true; }
/ & { return syntaxFlags['h6'] } '======' newline { return true; }
)
// Inline content that may run across newlines: plain text, inline elements,
// or any single character that is not an active inline break.
// Returns a token array in which adjacent string pieces are merged into
// TEXT tokens and all other tokens are kept in order.
inline
  = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ {
        var out = [];
        var text = [];
        c = flatten(c);
        for (var i = 0; i < c.length; i++) {
            if (typeof c[i] == 'string') {
                text.push(c[i]);
            } else {
                if (text.length) {
                    out.push({ type: "TEXT", value: text.join('') });
                    text = [];
                }
                // push, not concat: concat returns a new array, so the
                // token was silently dropped before. This now matches the
                // inlineline rule.
                out.push(c[i]);
            }
        }
        if (text.length) {
            out.push({ type: 'TEXT', value: text.join('') });
        }
        return out;
    }
// Inline content restricted to a single line: plain text, or - unless an
// inline break is active - an inline element or any character other than a
// newline. Returns a token array in which adjacent string pieces are merged
// into TEXT tokens and all other tokens are kept in order.
inlineline
  = c:(text / !inline_breaks (inline_element / [^\n]))+ {
        var tokens = [];
        var pieces = [];
        // Emit the collected string pieces as one TEXT token. The check is
        // on the number of pieces, not on the joined length.
        var flushText = function () {
            if (pieces.length) {
                tokens.push({type: 'TEXT', value: pieces.join('')});
                pieces = [];
            }
        };
        var items = flatten(c);
        for (var k = 0; k < items.length; k++) {
            var item = items[k];
            if (typeof item == 'string') {
                pieces.push(item);
            } else {
                flushText();
                tokens.push(item);
            }
        }
        flushText();
        return tokens;
    }
/* TODO: convert all these to annotations!
* -> need (start, end) offsets within block
*/
// Structured inline constructs, tried in this order.
inline_element
= comment
/ xmlish_tag
/ extlink
/ template
/ link
/ bold
/ italic
// An HTML-style comment. Returns a one-element array holding a COMMENT
// token whose value is the text between the delimiters. Further comments
// separated from it only by spaces and a newline are consumed too, but only
// the first comment's text ends up in the result.
comment
= '<!--' c:comment_chars* '-->'
(space* newline space* comment)* {
return [{ type: 'COMMENT', value: c.join('') }];
}
// One character of comment content: anything but '-', or a '-' that is not
// immediately followed by '->'.
comment_chars
= c:[^-] { return c; }
/ c:'-' !'->' { return c; }
// Bracketed external link: '[', a url, one space, the link text, ']'.
// Returns an a TAG carrying the url as href, a TEXT token with the link
// text, and the matching ENDTAG.
extlink
= "[" target:url " " text:extlink_text "]" {
return [ { type: 'TAG',
name: 'a',
attribs: [['href', target]] }
, {type: 'TEXT', value: text}
, {type: 'ENDTAG', name: 'a'}];
}
// = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } }
// A url: the literal prefix http: followed by one or more characters other
// than a space or ']'. No other protocol prefix is accepted here.
url
= proto:"http:" rest:([^ \]]+) { return proto + rest.join(''); }
// External link text: everything up to the closing ']', as one string.
extlink_text
= c:[^\]]+ { return c.join(''); }
// Template transclusion: a target and any number of '|'-separated
// parameters between double curly braces. Returns a self-closing template
// token. Its attribs start with the target pair; when parameters exist, the
// whole parameter list is appended as one further entry.
template
  = "{{" target:link_target params:("|" p:template_param { return p })* "}}" {
        var attribs = [['target', target]];
        if (params && params.length) {
            attribs.push(params);
        }
        return { type: 'SELFCLOSINGTAG', name: 'template', attribs: attribs };
    }
// One template parameter. The named form (name, '=', value) yields
// [name, value]; the positional form yields [null, value].
template_param
= name:template_param_name "=" c:template_param_text {
return [name, c];
} / c:template_param_text {
return [null, c];
}
// Template argument reference: a name and any number of '|'-separated
// parameters between triple curly braces. Returns a self-closing
// templatearg token. Its attribs start with the argname pair; when
// parameters exist, the whole parameter list is appended as one further
// entry. No other rule visible in this file refers to tplarg.
tplarg
  = "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
        var attribs = [['argname', name]];
        if (params && params.length) {
            attribs.push(params);
        }
        return { type: 'SELFCLOSINGTAG', name: 'templatearg', attribs: attribs };
    }
// Parameter name: characters other than '=' and '|', stopping before the
// closing double curly braces. Returned as one (possibly empty) string.
template_param_name
= h:( !"}}" x:([^=|]) { return x } )* { return h.join(''); }
// Parameter value: zero or more chunks. There is no action, so the result
// is the array of chunk results.
template_param_text
= template_param_text_chunk*
/* = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }*/
// One chunk of a parameter value: any of the inline constructs listed
// below, or a single character other than '|' that does not begin the
// closing double curly braces.
template_param_text_chunk
= comment
/ xmlish_tag
/ extlink
/ template
/ link
/ bold
/ italic
/ !"}}" x:([^|]) { return x }
// Internal link: a target and optional '|'-introduced link text between
// double square brackets. Returns an a TAG (attribs data-type=internal and
// href=target), the text tokens, and the matching ENDTAG. Only the first
// text section is used; without any text section the target itself becomes
// a TEXT token.
link
  = "[[" target:link_target text:("|" link_text)* "]]" {
        var openTag = {
            type: 'TAG',
            name: 'a',
            attribs: [['data-type', 'internal'], ['href', target]]
        };
        var textTokens = (text && text.length)
            ? text[0][1] // XXX
            : [{type: 'TEXT', value: target}];
        return [openTag].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
    }
// Link or template target: characters other than '|', stopping before a
// closing ']]'. Returned as one (possibly empty) string.
link_target
= h:( !"]]" x:([^|]) { return x } )* { return h.join(''); }
// Link description: zero or more inlineline matches parsed while the
// 'linkdesc' flag is raised, so that inline_breaks stops them at the link
// end. Returns the array of inlineline results.
//
// The flag is raised exactly once, ahead of the repetition, and lowered
// exactly once afterwards. With the predicate inside the repeated group it
// also ran for the final, failing iteration, so every link with text left
// the 'linkdesc' counter one too high. The repetition cannot fail, so the
// former second alternative (which only lowered the flag) was unreachable
// and has been dropped.
link_text
  = & { return setFlag('linkdesc'); }
    h:inlineline*
    {
        clearFlag('linkdesc');
        return h;
    }
// Closing delimiter of an internal link; inline_breaks stops on it while
// the 'linkdesc' flag is set.
link_end = "]]"
// Bold text: marker, content, marker. The 'bold' flag is raised after the
// opening marker so that inline_breaks stops the content at the next bold
// marker. On success the flag is lowered and the content is returned
// between b TAG/ENDTAG tokens. The second alternative lowers the flag after
// a failed first attempt and returns null.
// NOTE(review): returning null from an action presumably rejects the match
// in the PEG.js version this grammar targets - confirm.
bold
= bold_marker
& { dp('benter:' + pos); return setFlag('bold'); }
c:inlineline
bold_marker {
clearFlag('bold');
return [{ type: 'TAG', name: 'b' }]
.concat(c, [{type: 'ENDTAG', name: 'b'}]);
}
/ bold_marker { clearFlag('bold'); return null }
// Three apostrophes.
bold_marker
= "'''"
// Italic text: marker, content, marker. The 'italic' flag is raised after
// the opening marker so that inline_breaks stops the content at the next
// italic marker. On success the flag is lowered and the content is returned
// between i TAG/ENDTAG tokens. The second alternative lowers the flag after
// a failed first attempt and returns null.
// NOTE(review): returning null from an action presumably rejects the match
// in the PEG.js version this grammar targets - confirm.
italic
= italic_marker
& { dp('ienter:' + pos); return setFlag('italic'); }
c:inlineline
italic_marker {
clearFlag('italic');
dp('ileave:' + pos);
return [{ type: 'TAG', name: 'i' }]
.concat(c, [{ type: 'ENDTAG', name: 'i'}]);
}
/ italic_marker { clearFlag('italic'); return null }
// Two apostrophes.
italic_marker
= "''"
/* Will need to check anything xmlish against known/allowed HTML tags and
* registered extensions, otherwise fail the match. Should ref be treated as a
* regular extension? */
// For now only the ref and references tags are recognized.
xmlish_tag = ref / references
// A ref tag, either with content and a closing tag or self-closing.
ref = ref_full / ref_empty
/* Can we do backreferences to genericize this? */
// A ref tag with content: start tag, content, end tag. Returns an ext TAG
// token (attribs: data-extname=ref, the tag's own parameters, and the
// whitespace before '>' as data-startws), the content, and an ENDTAG.
ref_full
  = start:ref_start ">" content:ref_content* close:ref_end {
        return [
            { type: 'TAG',
              name: 'ext',
              attribs: [['data-extname', 'ref']]
                  .concat(start.params, [['data-startws', start.ws]])},
            content,
            // The end tag closes the 'ext' element opened above, as in
            // references_full; it used to be named 'ref', which did not
            // match the opening token.
            {type: 'ENDTAG', name: 'ext'}
        ];
    }
// A self-closing ref tag. Returns a one-element array with a self-closing
// ext token (attribs: data-extname=ref, the tag's own parameters, and the
// whitespace after them as data-startws).
ref_empty
= start:ref_start close:(space* "/>") {
return [{ type: 'SELFCLOSINGTAG',
name: 'ext',
attribs: [['data-extname', 'ref']]
.concat(start.params
,[['data-startws', start.ws]])
}];
}
// Opening part of a ref tag up to, but not including, '>' or '/>'.
// Returns an object with the parsed parameters (params) and the trailing
// whitespace match (ws).
ref_start
= "<ref" params:ext_param* ws:space* {
return {
params: params,
ws: ws
};
}
// Closing ref tag, with optional whitespace before '>'. Returns the matched
// parts joined into one string.
ref_end
= all:("</ref" space* ">") {
return all.join('');
}
// Content of a ref tag: inline content that does not begin at the closing
// tag. The lookahead is only checked at the position where inline starts.
ref_content
= !ref_end a:inline { // XXX: ineffective syntactic stop
return a;
}
/* fixme probably have to programmatically add these */
// A references tag, either with content and a closing tag or self-closing.
references = references_full / references_empty
// A references tag with content: start tag, content, end tag. Returns an
// ext TAG token (attribs: data-extname=references, the tag's own
// parameters, and the whitespace before '>' as data-startws), the content,
// and an ext ENDTAG.
references_full
= start:references_start ">" content:references_content* close:references_end {
return [
{ type: 'TAG',
name: 'ext',
attribs: [['data-extname', 'references']]
.concat(start.params
,[['data-startws', start.ws]])
},
content,
{ type: 'ENDTAG', name: 'ext' }
];
}
// A self-closing references tag. Returns a one-element array with a
// self-closing ext token (attribs: data-extname=references, the tag's own
// parameters, and the whitespace after them as data-startws).
references_empty
  = start:references_start close:(space* "/>") {
        // The array literal has to start on the same line as return: with a
        // line break in between, automatic semicolon insertion ends the
        // statement after return and the action yields undefined.
        return [{ type: 'SELFCLOSINGTAG',
                  name: 'ext',
                  attribs: [['data-extname', 'references']]
                      .concat(start.params
                             ,[['data-startws', start.ws]])
                }];
    }
// Opening part of a references tag up to, but not including, '>' or '/>'.
// Returns an object with the parsed parameters (params) and the trailing
// whitespace match (ws).
references_start
= "<references" params:ext_param* ws:space* {
return {
params: params,
ws: ws
};
}
// Closing references tag, with optional whitespace before '>'. Returns the
// matched parts joined into one string.
references_end
= all:("</references" space* ">") {
return all.join('');
}
// Content of a references tag: inline content that does not begin at the
// closing tag.
references_content
= !references_end a:inline {
return a;
}
// One name=value parameter of an extension tag, after optional whitespace.
// ext_param_val delivers a [null, value] pair; the name is filled into its
// first slot, so the result is [name, value].
ext_param
= space* name:ext_param_name "=" val:ext_param_val {
val[0] = name;
return val;
}
// Parameter name: one or more letters, digits or hyphens, as one string.
ext_param_name
= name:[a-zA-Z0-9-]+ {
return name.join('');
}
// Parameter value: a bare alphanumeric run, or a single- or double-quoted
// string that contains neither its quote character nor '>'. Returns
// [null, value]; quoted values are passed through unquote.
ext_param_val
= t:[0-9A-Za-z]+ { return [null, t.join('')]; }
/ "'" t:[^'>]+ "'" { return [null, unquote("'", t.join(''))]; }
/ '"' t:[^">]+ '"' { return [null, unquote('"', t.join(''))]; }
// A run of list lines (definition pairs or plain items). All items are
// flattened and wrapped in ul TAG/ENDTAG tokens, whatever bullet
// characters were used.
lists = es:(dtdd / li)+
{
return [ { type: 'TAG',
name: 'ul'} ] // XXX!!
.concat(flatten(es)
,[{ type: 'ENDTAG', name: 'ul' }]);
}
// One list line: start of line, one or more bullet characters, inline
// content, then a newline lookahead. Returns an li TAG (the raw bullet
// characters are stored in data-styles), the content, and the ENDTAG.
li = sol
bullets:list_char+
c:inlineline
&newline
{
return [ { type: 'TAG',
name: 'li',
attribs: [['data-styles', bullets]] }
, c
, { type: 'ENDTAG', name: 'li' }
];
}
// A term and its description on one line: bullets, term content up to the
// first ':', then the description up to the end of the line. Characters
// that are not part of an inline element become one TEXT token each.
// Returns dl, dt and dd TAG/ENDTAG tokens around the two parts, with the
// raw bullet characters stored in data-styles on the dl tag.
dtdd = sol
bullets:list_char+
c:(inline_element / (n:[^:\n] { return {type: 'TEXT', value: n}; }))+
":"
d:(inline_element / (n:[^\n] { return {type: 'TEXT', value: n}; }))+
&newline
{
// reject rule if bullets do not end in semicolon
if (bullets[bullets.length - 1] != ';') {
return null;
} else {
return [ { type: 'TAG', name: 'dl', attribs: [['data-styles', bullets]] }
, { type: 'TAG', name: 'dt' } ]
.concat( c
, [ {type: 'ENDTAG', name: 'dt'}
, {type: 'TAG', name: 'dd'} ]
, d
, [ {type: 'ENDTAG', name: 'dd'}
, {type: 'ENDTAG', name: 'dl'} ]);
}
}
// A single list prefix character.
list_char = [*#:;]
/* Tables */
// A whole table: start line, optional caption, optional body, end marker.
// Returns a table TAG (the unparsed start-line attributes are stored in
// data-unparsed when present), the caption wrapped in caption tokens when
// there is one, the body rows, and the table ENDTAG.
table
  = tas:table_start c:table_caption? b:table_body? table_end {
        var openTag = {type: 'TAG', name: 'table'};
        var rows = (b !== '') ? b : [];
        dp("body: " + pp(rows));
        if (tas.length > 0) {
            // FIXME: actually parse and build structure
            openTag.attribs = [['data-unparsed', tas.join('')]];
        }
        var captionTokens = (c != '')
            ? [{type: 'TAG', name: 'caption'}]
                  .concat(c, [{type: 'ENDTAG', name: 'caption'}])
            : [];
        return [openTag].concat(captionTokens, rows,
                                [{type: 'ENDTAG', name: 'table'}]);
    }
// Table start line. After the opening marker the 'table' flag is raised
// (table_end lowers it again), then the attribute text and trailing spaces
// are consumed. Returns the array of table_attribs matches. The second
// alternative lowers the flag and returns null.
// NOTE(review): returning null from an action presumably rejects the match
// in the PEG.js version this grammar targets - confirm.
table_start
= sol
"{|"
& { setFlag('table'); return true; }
ta:table_attribs*
space* {
//dp("table_start " + pp(ta) + ", pos:" + pos);
return ta;
}
/ sol "{|" { clearFlag('table'); return null; }
// One piece of table attribute text: a text run, or any single character
// that is neither an inline break nor a newline.
table_attribs
= text / ! inline_breaks !newline .
// Table caption: a newline, the '|+' marker, then inline content. Returns
// the array of inline results.
table_caption
= newline
"|+" c:inline* {
return c;
}
// Table body. Either a first row that has no explicit row marker followed
// by further rows, or just rows. Returns the array of rows.
table_body
= & { dp("table_body enter"); return true; }
firstrow:table_firstrow otherrows:table_row* {
/* dp('table first and otherrows: '
* + pp([firstrow].concat(otherrows))); */
return [firstrow].concat(otherrows);
}
/ otherrows:table_row* {
//dp('table otherrows: ' + pp(otherrows));
return otherrows;
}
// First row without a row marker: one or more data cells wrapped in tr
// TAG/ENDTAG tokens.
table_firstrow
= td:table_data+ {
return [{ type: 'TAG', name: 'tr' }]
.concat(td, [{type: 'ENDTAG', name: 'tr'}]);
}
// A row introduced by a newline and '|-', with optional attributes and
// spaces, followed by any number of data or header cells. Returns the
// cells wrapped in tr TAG/ENDTAG tokens; the row attributes are matched
// but not kept.
table_row
= & { dp("table row enter"); return true; }
newline
"|-" thtd_attribs? space* td:(table_data / table_header)* {
return [{type: 'TAG', name: 'tr'}]
.concat(td, [{type: 'ENDTAG', name: 'tr'}]);
}
// A data cell: introduced by '||' or by a newline plus '|', and not
// followed by one of the characters that would make it a table end,
// caption or row marker. Optional attributes are stored unparsed in
// data-unparsed; the content is any run of blocks or inline content that
// does not begin at an inline break. Returns td TAG/ENDTAG tokens around
// the content.
table_data
= & { dp("table_data enter, pos=" + pos); return true; }
("||" / newline "|")
! [}+-]
a:thtd_attribs?
td:(!inline_breaks anyblock)* {
dp("table data result: " + pp(td) + ", attribts: " + pp(a));
return [{ type: 'TAG', name: 'td', attribs: [['data-unparsed', a]]}]
.concat(td, [{type: 'ENDTAG', name: 'td'}]);
}
// A header cell: introduced by '!!' or by a newline plus '!'. Optional
// attributes are stored unparsed in data-unparsed. Returns th TAG/ENDTAG
// tokens around the inline content.
table_header
= ("!!" / newline "!")
a:thtd_attribs?
c:inline {
return [{type: 'TAG', name: 'th', attribs: [['data-unparsed', a]]}]
.concat(c, [{type: 'ENDTAG', name: 'th'}]);
}
// Cell or row attributes: text runs and a small set of punctuation
// characters, terminated by a single '|' that is not itself the start of
// '||', '|}', '|+' or '|-'. Returns the array of matched pieces.
thtd_attribs
// In particular, do not match [|\n]
= a:(text / ! inline_breaks [="':;/,.-] )+ "|" ! [|}+-] {
return a;
}
// End of table: an optional newline and '|}'. Lowers the 'table' flag that
// table_start raised.
table_end = newline? "|}" { clearFlag('table'); }
/* Wikidom TODO:
* split off text into content nodes
* convert inlines into annotations
* change contents into children
*
* { text: text,
* annotations: [(normal annotations)],
* maybeannotations: [
* { type: 'something',
* side: MA_START,
* tag: { start: x, length: y }
* }
* ]
* }
* offsets in annotations: presume maybeannotations are actually text
* -> need to transform annotations if match found
* -> format annotations, comments can run to the end (re-opened after
* block-level tags); only closed on table cells, object,?
* -> other annotations (images, templates etc) are limited by block-level
* elements, tightly bound
*
* Block-level elements
* --------------------
* - Need some early clean-up to provide structure and offsets
* - Establish scope limits for some inlines
* - Line-based balanced by construction
* - HTML tags need balancing/ matching / implicit close
* - content in illegal places (e.g. between table and td tags) needs foster
* parenting
* - grammar will match outermost pair if unmatched pairs are recognized as
* tokens (or as text)
* - post-processing needed, but has to be limited by scope
*/
/* Tabs do not mix well with the hybrid production syntax */
/* vim: et:ts=4:sw=4:cindent */