mediawiki-extensions-Visual.../modules/parser/pegParser.pegjs.txt
2011-11-04 11:08:11 +00:00

739 lines
16 KiB
Plaintext

/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
{
/**
 * Debug print helper. Output is currently switched off; set `enabled` to
 * true to have every message written with console.log.
 *
 * @param {*} msg Value to log.
 */
var dp = function ( msg ) {
    var enabled = false;
    if ( !enabled ) {
        return;
    }
    console.log( msg );
};
// Forbidden chars in anything content
/**
 * Reference-counted set of regexp source fragments.
 *
 * kvs maps a fragment to its nesting counter; `regexp` holds all registered
 * fragments joined with '|' and is rebuilt whenever a fragment is added or
 * removed. An empty `regexp` string means that nothing is registered.
 *
 * Only own properties of kvs are considered, so fragments that happen to
 * share a name with an Object.prototype member (for example 'toString')
 * are counted like any other fragment.
 */
var forbiddenThingRegexp = {
    // key: regexp
    // value: nesting counter, regexp removed if zero reached
    kvs: {},
    regexp: "",

    /** Recompute `regexp` from the currently registered fragments. */
    rebuild: function () {
        var keys = [];
        for (var key in this.kvs) {
            if (Object.prototype.hasOwnProperty.call(this.kvs, key)) {
                keys.push(key);
            }
        }
        this.regexp = keys.join('|');
    },

    /**
     * Register a fragment, or increase its nesting counter if it is
     * already registered.
     * @param {string} key regexp source fragment
     */
    push: function (key) {
        if (Object.prototype.hasOwnProperty.call(this.kvs, key)) {
            this.kvs[key]++;
        } else {
            this.kvs[key] = 1;
            this.rebuild();
        }
    },

    /**
     * Decrease the nesting counter of a fragment and drop the fragment
     * once the counter reaches zero.
     * @param {string} key regexp source fragment
     * @throws {Error} if the fragment is not registered
     */
    pop: function (key) {
        if (!Object.prototype.hasOwnProperty.call(this.kvs, key)) {
            throw new Error("Trying to pop non-existing forbiddenThingRegexp");
        }
        if (this.kvs[key] === 1) {
            delete this.kvs[key];
            this.rebuild();
        } else {
            this.kvs[key]--;
        }
    }
};
/* Temporary debugging help. Is there anything similar in JS or a library? */
/**
 * Render a value as an indented, PHP print_r-like string.
 *
 * Anything whose typeof is 'object' is listed member by member; nested
 * objects are rendered recursively two levels deeper. Every other value is
 * rendered on a single line together with its type name.
 *
 * @param {*} arr Value to dump.
 * @param {number} [level] Current nesting level, defaults to 0.
 * @returns {string} The textual dump.
 */
var print_r = function (arr, level) {
    // Builds a run of `count` single spaces.
    var spaces = function (count) {
        var run = "";
        for (var n = 0; n < count; n++) {
            run += " ";
        }
        return run;
    };

    var depth = level ? level : 0;

    // Strings/Chars/Numbers etc.
    if (typeof arr !== 'object') {
        return "=>" + arr + "<=(" + typeof arr + ")";
    }

    // Array/Hashes/Objects
    var entryPad = spaces(depth + 1);
    var bracketPad = spaces(depth);
    var pieces = ["Array\n", bracketPad + "(\n"];
    for (var key in arr) {
        var entry = arr[key];
        if (typeof entry === 'object') {
            pieces.push(entryPad + "[" + key + "] => ");
            pieces.push(print_r(entry, depth + 2));
        } else {
            pieces.push(entryPad + "[" + key + "] => '" + entry + "'\n");
        }
    }
    pieces.push(bracketPad + ")\n\n");
    return pieces.join("");
};
// Convert list prefixes to a list of WikiDom list styles
/**
 * Maps each list marker character to its style name:
 * '*' to 'bullet', '#' to 'number', ';' to 'definitionterm' and
 * ':' to 'definitiondata'. Any other entry is skipped.
 *
 * @param {Array|string} bullets Indexable sequence of marker characters.
 * @returns {Array} Style names, in input order.
 */
var bulletsToTypes = function (bullets) {
    var styleByMarker = {
        '*': 'bullet',
        '#': 'number',
        ';': 'definitionterm',
        ':': 'definitiondata'
    };
    var styles = [];
    for (var idx = 0; idx < bullets.length; idx++) {
        var marker = bullets[idx];
        // Only primitive strings that are own keys of the table count, which
        // mirrors a strict comparison against the four marker characters.
        if (typeof marker === 'string' &&
            Object.prototype.hasOwnProperty.call(styleByMarker, marker)) {
            styles.push(styleByMarker[marker]);
        }
    }
    return styles;
};
}
/*
 * start: entry rule. Parses a sequence of blocks and returns a node of type
 * 'page' whose content is the list of block results, with array results
 * spliced in one level deep.
 */
start
  = e:block* {
        // flatten sub-arrays, as a list block can contain multiple lists
        var flat = [];
        for ( var idx = 0; idx < e.length; idx++ ) {
            var node = e[idx];
            if ( node.constructor == Array ) {
                flat = flat.concat( node );
            } else {
                flat.push( node );
            }
        }
        dp( flat );
        return {
            type: 'page',
            content: flat
        };
    }
// anyblock: a block, or failing that inline content (inline uses `anything`,
// which is not limited to a single line).
anyblock = block / inline
// anyblockline: a block, or failing that inline content built from `anyline`.
anyblockline = block / inlineline
/*
 * anything: a run of `text`, or otherwise one or more arbitrary characters,
 * cut back to the first place where forbiddenThingRegexp matches.
 *
 * Results of the second alternative:
 *  - nothing registered, or no match: an object with the whole run in `text`
 *  - match at offset m > 0: an object with the first m characters in `text`;
 *    `pos` is moved back by the number of characters that were not kept
 *  - match at offset 0: `pos` is moved back over the whole run and null is
 *    returned
 * NOTE(review): presumably `pos` is the generated parser's input position and
 * a null result makes the alternative fail -- confirm against the PEG.js
 * version in use.
 */
anything
= text
/ s:.+ {
// reject match if forbiddenThingRegexp matches
var str = s.join('');
dp("anything: " +print_r(str));
if (forbiddenThingRegexp.regexp !== '') {
// String.prototype.search converts the pattern string to a RegExp and
// returns the index of the first match, or -1 if there is none
var m = str.search(forbiddenThingRegexp.regexp)
if ( m > 0 ) {
dp("anything reverse " + (str.length - m)
+ ", matched: " + str.substr(0,m));
// reverse parser position
pos -= str.length - m;
return {text: str.substr(0,m)};
} else {
if (m == 0) {
// forbidden pattern right at the start: give back everything
pos -= str.length;
return null;
} else {
return {text: str};
}
}
} else {
return {text: str};
}
}
/*
 * anyline: a run of `text`, or otherwise one or more characters other than
 * a newline, cut back to the first place where forbiddenThingRegexp matches.
 *
 * Results of the second alternative:
 *  - nothing registered, or no match: an object with the whole run in `text`
 *  - match at offset m > 0: an object with the first m characters in `text`;
 *    `pos` is moved back by the number of characters that were not kept
 *  - match at offset 0: `pos` is moved back over the whole run and null is
 *    returned
 * The first alternative returns a plain string, so callers receive either a
 * string or an object with a `text` member.
 * NOTE(review): presumably `pos` is the generated parser's input position and
 * a null result makes the alternative fail -- confirm against the PEG.js
 * version in use.
 */
anyline
= text
/ s:[^\n]+ {
// reject match if forbiddenThingRegexp matches
var str = s.join('');
dp("anyline: " + print_r(str) + ", pos:" + pos);
if (forbiddenThingRegexp.regexp !== '') {
// String.prototype.search converts the pattern string to a RegExp and
// returns the index of the first match, or -1 if there is none
var m = str.search(forbiddenThingRegexp.regexp)
if ( m > 0 ) {
// reverse parser position
pos -= str.length - m;
dp("anyline reverse " + (str.length - m)
+ ", matched: " + str.substr(0,m));
return {text: str.substr(0,m)};
} else {
if (m == 0) {
// forbidden pattern right at the start: give back everything
pos -= str.length;
return null;
} else {
return {text : str};
}
}
} else {
return {text: str};
}
}
// All chars that cannot start syntactic structures
// (letters, digits, comma, dot, underscore, space and hyphen); returns the
// matched run as one string.
text = t:[A-Za-z0-9,._ -]+ { return t.join('') }
// space: one or more space characters, returned as one string.
space
= s:[ ]+ { return s.join(''); }
// newline: a single line feed character.
newline
= [\n]
// block: block-level constructs, tried in the listed order; para is the
// last alternative.
block
= br
/ h
/ table
/ lists
/ para
/*
 * Headings. h tries the six levels from 1 to 6. Each level rule expects the
 * same number of '=' characters before and after the heading text, followed
 * directly by a newline, and returns a node of type 'heading' carrying the
 * level and the heading text.
 */
h = h1 / h2 / h3 / h4 / h5 / h6
// Level 1: =text=
h1 = '=' c:heading_text '=' newline {
return {
type: 'heading',
level: 1,
text: c
}
}
// Level 2: ==text==
h2 = '==' c:heading_text '==' newline {
return {
type: 'heading',
level: 2,
text: c
}
}
// Level 3: ===text===
h3 = '===' c:heading_text '===' newline {
return {
type: 'heading',
level: 3,
text: c
}
}
// Level 4: ====text====
h4 = '====' c:heading_text '====' newline {
return {
type: 'heading',
level: 4,
text: c
}
}
// Level 5: =====text=====
h5 = '=====' c:heading_text '=====' newline {
return {
type: 'heading',
level: 5,
text: c
}
}
// Level 6: ======text======
h6 = '======' c:heading_text '======' newline {
return {
type: 'heading',
level: 6,
text: c
}
}
// heading_marker: one or more '=' characters.
heading_marker
= '=' '='*
// heading_text: zero or more anyline results, each starting at a position
// where no heading_marker matches, joined into one string.
// NOTE(review): anyline can also return an object instead of a string;
// joining would stringify such a value -- confirm whether that can happen here.
heading_text
= h:( !heading_marker x:(anyline) { return x } )* { return h.join(''); }
// br: a bare newline, returned as a node of type 'br'.
br
= newline { return {type: 'br'} }
// TODO: convert inline content to annotations!
// para: an inlineline followed by a newline becomes a node of type
// 'paragraph' whose content is only the first element of the inlineline
// result. The second alternative (an anyline without a following newline)
// has no action of its own, so the anyline result is not wrapped in a
// paragraph node.
para
= c:inlineline newline { return {type: 'paragraph', content: c[0] } } /
c:anyline
/*
 * inline: one or more inline elements or `anything` runs. Consecutive string
 * results are merged into a single node of type 'text'; every other result
 * is passed through unchanged, in order.
 */
inline
  = c:(inline_element / anything)+ {
        var nodes = [];
        var pending = '';
        // Emits the accumulated string run, if any, as one text node.
        var flush = function () {
            if ( pending.length ) {
                nodes.push( { type: 'text', text: pending } );
                pending = '';
            }
        };
        for ( var idx = 0; idx < c.length; idx++ ) {
            var piece = c[idx];
            if ( typeof piece == 'string' ) {
                pending += piece;
                continue;
            }
            flush();
            nodes.push( piece );
        }
        flush();
        return nodes;
    }
/*
 * inlineline: one or more inline elements or `anyline` runs. Consecutive
 * string results are merged; a run that is followed by a non-string result
 * becomes a node of type 'text', while a run left over at the end is emitted
 * as an object with only a `text` member. Non-string results are passed
 * through unchanged, in order.
 */
inlineline
  = c:(inline_element / anyline)+ {
        var nodes = [];
        var pending = '';
        for ( var idx = 0; idx < c.length; idx++ ) {
            var piece = c[idx];
            if ( typeof piece == 'string' ) {
                pending += piece;
                continue;
            }
            if ( pending.length ) {
                nodes.push( { type: 'text', text: pending } );
                pending = '';
            }
            nodes.push( piece );
        }
        if ( pending.length ) {
            // trailing run: no type member (and no annotations: [] yet)
            nodes.push( { text: pending } );
        }
        return nodes;
    }
// inline_element: the structured inline constructs, tried in the listed
// order. bold is tried before italic.
inline_element
= comment
/ xmlish_tag
/ extlink
/ template
/ link
/ bold
/ italic
/*
 * comment: an HTML-style comment. Everything between the opening marker and
 * the first following '-->' is returned as the text of a node of type
 * 'comment'. The body may be empty and may contain '-' or '--' sequences;
 * an unterminated comment does not match.
 */
comment
  = '<!--' c:comment_chars* '-->' {
        return {
            type: 'comment',
            text: c.join('')
        };
    }

// comment_chars: any single character that does not start the closing '-->'.
comment_chars
  = !'-->' c:. { return c; }
// inline_text_run: one or more characters other than a newline, returned
// as one string.
inline_text_run
= c:[^\n]+ { return c.join(''); }
// extlink: a bracketed external link of the form [url text]. Exactly one
// space separates the url from the link text; returns a node of type
// 'extlink' with the target and the text.
extlink
= "[" target:url " " text:extlink_text "]" {
return {
type: 'extlink',
target: target,
text: text
}
}
// = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } }
// url: "http:" or "https:" followed by one or more characters other than a
// space or a closing square bracket; returns the whole url as one string.
url
  = proto:("https:" / "http:") rest:([^ \]]+) { return proto + rest.join(''); }
// extlink_text: one or more characters other than a closing square bracket,
// returned as one string.
extlink_text
= c:[^\]]+ { return c.join(''); }
/*
 * template: a double-brace transclusion with a target and optional
 * pipe-separated parameters. Returns a node of type 'template'; the params
 * member is only present when at least one parameter was parsed.
 */
template
  = "{{" target:link_target params:("|" p:template_param { return p })* "}}" {
        var node = { type: 'template', target: target };
        if ( params && params.length ) {
            node.params = params;
        }
        return node;
    }
// template_param: either a named parameter (name, "=", content) or a bare
// content-only parameter. The named form is tried first; the result carries
// a name member only in that case.
template_param
= name:template_param_name "=" c:template_param_text {
return {
name: name,
content: c
};
} / c:template_param_text {
return {
content: c
};
}
/*
 * tplarg: a triple-brace template argument with a name and optional
 * pipe-separated parameters. Returns a node of type 'tplarg'; the params
 * member is only present when at least one parameter was parsed.
 */
tplarg
  = "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
        var node = { type: 'tplarg', name: name };
        if ( params && params.length ) {
            node.params = params;
        }
        return node;
    }
// template_param_name: zero or more characters other than "=" and "|",
// stopping before a closing double brace; returned as one string.
template_param_name
= h:( !"}}" x:([^=|]) { return x } )* { return h.join(''); }
// template_param_text: zero or more chunks; the result is the list of chunk
// results (it is not joined into a string).
template_param_text
= template_param_text_chunk*
/* = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }*/
// template_param_text_chunk: one structured inline construct, or otherwise a
// single character that is not "|" and does not start a closing double brace.
template_param_text_chunk
= comment
/ xmlish_tag
/ extlink
/ template
/ link
/ bold
/ italic
/ !"}}" x:([^|]) { return x }
/*
 * link: an internal link in double square brackets with a target and
 * optional pipe-separated texts. Returns a node of type 'link'; only the
 * first text is kept, and the text member is absent when no text was given.
 */
link
  = "[[" target:link_target text:("|" link_text)* "]]" {
        var node = { type: 'link', target: target };
        if ( text && text.length ) {
            // each entry is a ["|", link_text] pair; keep the first pair's text
            node.text = text[0][1]; // ehhhh
        }
        return node;
    }
// link_target: zero or more characters other than "|", stopping before a
// closing "]]"; returned as one string.
link_target
= h:( !"]]" x:([^|]) { return x } )* { return h.join(''); }
// link_text: zero or more anyline results, each starting at a position where
// no closing "]]" matches, joined into one string.
// NOTE(review): anyline can also return an object instead of a string;
// joining would stringify such a value -- confirm whether that can happen here.
link_text
= h:( !"]]" x:(anyline) { return x } )* { return h.join(''); }
// bold: text enclosed in a pair of bold markers, returned as a node of
// type 'b'.
bold
  = bold_marker c:bold_text bold_marker {
        return {
            type: 'b',
            text: c
        };
    }

// bold_marker: three apostrophes.
bold_marker
  = "'''"

// bold_text: one or more anyline results, each starting at a position where
// no bold_marker matches, joined into one string.
bold_text
  = h:( !bold_marker x:(anyline) { return x } )+ { return h.join(''); }
// italic: text enclosed in a pair of italic markers, returned as a node of
// type 'i'.
italic
= italic_marker c:italic_text italic_marker {
return {
type: 'i',
text: c
}
}
// italic_marker: two apostrophes.
italic_marker
= "''"
// italic_text: one or more anyline results, each starting at a position
// where no italic_marker matches, joined into one string.
italic_text
= h:( !italic_marker x:(anyline) { return x } )+ { return h.join(''); }
/* Will need to check anything xmlish against known/allowed HTML tags and
 * registered extensions, otherwise fail the match. Should ref be treated as a
 * regular extension? */
// xmlish_tag: currently only the ref and references extension tags.
xmlish_tag =
ref /
references
// ref: the form with content and a closing tag is tried before the
// self-closing form.
ref = ref_full / ref_empty
/* Can we do backreferences to genericize this? */
// ref_full: opening ref tag, content and closing tag. Returns a node of type
// 'ext' named 'ref' with the params and whitespace of the opening tag, the
// list of content items and the text of the closing tag.
ref_full
= start:ref_start ">" content:ref_content* close:ref_end {
return {
type: 'ext',
name: 'ref',
params: start.params,
ws: start.ws,
content: content,
close: close
}
}
// ref_empty: self-closing ref tag. Returns a node of type 'ext' named 'ref'
// without a content member; close holds the raw match of the
// (space* "/>") sequence and is not joined into a string here.
ref_empty
= start:ref_start close:(space* "/>") {
return {
type: 'ext',
name: 'ref',
ws: start.ws,
params: start.params,
close: close
}
}
// ref_start: "<ref" followed by any number of parameters and optional
// spaces; returns the parameter list and the matched spaces.
ref_start
= "<ref" params:ext_param* ws:space* {
return {
params: params,
ws: ws
};
}
// ref_end: closing ref tag, optionally with spaces before ">"; returns the
// joined match.
ref_end
= all:("</ref" space* ">") {
return all.join('');
}
// ref_content: one inline element or anyline result, at a position where
// the closing tag does not match.
ref_content
= !ref_end a:(inline_element / anyline) {
return a;
}
/* fixme probably have to programmatically add these */
// references: the form with content and a closing tag is tried before the
// self-closing form.
references = references_full / references_empty
// references_full: opening references tag, content and closing tag. Returns
// a node of type 'ext' named 'references' with the params and whitespace of
// the opening tag, the list of content items and the text of the closing tag.
references_full
= start:references_start ">" content:references_content* close:references_end {
return {
type: 'ext',
name: 'references',
params: start.params,
ws: start.ws,
content: content,
close: close
}
}
// references_empty: self-closing references tag. Returns a node of type
// 'ext' named 'references' without a content member; close holds the raw
// match of the (space* "/>") sequence and is not joined into a string here.
references_empty
= start:references_start close:(space* "/>") {
return {
type: 'ext',
name: 'references',
ws: start.ws,
params: start.params,
close: close
}
}
// references_start: "<references" followed by any number of parameters and
// optional spaces; returns the parameter list and the matched spaces.
references_start
= "<references" params:ext_param* ws:space* {
return {
params: params,
ws: ws
};
}
// references_end: closing references tag, optionally with spaces before
// ">"; returns the joined match.
references_end
= all:("</references" space* ">") {
return all.join('');
}
// references_content: one inline element or anyline result, at a position
// where the closing tag does not match.
references_content
= !references_end a:(inline_element / anyline) {
return a;
}
// ext_param: optional spaces, a name, "=" and a value. The name is stored
// on the value object produced by ext_param_val, and that object is returned.
ext_param
= space* name:ext_param_name "=" val:ext_param_val {
val.name = name;
return val;
}
// ext_param_name: one or more letters, digits or hyphens, as one string.
ext_param_name
= name:[a-zA-Z0-9-]+ {
return name.join('');
}
// ext_param_val: an unquoted run of letters and digits, or a single- or
// double-quoted value. Quoted values may not contain their quote character
// or ">"; the result records the quote character in its quote member.
ext_param_val
= t:[0-9A-Za-z]+ { return {text: t.join('') } }
/ "'" t:[^'>]+ "'" { return { quote: "'", text: t.join('') } }
/ '"' t:[^">]+ '"' { return { quote: '"', text: t.join('') } }
/*
 * lists: one or more consecutive list lines (dtdd or li). Array results are
 * spliced into the child list one level deep, so the returned node of type
 * 'list' has one flat list of children.
 */
lists
  = es:(dtdd / li)+ {
        // Flatten es
        var children = [];
        for ( var idx = 0, count = es.length; idx < count; idx++ ) {
            var entry = es[idx];
            children = $.isArray( entry )
                ? children.concat( entry )
                : children.concat( [ entry ] );
        }
        return {
            type: 'list',
            children: children
        };
    }
// li: one or more list marker characters, a line of content and a newline.
// Returns a node of type 'listItem' whose styles are derived from the
// markers with bulletsToTypes; only the first element of the content match
// (c[0]) is kept as content.
li = bullets:list_char+
c:(inlineline / anyline)
newline
{
return {
type: 'listItem',
attributes: {
styles: bulletsToTypes(bullets)
},
content: c[0]
};
}
/*
 * dtdd: a definition term and its definition on one line, i.e. list markers
 * ending in ';', the term, ':', the definition and a newline. Returns two
 * nodes of type 'listItem': the term with the styles of the markers as
 * written, and the definition with the last marker replaced by ':'.
 * The rule is rejected (null result) when the markers do not end in ';'.
 *
 * NOTE(review): c and d may contain inline_element results, which are
 * objects; joining stringifies them -- confirm the intended content shape.
 */
dtdd = bullets:list_char+
       c:(inline_element / [^:\n])+
       ":"
       d:(inline_element / [^\n])+
       newline
{
    // reject rule if bullets do not end in semicolon
    if (bullets[bullets.length - 1] != ';') {
        return null;
    }
    // Build the marker list of the definition as a real list; adding ':' to
    // the sliced array with + would turn it into a comma-joined string.
    var dataBullets = bullets.slice(0, bullets.length - 1).concat([':']);
    return [
        {
            type: 'listItem',
            attributes: {styles: bulletsToTypes(bullets)},
            content: {text: c.join('')}
        }, {
            type: 'listItem',
            attributes: {styles: bulletsToTypes(dataBullets)},
            content: {text: d.join('')}
        }
    ];
}
// list_char: one of the list marker characters * # : ;
list_char = [*#:;]
/* Tables */
// table: table start, optional caption, optional body and table end.
// Returns a node of type 'table' whose content is the caption (if any)
// followed by the body rows; the raw table_start result is stored as
// attributes.unparsed when it is non-empty.
// NOTE(review): the comparisons against '' assume that an unmatched optional
// yields an empty string in the PEG.js version in use -- confirm.
table
= tas:table_start c:table_caption? b:table_body? table_end {
var res = {type: 'table'}
var body = b !== '' ? b : [];
if (c !== '') {
res.content = [c].concat(body);
} else {
res.content = body;
}
if (tas.length > 0) {
// FIXME: actually parse and build structure
res.attributes = { unparsed: tas }
}
dp(print_r(res));
return res;
}
// table_start: the table opening marker followed by optional attributes,
// spaces and a newline. Before the attributes are parsed, a pattern is pushed
// onto forbiddenThingRegexp so that anything/anyline stop in front of table
// syntax; table_end pops the same pattern again. In the pattern, \x7d is the
// hex escape for the closing curly brace.
// The second alternative pops the pattern and returns null.
// NOTE(review): if a table fails to parse after table_start has matched, no
// pop is visible here for that push -- confirm how this is handled.
table_start
= "{|"
& { forbiddenThingRegexp.push('(\n|^)\\||\\|[|\x7d+]'); return true; }
ta:table_attribs* space* newline? {
dp("table_start " + print_r(ta) + ", pos:" + pos);
return ta;
}
/ "{|" { forbiddenThingRegexp.pop('(\n|^)\\||\\|[|\x7d+]'); return null; }
// table_attribs: currently just an unparsed anyline.
table_attribs = anyline
// table_caption: "|+" followed by inline content and an optional newline;
// returns a node of type 'tableCaption'.
table_caption
= "|+" c:inline* newline? {
return {
type: 'tableCaption',
content: c
}
}
// table_body: either a first row without a row marker followed by regular
// rows, or regular rows only. Returns the list of rows. The leading
// predicate only writes a debug message and always succeeds.
table_body
= & { dp("table_body enter"); return true; }
firstrow:table_firstrow otherrows:table_row* {
/* dp('table first and otherrows: '
* + print_r([firstrow].concat(otherrows))); */
return [firstrow].concat(otherrows);
}
/ otherrows:table_row* {
//dp('table otherrows: ' + print_r(otherrows));
return otherrows;
}
// table_firstrow: one or more data cells that are not preceded by a row
// marker; returns a node of type 'tableRow'.
table_firstrow
= td:table_data+ {
return {
type: 'tableRow',
content: td
};
}
// table_row: the row marker "|-", optional spaces and newline, then any
// number of data or header cells; returns a node of type 'tableRow'.
table_row
= "|-" space* newline? td:(table_data / table_header)* {
return {
type: 'tableRow',
content: td
};
}
// table_data: a data cell introduced by "||", or by a single "|" that is
// not followed by a closing curly brace, "+" or "-". Optional cell
// attributes are matched but not stored; the cell content is a list of
// anyblock results. Returns a node of type 'tableCell'. The leading
// predicates only write debug messages and always succeed.
table_data
= & { dp("table_data enter, pos=" + pos); return true; }
"||" td_attr? td:anyblock* newline? {
//dp("table || result:" + print_r(td));
return {
type: 'tableCell',
content: td
};
}
/ & { dp("table_data : | enter pos=" + pos); return true; }
"|" ![}+-] td_attr? td:anyblock* newline? {
//dp("table | result:" + print_r(td));
return {
type: 'tableCell',
content: td
};
}
// td_attr: one or more characters other than newline and "|", followed by
// a single "|" that is not followed by another "|"; returns the characters
// as one string.
td_attr = a:[^\n|]+ "|" !"|" {
dp("td_attr: " + a.join(''));
return a.join('');
}
// table_header: a header cell introduced by "!!" or "!", followed by inline
// content, single "!" characters or other non-newline characters, and an
// optional newline. Returns a node of type 'tableHeader' whose content is
// the raw list of matched items.
table_header
= "!!" c:(inline / ("!" !"!") / [^!\n])* newline? {
return {
type: 'tableHeader',
content: c
}
}
/ "!" c:(inline / text / '!' !'!' / [^!\n])* newline? {
return {
type: 'tableHeader',
content: c
}
}
// table_end: the table closing marker and an optional newline. Pops the
// pattern that table_start pushed onto forbiddenThingRegexp; the action has
// no return statement.
table_end = "|}" newline? { forbiddenThingRegexp.pop('(\n|^)\\||\\|[|\x7d+]'); }
/* Wikidom TODO:
* split off text into content nodes
* convert inlines into annotations
* change contents into children
*/