mediawiki-extensions-Visual.../modules/parser/pegParser.pegjs.txt

/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */

{
    // Debug print helper. Output is disabled while the local flag is false;
    // flip it to true to see parser traces on the console.
    var dp = function ( msg ) {
        var debugEnabled = false;
        if ( !debugEnabled ) {
            return;
        }
        console.log(msg);
    };


    // Patterns that must not appear inside "anything"/"anyline" content.
    //
    // Each pattern (a regexp source string) is reference counted so that
    // nested constructs can register the same pattern several times; the
    // combined alternation in `regexp` is rebuilt whenever a pattern is added
    // for the first time or its counter drops to zero.
    var forbiddenThingRegexp = {
        // key: regexp source string
        // value: nesting counter, entry removed when it reaches zero
        kvs: {},
        // All active patterns joined with '|'; empty string when none are active
        regexp: "",
        // Recompute `regexp` from the currently registered patterns.
        rebuild: function () {
            var keys = [];
            for (var key in this.kvs) {
                // Own properties only, so inherited members never leak in.
                if (Object.prototype.hasOwnProperty.call(this.kvs, key)) {
                    keys.push(key);
                }
            }
            this.regexp = keys.join('|');
        },
        // Register `key`, or bump its nesting counter if already registered.
        push: function (key) {
            // An own-property test is required here: a plain `!== undefined`
            // check treats names such as 'constructor' as already registered.
            if (Object.prototype.hasOwnProperty.call(this.kvs, key)) {
                this.kvs[key]++;
            } else {
                this.kvs[key] = 1;
                this.rebuild();
            }
        },
        // Undo one push of `key`. Throws (a string) when `key` is not registered.
        pop: function (key) {
            if (!Object.prototype.hasOwnProperty.call(this.kvs, key)) {
                throw "Trying to pop non-existing forbiddenThingRegexp";
            }
            if (this.kvs[key] == 1) {
                delete this.kvs[key];
                this.rebuild();
            } else {
                this.kvs[key]--;
            }
        }
    };


    /* Temporary debugging help: renders a value as indented text.
     *
     * arr   - value to dump; anything whose typeof is 'object' is listed
     *         member by member, everything else is printed inline.
     * level - indentation depth in units of four spaces (defaults to 0).
     *
     * Returns the rendered string. */
    var print_r = function (arr, level) {
        var depth = level ? level : 0;

        // Builds `count` indentation units.
        var pad = function (count) {
            var spaces = "";
            for (var k = 0; k < count; k++) {
                spaces += "    ";
            }
            return spaces;
        };

        // Strings/Chars/Numbers etc. are printed inline with their type.
        if (typeof(arr) != 'object') {
            return "===>" + arr + "<===(" + typeof(arr) + ")";
        }

        // Members are indented one unit deeper than the brackets around them.
        var memberPad = pad(depth + 1);
        var bracketPad = pad(depth);

        var text = "Array\n" + bracketPad + "(\n";
        for (var name in arr) {
            var member = arr[name];
            text += memberPad + "[" + name + "] => ";
            if (typeof(member) == 'object') {
                // Nested arrays/objects are dumped recursively, two levels in.
                text += print_r(member, depth + 2);
            } else {
                text += "'" + member + "'\n";
            }
        }
        text += bracketPad + ")\n\n";

        return text;
    };
}

// Entry point: a page is a sequence of blocks.
start
  = blocks:block* {
        // A block may produce an array of nodes (a list block can contain
        // multiple lists); splice those into the page instead of nesting them.
        var content = [];
        for (var idx = 0; idx < blocks.length; idx++) {
            var node = blocks[idx];
            if (node.constructor == Array) {
                content = content.concat(node);
            } else {
                content.push(node);
            }
        }
        dp(print_r(content));
        return { type: 'page', content: content };
    }

// A block-level construct, falling back to inline content.
anyblock
  = block
  / inline

// As anyblock, but the fallback is inlineline instead of inline.
anyblockline
  = block
  / inlineline

// Arbitrary content (newlines included), cut short at the first position
// where a currently forbidden pattern matches.
anything
  = text
  / s:.+ {
      var matched = s.join('');
      dp("anything: " +print_r(matched));

      // Nothing is forbidden right now: accept the whole run.
      if (forbiddenThingRegexp.regexp === '') {
          return {text: matched};
      }

      var hit = matched.search(forbiddenThingRegexp.regexp);
      if (hit < 0) {
          // No forbidden pattern inside the run.
          return {text: matched};
      }
      if (hit == 0) {
          // Forbidden pattern right at the start: reject the match.
          pos -= matched.length;
          return null;
      }

      dp("anything reverse " + (matched.length - hit)
              + ", matched: " + matched.substr(0,hit));
      // reverse parser position to just before the forbidden pattern
      pos -= matched.length - hit;
      return {text: matched.substr(0,hit)};
  }

// Arbitrary content up to the end of the line, cut short at the first
// position where a currently forbidden pattern matches.
anyline
  = text
  / s:[^\n]+ {
      var line = s.join('');
      dp("anyline: " + print_r(line) + ", pos:" + pos);

      // Nothing is forbidden right now: accept the whole run.
      if (forbiddenThingRegexp.regexp === '') {
          return {text: line};
      }

      var hit = line.search(forbiddenThingRegexp.regexp);
      if (hit < 0) {
          // No forbidden pattern inside the run.
          return {text : line};
      }
      if (hit == 0) {
          // Forbidden pattern right at the start: reject the match.
          pos -= line.length;
          return null;
      }

      // reverse parser position to just before the forbidden pattern
      pos -= line.length - hit;
      dp("anyline reverse " + (line.length - hit)
              + ", matched: " + line.substr(0,hit));
      return {text: line.substr(0,hit)};
  }


// All chars that cannot start syntactic structures, as one string
text
  = chars:[A-Za-z0-9,._ -]+ { return chars.join(''); }

// One or more spaces, as one string
space
  = run:[ ]+ { return run.join(''); }

// A single line feed
newline
  = [\n]

// Block-level constructs; alternatives are tried in the order listed and the
// first one that matches wins.
block
  = br      // lone newline
  / h       // heading
  / table
  / lists
  / para    // paragraph, the fallback

// Headings, levels 1 to 6. Each level is a line of heading text wrapped in
// that many equals signs on both sides and terminated by a newline.
h
  = h1
  / h2
  / h3
  / h4
  / h5
  / h6

h1
  = '=' title:heading_text '=' newline {
    return { type: 'heading', level: 1, text: title };
}

h2
  = '==' title:heading_text '==' newline {
    return { type: 'heading', level: 2, text: title };
}

h3
  = '===' title:heading_text '===' newline {
    return { type: 'heading', level: 3, text: title };
}

h4
  = '====' title:heading_text '====' newline {
    return { type: 'heading', level: 4, text: title };
}

h5
  = '=====' title:heading_text '=====' newline {
    return { type: 'heading', level: 5, text: title };
}

h6
  = '======' title:heading_text '======' newline {
    return { type: 'heading', level: 6, text: title };
}

// One or more consecutive equals signs
heading_marker
  = '=' '='*

// Heading content: anyline pieces up to the next heading marker, concatenated.
// NOTE(review): anyline can also yield an object with a text property; join('')
// would stringify such an object instead of using its text -- confirm intent.
heading_text
  = parts:( !heading_marker piece:(anyline) { return piece } )* { return parts.join(''); }

// A lone newline becomes a br node
br
  = newline { return { type: 'br' }; }

// Paragraph: one line of inline content followed by a newline.
// TODO: convert inline content to annotations!
// NOTE(review): the second alternative has no action, so it yields the raw
// anyline result rather than a paragraph node -- confirm this is intended.
para
  = line:inlineline newline {
      // Only the first node produced for the line is used as the content.
      return { type: 'paragraph', content: line[0] };
  }
  / c:anyline

// Inline content that may span lines. Adjacent plain strings are merged into
// single text nodes; every other result is passed through unchanged.
inline
  = c:(inline_element / anything)+ {
    var nodes = [];
    var pending = '';

    // Emits the accumulated plain text, if any, as one text node.
    var flush = function () {
      if (pending.length) {
        nodes.push({ type: 'text', text: pending });
        pending = '';
      }
    };

    for (var k = 0; k < c.length; k++) {
      if (typeof c[k] == 'string') {
        pending += c[k];
      } else {
        flush();
        nodes.push(c[k]);
      }
    }
    flush();
    return nodes;
}

// Inline content restricted to anyline pieces. Adjacent plain strings are
// merged; every other result is passed through unchanged.
inlineline
  = c:(inline_element / anyline)+ {
    var nodes = [];
    var pending = '';
    var piece;

    for (var k = 0; k < c.length; k++) {
      piece = c[k];
      if (typeof piece == 'string') {
        pending += piece;
        continue;
      }
      // A non-string result ends the current run of plain text.
      if (pending.length) {
        nodes.push({ type: 'text', text: pending });
        pending = '';
      }
      nodes.push(piece);
    }

    if (pending.length) {
      // NOTE(review): unlike the text nodes emitted inside the loop, this
      // trailing node carries no type property (an annotations property was
      // sketched here at some point) -- confirm which shape consumers expect.
      nodes.push({ text: pending });
    }
    return nodes;
}

// Structured inline constructs, tried in the order listed.
inline_element
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold      // must come before italic: its marker starts with the italic one
  / italic

// HTML-style comment. Yields a comment node whose text is the comment body.
// The body may be empty, so that a bare open/close pair is still recognised
// as a comment instead of leaking into the surrounding text.
comment
  = '<!--' body:comment_chars* '-->' {
    return {
        type: 'comment',
        text: body.join('')
    };
}

// One character of comment body: any character (newlines included) that does
// not begin the comment terminator. Dashes, also doubled ones, are allowed
// inside the body; only the full terminator ends the comment.
comment_chars
  = !'-->' c:. { return c; }

// Everything up to the end of the line, as one string.
// (Not referenced by any of the rules visible in this file.)
inline_text_run
  = chars:[^\n]+ { return chars.join(''); }

// Bracketed external link: a URL, a single space, then the link text.
extlink
  = "[" href:url " " caption:extlink_text "]" {
    return { type: 'extlink', target: href, text: caption };
}

//  = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } }

// URL of an external link: a supported protocol prefix followed by everything
// up to the next space or closing bracket. Returns the URL as one string.
// Both plain and TLS web links are accepted.
url
  = proto:("https:" / "http:") rest:([^ \]]+) { return proto + rest.join(''); }

// External link caption: everything up to the closing bracket, as one string
extlink_text
  = chars:[^\]]+ { return chars.join(''); }

// Template transclusion: a target followed by optional pipe-separated
// parameters. The params property is only set when parameters are present.
template
  = "{{" target:link_target params:("|" p:template_param { return p })* "}}" {
    var node = { type: 'template', target: target };
    if (params && params.length) {
      node.params = params;
    }
    return node;
}

// Template parameter: either named (name=content) or positional (content only).
template_param
  = key:template_param_name "=" value:template_param_text {
  return { name: key, content: value };
} / value:template_param_text {
  return { content: value };
}

// Template argument reference (triple braces). The params property is only
// set when parameters are present.
// (Not referenced by any of the rules visible in this file.)
tplarg
  = "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
    var node = { type: 'tplarg', name: name };
    if (params && params.length) {
      node.params = params;
    }
    return node;
}

// Parameter name: characters up to the next equals sign, pipe or template end
template_param_name
  = chars:( !"}}" ch:([^=|]) { return ch } )* { return chars.join(''); }

// Parameter value: a sequence of chunks (inline elements or single characters)
template_param_text
  = template_param_text_chunk*
/*  = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }*/

// One piece of a template parameter value: a structured inline element, or
// failing that a single character that is neither a pipe nor the start of the
// template end marker.
template_param_text_chunk
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold
  / italic
  / !"}}" ch:([^|]) { return ch }

// Internal link: a target, optionally followed by pipe-separated link texts.
link
  = "[[" target:link_target labels:("|" link_text)* "]]" {
    var node = { type: 'link', target: target };
    if (labels && labels.length) {
      // Each entry is a ["|", link_text] pair; only the first pair's text is
      // used, later ones are dropped.
      node.text = labels[0][1];
    }
    return node;
}

// Link target: characters up to the next pipe or link end, as one string
link_target
  = chars:( !"]]" ch:([^|]) { return ch } )* { return chars.join(''); }

// Link text: anyline pieces up to the link end, concatenated.
// NOTE(review): anyline can also yield an object with a text property; join('')
// would stringify such an object instead of using its text -- confirm intent.
link_text
  = parts:( !"]]" piece:(anyline) { return piece } )* { return parts.join(''); }

// Bold text: content wrapped in a pair of bold markers
bold
  = bold_marker content:bold_text bold_marker {
  return { type: 'b', text: content };
}

// Three apostrophes
bold_marker
  = "'''"

// Bold content: at least one anyline piece, stopping at the next bold marker
bold_text
  = parts:( !bold_marker piece:(anyline) { return piece } )+ { return parts.join(''); }

// Italic text: content wrapped in a pair of italic markers
italic
  = italic_marker content:italic_text italic_marker {
  return { type: 'i', text: content };
}

// Two apostrophes
italic_marker
  = "''"

// Italic content: at least one anyline piece, stopping at the next italic marker
italic_text
  = parts:( !italic_marker piece:(anyline) { return piece } )+ { return parts.join(''); }

/* Will need to check anything xmlish against known/allowed HTML tags and
 * registered extensions, otherwise fail the match. Should ref be treated as a
 * regular extension? */
xmlish_tag
  = ref
  / references

// A ref tag, either with content and a closing tag or self-closing
ref
  = ref_full
  / ref_empty

/* Can we do backreferences to genericize this? */
// ref tag with content: opening tag, content pieces, closing tag.
ref_full
  = open:ref_start ">" body:ref_content* end:ref_end {
    return {
        type: 'ext',
        name: 'ref',
        params: open.params,
        ws: open.ws,
        content: body,
        close: end
    };
}

// Self-closing ref tag. Here close is the raw match of the closing sequence
// (optional spaces plus the slash-bracket), not a joined string.
ref_empty
  = open:ref_start end:(space* "/>") {
    return {
        type: 'ext',
        name: 'ref',
        ws: open.ws,
        params: open.params,
        close: end
    };
}

// Start of a ref tag up to, but excluding, the closing bracket: the tag name,
// its parameters and any trailing whitespace.
ref_start
  = "<ref" attrs:ext_param* trailing:space* {
  return { params: attrs, ws: trailing };
}

// Closing ref tag, returned as the joined match
ref_end
  = pieces:("</ref" space* ">") {
  return pieces.join('');
}

// One piece of ref content: anything inline that is not the closing tag
ref_content
  = !ref_end piece:(inline_element / anyline) {
  return piece;
}

/* FIXME: probably have to add these programmatically */
references
  = references_full
  / references_empty

// references tag with content: opening tag, content pieces, closing tag.
references_full
  = open:references_start ">" body:references_content* end:references_end {
    return {
        type: 'ext',
        name: 'references',
        params: open.params,
        ws: open.ws,
        content: body,
        close: end
    };
}

// Self-closing references tag. Here close is the raw match of the closing
// sequence (optional spaces plus the slash-bracket), not a joined string.
references_empty
  = open:references_start end:(space* "/>") {
    return {
        type: 'ext',
        name: 'references',
        ws: open.ws,
        params: open.params,
        close: end
    };
}

// Start of a references tag up to, but excluding, the closing bracket: the
// tag name, its parameters and any trailing whitespace.
references_start
  = "<references" attrs:ext_param* trailing:space* {
  return { params: attrs, ws: trailing };
}

// Closing references tag, returned as the joined match
references_end
  = pieces:("</references" space* ">") {
  return pieces.join('');
}

// One piece of references content: anything inline that is not the closing tag
references_content
  = !references_end piece:(inline_element / anyline) {
  return piece;
}


// Extension tag parameter (name=value). The value object produced by
// ext_param_val is extended with the parameter name and returned.
ext_param
  = space* key:ext_param_name "=" value:ext_param_val {
  value.name = key;
  return value;
}

// Parameter name: letters, digits and dashes
ext_param_name
  = chars:[a-zA-Z0-9-]+ {
  return chars.join('');
}

// Parameter value: bare alphanumerics, or a single- or double-quoted string.
// Quoted values also record which quote character was used.
ext_param_val
  = chars:[0-9A-Za-z]+ { return { text: chars.join('') }; }
  / "'" chars:[^'>]+ "'" { return { quote: "'", text: chars.join('') }; }
  / '"' chars:[^">]+ '"' { return { quote: '"', text: chars.join('') }; }

// A run of consecutive list lines, grouped into nested list nodes.
//
// Every item carries a listStyle (its bullet characters). Comparing the
// listStyle of neighbouring items tells how many of the currently open lists
// stay open (the common prefix) and which new lists have to be opened.
// Returns the array of top-level list nodes.
lists = es:(dtdd / li)+
{
    var out = [],    // List of top-level list nodes
        bstack = "", // Bullet stack, previous element's listStyle
        bnext = "",  // Current element's listStyle
        nodes = [];  // Stack of currently open list nodes, outermost first

    // Number of leading bullet characters that x and y have in common.
    var commonPrefixLength = function (x, y) {
        var minLength = Math.min(x.length, y.length);
        for (var i = 0; i < minLength; i++) {
            if (x[i] != y[i]) {
                break;
            }
        }
        return i;
    };

    // Attach a new list node to the innermost open list (or to the output
    // when no list is open) and make it the innermost open list. The node has
    // to go onto the stack in both cases, otherwise the items of a nested
    // list end up in its parent and the nested list stays empty.
    var pushN = function ( n ) {
        if (nodes.length > 0) {
            nodes[nodes.length - 1].content.push(n);
        } else {
            out.push(n);
        }
        nodes.push(n);
    };

    // Close the lists not shared between the previous (bs) and the current
    // (bn) listStyle, then open one list per remaining bullet of bn.
    var openLists = function ( bs, bn ) {
        var prefix = commonPrefixLength(bs, bn);
        nodes = nodes.slice(0, prefix);
        for (var i = prefix; i < bn.length; i++) {
            switch (bn[i]) {
                case '*':
                    pushN({type: 'ul', content: []});
                    break;
                case '#':
                    pushN({type: 'ol', content: []});
                    break;
                case ';':
                case ':':
                    pushN({type: 'dl', content: []});
                    break;
                default:
                    throw("Unknown node prefix " + bn[i]);
            }
        }
    };

    for (var k = 0; k < es.length; k++) {
        var e = es[k];
        var innermost;
        if (e.type == 'dtdd') {
            // A combined term/definition line: its two items go into the
            // list selected by the term's listStyle.
            bnext = e.content[0].listStyle;
            openLists( bstack, bnext );
            innermost = nodes[nodes.length - 1];
            innermost.content = innermost.content.concat(e.content);
        } else {
            bnext = e.listStyle;
            openLists( bstack, bnext );
            nodes[nodes.length - 1].content.push(e);
        }
        bstack = bnext;
    }
    //dp("out: " + print_r(out, 5));
    return out;
}

// A single list line: bullets, content, newline. The item type is derived
// from the last bullet character; listStyle keeps all bullets.
li = bullets:list_char+
    c:(inlineline / anyline)
    newline
{
    var last = bullets[bullets.length - 1];
    var type;
    if (last === '#' || last === '*') {
        type = 'li';
    } else if (last === ';') {
        type = 'dt';
    } else if (last === ':') {
        type = 'dd';
    }
    return { type: type, listStyle: bullets, content: c };
}

// A definition term and its definition on one line (term, colon, definition).
// Yields a dtdd node wrapping a dt and a dd item; the match is rejected when
// the bullets do not end in a semicolon.
dtdd = bullets:list_char+
       c:(inline_element / [^:\n])+
       ":"
       d:(inline / anyline)
       newline
{
    // reject rule if bullets do not end in semicolon
    if (bullets[bullets.length - 1] != ';') {
        return null;
    }

    // The dd gets the same bullets as the dt with the final semicolon replaced
    // by a colon. bullets is an array of characters, so it has to be joined
    // explicitly: plain string concatenation would insert commas between the
    // leading bullets as soon as there are two or more of them.
    var ddStyle = bullets.slice(0, bullets.length - 1).join('') + ':';

    return {
        type: 'dtdd',
        content: [
            {
                type: 'dt',
                listStyle: bullets,
                content: c
            }, {
                type: 'dd',
                listStyle: ddStyle,
                content: d
            }
        ]
    };
}


// Characters that introduce list items
list_char
  = [*#:;]


/* Tables */

// A whole table: start line, optional caption, optional body, end marker.
// The caption, when present, becomes the first content entry.
table
  = tas:table_start c:table_caption? b:table_body? table_end {
      var res = {type: 'table'};
      // Unmatched optional parts are compared against the empty string here.
      var rows = (b === '') ? [] : b;
      res.content = (c === '') ? rows : [c].concat(rows);
      if (tas.length > 0) {
          // FIXME: actually parse and build structure
          res.attributes = { unparsed: tas };
      }
      dp(print_r(res));
      return res;
  }

// Start of a table: the opening marker, optional attributes, optional spaces
// and an optional newline. Returns the list of attribute pieces.
//
// Right after the opening marker a predicate registers a forbidden pattern
// with forbiddenThingRegexp, so that anything/anyline stop in front of table
// syntax. The pattern matches a pipe at the start of the string or after a
// newline, or a pipe followed by another pipe, a closing brace (written as
// \x7d in the pattern) or a plus sign.
//
// The second alternative only gets a chance to run when the first one failed.
// It removes the pattern again and returns null, which other actions in this
// grammar use to reject a match.
// NOTE(review): the push and this pop rely on the order in which the
// alternatives are evaluated -- keep the pattern string identical in both
// places and in table_end.
table_start
  =  "{|"
     & { forbiddenThingRegexp.push('(\n|^)\\||\\|[|\x7d+]'); return true; }
     ta:table_attribs* space* newline? {
         dp("table_start " + print_r(ta) + ", pos:" + pos);
         return ta;
     }
  / "{|" { forbiddenThingRegexp.pop('(\n|^)\\||\\|[|\x7d+]'); return null; }

// Table attributes are not parsed yet; they are taken as raw line content
table_attribs
  = anyline

// Table caption: the caption marker followed by inline content
table_caption
  = "|+" caption:inline* newline? {
      return { type: 'tableCaption', content: caption };
  }

// Table rows. The first row may come without a row marker, in which case its
// cells directly follow the table start; otherwise only marked rows are read.
// Returns the array of row nodes.
table_body
  = & { dp("table_body enter"); return true; }
    head:table_firstrow rest:table_row* {
      // dp('table first and otherrows: ' + print_r([head].concat(rest)));
      return [head].concat(rest);
  }
  / rows:table_row* {
      // dp('table otherrows: ' + print_r(rows));
      return rows;
  }

// First table row without a row marker: one or more data cells
table_firstrow
  = cells:table_data+ {
      return { type: 'tableRow', content: cells };
  }

// Table row introduced by a row marker: any mix of data and header cells
table_row
  = "|-" space* newline? cells:(table_data / table_header)* {
      return { type: 'tableRow', content: cells };
  }

// Table data cell, introduced either by a double pipe or by a single pipe
// that does not start another table construct. Cell attributes are matched
// but not kept; the cell content is a list of blocks or inline content.
table_data
  = & { dp("table_data enter, pos=" + pos); return true; }
    "||" td_attr? cell:anyblock* newline? {
      //dp("table || result:" + print_r(cell));
      return { type: 'tableCell', content: cell };
  }
  / & { dp("table_data : | enter pos=" + pos); return true; }
    "|" ![}+-] td_attr? cell:anyblock* newline? {
      //dp("table | result:" + print_r(cell));
      return { type: 'tableCell', content: cell };
  }

// Cell attributes: text up to a single pipe that is not part of a double pipe.
// Returns the attribute text without the pipe.
td_attr
  = a:[^\n|]+ "|" !"|" {
    var attrs = a.join('');
    dp("td_attr: " + attrs);
    return attrs;
}

// Table header cell, introduced by a double or a single exclamation mark.
// The content is kept as the raw list of matched pieces.
table_header
  = "!!" cell:(inline / ("!" !"!") / [^!\n])* newline? {
      return { type: 'tableHeader', content: cell };
  }
  / "!" cell:(inline / text / '!' !'!' / [^!\n])* newline? {
      return { type: 'tableHeader', content: cell };
  }

// End of a table: the closing marker and an optional newline. The action
// removes the forbidden pattern that table_start registered; the pattern
// string must stay identical to the one used there. The action has no return
// statement.
table_end = "|}" newline? { forbiddenThingRegexp.pop('(\n|^)\\||\\|[|\x7d+]'); }


/* Wikidom TODO:
 * split off text into content nodes
 * convert inlines into annotations
 * change contents into children
 */