mediawiki-extensions-Visual.../modules/parser/pegParser.pegjs.txt

/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */

{
    /**
     * Debug print helper. Tracing is switched off by a hard-coded constant,
     * so every call is a no-op until that constant is edited.
     *
     * @param {*} msg Value that would be written to the console.
     */
    var dp = function ( msg ) {
        var tracingEnabled = false;
        if ( !tracingEnabled ) {
            return;
        }
        console.log(msg);
    };

    /*
     * Flags for specific parse environments (inside tables, links etc). Flags
     * trigger syntactic stops in the inline_breaks production, which
     * terminates inline and attribute matches. Flags merely reduce the number
     * of productions needed: The grammar is still context-free as the
     * productions can just be unrolled for all combinations of environments
     * at the cost of a much larger grammar.
     */
    // Nesting counters keyed by flag name. A flag counts as "set" while its
    // counter is truthy (i.e. greater than zero).
    var syntaxFlags = {};

    /**
     * Enter a parse environment: increment the counter for `flag`.
     *
     * @param {string} flag Name of the environment (e.g. 'table', 'bold').
     * @returns {boolean} Always true, so the call can be returned directly
     *     from a `& { ... }` semantic predicate.
     */
    var setFlag = function(flag) {
        // `> 0` is false for undefined, 0, negative numbers and NaN, so a
        // missing or corrupted counter simply restarts at 1.
        if (syntaxFlags[flag] > 0) {
            syntaxFlags[flag]++;
        } else {
            syntaxFlags[flag] = 1;
        }
        return true;
    };

    /**
     * Leave a parse environment: decrement the counter for `flag`.
     *
     * Clearing a flag that is not currently set is a no-op. Decrementing
     * blindly would store NaN (for a never-set flag) or a negative number,
     * and either value would corrupt every later truthiness test of the flag.
     *
     * @param {string} flag Name of the environment.
     */
    var clearFlag = function(flag) {
        if (syntaxFlags[flag] > 0) {
            syntaxFlags[flag]--;
        }
    };


    /* Temporary debugging help. Is there anything similar in JS or a library? */
    /**
     * Render a value as an indented, PHP print_r-like string for debugging.
     *
     * Anything whose typeof is 'object' (arrays, plain objects, null) is
     * printed as an "Array ( ... )" listing of its enumerable keys, with
     * nested objects indented two levels deeper. Every other value is
     * printed on one line together with its type.
     *
     * @param {*} arr Value to dump.
     * @param {number} [level] Current nesting depth; defaults to 0.
     * @returns {string} The textual dump.
     */
    var print_r = function (arr, level) {
        // Build `count` copies of the two-space indentation unit.
        var indent = function (count) {
            var pad = "";
            for (var n = 0; n < count; n++) {
                pad += "  ";
            }
            return pad;
        };

        level = level || 0;

        // Strings, numbers, booleans etc. are rendered inline.
        if (typeof arr != 'object') {
            return "=>" + arr + "<=(" + typeof arr + ")";
        }

        var entryPad = indent(level + 1);
        var bracketPad = indent(level);

        var out = "Array\n" + bracketPad + "(\n";
        for (var key in arr) {
            var entry = arr[key];
            out += entryPad + "[" + key + "] => ";
            if (typeof entry == 'object') {
                // Nested structure: recurse two levels deeper.
                out += print_r(entry, level + 2);
            } else {
                out += "'" + entry + "'\n";
            }
        }
        out += bracketPad + ")\n\n";
        return out;
    };

    /**
     * Convert list prefixes to a list of WikiDom list styles.
     *
     * Each of the characters '*', '#', ';' and ':' is mapped to its style
     * name; any other element of the input is skipped.
     *
     * @param {Array|string} bullets Indexable sequence of prefix characters.
     * @returns {string[]} Style names, in input order.
     */
    var bulletsToTypes = function (bullets) {
        var styleByBullet = {
            '*': 'bullet',
            '#': 'number',
            ';': 'term',
            ':': 'description'
        };
        var bTypes = [];
        var idx = 0;
        while (idx < bullets.length) {
            var bullet = bullets[idx];
            idx++;
            // Only exact string matches count, mirroring a strict comparison.
            if (typeof bullet === 'string' &&
                    Object.prototype.hasOwnProperty.call(styleByBullet, bullet)) {
                bTypes.push(styleByBullet[bullet]);
            }
        }
        return bTypes;
    };

    /**
     * Wrap the flattened text of a parse result in an inline content object.
     *
     * @param {*} node Parse result accepted by extractText.
     * @returns {{text: string}} Object whose `text` is extractText(node).
     */
    var extractInline = function ( node ) {
        var inline = {};
        inline.text = extractText(node);
        return inline;
    };


    /**
     * Flatten a parse result into plain text.
     *
     * - null / undefined contribute the empty string (previously these
     *   raised a TypeError from the `in` operator);
     * - strings are returned as is;
     * - arrays are flattened element by element;
     * - objects are reduced through the first of their `text`, `content` or
     *   `children` members that is present.
     *
     * @param {*} node Parse result to flatten.
     * @returns {string} Concatenated text.
     * @throws {string} "extract failed: ..." for an object that has none of
     *     the recognised members.
     */
    var extractText = function ( node ) {
        // Concatenate the extracted text of every element of `list`.
        var joinAll = function ( list ) {
            var texts = [];
            for (var i = 0, length = list.length; i < length; i++) {
                texts.push(extractText(list[i]));
            }
            return texts.join('');
        };

        dp("extract: " + print_r(node));
        if (node === null || node === undefined) {
            // A missing piece simply has no text.
            return '';
        } else if (typeof node === 'string') {
            return node;
        } else if ($.isArray(node)) {
            return joinAll(node);
        } else if ( 'text' in node ) {
            return extractText(node.text);
        } else if ( 'content' in node ) {
            return extractText(node.content);
        } else if ( 'children' in node ) {
            return joinAll(node.children);
        } else {
            console.log("extract failed!" + print_r(node));
            throw ("extract failed: " + print_r(node));
        }
    };
}

start
  = e:block* newline* {
      // Flatten one level: a block result that is itself an array has its
      // elements spliced into the page's child list, any other result is
      // appended unchanged.
      var children = [];
      var total = e.length;
      var idx = 0;
      while (idx < total) {
          var blockResult = e[idx];
          idx++;
          if ($.isArray(blockResult)) {
              children = children.concat(blockResult);
          } else {
              children.push(blockResult);
          }
      }
      return { type: 'page', children: children };
  }

// A block-level construct or, failing that, inline content (which may
// span newlines).
anyblock = block / inline
// Same, but the inline fallback never crosses a newline.
anyblockline = block / inlineline


// All chars that cannot start syntactic structures
// Matches a run of one or more such characters and returns it as one string.
text = t:[A-Za-z0-9,._ -]+ { return t.join('') }

// One or more spaces or tabs, returned as a single string.
space
  = s:[ \t]+ { return s.join(''); }


// Start of line
// Matches a newline, or the very beginning of the input (pos === 0, which
// consumes nothing), optionally followed by a comment and one newline.
// The result is whatever the optional trailing group yields: when a comment
// was present, that is the result of the optional newline after it.
sol = (newline / & { return pos === 0; } { return true; })
      cn:(comment n:newline? { return n })? {
          return cn;
      }


// Line terminator: LF or CRLF.
newline
  = '\n' / '\r\n'

// A block-level construct. Alternatives are tried in this order:
//   1. a structured block (see block_lines), optionally preceded by a line
//      that holds nothing but spaces/tabs
//   2. a paragraph
//   3. a comment
//   4. a bare start-of-line
block
  = (sol space* &newline)? block_lines
  / para
  / comment
  / sol

// Block constructs that are recognised by their line prefix: headings,
// tables, lists and space-indented preformatted text.
block_lines
  = h
  / table
  / lists
  / pre_indent


/* Headings */
// Try the deepest level first. PEG choice is ordered and every deeper marker
// starts with '=', so with h1 first a line such as "== x ==" is accepted by
// h1 (content "= x =", closed by its last '=') and h2..h6 can never match a
// balanced heading. With the longest marker first, a level that does not fit
// fails and falls through to the next shorter one.
h = h6 / h5 / h4 / h3 / h2 / h1

// Level-1 heading: '=' at the start of a line, inline content, then a closing
// '=' directly before a newline.
// The predicate raises the 'h' and 'h1' flags so that inline_breaks stops the
// content in front of the closing marker. Both branches lower the flags
// again; the second branch runs when the first one fails and returns null
// (presumably a failed match in this PEG.js version -- confirm).
h1 = sol '='
    (
     & { setFlag('h'); return setFlag('h1') }
     c:inlineline '=' &newline {
         clearFlag('h');
         clearFlag('h1');
         return {
             type: 'heading',
             attributes: {level: 1},
             content: extractInline(c)
         }
      }
    / { clearFlag('h'); clearFlag('h1'); return null }
    )

// Level-2 heading: '==' at the start of a line, inline content, then a
// closing '==' directly before a newline.
// The predicate raises the 'h' and 'h2' flags so that inline_breaks stops the
// content in front of the closing marker. Both branches lower the flags
// again; the second branch runs when the first one fails and returns null
// (presumably a failed match in this PEG.js version -- confirm).
h2 = sol '=='
    (
     & { setFlag('h'); return setFlag('h2') }
     c:inlineline '==' &newline {
         clearFlag('h');
         clearFlag('h2');
         return {
             type: 'heading',
             attributes: {level: 2},
             content: extractInline(c)
         }
      }
    / { clearFlag('h'); clearFlag('h2'); return null }
    )

// Level-3 heading: '===' at the start of a line, inline content, then a
// closing '===' directly before a newline.
// The predicate raises the 'h' and 'h3' flags so that inline_breaks stops the
// content in front of the closing marker. Both branches lower the flags
// again; the second branch runs when the first one fails and returns null
// (presumably a failed match in this PEG.js version -- confirm).
h3 = sol '==='
    (
     & { setFlag('h'); return setFlag('h3') }
     c:inlineline '===' &newline {
         clearFlag('h');
         clearFlag('h3');
         return {
             type: 'heading',
             attributes: {level: 3},
             content: extractInline(c)
         }
     }
    / { clearFlag('h'); clearFlag('h3'); return null }
    )

// Level-4 heading: '====' at the start of a line, inline content, then a
// closing '====' directly before a newline.
// The predicate raises the 'h' and 'h4' flags so that inline_breaks stops the
// content in front of the closing marker. Both branches lower the flags
// again; the second branch runs when the first one fails and returns null
// (presumably a failed match in this PEG.js version -- confirm).
h4 = sol '===='
    (
     & { setFlag('h'); return setFlag('h4') }
     c:inlineline '====' &newline {
         clearFlag('h');
         clearFlag('h4');
         return {
             type: 'heading',
             attributes: {level: 4},
             content: extractInline(c)
         }
     }
    / { clearFlag('h'); clearFlag('h4'); return null }
    )

// Level-5 heading: '=====' at the start of a line, inline content, then a
// closing '=====' directly before a newline.
// The predicate raises the 'h' and 'h5' flags so that inline_breaks stops the
// content in front of the closing marker. Both branches lower the flags
// again; the second branch runs when the first one fails and returns null
// (presumably a failed match in this PEG.js version -- confirm).
h5 = sol '====='
    (& { setFlag('h'); return setFlag('h5') }
     c:inlineline '=====' &newline {
         clearFlag('h');
         clearFlag('h5');
         return {
             type: 'heading',
             attributes: {level: 5},
             content: extractInline(c)
         }
     }
    / { clearFlag('h'); clearFlag('h5'); return null }
    )

// Level-6 heading: '======' at the start of a line, inline content, then a
// closing '======' directly before a newline.
// The predicate raises the 'h' and 'h6' flags so that inline_breaks stops the
// content in front of the closing marker. Both branches lower the flags
// again; the second branch runs when the first one fails and returns null
// (presumably a failed match in this PEG.js version -- confirm).
h6 = sol '======'
    (& { setFlag('h'); return setFlag('h6') }
     c:inlineline '======' &newline {
         clearFlag('h');
         clearFlag('h6');
         return {
             type: 'heading',
             attributes: {level: 6},
             content: extractInline(c)
         }
     }
    / { clearFlag('h'); clearFlag('h6'); return null }
    )

// A run of one or more '=' characters.
// NOTE(review): not referenced by any heading rule visible in this file.
heading_marker
  = '=' '='*

// Repeated inlineline matches, each one starting at a position that is not a
// heading_marker followed by a newline; the results are joined with ''.
// NOTE(review): inlineline returns an array of nodes, so join('') stringifies
// those arrays rather than concatenating text -- confirm this is intended.
// Not referenced by any rule visible in this file.
heading_text
  = h:( !(heading_marker newline) x:inlineline { return x } )* { return h.join(''); }


// TODO: convert inline content to annotations!
// A paragraph: one or more lines of inline content (see para_lines),
// optionally preceded by a start-of-line whose remainder is only whitespace
// (see br).
para
  = (sol br)? para_lines

// One line of inline content followed, recursively, by further paragraph
// lines for as long as the next line does not start a block_lines construct.
// Everything matched (the start-of-line result, this line and all
// continuation results) is flattened to plain text by extractInline.
para_lines
  = s:sol c:inlineline cs:(!block_lines para_lines)* {
      return {
          type: 'paragraph',
          content: extractInline([s].concat([c]).concat(cs))
      }
  }

// Optional spaces/tabs that run up to (but do not consume) a newline;
// yields a 'br' node.
br = space* &newline { return {type: 'br'} }

// Preformatted block: one or more consecutive space-indented lines
// (see pre_indent_line), flattened to text by extractInline.
pre_indent
  = l:pre_indent_line+ {
      return {
          type: 'pre',
              content: extractInline(l)
      }
  }
// A line that starts with at least one space or tab; yields the inline
// content after that leading whitespace.
pre_indent_line = sol space l:inlineline { return l }

// Syntax that stops inline expansion
// Each alternative is guarded by a predicate on a syntaxFlags counter and
// therefore only applies while the matching environment is active:
//   table     - newline followed by '!' or '|', or one of '||', '!!', '|}'
//   italic    - the italic marker
//   bold      - the bold marker
//   linkdesc  - the link end marker
//   h + hN    - the level-N heading marker directly followed by a newline
// The rule is only used under the '!' (negative lookahead) operator in the
// rules of this file, so what it matches is never consumed by them.
inline_breaks
  = //& { console.log(print_r(syntaxFlags)); return true; }
    & { return syntaxFlags['table']; }
    a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + print_r(a)); return true; }
  / & { return syntaxFlags['italic']; } italic_marker { return true; }
  / & { return syntaxFlags['bold']; } bold_marker { return true; }
  / & { return syntaxFlags['linkdesc']; } link_end { return true; }
  / & { return syntaxFlags['h']; }
        ( & { return syntaxFlags['h1'] } '=' newline { return true; }
        / & { return syntaxFlags['h2'] } '==' newline { return true; }
        / & { return syntaxFlags['h3'] } '===' newline { return true; }
        / & { return syntaxFlags['h4'] } '====' newline { return true; }
        / & { return syntaxFlags['h5'] } '=====' newline { return true; }
        / & { return syntaxFlags['h6'] } '======' newline { return true; }
        )


inline
  = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ {
      // Merge each run of adjacent string pieces into a single text node;
      // every non-string piece is passed through unchanged, in order.
      var nodes = [];
      var pending = '';
      var flushPending = function () {
          if (pending.length) {
              nodes.push({
                  type: 'text',
                  text: pending
              });
              pending = '';
          }
      };
      for (var idx = 0; idx < c.length; idx++) {
          var piece = c[idx];
          if (typeof piece == 'string') {
              pending += piece;
          } else {
              flushPending();
              nodes.push(piece);
          }
      }
      // Emit whatever text is left after the last non-string piece.
      flushPending();
      return nodes;
}

// Inline content confined to a single line: runs of plain text plus, where
// no inline break applies, inline elements or any single non-newline char.
//
// The second alternative is a sequence, so without a label and action it
// would yield ['', x] (lookahead result plus match) instead of x. Single
// characters would then never pass the string test below and never be
// merged into the surrounding text. Returning x directly fixes that.
inlineline
  = c:(text / !inline_breaks x:(inline_element / [^\n]) { return x; })+ {
      var out = [];
      var text = '';
      //dp("inlineline: " + print_r(c));
      for (var i = 0; i < c.length; i++) {
          if (typeof c[i] == 'string') {
              // Accumulate adjacent string pieces.
              text += c[i];
          } else {
              // An element node ends the current text run.
              if (text.length) {
                  out.push({
                      type: 'text',
                      text: text
                  });
                  text = '';
              }
              out.push(c[i]);
          }
      }
      if (text.length) {
          // NOTE(review): unlike the text nodes pushed above, the trailing
          // run carries no `type` member; kept as is -- confirm which shape
          // is intended.
          out.push({
              text: text
              //annotations: []
          });
      }
      return out;
}

/* TODO: convert all these to annotations!
 * -> need (start, end) offsets within block
 */
// Any structured inline construct; alternatives are tried in the order listed.
inline_element
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold
  / italic

// An HTML-style comment. Further comments that follow on subsequent lines
// (separated only by spaces/tabs and one newline) are consumed as part of
// the same match, but only the text of the first comment is returned.
comment
  = '<!--' c:comment_chars* '-->'
    (space* newline space* comment)* {
        return {
            type: 'comment',
            text: c.join('')
        }
    }

// One character of comment body: anything other than '-', or a '-' that is
// not the first character of the closing '-->'.
comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }

// Bracketed external link: "[", a url, exactly one space, the link text
// and "]".
extlink
  = "[" target:url " " text:extlink_text "]" {
      return {
          type: 'extlink',
              target: target,
              text: text
      }
  }

//  = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } }

// A URL: the literal scheme "http:" followed by one or more characters that
// are neither a space nor ']'. No other scheme is recognised by this rule.
url
  = proto:"http:" rest:([^ \]]+) { return proto + rest.join(''); }

// External link caption: one or more characters other than ']', as a string.
extlink_text
  = c:[^\]]+ { return c.join(''); }

// Template transclusion: "{{", a target, zero or more "|"-separated
// parameters and "}}". The `params` member is only added to the result when
// at least one parameter was matched.
template
  = "{{" target:link_target params:("|" p:template_param { return p })* "}}" {
      var obj = {
          type: 'template',
          target: target
      };
      if (params && params.length) {
          obj.params = params;
      }
      return obj;
  }

// A template parameter: either "name=value" (result has `name` and
// `content`) or a bare value (result has `content` only).
template_param
  = name:template_param_name "=" c:template_param_text {
      return {
          name: name,
              content: c
      };
  } / c:template_param_text {
      return {
          content: c
      };
  }

// Triple-brace template argument: "{{{", a name, zero or more "|"-separated
// parameters and "}}}". The `params` member is only added to the result when
// at least one parameter was matched.
// NOTE(review): no rule visible in this file references tplarg.
tplarg
  = "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
      var obj = {
          type: 'tplarg',
          name: name
      };
      if (params && params.length) {
          obj.params = params;
      }
      return obj;
  }

// Parameter name: characters other than '=' and '|', stopping in front of
// "}}"; returned as a string (possibly empty).
template_param_name
  = h:( !"}}" x:([^=|]) { return x } )* { return h.join(''); }

// Parameter value: a (possibly empty) list of chunks, each either a nested
// inline construct or a single character (see template_param_text_chunk).
template_param_text
  = template_param_text_chunk*
/*  = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }*/

// One chunk of a parameter value: any of the structured inline constructs
// or, failing those, a single character that is not '|' and does not start
// a closing "}}".
template_param_text_chunk
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold
  / italic
  / !"}}" x:([^|]) { return x }

// Internal link: "[[", a target, zero or more "|"-prefixed texts and "]]".
// `text` is a list of ("|", link_text) pairs; only the link_text of the
// first pair is stored on the result, any further pairs are dropped.
link
  = "[[" target:link_target text:("|" link_text)* "]]" {
      var obj = {
          type: 'link',
          target: target
      };
      if (text && text.length) {
          obj.text = text[0][1]; // ehhhh
      }
      return obj;
  }

// Link (and template) target: characters other than '|', stopping in front
// of "]]"; returned as a string (possibly empty).
link_target
  = h:( !"]]" x:([^|]) { return x } )* { return h.join(''); }

// Link caption. The 'linkdesc' flag is raised exactly once for the whole
// caption and lowered exactly once in the action, so the counter is balanced.
// Raising it inside the repetition would run once more than the number of
// matches (the final failing attempt still evaluates the predicate) and
// leave the flag permanently set after any link with a caption.
// The predicate always succeeds and the repetition cannot fail, so the
// action always runs and no clean-up alternative is needed.
// NOTE(review): h holds inlineline results (arrays of nodes), so join('')
// stringifies those arrays; return value kept unchanged -- confirm what
// callers expect here.
link_text
  = & { return setFlag('linkdesc'); }
    h:inlineline* {
        clearFlag('linkdesc');
        return h.join('');
    }

// Closing marker of an internal link; also used by inline_breaks while the
// 'linkdesc' flag is set.
link_end = "]]"

// Bold span: bold marker, single-line inline content, bold marker.
// The predicate raises the 'bold' flag so that inline_breaks stops the
// content at the closing marker; the action lowers it again. If the first
// alternative fails after the opening marker, the second alternative lowers
// the flag and returns null (presumably a failed match in this PEG.js
// version -- confirm).
bold
  = bold_marker
    & { dp('benter:' + pos); return setFlag('bold'); }
    c:inlineline
    bold_marker {
        clearFlag('bold');
        return {
            type: 'b',
            content: {text: c}
        }
    }
  / bold_marker { clearFlag('bold'); return null }

// Three apostrophes open and close a bold span.
bold_marker
  = "'''"


// Italic span: italic marker, single-line inline content, italic marker.
// The predicate raises the 'italic' flag so that inline_breaks stops the
// content at the closing marker; the action lowers it again. If the first
// alternative fails after the opening marker, the second alternative lowers
// the flag and returns null (presumably a failed match in this PEG.js
// version -- confirm).
italic
  = italic_marker
    & { dp('ienter:' + pos); return setFlag('italic'); }
    c:inlineline
    italic_marker {
            clearFlag('italic');
            dp('ileave:' + pos);
            return {
                type: 'i',
                content: {text: c}
            }
        }
  / italic_marker { clearFlag('italic'); return null }

// Two apostrophes open and close an italic span.
italic_marker
  = "''"

/* Will need to check anything xmlish against known/allowed HTML tags and
 * registered extensions, otherwise fail the match. Should ref be treated as a
 * regular extension? */
// Currently only the <ref> and <references> tags are recognised.
xmlish_tag = ref / references

// A <ref> tag, either with content and a closing tag or self-closing.
ref = ref_full / ref_empty

/* Can we do backreferences to genericize this? */
// <ref ...>content</ref>: yields an 'ext' node carrying the opening tag's
// params and trailing whitespace, the content list and the closing tag text.
ref_full
  = start:ref_start ">" content:ref_content* close:ref_end {
      return {
          type: 'ext',
              name: 'ref',
              params: start.params,
              ws: start.ws,
              content: content,
              close: close
      }
}

// Self-closing <ref ... />: yields an 'ext' node without content; `close`
// holds the raw match of the optional spaces and "/>".
ref_empty
  = start:ref_start close:(space* "/>") {
      return {
          type: 'ext',
          name: 'ref',
          ws: start.ws,
          params: start.params,
          close: close
      }
}

// Opening part of a ref tag up to (not including) ">" or "/>": the literal
// "<ref", any attributes, then optional whitespace.
ref_start
  = "<ref" params:ext_param* ws:space* {
      return {
          params: params,
          ws: ws
      };
}

// Closing ref tag, "</ref", optional whitespace, ">"; the matched parts are
// joined into one string.
ref_end
  = all:("</ref" space* ">") {
  return all.join('');
}

// One piece of ref content: inline content that does not start at a closing
// ref tag.
// NOTE(review): the lookahead is only checked at the start of each piece and
// inline_breaks has no alternative for "</ref", so a single inline match
// looks able to run past the closing tag -- confirm.
ref_content
  = !ref_end a:(inline) {
  return a;
}

/* fixme probably have to programmatically add these */
// A <references> tag, either with content and a closing tag or self-closing.
references = references_full / references_empty

// <references ...>content</references>: yields an 'ext' node carrying the
// opening tag's params and trailing whitespace, the content list and the
// closing tag text.
references_full
  = start:references_start ">" content:references_content* close:references_end {
      return {
          type: 'ext',
          name: 'references',
          params: start.params,
          ws: start.ws,
          content: content,
          close: close
      }
  }

// Self-closing <references ... />: yields an 'ext' node without content;
// `close` holds the raw match of the optional spaces and "/>".
references_empty
  = start:references_start close:(space* "/>") {
      return {
          type: 'ext',
          name: 'references',
          ws: start.ws,
          params: start.params,
          close: close
      }
  }

// Opening part of a references tag up to (not including) ">" or "/>": the
// literal "<references", any attributes, then optional whitespace.
references_start
  = "<references" params:ext_param* ws:space* {
      return {
          params: params,
          ws: ws
      };
  }

// Closing references tag, "</references", optional whitespace, ">"; the
// matched parts are joined into one string.
references_end
  = all:("</references" space* ">") {
      return all.join('');
  }

// One piece of references content: inline content that does not start at a
// closing references tag.
// NOTE(review): the lookahead is only checked at the start of each piece and
// inline_breaks has no alternative for "</references", so a single inline
// match looks able to run past the closing tag -- confirm.
references_content
  = !references_end a:(inline) {
      return a;
  }


// One tag attribute, name=value, with optional leading whitespace. The
// result is the value object (see ext_param_val) with the attribute name
// stored on it as `name`.
ext_param
  = space* name:ext_param_name "=" val:ext_param_val {
      val.name = name;
      return val;
  }

// Attribute name: one or more letters, digits or '-', as a string.
ext_param_name
  = name:[a-zA-Z0-9-]+ {
  return name.join('');
}

// Attribute value in one of three forms: unquoted alphanumerics, single-
// quoted or double-quoted text. Quoted values may not contain their own
// quote character or '>'; for them the quote character is recorded in `quote`.
ext_param_val
  = t:[0-9A-Za-z]+ { return {text: t.join('') } }
  / "'" t:[^'>]+ "'" { return { quote: "'", text: t.join('') } }
  / '"' t:[^">]+ '"' { return { quote: '"', text: t.join('') } }

lists = es:(dtdd / li)+
{
    // Some entries are arrays of items rather than a single item. Splice the
    // elements of those into one flat child list and append all other
    // entries unchanged.
    var items = [];
    for (var n = 0; n < es.length; n++) {
        var entry = es[n];
        if ($.isArray(entry)) {
            Array.prototype.push.apply(items, entry);
        } else {
            items.push(entry);
        }
    }
    return { type: 'list', children: items };
}

// A single list line: start of line, one or more list prefix characters and
// single-line inline content that runs up to (not including) a newline.
// The prefix characters are translated to style names by bulletsToTypes.
li = sol
     bullets:list_char+
     c:inlineline
     &newline
{
    return {
        type: 'listItem',
        attributes: {
            styles: bulletsToTypes(bullets)
        },
        content: extractInline(c)
    };
}

// Definition-list line that carries both parts, e.g. ";term:description".
// Yields two list items: the part before the first ':' with the prefix as
// written, and the part after it with the prefix's final ';' replaced by ':'.
dtdd = sol
       bullets:list_char+
       c:(inline_element / [^:\n])+
       ":"
       d:(inline_element / [^\n])+
       &newline
{
    // reject rule if bullets do not end in semicolon
    if (bullets[bullets.length - 1] !== ';') {
        return null;
    }

    // Build the description prefix as an array. Adding ':' to the sliced
    // array with '+' would coerce it to a comma-joined string, which only
    // happened to work because unknown characters are ignored downstream.
    var descBullets = bullets.slice(0, bullets.length - 1).concat([':']);

    return [
        {
            type: 'listItem',
            attributes: {styles: bulletsToTypes(bullets)},
            content: extractInline(c)
        }, {
            type: 'listItem',
            attributes: {styles: bulletsToTypes(descBullets)},
            content: extractInline(d)
        }
    ];
}


// A list prefix character: '*', '#', ':' or ';'.
list_char = [*#:;]


/* Tables */

// A complete table: start line, optional caption, optional body, end marker.
// The children are the caption (when present) followed by the body rows.
// Unmatched optional parts are compared against '' here (presumably what
// this PEG.js version yields for them -- confirm). Any start-line attributes
// are stored unparsed.
// NOTE(review): table_start raises the 'table' flag and only table_end
// lowers it, so a table that fails to match after its start line appears to
// leave the flag raised -- confirm.
table
  = tas:table_start c:table_caption? b:table_body? table_end {
      var res = {type: 'table'}
      var body = b !== '' ? b : [];
      if (c !== '') {
          res.children = [c].concat(body);
      } else {
          res.children = body;
      }
      if (tas.length > 0) {
          // FIXME: actually parse and build structure
          res.attributes = { unparsed: tas }
      }
      //dp(print_r(res));
      return res;
  }

// Table opening line: "{|" at the start of a line followed by attributes.
// The predicate raises the 'table' flag before the attributes are read, so
// that inline_breaks applies its table stops from here on. The result is
// the list of attribute pieces. The second alternative lowers the flag and
// returns null.
table_start
  =  sol
     "{|"
     & { setFlag('table'); return true; }
     ta:table_attribs*
     space* {
         //dp("table_start " + print_r(ta) + ", pos:" + pos);
         return ta;
     }
  / sol "{|" { clearFlag('table'); return null; }

// One piece of the (unparsed) table attributes: a text run, or any single
// character that is neither a newline nor the start of an inline break.
table_attribs
 = text / ! inline_breaks !newline .

// Table caption: a newline, "|+" and inline content. Only the first inline
// match is kept as the caption content.
table_caption
  = newline
    "|+" c:inline* {
        return {
            type: 'tableCaption',
                content: c[0]
        }
    }

// Table body: either a first row given without a preceding "|-" row marker
// followed by regular rows, or regular rows only. Returns the list of rows.
table_body
  = & { dp("table_body enter"); return true; }
    firstrow:table_firstrow otherrows:table_row* {
      /* dp('table first and otherrows: '
       * + print_r([firstrow].concat(otherrows))); */
      return [firstrow].concat(otherrows);
  }
  / otherrows:table_row* {
      //dp('table otherrows: ' + print_r(otherrows));
      return otherrows;
  }

// First row without a row marker: one or more data cells.
table_firstrow
  = td:table_data+ {
      return {
          type: 'tableRow',
          children: td
      };
  }

// A regular row: newline, "|-", optional attributes (matched but not
// stored), optional whitespace, then any number of data or header cells.
table_row
  = & { dp("table row enter"); return true; }
    newline
    "|-" thtd_attribs? space* td:(table_data / table_header)* {
        return {
            type: 'tableRow',
                children: td
        };
    }

// A data cell: introduced by "||" or by a newline plus "|", and not directly
// followed by '}', '+' or '-' (which would make it a table end, caption or
// row marker). Optional attributes are stored unparsed; the content is a
// list of block or inline matches, each starting outside an inline break.
table_data
  = & { dp("table_data enter, pos=" + pos); return true; }
    ("||" / newline "|")
    ! [}+-]
    a:thtd_attribs?
    td:(!inline_breaks anyblock)* {
        dp("table data result: " + print_r(td) + ", attribts: " + print_r(a));
        return {
            type: 'tableCell',
                attributes: { unparsed: a },
                children: td
        };
    }

// A header cell: introduced by "!!" or by a newline plus "!", with optional
// attributes (stored unparsed) and inline content as children.
table_header
  = ("!!" / newline "!")
     a:thtd_attribs?
     c:inline {
         return {
             type: 'tableHeading',
                 attributes: { unparsed: a },
                 children: c
         }
     }

// Cell attributes: one or more text runs or listed punctuation characters
// (outside inline breaks), terminated by a single "|" that is not followed
// by '|', '}', '+' or '-'. Returns the list of matched pieces.
thtd_attribs
  // In particular, do not match [|\n]
  = a:(text / ! inline_breaks [="':;/,.-] )+ "|" ! [|}+-] {
  return a;
  }


// Table closing marker "|}", optionally preceded by a newline; lowers the
// 'table' flag.
table_end = newline? "|}" { clearFlag('table'); }


/* Wikidom TODO:
 * split off text into content nodes
 * convert inlines into annotations
 * change contents into children
 */

/* Tabs do not mix well with the hybrid production syntax */
/* vim: et:ts=4:sw=4:cindent */