mediawiki-extensions-Visual.../modules/parser/pegParser.pegjs.txt

/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */

{
    // Debug tracing hook: forwards msg to console.log only when the
    // hard-coded switch below is flipped to true.
    var dp = function ( msg ) {
        var tracingEnabled = false;
        if ( !tracingEnabled ) {
            return;
        }
        console.log(msg);
    };

    /*
     * Flags for specific parse environments (inside tables, links etc). Flags
     * trigger syntactic stops in the inline_breaks production, which
     * terminates inline and attribute matches. Flags merely reduce the number
     * of productions needed: The grammar is still context-free as the
     * productions can just be unrolled for all combinations of environments
     * at the cost of a much larger grammar.
     *
     * Each flag is a nesting counter: setFlag increments it, clearFlag
     * decrements it, and a non-zero count means the environment is active.
     */
    var syntaxFlags = {};

    // Enter the environment named by flag. Always returns true so the call
    // can be used directly as the result of a semantic predicate.
    var setFlag = function(flag) {
        // Anything that is not a positive count (missing, 0, NaN) restarts at 1.
        if (syntaxFlags[flag] > 0) {
            syntaxFlags[flag]++;
        } else {
            syntaxFlags[flag] = 1;
        }
        return true;
    };

    // Leave the environment named by flag. Returns undefined.
    var clearFlag = function(flag) {
        // Only decrement an active flag: an unmatched clear must not push
        // the counter to NaN or below zero, since a negative count is
        // truthy and would keep the environment switched on.
        if (syntaxFlags[flag] > 0) {
            syntaxFlags[flag]--;
        }
    };


    /* Temporary debugging help. Is there anything similar in JS or a library? */
    // Renders arr as an indented, PHP print_r-like string. level is the
    // current nesting depth (defaults to 0); nested objects are rendered
    // two levels deeper. Non-objects are rendered as "=>value<=(type)".
    var print_r = function (arr, level) {
        if (typeof arr != 'object') {
            // Strings/Chars/Numbers etc.
            return "=>" + arr + "<=(" + typeof arr + ")";
        }

        var depth = level ? level : 0;

        // Builds `count` repetitions of the two-space indent unit.
        var indent = function (count) {
            var pad = "";
            for (var k = 0; k < count; k++) {
                pad += "  ";
            }
            return pad;
        };

        var itemPad = indent(depth + 1);
        var bracketPad = indent(depth);

        // Array/Hashes/Objects
        var pieces = ["Array\n", bracketPad + "(\n"];
        for (var key in arr) {
            var entry = arr[key];
            if (typeof entry == 'object') {
                pieces.push(itemPad + "[" + key + "] => " + print_r(entry, depth + 2));
            } else {
                pieces.push(itemPad + "[" + key + "] => '" + entry + "'\n");
            }
        }
        pieces.push(bracketPad + ")\n\n");

        return pieces.join("");
    };

    // Convert list prefixes to a list of WikiDom list styles.
    // Each of * # ; : contributes one style name, in order; any other
    // element of bullets contributes nothing.
    var bulletsToTypes = function (bullets) {
        var styles = [];
        var count = bullets.length;
        var idx = 0;
        while (idx < count) {
            var ch = bullets[idx];
            idx += 1;
            if (ch === '*') {
                styles.push('bullet');
            } else if (ch === '#') {
                styles.push('number');
            } else if (ch === ';') {
                styles.push('definitionterm');
            } else if (ch === ':') {
                styles.push('definitiondescription');
            }
        }
        return styles;
    };
}

// Entry rule: parses the whole input as a sequence of blocks and wraps the
// results in a 'page' node.
start
  = e:block* {
        // A list block can contain multiple lists and then yields an array,
        // so splice array results into the flat child list one level deep.
        var children = [];
        var idx = 0;
        while (idx < e.length) {
            var node = e[idx];
            idx += 1;
            if (!$.isArray(node)) {
                children.push(node);
            } else {
                children = children.concat(node);
            }
        }
        var page = { type: 'page' };
        page.children = children;
        return page;
    }

// A block-level construct or, failing that, inline content.
anyblock = block / inline
// Same as anyblock, but the inline fallback is inlineline.
anyblockline = block / inlineline


// All chars that cannot start syntactic structures
// Matches one or more of them and returns the run as a single string.
text = t:[A-Za-z0-9,._ -]+ { return t.join('') }

// One or more spaces or tabs, returned as a single string.
space
  = s:[ \t]+ { return s.join(''); }


// Start of line: either a newline or the very beginning of the input
// (pos === 0), optionally followed by a comment.
sol = (newline / & { return pos === 0; } { return true; }) comment?

// A line break: LF or CRLF.
newline
  = '\n' / '\r\n'

// A single block-level construct. Alternatives are tried in order; the
// 'newline &newline' alternative consumes one newline only when another
// newline follows it (the second one is lookahead and stays unconsumed).
block
  = block_lines
  / para
  / br
  / newline &newline
  / comment

// Block constructs whose rules all begin with sol: headings, tables, lists
// and indented preformatted text.
block_lines
  = h
  / table
  / lists
  / pre_indent


/* Headings */
// Tries the heading levels 1 through 6 in that order.
h = h1 / h2 / h3 / h4 / h5 / h6

// Level-1 heading: '=' heading_text '=' at start of line. A newline must
// follow but is not consumed. The heading body is returned under 'content',
// the same key the h2-h6 rules use for their body.
h1 = sol
     '=' c:heading_text '=' &newline {
    return {
      type: 'heading',
      attributes: {level: 1},
      content: c
    }
}

// Level-2 heading: '==' heading_text '==' at start of line; a newline must
// follow but is not consumed.
h2 = sol
     '==' c:heading_text '==' &newline {
    var node = { type: 'heading' };
    node.attributes = { level: 2 };
    node.content = c;
    return node;
}

// Level-3 heading, delimited by '==='.
h3 = sol
     '===' c:heading_text '===' &newline {
    var node = { type: 'heading' };
    node.attributes = { level: 3 };
    node.content = c;
    return node;
}

// Level-4 heading, delimited by '===='.
h4 = sol
     '====' c:heading_text '====' &newline {
    var node = { type: 'heading' };
    node.attributes = { level: 4 };
    node.content = c;
    return node;
}

// Level-5 heading, delimited by '====='.
h5 = sol
     '=====' c:heading_text '=====' &newline {
    var node = { type: 'heading' };
    node.attributes = { level: 5 };
    node.content = c;
    return node;
}

// Level-6 heading, delimited by '======'.
h6 = sol
     '======' c:heading_text '======' &newline {
    var node = { type: 'heading' };
    node.attributes = { level: 6 };
    node.content = c;
    return node;
}

// One or more consecutive '=' characters.
heading_marker
  = '=' '='*

// Heading body: zero or more inlineline matches, each attempted only at a
// position where no heading_marker starts; results are combined with join('').
// NOTE(review): inlineline returns an array of nodes, so join('') stringifies
// those arrays instead of producing plain text - confirm this is intended.
heading_text
  = h:( !heading_marker x:(inlineline) { return x } )* { return h.join(''); }

// A newline that is not followed by another newline yields a 'br' node.
br
  = newline !newline { return {type: 'br'} }

// TODO: convert inline content to annotations!
// A paragraph: one line of inline content at start of line, followed by any
// number of further paragraphs that do not begin a block_lines construct.
// NOTE(review): the continuation matches (cs) are consumed but never used,
// and only the first element of c ends up in the node - confirm whether the
// remaining content is meant to be dropped.
para
  = sol c:inlineline cs:(!block_lines para)* { return {type: 'paragraph', content: c[0] } }

// Indented preformatted text: one or more pre_indent_line matches wrapped
// in a single 'pre' node.
pre_indent 
  = l:pre_indent_line+ {
      var node = { type: 'pre' };
      node.children = l;
      return node;
  }
// One preformatted line: start of line, leading whitespace, then the line's
// inline content (which is the value returned).
pre_indent_line = sol space l:inlineline { return l }

// Syntax that stops inline expansion
// Each alternative is guarded by a predicate on syntaxFlags, so a stop
// sequence only counts while the matching environment flag is truthy:
//   table    -> newline followed by '!' or '|', or '||', '!!', '|}'
//   italic   -> italic_marker
//   bold     -> bold_marker
//   linkdesc -> link_end
// Within this file the rule is only used under '!' (negative lookahead).
inline_breaks
  = //& { console.log(print_r(syntaxFlags)); return true; }
    & { return syntaxFlags['table']; } 
    a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + print_r(a)); return true; }
  / & { return syntaxFlags['italic']; } italic_marker { return true; }
  / & { return syntaxFlags['bold']; } bold_marker { return true; }
  / & { return syntaxFlags['linkdesc']; } link_end { return true; }

// Inline content that may span lines: plain text runs, inline elements, or
// any single character that does not start an inline break. Consecutive
// string pieces are merged into {type: 'text'} nodes; everything else is
// passed through unchanged, in order.
inline
  = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ {
    var nodes = [];
    var pending = '';
    // Emit the accumulated text (if any) as one text node.
    var flush = function () {
      if (pending.length) {
        nodes.push({
          type: 'text',
          text: pending
        });
        pending = '';
      }
    };
    for (var k = 0; k < c.length; k++) {
      var piece = c[k];
      if (typeof piece != 'string') {
        flush();
        nodes.push(piece);
      } else {
        pending += piece;
      }
    }
    flush();
    return nodes;
}

// Inline content restricted to the current line: plain text runs, inline
// elements, or any single non-newline character that does not start an
// inline break. Returns an array in which consecutive string pieces are
// merged; other results are passed through unchanged, in order.
//
// The second alternative labels its payload and returns it from an action.
// Without that, the sequence '!inline_breaks (...)' would yield a
// ["", value] pair for every character or element, and those pairs would
// never be merged into text the way the inline rule does.
inlineline
  = c:(text / !inline_breaks x:(inline_element / [^\n]) { return x; })+ {
    var out = [];
    var text = '';
    dp("inlineline: " + print_r(c));
    for (var i = 0; i < c.length; i++) {
      if (typeof c[i] == 'string') {
        text += c[i];
      } else {
        if (text.length) {
          out.push({
            type: 'text',
            text: text
          });
          text = '';
        }
        out.push(c[i]);
      }
    }
    if (text.length) {
      // NOTE(review): unlike the flush inside the loop, this trailing piece
      // carries no 'type' key; para and li use c[0] directly as content.
      // Confirm which shape consumers expect before unifying the two.
      out.push({
        text: text
        //annotations: []
      });
    }
    return out;
}

/* TODO: convert all these to annotations!
 * -> need (start, end) offsets within block
 */
// Structured inline constructs, tried in this order.
inline_element
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold
  / italic

// An HTML-style comment '<!-- ... -->' with at least one body character;
// yields a 'comment' node holding the body text.
comment
  = '<!--' c:comment_chars+ '-->' {
	var body = c.join('');
	var node = { type: 'comment' };
	node.text = body;
	return node;
}

// One comment body character: anything but '-', or a '-' that is not
// immediately followed by '->'.
comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }

// Bracketed external link: '[' url, a single space, the label, ']'.
extlink
  = "[" target:url " " text:extlink_text "]" {
    var node = { type: 'extlink' };
    node.target = target;
    node.text = text;
    return node;
}

//  = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } }

// An 'http:' URL: the literal scheme followed by one or more characters
// other than space and ']'; returned as one string.
url
  = proto:"http:" rest:([^ \]]+) { return proto + rest.join(''); }

// External link label: one or more characters other than ']', as one string.
extlink_text
  = c:[^\]]+ { return c.join(''); }

// Template transclusion: '{{' target ('|' param)* '}}'. Yields a 'template'
// node with the target string and, when any were given, a params array.
//
// The target is matched by template_target rather than link_target:
// link_target only stops at '|' or ']]', so it would swallow the closing
// '}}' and a parameterless '{{foo}}' could never match.
template
  = "{{" target:template_target params:("|" p:template_param { return p })* "}}" {
    var obj = {
      type: 'template',
      target: target
    };
    if (params && params.length) {
      obj.params = params;
    }
    return obj;
}

// Template name: every character up to the first '|' or the closing '}}',
// returned as one string.
template_target
  = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }

// One template parameter: either 'name=value' or a bare positional value.
template_param
  = name:template_param_name "=" c:template_param_text {
  // Named form.
  var named = { name: name };
  named.content = c;
  return named;
} / c:template_param_text {
  // Positional form: content only.
  var positional = {};
  positional.content = c;
  return positional;
}

// Template argument reference: '{{{' name ('|' param)* '}}}'. Yields a
// 'tplarg' node with the name string and, when any were given, a params array.
//
// The name is matched by tplarg_name rather than link_target: link_target
// only stops at '|' or ']]', so it would swallow the closing '}}}' and a
// bare '{{{name}}}' could never match.
tplarg = "{{{" name:tplarg_name params:("|" p:template_param { return p })* "}}}" {
    var obj = {
      type: 'tplarg',
      name: name
    };
    if (params && params.length) {
      obj.params = params;
    }
    return obj;
}

// Argument name: every character up to the first '|' or the closing '}}}',
// returned as one string.
tplarg_name
  = h:( !"}}}" x:([^|]) { return x } )* { return h.join(''); }

// Parameter name: characters other than '=' and '|', stopping before '}}';
// returned as one string (possibly empty).
template_param_name
  = h:( !"}}" x:([^=|]) { return x } )* { return h.join(''); }

// Parameter value: zero or more chunks, returned as an array.
template_param_text
  = template_param_text_chunk*
/*  = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }*/

// One piece of a parameter value: a structured inline construct, or a
// single character other than '|' that does not begin '}}'.
template_param_text_chunk
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold
  / italic
  / !"}}" x:([^|]) { return x }

// Internal link: '[[' target ('|' text)* ']]'. Yields a 'link' node with
// the target and, when at least one text part was given, the first one.
link
  = "[[" target:link_target text:("|" link_text)* "]]" {
    var node = { type: 'link' };
    node.target = target;
    // Every entry of text is a ["|", link_text] pair; only the first
    // pair's text is kept.
    var hasText = text && text.length;
    if (hasText) {
      node.text = text[0][1]; // ehhhh
    }
    return node;
}

// Link target: characters other than '|', stopping before ']]'; returned
// as one string (possibly empty).
link_target
  = h:( !"]]" x:([^|]) { return x } )* { return h.join(''); }

// Link description text. The 'linkdesc' flag is raised exactly once before
// the content is matched (so inline_breaks stops at link_end) and lowered
// exactly once afterwards. Raising it inside the repetition would also
// raise it on the final, failing attempt and leave the counter one too
// high after every link that has text.
// No cleanup alternative is needed: 'inlineline*' cannot fail, so the
// sequence always reaches the action once the flag has been set.
// NOTE(review): inlineline returns an array of nodes, so join('') stringifies
// those arrays - confirm what shape link expects for its text.
link_text
  = & { return setFlag('linkdesc'); }
    h:inlineline*
    {
        clearFlag('linkdesc');
        return h.join('');
    }

// Closing delimiter of an internal link.
link_end = "]]"

// Bold run: marker, line content, marker. The 'bold' flag is raised after
// the opening marker so inline_breaks stops the content at the closing
// marker. If the first alternative fails after raising the flag, the second
// alternative lowers it again and returns null.
bold
  = bold_marker 
    & { dp('benter:' + pos); return setFlag('bold'); }
    c:inlineline
    bold_marker {
        clearFlag('bold');
        var node = { type: 'b' };
        node.text = c;
        return node;
    }
  / bold_marker { clearFlag('bold'); return null }

// Three apostrophes.
bold_marker
  = "'''"


// Italic run: marker, line content, marker. The 'italic' flag is raised
// after the opening marker so inline_breaks stops the content at the closing
// marker. If the first alternative fails after raising the flag, the second
// alternative lowers it again and returns null.
italic
  = italic_marker 
    & { dp('ienter:' + pos); return setFlag('italic'); }
        c:inlineline 
        italic_marker {
            clearFlag('italic');
            dp('ileave:' + pos);
            var node = { type: 'i' };
            node.text = c;
            return node;
        }
  / italic_marker { clearFlag('italic'); return null }

// Two apostrophes.
italic_marker
  = "''"

/* Will need to check anything xmlish against known/allowed HTML tags and
 * registered extensions, otherwise fail the match. Should ref be treated as a
 * regular extension? */
// Currently only the <ref> and <references> tags are recognised.
xmlish_tag =
    ref /
    references

// A <ref> tag, either with content and a closing tag or self-closing.
ref = ref_full / ref_empty

/* Can we do backreferences to genericize this? */
// '<ref ...>' content '</ref>': yields an 'ext' node carrying the start
// tag's params and whitespace, the content pieces and the closing tag text.
ref_full
  = start:ref_start ">" content:ref_content* close:ref_end {
    var node = { type: 'ext', name: 'ref' };
    node.params = start.params;
    node.ws = start.ws;
    node.content = content;
    node.close = close;
    return node;
}

// Self-closing '<ref ... />': yields an 'ext' node carrying the start tag's
// whitespace and params plus the raw match of the closing part.
ref_empty
  = start:ref_start close:(space* "/>") {
    var node = { type: 'ext', name: 'ref' };
    node.ws = start.ws;
    node.params = start.params;
    node.close = close;
    return node;
}

// Opening '<ref' with its attributes and any trailing whitespace; the
// closing '>' or '/>' is left to the caller.
ref_start
  = "<ref" params:ext_param* ws:space* {
  var head = {};
  head.params = params;
  head.ws = ws;
  return head;
}

// Closing '</ref>' tag, allowing whitespace before '>'; the matched parts
// are joined into one string.
ref_end
  = all:("</ref" space* ">") {
  return all.join('');
}

// One piece of ref content: inline content starting at a position where
// the closing tag does not start.
// NOTE(review): the lookahead only guards the starting position; whether
// inline itself stops before a later '</ref>' is not evident from this rule.
ref_content
  = !ref_end a:(inline) {
  return a;
}

/* FIXME: probably have to programmatically add these */
// A <references> tag, either with content and a closing tag or self-closing.
references = references_full / references_empty

// '<references ...>' content '</references>': yields an 'ext' node carrying
// the start tag's params and whitespace, the content pieces and the closing
// tag text.
references_full
  = start:references_start ">" content:references_content* close:references_end {
    var node = { type: 'ext', name: 'references' };
    node.params = start.params;
    node.ws = start.ws;
    node.content = content;
    node.close = close;
    return node;
}

// Self-closing '<references ... />': yields an 'ext' node carrying the start
// tag's whitespace and params plus the raw match of the closing part.
references_empty
  = start:references_start close:(space* "/>") {
    var node = { type: 'ext', name: 'references' };
    node.ws = start.ws;
    node.params = start.params;
    node.close = close;
    return node;
}

// Opening '<references' with its attributes and any trailing whitespace;
// the closing '>' or '/>' is left to the caller.
references_start
  = "<references" params:ext_param* ws:space* {
  var head = {};
  head.params = params;
  head.ws = ws;
  return head;
}

// Closing '</references>' tag, allowing whitespace before '>'; the matched
// parts are joined into one string.
references_end
  = all:("</references" space* ">") {
  return all.join('');
}

// One piece of references content: inline content starting at a position
// where the closing tag does not start.
references_content
  = !references_end a:(inline) {
  return a;
}


// One tag attribute 'name=value' with optional leading whitespace. The
// value object produced by ext_param_val is tagged with the attribute name
// and returned.
ext_param
  = space* name:ext_param_name "=" val:ext_param_val {
  var param = val;
  param.name = name;
  return param;
}

// Attribute name: one or more letters, digits or '-', as one string.
ext_param_name
  = name:[a-zA-Z0-9-]+ {
  return name.join('');
}

// Attribute value: a bare alphanumeric token, or a single- or double-quoted
// string that contains neither its quote character nor '>'. Quoted forms
// record which quote character was used.
ext_param_val
  = t:[0-9A-Za-z]+ { return {text: t.join('') } }
  / "'" t:[^'>]+ "'" { return { quote: "'", text: t.join('') } }
  / '"' t:[^">]+ '"' { return { quote: '"', text: t.join('') } }

// A run of list lines. dtdd yields an array of two items while li yields a
// single item, so the results are flattened one level into the children of
// one 'list' node.
lists = es:(dtdd / li)+
{  
    var items = [];
    var idx = 0;
    while (idx < es.length) {
        var entry = es[idx];
        idx += 1;
        if (!$.isArray(entry)) {
            items.push(entry);
        } else {
            items = items.concat(entry);
        }
    }
    var node = { type: 'list' };
    node.children = items;
    return node;
}

// One list line: prefix characters followed by the line's inline content.
// A newline must follow but is not consumed. The prefix is translated into
// style names; the first element of the content becomes the item content.
li = sol
     bullets:list_char+ 
     c:inlineline
     &newline 
{
    var item = { type: 'listItem' };
    item.attributes = { styles: bulletsToTypes(bullets) };
    item.content = c[0];
    return item;
}

// Definition term and description on one line: '<prefix>;term:description'.
// Returns two listItem nodes (term, then description), or null - rejecting
// the match - when the prefix does not end in ';'.
// NOTE(review): c and d may contain inline_element results (objects);
// join('') stringifies those - confirm that plain text is all that is
// expected here.
dtdd = sol
       bullets:list_char+
       c:(inline_element / [^:\n])+
       ":"
       d:(inline_element / [^\n])+
       &newline 
{
    // reject rule if bullets do not end in semicolon
    var last = bullets.length - 1;
    if (bullets[last] != ';') {
        return null;
    }
    // The description keeps the outer prefixes but swaps the trailing ';'
    // for ':'. Build a real array for this: 'array + string' would coerce
    // the prefixes into one comma-joined string.
    var descBullets = bullets.slice(0, last).concat([':']);
    return [
        {
            type: 'listItem',
            attributes: {styles: bulletsToTypes(bullets)},
            content: {text: c.join('')}
        }, {
            type: 'listItem',
            attributes: {styles: bulletsToTypes(descBullets)},
            content: {text: d.join('')}
        }
    ];
}


// A single list prefix character.
list_char = [*#:;]


/* Tables */

// A whole table: start, optional caption, optional body, end. An optional
// part that did not match arrives as ''. The caption, when present, becomes
// the first child; unparsed start attributes are attached when non-empty.
table 
  = tas:table_start c:table_caption? b:table_body? table_end { 
      var rows = (b === '') ? [] : b;
      var node = { type: 'table' };
      node.children = (c === '') ? rows : [c].concat(rows);
      if (tas.length > 0) {
          // FIXME: actually parse and build structure
          node.attributes = { unparsed: tas };
      }
      //dp(print_r(node));
      return node;
  }

// Opening of a table: '{|' at start of line. The first alternative raises
// the 'table' flag, then collects attribute pieces and trailing spaces and
// returns the collected pieces. The second alternative lowers the flag
// again and returns null.
table_start 
  =  sol
     "{|"
     & { setFlag('table'); return true; }
     ta:table_attribs* 
     space* { 
         //dp("table_start " + print_r(ta) + ", pos:" + pos);
         return ta;
     }
  / sol "{|" { clearFlag('table'); return null; }

// One piece of the table's attribute text: a text run, or any single
// character that neither starts an inline break nor is a newline.
table_attribs 
 = text / ! inline_breaks !newline .

// Table caption: a newline, '|+', then inline content. Only the first
// inline match is stored as the caption content.
table_caption 
  = newline
    "|+" c:inline* { 
      return {
          type: 'tableCaption',
          content: c[0]
      }
  }

// Table rows: either an implicit first row (cells with no preceding '|-')
// followed by explicit rows, or explicit rows only. Returns an array of rows.
table_body 
  = & { dp("table_body enter"); return true; }
    firstrow:table_firstrow otherrows:table_row* { 
      /* dp('table first and otherrows: ' 
       * + print_r([firstrow].concat(otherrows))); */
      return [firstrow].concat(otherrows); 
  }
  / otherrows:table_row* {
      //dp('table otherrows: ' + print_r(otherrows));
      return otherrows;
  }

// Implicit first row: one or more data cells wrapped in a 'tableRow' node.
table_firstrow 
  = td:table_data+ { 
      return {
          type: 'tableRow',
          children: td
      };
  }

// Explicit row: a newline, '|-', optional attributes and spaces, then any
// number of data or header cells wrapped in a 'tableRow' node.
// NOTE(review): the row attributes are matched but not stored in the node.
table_row 
  = & { dp("table row enter"); return true; }
    newline
    "|-" thtd_attribs? space* td:(table_data / table_header)* { 
      return {
          type: 'tableRow',
          children: td
      };
  }

// Data cell: introduced by '||' or by a newline plus '|', and not followed
// by '}', '+' or '-'. Optional attributes are kept unparsed; the cell body
// is any sequence of blocks/inline content up to the next inline break.
table_data 
  = & { dp("table_data enter, pos=" + pos); return true; }
    ("||" / newline "|") 
    ! [}+-]
    a:thtd_attribs? 
    td:(!inline_breaks anyblock)* { 
      dp("table data result: " + print_r(td) + ", attribts: " + print_r(a));
      var cell = { type: 'tableCell' };
      cell.attributes = { unparsed: a };
      cell.children = td;
      return cell;
  }

// Header cell: introduced by '!!' or by a newline plus '!'. Optional
// attributes are kept unparsed; the body is inline content.
table_header
  = ("!!" / newline "!") 
     a:thtd_attribs?
     c:inline {
         var heading = { type: 'tableHeading' };
         heading.attributes = { unparsed: a };
         heading.children = c;
         return heading;
     }

// Cell or row attributes: one or more text runs or attribute punctuation
// characters, terminated by a single '|' that is not followed by '|', '}',
// '+' or '-'. Returns the collected pieces as an array.
thtd_attribs
  // In particular, do not match [|\n]
  = a:(text / ! inline_breaks [="':;/,.-] )+ "|" ! [|}+-] {
        return a;
    }


// Closing '|}' of a table, optionally preceded by a newline; lowers the
// 'table' flag.
table_end = newline? "|}" { clearFlag('table'); }


/* Wikidom TODO:
 * split off text into content nodes
 * convert inlines into annotations
 * change contents into children
 */