mediawiki-extensions-Visual.../modules/parser/pegParser.pegjs.txt

/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
{
	/* Fixme: use static functions to separate module! Unfortunately, this
	 * does not work:
	 * var tu = require('./mediawiki.tokenizer.utils.js');
	 * console.log(tu.flatten([]));
	 * Using exports in the module gets a bit further, but accesses to
	 * tu.flatten in productions still fail. Thus, I just moved the functions
	 * here until a solution is found:
	 */

	/* Static utilities */

	/**
	 * Flatten arbitrarily nested arrays into a single flat array.
	 *
	 * Uses the built-in Array.isArray instead of jQuery's $.isArray so the
	 * helper does not depend on a `$` global, and appends into a single
	 * accumulator instead of re-copying it with concat() for every nested
	 * list.
	 *
	 * @param {Array} e Possibly nested list
	 * @returns {Array} New array holding all non-array members in order
	 */
	var flatten = function ( e ) {
		var es = [];
		// Walk one list, descending into sub-arrays and collecting leaves
		var collect = function ( list ) {
			for (var i = 0, length = list.length; i < length; i++) {
				var ei = list[i];
				if (Array.isArray(ei)) {
					collect(ei);
				} else {
					es.push(ei);
				}
			}
		};
		collect(e);
		return es;
	};

	/**
	 * Remove escaped quotes from attributes etc.
	 *
	 * Every occurrence of a backslash followed by the quote character is
	 * replaced by the bare quote character. (String.replace with a string
	 * pattern would only touch the first occurrence.)
	 *
	 * @param {string} quotec The quote character that may appear escaped
	 * @param {string} text Raw attribute text
	 * @returns {string} Text with all `\<quotec>` sequences unescaped
	 */
	var unquote = function (quotec, text) {
		return text.split('\\' + quotec).join(quotec);
	};


	// Debug tracing helper: forwards msg to console.log only while the
	// switch below is turned on (it is hard-wired off).
    var dp = function ( msg ) {
        var debugEnabled = false;
        if ( !debugEnabled ) {
            return;
        }
        console.log(msg);
    };

    // Pretty-print a value as JSON, indented by two spaces per level.
    var pp = function ( s ) {
        var indentWidth = 2;
        return JSON.stringify(s, null, indentWidth);
    };

	/*
	 * Annotate a token stream with list items with appropriate list tokens
	 *
	 * 'listItem' start tags carry a `bullets` array (e.g. ['*', '#']). Each
	 * one is replaced by the ul/ol/dl and li/dt/dd start and end tags needed
	 * to move from the previous item's bullets to the new ones. The
	 * surrounding 'list' start tag is dropped and the 'list' end tag closes
	 * everything that is still open. All other tokens pass through.
	 *
	 * Lists that are deeper than the common bullet prefix are closed before
	 * a new item or list is opened, so de-indenting ('**' followed by '*')
	 * or switching list type at the same depth ('*' followed by '#')
	 * produces properly nested tags.
	 *
	 * @static
	 * @method
	 * @param {[tokens]}   Token stream with li tokens
	 * @returns {[tokens]} Token stream, possibly with additional list tokens
	 * @throws {Error} If a bullet character other than * # ; : is seen
	 * */
	var annotateList = function ( tokens ) {
		var out = [],    // List of tokens
			bstack = [], // Bullet stack, previous element's listStyle
			bnext = [],  // Next element's listStyle
			endtags = [];  // Stack of end tags

		// Number of leading bullets shared by x and y
		var commonPrefixLength = function (x, y) {
			var minLength = Math.min(x.length, y.length);
			for(var i = 0; i < minLength; i++) {
				if (x[i] != y[i])
					break;
			}
			return i;
		};

		// Open a list and its first item, remembering the matching end tags
		var pushList = function ( listName, itemName ) {
			out.push({type: 'TAG', name: listName});
			out.push({type: 'TAG', name: itemName});
			endtags.push({type: 'ENDTAG', name: listName});
			endtags.push({type: 'ENDTAG', name: itemName});
		};

		// Close the n innermost open lists
		var popTags = function ( n ) {
			for(;n > 0; n--) {
				// push list item..
				out.push(endtags.pop());
				// and the list end tag
				out.push(endtags.pop());
			}
		};

		// Close the innermost open item and open a sibling of the same kind
		var nextItem = function () {
			var itemToken = endtags.pop();
			out.push(itemToken);
			out.push({type: 'TAG', name: itemToken.name});
			endtags.push({type: 'ENDTAG', name: itemToken.name});
		};

		// True if a and b are ':' and ';' in either order
		var isDlDd = function (a, b) {
			var ab = [a,b].sort();
			return (ab[0] === ':' && ab[1] === ';');
		};

		var doListItem = function ( bs, bn ) {
			var prefixLen = commonPrefixLength (bs, bn);
			// Count levels that differ on either side, so that a shorter
			// bn (de-indent) is not mistaken for "same list, next item".
			var changeLen = Math.max(bs.length, bn.length) - prefixLen;
			if (changeLen === 0) {
				// same bullets: next item in the current list
				nextItem();
			} else if ( bs.length == bn.length
						&& changeLen == 1
						&& isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
				// handle dd/dt transitions
				out.push(endtags.pop());
				var newName = (bn[prefixLen] == ';') ? 'dt' : 'dd';
				out.push({type: 'TAG', name: newName});
				endtags.push({type: 'ENDTAG', name: newName});
			} else {
				// emit close tag tokens for closed lists
				popTags(bs.length - prefixLen);
				if (prefixLen > 0 && bn.length == prefixLen) {
					// pure de-indent: continue the enclosing list
					nextItem();
				}
				for(var i = prefixLen; i < bn.length; i++) {
					switch (bn[i]) {
						case '*':
							pushList('ul', 'li');
							break;
						case '#':
							pushList('ol', 'li');
							break;
						case ';':
							pushList('dl', 'dt');
							break;
						case ':':
							pushList('dl', 'dd');
							break;
						default:
							throw new Error("Unknown node prefix " + bn[i]);
					}
				}
			}
		};

		for (var i = 0, length = tokens.length; i < length; i++) {
			var token = tokens[i];
			switch ( token.type ) {
				case 'TAG':
					switch (token.name) {
						case 'list':
							// ignore token
							break;
						case 'listItem':
							// convert listItem to list and list item tokens
							bnext = token.bullets;
							doListItem( bstack, bnext );
							bstack = bnext;
							break;
						default:
							// pass through all remaining start tags
							out.push(token);
							break;
					}
					break;
				case 'ENDTAG':
					if ( token.name == 'list' ) {
						// pop all open list item tokens
						popTags(bstack.length);
						bstack = [];
					} else {
						out.push(token);
					}
					break;
				default:
					out.push(token);
					break;
			}
		}
		return out;
	};


	/* End static utilities */

    /*
     * Flags for specific parse environments (inside tables, links etc). Flags
     * trigger syntactic stops in the inline_breaks production, which
     * terminates inline and attribute matches. Flags merely reduce the number
     * of productions needed: The grammar is still context-free as the
     * productions can just be unrolled for all combinations of environments
     * at the cost of a much larger grammar.
     *
     * Each flag is a nesting counter: setFlag increments it, clearFlag
     * decrements it, and a flag counts as set while its counter is truthy.
     * The counter is kept at zero or above: clearing a flag that is not set
     * is a no-op. (A plain decrement would turn an unset flag into NaN, which
     * can never be incremented back to a truthy value, and would make
     * negative counters read as "set".)
     */
    var syntaxFlags = {};

    // Enter an environment. Always returns true so it can be used directly
    // as the result of a semantic predicate.
    var setFlag = function(flag) {
        if (syntaxFlags[flag] > 0) {
            syntaxFlags[flag]++;
        } else {
            // unset, zero or otherwise unusable counter: start at one
            syntaxFlags[flag] = 1;
        }
        return true;
    };

    // Leave an environment. Returns nothing.
    var clearFlag = function(flag) {
        if (syntaxFlags[flag] > 0) {
            syntaxFlags[flag]--;
        }
    };

    // Start position of top-level block
    // Could also provide positions for lower-level blocks using a stack.
    // Written by the toplevelblock rule's predicate before each block is
    // parsed and read back by its action to build the 'data-sourcePos'
    // attribute.
    var blockStart = 0;
}

// Grammar entry point: a sequence of top-level blocks followed by any
// trailing newlines, returned as one flat token list.
start
  = blocks:toplevelblock* newline* {
      var tokens = flatten(blocks);
      return tokens;
  }

// Block-level content, falling back to inline content (the `inline` rule,
// whose catch-all branch accepts any character).
anyblock = block / inline
// Like anyblock, but the inline fallback is `inlineline`, whose catch-all
// branch does not accept a newline character.
anyblockline = block / inlineline


// Run of plain characters (letters, digits, comma, dot, underscore, space,
// hyphen), none of which can start a syntactic structure; yields one string.
text = chars:[A-Za-z0-9,._ -]+ { return chars.join(''); }

// One or more spaces or tabs, returned as a single string.
space
  = blanks:[ \t]+ {
      return blanks.join('');
  }


// Start of line
// Matches either a newline or the very beginning of the input (pos === 0),
// optionally followed by a comment and an optional newline after it.
// The result is the [comment, newline] pair when a comment is present;
// otherwise it is whatever the parser yields for an unmatched optional
// (callers in this file test it for truthiness or compare it with '').
sol = (newline / & { return pos === 0; } { return true; })
      cn:(c:comment n:newline? { return [c, n] })? {
          return cn;
      }


// Line terminator: LF or CRLF.
newline
  = '\n' / '\r\n'

// One top-level block. The predicate records where the block starts; the
// action flattens the block's tokens and tags the first token object with
// a ['data-sourcePos', 'start:end'] attribute.
//
// The flattened list may begin with plain strings (for example the result
// of an optional sub-match that did not match), so the first element is not
// assumed to be a token: the attribute goes on the first object found, and
// is skipped if the block holds no token object at all.
toplevelblock
  = & { blockStart = pos; return true; } b:block {
    b = flatten(b);
    dp('toplevelblock:' + pp(b));
    for (var i = 0; i < b.length; i++) {
        var bs = b[i];
        if (bs !== null && typeof bs === 'object') {
            if (bs.attribs === undefined) {
                bs.attribs = [];
            }
            bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
            break;
        }
    }
    return b;
  }

// A block: a start-of-line structure (optionally preceded by a blank line),
// a paragraph, a comment, or - as a last resort - a bare line start, which
// is emitted as a NEWLINE token (preceded by the sol result when that is
// truthy).
block
  = (sol space* &newline)? lines:block_lines { return lines; }
  / para
  / comment
  / (leading:sol {
      var tokens = leading ? [leading] : [];
      tokens.push({type: 'NEWLINE'});
      return tokens;
    }
  )


// Block structures with start-of-line wiki syntax
// Tried in order: heading, table, list, indent-pre.
block_lines
  = h
  / table
  / lists
  / pre_indent


/* Headings */
// Try the longest marker first. A lower-level rule also matches lines with
// longer markers (h1 accepts '== x ==' as a level-1 heading whose content
// is '= x ='), so with h1 first the deeper levels could never be reached.
h = h6 / h5 / h4 / h3 / h2 / h1

// Level-1 heading: '=' content '=' on one line.
// While the content is parsed, the 'h' and 'h1' flags make inline_breaks
// stop at the closing marker; both branches clear the flags again. The
// second branch returns null to reject the match (the same convention the
// dtdd rule uses).
// Only the heading tokens are returned: the sol result and the literal
// opening marker are kept out of the token stream.
h1 = sol '='
    r:(
     & { setFlag('h'); return setFlag('h1') }
     c:inlineline '=' comment? &newline {
         clearFlag('h');
         clearFlag('h1');
         return [{type: 'TAG', name: 'h1'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h1'}]);
      }
    / { clearFlag('h'); clearFlag('h1'); return null }
    ) { return r; }

// Level-2 heading: '==' content '==' on one line.
// While the content is parsed, the 'h' and 'h2' flags make inline_breaks
// stop at the closing marker; both branches clear the flags again. The
// second branch returns null to reject the match (the same convention the
// dtdd rule uses).
// Only the heading tokens are returned: the sol result and the literal
// opening marker are kept out of the token stream.
h2 = sol '=='
    r:(
     & { setFlag('h'); return setFlag('h2') }
     c:inlineline '==' comment?  &newline {
         clearFlag('h');
         clearFlag('h2');
         return [{type: 'TAG', name: 'h2'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h2'}]);
      }
    / { clearFlag('h'); clearFlag('h2'); return null }
    ) { return r; }

// Level-3 heading: '===' content '===' on one line.
// While the content is parsed, the 'h' and 'h3' flags make inline_breaks
// stop at the closing marker; both branches clear the flags again. The
// second branch returns null to reject the match (the same convention the
// dtdd rule uses).
// Only the heading tokens are returned: the sol result and the literal
// opening marker are kept out of the token stream.
h3 = sol '==='
    r:(
     & { setFlag('h'); return setFlag('h3') }
     c:inlineline '===' comment? &newline {
         clearFlag('h');
         clearFlag('h3');
         return [{type: 'TAG', name: 'h3'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h3'}]);
      }
    / { clearFlag('h'); clearFlag('h3'); return null }
    ) { return r; }

// Level-4 heading: '====' content '====' on one line.
// While the content is parsed, the 'h' and 'h4' flags make inline_breaks
// stop at the closing marker; both branches clear the flags again. The
// second branch returns null to reject the match (the same convention the
// dtdd rule uses).
// Only the heading tokens are returned: the sol result and the literal
// opening marker are kept out of the token stream.
h4 = sol '===='
    r:(
     & { setFlag('h'); return setFlag('h4') }
     c:inlineline '====' comment? &newline {
         clearFlag('h');
         clearFlag('h4');
         return [{type: 'TAG', name: 'h4'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h4'}]);
      }
    / { clearFlag('h'); clearFlag('h4'); return null }
    ) { return r; }

// Level-5 heading: '=====' content '=====' on one line.
// While the content is parsed, the 'h' and 'h5' flags make inline_breaks
// stop at the closing marker; both branches clear the flags again. The
// second branch returns null to reject the match (the same convention the
// dtdd rule uses).
// Only the heading tokens are returned: the sol result and the literal
// opening marker are kept out of the token stream.
h5 = sol '====='
    r:(
     & { setFlag('h'); return setFlag('h5') }
     c:inlineline '=====' comment? &newline {
         clearFlag('h');
         clearFlag('h5');
         return [{type: 'TAG', name: 'h5'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h5'}]);
      }
    / { clearFlag('h'); clearFlag('h5'); return null }
    ) { return r; }

// Level-6 heading: '======' content '======' on one line.
// While the content is parsed, the 'h' and 'h6' flags make inline_breaks
// stop at the closing marker; both branches clear the flags again. The
// second branch returns null to reject the match (the same convention the
// dtdd rule uses).
// Only the heading tokens are returned: the sol result and the literal
// opening marker are kept out of the token stream.
h6 = sol '======'
    r:(
     & { setFlag('h'); return setFlag('h6') }
     c:inlineline '======' comment? &newline {
         clearFlag('h');
         clearFlag('h6');
         return [{type: 'TAG', name: 'h6'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h6'}]);
      }
    / { clearFlag('h'); clearFlag('h6'); return null }
    ) { return r; }

// One or more '=' characters.
heading_marker
  = '=' '='*

// Inline content up to a heading marker that is directly followed by a
// newline; the matched pieces are joined into one string.
// NOTE(review): no rule in the part of the grammar shown here references
// this production - confirm whether it is still needed.
heading_text
  = h:( !(heading_marker newline) x:inlineline { return x } )* { return h.join(''); }


// TODO: convert inline content to annotations!
// A paragraph: an optional leading blank line (sol followed by br, whose
// results are discarded) and then the paragraph lines.
para
  = (sol br)? lines:para_lines {
      return lines;
  }

// One line of inline content plus any following lines that do not start a
// block_lines structure, wrapped in p start/end tags. A non-empty sol
// result (a leading comment) is kept right after the start tag.
para_lines
  = lead:sol first:inlineline rest:(!block_lines para_lines)* {
      var open = {type: 'TAG', name: 'p'};
      var close = {type: 'ENDTAG', name: 'p'};
      var head = (lead !== '') ? [open, lead] : [open];
      return head.concat(first, rest, [close]);
  }

// Optional blanks up to (but not consuming) a newline; yields a
// self-closing br token.
br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }

// One or more consecutive indented lines, wrapped in pre start/end tags.
pre_indent
  = lines:pre_indent_line+ {
      var open = {type: 'TAG', name: 'pre'};
      var close = {type: 'ENDTAG', name: 'pre'};
      return [open].concat(lines, [close]);
  }
// A line that starts with blanks; yields only the inline content after them.
pre_indent_line = sol space l:inlineline { return l }

// Syntax that stops inline expansion
// Each alternative is guarded by a predicate on syntaxFlags, so a marker
// only acts as a stop while the matching environment is active. Rules use
// this production behind a negative lookahead (!inline_breaks).
inline_breaks
  = //& { console.log(pp(syntaxFlags)); return true; }
    // inside a table: newline plus '!' or '|', or a cell / table-end marker
    & { return syntaxFlags['table']; }
    a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a)); return true; }
  // inside italic / bold / link description: the respective closing marker
  / & { return syntaxFlags['italic']; } italic_marker { return true; }
  / & { return syntaxFlags['bold']; } bold_marker { return true; }
  / & { return syntaxFlags['linkdesc']; } link_end { return true; }
  // inside a heading: the closing marker of the active level, directly
  // followed by a newline
  / & { return syntaxFlags['h']; }
        ( & { return syntaxFlags['h1'] } '=' newline { return true; }
        / & { return syntaxFlags['h2'] } '==' newline { return true; }
        / & { return syntaxFlags['h3'] } '===' newline { return true; }
        / & { return syntaxFlags['h4'] } '====' newline { return true; }
        / & { return syntaxFlags['h5'] } '=====' newline { return true; }
        / & { return syntaxFlags['h6'] } '======' newline { return true; }
        )


// Inline content; the catch-all branch accepts any character that does not
// start an inline break. Consecutive string pieces are merged into TEXT
// tokens; every other piece is passed through as a token.
inline
  = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ {
      var out = [];
      var text = [];
      c = flatten(c);
      for (var i = 0; i < c.length; i++) {
          if (typeof c[i] == 'string') {
              text.push(c[i]);
          } else {
              if (text.length) {
                  out.push({ type: "TEXT", value: text.join('') });
                  text = [];
              }
              // push, not concat: concat returns a new array and leaves
              // `out` untouched, which silently dropped every token
              out.push(c[i]);
          }
      }
      if (text.length) {
          out.push({ type: 'TEXT', value: text.join('') });
      }
      return out;
}

// Inline content restricted to a single line: the catch-all branch accepts
// any character except a newline. String pieces are collected and emitted
// as one TEXT token whenever a non-string token (or the end) is reached.
inlineline
  = parts:(text / !inline_breaks (inline_element / [^\n]))+ {
      var tokens = [];
      var pending = [];
      // Emit the collected string pieces, if any, as a single TEXT token
      var flushText = function () {
          if (pending.length) {
              tokens.push({type: 'TEXT', value: pending.join('')});
              pending = [];
          }
      };
      var pieces = flatten(parts);
      for (var k = 0; k < pieces.length; k++) {
          var piece = pieces[k];
          if (typeof piece == 'string') {
              pending.push(piece);
              continue;
          }
          flushText();
          tokens.push(piece);
      }
      flushText();
      return tokens;
}

/* TODO: convert all these to annotations!
 * -> need (start, end) offsets within block
 */
// Inline constructs, tried in this order. bold is tried before italic;
// the bold marker (''') begins with the italic marker ('').
inline_element
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold
  / italic

// An HTML-style comment. Further comments that follow on subsequent lines
// (separated only by blanks and a newline) are consumed as well, but only
// the text of the first comment ends up in the returned COMMENT token.
comment
  = '<!--' c:comment_chars* '-->'
    (space* newline space* comment)* {
        return [{ type: 'COMMENT', value: c.join('') }];
    }

// A single character of comment text: anything but '-', or a '-' that is
// not followed by '->' (i.e. does not begin the closing '-->').
comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }

// External link of the form [url text]; yields an a start tag carrying the
// href, one TEXT token with the link text, and the a end tag.
extlink
  = "[" href:url " " label:extlink_text "]" {
      var open = { type: 'TAG', name: 'a', attribs: [['href', href]] };
      var body = {type: 'TEXT', value: label};
      var close = {type: 'ENDTAG', name: 'a'};
      return [open, body, close];
  }

//  = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } }

// The literal 'http:' followed by one or more characters other than space
// and ']'; returned as one string. No other protocol is accepted here.
url
  = proto:"http:" rest:([^ \]]+) { return proto + rest.join(''); }

// Link text of an external link: everything up to the closing ']', as one
// string.
extlink_text
  = c:[^\]]+ { return c.join(''); }

// Template inclusion {{target|param|...}}; yields a single self-closing
// 'template' token. The target is the first attribute; when parameters are
// present, the whole parameter list is appended as one further entry.
template
  = "{{" target:link_target args:("|" p:template_param { return p })* "}}" {
      var attribs = [['target', target]];
      if (args && args.length) {
          attribs.push(args);
      }
      return { type: 'SELFCLOSINGTAG', name: 'template', attribs: attribs };
  }

// A template parameter as a [name, value] pair; positional parameters
// (no 'name=' part) get null as their name.
template_param
  = name:template_param_name "=" c:template_param_text {
      return [name, c];
  } / c:template_param_text {
      return [null, c];
  }

// Template argument {{{name|param|...}}}; yields a single self-closing
// 'templatearg' token. The argument name is the first attribute; when
// parameters are present, the whole list is appended as one further entry.
tplarg
  = "{{{" name:link_target args:("|" p:template_param { return p })* "}}}" {
      var attribs = [['argname', name]];
      if (args && args.length) {
          attribs.push(args);
      }
      return {
            type: 'SELFCLOSINGTAG',
            name: 'templatearg',
            attribs: attribs
      };
  }

// Parameter name: characters other than '=' and '|', stopping before '}}';
// returned as one (possibly empty) string.
template_param_name
  = h:( !"}}" x:([^=|]) { return x } )* { return h.join(''); }

// Parameter value: zero or more chunks (see template_param_text_chunk).
// The result is the list of chunk results, not a joined string.
template_param_text
  = template_param_text_chunk*
/*  = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }*/

// One piece of a parameter value: an inline construct, or any single
// character other than '|' that does not begin the closing '}}'.
template_param_text_chunk
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold
  / italic
  / !"}}" x:([^|]) { return x }

// Internal link [[target]] or [[target|text]]; yields an a start tag with
// data-type and href attributes, the link text tokens, and the a end tag.
// Without a text part the target itself becomes the link text. Only the
// first '|'-separated text part is used.
link
  = "[[" target:link_target text:("|" link_text)* "]]" {
      var open = {
          type: 'TAG',
          name: 'a',
          attribs: [['data-type', 'internal'], ['href', target]]
      };
      var hasText = text && text.length;
      var textTokens = hasText
          ? text[0][1] // XXX
          : [{type: 'TEXT', value: target}];
      return [open].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
  }

// Link or template target: characters other than '|', stopping before
// ']]'; returned as one (possibly empty) string.
link_target
  = h:( !"]]" x:([^|]) { return x } )* { return h.join(''); }

// Description part of an internal link. The 'linkdesc' flag is raised
// exactly once before the content is parsed (so that inline_breaks stops
// at ']]') and lowered exactly once afterwards. Setting the flag inside the
// repetition raised it once per attempt - including the final, failing
// attempt - while clearing it only once, which left the flag set after
// every link that had a description.
// The repetition always succeeds, so no fallback alternative is needed.
// Yields the list of inlineline results (empty when there is no text).
link_text
  = & { return setFlag('linkdesc'); }
    h:inlineline*
    {
        clearFlag('linkdesc');
        return h;
    }

// Closing marker of an internal link; used by inline_breaks while the
// 'linkdesc' flag is set.
link_end = "]]"

/* This implementation of bold and italic is very basic so far, and misses the
 * finer points of doQuotes in the parser. A rough plan to get closer:
 * - '''' -> ' '''
 * - last ''''' in a row of ' is used
 * - if *both* italics and bolds are unbalanced, check for prefix
 *   - convert single-letter or multi-letter non-space prefixed tick back to
 *     text
 */
// Bold text: marker, content, then either a closing marker or the end of
// the line (the newline itself is not consumed). The 'bold' flag is set
// while the content is parsed so that inline_breaks stops at the closing
// marker. If the first alternative fails after the opening marker, the
// second one clears the flag again and rejects the match by returning null.
bold
  = bold_marker
    & { dp('benter:' + pos); return setFlag('bold'); }
    c:inlineline
    (bold_marker / &newline) {
        clearFlag('bold');
        return [{ type: 'TAG', name: 'b' }]
                .concat(c, [{type: 'ENDTAG', name: 'b'}]);
    }
  / bold_marker { clearFlag('bold'); return null }

// Three apostrophes open and close bold text.
bold_marker = "'''"


// Italic text: marker, content, then either a closing marker or the end of
// the line (the newline itself is not consumed). The 'italic' flag is set
// while the content is parsed so that inline_breaks stops at the closing
// marker. If the first alternative fails after the opening marker, the
// second one clears the flag again and rejects the match by returning null.
italic
  = italic_marker
    & { dp('ienter:' + pos); return setFlag('italic'); }
    c:inlineline
    (italic_marker / &newline) {
            clearFlag('italic');
            dp('ileave:' + pos);
            return [{ type: 'TAG', name: 'i' }]
                    .concat(c, [{ type: 'ENDTAG', name: 'i'}]);
    }
  / italic_marker { clearFlag('italic'); return null }

// Two apostrophes open and close italic text.
italic_marker = "''"

/* Will need to check anything xmlish against known/allowed HTML tags and
 * registered extensions, otherwise fail the match. Should ref be treated as a
 * regular extension? */
// Currently only the <ref> and <references> extension tags are recognized.
xmlish_tag = ref / references

// <ref ...>content</ref> or the self-closing <ref ... />.
ref = ref_full / ref_empty

/* Can we do backreferences to genericize this? */
// <ref ...>content</ref>: yields an 'ext' start tag (extension name,
// parsed parameters and the whitespace before '>' as attributes), the
// content, and the matching end tag. The end tag is named 'ext' like the
// start tag - and like the one emitted by references_full - so that the
// pair can be matched up by name.
ref_full
  = start:ref_start ">" content:ref_content* close:ref_end {
      return [
        { type: 'TAG',
          name: 'ext',
          attribs: [['data-extname', 'ref']]
              .concat(start.params, [['data-startws', start.ws]])},
        content,
        {type: 'ENDTAG', name: 'ext'}
      ];
}

// Self-closing <ref ... />: yields one self-closing 'ext' token whose
// attributes are the extension name, the parsed parameters and the
// whitespace captured by ref_start.
ref_empty
  = start:ref_start close:(space* "/>") {
      var attribs = [['data-extname', 'ref']];
      attribs = attribs.concat(start.params, [['data-startws', start.ws]]);
      var token = { type: 'SELFCLOSINGTAG', name: 'ext', attribs: attribs };
      return [token];
}

// Opening '<ref' with its parameters and trailing blanks (the closing '>'
// or '/>' is matched by the caller). Yields { params, ws } where params is
// the list of ext_param results and ws the list of space matches.
ref_start
  = "<ref" params:ext_param* ws:space* {
      return {
          params: params,
          ws: ws
      };
}

// Closing '</ref>' tag, blanks allowed before '>'.
// NOTE(review): `all` is the sequence result, whose middle element is the
// list of space matches, so join('') stringifies that nested list; the
// callers visible here do not use the returned value.
ref_end
  = all:("</ref" space* ">") {
  return all.join('');
}

// Inline content inside <ref>, attempted only where the closing tag does
// not start.
ref_content
  = !ref_end a:inline { // XXX: ineffective syntactic stop
  return a;
}

/* fixme probably have to programmatically add these */
// <references ...>content</references> or the self-closing form.
references = references_full / references_empty

// <references ...>content</references>: yields an 'ext' start tag (with
// the extension name, parsed parameters and captured whitespace as
// attributes), the content, and an 'ext' end tag.
references_full
  = start:references_start ">" content:references_content* close:references_end {
      var attribs = [['data-extname', 'references']];
      attribs = attribs.concat(start.params, [['data-startws', start.ws]]);
      var open = { type: 'TAG', name: 'ext', attribs: attribs };
      var end = { type: 'ENDTAG', name: 'ext' };
      return [open, content, end];
  }

// Self-closing <references ... />: yields one self-closing 'ext' token
// whose attributes are the extension name, the parsed parameters and the
// whitespace captured by references_start.
references_empty
  = start:references_start close:(space* "/>") {
      // The array literal must start on the same line as `return`:
      // with a line break after `return`, automatic semicolon insertion
      // ends the statement there and the action yields undefined.
      return [{ type: 'SELFCLOSINGTAG',
                name: 'ext',
                attribs: [['data-extname', 'references']]
                           .concat(start.params
                                  ,[['data-startws', start.ws]])
      }];
  }

// Opening '<references' with its parameters and trailing blanks (the
// closing '>' or '/>' is matched by the caller). Yields { params, ws }
// where params is the list of ext_param results and ws the list of space
// matches.
references_start
  = "<references" params:ext_param* ws:space* {
      return {
          params: params,
          ws: ws
      };
  }

// Closing '</references>' tag, blanks allowed before '>'.
// NOTE(review): `all` is the sequence result, whose middle element is the
// list of space matches, so join('') stringifies that nested list; the
// callers visible here do not use the returned value.
references_end
  = all:("</references" space* ">") {
      return all.join('');
  }

// Inline content inside <references>, attempted only where the closing tag
// does not start.
references_content
  = !references_end a:inline {
      return a;
  }


// One name=value parameter of an extension tag, after optional blanks.
// ext_param_val yields a pair whose first slot is empty; the parameter
// name is stored in that slot and the pair is returned.
ext_param
  = space* key:ext_param_name "=" pair:ext_param_val {
      pair[0] = key;
      return pair;
  }

// Parameter name: letters, digits and hyphens, as one string.
ext_param_name
  = name:[a-zA-Z0-9-]+ {
  return name.join('');
}

// Parameter value: a bare alphanumeric word, or a single- or double-quoted
// string. Yields [null, value]; the first slot is filled in by ext_param.
// The quoted forms exclude the quote character itself and '>' from the
// value, and pass the text through unquote().
ext_param_val
  = t:[0-9A-Za-z]+ { return [null, t.join('')]; }
  / "'" t:[^'>]+ "'" { return [null, unquote("'", t.join(''))]; }
  / '"' t:[^">]+ '"' { return [null, unquote('"', t.join(''))]; }

// A run of list lines. The items are flattened, bracketed by 'list'
// start/end tokens and handed to annotateList, which turns the listItem
// tokens into proper list and item tags.
lists = items:(dtdd / li)+
{
    var open = { type: 'TAG', name: 'list'};
    var close = { type: 'ENDTAG', name: 'list' };
    var stream = [open].concat(flatten(items), [close]);
    return annotateList(stream);
}

// One list line: bullets followed by inline content up to (but not
// including) the newline. Yields a listItem token carrying the bullets,
// followed by the content tokens.
li = sol
     marks:list_char+
     content:inlineline
     &newline
{
    var item = { type: 'TAG', name: 'listItem', bullets: marks };
    return [item, content];
}

// Definition term and definition on one line: ';term:definition'.
// Yields a listItem token with the original bullets, the term content, a
// second listItem token whose last bullet is replaced by ':', and the
// definition content. Plain characters are wrapped in one TEXT token each.
dtdd = sol
       bullets:list_char+
       c:(inline_element / (n:[^:\n] { return {type: 'TEXT', value: n}; }))+
       ":"
       d:(inline_element / (n:[^\n] { return {type: 'TEXT', value: n}; }))+
       &newline
{
    // reject rule if bullets do not end in semicolon
    if (bullets[bullets.length - 1] != ';') {
        return null;
    } else {
		// bullets for the definition part: same prefix, ':' instead of ';'
		var dtbullets = bullets.slice(0, bullets.length - 1);
		dtbullets.push(':');
        return [ { type: 'TAG', name: 'listItem', bullets: bullets } ]
                .concat( c
					,[{ type: 'TAG', name: 'listItem', bullets: dtbullets } ]
                    , d );
    }
}


// Bullet characters; annotateList maps them to ul (*), ol (#), dd (:) and
// dt (;).
list_char = [*#:;]


/* Tables */

// A complete table: start line, optional caption, optional body, end
// marker. Yields the table start tag (with the raw start-line attributes
// under 'data-unparsed' when there are any), the caption wrapped in
// caption tags when present, the body, and the table end tag.
table
  = startAttribs:table_start cap:table_caption? rows:table_body? table_end {
      var open = {type: 'TAG', name: 'table'};
      var body = (rows !== '') ? rows : [];
      dp("body: " + pp(body));
      if (startAttribs.length > 0) {
          // FIXME: actually parse and build structure
          open.attribs = [['data-unparsed', startAttribs.join('')]];
      }

      var caption = [];
      if (cap != '') {
          caption = [{type: 'TAG', name: 'caption'}]
                        .concat(cap, [{type: 'ENDTAG', name: 'caption'}]);
      }

      var close = {type: 'ENDTAG', name: 'table'};
      return [open].concat(caption, body, [close]);
  }

// Table start line '{|' with its raw attribute text. Sets the 'table' flag
// (cleared again by table_end) and yields the list of table_attribs
// matches. The second alternative clears the flag and rejects the match by
// returning null.
table_start
  =  sol
     "{|"
     & { setFlag('table'); return true; }
     ta:table_attribs*
     space* {
         //dp("table_start " + pp(ta) + ", pos:" + pos);
         return ta;
     }
  / sol "{|" { clearFlag('table'); return null; }

// A piece of the table start line: plain text, or any single character
// that is neither a newline nor the start of an inline break.
table_attribs
 = text / ! inline_breaks !newline .

// Caption line: a newline, '|+', then inline content. Yields the list of
// inline results.
table_caption
  = newline
    "|+" c:inline* {
        return c;
    }

// Table body: either an implicit first row (cells without a preceding
// '|-') followed by regular rows, or regular rows only. The second
// alternative also matches zero rows.
table_body
  = & { dp("table_body enter"); return true; }
    firstrow:table_firstrow otherrows:table_row* {
      /* dp('table first and otherrows: '
       * + pp([firstrow].concat(otherrows))); */
      return [firstrow].concat(otherrows);
  }
  / otherrows:table_row* {
      //dp('table otherrows: ' + pp(otherrows));
      return otherrows;
  }

// Implicit first row: one or more data cells not introduced by '|-',
// wrapped in tr start/end tags.
table_firstrow
  = cells:table_data+ {
      var open = { type: 'TAG', name: 'tr' };
      var close = {type: 'ENDTAG', name: 'tr'};
      return [open].concat(cells, [close]);
  }

// Explicit row: a newline, '|-', optional attributes and blanks, then any
// number of data or header cells, wrapped in tr start/end tags. The row
// attributes are matched but not included in the result.
table_row
  = & { dp("table row enter"); return true; }
    newline
    "|-" thtd_attribs? space* td:(table_data / table_header)* {
        return [{type: 'TAG', name: 'tr'}]
               .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
    }

// Data cell: introduced by '||' or by a newline followed by '|', and not
// followed by '}', '+' or '-' (table end, caption and row markers). The
// optional attribute text is stored unparsed under 'data-unparsed'; the
// content is any sequence of blocks that does not hit an inline break.
table_data
  = & { dp("table_data enter, pos=" + pos); return true; }
    ("||" / newline "|")
    ! [}+-]
    a:thtd_attribs?
    td:(!inline_breaks anyblock)* {
        dp("table data result: " + pp(td) + ", attribts: " + pp(a));
        return [{ type: 'TAG', name: 'td', attribs: [['data-unparsed', a]]}]
               .concat(td, [{type: 'ENDTAG', name: 'td'}]);
    }

// Header cell: introduced by '!!' or by a newline followed by '!'. The
// optional attribute text is stored unparsed under 'data-unparsed'; the
// content is inline only.
table_header
  = ("!!" / newline "!")
     a:thtd_attribs?
     c:inline {
         return [{type: 'TAG', name: 'th', attribs: [['data-unparsed', a]]}]
                .concat(c, [{type: 'ENDTAG', name: 'th'}]);
     }

// Attribute text of a cell or row: plain text and a small set of
// punctuation characters, terminated by a single '|' that is not followed
// by '|', '}', '+' or '-'. Yields the list of matched pieces.
thtd_attribs
  // In particular, do not match [|\n]
  = a:(text / ! inline_breaks [="':;/,.-] )+ "|" ! [|}+-] {
  return a;
  }


// Table end marker '|}', optionally preceded by a newline; clears the
// 'table' flag set by table_start. The action returns no value.
table_end = newline? "|}" { clearFlag('table'); }


/* Wikidom TODO:
 * split off text into content nodes
 * convert inlines into annotations
 * change contents into children
 *
 * { text: text,
 *   annotations: [(normal annotations)],
 *   maybeannotations: [
 *      { type: 'something',
 *        side: MA_START,
 *        tag: { start: x, length: y }
 *      }
 *   ]
 * }
 *  offsets in annotations: presume maybeannotations are actually text
 *  -> need to transform annotations if match found
 *  -> format annotations, comments can run to the end (re-opened after
 *     block-level tags); only closed on table cells, object,?
 *  -> other annotations (images, templates etc) are limited by block-level
 *     elements, tightly bound
 *
 *  Block-level elements
 *  --------------------
 *  - Need some early clean-up to provide structure and offsets
 *  - Establish scope limits for some inlines
 *  - Line-based balanced by construction
 *  - HTML tags need balancing/ matching / implicit close
 *  - content in illegal places (e.g. between table and td tags) needs foster
 *    parenting
 *  - grammar will match outermost pair if unmatched pairs are recognized as
 *    tokens (or as text)
 *  - post-processing needed, but has to be limited by scope
 */
/* Tabs do not mix well with the hybrid production syntax */
/* vim: et:ts=4:sw=4:cindent */