mediawiki-extensions-Visual.../modules/parser/pegTokenizer.pegjs.txt

/* Combined Wiki (MediaWiki) and HTML tokenizer. Produces a token stream
** (actually a list of tokens for now) suitable for a HTML5TreeBuilder. */
{
    /* Fixme: use static functions to separate module! Unfortunately, this
     * does not work:
     * var tu = require('./mediawiki.tokenizer.utils.js');
     * console.log(tu.flatten([]));
     * Using exports in the module gets a bit further, but accesses to
     * tu.flatten in productions still fail. Thus, I just moved the functions
     * here until a solution is found:
     */

    /* Static utilities */

    // Flatten an arbitrarily nested list of lists into one flat array.
    //
    // @param {Array} e Possibly nested array
    // @returns {Array} New array holding every non-array member of e in
    //                  depth-first order; e itself is left untouched.
    var flatten = function ( e ) {
        // Fast single-level flatten:
        //return [].concat.apply([], e);

        var es = [];
        // Walk the nesting with one shared accumulator, so that nested lists
        // do not re-copy everything collected so far. Array.isArray replaces
        // the former jQuery $.isArray call (same semantics, no global needed).
        var collect = function ( list ) {
            for (var i = 0, length = list.length; i < length; i++) {
                var ei = list[i];
                if (Array.isArray(ei)) {
                    collect(ei);
                } else {
                    es.push(ei);
                }
            }
        };
        collect(e);
        return es;
    };


    // Flatten c and merge each run of adjacent strings into one string.
    // Empty primitive strings are dropped; non-string members are kept in
    // place and separate the runs.
    //
    // @param {Array} c Possibly nested list of strings and tokens
    // @returns {String|Array} The lone string if the result consists of
    //                         exactly one string, otherwise the merged list
    var flatten_string = function ( c ) {
        var items = flatten(c),
            result = [],
            run = [];

        // Emit the strings collected so far as one joined string.
        var flushRun = function () {
            if (run.length) {
                result.push( run.join('') );
                run = [];
            }
        };

        for (var idx = 0; idx < items.length; idx++) {
            var item = items[idx];
            if (item.constructor !== String) {
                flushRun();
                result.push(item);
            } else if (item !== '') {
                run.push(item);
            }
        }
        flushRun();

        var isLoneString = result.length === 1 &&
                result[0].constructor === String;
        return isLoneString ? result[0] : result;
    };

    // Remove escaped quotes from attributes etc
    // This was in the original PEG parser, but could not find anything in
    // MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke
    //
    // @param {String} quotec The quote character whose escapes are removed
    // @param {String} text   Text possibly containing backslash + quotec
    // @returns {String} text with every backslash-escaped quotec replaced by
    //                   the bare quotec
    var unquote = function (quotec, text) {
        // String.replace with a string pattern only touches the first match;
        // split/join unescapes all occurrences and also avoids the special
        // meaning of '$' in replacement strings.
        return text.split('\\' + quotec).join(quotec);
    };

    // Decode an html entity by letting the DOM (through the `$` global) parse
    // it into a detached <div> and reading the text content back.
    // In a browser, this should only be fed the entity, not untrusted html!
    // XXX: replace with safer version.
    var unentity = function ( entity ) {
        var scratch = $("<div/>");
        return scratch.html(entity).text();
    };

    // Debug print. Output is compiled out through the hard-wired switch
    // below; flip it to true to log msg to the console.
    var dp = function ( msg ) {
        var enabled = false;
        if ( !enabled ) {
            return;
        }
        console.log(msg);
    };

    // Pretty-print s as JSON, indented by two spaces.
    var pp = function ( s ) {
        var indentWidth = 2;
        return JSON.stringify(s, null, indentWidth);
    };

    /*
     * Annotate a token stream with list items with appropriate list tokens
     *
     * 'listItem' start tags carry a `bullets` sequence (array of characters
     * or string) made of '*', '#', ';' and ':'. Each one is replaced by the
     * list / list item start and end tags needed to get from the previous
     * item's bullets to its own. 'list' start tags are dropped, a 'list' end
     * tag closes everything that is still open, and all other tokens are
     * passed through unchanged.
     *
     * @static
     * @method
     * @param {[tokens]}   Token stream with li tokens
     * @returns {[tokens]} Token stream, possibly with additional list tokens
     * @throws {String} "Unknown node prefix <char>" for a bullet character
     *                  other than the four above (kept as a thrown string so
     *                  that existing handlers see the same kind of value)
     * */
    var annotateList = function ( tokens ) {
        var out = [],    // List of tokens
            bstack = [], // Bullet stack, previous element's listStyle
            bnext = [],  // Next element's listStyle
            endtags = [];  // Stack of end tags

        // Number of leading bullets shared by x and y.
        var commonPrefixLength = function (x, y) {
            var minLength = Math.min(x.length, y.length);
            for(var i = 0; i < minLength; i++) {
                if (x[i] !== y[i]) {
                    break;
                }
            }
            return i;
        };

        // Open a new list plus its first item and remember both end tags.
        var pushList = function ( listName, itemName ) {
            out.push( new TagTk( listName ));
            out.push( new TagTk( itemName ));
            endtags.push( new EndTagTk( listName ));
            endtags.push( new EndTagTk( itemName ));
        };

        // Close the n innermost (item, list) pairs.
        var popTags = function ( n ) {
            for(;n > 0; n--) {
                // push list item..
                out.push(endtags.pop());
                // and the list end tag
                out.push(endtags.pop());
            }
        };

        // Close the current item and open a sibling item of the same name.
        var reopenItem = function () {
            var itemToken = endtags.pop();
            out.push(itemToken);
            out.push(new TagTk( itemToken.name ));
            endtags.push(new EndTagTk( itemToken.name ));
        };

        // True if a and b are one ';' and one ':' (in either order).
        var isDlDd = function (a, b) {
            var ab = [a,b].sort();
            return (ab[0] === ':' && ab[1] === ';');
        };

        // Emit the tags for the transition from bullets bs to bullets bn.
        var doListItem = function ( bs, bn ) {
            var prefixLen = commonPrefixLength (bs, bn),
                changeLen = Math.max(bs.length, bn.length) - prefixLen;
            if (changeLen === 0) {
                // same bullets: next item of the same list
                reopenItem();
            } else if ( bs.length === bn.length
                    && changeLen === 1
                    && isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
                // handle dd/dt transitions: stay in the same dl, only swap
                // the item type
                out.push(endtags.pop());
                var newName = ( bn[prefixLen] === ';' ) ? 'dt' : 'dd';
                out.push(new TagTk( newName ));
                endtags.push(new EndTagTk( newName ));
            } else {
                // emit close tag tokens for closed lists
                popTags(bs.length - prefixLen);

                if (prefixLen > 0 && bn.length === prefixLen ) {
                    // back on an outer level: start its next item
                    reopenItem();
                }

                for(var i = prefixLen; i < bn.length; i++) {
                    switch (bn[i]) {
                        case '*':
                            pushList('ul', 'li');
                            break;
                        case '#':
                            pushList('ol', 'li');
                            break;
                        case ';':
                            pushList('dl', 'dt');
                            break;
                        case ':':
                            pushList('dl', 'dd');
                            break;
                        default:
                            // Report the offending bullet itself. The common
                            // prefix is shorter than i, so indexing it here
                            // could only ever yield undefined.
                            throw("Unknown node prefix " + bn[i]);
                    }
                }
            }
        };

        for (var i = 0, length = tokens.length; i < length; i++) {
            var token = tokens[i];
            switch ( token.constructor ) {
                case TagTk:
                    switch (token.name) {
                        case 'list':
                            // ignore token
                            break;
                        case 'listItem':
                            // convert listItem to list and list item tokens
                            bnext = token.bullets;
                            doListItem( bstack, bnext );
                            bstack = bnext;
                            break;
                        default:
                            // pass through all remaining start tags
                            out.push(token);
                            break;
                    }
                    break;
                case EndTagTk:
                    if ( token.name === 'list' ) {
                        // pop all open list item tokens
                        popTags(bstack.length);
                        bstack = [];
                    } else {
                        out.push(token);
                    }
                    break;
                default:
                    out.push(token);
                    break;
            }
        }
        return out;
    };


    /* End static utilities */

    /*
     * Flags for specific parse environments (inside tables, links etc). Flags
     * trigger syntactic stops in the inline_breaks production, which
     * terminates inline and attribute matches. Flags merely reduce the number
     * of productions needed: The grammar is still context-free as the
     * productions can just be unrolled for all combinations of environments
     * at the cost of a much larger grammar.
     *
     * Each flag is a nesting counter: setFlag increments it, clearFlag
     * decrements it, and a flag counts as active while its value is truthy.
     */
    var syntaxFlags = {};

    // Enter the environment named by flag.
    // @returns {Boolean} always true, so it can be used as the result of a
    //                    positive semantic predicate
    var setFlag = function(flag) {
        // `|| 0` also recovers from an undefined or NaN counter.
        syntaxFlags[flag] = (syntaxFlags[flag] || 0) + 1;
        return true;
    };

    // Leave the environment named by flag.
    // @returns {Boolean} always false, so a fall-back alternative can clear
    //                    the flag and fail in one predicate
    var clearFlag = function(flag) {
        if (syntaxFlags[flag] > 0) {
            syntaxFlags[flag]--;
        } else {
            // Never let the counter turn into NaN (clearing a flag that was
            // never set) or go negative (surplus clears): NaN can not be
            // incremented back to a number, and a negative value is truthy
            // and would read as an active flag.
            syntaxFlags[flag] = 0;
        }
        return false;
    };

    // Start position of top-level block
    // Could also provide positions for lower-level blocks using a stack.
    // (assigned by the toplevelblock production before its block is parsed)
    var blockStart = 0;

    // Start position of generic tag production
    // (assigned by generic_tag right after the opening "<" was consumed)
    var tagStartPos = 0;

    // cache the input length
    var inputLength = input.length;

    // pseudo-production that matches at end of input
    // @param {Number} pos Parse position to test
    // @returns {Boolean} true if pos equals the cached input length
    var isEOF = function (pos) {
        return pos === inputLength;
    };

    // text start position
    // NOTE(review): in the visible part of this file only the commented-out
    // experimental `text` production refers to this variable.
    var textStart = 0;

    // hack to support numbered external links ([http://example.com]).
    // XXX: Move to token stream transform after templates are expanded!
    // Starts at 1 and is incremented by extlink for every link without text.
    var linkCount = 1;

    // Lookup table of block-level tag names (lower case name -> true).
    // Kept in JS so that block_tag can match names case-independently via
    // toLowerCase, which would be quite ugly (and possibly slower) if done
    // manually in the grammar.
    var block_names = (function () {
        var lookup = {};
        [ "p", "table", "td", "tr", "ul", "ol"
        , "li", "dl", "dt", "dd", "div", "center"
        , "blockquote" ].forEach(function ( tagName ) {
            lookup[tagName] = true;
        });
        return lookup;
    })();

    // Keep a reference to the initializer's `this` for use inside actions.
    // NOTE(review): in the visible part of this file it is only referenced
    // from commented-out debugging code.
    var self = this;


}

// Entry rule: matches all toplevelblocks of the input plus trailing newlines.
// No tokens are returned from here: the action only passes a final EOFTk to
// the callback in __parseArgs[2] and returns an empty list.
start
  = e:toplevelblock* newline* {
      // end is passed inline as a token, as well as a separate event for now.

      // this does not work yet.
      //console.log('about to emit' + pp(self));
	  //self._tokenizer.emit('chunk', [ { type: 'END' } ] );
      //self._tokenizer.emit('end');
		// Append the end (for obvious reasons this should not
		// be part of a stream, only when tokenizing complete
		// texts)
      //console.log( pp( flatten ( e ) ) );
      // NOTE(review): `cache` is not declared in this file; presumably the
      // generated parser's memoization table, reset here -- confirm.
      cache = {};
      // Signal the end of input to the token callback.
      __parseArgs[2]( [ new EOFTk( ) ] );
      return []; //flatten(e);
  }


/* All chars that cannot start syntactic structures in the middle of a line
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text production on plain
 * content. */

// A single character that is not one of the syntax-starting characters
// explained below.
text_char = [^'<~[{\n\r:\]}|!=]

// One or more text_chars, joined into a single string.
text = t:text_char+ { return t.join(''); }

/* Explanation of chars
 * '    quotes (italic/bold)
 * <    start of xmlish_tag
 * ~    signatures/dates
 * [    start of links
 * {    start of parser functions, transclusion and template args
 * \n   all sort of block-level markup at start of line
 * \r   ditto
 * h    http(s) urls
 * n    nntp(s) urls
 * m    mailto urls
 *
 * ! and | table cell delimiters, might be better to specialize those
 * =    headings - also specialize those!
 *
 * The following chars are also included for now, but only apply in some
 * contexts and should probably be enabled only in those:
 * :    separate definition in ; term : definition
 * ]    end of link
 * }    end of parser func/transclusion/template arg
 */

// Plain text that may contain bare URLs and entities. Repeats, in this order
// of preference:
//  - a run of characters that can start neither markup, a URL protocol
//    (see url_chars), an entity nor a space
//  - a bare URL; the url_chars lookahead avoids trying urllink on characters
//    that cannot start a protocol
//  - an HTML entity
//  - a space directly in front of ':' (returned as U+00A0)
//  - any single text_char
urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
          / & url_chars urllink
          / htmlentity
          // Convert trailing space into &nbsp;
          // XXX: This should be moved to a serializer
          / ' ' & ':' { return "\u00a0"; }
          / t:text_char )+

// Constructs that are accepted inside the *_preprocessor_text productions:
// comments, template arguments / templates and HTML entities.
directive
  = comment
  / tplarg_or_template
  / htmlentity

// Preprocessor text without spaces: runs of unsuspicious characters (the
// class excludes space and tab), directives, or a single non-space text_char
// that is not a syntactic stop. The result is collapsed with flatten_string,
// i.e. a plain string if no tokens were produced.
spaceless_preprocessor_text
  = r:( t:[^'<~[{\n\r|!\]}\t &=]+ { return t.join(''); }
  / directive
  / !inline_breaks !' ' text_char )+ {
      return flatten_string ( r );
  }

// Preprocessor text for link targets (used by extlink). In addition to
// directives it accepts percent-encoded characters, '.', ':' and ',' as long
// as they are not followed by whitespace or the end of the line, and bare
// '&' / '%'. The result is collapsed with flatten_string.
link_preprocessor_text
  = r:( t:[^'<~[{\n\r|!\]}\t &="']+ { return t.join(''); }
  / directive
  / urlencoded_char
  / !inline_breaks no_punctuation_char
  / s:[.:,] !(space / eolf) { return s }
  / [&%] )+ {
      return flatten_string ( r );
  }

// Plain text, but can contain templates, template arguments, comments etc. --
// all stuff that is normally handled by the preprocessor.
// Returns either a list of tokens, or a plain string (if nothing is to be
// processed); see flatten_string.
preprocessor_text
  = r:( t:[^<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
  / directive
  / !inline_breaks text_char )+ {
      return flatten_string ( r );
  }


/*
	'//', // for protocol-relative URLs, but not in text!
	'ftp://',
	'git://',
	'gopher://',
	'http://',
	'https://',
	'irc://',
	'ircs://',  // @bug 28503
	'mailto:',
	'mms://',
	'news:',
	'nntp://', // @bug 3808 RFC 1738
	'svn://',
	'telnet://', // Well if we're going to support the above.. -ævar
	'worldwind://',
*/

// Old version
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }

// Experimental tweaked version: avoid expensive single-char substrings
// This did not bring the expected performance boost, however.
//text = [A-Za-z0-9,._ -] {
//            textStart = pos;
//
//            var res = input.substr(textStart - 1, inputLength)
//                        .match(/[A-Za-z0-9,._ -]+/)[0];
//            pos = pos + (res.length - 1);
//            return res
//       }

// HTML character reference such as &amp; or &#160;. The matched reference is
// re-assembled and decoded through unentity().
htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
    var reference = [ "&", c.join(''), ";" ].join('');
    return unentity( reference );
}

// One or more spaces or tabs, returned as a single string.
space
  = s:[ \t]+ { return s.join(''); }

// Optional whitespace, always returned as a list so that callers can concat
// it: one joined string if any whitespace matched, otherwise an empty list.
optionalSpaceToken
  = s:space* {
      return s.length ? [ s.join('') ] : [];
  }


// Start of line: a newline (as NlTk) or the very start of the input,
// followed by any comments and an optional includeonly / noinclude tag.
// Returns the newline tokens, the comments and the tag token as one list.
sol = nl:(newlineToken / & { return pos === 0; } { return [] })
      // Eat multi-line comments, so that syntax after still matches as if it
      // was actually preceded by a newline
      cn:( c:comment n:newline? {
                // n is '' if no newline followed the comment
                return ( n === '' ) ? [c] : [c, n];
            }
      )*
      // Eat includeonly/noinclude at start of line, so that start-of-line
      // syntax after it still matches
      ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
      {
          // ni is '' if there was no includeonly / noinclude tag
          if ( ni === '' ) {
              return nl.concat(cn, []);
          }
          var isEndTag = ( ni[0] === '/' ),
              tagToken = isEndTag ? new EndTagTk( ni[1] ) : new TagTk( ni[1] );
          return nl.concat(cn, [tagToken]);
      }

// End of input: succeeds without consuming anything when the current
// position equals the input length.
eof = & { return isEOF(pos); } { return true; }


// A single line break, either LF or CRLF.
newline
  = '\n' / '\r\n'

// A line break, returned as a list holding one NlTk token.
newlineToken = newline { return [new NlTk()] }

// End of line or end of input.
eolf = newline / eof

// One top-level block. Remembers the start position, parses a block, tags
// the first resulting token with the block's source range and hands the
// flattened tokens to the callback in __parseArgs[2]. The rule itself only
// returns true; the tokens travel through the callback.
toplevelblock
  = & { blockStart = pos; return true; } b:block {
    b = flatten(b);
    if ( b.length ) {
        var bs = b[0];
        // A primitive string cannot carry properties, so wrap it in a String
        // object before attaching attribs.
        if ( bs.constructor === String && bs.attribs === undefined ) {
            b[0] = new String( bs );
            bs = b[0];
        }
        //dp('toplevelblock:' + pp(b));
        if (bs.attribs === undefined) {
            bs.attribs = [];
        }
        // Source range of this block as 'start:end' input offsets.
        bs.attribs.push(new KV('data-sourcePos', blockStart + ':' + pos));
        //console.log( 'toplevelblock: ' +  pp( bs ));
    }
    // XXX: only run this for lines that actually need it!
    //b.push({type: 'NEWLINE'});
	// Move this to a token stream transform!
    //console.log('about to emit' + pp(self));
    //console.log( '__parseArgs call: ' +  pp( b ));
    __parseArgs[2]( flatten( b ) );

    //return [];
    return true;
  }

// A block: the alternatives are tried in order and the first match wins.
// `pre` is listed once only; a second `pre` alternative further down could
// never match, as the same rule has already failed at the same position.
block
  = block_lines
  / pre
  / comment &eolf
  / nowiki
  / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
  / para
  / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
  / s:sol /*{
          if (s) {
              return [s, {type: 'NEWLINE'}];
          } else {
              return [{type: 'NEWLINE'}];
          }
      }*/

// A block_line at the start of a line, optionally preceded by one empty
// (whitespace-only) line. Returns all start-of-line tokens followed by the
// tokens of the block line.
block_lines
  = s:sol
    // eat an empty line before the block
    s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
    bl:block_line {
        // s2 is '' if there was no empty line
        if ( s2 === '' ) {
            return s.concat([], bl);
        }
        return s.concat(s2, bl);
    }

// Block structures with start-of-line wiki syntax. Tried in order: headings,
// table lines (only attempted if the line starts with '{', '}' or '|'),
// lists, lines consisting only of block tags, indent-pre and <pre>.
block_line
  = h
  /// table
  / & [{}|] tl:table_lines { return tl; }
  / lists
  // tag-only lines should not trigger pre
  / st:optionalSpaceToken
    bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
    &eolf {
        return st.concat(bt);
    }
  / pre_indent
  / pre


// Paragraph: two consecutive start-of-lines followed by an inline line. No
// 'p' tag is emitted here (see the commented-out TagTk below); the tokens
// of both sols and of the line are returned as one list.
para
  = s1:sol s2:sol c:inlineline {
      return s1.concat(s2, /* [new TagTk('p')],*/ c);
  }

// Optional whitespace directly in front of a newline (which is not
// consumed), returned as a self-closing 'br' tag token.
br = space* &newline { return new SelfclosingTagTk( 'br' ) }

// Syntax stops to limit inline expansion depending on syntactic context.
// Each alternative is guarded by one of the syntaxFlags and matches the text
// that ends the corresponding environment:
//   table         newline + '!' or '|', '||', '!!', '|}'
//   tableCellArg  '|' (checked only while 'table' is set as well)
//   colon         ':' (unless inside an extlink or a link description)
//   extlink       ']'
//   linkdesc      link_end
//   h             a run of '=' followed by optional spaces and a newline
//   template      '|' or '}}'
//   equal         '='
// Callers use this production in negative lookaheads (!inline_breaks).
inline_breaks
  =
    & { // Important hack: disable caching for this production, as the default
        // cache key does not take into account flag states!
        cacheKey = '';
        return true;
      }
    & { return syntaxFlags['table']; }
    ( a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a) + pos); return true; }
      / & { return syntaxFlags['tableCellArg'] }
        "|" { return true }
    )
  / & { return (syntaxFlags['colon'] &&
          ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
          ! syntaxFlags.linkdesc); } ":" { return true; }
  / & { return syntaxFlags['extlink']; } "]" { return true; }
  / & { return syntaxFlags['linkdesc']; } link_end { return true; }
  / & { return syntaxFlags['h']; } '='+ space* newline { return true; }
  / & { return syntaxFlags['template']; } ('|' / '}}' ) { return true; }
  / & { return syntaxFlags['equal']; } '=' {
                    //console.log( 'equal stop!' );
                    return true;
                }

// Inline content that may span lines: url text, inline elements or any
// single character, up to the next syntactic stop. Returns a flat list in
// which adjacent primitive strings are merged and empty strings dropped.
inline
  = c:(urltext / (! inline_breaks (inline_element / . )))+ {
      var pieces = flatten(c),
          tokens = [],
          pending = [];

      // Emit the strings collected so far as one joined string.
      var flushPending = function () {
          if (pending.length) {
              tokens.push( pending.join('') );
              pending = [];
          }
      };

      for (var n = 0; n < pieces.length; n++) {
          var piece = pieces[n];
          if (typeof piece !== 'string') {
              flushPending();
              tokens.push(piece);
          } else if (piece !== '') {
              pending.push(piece);
          }
      }
      flushPending();
      //dp('inline out:' + pp(tokens));
      return tokens;
}


// Inline content restricted to a single line: like inline, but the
// single-character fallback does not accept '\n'. Returns a flat list in
// which adjacent primitive strings are merged and empty strings dropped.
inlineline
  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
      var merged = [],
          textRun = [];

      // Close the current run of strings, if there is one.
      var closeRun = function () {
          if (textRun.length) {
              merged.push( textRun.join('') );
              textRun = [];
          }
      };

      var parts = flatten(c);
      for (var k = 0; k < parts.length; k++) {
          var part = parts[k];
          if (typeof part === 'string') {
              if (part !== '') {
                  textRun.push(part);
              }
              continue;
          }
          closeRun();
          merged.push(part);
      }
      closeRun();
      //dp('inlineline out:' + pp(merged));
      return merged;
}

// Inline constructs that start with a special character. Each alternative
// is guarded by a one-character lookahead, so that the more expensive
// productions are only tried where they can match:
//   '<'  comment or xml-ish tag
//   '{'  template argument or template
//   '['  wikilink or external link ('[[[' is eaten as plain text first)
//   "'"  quote (italic / bold marker)
inline_element
  = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
      & '<' ( comment / xmlish_tag )
    / & '{' ( & '{{{{{' template / tplarg / template )
    / & '{' tplarg_or_template
    /// & '{' ( tplarg / template )
     // Eat three opening brackets as text.
    / '[[[' { return '[[[' }
    / & '[' ( wikilink / extlink )
    / & "'" quote

/* Headings  */

// Heading line: a run of '=', inline content, a run of '=', then optional
// spaces / comments up to the end of the line. The heading level is the
// length of the shorter run; surplus '=' of the longer run become part of
// the heading text. The 'h' flag makes inline_breaks stop the content in
// front of the closing run, and is cleared again on success as well as on
// failure (second alternative).
h = & "=" // guard, to make sure '='+ will match.
          // XXX: Also check to end to avoid inline parsing?
    r:(
     s:'='+ // moved in here to make s accessible to inner action
     & { return setFlag('h'); }
     c:inlineline
     e:'='+
     spc:(sp:space+ { return sp.join('') } / comment)*
     &eolf
     {
        clearFlag('h');
        var level = Math.min(s.length, e.length);
        // convert surplus equals into text
        // s and e are arrays of '=' characters (results of '='+), so they
        // are sliced and joined; arrays have no substr method.
        if (s.length > level) {
            var leading = s.slice(level).join('');
            if (c[0].constructor === String) {
                c[0] = leading + c[0];
            } else {
                c.unshift( leading );
            }
        }
        if (e.length > level) {
            var trailing = e.slice(level).join(''),
                lastIndex = c.length - 1;
            if (c[lastIndex].constructor === String) {
                // write back into c; appending to a local copy of the
                // string would be lost
                c[lastIndex] = c[lastIndex] + trailing;
            } else {
                c.push( trailing );
            }
        }

        return [new TagTk( 'h' + level )]
                .concat(c, [new EndTagTk( 'h' + level ), spc]);
      }
    / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
    ) { return r }


// Indent-pre: one or more consecutive lines that each start with
// whitespace. The lines (and the start-of-line tokens between them) are
// wrapped in a 'pre' start / end tag pair.
pre_indent
  = l:pre_indent_line ls:(sol pre_indent_line)* {
      return [new TagTk( 'pre' )]
                    .concat( [l], ls
                           , [new EndTagTk( 'pre' )]);
  }
// A single indented line: the leading whitespace is dropped and the line's
// tokens are returned with a '\n' string in front.
pre_indent_line = space l:inlineline {
    return [ '\n' ].concat(l);
}


// HTML comment. An unclosed comment extends to the end of the input.
// Further comments that follow on the next lines (separated only by
// whitespace and one newline each) are collected as well. Returns a list
// starting with a { type: 'COMMENT', value } object for the first comment,
// followed by the results of the following comments.
comment
  = '<!--' c:comment_chars* ('-->' / eof)
    cs:(space* newline space* cn:comment { return cn })* {
        return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
    }

// A character inside a comment: anything but '-', or a '-' that does not
// start the closing '-->'.
comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }


// Bare URL in text, only matched while the 'extlink' flag is not set.
// Returns an 'a' start tag with the URL as href, the URL as link text and
// the matching end tag.
urllink
  = ! { return syntaxFlags['extlink'] }
    target:url {
      return [ new TagTk( 'a', [new KV('href', target)] )
             , target
             , new EndTagTk( 'a' )
             ];
  }

// External link: [target] or [target text]. The 'extlink' flag is set while
// the link is parsed, so that inline_breaks stops at ']'. It is cleared
// again in the action on success and by the second alternative (which
// always fails) when the first one did not match. Links without text get a
// running number from linkCount as text.
extlink
  = ! { return syntaxFlags['extlink'] } // extlink cannot be nested
    (
        "["
        & { return setFlag('extlink'); }
        //target:urllink
        target:link_preprocessor_text
        text:(space* t:inlineline { return t } )?
        "]" {
            clearFlag('extlink');
            // text is '' if the optional link text did not match
            if ( text === '' ) {
                // XXX: Link numbering should be implemented in post-processor.
                text = [ "[" + linkCount + "]" ];
                linkCount++;
            }
            var res = [
                new TagTk( 'a', [
                            new KV('href', target),
                            new KV('data-type', 'external')
                        ] ),
            ].concat( text
                    , [ new EndTagTk( 'a' )]);
            //console.log( JSON.stringify( res, null, 2 ) );
            return res;
        }
      / "[" & { clearFlag('extlink'); return false; }
    )

/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
 * be configured dynamically. */

// First characters of the protocols listed in url_protocol; used by urltext
// as a cheap lookahead before trying urllink.
url_chars = [/fghimnstw]

// The URL protocol prefixes recognized by the url production. Every prefix
// starts with one of the characters in url_chars.
url_protocol
  = '//' // for protocol-relative URLs
  / 'ftp://'
  / 'git://'
  / 'gopher://'
  / 'http://'
  / 'https://'
  / 'irc://'
  / 'ircs://'  // @bug 28503
  / 'mailto:'
  / 'mms://'
  / 'news:'
  / 'nntp://' // @bug 3808 RFC 1738
  / 'svn://'
  / 'telnet://' // Well if we're going to support the above.. -ævar
  / 'worldwind://'

// javascript does not support unicode features..
// ..so the space separator characters are spelled out explicitly. The same
// code points are excluded again in no_punctuation_char.
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]


// A percent-encoded octet (%XX), decoded with decodeURI where possible.
// Escapes that decodeURI leaves alone (reserved characters) or cannot decode
// are returned verbatim.
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
    var encoded = "%" + c0 + c1;
    try {
        return decodeURI( encoded );
    } catch ( e ) {
        // decodeURI throws a URIError for an octet that is not a complete
        // UTF-8 sequence on its own (e.g. "%E9", or the first byte of a
        // multi-byte character). Keep the escape as it is instead of
        // aborting the whole parse; anything else is unexpected.
        if ( !( e instanceof URIError ) ) {
            throw e;
        }
        return encoded;
    }
}

//[^][<>"\\x00-\\x20\\x7F\p{Zs}]

// A character that may appear inside a URL or link target without further
// checks: excludes whitespace and control characters, brackets, quotes,
// '<', '>', the punctuation characters ':', ',', '.', '&', '%' (handled by
// separate alternatives in url / link_preprocessor_text) and the unicode
// space separators.
no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]

// A URL: protocol, optional IP address literal and at least one further
// character. '.', ':' and ',' only belong to the URL if they are not
// followed by whitespace or the end of the line, so that sentence
// punctuation after a URL is left alone. Returns the URL as one string.
// NOTE(review): the concatenation below relies on an unmatched optional
// `addr` being the empty string.
url
  = proto:url_protocol
    addr:( ipv6_address / ipv4_address )?
    rest:(  ( !inline_breaks
              c:no_punctuation_char
              { return c }
            )
            / s:[.:,] !(space / eolf) { return s }
            / htmlentity
            / urlencoded_char
            / [&%] )+
{
    return proto + addr + rest.join('');
}

// Four (possibly empty) groups of decimal digits separated by dots,
// returned as one string. The groups are not range-checked.
ipv4_address
  = a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*)
{
    return flatten( a ).join('');
}

// Bracketed IPv6 literal, returned as one string. IPv6 groups are written
// in hexadecimal, so hex digits are accepted in each group. As before,
// exactly eight colon-separated groups are expected (groups may be empty);
// '::' compression that reduces the number of groups is not supported.
ipv6_address
  = a:('[' [0-9a-fA-F]* ':' [0-9a-fA-F]* ':' [0-9a-fA-F]* ':' [0-9a-fA-F]* ':' [0-9a-fA-F]* ':' [0-9a-fA-F]* ':' [0-9a-fA-F]* ']')
{
    return flatten( a ).join('');
}

// Template argument ({{{...}}}) or template ({{...}}). Five opening braces
// are tried as a template first; otherwise a template argument is
// preferred over a template.
tplarg_or_template = & '{{{{{' template / tplarg / template

// Template: {{target|param|...}}. Newlines are allowed around the '|'
// separators. Returns a self-closing 'template' token whose attribute list
// consists of the target followed by the parameters.
template
  = "{{" target:template_param_text
    params:(newline? "|" newline? p:template_param { return p })*
    "}}" {
      // Insert target as first positional attribute, so that it can be
      // generically expanded. The TemplateHandler then needs to shift it out
      // again.
      params.unshift( { k: '', v: flatten( target ) } );
      var obj = new SelfclosingTagTk( 'template', params );
      //console.log( 'tokenizer template ' + JSON.stringify( target ));
      return obj;
  }

// XXX: support template and args in target!
//template_target
//  = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }

// Template argument: {{{name|default|...}}}. Returns a self-closing
// 'templatearg' token; as in template, the name is inserted as the first
// attribute (with an empty key) in front of the parameters.
tplarg
  = "{{{"
    name:template_param_text
    params:( newline? "|" newline? p:template_param { return p })*
    "}}}" {
      name = flatten( name );
      params.unshift( { k: '', v: name } );
      var obj = new SelfclosingTagTk( 'templatearg', params );
      //console.log( 'tokenizer tplarg ' + JSON.stringify( obj, null, 2 ));
      return obj;
  }

// One template parameter, returned as KV:
//  - named (name = value); the value is an empty list if nothing follows
//    the '='
//  - positional; the key is an empty list
//  - empty (directly followed by '|' or '}'); key and value are empty lists
template_param
  = name:template_param_name space* "=" space* c:template_param_text? {
      //console.log( 'named template_param matched' + pp([name, flatten( c )]) );
      // c is '' if the optional value did not match
      var namedValue = ( c === '' ) ? [] : flatten( c );
      return new KV(name, namedValue);
  } / c:template_param_text {
        var positionalValue = flatten( c );
        return new KV([], positionalValue);
  }
  / & [|}] { return new KV([], []); }


// FIXME: handle template args and templates in key! (or even parser functions?)
// Name of a named template parameter: template_param_text parsed while the
// 'equal' flag is set, so that inline_breaks stops at '='. The flag is
// cleared again on success and, through the second alternative (which
// always fails, as clearFlag returns false), on failure.
template_param_name
  = & { return setFlag( 'equal' ) }
    tpt:template_param_text
    & { clearFlag( 'equal' ); return true; }
    {
        //console.log( 'template param name matched: ' + pp( tpt ) );
        return tpt;
    }

  / & { return clearFlag( 'equal' ) }
  //= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }

// Inline content inside a template: parsed while the 'template' flag is
// set, so that inline_breaks stops at '|' and '}}'. The flag is cleared
// again on success and, through the second alternative (which always
// fails, as clearFlag returns false), on failure.
template_param_text
  = & { return setFlag('template') }
    il:inline {
        clearFlag('template');
        return il;
    }
  / & { return clearFlag('template'); }

// TODO: handle link prefixes as in al[[Razi]]
// Internal link: [[target]], [[target|text]] plus an optional link trail
// (text characters directly following the closing brackets). Returns an 'a'
// start tag with data-type and href attributes, the link text tokens and
// the end tag. Without explicit link text, the target plus trail is used as
// text.
wikilink
  = "[["
    ! url
    //target:link_target
    target:preprocessor_text
    lcontent:( "|" lt:link_text { return lt } )*
    "]]"
    // XXX In real MediaWiki, this is a language-dependent positive character
    // class. Can we work out a static negative class instead?
    // XXX: Exclude uppercase chars from non-latin languages too!
    trail:(! [A-Z \t(),.:-] tc:text_char { return tc })* {
      var obj = new TagTk( 'a',
                [
                    new KV('data-type', 'internal')
                ] ),
          // trail is always an array (result of *), possibly empty
          trailText = trail.join(''),
          textTokens = [];
      obj.attribs.push( new KV('href', target) );
      if (lcontent && lcontent.length) {
          textTokens = lcontent;
          // only add a trail token if there actually is a trail
          if (trailText !== '') {
            textTokens.push( trailText );
          }
      } else if (target.constructor === String) {
          textTokens = [ target + trailText ];
      } else {
          // preprocessor_text returns a token list if the target contains
          // templates etc. Keep those tokens (string concatenation would
          // stringify them) and leave the list used for href untouched.
          textTokens = target.concat( trailText !== '' ? [ trailText ] : [] );
      }
      return [obj].concat(textTokens, [new EndTagTk( 'a' )]);
  }

// Link target as a plain string: everything up to '|', a newline or ']]',
// with percent-encoded characters decoded where possible.
// NOTE(review): in the visible part of this file this rule is only
// referenced from a commented-out line in wikilink.
link_target
  = h:( c:[^|%\n\]]+ { return c.join('') } // quickly eat anything unsuspicious
      / !"]]"
        hi:(
              [^|%\n]
            / urlencoded_char
            / '%'
           ) { return hi }
      )* { return h.join(''); }

// Link description: an inline line parsed while the 'linkdesc' flag is set,
// so that inline_breaks stops at link_end. The flag is cleared again on
// success and, through the second alternative (which always fails), on
// failure.
link_text
  = & { return setFlag('linkdesc'); }
    h:inlineline
    {
        clearFlag('linkdesc');
        return h;
    }
  / & { clearFlag('linkdesc'); return false }

// The closing brackets of a wikilink.
link_end = "]]"

/* Generic quote production for italic and bold, further processed in a token
 * stream transformation in doQuotes. Relies on NlTk tokens being emitted
 * for each line of text to balance quotes per line.
 *
 * We are not using a simple pair rule here as we need to support mis-nested
 * bolds/italics and MediaWiki's special heuristics for apostrophes, which are
 * all not context free. */
quote = "''" x:"'"* {
    // The whole run of apostrophes is stored as the value of an 'mw-quote'
    // tag token, which will be consumed in token transforms.
    var quoteToken = new TagTk( 'mw-quote' );
    quoteToken.value = [ "''" ].concat( x ).join('');
    return quoteToken;
}

/* XXX: Extension tags can require a change in the tokenizer mode, which
 * returns any text between extension tags verbatim. For now, we simply
 * continue to parse the contained text and return the tokens. The original
 * input source can be recovered from the source positions added on tag
 * tokens. This won't however work in all cases. For example, a comment start
 * (<!--) between extension tags would cause the remaining text to be consumed
 * as a comment. To avoid this, we might need to look ahead for the end tag
 * and limit the content parsing to this section. */

// Anything that looks like an XML tag: nowiki is tried first, all other
// tags are handled by generic_tag.
xmlish_tag = nowiki / generic_tag

// <pre> block: the content is taken verbatim (apart from nowiki sections)
// up to the closing tag or the end of the input, and returned between a
// 'pre' start and end tag token. A stray closing </pre> is returned as
// plain text.
pre
  = "<pre"
    attribs:generic_attribute*
    ">"
    ts:(t1:[^<]+ { return t1.join('') }
                / nowiki
                / !"</pre>" t2:. { return t2 })+
    ("</pre>" / eof) {
        // return nowiki tags as well?
        //console.log('inpre');
        return [ new TagTk( 'pre', attribs ) ]
                    .concat(ts, [ new EndTagTk( 'pre' ) ]);
    }
  / "</pre>" { return "</pre>"; }

// <nowiki> section: returns the verbatim content (a list holding one
// string, see nowiki_content). Unbalanced opening or closing nowiki tags
// are returned as plain text.
nowiki
  = "<nowiki>" nc:nowiki_content "</nowiki>" {
        //console.log( 'full nowiki return: ' + pp(nc));
        return nc;
    }
  / "<nowiki>" {
          //console.log('nowiki fallback');
          return ['<nowiki>'];
    }
  / "</nowiki>" {
      //console.log('nowiki end fallback');
      return ['</nowiki>'];
  }

// Verbatim content of a nowiki section: everything up to the next
// </nowiki> or </pre>. Nested <pre>...</pre> sections are re-assembled into
// their source text. Returns a list holding the content as one string.
nowiki_content
  = ts:(   t:[^<]+ { return t.join('') }
           / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
                 //console.log('nested pre in nowiki');
                 return ["<pre"].concat(p0, p1, [">"], p2, ["</pre>"]).join('');
               }
           / (!("</"( "nowiki>" / "pre>")) c:. {
               //console.log('nowiki: single char' + c);
               return c;
           })
       )* {
            // return nowiki tags as well?
            //console.log('nowiki_content: return' + pp(ts));
            return [ts.join('')];
          }

// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
// Start, end or self-closing tag whose (case-independent) name is listed in
// block_names. Returns a list holding the tag token; tags with any other
// name make the action return null to abort the match.
block_tag
  = "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
    attribs:generic_attribute*
    selfclose:"/"?
    ">" {
        var isBlockLevel = ( block_names[name.toLowerCase()] === true );
        if ( !isBlockLevel ) {
            // abort match if tag is not block-level
            return null;
        }
        if ( end != '' ) {
            return [ new EndTagTk( name, attribs ) ];
        }
        if ( selfclose != '' ) {
            return [ new SelfclosingTagTk( name, attribs ) ];
        }
        return [ new TagTk( name, attribs ) ];
    }

/* Generic XML-like tags
 *
 * These also cover extensions (including Cite), which will hook into the
 * token stream for further processing. The content of extension tags is
 * parsed as regular inline, but the source positions of the tag are added
 * to allow reconstructing the unparsed text from the input. */

// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
// Any start, end or self-closing tag with an alphanumeric name. Returns the
// matching tag token, with the source range of the tag ('start:end' input
// offsets, the start pointing at the '<') added as data-sourceTagPos
// attribute.
generic_tag
  = "<"
    & { tagStartPos = pos; return true; } // remember the start position of this tag
    end:"/"? name:[0-9a-zA-Z]+
    attribs:generic_attribute*
    space*
    selfclose:"/"?
    ">" {
        var tagName = name.join(''),
            TokenType = TagTk;
        if ( end != '' ) {
            TokenType = EndTagTk;
        } else if ( selfclose != '' ) {
            TokenType = SelfclosingTagTk;
        }
        var tagToken = new TokenType( tagName, attribs );
        // tagStartPos was taken after the '<', hence the - 1
        var sourceRange = [ tagStartPos - 1, pos ].join(':');
        tagToken.attribs.push(new KV('data-sourceTagPos', sourceRange));
        return tagToken;
    }

// One tag attribute: a name, optionally followed by '=' and a value.
// Returns a KV of name and value; the value is the empty string if the
// attribute has none.
generic_attribute
  = s:space*
    name:generic_attribute_name
    value:(space*
          v:generic_attribute_value { return v })?
{
    // An unmatched optional value is '' already, which is exactly the value
    // used for value-less attributes.
    return new KV( name, value );
}

// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
// disallow newlines, | and {.
// Returns the attribute name as one string.
generic_attribute_name
  = n:[^ \t\0/"'>=\n|{]+ {
        return n.join('');
  }

// '=' followed by optional whitespace and the attribute value; returns the
// value only.
generic_attribute_value
  = "=" space* v:att_value {
      return v;
  }

// XXX: attributes can contain templates and template args!!
// Attribute value in one of three forms, tried in order:
//  - an unquoted run of characters other than space, tab, newline, quotes,
//    '<', '>' and '=', stopping wherever inline_breaks matches;
//  - a single-quoted string;
//  - a double-quoted string.
// Quoted content may not contain its own quote character or '>', and is
// passed through unquote() together with that quote character.
att_value
  = t:(!inline_breaks c:[^ \t'"<>='\n] { return c } )+ {
        return t.join('');
    }
    // XXX: is "\"" also valid html? or just Wikitext?
  / "'" t:[^'>]* "'" { return unquote("'", t.join('')); }
  / '"' t:[^">]* '"' { return unquote('"', t.join('')); }


/* Lists */
lists = e:(dtdd / li) es:(sol (dtdd / li))*
{
    // Wrap the flattened tokens of all items in list start / end tokens and
    // hand the whole sequence to annotateList() for post-processing.
    var open = [ new TagTk( 'list' ) ];
    var items = flatten( [e].concat(es) );
    var close = [ new EndTagTk( 'list' ) ];
    return annotateList( open.concat( items, close ) );
}

li = bullets:list_char+
     c:inlineline?
     &eolf
{
    // The list item token carries the matched bullet characters.
    var itemTok = new TagTk( 'listItem' );
    itemTok.bullets = bullets;
    // Missing content is represented as an empty list.
    var content = ( c == '' ) ? [] : c;
    return [ itemTok, content ];
}

// Definition list line of the form  <bullets>;term:definition
// Emits a listItem token for the term (bullets followed by ";"), the term
// content, a second listItem token for the definition (bullets followed by
// ":") and the definition content.
dtdd
  // Leading list characters, up to but excluding the final ";". Only the
  // matched character is kept, so that bullets is a flat list of characters
  // just like the bullets collected by the li production.
  = bullets:(!(";" !list_char) lc:list_char { return lc })*
    ";"
    & {return setFlag('colon');}
    c:inlineline
    ":"
    // Fortunately dtdds cannot be nested, so we can simply set the flag
    // back to 0 to disable it.
    & {syntaxFlags['colon'] = 0; return true;}
    d:inlineline
    &eolf {
        // XXX: Converting a trailing space of the term into a non-breaking
        // space should be done in a serializer rather than here.

        // Build each bullet list as a fresh array; string concatenation on
        // the array would stringify it with commas between the elements.
        var li = new TagTk( 'listItem' );
        li.bullets = bullets.concat( [ ";" ] );
        var li2 = new TagTk( 'listItem' );
        li2.bullets = bullets.concat( [ ":" ] );
        return [ li ].concat( c, [ li2 ], d );
    }
  // Fall-back case to clear the colon flag
  / & { return true; } { syntaxFlags['colon'] = 0; return null; }


// One list marker character; runs of these form the bullets of a list item.
list_char = [*#:;]

/**
 * Tables
 *
 * Table productions are geared to support independent parsing of fragments in
 * templates (the common table start / row / table end use case). The tokens
 * produced by these fragments then match up to a table while building the
 * DOM tree. For similar reasons, table rows do not emit explicit end tag
 * tokens.
 *
 * The separate table_lines production is faster than moving those productions
 * directly to block_lines.
 * */

// One or more consecutive table lines, parsed with the 'table' syntax flag
// set. The flag is cleared again once the lines have been matched.
table_lines
  = & { return setFlag('table'); }
    tl:table_line
    // Every further line has to start at a start-of-line position; the sol
    // result is prepended to the tokens of that line.
    tls:( s:sol tl2:table_line { return s.concat(tl2); } )* {
        clearFlag('table');
        //console.log('table_lines: ' + pp(tl.concat(tls)));
        // Note: tls holds one token array per additional line; concat()
        // appends those arrays as elements without flattening them.
        return tl.concat( tls );
    }
  // Fall-back: clear the flag when the first alternative did not match.
  // NOTE(review): presumably clearFlag() returns a falsy value so that this
  // alternative itself never matches -- confirm against its definition.
  / & { return clearFlag('table'); }

// This production assumes start-of-line position!
// A single line of table markup. The alternatives are tried in the order
// listed and the first one that matches wins.
table_line
  = table_start_tag
  / table_row_tag
  / table_data_tags
  / table_heading_tags
  / table_caption_tag
  / table_end_tag

// Table start markup with optional attributes, optionally followed by the
// table end markup on the same line.
table_start_tag
  = "{|"
    ta:generic_attribute*
    space*
    te:table_end_tag? // can occur somewhere in the middle of the line too
    {
        var tableTok = new TagTk( 'table' );
        if ( ta ) {
            tableTok.attribs = ta;
        }
        // Append the end tokens when the table is closed right away.
        return te ? [ tableTok ].concat( te ) : [ tableTok ];
    }

// Table caption line: the caption content wrapped in caption start / end
// tokens.
table_caption_tag
  = "|+"
    c:inline* {
        var open = [ new TagTk( 'caption' ) ];
        var close = [ new EndTagTk( 'caption' ) ];
        return open.concat( c, close );
    }


// Table row markup with optional attributes. Only a start token is emitted
// for the row.
table_row_tag
  = "|-"
    a:generic_attribute*
    space*
    // handle tables with missing table cells after a row
    td:( s:sol ![|!] tdt:table_data_tag { return s.concat(tdt); } )?
    {
        // No explicit row end token is produced: the tree builder is relied
        // upon to close the row as needed, which allows tables to be built
        // from fragment templates with individual cells or rows.
        var rowToks = [ new TagTk( 'tr', a ) ];
        return td ? rowToks.concat( td ) : rowToks;
    }

// A line of table cells: a leading "|" and a first cell, followed by any
// number of further cells that are each introduced by "||".
table_data_tags
  = "|"
    td:table_data_tag
    tds:( "||" tdt:table_data_tag { return tdt } )* {
        // Note: tds holds one token array per additional cell; concat()
        // appends those arrays as elements without flattening them.
        return td.concat(tds);
    }

// A single table cell: optional cell attributes followed by block content,
// wrapped in td start / end tokens.
table_data_tag
  = ! [}+-]
    a:table_cell_args?
    // use inline_breaks to break on tr etc
    td:( !inline_breaks b:block { return b } )*
    {
        // No cell arguments matched: fall back to an empty attribute list.
        var attribs = ( a == '' ) ? [] : a;
        var open = [ new TagTk( 'td', attribs ) ];
        var close = [ new EndTagTk( 'td' ) ];
        return open.concat( td, close );
    }

// A line of table headings: a leading "!" and a first heading, followed by
// any number of further headings that are each introduced by "!!".
table_heading_tags
  = "!"
    th:table_heading_tag
    ths:( "!!" tht:table_heading_tag  { return tht } )* {
        // Note: ths holds one token array per additional heading; concat()
        // appends those arrays as elements without flattening them.
        return th.concat(ths);
    }

// A single table heading: optional cell attributes followed by inline
// content, wrapped in th start / end tokens.
table_heading_tag
  = a:table_cell_args?
    c:inline {
        // No cell arguments matched: fall back to an empty attribute list.
        var attribs = ( a == '' ) ? [] : a;
        var open = [ new TagTk( 'th', attribs ) ];
        var close = [ new EndTagTk( 'th' ) ];
        return open.concat( c, close );
    }

// Table end markup; yields a one-element list holding the table end token.
table_end_tag
  = "|}" {
        return [ new EndTagTk( 'table' ) ];
  }

// Cell arguments: one or more attributes terminated by a single "|" that is
// not followed by another "|". The 'tableCellArg' syntax flag is set while
// the attributes are being parsed and cleared after a successful match.
table_cell_args
  = & { return setFlag('tableCellArg'); }
    as:generic_attribute+ space* "|" !"|" {
        clearFlag('tableCellArg');
        return as;
    }
    // Fall-back: clear the flag when the first alternative did not match.
    // NOTE(review): presumably clearFlag() returns a falsy value so that this
    // alternative itself never matches -- confirm against its definition.
    / & { return clearFlag('tableCellArg'); }


/* Tables, full-table version (no support for table fragments from templates)
 * Only left here for comparison purposes with the above productions. Remove
 * when those work as intended! */
table
  = tas:table_start space* c:table_caption? b:table_body? te:table_end {
      var tableTok = new TagTk( 'table' );
      // A missing body is represented as an empty list.
      var body = ( b !== '' ) ? b : [];
      dp("body: " + pp(body));

      // FIXME: actually parse and build structure
      if ( tas.length > 0 ) {
          tableTok.attribs = tas;
      }

      var caption = [];
      if ( c != '' ) {
          // The tokens returned by table_end are appended right after the
          // caption end token.
          caption = [ new TagTk( 'caption' ) ]
                        .concat( c, [ new EndTagTk( 'caption' ) ], te );
      }

      var close = [ new EndTagTk( 'table' ) ];
      return [ tableTok ].concat( caption, body, close );
  }

// Table start markup of the full-table version: sets the 'table' syntax flag
// and returns the list of attributes that follow the "{|".
table_start
  = "{|"
    res:(
        & { setFlag('table'); return true; }
        ta:generic_attribute*
        {
            dp("table_start " + pp(ta) + ", pos:" + pos);
            return ta;
        }
        // Fall-back: clear the flag again. The predicate returns false, so
        // this alternative never matches.
        / & { clearFlag('table'); return false; } { return null; }
    ) { return res }

// Caption of the full-table version: a newline followed by "|+" and inline
// content. Returns the newline tokens followed by the content.
table_caption
  = n:newlineToken
    "|+" c:inline* {
        return n.concat(c);
    }

// Body of the full-table version: either a first row that is not introduced
// by row markup followed by regular rows, or regular rows only.
table_body
  = //& { dp("table_body enter"); return true; }
    firstrow:table_firstrow otherrows:table_row* {
      /* dp('table first and otherrows: '
       * + pp([firstrow].concat(otherrows))); */
      // The first row is kept as a single element in front of the other rows.
      return [firstrow].concat(otherrows);
  }
  / otherrows:table_row* {
      //dp('table otherrows: ' + pp(otherrows));
      return otherrows;
  }

// First row of the full-table version: one or more cells or headings,
// wrapped in tr start / end tokens.
table_firstrow
  = td:(table_data / table_heading)+ {
      //dp('firstrow: ' + pp(td));
      return [ new TagTk( 'tr' )]
             .concat(td, [ new EndTagTk( 'tr' )]);
  }

// Row of the full-table version: a newline, the "|-" row markup, optional
// row attributes and any number of cells or headings. Returns the newline
// tokens followed by the row content wrapped in tr start / end tokens.
table_row
  = n:newlineToken
    "|-"
    a:(as:generic_attribute+ space* "|" !"|" { return as } )?
    space*
    td:(table_data / table_heading)* {
        // Attach the parsed row attributes, if any, to the row token instead
        // of discarding them. Without attributes the token is created exactly
        // as before.
        var rowTok = ( a == '' ) ? new TagTk( 'tr' ) : new TagTk( 'tr', a );
        return n.concat( [ rowTok ], td, [ new EndTagTk( 'tr' ) ] );
    }

// Cell of the full-table version. It is introduced either by "||" on the
// same line or by a newline, and may carry attributes terminated by a single
// "|". Returns the introducing tokens followed by the block content wrapped
// in td start / end tokens.
table_data
  = n:("||" { return [] } / nt:newlineToken ("|" / !'!') { return nt })
    ! [}+-]
    a:(as:generic_attribute+ space* "|" !"|" { return as } )?
    // use inline_breaks to break on tr etc
    td:(!inline_breaks b:block { return b })* {
        // No attributes matched: fall back to an empty attribute list.
        var attribs = ( a == '' ) ? [] : a;
        var open = [ new TagTk( 'td', attribs ) ];
        var close = [ new EndTagTk( 'td' ) ];
        return n.concat( open, td, close );
    }

// Heading of the full-table version. It is introduced either by "!!" on the
// same line or by a newline followed by "!", and may carry attributes
// terminated by a single "|". Returns the introducing tokens followed by the
// inline content wrapped in th start / end tokens.
table_heading
  = n:("!!" { return [] } / nl:newlineToken "!" { return nl })
    a:(as:generic_attribute+ "|" !"|" { return as } )?
    c:inline {
        // No attributes matched: fall back to an empty attribute list.
        var attribs = ( a == '' ) ? [] : a;
        var open = [ new TagTk( 'th', attribs ) ];
        var close = [ new EndTagTk( 'th' ) ];
        return n.concat( open, c, close );
    }

// Raw cell attribute text: one or more text matches or single punctuation
// characters (not at an inline break), terminated by a single "|" that is
// not followed by another "|". Returns the list of matched pieces.
thtd_attribs
  // In particular, do not match [|\n]
  = a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ "|" ! "|" {
    return a;
  }


// End of the full-table version: an optional newline followed by the table
// end markup or the end of input. Clears the 'table' syntax flag.
table_end
  = nt:newlineToken? ( "|}" / eof ) {
        clearFlag('table');
        // Hand back the newline tokens when a newline was matched, and an
        // empty list otherwise.
        return nt || [];
  }


/* Tabs do not mix well with the hybrid production syntax */
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent : */