mediawiki-extensions-Visual.../modules/parser/pegTokenizer.pegjs.txt

/**
 * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several
 * chunks of tokens (one chunk per top-level block matched) and eventually an
 * end event. Tokens map to HTML tags as far as possible, with custom tokens
 * used where further processing on the token stream is needed.
 *
 * @author Gabriel Wicke <gwicke@wikimedia.org>
 * @author Brion Vibber <brion@wikimedia.org>
 */
{
    /* Fixme: use static functions to separate module! Unfortunately, this
     * does not work:
     * var tu = require('./mediawiki.tokenizer.utils.js');
     * console.warn(tu.flatten([]));
     * Using exports in the module gets a bit further, but accesses to
     * tu.flatten in productions still fail. Thus, I just moved the functions
     * here until a solution is found:
     */

    /* Static utilities */

    // Flatten a list of lists.
    //var simple_flatten = function ( e ) {
    //    // Fast single-level flatten:
    //    //return [].concat.apply([], e);
    //    
    //    var es = [];
    //    // flatten sub-arrays
    //    for(var i = 0, length = e.length; i < length; i++) {
    //        var ei = e[i];
    //        if ($.isArray(ei))
    //            es = es.concat(flatten(ei));
    //        else
    //            es.push(ei);
    //    };
    //    return es;
    //};

    /**
     * Recursively flatten nested arrays into a single-level array.
     *
     * If the input contains no nested array it is returned as-is (the very
     * same object, not a copy), so already-flat lists cost no allocation.
     *
     * @param {Array} e: The (possibly nested) list to flatten
     * @returns {Array}: A flat list, or e itself if it was already flat
     */
    var flatten = function ( e ) {
        var flat = null, // stays null until the first nested array is seen
            item;
        for ( var idx = 0, count = e.length; idx < count; idx++ ) {
            item = e[idx];
            if ( item.constructor === Array ) {
                if ( flat === null ) {
                    // First nested array: carry over the flat prefix.
                    flat = idx > 0 ? e.slice( 0, idx ) : [];
                }
                flat = flat.concat( flatten( item ) );
            } else if ( flat !== null ) {
                flat.push( item );
            }
        }
        return flat === null ? e : flat;
    };

    
    /**
     * Flatten a nested list of strings and tokens with flatten_stringlist and
     * collapse the result to a bare string when a single string is all that
     * is left.
     *
     * @param {Array} c: Nested list of strings / tokens
     * @returns {String|Array}: The single string, or the flattened list
     */
    var flatten_string = function ( c ) {
        var merged = flatten_stringlist( c ),
            isSingleString = merged.length === 1 &&
                merged[0].constructor === String;
        return isSingleString ? merged[0] : merged;
    };

    /**
     * Flatten a nested list and merge each run of adjacent strings into a
     * single string. Non-string items are kept in order; empty strings
     * contribute nothing.
     *
     * @param {Array} c: Nested list of strings / tokens
     * @returns {Array}: Flat list without adjacent or empty strings
     */
    var flatten_stringlist = function ( c ) {
        var items = flatten( c ),
            merged = [],
            pending = '', // text collected since the last non-string item
            idx = 0,
            count = items.length,
            item;
        while ( idx < count ) {
            item = items[idx++];
            if ( item.constructor === String ) {
                pending += item;
                continue;
            }
            // A non-string item ends the current run of text.
            if ( pending !== '' ) {
                merged.push( pending );
                pending = '';
            }
            merged.push( item );
        }
        if ( pending !== '' ) {
            merged.push( pending );
        }
        return merged;
    };

    // Remove escaped quotes from attributes etc
    // This was in the original PEG parser, but could not find anything in
    // MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke
    //
    // Every backslash-escaped occurrence of quotec in text is replaced by the
    // bare quote character. split/join is used because String.replace with a
    // string pattern only replaces the first occurrence.
    var unquote = function (quotec, text) {
        return text.split('\\' + quotec).join(quotec);
    };

    // Decode html entities by letting the DOM library parse the entity into
    // a detached <div> and reading its text back. In a browser, this should
    // only be fed the entity, not untrusted html! XXX: replace with safer
    // version.
    var unentity = function ( entity ) {
        var filled = $("<div/>").html(entity);
        return filled.text();
    };

    // Debug print. Output is compiled out by the hard-wired switch below;
    // flip it to true to get the messages on console.warn.
    var dp = function ( msg ) {
        var debugEnabled = false;
        if ( debugEnabled ) {
            console.warn(msg);
        }
    };

    // Pretty-print a value as JSON with two-space indentation.
    var pp = function ( s ) {
        var indent = 2;
        return JSON.stringify(s, null, indent);
    };

    // Simple string formatting using '%s': each '%s' in format is replaced
    // by the next extra argument, or by '' once the arguments run out.
    var sprintf = function ( format ) {
        var values = Array.prototype.slice.call(arguments, 1),
            next = 0;
        return format.replace(/%s/g, function () {
            if ( next < values.length ) {
                return values[next++];
            }
            return '';
        });
    };


    /**
     * Determine if a string represents a valid ISBN-10 or ISBN-13 identifier
     *
     * All characters other than digits and 'X' are ignored. By default only
     * the number of remaining characters (10 or 13) is checked, because
     * checksum validation is stricter than the standard parser. Pass a truthy
     * `strict` to additionally verify the check digit (and, for ISBN-13, the
     * 978/979 prefix).
     *
     * @static
     * @method
     * @param {String} isbn: The string to check
     * @param {Boolean} [strict]: Also validate the checksum (default: false)
     * @returns {Boolean}: True if valid ISBN, false otherwise.
     */
    var isValidISBN = function ( isbn, strict ) {
        var i, checksum = 0;

        isbn = isbn.toUpperCase().replace(/[^\dX]/g, '');

        if ( isbn.length !== 10 && isbn.length !== 13 ) {
            return false;
        }
        if ( !strict ) {
            // Length-only check, matching the standard parser.
            return true;
        }

        if ( isbn.length === 10 ) {
            // Weights 10..2 for the first nine digits; the check character
            // may be 'X' (value 10). An 'X' anywhere else yields NaN and
            // therefore false.
            for (i = 0; i < 9; i++) {
                checksum += parseInt(isbn[i], 10) * (10 - i);
            }
            checksum += '0123456789X'.indexOf(isbn[9]);
            return (checksum % 11 === 0);
        }

        // ISBN-13: alternating weights 1 and 3 over all thirteen digits.
        for (i = 0; i < 13; i++) {
            checksum += parseInt(isbn[i], 10) * (i & 1 ? 3 : 1);
        }
        return (checksum % 10 === 0) && (/^97[89]/.test(isbn));
    };

    
    /* End static utilities */

    /*
     * Flags for specific parse environments (inside tables, links etc). Flags
     * trigger syntactic stops in the inline_breaks production, which
     * terminates inline and attribute matches. Flags merely reduce the number
     * of productions needed: The grammar is still context-free as the
     * productions can just be unrolled for all combinations of environments
     * at the cost of a much larger grammar.
     *
     * Two kinds of state are kept: `counters` (cumulative, nestable flags)
     * and `stacks` (nested, non-cumulative values). The mutators return a
     * fixed boolean so they can be used directly as the result of a semantic
     * predicate: inc/push return true, dec/pop return false.
     */
    function SyntaxStops () {
        this.counters = {};
        this.stacks = {};
    }

    /**
     * Increment the counter for a flag, creating it at 1 if needed.
     * @returns {Boolean}: Always true.
     */
    SyntaxStops.prototype.inc = function(flag) {
        if (this.counters[flag] !== undefined) {
            this.counters[flag]++;
        } else {
            this.counters[flag] = 1;
        }
        return true;
    };

    /**
     * Decrement the counter for a flag. Decrementing a flag that was never
     * incremented is a no-op: `undefined--` would store NaN, and a NaN
     * counter can never be made truthy again by inc().
     * @returns {Boolean}: Always false.
     */
    SyntaxStops.prototype.dec = function(flag) {
        if (this.counters[flag] !== undefined) {
            this.counters[flag]--;
        }
        return false;
    };

    /**
     * @returns {Number|undefined}: The current counter value for name, or
     * undefined if the flag was never incremented.
     */
    SyntaxStops.prototype.onCount = function ( name ) {
        return this.counters[name];
    };

    /**
     * A stack for nested, but not cumulative syntactic stops.
     * Example: '=' is allowed in values of template arguments, even if those
     * are nested in attribute names.
     *
     * @returns {Boolean}: Always true.
     */
    SyntaxStops.prototype.push = function ( name, value ) {
        if( this.stacks[name] === undefined ) {
            this.stacks[name] = [value];
        } else {
            this.stacks[name].push( value );
        }
        return true;
    };

    /**
     * Remove the innermost value for name. Throws (a string) if nothing was
     * ever pushed under that name.
     * @returns {Boolean}: Always false.
     */
    SyntaxStops.prototype.pop = function ( name ) {
        if( this.stacks[name] !== undefined ) {
            this.stacks[name].pop();
        } else {
            throw "SyntaxStops.pop: unknown stop for " + name;
        }
        return false;
    };

    /**
     * @returns The innermost value pushed for name, or false if the stack is
     * missing or empty.
     */
    SyntaxStops.prototype.onStack = function ( name ) {
        var stack = this.stacks[name];
        if ( stack === undefined || stack.length === 0 ) {
            return false;
        } else {
            return stack[stack.length - 1];
        }
    };

    // Shared syntactic-stop state, read and updated by the semantic
    // predicates and actions of the productions below.
    var stops = new SyntaxStops();

    // Start position of top-level block
    // (assigned by the toplevelblock production before each block is tried).
    // Could also provide positions for lower-level blocks using a stack.
    var blockStart = 0;

    // Start position of generic tag production
    var tagStartPos = 0;

    // Per-key stacks of source positions. push() remembers a start position
    // under a key; pop() pairs the most recent start position for that key
    // with the supplied end position.
    var posStack = {
        positions: {},
        // Remember pos under key. Always returns true so that it can be the
        // result of a semantic predicate.
        push: function( key, pos ) {
            var saved = this.positions[key];
            if ( saved !== undefined ) {
                saved.push( pos );
            } else {
                this.positions[key] = [pos];
            }
            return true;
        },
        // Return [start, pos] for the innermost start position stored under
        // key; throws (a string) if none is stored.
        pop: function( key, pos ) {
            var saved = this.positions[key];
            if ( saved !== undefined && saved.length ) {
                return [ saved.pop(), pos ];
            }
            throw "Tried to pop unknown position for " + key;
        }
    };

    // cache the input length (`input` is the string being parsed, provided
    // by the generated parser)
    var inputLength = input.length;

    // pseudo-production that matches at end of input: true only when pos is
    // exactly the length of the input
    var isEOF = function (pos) {
        return pos === inputLength;
    };

    // text start position
    var textStart = 0;

    // Define block-level tags in JS, so we can use toLowerCase to match tags
    // case-independently. This would be quite ugly (and possibly slower) if
    // done manually in the grammar.
    //
    // The result is a lookup object mapping each lower-case block tag name
    // to true.
    var block_names = (function () {
        var lookup = {};
        [ "p", "table", "td", "tr", "ul", "ol",
          "li", "dl", "dt", "dd", "div", "center",
          "blockquote" ].forEach( function ( tagName ) {
            lookup[tagName] = true;
        } );
        return lookup;
    })();

    // Keep a reference to the initializer's `this`.
    // NOTE(review): no use of `self` is visible in this part of the grammar.
    var self = this;

        
}

/*********************************************************
 * The top-level production
 *********************************************************/

// Matches the whole document: any number of toplevelblocks followed by
// trailing newlines. The blocks emit their own token chunks; this production
// only emits the final EOF token through the callback in __parseArgs[2] and
// returns an empty list.
start
  = e:toplevelblock* newline* { 
      // end is passed inline as a token, as well as a separate event for now.
      __parseArgs[2]( [ new EOFTk( ) ] );
      return []; //flatten(e);
  }


/*
 * A document (start production) is a sequence of toplevelblocks. Tokens are
 * emitted in chunks per toplevelblock to avoid buffering the full document.
 *
 * The leading predicate records the block's start offset in blockStart; the
 * action attaches [blockStart, pos] as dataAttribs.sourcePos to the first
 * token of the block and hands the chunk to the callback in __parseArgs[2].
 */
toplevelblock 
  = & { blockStart = pos; return true; } !eof b:block {
    b = flatten(b);

    // Add source offsets for round-tripping. XXX: Add these not just for
    // toplevelblocks!
    if ( b.length ) {
        var bs = b[0];
        // A primitive string cannot carry properties, so wrap it in a String
        // object before attaching dataAttribs.
        if ( bs.constructor === String && bs.attribs === undefined ) {
            b[0] = new String( bs );
            bs = b[0];
        }
        if (bs.dataAttribs === undefined) {
            bs.dataAttribs = {};
        }
        bs.dataAttribs.sourcePos = [blockStart, pos];
    }

    // Emit tokens for this toplevelblock. This feeds a chunk to the parser
    // pipeline.
    __parseArgs[2]( flatten( b ) );

    // We don't return any tokens to the start production to save memory. We
    // just emitted them already to our consumers.
    return true;
  }

/*
 * The actual contents of each block.
 *
 * Alternatives are tried in order: line-based constructs, constructs that
 * start with '<' (pre, a comment ending the line, nowiki, a block tag), a
 * paragraph, a plain inline line and finally a bare start-of-line.
 */
block
  = block_lines
    / & '<' r:( pre // tag variant can start anywhere
            / comment &eolf
            / nowiki
            // avoid a paragraph if we know that the line starts with a block tag
            / bt:block_tag { return [bt] } 
            ) { return r; }
    / paragraph
    // Inlineline includes generic tags; wrapped into paragraphs in token
    // transform and DOM postprocessor
    / inlineline
    / sol

/*
 * A block nested in other constructs. Avoid eating end delimiters for other
 * constructs by checking against inline_breaks first.
 * Returns the result of the block production unchanged.
 */
nested_block = !inline_breaks b:block { return b }

/*
 * Line-based block constructs.
 *
 * Matches a start-of-line, optionally one empty (space-only) line, and then
 * a block_line. The tokens of all three parts are concatenated in order.
 */
block_lines
  = s:sol
    // eat an empty line before the block
    s2:(os:optionalSpaceToken so:sol { return os.concat(so) })? 
    bl:block_line { 
        // An unmatched optional group yields '', which is mapped to [].
        var s2_ = (s2 !== '') ? s2 : [];
        return s.concat(s2_, bl); 
    }

/*
 * Block structures with start-of-line wiki syntax
 *
 * In order: headings, lists, (after optional leading space) table lines or
 * lines consisting only of block tags, indented pre (unless the 'nopre'
 * counter is set) and tag-based pre.
 */
block_line
  = h
  / lists
  / st:optionalSpaceToken 
    r:( & [{}|!] tl:table_lines { return tl; }
      // tag-only lines should not trigger pre either
      / bts:(bt:block_tag stl:optionalSpaceToken { return bt.concat(stl) })+ 
        &eolf { return bts }
      ) { 
          return st.concat(r); 
      }
  / ! { return stops.counters.nopre } pre_indent
  / pre

/*
 * A paragraph. We don't emit 'p' tokens to avoid issues with template
 * transclusions, <p> tags in the source and the like. Instead, we perform
 * some paragraph wrapping on the token stream and the DOM.
 *
 * Matches two start-of-line matches followed by an inline line and returns
 * their tokens concatenated.
 */
paragraph
  = s1:sol s2:sol c:inlineline { 
      return s1.concat(s2, /* [new TagTk('p')],*/ c); 
  }

// Optional spaces directly before a newline (the newline itself is not
// consumed); yields a self-closing 'br' token.
br = space* &newline { return new SelfclosingTagTk( 'br' ) }

/*
 * Syntax stops: Avoid eating significant tokens for higher-level productions
 * in nested inline productions.
 *
 * Repeated testing of flags is not terribly efficient. See new and faster
 * version below.
 */

/*
 * Syntax stops: Avoid eating significant tokens for higher-level productions
 * in nested inline productions.
 *
 * Pure lookahead (consumes nothing): succeeds when the next character is one
 * of the candidate delimiters and the inline_breaks function supplied in
 * __parseArgs[3] returns a non-null result for the current input position
 * and stop state.
 */
inline_breaks
  = & [=|!}{:\r\n\]<]
    & { // Important hack: disable caching for this production, as the default
        // cache key does not take into account flag states!
        cacheKey = ''; 
        //console.warn('ilbf: ' + input.substr(pos, 5) );
        return null !== __parseArgs[3].inline_breaks( input, pos, stops )
      } 

// Inline content that may span lines: one or more of urltext, an inline
// element, or any single character, stopping where inline_breaks matches.
// Adjacent strings in the result are merged.
inline
  = c:(urltext / (! inline_breaks (inline_element / . )))+ {
      //console.warn('inline out:' + pp(c));
      return flatten_stringlist( c );
}


// Like inline, but restricted to a single line: the fallback character class
// excludes '\n'. Adjacent strings in the result are merged.
inlineline
  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
      return flatten_stringlist( c );
}

// Inline constructs, selected by a one-character lookahead: '<' for comments
// and xml-ish tags, '{' for template arguments / templates, '[' for wiki,
// external and auto links, and "'" for quotes. A run of three opening
// brackets is returned as plain text.
inline_element
  = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
      & '<' ( comment / xmlish_tag )
    /// & '{' ( & '{{{{{' template / tplarg / template )
    / & '{' tplarg_or_template 
    /// & '{' ( tplarg / template )
     // Eat three opening brackets as text.
    / '[[[' { return '[[[' }
    / & '[' ( wikilink / extlink / autolink )
    / & "'" quote

/* Headings  */

/*
 * A heading line: a run of '=', inline content, a closing run of '=', then
 * optional trailing space / comments up to the end of the line. The heading
 * level is the shorter of the two '=' runs; surplus '=' on either side are
 * kept as literal text at the start / end of the heading content.
 *
 * The 'h' counter is incremented while the content is parsed and decremented
 * again on both the success and the failure path.
 */
h = & "=" // guard, to make sure '='+ will match. 
          // XXX: Also check to end to avoid inline parsing?
    r:(
     s:'='+ // moved in here to make s accessible to inner action
     & { return stops.inc('h'); }
     c:inlineline 
     e:'='+ 
     spc:(sp:space+ { return sp.join('') } / comment)*
     &eolf 
     {
        stops.dec('h');
        // s and e are arrays with one '=' entry per matched character (the
        // result of a '+' expression), so they have to be sliced and joined
        // rather than treated as strings.
        var level = Math.min(s.length, e.length),
            extras,
            lastIndex;
        // convert surplus equals into text
        if(s.length > level) {
            extras = s.slice(level).join('');
            if(c[0].constructor === String) {
                c[0] = extras + c[0];
            } else {
                c.unshift( extras );
            }
        }
        if(e.length > level) {
            extras = e.slice(level).join('');
            lastIndex = c.length - 1;
            if(c[lastIndex].constructor === String) {
                // Write back into the list; appending to a local copy of the
                // string would silently drop the surplus equals.
                c[lastIndex] = c[lastIndex] + extras;
            } else {
                c.push( extras );
            }
        }

        return [new TagTk( 'h' + level )]
                .concat(c, [new EndTagTk( 'h' + level ), spc]);
      }
    / & { /* dp('nomatch exit h'); */ stops.dec('h'); return false } { return null }
    ) { return r }

// An HTML comment (terminated by '-->' or end of input), followed by any
// number of further comments that each start on a following line. Returns
// the first CommentTk followed by the results of the following comments.
comments
  = '<!--' c:comment_chars* ('-->' / eof)
    cs:(space* newline space* cn:comment { return cn })* {
        return [new CommentTk( c.join('') )].concat(cs);
    }

// A single HTML comment, terminated by '-->' or by the end of input.
// Returns a one-element list holding a CommentTk with the comment text.
comment
  = '<!--' c:comment_chars* ('-->' / eof)
    {
        return [new CommentTk( c.join('') )];
    }

// One character of comment content: anything but '-', or a '-' that does not
// begin the '-->' terminator.
comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }


// Behavior switches
// One of the listed magic words wrapped in double underscores. Returns a
// one-element list with a 'behavior-switch' tag whose 'word' attribute is
// the matched word (without the underscores).
behavior_switch
  = '__' 
    behavior:(
        'NOTOC'
      / 'FORCETOC'
      / 'TOC'
      / 'NOEDITSECTION'
      / 'NEWSECTIONLINK'
      / 'NONEWSECTIONLINK'
      / 'NOGALLERY'
      / 'HIDDENCAT'
      / 'NOCONTENTCONVERT'
      / 'NOCC'
      / 'NOTITLECONVERT'
      / 'NOTC'
      / 'START'
      / 'END'    // XXX: Removed in 19213. Should we keep this?
      / 'INDEX'
      / 'NOINDEX'
      / 'STATICREDIRECT'
    )
    '__'
{
    return [
        new TagTk(
            'behavior-switch',
            [new KV('word', behavior)]
        )
    ];
}

/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

// Links recognised without bracket syntax: bare URLs, RFC/PMID references
// and ISBNs. Not attempted while the 'extlink' stop is set.
autolink
  = ! { return stops.onStack('extlink') }
    (urllink / autoref / isbn)

// A bare URL; returns a one-element list with a 'urllink' tag whose 'href'
// attribute is the parsed url result.
urllink
  = target:url {
      return [ new TagTk( 'urllink', [new KV('href', target)] ) ];
  }

// A bracketed external link: '[' target, optional link text, ']'.
// The 'extlink' stop is pushed as true while the link is parsed and popped
// again on both the success path and the failure path (second alternative,
// whose predicate always fails). An absent link text is marked with
// dataAttribs.gc = 1.
extlink
  = ! { return stops.onStack('extlink') } // extlink cannot be nested
    (
        "[" 
        & { return stops.push('extlink', true); }
        //target:urllink
        target:extlink_preprocessor_text
        text:(( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )* 
              t:inlineline { return t } )?
        "]" {
            stops.pop('extlink');
            //if ( text === '' ) {
            //    // XXX: Link numbering should be implemented in post-processor.
            //    text = [ "[" + linkCount + "]" ];
            //    linkCount++;
            //}
            //console.warn( 'extlink text: ' + pp( text ) );
            var dataAttribs = {};
            if ( text === '' ) {
                dataAttribs.gc = 1;
            }
            return [
                new SelfclosingTagTk( 'extlink', [
                    new KV('href', target),
                    new KV('content', text)
                ],
                dataAttribs)
            ];
        }
      / "[" & { return stops.pop('extlink'); }
    )

// 'RFC' or 'PMID', whitespace, and a run of digits. Returns a one-element
// list with an 'extlink' token whose href is the matching base URL with the
// number filled in and whose content is e.g. 'RFC 1234'.
autoref
  = ref:('RFC' / 'PMID') (newline / space)+ identifier:[0-9]+
{
    identifier = identifier.join('');

    var base_urls = {
            'RFC'  : '//tools.ietf.org/html/rfc%s',
            'PMID' : '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract'
        },
        url = sprintf(base_urls[ref], identifier);

    return [
        new SelfclosingTagTk( 'extlink', [
           new KV('href', url),
           new KV('content', [ref, identifier].join(' '))
        ] )
    ];
}

// 'ISBN', whitespace, then digits optionally separated by single dashes or
// spaces, with an optional trailing 'X'. Returns null if isValidISBN rejects
// the number; otherwise a 'wikilink' token pointing to Special:BookSources.
isbn
  = 'ISBN' (newline / space)+
    head:[0-9]
    digits:([- ]? [0-9])+
    tail:'X'?
{
    // digits is a list of [separator, digit] pairs; joining the nested list
    // inserts commas, which are stripped again here.
    var isbn = [head, digits, tail].join('').replace(/,/g, '');

    if (!isValidISBN(isbn)) {
        return null;
    }

    return [
        new SelfclosingTagTk( 'wikilink', [
           new KV('', 'Special:BookSources/' + isbn.replace(/[^\d]/g, '')),
           new KV('', 'ISBN ' + isbn),
           new KV('tail', ''),
        ] )
    ];
}


/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
 * these can be configured dynamically. */

// A single character out of '/', f, g, h, i, m, n, s, t, w, I, P, R.
// NOTE(review): these look like the first characters of the url_protocol
// alternatives plus those of ISBN / PMID / RFC, i.e. a cheap "could an
// autolink start here" check -- confirm against the callers, which are not
// visible in this part of the grammar.
url_chars = [/fghimnstwIPR]

// The URL schemes (including their ':' / '://' separator) that start a URL,
// plus '//' for protocol-relative URLs.
url_protocol
  = '//' // for protocol-relative URLs
  / 'ftp://'
  / 'git://'
  / 'gopher://'
  / 'http://'
  / 'https://'
  / 'irc://'
  / 'ircs://'  // @bug 28503
  / 'mailto:'
  / 'mms://'
  / 'news:'
  / 'nntp://' // @bug 3808 RFC 1738
  / 'svn://'
  / 'telnet://' // Well if we're going to support the above.. -ævar
  / 'worldwind://'

// javascript does not support unicode features..
// ..so the space characters are enumerated explicitly: the ASCII space plus
// a list of Unicode space code points.
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]


// A percent sign followed by two hex digits, passed through decodeURI.
// If decodeURI throws, null is returned to reject the match.
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] { 
    try {
        return decodeURI("%" + c0 + c1) 
    } catch ( e ) {
        // Reject the match, and allow other fall-back productions to have a
        // go at it.
        return null;
    }
}

//[^][<>"\\x00-\\x20\\x7F\p{Zs}]

// no punctuation, and '{<' to trigger directives
// Matches any single character except whitespace / control characters,
// brackets, quotes, '<', '>', ':', ',', '.', '&', '%', '{' and the listed
// Unicode space characters.
no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]

// A URL: protocol, optional IP address literal, and a non-empty path. The
// path is built from plain characters (up to the next inline break),
// '.', ':' or ',' that are not followed by space or end of line, comments,
// templates / template arguments, html entities and literal '&', '%', '{'
// (but never the entities for '<' and '>'). Returns a plain string when the
// whole URL is text, otherwise a list of strings and tokens.
url
  = proto:url_protocol 
    addr:( ipv6_address / ipv4_address )?
    path:(  ( !inline_breaks 
              c:no_punctuation_char
              { return c }
            )
            / s:[.:,] !(space / eolf) { return s } 
            / comment
            / tplarg_or_template
            / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" ) 
                r:(
                    htmlentity
                  / [&%{] 
                ) { return r }
         )+ 
{ 
    //console.warn( "path: " + pp( flatten_stringlist( [proto + addr].concat( path ) ) ) );
    return flatten_string( [proto + addr].concat( path ) ); 
}

// Four (possibly empty) runs of digits separated by three dots, returned as
// one string.
ipv4_address 
  = a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]* )
{
    return flatten( a ).join('');
}

// A bracketed IPv6 address literal, returned as one string including the
// brackets. Any non-empty run of hex digits, ':' and '.' is accepted between
// the brackets, which covers hexadecimal groups, '::' compression and an
// embedded IPv4 suffix; no further validation of the address is attempted.
ipv6_address 
  = a:('[' [0-9a-fA-F:.]+ ']')
{
    return flatten( a ).join('');
}


/**************************************************************
 * Templates, -arguments and wikilinks
 **************************************************************/

/*
 * Precedence: template arguments win over templates. See 
 * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
 * 4: {{{{·}}}} → {·{{{·}}}·}
 * 5: {{{{{·}}}}} → {{·{{{·}}}·}}
 * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
 * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
 *
 * The lookahead alternatives pick the interpretation for runs of seven, six
 * and five opening braces before falling back to a plain tplarg or template.
 */
tplarg_or_template 
    = & '{{{{{{{' '{' tplarg_or_template '}'
    / & ( '{{{' &'{{{' tplarg ) tplarg
    // tplarg in template
    / & ( '{{' &'{{{' tplarg )  template
    / tplarg 
    / template

// A template transclusion: '{{' target ('|' parameter)* '}}', with newlines,
// comments and spaces allowed around the delimiters. A '|' directly followed
// by another '|' yields an empty parameter. Returns a self-closing
// 'template' token whose first attribute holds the target.
template
  = "{{" nl_comment_space*
    target:template_param_value
    params:(nl_comment_space* "|"
                r:( nl_comment_space* &"|" { return new KV( '', '') } // empty argument
                    / p:template_param { return p } 
                  ) { return r } 
            )* 
    nl_comment_space* 
    "}}" {
      // Insert target as first positional attribute, so that it can be
      // generically expanded. The TemplateHandler then needs to shift it out
      // again.
      // NOTE(review): this entry is a plain { k, v } object while all other
      // parameters are KV instances -- confirm that consumers do not care.
      params.unshift( { k: flatten( target ), v: [] } );
      var obj = new SelfclosingTagTk( 'template', params );
      //console.warn( 'tokenizer template ' + pos + );
      //console.warn('template @' + pos + '::' + input.substr(pos, 40) );
      return obj;
  }

// XXX: support template and args in target!
//template_target
//  = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }

// A template argument: '{{{' optional name ('|' parameter)* '}}}'. A '|'
// directly before the closing braces yields an empty parameter. Returns a
// self-closing 'templatearg' token whose first attribute has the name as key.
tplarg 
  = "{{{" 
    name:template_param_value?
    params:( nl_comment_space* 
              '|' nl_comment_space*  
               r:( 
                    &'}}}' { return new KV( '', '') }
                    / p:template_param { return p } 
               ) { return r }
           )* 
    nl_comment_space* 
    "}}}" {
      name = flatten( name );
      params.unshift( new KV( name, '' ) );
      var obj = new SelfclosingTagTk( 'templatearg', params );
      //console.warn( 'tokenizer tplarg ' + JSON.stringify( obj, null, 2 ));
      //console.warn('template arg @' + pos + '::' + input.substr(pos, 40) );
      return obj;
  }

// A single template parameter. Three shapes are produced:
//   name = value  ->  KV( name, value )
//   name =        ->  KV( name, [] )
//   text          ->  KV( [], text )     (positional parameter)
// The second alternative yields KV( [], [] ) for an empty parameter, i.e.
// when the next character is '|' or '}'.
template_param
  = name:template_param_name 
    // MW accepts |foo | = bar | as a single param..
    ('|' (space / newline)* &'=')?
    val:(
        s0:space* 
        "="
        s1:space*
        value:template_param_value? {
            return { s0: s0, s1: s1, value: value };
        }
        )?

    {
      //console.warn( 'named template_param matched' + pp([name, value ]) );
      if ( val !== '' ) {
          if ( val.value !== '' ) {
            return new KV( name, flatten( val.value ) );
          } else {
            return new KV(flatten( name ), []);
          }
      } else {
        return new KV([], flatten(name));
      }
    } 
  // empty parameter
  / & [|}] { return new KV([], []); }


// FIXME: handle template args and templates in key! (or even parser functions?)
// The name part of a template parameter, parsed with the 'equal' stop pushed
// as true. An empty name is accepted when the next character is '='. The
// stop is popped on success and, through the always-failing second
// alternative, on failure.
template_param_name
  = & { return stops.push( 'equal', true ) }
    tpt:(template_param_text / &'=' { return '' })
    {
        stops.pop( 'equal' );
        //console.warn( 'template param name matched: ' + pp( tpt ) );
        return tpt;
    }

  / & { return stops.pop( 'equal' ) }
  //= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }

// The value part of a template parameter, parsed with the 'nopre' counter
// incremented and the 'equal' stop pushed as false. Both are undone on
// success and, through the always-failing second alternative, on failure.
template_param_value
  = & { stops.inc( 'nopre' ); return stops.push( 'equal', false ) }
    tpt:template_param_text
    {
        stops.dec( 'nopre' );
        stops.pop( 'equal' );
        //console.warn( 'template param value matched: ' + pp( tpt ) );
        return tpt;
    }
  / & { stops.dec( 'nopre' ); return stops.pop( 'equal' ) }

// The content of a template parameter name or value: one or more nested
// blocks, parsed with the 'table' and 'extlink' stops pushed as false and
// the 'template' counter incremented. Leading newline tokens and space-only
// strings are stripped. Returns a plain string if only a single string is
// left, otherwise the token list. All stop changes are undone on success
// and, through the always-failing second alternative, on failure.
template_param_text
  = & { /*console.warn( 'tpt: ' + 
          input.substr( pos - 10, 9) + 
          input[pos].green +
          input.substr( pos +1, 9) ); */
        // re-enable tables within template parameters
        stops.push('table', false ); 
        stops.push('extlink', false);
        return stops.inc('template') 
    }
    il:nested_block+ {
        stops.pop('table');
        stops.pop('extlink');
        stops.dec('template');
        //console.warn( 'tpt match: ' + pp (il) + " stops: " + pp(stops));
        var r = flatten( il ),
            firstToken = r[0];

        // Strip leading newlines and space-only text tokens. 
        // XXX: Will need to round-trip these eventually.
        while ( 
                firstToken !== undefined &&
                ( firstToken.constructor === NlTk || 
                  firstToken.constructor === String && firstToken.match( /^[\t ]+$/ ) )
              )
        {
            r.shift();
            firstToken = r[0];
        }
        //console.warn('tpl param name' + pos + r[0] + '::' + input.substr(pos, 40) );
        if ( r.length === 1 && r[0].constructor === String ) {
            r = r[0];
        }

        return r;
    }
  / & { stops.pop('table'); stops.pop('extlink'); return stops.dec('template'); }


// TODO: handle link prefixes as in al[[Razi]]
// An internal link: '[[' target ('|' link text)* ']]' plus a directly
// attached tail. The target must not parse as a url. Source positions of
// the whole link and of its content part are recorded through posStack and
// stored in dataAttribs. Returns a one-element list with a self-closing
// 'wikilink' token whose attributes are the target, one entry per content
// part, and the tail.
wikilink
  = & { return posStack.push('wikilink' , pos); }
    "[[" 
    ! url
    //target:link_target 
    // XXX: disallow pipe!
    target:wikilink_preprocessor_text
    lcontent:( 
               & { return posStack.push('lcontent' , pos); }
               lcs:( pipe lt:link_text { return new KV( '', lt ); } )+ { 
                   return { pos: posStack.pop('lcontent' , pos), content: lcs };
               }
            
             // No content: still pop the position pushed by the predicate
             // of the first alternative.
             / { 
                 return { pos: posStack.pop('lcontent' , pos), content: [] };
               }
             )
    "]]" 
    // XXX In real MediaWiki, this is a language-dependent positive character
    // class. Can we work out a static negative class instead?
    // XXX: Exclude uppercase chars from non-latin languages too!
    tail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* {
      var obj = new SelfclosingTagTk( 'wikilink' ),
          textTokens = [];
      obj.attribs.push( new KV('', target) );
      obj.dataAttribs = { 
          sourcePos: posStack.pop( 'wikilink', pos ),
          contentPos: lcontent.pos
      };
      // XXX: Point to object with path, revision and input information
      //obj.source = input;

      //console.warn('lcontent: ' + JSON.stringify( lcontent, null, 2 ) );
      // Deal with content. XXX: Properly support pipe-trick etc
      //lcontent.tail = tail && tail.join('') || '';

      obj.attribs = obj.attribs.concat( lcontent.content );
      obj.attribs.push( new KV( 'tail', tail && tail.join('') || '' ) );
      //console.warn( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a' )])) );
      return [obj];
  }
  // Failure path: pop the 'wikilink' position again. pop returns an array,
  // so the negated predicate always fails.
  / ! { return posStack.pop( 'wikilink', pos ); }

// The text part of a link, parsed with the 'linkdesc' counter incremented.
// An '=' that stopped the inline match, and the inline content after it, are
// appended to the result. The counter is decremented on success and, through
// the always-failing second alternative, on failure.
link_text
  = & { return stops.inc('linkdesc'); }
    h:inline 
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline?)?
    { 
        //console.warn('link_text' + pp(h) + pp(hs));
        stops.dec('linkdesc');
        if( hs !== '' ) {
            return h.concat(hs);
        } else {
            return h;
        }
    }
  / & { return stops.dec('linkdesc'); }

// Like link_text, but additionally parsed with the 'pipe' counter
// incremented, and the content after an '=' is mandatory. Both counters are
// decremented on success and, through the always-failing second alternative,
// on failure.
link_option
  = & { stops.inc('pipe'); return stops.inc('linkdesc'); }
    h:inline 
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline)?
    { 
        //console.warn('link_text' + pp(h) + pp(hs));
        stops.dec('pipe');
        stops.dec('linkdesc');
        if( hs !== '' ) {
            return h.concat(hs);
        } else {
            return h;
        }
    }
  / & { stops.dec('pipe'); return stops.dec('linkdesc'); }

// The closing delimiter of a wikilink.
link_end = "]]"

/* Generic quote production for italic and bold, further processed in a token
 * stream transformation in doQuotes. Relies on NlTk tokens being emitted
 * for each line of text to balance quotes per line.
 * 
 * We are not using a simple pair rule here as we need to support mis-nested
 * bolds/italics and MediaWiki's special heuristics for apostrophes, which are
 * all not context free.
 *
 * Matches two or more apostrophes and returns an 'mw-quote' tag whose value
 * property holds the complete apostrophe run. */
quote = "''" x:"'"* {
	var res = new TagTk( 'mw-quote' ); // Will be consumed in token transforms
    res.value = "''" + x.join('');
    return res;
}


/**
 * Image option productions, only called from the LinkHandler token stream
 * transformer, and only for images.
 *
 * Parses any number of image options with the 'pipe' counter incremented and
 * returns an object mapping each option key to its value (a later option
 * with the same key overwrites an earlier one). The ordered list of all
 * options is available under the _options key.
 */
img_options = 
  & { return stops.inc( 'pipe' ); }
  os:img_option* {
    stops.dec( 'pipe' );
	var options = {};
    os = flatten( os );
	for ( var i = 0, l = os.length; i < l; i++ ) {
		var o = os[i];
		options[o.k] = o.v;
	}
	options._options = os;
	return options;
}
/ & { return stops.dec( 'pipe' ); }

// A single image option: a pipe, optional spaces and one of the specific
// option productions. Anything that matches none of them becomes the
// caption.
img_option 
  = pipe space*
  o:(  
      img_attribute
    / img_format
    / img_dimensions
    / img_halign
    / img_valign
    / img_link
    / lt:link_text { return new KV('caption', lt) }
  )
  space* { 
      return o 
  };

// One of the image format keywords, returned as KV( 'format', keyword ).
// Longer keywords are listed before their prefixes ('frameless' before
// 'frame', 'thumbnail' before 'thumb').
img_format
  = f:( 'border' / 'frameless' / 'frame' / 'thumbnail' / 'thumb' ) {
      return new KV( 'format', f );
  }

// Image dimensions: '<width>px', 'x<height>px' or '<width>x<height>px', or
// the keyword 'upright'. Returns a single KV when only one dimension is
// given and a list of two KVs when both are. A bare 'px' without any number
// is rejected (null), so that it can be picked up by a later img_option
// alternative instead of producing empty width and height options.
img_dimensions 
  = x:(n:[0-9]+ { return n.join('') })? y:('x' n:[0-9]+ { return n.join('') })? 'px' {
      // An unmatched optional part is ''.
      if ( x === '' && y === '' ) {
          return null;
      }
      if ( x === '' ) {
          return new KV( 'height', y );
      }
      if ( y === '' ) {
          return new KV( 'width', x );
      }
      return [ new KV( 'width', x ), new KV( 'height', y ) ];
  }
  / 'upright' { return [ new KV( 'width', 'upright' ) ] }

// One of the horizontal alignment keywords, returned as
// KV( 'halign', keyword ).
img_halign
  = a:( 'left' / 'right' / 'center' / 'none' ) {
      return new KV( 'halign', a );
  }

// One of the vertical alignment keywords, returned as
// KV( 'valign', keyword ).
img_valign
  = a:( 'baseline' / 'sub' / 'super' / 'top' / 'text-top' 
          / 'middle' / 'bottom' / 'text-bottom' ) {
      return new KV( 'valign', a );
  }

// External link targets are already parsed as link tokens. If we can make
// sure that those cleanly convert back to the original text, then we could
// re-parse them here.
//
// 'link=' followed by a URL, returned as KV( 'link', url ). The 'pipe'
// counter is incremented while the URL is parsed and decremented again on
// success and, through the always-failing second alternative, on failure.
// Without the increment the two decrements were unbalanced and switched off
// the pipe stop of the enclosing img_options for all following options.
img_link
  = 'link=' space* 
    u:( 
        & { return stops.inc( 'pipe' ); }
        t:url {
            stops.dec( 'pipe' );
            return t;
        }
        / & { return stops.dec( 'pipe' ); }
      ) 
{
      return new KV( 'link', u );
}

// A key=value image option for one of the listed keys, returned as
// KV( key, value ). The value is optional.
img_attribute
  = k:( 'page' / 'alt' / 'thumbnail' / 'thumb' ) '=' t:preprocessor_text? {
      return new KV( k, t );
  }


/***********************************************************
 * Pre and xmlish tags
 ***********************************************************/

// Indented pre blocks differ from their non-indented (purely tag-based)
// cousins by having their contents parsed.
//
// Either an indented block wrapped in explicit pre tags, or one or more
// consecutive indented lines, which are wrapped in a 'pre' start / end tag
// pair.
pre_indent 
  = pre_indent_in_tags
  / l:pre_indent_line ls:(sol pre_indent_line)* {
      return [new TagTk( 'pre' )]
                    .concat( [l], ls
                           , [new EndTagTk( 'pre' )]);
  }

// An indented pre block that is surrounded with pre tags. The pre tags are
// used directly.
// An indented pre block that is surrounded with pre tags. The pre tags are
// used directly: the result is the opening pre token (with the parsed
// attributes), the content tokens, and the closing pre token.
//
// The 'pre' stop counter is only touched inside the inner group, following
// the structure of table_lines: it is raised right before the content is
// parsed and lowered again either when the content matched or in the group's
// own fall-back alternative. Previously the fall-back decrement sat at rule
// level and therefore also ran when the rule failed *before* the increment
// (for example when the input did not start with whitespace and "<pre"),
// leaving the counter unbalanced.
pre_indent_in_tags
  = space+ // XXX: capture space for round-tripping
    "<pre" 
    attribs:generic_attribute* 
    ">"
    body:(
        & { return stops.inc('pre'); }
        l:inlineline
        ls:(sol pre_indent_line)*
        "</pre>"
        {
            stops.dec('pre');
            return [].concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );
        }
        // Content did not match: undo the increment and fail the group.
      / & { return stops.dec('pre'); }
    )
  {
    return [ new TagTk( 'pre', attribs ) ].concat( body );
  }

// One line of an indented pre block: leading whitespace followed by inline
// content. The result starts with a newline string, followed by the content.
pre_indent_line
  = space content:inlineline {
      var lineStart = [ '\n' ];
      return lineStart.concat( content );
  }

/*
 * Pre blocks defined using non-indented HTML tags only parse nowiki tags
 * inside them, and convert other content to verbatim text. Nowiki inside pre
 * is not functionally needed, but supported for backwards compatibility.
 */
// Tag-based (non-indented) pre block. Content is collected as plain text
// chunks, nowiki results and single characters until "</pre>" or the end of
// input. A stray "</pre>" is returned as literal text.
pre
  = "<pre" 
    attribs:generic_attribute* 
    ">"
    // MediaWiki <pre> is special in that it converts all pre content to plain
    // text.
    content:(
          chars:[^<]+ { return chars.join(''); }
        / nowiki 
        / !"</pre>" ch:. { return ch; }
    )+ 
    ("</pre>" / eof) {
        // return nowiki tags as well?
        var open = new TagTk( 'pre', attribs, { stx: 'html' } );
        var close = new EndTagTk( 'pre' );
        return [ open ].concat( content, [ close ] );
    }
  / "</pre>" { return "</pre>"; }

/* XXX: Extension tags can require a change in the tokenizer mode, which
 * returns any text between extension tags verbatim. For now, we simply
 * continue to parse the contained text and return the tokens. The original
 * input source can be recovered from the source positions added on tag
 * tokens. This won't however work in all cases. For example, a comment start
 * (<!--) between extension tags would cause the remaining text to be consumed
 * as a comment. To avoid this, we might need to look ahead for the end tag
 * and limit the content parsing to this section. */

// Any XML-like tag: nowiki is tried first, then the generic tag production.
xmlish_tag = nowiki / generic_tag

/*
 * Nowiki treats anything inside it as plain text. It could thus also be
 * defined as an extension that returns its raw input text, possibly wrapped
 * in a span for round-trip information. The special treatment for nowiki in
 * pre blocks would still remain in the grammar though, so overall handling it
 * all here is cleaner.
 */
// A complete <nowiki>...</nowiki> pair yields the plain-text content wrapped
// in two self-closing 'meta' marker tokens. An unmatched opening or closing
// tag is returned as literal text.
nowiki 
  = "<nowiki>" nc:nowiki_content "</nowiki>" { 
        // Build a meta marker token for the given tag content value.
        var marker = function ( content ) {
            return new SelfclosingTagTk( 'meta', [
                {k: 'typeof', v: 'mw:tag'},
                {k: 'content', v: content}
            ] );
        };
        var open = marker( 'nowiki' );
        var close = marker( '/nowiki' );
        return [ open ].concat( nc, [ close ] );
    }
  / "<nowiki>" { 
        return [ '<nowiki>' ]; 
    }
  / "</nowiki>" { 
        return [ '</nowiki>' ]; 
    }

// Raw text inside nowiki. Returns a one-element list holding the joined text.
// A nested <pre ...>...</pre> is re-serialized to its source text; any other
// character is taken verbatim unless it starts "</nowiki>" or "</pre>".
nowiki_content
  = parts:(
          chars:[^<]+ { return chars.join(''); }
        / "<pre" sp:space* attrs:[^>]* ">" inner:nowiki_content "</pre>" {
              return "<pre" + sp.join('') + attrs.join('') + ">"
                      + inner.join('') + "</pre>";
          }
        / (!("</"( "nowiki>" / "pre>")) ch:. {
              return ch;
          })
    )* {
        // return nowiki tags as well?
        var text = parts.join('');
        return [ text ];
    }


// The list of HTML5 tags, mainly used for the identification of *non*-html
// tags. Non-html tags terminate otherwise tag-eating productions (see list
// below) in order to support potential extension tags.
// Note: this is an ordered choice of string literals, so the first literal
// that matches at the current position wins (e.g. "a" is tried before
// "abbr"). The rule therefore matches a tag-name *prefix* and does not by
// itself check that the name ends there.
html5_tagnames 
  = "a" / "abbr" / "address" / "area" / "article" 
    / "aside" / "audio" / "b" / "base" / "bdi" / "bdo" / "blockquote" 
    / "body" / "br" / "button" / "canvas" / "caption" / "cite" / "code" 
    / "col" / "colgroup" / "command" / "data" / "datalist" / "dd" / "del" 
    / "details" / "dfn" / "div" / "dl" / "dt" / "em" / "embed" / "fieldset" 
    / "figcaption" / "figure" / "footer" / "form" 
    / "h1" / "h2" / "h3" / "h4" / "h5" / "h6" / "head" / "header" / "hgroup" 
    / "hr" / "html" / "i" / "iframe" / "img" / "input" / "ins" / "kbd" / "keygen" 
    / "label" / "legend" / "li" / "link" / "map" / "mark" / "menu" / "meta" 
    / "meter" / "nav" / "noscript" / "object" / "ol" / "optgroup" / "option" 
    / "output" / "p" / "param" / "pre" / "progress" / "q" / "rp" / "rt" 
    / "ruby" / "s" / "samp" / "script" / "section" / "select" / "small" 
    / "source" / "span" / "strong" / "style" / "sub" / "summary" / "sup" 
    / "table" / "tbody" / "td" / "textarea" / "tfoot" / "th" / "thead" / "time" 
    / "title" / "tr" / "track" / "u" / "ul" / "var" / "video" / "wbr"

// Additional tag names accepted alongside html5_tagnames (see
// nonhtml_endtag, which excludes both lists).
html_oldnames = "center" / "font" / "tt"

// Simple match on non-html endtags, for use in inline_breaks
// Matches '</name ...>' where the text after '</' does not start with one of
// the names in html5_tagnames / html_oldnames. The match has no action and is
// meant to be used as a test only.
//
// NOTE(review): the negative lookahead only checks a prefix. Because the name
// lists contain one-letter names such as "p", "s" and "i", an end tag like
// '</poem>' or '</syntaxhighlight>' is rejected here even though it is not an
// HTML tag. Confirm whether callers depend on this before tightening it.
nonhtml_endtag
  = '</' 
    ! ( html5_tagnames / html_oldnames )
    [^ >]+
    ( space / newline ) *
    [^>]*
    '>'

/* Generic XML-like tags
 * 
 * These also cover extensions (including Cite), which will hook into the
 * token stream for further processing. The content of extension tags is
 * parsed as regular inline, but the source positions of the tag are added 
 * to allow reconstructing the unparsed text from the input. */

// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
// Generic start, end or self-closing tag. Produces a single token of the
// matching kind, annotated with the tag's source range and stx = 'html'.
generic_tag
  = "<" 
    // Remember the position right after '<'; the action subtracts one to get
    // the position of the '<' itself.
    & { tagStartPos = pos; return true; }
    end:"/"? name:[0-9a-zA-Z]+ 
    attribs:generic_newline_attribute* 
    ( space / newline ) *
    selfclose:"/"?
    ">" {
        var tagName = name.join('');

        // A leading slash wins over a trailing one.
        var TokenType = TagTk;
        if ( end !== '' ) {
            TokenType = EndTagTk;
        } else if ( selfclose !== '' ) {
            TokenType = SelfclosingTagTk;
        }
        var tok = new TokenType( tagName, attribs );

        if ( tok.dataAttribs === undefined ) {
            tok.dataAttribs = {};
        }
        var data = tok.dataAttribs;
        data.sourceTagPos = [ tagStartPos - 1, pos ];
        data.stx = 'html';
        return tok;
    }

// A generic attribute that can span multiple lines.
// A generic attribute that can span multiple lines: optional whitespace or
// newlines, a name, and an optional '=value' part. A missing value results in
// an empty-string value.
generic_newline_attribute
  = ( space / newline )* 
    name:generic_attribute_name
    value:(
        ( space / newline )*
        v:generic_attribute_newline_value { return v; }
    )?
{
    // An unmatched optional value is already the empty string, so it can be
    // passed on unchanged.
    return new KV( name, value );
}

// A single-line attribute.
// A single-line attribute: optional spaces, a name, and an optional '=value'
// part. A missing value results in an empty-string value.
generic_attribute
  = space* 
    name:generic_attribute_name
    value:(
        space*
        v:generic_attribute_value { return v; }
    )?
{
    // FIXME: name might just be a template, which can expand to a key-value
    // pair later. We'll need to handle that in the AttributeTransformManager.
    //
    // An unmatched optional value is already the empty string, so it can be
    // passed on unchanged.
    return new KV( name, value );
}

// ( Replaced by generic_attribute_name for template / parameter support. )
//// http://dev.w3.org/html5/spec/Overview.html#attributes-0, and we also
//// disallow newlines, | and {.
//generic_attribute_plain_name
//  = n:[^ \t\0/"'>=\n|{]+ {
//        return n.join('');
//  }

// Attribute name, parsed as single-line preprocessor text so that templates
// and parameters are supported. Fails on a self-closing '/>' tag end.
generic_attribute_name
  // Push an 'equal' stop for the duration of the name; it is popped again in
  // the action below or in the fall-back alternative.
  = & { return stops.push( 'equal', true ) }
  ! '/>'
  name:attribute_preprocessor_text_line
    {
        stops.pop( 'equal' );
        //console.warn( 'generic attribute name: ' + pp( name ) );
        return name;
    }
  // Fall-back when the name did not match: undo the push.
  // NOTE(review): presumably stops.pop() returns a falsy value here so that
  // the rule as a whole still fails -- confirm against the stops code.
  / & { return stops.pop( 'equal' ) }

// A generic attribute, possibly spanning multiple lines.
// '=' followed by an attribute value; whitespace and newlines are allowed
// between the '=' and the value. Returns the value only.
generic_attribute_newline_value
  = "=" (space / newline )* v:xml_att_value { 
      return v; 
  }
// A generic but single-line attribute.
// Only spaces are allowed between the '=' and the value. Returns the value
// only.
generic_attribute_value
  = "=" space* v:att_value { 
      return v; 
  }

// Attribute value, quoted variants can span multiple lines.
// Tries a single-quoted value, then a double-quoted value, then an unquoted
// value. The quotes themselves are not part of the returned value.
xml_att_value
  = "'" t:attribute_preprocessor_text_single "'" { return t; }
  / '"' t:attribute_preprocessor_text_double '"' { return t; }
  / attribute_preprocessor_text

// Attribute value, restricted to a single line.
// Tries a single-quoted value, then a double-quoted value, then an unquoted
// value, each using the '_line' text variants. The quotes themselves are not
// part of the returned value.
att_value
  = "'" t:attribute_preprocessor_text_single_line "'" { return t; }
  / '"' t:attribute_preprocessor_text_double_line '"' { return t; }
  / attribute_preprocessor_text_line

/*
 * A variant of generic_tag, but also checks if the tag name is a block-level
 * tag as defined in
 * http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and following
 * paragraphs.
 */
// Like generic_tag, but only succeeds for names registered in block_names
// (looked up in lower case). Returns a one-element token list.
block_tag
  = "<" end:"/"? 
    name:(cs:[a-zA-Z]+ { return cs.join(''); })
    attribs:generic_newline_attribute* 
    ( space / newline ) *
    selfclose:"/"?
    ">" {
        var isBlock = block_names[name.toLowerCase()] === true;
        if ( !isBlock ) {
            // abort match if tag is not block-level
            return null;
        }

        // A leading slash wins over a trailing one.
        var TokenType = TagTk;
        if ( end !== '' ) {
            TokenType = EndTagTk;
        } else if ( selfclose !== '' ) {
            TokenType = SelfclosingTagTk;
        }
        return [ new TokenType( name, attribs ) ];
    }


/*********************************************************
 *   Lists 
 *********************************************************/
// One or more list lines. Each line is tried as a definition-list line
// (dtdd) first and as a plain list item second; following lines must begin
// at a start-of-line position.
lists = (dtdd / li) (sol (dtdd / li))*

// A single list line: one or more bullet characters, optional inline content,
// then end of line or input (not consumed). Returns the 'listItem' token,
// carrying the matched bullet characters, followed by the content.
li = bullets:list_char+ 
     c:inlineline?
     &eolf 
{
    var item = new TagTk( 'listItem' );
    item.bullets = bullets;
    // An unmatched optional content part is replaced by an empty list.
    var content = ( c === '' ) ? [] : c;
    return [ item, content ];
}

// Definition list line of the form '<bullets>;term:definition'. Emits a
// 'listItem' token whose bullets end in ';', the term content, a second
// 'listItem' token whose bullets end in ':', and the definition content (or
// a newline token when there is none).
dtdd 
  // Leading bullets: any list characters up to, but not including, the last
  // ';' (a ';' that is not followed by another list character).
  = bullets:(!(";" !list_char) lc:list_char { return lc })*
    ";"
    // Raise the 'colon' counter while the term is parsed.
    & {return stops.inc('colon');}
    c:inlineline
    ":"
    // Fortunately dtdds cannot be nested, so we can simply set the flag
    // back to 0 to disable it.
    & { stops.counters['colon'] = 0; return true;}
    d:inlineline?
    &eolf {
        // Convert trailing space into &nbsp;
        // XXX: This should be moved to a serializer
        //var clen = c.length;
        //if (clen && c[clen - 1].constructor === String) {
        //    var val = c[clen - 1].value;
        //    if(val.length && val[val.length - 1] == ' ') {
        //        c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
        //    }
        //}
        
        bullets = bullets.join('');
        var li = new TagTk( 'listItem' );
        li.bullets = bullets + ";";
        var li2 = new TagTk( 'listItem' );
        li2.bullets = bullets + ":";

        // d is the empty string when no definition text followed the colon.
        return [ li ].concat( c, [ li2 ], d || new NlTk() );
    }
  // Fall-back case to clear the colon flag
  // The action returns null, so this alternative never produces a match.
  / & { return true; } { stops.counters['colon'] = 0; return null; }


// The characters that can appear in a list line's bullet prefix.
list_char = [*#:;]


/*********************************************************************
 * Tables 
 *
 * Table productions are geared to support independent parsing of fragments in
 * templates (the common table start / row / table end use case). The tokens
 * produced by these fragments then match up to a table while building the
 * DOM tree. For similar reasons, table rows do not emit explicit end tag
 * tokens.
 *
 * The separate table_lines production is faster than moving those productions
 * directly to block_lines.
 *********************************************************************/

// One or more consecutive table lines. Only attempted when the current
// position is not an inline break, or when it starts with '{{!}}'. Returns
// the concatenation of the first line's tokens and the following lines.
table_lines
  = //& { console.warn('enter table_lines: ' + input.substr(pos, 20)); return true; }
    (! inline_breaks / & '{{!}}' )  r:(
        // Push a 'table' stop while the lines are parsed; it is popped again
        // in the action below or in the fall-back alternative.
        & { return stops.push('table', true); }
        tl:table_line 
        tls:( s:sol tl2:table_line { return s.concat(tl2); } )* {
            stops.pop('table');
            //console.warn('table_lines: ' + pp(tl.concat(tls)));
            return tl.concat( tls );
        }
      // Fall-back when no table line matched: undo the push.
      / & { return stops.pop('table'); }
    ) { return r }

// This production assumes start-of-line position!
// A single table line; the alternatives are tried in the order listed.
table_line
  = table_start_tag
  / table_row_tag
  / table_data_tags
  / table_heading_tags
  / table_caption_tag
  / table_end_tag

// Table start ('{' followed by a pipe) with optional attributes. Returns the
// 'table' start token, followed by the table end token(s) if the table is
// closed on the same line.
table_start_tag
  = "{" pipe
    ta:generic_attribute* 
    space*
    te:table_end_tag? // can occur somewhere in the middle of the line too
    { 
        var tableTk = new TagTk( 'table' );
        if ( ta ) {
            tableTk.attribs = ta;
        }
        var toks = [ tableTk ];
        return te ? toks.concat( te ) : toks;
    }

// Table caption line: a pipe, '+', then inline content. The content is
// wrapped in a caption start / end token pair.
table_caption_tag
  = pipe "+" 
    content:inline* { 
        var open = [ new TagTk( 'caption' ) ];
        var close = [ new EndTagTk( 'caption' ) ];
        return open.concat( content, close );
    }


// Table row line: a pipe, '-', then optional attributes. Returns the 'tr'
// start token, optionally followed by the tokens of an unmarked cell on the
// next line.
table_row_tag
  = pipe "-"
    a:generic_attribute*
    space*
    // handle tables with missing table cells after a row
    td:( s:sol !( pipe / "!" ) tdt:table_data_tag { return s.concat(tdt); } )?
    { 
        // We rely on our tree builder to close the row as needed. This is
        // needed to support building tables from fragment templates with
        // individual cells or rows.
        var rowTokens = [ new TagTk( 'tr', a ) ];
        return td ? rowTokens.concat( td ) : rowTokens;
    }

// One or more table cells on a single line: a pipe, the first cell, then any
// number of further cells separated by a double pipe. Cells that follow a
// double pipe are marked with dataAttribs.stx_v = 'row'.
table_data_tags
  = pipe
    td:table_data_tag
    tds:( pipe_pipe tdt:table_data_tag {
        var cellToken = tdt[0];
        // Same guard as in generic_tag: do not assume that the token
        // constructor has already created the dataAttribs object.
        if ( cellToken.dataAttribs === undefined ) {
            cellToken.dataAttribs = {};
        }
        cellToken.dataAttribs.stx_v = 'row';
        return tdt;
    } )* {
        return td.concat(tds);
    }

// A single table cell: optional cell attributes followed by nested block
// content. Must not start with '}', '+' or '-'. Returns the 'td' start token
// followed by the content; no end token is emitted.
table_data_tag
  = ! [}+-]
    a:table_cell_args?
    // use inline_breaks to break on tr etc
    content:nested_block* 
    { 
        // An unmatched optional attribute part is replaced by an empty list.
        var attribs = ( a === '' ) ? [] : a;
        var cell = [ new TagTk( 'td', attribs ) ];
        return cell.concat( content );
    }

// One or more table heading cells on a single line: '!', the first heading,
// then any number of further headings separated by '!!'. Headings that follow
// a '!!' are marked with dataAttribs.stx_v = 'row'.
table_heading_tags
  = //& { console.warn( 'th enter @' + input.substr(pos, 10)); return true; }
    "!"
    th:table_heading_tag
    ths:( "!!" tht:table_heading_tag {
        var headingToken = tht[0];
        // Same guard as in generic_tag: do not assume that the token
        // constructor has already created the dataAttribs object.
        if ( headingToken.dataAttribs === undefined ) {
            headingToken.dataAttribs = {};
        }
        headingToken.dataAttribs.stx_v = 'row';
        return tht;
    } )* {
        //console.warn( 'thts: ' + pp(th.concat(ths)));
        return th.concat(ths);
    }

// A single table heading cell: optional cell attributes followed by nested
// block content. Returns the 'th' start token followed by the content; no end
// token is emitted.
table_heading_tag
  = a:table_cell_args?
    content:nested_block* {
        // An unmatched optional attribute part is replaced by an empty list.
        var attribs = ( a === '' ) ? [] : a;
        var heading = [ new TagTk( 'th', attribs ) ];
        return heading.concat( content );
    }

// Table end: optional spaces, a pipe, then '}'. Returns a one-element list
// with the 'table' end token.
table_end_tag
  = space* pipe "}" { 
      return [ new EndTagTk( 'table' ) ];
  }

// Attributes of a table cell: any number of single-line attributes followed
// by a single pipe (not a double one). Returns the attribute list.
table_cell_args
  // Raise the 'tableCellArg' counter while the attributes are parsed; it is
  // lowered again in the action below or in the fall-back alternative.
  = & { return stops.inc('tableCellArg'); }
    as:generic_attribute* space* pipe !pipe { 
        stops.dec('tableCellArg');
        //console.warn( 'tcargs' + JSON.stringify( as ) + " stops: " + pp(stops) );
        return as;
    }
    // Fall-back when the attributes did not match: undo the increment.
    / & { return stops.dec('tableCellArg'); }


/* Tables, full-table version (no support for table fragments from templates) 
 * Only left here for comparison purposes with the above productions. Remove
 * when those work as intended! */
// Legacy full-table production: start, optional caption, optional body, end.
// Returns the 'table' start token, the caption tokens (if any), the body and
// a 'table' end token.
table 
  = tas:table_start space* c:table_caption? b:table_body? te:table_end { 
      var res = new TagTk( 'table' );
      // An unmatched optional body is the empty string.
      var body = b !== '' ? b : [];
      //dp("body: " + pp(body));
      if (tas.length > 0) {
          // FIXME: actually parse and build structure
          //res.attribs = [new KV('data-unparsed', tas.join(''))];
          res.attribs = tas;
      }

      // NOTE(review): the result of table_end (te) is only included when a
      // caption is present, and then it is placed right after the caption
      // end token -- confirm whether that is intended.
      if (c != '') {
          var caption = [ new TagTk( 'caption' ) ]
                        .concat(c, [new EndTagTk( 'caption' )], te);
      } else {
          var caption = [];
      }
      //dp(pp(res));

      return [res].concat(caption, body, [new EndTagTk( 'table' )]);
  }

// Legacy table start: '{', a pipe, then any number of single-line attributes.
// Returns the attribute list.
table_start 
  = "{" pipe
    res:(
        // Push a 'table' stop; on success it stays pushed (table_end pops
        // it), on failure the fall-back alternative below pops it.
        & { stops.push('table', true); return true; }
        ta:generic_attribute* 
        { 
            //dp("table_start " + pp(ta) + ", pos:" + pos);
            return ta;
        }
        // The predicate returns false, so this alternative never matches; it
        // only exists to undo the push.
        / & { stops.pop('table'); return false; } { return null; }
    ) { return res }

// Legacy table caption: a newline, a pipe, '+', then nested block content.
// Returns the newline token(s) followed by the content.
table_caption 
  = lineBreak:newlineToken
    pipe "+" content:nested_block* { 
        var result = lineBreak.concat( content );
        return result;
    }

// Legacy table body: either an implicit first row followed by explicit rows,
// or explicit rows only.
table_body 
  = //& { dp("table_body enter"); return true; }
    firstrow:table_firstrow otherrows:table_row* { 
      /* dp('table first and otherrows: ' 
       * + pp([firstrow].concat(otherrows))); */
      // The first row is kept as one nested element in front of the others.
      return [firstrow].concat(otherrows); 
  }
  / otherrows:table_row* {
      //dp('table otherrows: ' + pp(otherrows));
      return otherrows;
  }

// Legacy implicit first row: one or more cells or headings that are not
// preceded by a row marker, wrapped in a 'tr' start / end token pair.
table_firstrow 
  = cells:(table_data / table_heading)+ { 
      var open = [ new TagTk( 'tr' ) ];
      var close = [ new EndTagTk( 'tr' ) ];
      return open.concat( cells, close );
  }

// Legacy table row: a newline, a pipe, '-', optional attributes, then any
// number of cells or headings. Returns the newline token(s), a 'tr' start
// token, the cells and a 'tr' end token.
// NOTE(review): the parsed attributes (a) are not used in the action, so the
// 'tr' token is created without them.
table_row 
  = //& { dp("table row enter"); return true; }
  n:newlineToken
    pipe "-" 
    a:(as:generic_attribute+ space* pipe !pipe { return as } )?
    space* 
    td:(table_data / table_heading)* { 
        return n.concat([ new TagTk( 'tr' )]
					   , td, [ new EndTagTk( 'tr' )]);
    }

// Legacy table cell. Starts either with a double pipe (same line) or with a
// newline that is followed by a pipe or by anything other than '!'. Returns
// the newline token(s) (if any), a 'td' start token, the content and a 'td'
// end token.
table_data 
  = //& { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
	n:(pipe_pipe { return [] } / nt:newlineToken (pipe / !'!') { return nt }) 
    ! [}+-]
    //& { dp('before attrib, pos=' + pos); return true; }
    a:(as:generic_attribute+ space* end_pipe { return as } )?
    //& { dp('past attrib, pos=' + pos); return true; }
    // use inline_breaks to break on tr etc
    td:(
        //& { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
        b:nested_block+ { return b }
       )* { 
        // An unmatched optional attribute part is replaced by an empty list.
        if ( a == '' ) {
            a = [];
        }
        //dp("table data result: " + pp(td) + ", attribts: " + pp(a));
        return n.concat( [ new TagTk( 'td',  a ) ]
					   , td, [ new EndTagTk( 'td' ) ] );
    }

// Legacy table heading. Starts either with '!!' (same line) or with a
// start-of-line followed by '!'. Returns the start-of-line tokens (if any), a
// 'th' start token, the inline content and a 'th' end token.
table_heading
  = n:("!!" { return [] } / nl:sol "!" { return nl }) 
    a:(as:generic_attribute+ end_pipe { return as } )?
     c:inline {
         // An unmatched optional attribute part is replaced by an empty list.
         if ( a == '' ) {
             a = [];
         }
         return n.concat( [ new TagTk( 'th', a )]
						, c, [ new EndTagTk( 'th' ) ]);
     }

// Unparsed attribute text of a table cell: a run of text chunks and selected
// punctuation characters, terminated by a single pipe. Returns the list of
// matched pieces (the pipe is not included).
thtd_attribs
  // In particular, do not match [|\n]
  = a:(text / ! inline_breaks c:[="':;/,. -] { return c } )+ end_pipe {
    return a;
  }


// Legacy table end: an optional newline followed by a pipe and '}', or by the
// end of input. Pops the 'table' stop and returns the newline token(s), or an
// empty list when there was no newline.
table_end 
  = nt:newlineToken? ( pipe "}" / eof ) { 
      stops.pop('table'); 
      return nt || [];
  }


/*******************************************************************
 * Text variants and other general productions
 *******************************************************************/

/* All chars that cannot start syntactic structures in the middle of a line
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text production on plain
 * content. */

// Any single character except the ones listed in the class below; see the
// legend that follows for why each of them is excluded.
text_char = [^'<~[{\n\r:\]}|!=]

// A run of one or more text_char characters, returned as a single string.
text
  = chars:text_char+ {
      var joined = chars.join('');
      return joined;
  }

/* Legend
 * '    quotes (italic/bold)
 * <    start of xmlish_tag
 * ~    signatures/dates
 * [    start of links
 * {    start of parser functions, transclusion and template args
 * \n   all sorts of block-level markup at start of line
 * \r   ditto
 * h    http(s) urls
 * n    nntp(s) urls
 * m    mailto urls
 * I    start of ISBN 10/13 auto links
 * P    start of PMID auto links
 * R    start of RFC auto links
 *
 * _    behavior switches (e.g., '__NOTOC__') (XXX: not URL related)
 * ! and | table cell delimiters, might be better to specialize those
 * =    headings - also specialize those!
 *
 * The following chars are also included for now, but only apply in some
 * contexts and should probably be enabled only in those:
 * :    separate definition in ; term : definition
 * ]    end of link
 * }    end of parser func/transclusion/template arg
 */

// Text that may contain auto-links, behavior switches and HTML entities.
// Matches one or more of the alternatives below, tried in order at each
// position; the rule has no action of its own.
urltext = ( // Fast path: a run of characters that cannot start any of the
            // special constructs handled by the alternatives below.
            t:[^'<~[{\n\rIPRfghimnstw_|!:\]} &=]+ { return t.join(''); }
          / & url_chars autolink
          / & ('__') behavior_switch
          / htmlentity 
          // Convert trailing space into &nbsp;
          // XXX: This should be moved to a serializer
          / ' ' & ':' { return "\u00a0"; }
          // Anything else that is still a plain text character.
          / t:text_char )+

/*
	'//', // for protocol-relative URLs, but not in text!
	'ftp://',
	'git://',
	'gopher://',
	'http://',
	'https://',
	'irc://',
	'ircs://',  // @bug 28503
	'mailto:',
	'mms://',
	'news:',
	'nntp://', // @bug 3808 RFC 1738
	'svn://',
	'telnet://', // Well if we're going to support the above.. -ævar
	'worldwind://',
*/

// Old version
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }

// Experimental tweaked version: avoid expensive single-char substrings
// This did not bring the expected performance boost, however.
//text = [A-Za-z0-9,._ -] { 
//            textStart = pos;
//
//            var res = input.substr(textStart - 1, inputLength)
//                        .match(/[A-Za-z0-9,._ -]+/)[0];
//            pos = pos + (res.length - 1);
//            return res
//       }

// An HTML entity reference ('&', a name or '#' code, ';'). The re-assembled
// source text is passed to unentity() and its result is returned.
htmlentity
  = "&" name:[#0-9a-zA-Z]+ ";" { 
      var entity = "&" + name.join('') + ";";
      return unentity( entity );
  }

// One or more spaces or tabs, returned as a single string.
space
  = blanks:[ \t]+ {
      var joined = blanks.join('');
      return joined;
  }

// Optional whitespace as a token list: a one-element list holding the
// whitespace string, or an empty list when there is none.
optionalSpaceToken
  = runs:space* {
      return runs.length ? [ runs.join('') ] : [];
  }


// Start of line
// Start of line: a newline, or the very start of the input. Also consumes an
// optional comment (plus one newline after it) and an optional opening or
// closing includeonly / noinclude tag. Returns the newline token(s), the
// comment parts and the include tag token, in that order.
sol 
  = nl:(newlineToken / & { return pos === 0; } { return []; }) 
    // Eat multi-line comment, so that syntax after still matches as if it
    // was actually preceded by a newline
    cn:( c:comment n:newline? { 
              return ( n !== '' ) ? [ c, n ] : [ c ];
          }
    )?
    // Eat includeonly/noinclude at start of line, so that start-of-line
    // syntax after it still matches
    ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" { return [ c, t ]; } )?
    {
        // ni is [ slash, tagName ] when an include tag was present.
        var niToken = [];
        if ( ni !== '' ) {
            var isEndTag = ( ni[0] === '/' );
            niToken = [ isEndTag ? new EndTagTk( ni[1] ) : new TagTk( ni[1] ) ];
        }

        // An unmatched optional comment part is replaced by an empty list.
        var commentParts = ( cn === '' ) ? [] : cn;

        return nl.concat( commentParts, niToken );
    }

// A single newline, comment or run of spaces / tabs.
nl_comment_space = newline / comment / space

/**
 * noinclude / includeonly / onlyinclude productions. These are normally
 * handled by the generic_tag production, except where generic tags are not
 * allowed- for example in directives, which are allowed in various attribute
 * names and -values.
 *
 * Example test case:
 * {|
 * |-<includeonly>
 * foo
 * </includeonly>
 * |Hello
 * |}
 */
// Any of the three inclusion-control tag pairs, tried in this order.
include_limits = noinclude / includeonly / onlyinclude

// <noinclude>...</noinclude> around inline content. The 'noinclude' counter
// is raised while the content is parsed and lowered again on success or in
// the fall-back alternative. Returns start token, content, end token.
noinclude 
  = & { return stops.inc('noinclude'); }
    "<noinclude>"
    content:inline
    "</noinclude>"
    {
        stops.dec('noinclude');
        var open = [ new TagTk('noinclude') ];
        var close = [ new EndTagTk('noinclude') ];
        return open.concat( content, close );
    }
  / & { return stops.dec('noinclude'); }

// <includeonly>...</includeonly> around inline content. The 'includeonly'
// counter is raised while the content is parsed and lowered again on success
// or in the fall-back alternative. Returns start token, content, end token.
includeonly 
  = & { return stops.inc('includeonly'); }
    "<includeonly>"
    content:inline
    "</includeonly>"
    {
        stops.dec('includeonly');
        var open = [ new TagTk('includeonly') ];
        var close = [ new EndTagTk('includeonly') ];
        return open.concat( content, close );
    }
  / & { return stops.dec('includeonly'); }

// <onlyinclude>...</onlyinclude> around inline content. The 'onlyinclude'
// counter is raised while the content is parsed and lowered again on success
// or in the fall-back alternative. Returns start token, content, end token.
onlyinclude 
  = & { return stops.inc('onlyinclude'); }
    "<onlyinclude>"
    content:inline
    "</onlyinclude>"
    {
        stops.dec('onlyinclude');
        var open = [ new TagTk('onlyinclude') ];
        var close = [ new EndTagTk('onlyinclude') ];
        return open.concat( content, close );
    }
  / & { return stops.dec('onlyinclude'); }

// End of input: succeeds without consuming anything when isEOF() reports
// true for the current position.
eof = & { return isEOF(pos); } { return true; }


// A line break: LF or CR LF. A lone CR is not matched.
newline
  = '\n' / '\r\n'

// A line break as a one-element token list.
newlineToken = newline { return [new NlTk()] }

// End of line or end of input.
eolf = newline / eof


// 'Preprocessor' directive- higher-level things that can occur in otherwise
// plain-text content.
// The alternatives are tried in the order listed.
directive
  = comment
  / nowiki
  / tplarg_or_template
  / htmlentity 
  / include_limits

// Plain text, but can contain templates, template arguments, comments etc-
// all stuff that is normally handled by the preprocessor
// Returns either a list of tokens, or a plain string (if nothing is to be
// processed).
// The first alternative is a fast path for runs of characters that cannot
// start a directive; everything else is matched one directive or text
// character at a time, stopping at inline breaks.
preprocessor_text 
  = r:( t:[^<~[{\n\r\t|!\]}{ &=]+ { return t.join(''); }
  / !inline_breaks (
      directive
    / text_char )
  )+ {
      return flatten ( r );
  }

// Like preprocessor_text, but never matches a space: the fast-path class
// excludes it and the single-character fall-back rejects it explicitly. The
// collected pieces are passed through flatten_string (defined elsewhere in
// this file).
spaceless_preprocessor_text
  = r:( t:[^'<~[{\n\r|!\]}{\t &=]+ { return t.join(''); }
  / !inline_breaks (
      directive
    / !' ' text_char )
  )+ {
      return flatten_string ( r );
  }


// Preprocessor text inside a wikilink: like preprocessor_text, but the
// single-character fall-back refuses to run into a closing "]]". The
// collected pieces are passed through flatten_stringlist (defined elsewhere
// in this file).
wikilink_preprocessor_text 
  = r:( t:[^<~[{\n\r\t|!\]}{ &=]+ { return t.join(''); }
        / !inline_breaks ( directive / !"]]" text_char ) 
    )+ {
      return flatten_stringlist ( r );
  }

// Preprocessor text for the target of an external link. The fast-path class
// excludes quotes and a range of space characters; '.', ':' and ',' are only
// accepted when they are not followed by whitespace or the end of the line.
// The collected pieces are passed through flatten_string (defined elsewhere
// in this file).
extlink_preprocessor_text
  // added special separator character class inline: separates url from
  // description / text
  = r:( t:[^'<~[{\n\r|!\]}{\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); }
  / !inline_breaks ( directive / no_punctuation_char )
  /// urlencoded_char
  // !inline_breaks no_punctuation_char
  / s:[.:,] !(space / eolf) { return s } 
  / [&%] )+ {
      return flatten_string ( r );
  }

// Attribute values with preprocessor support
// Unquoted attribute value: runs of characters outside the class below
// (checked against inline breaks one character at a time), directives, and
// literal '&' / '%'. The collected pieces are passed through flatten_string
// (defined elsewhere in this file).
attribute_preprocessor_text
  = r:( ts:(!inline_breaks t:[^=<>{}\n\r&'"\t ] {return t})+ { return ts.join(''); }
  / !inline_breaks ( 
          directive
        / !inline_breaks [&%] 
    )
  )+ 
  {
      //console.warn('prep');
      return flatten_string ( r );
  }

// Body of a single-quoted attribute value, possibly spanning several lines
// and possibly empty. Stops at the closing quote, which is left unconsumed.
// The collected pieces are passed through flatten_string (defined elsewhere
// in this file).
attribute_preprocessor_text_single
  = r:( t:[^{}&']+ { return t.join(''); }
  / !inline_breaks ( 
      directive
    / [{&] )
  )* 
  {
      return flatten_string ( r );
  }
// Body of a double-quoted attribute value, possibly spanning several lines
// and possibly empty. Stops at the closing quote, which is left unconsumed.
// The collected pieces are passed through flatten_string (defined elsewhere
// in this file).
attribute_preprocessor_text_double
  = r:( t:[^{}&"]+ { return t.join(''); }
  / !inline_breaks ( 
      directive
    / [{&] )
  )* 
  {
      //console.warn( 'double:' + pp(r) );
      return flatten_string ( r );
  }

// Variants with the entire attribute on a single line
// Unquoted attribute text restricted to one line. The fast-path class
// excludes line breaks, whitespace, quotes and bracket / brace / pipe
// characters; the fall-back accepts a directive or any single character that
// is not a newline, whitespace, '[' or '>'. The collected pieces are passed
// through flatten_string (defined elsewhere in this file).
attribute_preprocessor_text_line
  = r:( ts:[^=<>{\n\r&'"\t \[\]|{}]+ { return ts.join(''); }
        /  !inline_breaks 
            t:( 
                directive
              / !(newline / space / [\[>]) c:. { 
                    //console.warn( 'aptl: ' + pp(c) ); 
                    return c 
                }
            ) { return t }
      )+ 
  {
      //console.warn('prep');
      return flatten_string ( r );
  }

// Body of a single-quoted attribute value that has to stay on one line.
// Possibly empty; stops at the closing quote, which is left unconsumed.
//
// Line breaks are excluded from the plain-text class so that the value
// cannot run past the end of the line. Previously the class accepted them,
// which made this rule behave exactly like the multi-line variant (the old
// !'\n' guard in front of [{&] could never trigger). The collected pieces
// are passed through flatten_string (defined elsewhere in this file).
attribute_preprocessor_text_single_line
  = r:( t:[^{}&'\n\r]+ { return t.join(''); }
  / !inline_breaks ( 
      directive
    / [{&] )
  )* {
      return flatten_string ( r );
  }
// Body of a double-quoted attribute value that has to stay on one line.
// Possibly empty; stops at the closing quote, which is left unconsumed.
//
// Line breaks are excluded from the plain-text class so that the value
// cannot run past the end of the line. Previously the class accepted them,
// which made this rule behave exactly like the multi-line variant (the old
// !'\n' guard in front of [{&] could never trigger). The collected pieces
// are passed through flatten_string (defined elsewhere in this file).
attribute_preprocessor_text_double_line
  = r:( t:[^{}&"\n\r]+ { return t.join(''); }
  / !inline_breaks (
      directive
    / [{&] )
  )* {
      //console.warn( 'double:' + pp(r) );
      return flatten_string ( r );
  }

// Special-case support for those pipe templates
// A pipe character, either literal or written as the '{{!}}' template.
pipe = "|" / "{{!}}"

// A single pipe that is not followed by another pipe of the same form.
end_pipe = "|" ! "|" / "{{!}}" ! "{{!}}"

// A double pipe, written with either form (the two forms are not mixed).
pipe_pipe = "||" / "{{!}}{{!}}"


// Similar, for tables..
// An exclamation mark, either literal or written as '{{;}}'.
// NOTE(review): '{{;}}' looks unusual next to the '{{!}}' pipe template
// above -- confirm that this is the intended template name.
exclam = "!" / "{{;}}"

/* Tabs do not mix well with the hybrid production syntax */
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent : */