mediawiki-extensions-Visual.../modules/parser/pegParser.pegjs.txt

/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */
{
    /* Fixme: use static functions to separate module! Unfortunately, this
     * does not work:
     * var tu = require('./mediawiki.tokenizer.utils.js');
     * console.log(tu.flatten([]));
     * Using exports in the module gets a bit further, but accesses to
     * tu.flatten in productions still fail. Thus, I just moved the functions
     * here until a solution is found:
     */

    /* Static utilities */

    // Flatten an arbitrarily nested list of lists into one flat array.
    // Non-array members are copied in order; the input is not modified.
    //
    // @param {Array} e   Possibly nested array
    // @returns {Array}   New array holding all non-array members of e
    var flatten = function ( e ) {
        var es = [];
        for (var i = 0, length = e.length; i < length; i++) {
            var ei = e[i];
            // Array.isArray instead of $.isArray: no jQuery global required
            if (Array.isArray(ei)) {
                // flatten sub-arrays recursively
                es = es.concat(flatten(ei));
            } else {
                es.push(ei);
            }
        }
        return es;
    };

    // Remove escaped quotes from attributes etc
    //
    // Every backslash-escaped occurrence of quotec in text is replaced by the
    // bare quote character (String.replace with a string pattern would only
    // handle the first occurrence).
    //
    // @param {String} quotec  The quote character, e.g. ' or "
    // @param {String} text    Text possibly containing \quotec sequences
    // @returns {String}       Text with all \quotec turned into quotec
    var unquote = function (quotec, text) {
        return text.split('\\' + quotec).join(quotec);
    };


    // Debug print with global switch: flip debugEnabled to true to have
    // every dp() call write its message to the console.
    var dp = function ( msg ) {
        var debugEnabled = false;
        if ( !debugEnabled ) {
            return;
        }
        console.log(msg);
    };

    // Pretty-print a value as JSON with two-space indentation
    var pp = function ( s ) {
        var indentWidth = 2;
        return JSON.stringify(s, null, indentWidth);
    };

    /*
     * Annotate a token stream with list items with appropriate list tokens
     *
     * 'list' start tags are dropped, each 'listItem' start tag is replaced by
     * the list / item start and end tags implied by the change between the
     * previous and the current bullet prefix, and a 'list' end tag closes
     * everything that is still open. All other tokens pass through.
     *
     * @static
     * @method
     * @param {[tokens]}   Token stream with li tokens
     * @returns {[tokens]} Token stream, possibly with additional list tokens
     * @throws {Error}     If a bullet character other than * # ; : is found
     * */
    var annotateList = function ( tokens ) {
        var out = [],    // List of tokens
            bstack = [], // Bullet stack, previous element's listStyle
            bnext = [],  // Next element's listStyle
            endtags = [];  // Stack of end tags

        // Number of leading entries that x and y have in common
        var commonPrefixLength = function (x, y) {
            var minLength = Math.min(x.length, y.length);
            for(var i = 0; i < minLength; i++) {
                if (x[i] !== y[i])
                    break;
            }
            return i;
        };

        // Open a new list plus its first item and remember the end tags
        var pushList = function ( listName, itemName ) {
            out.push({type: 'TAG', name: listName});
            out.push({type: 'TAG', name: itemName});
            endtags.push({type: 'ENDTAG', name: listName});
            endtags.push({type: 'ENDTAG', name: itemName});
        };

        // Close the n innermost item/list pairs
        var popTags = function ( n ) {
            for(;n > 0; n--) {
                // push list item..
                out.push(endtags.pop());
                // and the list end tag
                out.push(endtags.pop());
            }
        };

        // Close the innermost open item and open a sibling of the same kind
        var nextItem = function () {
            var itemToken = endtags.pop();
            out.push(itemToken);
            out.push({type: 'TAG', name: itemToken.name});
            endtags.push({type: 'ENDTAG', name: itemToken.name});
        };

        // True if a and b are ':' and ';' in either order
        var isDlDd = function (a, b) {
            var ab = [a,b].sort();
            return (ab[0] === ':' && ab[1] === ';');
        };

        // Emit the tokens needed to get from bullet prefix bs to prefix bn
        var doListItem = function ( bs, bn ) {
            var prefixLen = commonPrefixLength (bs, bn);
            var changeLen = bn.length - prefixLen;
            if (changeLen === 0 && bs.length === bn.length && bn.length > 0) {
                // identical prefix: next item of the same list
                nextItem();
            } else if ( bs.length === bn.length
                    && changeLen === 1
                    && isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
                // handle dd/dt transitions
                out.push(endtags.pop());
                var newName = (bn[prefixLen] === ';') ? 'dt' : 'dd';
                out.push({type: 'TAG', name: newName});
                endtags.push({type: 'ENDTAG', name: newName});
            } else {
                // emit close tag tokens for closed lists
                popTags(bs.length - prefixLen);
                if (changeLen === 0 && prefixLen > 0) {
                    // bn is a proper prefix of bs (dedent): the nested lists
                    // are closed now, continue with a sibling item in the
                    // list that is still open
                    nextItem();
                }
                for(var i = prefixLen; i < bn.length; i++) {
                    switch (bn[i]) {
                        case '*':
                            pushList('ul', 'li');
                            break;
                        case '#':
                            pushList('ol', 'li');
                            break;
                        case ';':
                            pushList('dl', 'dt');
                            break;
                        case ':':
                            pushList('dl', 'dd');
                            break;
                        default:
                            // report the offending bullet itself
                            throw new Error("Unknown node prefix " + bn[i]);
                    }
                }
            }
        };

        for (var i = 0, length = tokens.length; i < length; i++) {
            var token = tokens[i];
            switch ( token.type ) {
                case 'TAG':
                    switch (token.name) {
                        case 'list':
                            // ignore token
                            break;
                        case 'listItem':
                            // convert listItem to list and list item tokens
                            bnext = token.bullets;
                            doListItem( bstack, bnext );
                            bstack = bnext;
                            break;
                        default:
                            // pass through all remaining start tags
                            out.push(token);
                            break;
                    }
                    break;
                case 'ENDTAG':
                    if ( token.name === 'list' ) {
                        // pop all open list item tokens
                        popTags(bstack.length);
                        bstack = [];
                    } else {
                        out.push(token);
                    }
                    break;
                default:
                    out.push(token);
                    break;
            }
        }
        return out;
    };

    /*
     * Italic/Bold handling.
     *
     * - list of tokens
     *   - NEWLINE
     *   - ticks (2+) -> list with link in line token list?
     *   - process on newline
     *   - need access to text nodes before/after for conversion back to text
     *
     * QUOTE tokens are collected per line; on each NEWLINE token they are
     * converted in place into alternating TAG / ENDTAG tokens named 'i'
     * (two ticks) or 'b' (three ticks). A missing end tag is appended at the
     * end of the line. Input tokens are modified in place.
     *
     * @param {[tokens]}   Token stream with QUOTE and NEWLINE tokens
     * @returns {[tokens]} Token stream with quotes converted to i/b tags
     */
    var doQuotes = function ( tokens ) {

        var italics = [],  // indexes in out of this line's italic quotes
            bolds = [],    // indexes in out of this line's bold quotes
            out = [],
            inserted = 0;  // tokens in out that have no counterpart in tokens

        var numerically = function ( a, b ) { return a - b; };

        // Turn the bold quote bolds[i] into an italic quote by handing one
        // tick to the text token in front of it
        var convertBold = function ( i ) {
            var index = bolds[i];
            var txt = out[index - 1];
            txt.value += "'";
            // drop exactly entry i, keep all others
            bolds.splice(i, 1);
            italics.push(index);
            // indexes are numbers: the default sort would order 10 before 9
            italics.sort(numerically);
        };

        // convert italics/bolds into tags
        var quotesToTags = function ( offsets, name ) {
            var toggle = true;
            for (var j = 0; j < offsets.length; j++) {
                var t = out[offsets[j]];
                if(toggle) {
                    t.type = 'TAG';
                } else {
                    t.type = 'ENDTAG';
                }
                t.name = name;
                delete t.value;
                toggle = !toggle;
            }
            if (!toggle) {
                // add end tag
                out.push({type: 'ENDTAG', name: name});
                inserted++;
            }
        };

        for (var i = 0, length = tokens.length; i < length; i++) {
            var token = tokens[i];
            switch (token.type) {
                case 'QUOTE':
                    // depending on length, add starting 's to preceding text node
                    // (if any)
                    // add token index to italic/bold lists
                    // add placeholder for token
                    var qlen = token.value.length;
                    switch (qlen) {
                        case 2: italics.push(i + inserted); out.push(token); break;
                        case 3: bolds.push(i + inserted); out.push(token); break;
                        case 4:
                                token.value = "'''";
                                if (i > 0 && tokens[i-1].type === 'TEXT') {
                                    tokens[i-1].value += "'";
                                } else {
                                    out.push({type: 'TEXT', value: "'"});
                                    inserted++;
                                }
                                bolds.push(i + inserted);
                                out.push(token);
                                break;
                        case 5:
                                // order does not matter here, will be fixed
                                // by HTML parser backend
                                italics.push(i + inserted);
                                out.push({type: 'QUOTE', value: "''"});
                                inserted++;
                                bolds.push(i + inserted);
                                out.push({type: 'QUOTE', value: "'''"});
                                break;
                        default: // longer than 5, only use the last 5 ticks
                                // all surplus ticks become plain text; take
                                // them from the untouched token value
                                var newvalue = token.value.substr(0, qlen - 5);
                                if (i > 0 && tokens[i-1].type === 'TEXT') {
                                    tokens[i-1].value += newvalue;
                                } else {
                                    out.push({type: 'TEXT', value: newvalue});
                                    inserted++;
                                }
                                italics.push(i + inserted);
                                out.push({type: 'QUOTE', value: "''"});
                                inserted++;
                                bolds.push(i + inserted);
                                out.push({type: 'QUOTE', value: "'''"});
                                break;
                    }
                    break;

                case 'NEWLINE':
                    // balance out tokens, convert placeholders into tags
                    if (italics.length % 2 && bolds.length % 2) {
                        dp("balancing!");
                        // positions in bolds of the first bold quote that
                        // follows a single-letter word, a multi-letter word
                        // or a space
                        var firstsingleletterword = -1,
                            firstmultiletterword = -1,
                            firstspace = -1;
                        for (var j = 0; j < bolds.length; j++) {
                            var ticki = bolds[j];
                            if (ticki > 0 && out[ticki - 1].type === 'TEXT') {
                                var txt = out[ticki - 1],
                                    lastchar = txt.value[txt.value.length - 1],
                                    secondtolastchar = txt.value[txt.value.length - 2];
                                dp('txt: ' + pp(txt));
                                if (lastchar === ' ' && firstspace === -1) {
                                    firstspace = j;
                                } else if (lastchar !== ' ') {
                                    if ( secondtolastchar === ' ' &&
                                            firstsingleletterword === -1) {
                                        firstsingleletterword = j;
                                    } else if ( secondtolastchar &&
                                            secondtolastchar !== ' ' &&
                                            firstmultiletterword === -1) {
                                        // keep the first hit only
                                        firstmultiletterword = j;
                                    }
                                }
                            }
                        }


                        // now see if we can convert a bold to an italic and
                        // an apostrophe
                        if (firstsingleletterword > -1) {
                            convertBold(firstsingleletterword);
                        } else if (firstmultiletterword > -1) {
                            convertBold(firstmultiletterword);
                        } else if (firstspace > -1) {
                            convertBold(firstspace);
                        }
                    }

                    quotesToTags(bolds, 'b');
                    quotesToTags(italics, 'i');
                    bolds = [];
                    italics = [];
                    out.push(token);
                    break;
                default:
                    out.push(token);
            }
        }
        return out;
    };


    /* End static utilities */

    /*
     * Flags for specific parse environments (inside tables, links etc). Flags
     * trigger syntactic stops in the inline_breaks production, which
     * terminates inline and attribute matches. Flags merely reduce the number
     * of productions needed: The grammar is still context-free as the
     * productions can just be unrolled for all combinations of environments
     * at the cost of a much larger grammar.
     *
     * Each flag is a nesting counter: setFlag increments it, clearFlag
     * decrements it, and a flag counts as set while its counter is truthy.
     */
    var syntaxFlags = {};

    // Enter an environment. Always returns true so that the call can be used
    // directly as the result of a semantic predicate.
    var setFlag = function(flag) {
        syntaxFlags[flag] = (syntaxFlags[flag] || 0) + 1;
        return true;
    };

    // Leave an environment. Clearing a flag that is not set is a no-op; an
    // unguarded decrement would produce NaN (never set) or a negative, truthy
    // counter.
    var clearFlag = function(flag) {
        if (syntaxFlags[flag] > 0) {
            syntaxFlags[flag]--;
        }
    };

    // Start position of top-level block
    // Could also provide positions for lower-level blocks using a stack.
    // Assigned by the toplevelblock production before its block is parsed.
    var blockStart = 0;

    // cache the input length
    // (input is not defined in this initializer; presumably it is the source
    // string made available by the generated parser -- TODO confirm)
    var inputLength = input.length;

    // pseudo-production that matches at end of input
    // @param {Number} pos  Position to test
    // @returns {Boolean}   true if pos equals the cached input length
    var isEOF = function (pos) {
        return pos === inputLength;
    };

    // text start position
    // (in the visible part of the grammar this is only referenced from
    // commented-out code)
    var textStart = 0;

    // hack to support numbered external links ([http://example.com]).
    // XXX: Move to token stream transform after templates are expanded!
    // Read and incremented by the extlink production for links without text.
    var linkCount = 1;

    // Lookup table of block-level tag names (lower case name -> true).
    // Keeping it in JS lets block_tag lower-case a tag name and test it with
    // one property access instead of spelling out case-insensitive matches
    // in the grammar.
    var block_names = (function () {
        var lookup = {};
        [ "p", "table", "td", "tr", "ul", "ol",
          "li", "dl", "dt", "dd", "div", "center",
          "blockquote" ].forEach(function ( tagName ) {
            lookup[tagName] = true;
        });
        return lookup;
    })();

        
}

// Grammar entry point: any number of top-level blocks, optionally followed
// by trailing newlines. The nested per-block results are flattened into a
// single list; the trailing newlines are not part of the result.
start
  = e:toplevelblock* newline* { 
      return flatten(e);
  }


/* All chars that cannot start syntactic structures in the middle of a line
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text production on plain
 * content. */

// A single character outside the set of markup-significant characters
// excluded by the class below.
text_char = [^'<~[{\n\r:\]}|!=]

// One or more text_chars, returned as one joined string.
text = t:text_char+ { return t.join(''); }

/* Explanation of chars
 * '    quotes (italic/bold)
 * <    start of xmlish_tag
 * ~    signatures/dates
 * [    start of links
 * {    start of parser functions, transclusion and template args
 * \n   all sort of block-level markup at start of line
 * \r   ditto
 * h    http(s) urls
 * n    nntp(s) urls
 * m    mailto urls
 *
 * ! and | table cell delimiters, might be better to specialize those
 * =    headings - also specialize those!
 *
 * The following chars are also included for now, but only apply in some
 * contexts and should probably be enabled only in those:
 * :    separate definition in ; term : definition
 * ]    end of link
 * }    end of parser func/transclusion/template arg
 */

// Text that may contain bare URLs. Repeats the alternatives below, tried in
// order. The first class additionally excludes the letters f, g, h, i, m, n,
// s, t and w (the initial letters of the protocols in url_protocol), space
// and '&', so that matching stops where one of the later alternatives may
// apply.
urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
            // XXX: use general entity decode!
          / "&amp;" { return "&"; } // decode ampersand in text
          / urllink
          // Convert trailing space into &nbsp;
          // XXX: This should be moved to a serializer
          / ' ' & ':' { return "\u00a0"; }
          // fallback: any single text_char, including the letters above
          / t:text_char )+

/*
	'//', // for protocol-relative URLs, but not in text!
	'ftp://',
	'git://',
	'gopher://',
	'http://',
	'https://',
	'irc://',
	'ircs://',  // @bug 28503
	'mailto:',
	'mms://',
	'news:',
	'nntp://', // @bug 3808 RFC 1738
	'svn://',
	'telnet://', // Well if we're going to support the above.. -ævar
	'worldwind://',
*/

// Old version
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }

// Experimental tweaked version: avoid expensive single-char substrings
// This did not bring the expected performance boost, however.
//text = [A-Za-z0-9,._ -] { 
//            textStart = pos;
//
//            var res = input.substr(textStart - 1, inputLength)
//                        .match(/[A-Za-z0-9,._ -]+/)[0];
//            pos = pos + (res.length - 1);
//            return res
//       }

// A run of one or more spaces and/or tabs, returned as a single string.
space
  = s:[ \t]+ { return s.join(''); }


// Start of line
// Matches a newline, or succeeds without consuming input at position 0.
// Comments directly following it are consumed as well, each with an optional
// newline. Returns a NEWLINE token followed by one [comment, TEXT] pair per
// consumed comment; the TEXT token carries the result of the optional
// newline match.
sol = (newline / & { return pos === 0; } { return true; }) 
      cn:(c:comment n:newline? { return [c, {type: 'TEXT', value: n}] })* {
          return [{type: 'NEWLINE'}].concat(cn);
      }

// End of input: succeeds without consuming anything when the current
// position equals the input length (see isEOF).
eof = & { return isEOF(pos); } { return true; }


// A line break, either "\n" or "\r\n".
newline
  = '\n' / '\r\n'

// End of line or end of input.
eolf = newline / eof

// One block at the top level of the document. The start position is
// recorded before the block is parsed; the first token of the flattened
// result gets a data-sourcePos attribute of the form "start:end". A NEWLINE
// token is appended and the whole block is run through doQuotes.
toplevelblock 
  = & { blockStart = pos; return true; } b:block {
    var blockTokens = flatten(b),
        firstToken = blockTokens[0];
    if (firstToken.attribs === undefined) {
        firstToken.attribs = [];
    }
    firstToken.attribs.push(['data-sourcePos', [blockStart, pos].join(':')]);
    // XXX: only run this for lines that actually need it!
    blockTokens.push({type: 'NEWLINE'});
    return doQuotes(blockTokens);
  }

// A block-level construct. The alternatives are tried in order and the first
// match wins. (A second, identical "/ pre" alternative was removed: it was
// attempted at the same position as the first one and could never match.)
block
  = block_lines
  / pre
  / comment &eolf
  / nowiki
  / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
  / para
  / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
  // fallback: a bare start of line (empty line)
  / s:sol { 
          if (s) {
              return [s, {type: 'NEWLINE'}]; 
          } else {
              return [{type: 'NEWLINE'}]; 
          }
      }

// A start-of-line block structure (block_line), preceded by a start of line
// and optionally by one further line containing only spaces. Returns the
// tokens of the start(s) of line followed by the block_line result.
block_lines
  = s:sol 
    // eat an empty line before the block
    s2:(ss:space* so:sol { return [{type: 'TEXT', value: ss.join('')}].concat(so) })? 
    bl:block_line { 
        // an unmatched optional yields ''; treat that as no tokens
        var s2_ = (s2 !== '') ? s2 : [];
        return s.concat(s2_, bl); 
    }

// Block structures with start-of-line wiki syntax
// Alternatives are tried in order: heading, table, lists, a line holding
// only a block tag, indent-pre and finally a <pre> block.
block_line
  = h
  / table
  / lists
  // tag-only lines should not trigger pre
  / space* bt:block_tag space* &eolf { return bt }
  / pre_indent
  / pre


// TODO: convert inline content to annotations!
// A paragraph: two consecutive starts of line followed by a line of inline
// content. Emits a 'p' start tag in front of the content; no end tag is
// emitted here.
para
  = s1:sol s2:sol c:inlineline { 
      return s1.concat(s2, [{type: 'TAG', name: 'p'}], c); 
  }

// Optional spaces directly in front of a newline (which is not consumed),
// returned as a self-closing 'br' tag token.
br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }

// Syntax stops to limit inline expansion depending on syntactic context
// Each alternative is guarded by a predicate on syntaxFlags and matches the
// delimiter that ends inline content in that context:
//   table     newline followed by ! or |, '||', '!!' or '|}'
//   colon     ':' (not inside external links or link descriptions)
//   extlink   ']'
//   linkdesc  link_end
//   h         '='+ followed by optional spaces and a newline
inline_breaks
  = 
    & { // Important hack: disable caching for this production, as the default
        // cache key does not take into account flag states!
        cacheKey = ''; 
        return true; 
      }
    & { return syntaxFlags['table']; } 
    a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a) + pos); return true; }
  / & { return (syntaxFlags['colon'] && 
          ! syntaxFlags.extlink &&
          // the flag set by link_text is 'linkdesc'
          ! syntaxFlags.linkdesc); } ":" { return true; }
  / & { return syntaxFlags['extlink']; } "]" { return true; }
  / & { return syntaxFlags['linkdesc']; } link_end { return true; }
  / & { return syntaxFlags['h']; } '='+ space* newline { return true; }

// Inline content that may span newlines: url text, inline elements, or any
// single character that does not start an inline break. Consecutive string
// results are merged into one TEXT token; all other results (tokens) are
// passed through in order.
inline
  = c:(urltext / inline_element / (!inline_breaks ch:. { return ch; }))+ {
      var out = [];
      var text = [];
      c = flatten(c);
      for (var i = 0; i < c.length; i++) {
          if (typeof c[i] == 'string') {
              text.push(c[i]);
          } else {
              // flush the text collected so far in front of the token
              if (text.length) {
                  out.push({ type: "TEXT", value: text.join('') });
                  text = [];
              }
              // push, not concat: concat returns a new array and would
              // leave out unchanged, dropping the token
              out.push(c[i]);
          }
      }
      if (text.length) {
          out.push({ type: 'TEXT', value: text.join('') });
      }
      return out;
}


// Inline content of a single line: url text, or (unless an inline break
// starts here) an inline element or any character except a newline.
// Consecutive string results are merged into one TEXT token; all other
// results (tokens) are passed through in order.
inlineline
  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
      var out = [];
      var text = [];
      c = flatten(c);
      for (var i = 0; i < c.length; i++) {
          if (typeof c[i] == 'string') {
              text.push(c[i]);
          } else {
              // flush the text collected so far in front of the token
              if (text.length) {
                  out.push({type: 'TEXT', value: text.join('')});
                  text = [];
              }
              out.push(c[i]);
          }
      }
      if (text.length) {
          out.push({type: 'TEXT', value: text.join('')});
      }
      // dp prints its first argument only, so build a single message
      dp('inlineline out:' + pp(out));
      return out;
}

/* TODO: convert all these to annotations!
 * -> need (start, end) offsets within block
 */
// An inline-level element. Alternatives are tried in order; the debug
// predicate in front of comment always succeeds and only logs the next ten
// input characters through dp.
inline_element
  = & { dp('inline_element enter' + input.substr(pos, 10)); return true; }
    comment
  // Can actually also be block-level elements, we don't really try to enforce
  // a content model in the tokenizer. The HTML tree builder and DOM
  // transformations are better equipped to deal with it.
  / xmlish_tag 
  / template
  / wikilink
  / extlink
  / quote

/* Headings  */

// A heading line: '='+ content '='+, optionally followed by spaces and
// comments up to the end of the line. The heading level is the smaller of
// the two marker lengths; surplus '=' on either side become part of the
// heading text. The 'h' flag is set while the content is parsed and cleared
// again on success as well as on failure.
h = & "=" // guard, to make sure '='+ will match. 
          // XXX: Also check to end to avoid inline parsing?
    r:(
     s:'='+ // moved in here to make s accessible to inner action
     & { return setFlag('h'); }
     c:inlineline 
     e:'='+ 
     spc:(sp:space+ { return {type: 'TEXT', value: sp.join('') } } / comment)*
     &eolf 
     {
        clearFlag('h');
        var level = Math.min(s.length, e.length);
        // convert surplus equals into text
        // (s and e are arrays with one '=' string per match, so the surplus
        // is sliced off and joined; arrays have no substr method)
        if(s.length > level) {
            var leading = s.slice(0, s.length - level).join('');
            if(c[0].type == 'TEXT') {
                c[0].value = leading + c[0].value;
            } else {
                c.unshift({type: 'TEXT', value: leading});
            }
        }
        if(e.length > level) {
            var trailing = e.slice(0, e.length - level).join(''),
                lastElem = c[c.length - 1];
            if(lastElem.type == 'TEXT') {
                lastElem.value = lastElem.value + trailing;
            } else {
                c.push({type: 'TEXT', value: trailing});
            }
        }

        return [{type: 'TAG', name: 'h' + level}]
                .concat(c, [{type: 'ENDTAG', name: 'h' + level}, spc]);
      }
    // the first alternative failed after setting the flag: reset it and fail
    / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
    ) { return r }


// Indent-pre: one or more consecutive lines that start with whitespace. The
// lines are wrapped in a 'pre' start and end tag.
pre_indent 
  = l:pre_indent_line ls:(sol pre_indent_line)* {
      return [{type: 'TAG', name: 'pre'}]
                    .concat( [l], ls
                           , [{type: 'ENDTAG', name: 'pre'}]);
  }
// A single indent-pre line: leading whitespace (dropped) followed by inline
// content. The result starts with a TEXT token holding a newline.
pre_indent_line = space l:inlineline { 
    return [{type: 'TEXT', value: '\n'}].concat(l); 
}


// An HTML comment, terminated by '-->' or by the end of input. Further
// comments that follow on subsequent lines (separated only by spaces and a
// newline) are consumed too. Returns a COMMENT token with the comment text,
// followed by the results of the following comments.
comment
  = '<!--' c:comment_chars* ('-->' / eof)
    cs:(space* newline space* cn:comment { return cn })* {
        return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
    }

// One character of comment content: anything but '-', or a '-' that is not
// followed by '->'.
comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }


// A bare URL in text, turned into a link whose href and text are both the
// URL itself.
urllink
  = target:url {
      var openTag = { type: 'TAG', name: 'a', attribs: [['href', target]] },
          label = { type: 'TEXT', value: target },
          closeTag = { type: 'ENDTAG', name: 'a' };
      return [ openTag, label, closeTag ];
  }

// An external link: [url optional text]. The 'extlink' flag is set while
// the link is parsed and cleared on success; if the first alternative fails
// after the opening bracket, the second alternative clears the flag and
// fails as well. Links without text get the running link number as text.
extlink
  = "[" 
    & { return setFlag('extlink'); }
    target:url 
    space*
    text:inlineline?
    "]" {
        clearFlag('extlink');
        var content = text;
        if ( content == '' ) {
            // XXX: Link numbering should be implemented in post-processor.
            content = [{type: 'TEXT', value: "[" + linkCount + "]"}];
            linkCount++;
        }
        var tokens = [ { type: 'TAG', name: 'a', attribs: [['href', target]] } ];
        return tokens.concat( content, [{type: 'ENDTAG', name: 'a'}] );
    }
  / "[" & { clearFlag('extlink'); return false; }

/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
 * be configured dynamically. Alternatives are tried in order. */
url_protocol
  = '//' // for protocol-relative URLs
  / 'ftp://'
  / 'git://'
  / 'gopher://'
  / 'http://'
  / 'https://'
  / 'irc://'
  / 'ircs://'  // @bug 28503
  / 'mailto:'
  / 'mms://'
  / 'news:'
  / 'nntp://' // @bug 3808 RFC 1738
  / 'svn://'
  / 'telnet://' // Well if we're going to support the above.. -ævar
  / 'worldwind://'

// javascript does not support unicode features..
// so the space separator characters are listed explicitly. The same code
// points are repeated in the character class of the url production.
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]

// A URL: a protocol followed by one or more URL characters. '.', ':' and
// ',' are only accepted when not followed by whitespace or the end of the
// line; "&amp;" is returned as '&'. Returns the URL as a string.
url
  = proto:url_protocol 
    rest:( [^ :\]\[\n"'<>\x00-\x20\x7f,.&\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] 
            / s:[.:,] !(space / eolf) { return s } 
            // XXX: use general entity decode!
            / '&amp;' { return '&' } 
            / '&' )+ 
{ 
    return proto + rest.join(''); 
}
//[^][<>"\\x00-\\x20\\x7F\p{Zs}]

// A template transclusion: {{target|param|...}}. Returns a 'template' start
// tag carrying the target and all parameters as attributes, followed by the
// matching end tag.
template
  = "{{" target:template_target params:("|" p:template_param { return p })* "}}" {
      var attribs = [['target', target]];
      if (params && params.length) {
          attribs = attribs.concat(params);
      }
      // Should actually use a self-closing tag here, but the Node HTML5
      // parser only recognizes known self-closing tags for now, so use an
      // explicit end tag for now.
      return [
          { type: 'TAG', name: 'template', attribs: attribs },
          { type: 'ENDTAG', name: 'template' }
      ];
  }

// The template name: all characters up to the first '|', newline or "}}",
// returned as a string (possibly empty).
template_target
  = h:( !"}}" x:([^|\n]) { return x } )* { return h.join(''); }

/* XXX: pass these as convenient js structures to later stages, but serialize
 * (to json) before passing remaining args to the tree builder */
// A template parameter, returned as a [key, value] attribute pair: named
// parameters (name=text) use the key 'data-k-' + name, all others the key
// 'data-p-'. The value is the result of template_param_text.
template_param
  = name:template_param_name "=" c:template_param_text {
      return ['data-k-' + name, c];
  } / c:template_param_text {
      return ['data-p-', c];
  }

// A template argument: {{{name|param|...}}}. Returns one self-closing
// 'templatearg' tag with the argument name and all parameters as attributes.
tplarg 
  = "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
      var attribs = [['argname', name]];
      if (params && params.length) {
          attribs = attribs.concat(params);
      }
      return { 
            type: 'SELFCLOSINGTAG', 
            name: 'templatearg', 
            attribs: attribs
      };
  }

// The name of a named template parameter: all characters up to the first
// '=', '|', newline or "}}", returned as a string (possibly empty).
template_param_name
  = h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }

// The value of a template parameter: zero or more chunks, returned as the
// JSON serialization of the chunk results.
template_param_text
  = tcs:template_param_text_chunk* { return JSON.stringify(tcs); }
  //= h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }


// One piece of a template parameter value. Markup alternatives are tried
// first, in order; otherwise a run of characters other than '}', '|' and
// newline becomes a TEXT token, and as a last resort a single character
// (not '|' or newline) that does not start "}}".
template_param_text_chunk
  = comment
  / xmlish_tag
  / extlink
  / template
  / wikilink
  / quote
  / c:[^}|\n]+ {return {type: 'TEXT', value: c.join('')}}
  / !"}}" x:([^|\n]) { return {type: 'TEXT', value: x} }

// An internal link: [[target|text]]suffix. The target must not start with a
// URL. Returns an 'a' start tag (data-type internal, href = target), the
// link text tokens and the end tag. Without explicit link text, the text is
// the target followed by the suffix.
wikilink
  = "[[" 
    ! url
    target:link_target text:("|" lt:link_text { return lt })* "]]" suffix:text? {
      var linkTag = {
          type: 'TAG',
          name: 'a',
          attribs: [['data-type', 'internal'], ['href', target]]
      };
      var textTokens;
      if (text && text.length) {
          textTokens = text; // XXX
      } else {
          var label = (suffix !== '') ? target + suffix : target;
          textTokens = [{type: 'TEXT', value: label}];
      }
      return [linkTag].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
  }

// The link target: all characters up to the first '|', newline or "]]",
// returned as a string (possibly empty).
link_target
  = h:( !"]]" x:([^|\n]) { return x } )* { return h.join(''); }

// The description part of a wikilink: zero or more lines of inline content,
// parsed with the 'linkdesc' flag set so that inline_breaks stops at
// link_end. Returns the list of inlineline results.
//
// The flag is set exactly once before the repetition and cleared exactly
// once afterwards. Setting it inside the repeated group would also count the
// final, failing repetition attempt and leave the flag set after the link.
// Neither the predicate nor the repetition can fail, so no fallback
// alternative is needed to reset the flag.
link_text
  = & { return setFlag('linkdesc'); }
    h:inlineline*
    { 
        clearFlag('linkdesc');
        return h; 
    }

// Closing delimiter of a wikilink; used by inline_breaks while the
// 'linkdesc' flag is set.
link_end = "]]"

/* Generic quote production for italic and bold, further processed in a token
 * stream transformation in doQuotes. Relies on NEWLINE tokens being emitted
 * for each line of text to balance quotes per line.
 * 
 * We are not using a simple pair rule here as we need to support mis-nested
 * bolds/italics and MediaWiki's special heuristics for apostrophes, which are
 * all not context free.
 *
 * Matches two or more consecutive ticks and returns them unchanged as the
 * value of a QUOTE token. */
quote = "''" x:"'"* {
	var ticks = ["''"].concat(x);
	return { type : 'QUOTE', value: ticks.join('') };
}

/* Will need to check anything xmlish against known/allowed HTML tags and
 * registered extensions, otherwise fail the match. Should ref be treated as a
 * regular extension? */
// Alternatives are tried in order, generic_tag being the catch-all.
xmlish_tag = nowiki / ref / references / generic_tag

// A <pre> block with optional attributes. The content consists of text runs
// without '<', nowiki sections and single characters, up to "</pre>" or the
// end of input; it is wrapped in a 'pre' start and end tag. A stray "</pre>"
// is returned as a single TEXT token (not wrapped in a list).
pre
  = "<pre" 
    attribs:generic_attribute* 
    ">"
    ts:(t1:[^<]+ { return {type:'TEXT',value:t1.join('')} } 
                / nowiki 
                / !"</pre>" t2:. {return {type:'TEXT',value:t2}})+ 
    ("</pre>" / eof) {
        // return nowiki tags as well?
        //console.log('inpre');
        return [ {type: 'TAG', name: 'pre', attribs: attribs} ]
                    .concat(ts, [{type: 'ENDTAG', name: 'pre'}]);
    }
  / "</pre>" { return {type: 'TEXT', value: "</pre>"}; }

// A <nowiki> section: the enclosed content is returned as the token list
// produced by nowiki_content. An opening tag without matching close, and a
// stray closing tag, are each returned as literal text.
nowiki 
  = "<nowiki>" nc:nowiki_content "</nowiki>" { 
        // console.log(pp(nc)); 
        return nc; 
    }
  / "<nowiki>" { 
          //console.log('nowiki fallback');
          return [{type: 'TEXT', value: '<nowiki>'}]; 
    }
  / "</nowiki>" { return [{type: 'TEXT', value: '</nowiki>'}]; }

// The literal content of a nowiki section, returned as a list holding one
// TEXT token. Nested <pre ...>...</pre> sections are reproduced as text; the
// content ends in front of "</nowiki>" or an unmatched "</pre>".
nowiki_content
  = ts:(   t:[^<]+ { return t.join('') } 
           / "<pre" p0:space* p1:[^>]* ">" p2:nowiki_content "</pre>" {
                 //console.log('nested pre in nowiki');
                 // p2 is the one-element token list of the nested content
                 return ["<pre"].concat(p0, p1, [">"], [p2[0].value], ["</pre>"]).join('');
               }
           / (!("</nowiki>" / "</pre>") c:. {return c}) 
       )* {
            // return nowiki tags as well?
            return [{type: 'TEXT', value: ts.join('')}];
          }

// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
// A start, end or self-closing tag whose lower-cased name is listed in
// block_names. For any other tag name the action returns null to abort the
// match. The token keeps the name as written in the source.
block_tag
  = "<" end:"/"? name:(cs:[a-zA-Z]+ { return cs.join('') })
    attribs:generic_attribute* 
    selfclose:"/"?
    ">" {
        var isBlockLevel = block_names[name.toLowerCase()] === true;
        if (!isBlockLevel) {
            // abort match if tag is not block-level
            return null;
        }
        var type = 'TAG';
        if ( end != '' ) {
            type = 'ENDTAG';
        } else if ( selfclose != '' ) {
            type = 'SELFCLOSINGTAG';
        }
        return {name: name, attribs: attribs, type: type};
    }


// See http://dev.w3.org/html5/spec/Overview.html#syntax-tag-name and
// following paragraphs
// Any start, end or self-closing tag with an alphanumeric name, returned as
// a single token; a leading '/' takes precedence over a trailing one.
generic_tag
  = "<" end:"/"? name:[0-9a-zA-Z]+ 
    attribs:generic_attribute* 
    space*
    selfclose:"/"?
    ">" {
        var type = 'TAG';
        if ( end != '' ) {
            type = 'ENDTAG';
        } else if ( selfclose != '' ) {
            type = 'SELFCLOSINGTAG';
        }
        return {name: name.join(''), attribs: attribs, type: type};
    }

// One tag attribute with optional leading whitespace, returned as a
// [name, value] pair. Attributes without a value get the empty string.
generic_attribute
  = s:space* 
    name:generic_attribute_name
    value:(space*
          generic_attribute_value)?
{
    // value is '' if the optional part did not match, otherwise the pair
    // [spaces, attribute value]
    var attribValue = ( value !== '' ) ? value[1] : '';
    return [name, attribValue];
}

// http://dev.w3.org/html5/spec/Overview.html#attributes-0
//
// Attribute name: one or more characters other than space, tab, NUL, "/",
// quotes, ">" and "=". Returns the name as a string.
generic_attribute_name
  = chars:[^ \t\0/"'>=]+ {
        var attName = chars.join('');
        return attName;
  }

// The "= value" part of an attribute; whitespace after the equals sign is
// skipped. Returns whatever att_value produced.
generic_attribute_value
  = "=" space* val:att_value {
        return val;
  }

// Attribute value: either an unquoted run of characters, or a single- or
// double-quoted string. Every alternative returns the value as a plain
// string, so that generic_attribute always yields [name, string] pairs
// regardless of the quoting style used in the source.
att_value
  = t:[^ \t'"<>=\n]+ { return t.join(''); }
  / "'" t:[^'>]+ "'" { return unquote("'", t.join('')); }
  / '"' t:[^">]+ '"' { return unquote('"', t.join('')); }

// A <ref> extension tag, either with content (<ref ...>...</ref>) or
// self-closing (<ref ... />). The full form is tried first.
ref
  = ref_full
  / ref_empty

/* Can we do backreferences to genericize this? */
// <ref ...>content</ref>. Returns a three-element array: an 'ext' TAG
// token carrying the extension name, the parsed parameters and the
// whitespace that followed them; the (nested) list of content tokens; and
// the matching ENDTAG token.
ref_full
  = start:ref_start ">" content:ref_content* close:ref_end {
      return [
        { type: 'TAG',
          name: 'ext',
          attribs: [['data-extname', 'ref']]
              .concat(start.params, [['data-startws', start.ws]])}, 
        content, 
        // The end tag must carry the same generic 'ext' name as the start
        // tag (as references_full does); otherwise the pair is mismatched.
        {type: 'ENDTAG', name: 'ext'}
      ];
}

// Self-closing <ref ... />. Returns a one-element array holding an 'ext'
// SELFCLOSINGTAG token with the extension name, the parsed parameters and
// the whitespace that followed them.
ref_empty
  = start:ref_start close:(space* "/>") {
      var attribs = [['data-extname', 'ref']];
      attribs = attribs.concat(start.params, [['data-startws', start.ws]]);
      return [{ type: 'SELFCLOSINGTAG', name: 'ext', attribs: attribs }];
}

// Opening of a ref tag up to, but not including, the closing ">" or "/>".
// Returns {params, ws}: the list of parsed parameters and the whitespace
// matched after them.
ref_start
  = "<ref" params:ext_param* ws:space* {
      var start = {};
      start.params = params;
      start.ws = ws;
      return start;
}

// Closing </ref> tag, allowing whitespace before the ">". Returns the
// matched source text.
ref_end
  = "</ref" ws:space* ">" {
  // Join the whitespace matches explicitly: joining the whole sequence
  // result would stringify the nested array with commas between entries.
  return "</ref" + ws.join('') + ">";
}

// One inline element inside a ref, as long as the closing tag does not
// start at the current position.
ref_content
  = !ref_end item:inline { // XXX: ineffective syntactic stop
  return item;
}

/* fixme probably have to programmatically add these */
// A <references> extension tag, either with content or self-closing. The
// full form is tried first.
references
  = references_full
  / references_empty

// <references ...>content</references>. Returns a three-element array: an
// 'ext' TAG token (extension name, parsed parameters, trailing whitespace),
// the list of content tokens, and the closing 'ext' ENDTAG token.
references_full
  = start:references_start ">" content:references_content* close:references_end {
      var attribs = [['data-extname', 'references']];
      attribs = attribs.concat(start.params, [['data-startws', start.ws]]);
      var open = { type: 'TAG', name: 'ext', attribs: attribs };
      var end = { type: 'ENDTAG', name: 'ext' };
      return [ open, content, end ];
  }

// Self-closing <references ... />. Returns a one-element array holding an
// 'ext' SELFCLOSINGTAG token with the extension name, the parsed parameters
// and the whitespace that followed them.
references_empty
  = start:references_start close:(space* "/>") {
      // The array literal has to start on the same line as `return`:
      // otherwise automatic semicolon insertion ends the statement right
      // after `return` and the action yields undefined.
      return [{ type: 'SELFCLOSINGTAG',
                name: 'ext',
                attribs: [['data-extname', 'references']]
                           .concat(start.params
                                  ,[['data-startws', start.ws]])
      }];
  }

// Opening of a references tag up to, but not including, the closing ">" or
// "/>". Returns {params, ws}: the list of parsed parameters and the
// whitespace matched after them.
references_start
  = "<references" params:ext_param* ws:space* {
      var start = {};
      start.params = params;
      start.ws = ws;
      return start;
  }

// Closing </references> tag, allowing whitespace before the ">". Returns
// the matched source text.
references_end
  = "</references" ws:space* ">" {
      // Join the whitespace matches explicitly: joining the whole sequence
      // result would stringify the nested array with commas between entries.
      return "</references" + ws.join('') + ">";
  }

// One inline element inside a references block, as long as the closing tag
// does not start at the current position.
references_content
  = !references_end item:inline {
      return item;
  }


// A name=value parameter of an extension tag, with optional leading
// whitespace. Returns a [name, value] pair.
ext_param
  = space* key:ext_param_name "=" pair:ext_param_val {
      // ext_param_val yields [null, value]; fill the empty name slot in place.
      pair[0] = key;
      return pair;
  }

// Parameter name: one or more ASCII letters, digits or hyphens. Returns
// the name as a string.
ext_param_name
  = chars:[a-zA-Z0-9-]+ {
  var paramName = chars.join('');
  return paramName;
}

// Parameter value: an unquoted alphanumeric run, or a single- or
// double-quoted string. Each alternative returns [null, value]; the null
// slot is filled in with the parameter name by ext_param.
ext_param_val
  = bare:[0-9A-Za-z]+ {
        return [null, bare.join('')];
    }
  / "'" sq:[^'>]+ "'" {
        return [null, unquote("'", sq.join(''))];
    }
  / '"' dq:[^">]+ '"' {
        return [null, unquote('"', dq.join(''))];
    }

// A block of consecutive list lines. The item tokens of all lines are
// flattened into one stream, wrapped in list TAG / ENDTAG tokens and handed
// to annotateList.
lists
  = first:(dtdd / li) rest:(sol (dtdd / li))*
{
    var items = flatten([first].concat(rest));
    var open = [ { type: 'TAG', name: 'list'} ];
    var close = [ { type: 'ENDTAG', name: 'list' } ];
    return annotateList( open.concat(items, close) );
}

// A single list line: one or more list marker characters, optional inline
// content, and an eolf lookahead. Returns a two-element array: the
// listItem TAG token (with the marker characters in `bullets`) and the
// content, which is [] when the line has no content.
li
  = bullets:list_char+
    c:inlineline?
    &eolf
{
    var item = { type: 'TAG',
                 name: 'listItem',
                 bullets: bullets };
    // An unmatched optional yields ''; normalize that to an empty list.
    var content = ( c == '' ) ? [] : c;
    return [ item, content ];
}

// Single-line definition list item of the form "<bullets>;term:definition".
// Returns a flat token list: a listItem TAG for the term (bullets + ";"),
// the term content, a listItem TAG for the definition (bullets + ":") and
// the definition content.
dtdd 
  // Leading list markers, up to but excluding the last ";" (the one not
  // followed by another marker). Only the marker character of each
  // iteration is kept, so that `bullets` is a flat list of characters and
  // not a list of [predicate result, character] pairs.
  = bullets:(!(";" !list_char) lc:list_char { return lc; })*
    ";"
    // The 'colon' flag is presumably what lets inlineline stop at ":" --
    // TODO confirm against the inline_breaks rule.
    & {return setFlag('colon');}
    c:inlineline
    ":"
    // Fortunately dtdds cannot be nested, so we can simply set the flag
    // back to 0 to disable it.
    & {syntaxFlags['colon'] = 0; return true;}
    d:inlineline
    &eolf {
        // Convert trailing space into &nbsp;
        // XXX: This should be moved to a serializer
        //var clen = c.length;
        //if (clen && c[clen - 1].type === 'TEXT') {
        //    var val = c[clen - 1].value;
        //    if(val.length && val[val.length - 1] == ' ') {
        //        c[clen - 1].value = val.substr(0, val.length - 1) + "\u00a0";
        //    }
        //}

        // Build the marker prefix as a clean string; concatenating the
        // array directly would stringify it with commas between entries.
        var prefix = bullets.join('');
        return [ { type: 'TAG', name: 'listItem', bullets: prefix + ";" } ]
            .concat( c
                    ,[{ type: 'TAG', name: 'listItem', bullets: prefix + ":" } ]
                    , d );
    }
  // Fall-back case to clear the colon flag
  / & { return true; } { syntaxFlags['colon'] = 0; return null; }


// A single list marker character: "*", "#", ":" or ";".
list_char
  = [*#:;]


/* Tables */
// A complete wikitext table: start line, optional caption, optional body
// and the closing "|}". Returns a flat token list: the table TAG, the
// caption tokens (if any), the body rows and the table ENDTAG.
table
  = tas:table_start c:table_caption? b:table_body? table_end {
      var open = {type: 'TAG', name: 'table'};
      // An unmatched optional body yields ''; treat that as no rows.
      var rows = (b !== '') ? b : [];
      dp("body: " + pp(rows));

      if (tas.length > 0) {
          // FIXME: actually parse and build structure
          open.attribs = [['data-unparsed', tas.join('')]];
      }

      // Wrap the caption content in caption tags only when there is some.
      var caption = [];
      if (c != '') {
          caption = [{type: 'TAG', name: 'caption'}]
                    .concat(c, [{type: 'ENDTAG', name: 'caption'}]);
      }
      //dp(pp(open));

      var close = [{type: 'ENDTAG', name: 'table'}];
      return [open].concat(caption, rows, close);
  }

// Table opening "{|" followed by the unparsed attribute text on the same
// line. Sets the 'table' syntax flag before the attributes are parsed and
// returns the list of matched attribute pieces.
table_start
  = "{|"
    attrs:(
        & { setFlag('table'); return true; }
        ta:table_attribs*
        {
            dp("table_start " + pp(ta) + ", pos:" + pos);
            return ta;
        }
        // This predicate always fails; the alternative exists only to clear
        // the flag again when the first alternative did not match.
        / & { clearFlag('table'); return false; } { return null; }
    ) { return attrs; }

// One piece of the table attribute text: either a text match, or any single
// character that is not an inline break, a newline or a "|".
table_attribs
  = text
  / !inline_breaks !newline ![|] ch:. { return ch; }

// Table caption line: a newline, "|+" and any number of inline elements.
// Returns the list of inline results.
table_caption
  = newline
    "|+" content:inline* {
        return content;
    }

// Table body: either an implicit first row (cells before any "|-" line)
// followed by explicit rows, or explicit rows only. Returns the list of
// rows; the second alternative also matches zero rows.
table_body
  = & { dp("table_body enter"); return true; }
    head:table_firstrow tail:table_row* {
      /* dp('table first and otherrows: '
       * + pp([head].concat(tail))); */
      var rows = [head];
      return rows.concat(tail);
  }
  / rows:table_row* {
      //dp('table otherrows: ' + pp(rows));
      return rows;
  }

// Implicit first table row: one or more data cells that are not preceded
// by a "|-" row marker. Returns the cell results wrapped in tr TAG / ENDTAG
// tokens.
table_firstrow
  = cells:table_data+ {
      var open = [{ type: 'TAG', name: 'tr' }];
      var close = [{type: 'ENDTAG', name: 'tr'}];
      return open.concat(cells, close);
  }

// Explicit table row: a newline, "|-", optional attributes (matched but
// not used in the result) and any number of data or header cells. Returns
// the cell results wrapped in tr TAG / ENDTAG tokens.
table_row
  = & { dp("table row enter"); return true; }
    newline
    "|-" thtd_attribs? space* cells:(table_data / table_header)* {
        var open = [{type: 'TAG', name: 'tr'}];
        var close = [{type: 'ENDTAG', name: 'tr'}];
        return open.concat(cells, close);
    }

// Table data cell: introduced by "||" or by a newline followed by "|",
// provided the next character is not "}", "+" or "-". Optional attributes
// are followed by any number of blocks. Returns a td TAG token (with the
// attribute match stored under 'data-unparsed'), the block results and a
// td ENDTAG token.
table_data 
  // Always-true predicate, present only for the debug output.
  = & { dp("table_data enter, pos=" + pos + input.substr(pos,10)); return true; }
    ("||" / newline "|") 
    // Reject "|}", "|+" and "|-", which are matched by other table rules.
    ! [}+-]
    a:thtd_attribs? 
    // use inline_breaks to break on tr etc
    td:(!inline_breaks 
        // Always-true predicate, present only for the debug output.
        & { dp("table_data 2, pos=" + pos + input.substr(pos,10)); return true; }
            b:block { return b })* { 
        dp("table data result: " + pp(td) + ", attribts: " + pp(a));
        // `a` is the thtd_attribs result, or '' when no attributes matched.
        return [{ type: 'TAG', name: 'td', attribs: [['data-unparsed', a]]}]
               .concat(td, [{type: 'ENDTAG', name: 'td'}]);
    }

// Table header cell: introduced by "!!" or by a newline followed by "!",
// with optional attributes and a single inline match as content. Returns a
// th TAG token (with the attribute match stored under 'data-unparsed'), the
// content and a th ENDTAG token.
table_header
  = ("!!" / newline "!")
     a:thtd_attribs?
     content:inline {
         // `a` is the thtd_attribs result, or '' when no attributes matched.
         var open = [{type: 'TAG', name: 'th', attribs: [['data-unparsed', a]]}];
         var close = [{type: 'ENDTAG', name: 'th'}];
         return open.concat(content, close);
     }

// Attribute text of a table cell: one or more text matches or selected
// punctuation characters, terminated by a single "|" that is not followed
// by another "|". Returns the list of matched pieces.
thtd_attribs
  // In particular, do not match [|\n]
  = pieces:(text / !inline_breaks ch:[="':;/,. -] { return ch; })+ "|" !"|" {
    return pieces;
  }


// End of a table: an optional newline followed by "|}". Clears the 'table'
// syntax flag; the action returns no value.
table_end
  = newline? "|}" {
        clearFlag('table');
    }


/* Tabs do not mix well with the hybrid production syntax */
/* vim: set filetype=javascript expandtab ts=4 sw=4 cindent: */