/** * Combined Wiki (MediaWiki) and HTML tokenizer based on pegjs. Emits several * chunks of tokens (one chunk per top-level block matched) and eventually an * end event. Tokens map to HTML tags as far as possible, with custom tokens * used where further processing on the token stream is needed. * * @author Gabriel Wicke * @author Brion Vibber */ { /* Fixme: use static functions to separate module! Unfortunately, this * does not work: * var tu = require('./mediawiki.tokenizer.utils.js'); * console.warn(tu.flatten([])); * Using exports in the module gets a bit further, but accesses to * tu.flatten in productions still fail. Thus, I just moved the functions * here until a solution is found: */ /* Static utilities */ // Flatten a list of lists. //var simple_flatten = function ( e ) { // // Fast single-level flatten: // //return [].concat.apply([], e); // // var es = []; // // flatten sub-arrays // for(var i = 0, length = e.length; i < length; i++) { // var ei = e[i]; // if ($.isArray(ei)) // es = es.concat(flatten(ei)); // else // es.push(ei); // }; // return es; //}; var flatten = function ( e ) { // Fast single-level flatten: //return [].concat.apply([], e); var es = []; // flatten sub-arrays var chunkStart = null, modified = false; for(var i = 0, l = e.length; i < l; i++) { if ( e[i].constructor === Array ) { if ( chunkStart !== null ) { es = es.concat( e.slice( chunkStart, i ), flatten( e[i] ) ); chunkStart = null; } else { es = es.concat(flatten(e[i])); } modified = true; } else if ( chunkStart === null ) { chunkStart = i; } }; if ( modified ) { if ( chunkStart !== null ) { es = es.concat( e.slice( chunkStart, e.length ) ); } return es; } else { return e; } }; var flatten_string = function ( c ) { var out = flatten_stringlist( c ); if ( out.length === 1 && out[0].constructor === String ) { return out[0]; } else { return out; } }; var flatten_stringlist = function ( c ) { var out = [], text = ''; c = flatten(c); for (var i = 0, l = c.length; i < l; i++) { var ci = 
c[i]; if (ci.constructor === String) { if(ci !== '') { text += ci; } } else { if (text !== '') { out.push( text ); text = ''; } out.push(ci); } } if (text !== '' ) { out.push( text ); } return out; } // Remove escaped quotes from attributes etc // This was in the original PEG parser, but could not find anything in // MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke var unquote = function (quotec, text) { return text.replace('\\' + quotec, quotec); }; // Decode html entities. In a browser, this should only be fed the entity, // not untrusted html! XXX: replace with safer version. var unentity = function ( entity ) { return $("
").html(entity).text(); }; // Debug print with global switch var dp = function ( msg ) { if ( false ) { console.warn(msg); } }; var pp = function ( s ) { return JSON.stringify(s, null, 2); } /* * Annotate a token stream with list items with appropriate list tokens * * XXX: Move this to a token handler in phase sync23! That way we can * support list items from templates too. * * @static * @method * @param {[tokens]} Token stream with li tokens * @returns {[tokens]} Token stream, possibly with additional list tokens * */ var annotateList = function ( tokens ) { var out = [], // List of tokens bstack = [], // Bullet stack, previous element's listStyle bnext = [], // Next element's listStyle endtags = []; // Stack of end tags var commonPrefixLength = function (x, y) { var minLength = Math.min(x.length, y.length); for(var i = 0; i < minLength; i++) { if (x[i] != y[i]) break; } return i; }; var pushList = function ( listName, itemName ) { out.push( new TagTk( listName )); out.push( new TagTk( itemName )); endtags.push( new EndTagTk( listName )); endtags.push( new EndTagTk( itemName )); }; var popTags = function ( n ) { for(;n > 0; n--) { // push list item.. 
out.push(endtags.pop()); // and the list end tag out.push(endtags.pop()); } }; var isDlDd = function (a, b) { var ab = [a,b].sort(); return (ab[0] === ':' && ab[1] === ';'); }; var doListItem = function ( bs, bn ) { var prefixLen = commonPrefixLength (bs, bn); var changeLen = Math.max(bs.length, bn.length) - prefixLen; var prefix = bn.slice(0, prefixLen); // emit close tag tokens for closed lists if (changeLen === 0) { var itemToken = endtags.pop(); out.push(itemToken); out.push(new TagTk( itemToken.name )); endtags.push(new EndTagTk( itemToken.name )); } else if ( bs.length == bn.length && changeLen == 1 && isDlDd( bs[prefixLen], bn[prefixLen] ) ) { // handle dd/dt transitions out.push(endtags.pop()); if( bn[prefixLen] == ';') { var newName = 'dt'; } else { var newName = 'dd'; } out.push(new TagTk( newName )); endtags.push(new EndTagTk( newName )); } else { popTags(bs.length - prefixLen); if (prefixLen > 0 && bn.length == prefixLen ) { var itemToken = endtags.pop(); out.push(itemToken); out.push(new TagTk( itemToken.name )); endtags.push(new EndTagTk( itemToken.name )); } for(var i = prefixLen; i < bn.length; i++) { switch (bn[i]) { case '*': pushList('ul', 'li'); break; case '#': pushList('ol', 'li'); break; case ';': pushList('dl', 'dt'); break; case ':': pushList('dl', 'dd'); break; default: throw("Unknown node prefix " + prefix[i]); } } } }; for (var i = 0, length = tokens.length; i < length; i++) { var token = tokens[i]; switch ( token.constructor ) { case TagTk: switch (token.name) { case 'list': // ignore token break; case 'listItem': // convert listItem to list and list item tokens bnext = token.bullets; doListItem( bstack, bnext ); bstack = bnext; break; default: // pass through all remaining start tags out.push(token); break; } break; case EndTagTk: if ( token.name == 'list' ) { // pop all open list item tokens popTags(bstack.length); bstack = []; } else { out.push(token); } break; default: out.push(token); break; } } return out; }; /* End static 
utilities */

    /*
     * Flags for specific parse environments (inside tables, links etc). Flags
     * trigger syntactic stops in the inline_breaks production, which
     * terminates inline and attribute matches. Flags merely reduce the number
     * of productions needed: The grammar is still context-free as the
     * productions can just be unrolled for all combinations of environments
     * at the cost of a much larger grammar.
     */
    var syntaxFlags = {};

    // Increment the nesting count of a flag. Always returns true, so that it
    // can be used as a semantic predicate that never fails.
    var setFlag = function ( flag ) {
        if ( syntaxFlags[flag] !== undefined ) {
            syntaxFlags[flag]++;
        } else {
            syntaxFlags[flag] = 1;
        }
        return true;
    };

    // Decrement the nesting count of a flag. Always returns false, so that it
    // can be used as a semantic predicate that always fails (clean-up
    // alternatives).
    var clearFlag = function ( flag ) {
        // Only count down from a positive value: decrementing an unset flag
        // would store NaN (which setFlag could never recover from), and a
        // negative count would read as "flag set".
        if ( syntaxFlags[flag] > 0 ) {
            syntaxFlags[flag]--;
        }
        return false;
    };

    // Start position of top-level block
    // Could also provide positions for lower-level blocks using a stack.
    var blockStart = 0;

    // Start position of generic tag production
    var tagStartPos = 0;

    // cache the input length
    var inputLength = input.length;

    // pseudo-production that matches at end of input
    var isEOF = function ( pos ) {
        return pos === inputLength;
    };

    // text start position
    var textStart = 0;

    // hack to support numbered external links ([http://example.com]).
    // XXX: Move to token stream transform after templates are expanded!
    var linkCount = 1;

    // Define block-level tags in JS, so we can use toLowerCase to match tags
    // case-independently. This would be quite ugly (and possibly slower) if
    // done manually in the grammar.
    var block_names = (function () {
        var names = [ "p", "table", "td", "tr", "ul", "ol"
                    , "li", "dl", "dt", "dd", "div", "center"
                    , "blockquote" ];
        var bnames = {};
        for ( var i = 0, l = names.length; i < l; i++ ) {
            bnames[names[i]] = true;
        }
        return bnames;
    })();

    var self = this;
}

/*********************************************************
 *   The top-level production
 *********************************************************/

start
  = e:toplevelblock* newline* {
        // end is passed inline as a token, as well as a separate event for now.

        // this does not work yet.
        //console.warn('about to emit' + pp(self));
        //self._tokenizer.emit('chunk', [ { type: 'END' } ] );
        //self._tokenizer.emit('end');
        // Append the end (for obvious reasons this should not
        // be part of a stream, only when tokenizing complete
        // texts)
        //console.warn( pp( flatten ( e ) ) );

        // NOTE(review): `cache` is not declared in this grammar; presumably
        // the generated parser's memoization table -- confirm.
        cache = {};
        // The third parse argument is the chunk callback; finish the stream
        // with an EOF token.
        __parseArgs[2]( [ new EOFTk( ) ] );
        return []; //flatten(e);
    }

/*
 * A document (start production) is a sequence of toplevelblocks. Tokens are
 * emitted in chunks per toplevelblock to avoid buffering the full document.
 */
toplevelblock
  = & { blockStart = pos; return true; }
    b:block {
        b = flatten(b);

        // Add source offsets for round-tripping. XXX: Add these not just for
        // toplevelblocks!
        if ( b.length ) {
            var bs = b[0];
            if ( bs.constructor === String && bs.attribs === undefined ) {
                // Wrap the string in a String object so that properties can
                // be attached to it.
                b[0] = new String( bs );
                bs = b[0];
            }
            if ( bs.dataAttribs === undefined ) {
                bs.dataAttribs = {};
            }
            bs.dataAttribs.sourcePos = [blockStart, pos];
        }

        // Emit tokens for this toplevelblock. This feeds a chunk to the parser
        // pipeline. b was flattened above, so it is passed on as is.
        __parseArgs[2]( b );

        // We don't return any tokens to the start production to save memory. We
        // just emitted them already to our consumers.
        return true;
    }

/*
 * The actual contents of each block.
 */
block
  = block_lines
  / & '<' r:( pre // tag variant can start anywhere
            / comment &eolf
            / nowiki
            // avoid a paragraph if we know that the line starts with a block tag
            / bt:block_tag { return [bt] }
            ) { return r; }
  / para
  // Inlineline includes generic tags; wrapped into paragraphs in token
  // transform and DOM postprocessor
  / inlineline
  / sol

/*
 * A block nested in other constructs. Avoid eating end delimiters for other
 * constructs by checking against inline_breaks first.
 */
nested_block = !inline_breaks b:block { return b }

/*
 * Line-based block constructs.
 */
block_lines
  = s:sol
    // eat an empty line before the block
    s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
    bl:block_line {
        // s2 is tested against '' to detect that the optional part did not
        // match
        var s2_ = (s2 !== '') ? s2 : [];
        return s.concat(s2_, bl);
    }

/*
 * Block structures with start-of-line wiki syntax
 */
block_line
  = h
  / & [{}|] tl:table_lines { return tl; }
  / lists
  // tag-only lines should not trigger pre
  / st:optionalSpaceToken
    bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
    &eolf {
        return st.concat(bt);
    }
  / pre_indent
  / pre

/*
 * A paragraph. We don't emit 'p' tokens to avoid issues with template
 * transclusions,
 * <p>
tags in the source and the like. Instead, we perform
 * some paragraph wrapping on the DOM.
 */
para
  = s1:sol s2:sol c:inlineline {
        return s1.concat(s2, /* [new TagTk('p')],*/ c);
    }

// A line break: optional spaces directly before a newline (the newline itself
// is only looked at, not consumed).
br = space* &newline { return new SelfclosingTagTk( 'br' ) }

/*
 * Syntax stops: Avoid eating significant tokens for higher-level productions
 * in nested inline productions.
 *
 * Repeated testing of flags in the grammar is not terribly efficient, so the
 * decision is delegated to the inline_breaks function passed in as the fourth
 * parse argument; it receives the input, the position and the current
 * syntaxFlags.
 */
inline_breaks
  = & [=|!}:\r\n\]<]  // cheap guard: only these characters can be a break
    & {
        // Important hack: disable caching for this production, as the default
        // cache key does not take into account flag states!
        cacheKey = '';
        //console.warn('ilbf: ' + input.substr(pos, 5) );
        return null !== __parseArgs[3].inline_breaks( input, pos, syntaxFlags )
    }

// Inline content, possibly spanning several lines.
inline
  = c:(urltext / (! inline_breaks (inline_element / . )))+ {
        //console.warn('inline out:' + pp(out));
        return flatten_stringlist( c );
    }

// Inline content restricted to a single line.
inlineline
  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
        return flatten_stringlist( c );
    }

inline_element
  = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
    & '<' ( comment / xmlish_tag )
    /// & '{' ( & '{{{{{' template / tplarg / template )
    / & '{' tplarg_or_template
    /// & '{' ( tplarg / template )
    // Eat three opening brackets as text.
    / '[[[' { return '[[[' }
    / & '[' ( wikilink / extlink )
    / & "'" quote

/* Headings */
h = & "=" // guard, to make sure '='+ will match.
          // XXX: Also check to end to avoid inline parsing?
    r:(
        s:'='+ // moved in here to make s accessible to inner action
        & { return setFlag('h'); }
        c:inlineline
        e:'='+
        spc:(sp:space+ { return sp.join('') } / comment)*
        &eolf
        {
            clearFlag('h');
            // The heading level is the smaller of the two runs of '='.
            var level = Math.min( s.length, e.length ),
                surplus,
                last;
            // convert surplus equals into text
            // Only the lengths of s and e are used to build the surplus
            // text, so this works whether the '='+ matches are arrays of '='
            // or strings.
            if ( s.length > level ) {
                surplus = new Array( s.length - level + 1 ).join( '=' );
                if ( c[0].constructor === String ) {
                    c[0] = surplus + c[0];
                } else {
                    c.unshift( surplus );
                }
            }
            if ( e.length > level ) {
                surplus = new Array( e.length - level + 1 ).join( '=' );
                last = c.length - 1;
                if ( c[last].constructor === String ) {
                    // Write back into the list: appending to a local copy of
                    // the string would silently drop the surplus.
                    c[last] += surplus;
                } else {
                    c.push( surplus );
                }
            }

            return [new TagTk( 'h' + level )]
                    .concat(c, [new EndTagTk( 'h' + level ), spc]);
        }
      / & { /* dp('nomatch exit h'); */ clearFlag('h'); return false } { return null }
    ) { return r }

// An HTML comment, optionally followed by further comments that are separated
// from it only by whitespace and a newline. An unterminated comment runs to
// the end of input.
// NOTE(review): the '<!--' ... '-->' part of this rule was missing from this
// copy of the file (only "'' / eof)" was left, while the action still uses
// the label c); restored here -- confirm against the upstream grammar.
comment
  = '<!--' c:comment_chars* ('-->' / eof)
    cs:(space* newline space* cn:comment { return cn })* {
        return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
    }

comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }

/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

// A bare URL in running text; not matched while inside an extlink.
urllink
  = ! { return syntaxFlags['extlink'] }
    target:url {
        return [ new TagTk( 'a', [new KV('href', target)] )
               , target
               , new EndTagTk( 'a' )
               ];
    }

extlink
  = ! { return syntaxFlags['extlink'] } // extlink cannot be nested
    (
        "["
        & { return setFlag('extlink'); }
        //target:urllink
        target:extlink_preprocessor_text
        text:(( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
                t:inlineline { return t } )?
        "]" {
            clearFlag('extlink');
            if ( text === '' ) {
                // XXX: Link numbering should be implemented in post-processor.
                text = [ "[" + linkCount + "]" ];
                linkCount++;
            }
            var res = [
                new TagTk( 'a', [
                            new KV('href', target),
                            new KV('data-mw-type', 'external')
                            ] )
            ].concat( text
                    , [ new EndTagTk( 'a' )]);
            //console.warn( JSON.stringify( res, null, 2 ) );
            return res;
        }
        // clean-up alternative: reset the flag, then fail
      / "[" & { clearFlag('extlink'); return false; }
    )

/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these can
 * be configured dynamically. */

url_chars = [/fghimnstw]

url_protocol
  = '//' // for protocol-relative URLs
  / 'ftp://'
  / 'git://'
  / 'gopher://'
  / 'http://'
  / 'https://'
  / 'irc://'
  / 'ircs://'  // @bug 28503
  / 'mailto:'
  / 'mms://'
  / 'news:'
  / 'nntp://' // @bug 3808 RFC 1738
  / 'svn://'
  / 'telnet://' // Well if we're going to support the above.. -ævar
  / 'worldwind://'

// javascript does not support unicode features..
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]

urlencoded_char
  = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
        try {
            return decodeURI("%" + c0 + c1)
        } catch ( e ) {
            // Reject the match, and allow other fall-back productions to have a
            // go at it.
            return null;
        }
    }

//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]

// A URL: protocol, optional numeric address, then the rest. The characters
// '.', ':' and ',' are only accepted when not followed by a space or the end
// of the line.
url
  = proto:url_protocol
    addr:( ipv6_address / ipv4_address )?
    rest:( ( !inline_breaks
             c:no_punctuation_char
             { return c }
           )
         / s:[.:,] !(space / eolf) { return s }
         / htmlentity
         / urlencoded_char
         / [&%]
         )+
    {
        return proto + addr + rest.join('');
    }

ipv4_address
  = a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*)
    {
        return flatten( a ).join('');
    }

ipv6_address
  = a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']')
    {
        return flatten( a ).join('');
    }

/**************************************************************
 * Templates, -arguments and wikilinks
 **************************************************************/

/*
 * Precedence: template arguments win over templates. 
See
 * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
 * 4: {{{{·}}}} → {·{{{·}}}·}
 * 5: {{{{{·}}}}} → {{·{{{·}}}·}}
 * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
 * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
 */
tplarg_or_template
  = & '{{{{{' template
  / tplarg
  / template

// A template transclusion: {{target|param|...}}
template
  = "{{"
    target:template_param_text
    params:( newline? "|" newline? p:template_param { return p } )*
    "}}"
    {
        // The target goes in front as the first positional attribute so that
        // it can be expanded generically; the TemplateHandler then has to
        // shift it out again.
        params.unshift( { k: '', v: flatten( target ) } );
        return new SelfclosingTagTk( 'template', params );
    }

// XXX: support template and args in target!
//template_target
//  = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }

// A template argument: {{{name|default|...}}}
tplarg
  = "{{{"
    name:template_param_text
    params:( newline? "|" newline? p:template_param { return p } )*
    "}}}"
    {
        // As for templates, the (flattened) name is put in front of the
        // parameters.
        params.unshift( { k: '', v: flatten( name ) } );
        return new SelfclosingTagTk( 'templatearg', params );
    }

template_param
  = name:template_param_name
    s0:space*
    eq:"="?
    s1:space*
    value:template_param_text?
    {
        // An unmatched optional part shows up as ''.
        if ( value !== '' ) {
            // a value was matched: name is the key
            return new KV( name, flatten( value ) );
        }
        if ( eq !== '' ) {
            // "name=" without a value: key with empty value
            return new KV( flatten( name ), [] );
        }
        // neither '=' nor value: the text is the value, the key stays empty
        return new KV( [], flatten( name ) );
    }
  // empty parameter
  / & [|}] { return new KV( [], [] ); }

// FIXME: handle template args and templates in key! (or even parser functions?)
// The key part of a template parameter. The 'equal' flag is set while the
// name is matched and cleared again afterwards, both on success and (via the
// second, always failing alternative) on failure.
template_param_name
  = & { return setFlag( 'equal' ) }
    tpt:template_param_text
    {
        clearFlag( 'equal' );
        //console.warn( 'template param name matched: ' + pp( tpt ) );
        return tpt;
    }
  / & { return clearFlag( 'equal' ) }
  //= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }

// Inline content matched with the 'template' flag set; the second alternative
// only clears the flag and then fails.
template_param_text
  = & { return setFlag('template') }
    il:inline {
        clearFlag('template');
        //console.warn( 'tpt match: ' + pp (il));
        return il;
    }
  / & { return clearFlag('template'); }

// TODO: handle link prefixes as in al[[Razi]]
wikilink
  = "[["
    ! url
    //target:link_target
    target:wikilink_preprocessor_text
    lcontent:( "|" lt:link_text { return lt } )*
    "]]"
    // XXX In real MediaWiki, this is a language-dependent positive character
    // class. Can we work out a static negative class instead?
    // XXX: Exclude uppercase chars from non-latin languages too!
    trail:(! [A-Z \t(),.:-] tc:text_char { return tc })* {
        var obj = new TagTk( 'a', [
                    new KV('data-mw-type', 'internal')
                    ] ),
            // trail is a list of characters (it is joined below) and is
            // therefore always truthy, even when empty; test the joined text
            // so that a link without a trail does not get an extra empty
            // text token.
            trailText = trail ? trail.join('') : '',
            textTokens = [];
        obj.attribs.push( new KV('href', target) );
        if ( lcontent && lcontent.length ) {
            // explicit link text
            textTokens = lcontent;
            if ( trailText !== '' ) {
                textTokens.push( trailText );
            }
        } else {
            // no link text: use a deep copy of the target as the text
            textTokens = $.extend(true, [], target);
            if ( trailText !== '' ) {
                textTokens = textTokens.concat( [ trailText ] );
            }
        }
        //console.warn( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a' )])) );
        return [obj].concat(textTokens, [new EndTagTk( 'a' )]);
    }

link_text
  = & { return setFlag('linkdesc'); }
    h:inline
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline)?
    {
        //console.warn('link_text' + pp(h) + pp(hs));
        clearFlag('linkdesc');
        if ( hs !== '' ) {
            return h.concat(hs);
        } else {
            return h;
        }
    }
  / & { return clearFlag('linkdesc'); }

link_end = "]]"

/* Generic quote production for italic and bold, further processed in a token
 * stream transformation in doQuotes. Relies on NlTk tokens being emitted
 * for each line of text to balance quotes per line.
*
 * We are not using a simple pair rule here as we need to support mis-nested
 * bolds/italics and MediaWiki's special heuristics for apostrophes, which are
 * all not context free. */
quote = "''" x:"'"* {
    var res = new TagTk( 'mw-quote' ); // Will be consumed in token transforms
    // the complete run of apostrophes that was matched
    res.value = "''" + x.join('');
    return res;
}

/***********************************************************
 * Pre and xmlish tags
 ***********************************************************/

// Indented pre blocks differ from their non-indented (purely tag-based)
// cousins by having their contents parsed.
pre_indent
  = pre_indent_in_tags
  / l:pre_indent_line ls:(sol pre_indent_line)* {
        return [new TagTk( 'pre' )]
            .concat( [l], ls
                   , [new EndTagTk( 'pre' )]);
    }

// An indented pre block that is surrounded with pre tags. The pre tags are
// used directly.
// NOTE(review): in this copy of the file the tag literals of this rule had
// been reduced to "" (leaving the `attribs` label used in the action
// undefined). They are restored here as "<pre" attribs:generic_attribute* ">"
// and "</pre>"; the name of the attribute rule is not visible in this part of
// the grammar -- confirm against the upstream grammar.
pre_indent_in_tags
  = space+ // XXX: capture space for round-tripping
    "<pre"
    attribs:generic_attribute*
    ">"
    & { return setFlag('pre'); }
    l:inlineline
    ls:(sol pre_indent_line)*
    "</pre>"
    {
        clearFlag('pre');
        return [ new TagTk( 'pre', attribs ) ]
            .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );
    }
    // clean-up alternative: reset the flag, then fail
  / & { return clearFlag('pre'); }

// One line of an indented pre block: the leading space is dropped and a
// newline is put in front of the line's content.
pre_indent_line = space l:inlineline {
    return [ '\n' ].concat(l);
}

/*
 * Pre blocks defined using non-indented HTML tags only parse nowiki tags
 * inside them, and convert other content to verbatim text. Nowiki inside pre
 * is not functionally needed, but supported for backwards compatibility.
 */
// NOTE(review): the "<pre" ... ">" and "</pre>" literals of this rule were
// missing from this copy of the file as well and are restored here (see the
// note on pre_indent_in_tags) -- confirm against the upstream grammar.
pre
  = "<pre"
    attribs:generic_attribute*
    ">"
    // MediaWiki <pre> is special in that it converts all pre content to plain
    // text.
    ts:(t1:[^<]+ { return t1.join('') }
                / nowiki
                / !"</pre>" t2:. { return t2 })+
    ("</pre>" / eof) {
        // return nowiki tags as well?
        //console.warn('inpre');
        return [ new TagTk( 'pre', attribs ) ]
                .concat(ts, [ new EndTagTk( 'pre' ) ]);
    }
    // a stray end tag is passed through as text
  / "</pre>" { return "</pre>"; }

/* XXX: Extension tags can require a change in the tokenizer mode, which
 * returns any text between extension tags verbatim. For now, we simply
 * continue to parse the contained text and return the tokens. The original
 * input source can be recovered from the source positions added on tag
 * tokens. This won't however work in all cases. For example, a comment start
 * (