/* Combined Wiki (MediaWiki) and HTML tokenizer. Produces a token stream
 * (actually a list of tokens for now) suitable for a HTML5TreeBuilder. */
{
    /* Fixme: use static functions to separate module! Unfortunately, this
     * does not work:
     * var tu = require('./mediawiki.tokenizer.utils.js');
     * console.log(tu.flatten([]));
     * Using exports in the module gets a bit further, but accesses to
     * tu.flatten in productions still fail. Thus, I just moved the functions
     * here until a solution is found:
     */

    /* Static utilities */

    // Flatten a list of lists. Recurses into nested arrays; non-array
    // values are appended to the result unchanged.
    var flatten = function ( e ) {
        // Fast single-level flatten:
        //return [].concat.apply([], e);

        var es = [];
        // flatten sub-arrays
        for(var i = 0, length = e.length; i < length; i++) {
            var ei = e[i];
            if ($.isArray(ei))
                es = es.concat(flatten(ei));
            else
                es.push(ei);
        };
        return es;
    };

    // Remove escaped quotes from attributes etc
    // This was in the original PEG parser, but could not find anything in
    // MediaWiki that supports \' and \"-style escaping. So remove? -- gwicke
    // NOTE(review): String.prototype.replace with a string pattern only
    // replaces the FIRST occurrence — confirm whether that is intended.
    var unquote = function (quotec, text) {
        return text.replace('\\' + quotec, quotec);
    };

    // Decode html entities. In a browser, this should only be fed the entity,
    // not untrusted html! XXX: replace with safer version.
    // NOTE(review): the element literal here had been swallowed by an
    // HTML-unaware extraction step, leaving a broken empty $(" ... ");
    // restored the conventional throwaway <div/> wrapper so the entity
    // text can be decoded via jQuery .html()/.text().
    var unentity = function ( entity ) {
        return $("<div/>").html(entity).text();
    };

    // Debug print with global switch
    var dp = function ( msg ) {
        if ( false ) {
            console.log(msg);
        }
    };

    // Pretty-print helper for debug output.
    var pp = function ( s ) { return JSON.stringify(s, null, 2); }

    /*
     * Annotate a token stream with list items with appropriate list tokens
     *
     * @static
     * @method
     * @param {[tokens]} Token stream with li tokens
     * @returns {[tokens]} Token stream, possibly with additional list tokens
     *
     */
    var annotateList = function ( tokens ) {
        var out = [],     // List of tokens
            bstack = [],  // Bullet stack, previous element's listStyle
            bnext = [],   // Next element's listStyle
            endtags = []; // Stack of end tags

        // Length of the common prefix of the bullet strings x and y.
        var commonPrefixLength = function (x, y) {
            var minLength = Math.min(x.length, y.length);
            for(var i = 0; i < minLength; i++) {
                if (x[i] != y[i])
                    break;
            }
            // i is var-scoped, so it survives the loop
            return i;
        };

        // Open a new list: emit its start tags, stack the matching end tags.
        var pushList = function ( listName, itemName ) {
            out.push({type: 'TAG', name: listName});
            out.push({type: 'TAG', name: itemName});
            endtags.push({type: 'ENDTAG', name: listName});
            endtags.push({type: 'ENDTAG', name: itemName});
        };

        // Close the n innermost item/list pairs from the end-tag stack.
        var popTags = function ( n ) {
            for(;n > 0; n--) {
                // push list item..
                out.push(endtags.pop());
                // and the list end tag
                out.push(endtags.pop());
            }
        };

        // True iff a and b are the ':'/';' definition-list pair (either order).
        var isDlDd = function (a, b) {
            var ab = [a,b].sort();
            return (ab[0] === ':' && ab[1] === ';');
        };

        // Emit the tokens for the transition from bullet stack bs (previous
        // line) to bn (current line).
        var doListItem = function ( bs, bn ) {
            var prefixLen = commonPrefixLength (bs, bn);
            var changeLen = Math.max(bs.length, bn.length) - prefixLen;
            var prefix = bn.slice(0, prefixLen);
            // emit close tag tokens for closed lists
            if (changeLen === 0) {
                // same nesting: close the current item, open a fresh one
                var itemToken = endtags.pop();
                out.push(itemToken);
                out.push({type: 'TAG', name: itemToken.name});
                endtags.push({type: 'ENDTAG', name: itemToken.name});
            } else if ( bs.length == bn.length
                        && changeLen == 1
                        && isDlDd( bs[prefixLen], bn[prefixLen] ) ) {
                // handle dd/dt transitions
                out.push(endtags.pop());
                if( bn[prefixLen] == ';') {
                    var newName = 'dt';
                } else {
                    var newName = 'dd';
                }
                out.push({type: 'TAG', name: newName});
                endtags.push({type: 'ENDTAG', name: newName});
            } else {
                popTags(bs.length - prefixLen);

                if (prefixLen > 0 && bn.length == prefixLen ) {
                    // some lists closed, but the shared item continues
                    var itemToken = endtags.pop();
                    out.push(itemToken);
                    out.push({type: 'TAG', name: itemToken.name});
                    endtags.push({type: 'ENDTAG', name: itemToken.name});
                }

                // open new lists for the remaining bullet chars
                for(var i = prefixLen; i < bn.length; i++) {
                    switch (bn[i]) {
                        case '*':
                            pushList('ul', 'li');
                            break;
                        case '#':
                            pushList('ol', 'li');
                            break;
                        case ';':
                            pushList('dl', 'dt');
                            break;
                        case ':':
                            pushList('dl', 'dd');
                            break;
                        default:
                            throw("Unknown node prefix " + prefix[i]);
                    }
                }
            }
        };

        for (var i = 0, length = tokens.length; i < length; i++) {
            var token = tokens[i];
            switch ( token.type ) {
                case 'TAG':
                    switch (token.name) {
                        case 'list':
                            // ignore token
                            break;
                        case 'listItem':
                            // convert listItem to list and list item tokens
                            bnext = token.bullets;
                            doListItem( bstack, bnext );
                            bstack = bnext;
                            break;
                        default:
                            // pass through all remaining start tags
                            out.push(token);
                            break;
                    }
                    break;
                case 'ENDTAG':
                    if ( token.name == 'list' ) {
                        // pop all open list item tokens
                        popTags(bstack.length);
                        bstack = [];
                    } else {
                        out.push(token);
                    }
                    break;
                default:
                    out.push(token);
                    break;
} // end switch ( token.type )
} // end loop over tokens
        return out;
    };
    /* End static utilities */

    /*
     * Flags for specific parse environments (inside tables, links etc). Flags
     * trigger syntactic stops in the inline_breaks production, which
     * terminates inline and attribute matches. Flags merely reduce the number
     * of productions needed: The grammar is still context-free as the
     * productions can just be unrolled for all combinations of environments
     * at the cost of a much larger grammar.
     */
    var syntaxFlags = {};
    // Reference-count a flag up; returns true so it can serve directly as a
    // successful semantic-predicate result in the grammar.
    var setFlag = function(flag) {
        if (syntaxFlags[flag] !== undefined) {
            syntaxFlags[flag]++;
        } else {
            syntaxFlags[flag] = 1;
        }
        return true;
    };
    // Reference-count a flag down; returns false (predicate failure) so it
    // can be used in fall-back alternatives that only reset state.
    var clearFlag = function(flag) {
        syntaxFlags[flag]--;
        return false;
    };

    // Start position of top-level block
    // Could also provide positions for lower-level blocks using a stack.
    var blockStart = 0;

    // Start position of generic tag production
    var tagStartPos = 0;

    // cache the input length
    // NOTE(review): 'input' and 'pos' appear to be the parser-internal
    // variables PEG.js exposes inside actions/predicates — confirm against
    // the generator version in use.
    var inputLength = input.length;

    // pseudo-production that matches at end of input
    var isEOF = function (pos) { return pos === inputLength; };

    // text start position
    var textStart = 0;

    // hack to support numbered external links ([http://example.com]).
    // XXX: Move to token stream transform after templates are expanded!
    var linkCount = 1;

    // Define block-level tags in JS, so we can use toLowerCase to match tags
    // case-independently. This would be quite ugly (and possibly slower) if
    // done manually in the grammar.
    var block_names = (function () {
        var names = [ "p", "table", "td", "tr", "ul", "ol"
                , "li", "dl", "dt", "dd", "div", "center"
                , "blockquote" ];
        var bnames = {};
        for(var i = 0, l = names.length; i < l; i++) {
            bnames[names[i]] = true;
        }
        return bnames;
    })();

    var self = this;
}

// Entry point: tokenize the whole input into a flat token list.
start
  = e:toplevelblock* newline* {
        // end is passed inline as a token, as well as a separate event for now.
        // this does not work yet.
        //console.log('about to emit' + pp(self));
        //self._tokenizer.emit('chunk', [ { type: 'END' } ] );
        //self._tokenizer.emit('end');

        // Append the end (for obvious reasons this should not
        // be part of a stream, only when tokenizing complete
        // texts)
        //console.log( pp( flatten ( e ) ) );
        return flatten(e);
    }

/* All chars that cannot start syntactic structures in the middle of a line
 * XXX: ] and other end delimiters should probably only be activated inside
 * structures to avoid unnecessarily leaving the text production on plain
 * content. */
text_char = [^'<~[{\n\r:\]}|!=]

text = t:text_char+ { return t.join(''); }

/* Explanation of chars
 * ' quotes (italic/bold)
 * < start of xmlish_tag
 * ~ signatures/dates
 * [ start of links
 * { start of parser functions, transclusion and template args
 * \n all sort of block-level markup at start of line
 * \r ditto
 * h http(s) urls
 * n nntp(s) urls
 * m mailto urls
 *
 * ! and | table cell delimiters, might be better to specialize those
 * = headings - also specialize those!
 *
 * The following chars are also included for now, but only apply in some
 * contexts and should probably be enabled only in those:
 * : separate definition in ; term : definition
 * ] end of link
 * } end of parser func/transclusion/template arg
 */

// Text as it appears in running prose, with URLs and entities recognized.
urltext = ( t:[^'<~[{\n\rfghimnstw|!:\]} &=]+ { return t.join(''); }
          / htmlentity
          / urllink
          // Convert trailing space into a non-breaking space
          // (the entity name in this comment was eaten by extraction;
          // the action returns "\u00a0")
          // XXX: This should be moved to a serializer
          / ' ' & ':' { return "\u00a0"; }
          / t:text_char )+

/*
  '//', // for protocol-relative URLs, but not in text!
  'ftp://',
  'git://',
  'gopher://',
  'http://',
  'https://',
  'irc://',
  'ircs://',  // @bug 28503
  'mailto:',
  'mms://',
  'news:',
  'nntp://', // @bug 3808 RFC 1738
  'svn://',
  'telnet://', // Well if we're going to support the above.. -ævar
  'worldwind://',
*/

// Old version
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }

// Experimental tweaked version: avoid expensive single-char substrings
// This did not bring the expected performance boost, however.
//text = [A-Za-z0-9,._ -] {
//          textStart = pos;
//
//          var res = input.substr(textStart - 1, inputLength)
//                      .match(/[A-Za-z0-9,._ -]+/)[0];
//          pos = pos + (res.length - 1);
//          return res
//      }

// Named or numeric HTML character reference (e.g. an ampersand entity),
// decoded to plain text via the unentity() helper from the initializer.
htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
        return unentity("&" + c.join('') + ";")
}

// Run of spaces/tabs, joined to a single string.
space
  = s:[ \t]+ { return s.join(''); }

// Optional whitespace, wrapped as a TEXT token list (possibly empty).
optionalSpaceToken
  = s:space* {
        if ( s.length ) {
            return [{type: 'TEXT', value: s.join('')}];
        } else {
            return [];
        }
    }

// Start of line
sol = nl:(newlineToken / & { return pos === 0; } { return [] })
    // Eat multi-line comments, so that syntax after still matches as if it
    // was actually preceded by a newline
    cn:( c:comment n:newline? {
            if ( n !== '' ) {
                return [c, {type: 'TEXT', value: n}];
            } else {
                return [c];
            }
        }
    )*
    // Eat includeonly/noinclude at start of line, so that start-of-line
    // syntax after it still matches
    ni:(space* "<" c:"/"? t:("includeonly" / "noinclude") ">" {return [c, t]} )?
    {
        var niToken = [];
        if ( ni !== '') {
            if ( ni[0] === '/' ) {
                niToken = [{type: 'ENDTAG', name: ni[1]}];
            } else {
                niToken = [{type: 'TAG', name: ni[1]}];
            }
        }
        return nl.concat(cn, niToken);
    }

// pseudo-production matching (only) at end of input
eof = & { return isEOF(pos); } { return true; }

newline
  = '\n' / '\r\n'

newlineToken = newline { return [{type: 'NEWLINE'}] }

// end of line or end of file
eolf = newline / eof

// A top-level block; records the source range of the block on its first
// token as a data-sourcePos attribute.
toplevelblock = & { blockStart = pos; return true; } b:block {
        b = flatten(b);
        var bs = b[0];
        //dp('toplevelblock:' + pp(b));
        if (bs.attribs === undefined) {
            bs.attribs = [];
        }
        bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
        // XXX: only run this for lines that actually need it!
        //b.push({type: 'NEWLINE'}); // Move this to a token stream transform!
        //console.log('about to emit' + pp(self));
        //console.log( pp( result._tokenizer ));
        //console.log('emitted chunk' + pp(b));
        //return [];
        return b;
    }

block
  = block_lines
  / pre
  / comment &eolf
  / nowiki
  / pre
  / bt:block_tag { return [bt] } // avoid a paragraph if we know that the line starts with a block tag
  / para
  / inlineline // includes generic tags; wrapped into paragraphs in DOM postprocessor
  / s:sol
    /*{
        if (s) {
            return [s, {type: 'NEWLINE'}];
        } else {
            return [{type: 'NEWLINE'}];
        }
    }*/

block_lines = s:sol
    // eat an empty line before the block
    s2:(os:optionalSpaceToken so:sol { return os.concat(so) })?
    bl:block_line {
        var s2_ = (s2 !== '') ? s2 : [];
        return s.concat(s2_, bl);
    }

// Block structures with start-of-line wiki syntax
block_line = h
    /// table
    / & [{}|] tl:table_lines { return tl; }
    / lists
    // tag-only lines should not trigger pre
    / st:optionalSpaceToken
      bt:(bts:block_tag stl:optionalSpaceToken { return bts.concat(stl) })+
      &eolf {
        return st.concat(bt);
    }
    / pre_indent
    / pre

// Paragraph: an empty line followed by inline content.
para
  = s1:sol s2:sol c:inlineline {
        return s1.concat(s2, /* [{type: 'TAG', name: 'p'}],*/ c);
    }

br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }

// Syntax stops to limit inline expansion depending on syntactic context
inline_breaks
  = & {
        // Important hack: disable caching for this production, as the default
        // cache key does not take into account flag states!
        cacheKey = '';
        return true;
    }
    & { return syntaxFlags['table']; }
    ( a:(newline [!|] / '||' / '!!' / '|}') {
            dp("table break" + pp(a) + pos);
            return true;
        }
    / & { return syntaxFlags['tableCellArg'] }
      "|" { return true }
    )
  / & { return (syntaxFlags['colon'] &&
                ! syntaxFlags.extlink && // example: ; [[Link:Term]] : Definition
                !
syntaxFlags.linkdesc); } ":" { return true; }
  / & { return syntaxFlags['extlink']; } "]" { return true; }
  / & { return syntaxFlags['linkdesc']; } link_end { return true; }
  / & { return syntaxFlags['h']; } '='+ space* newline { return true; }
  / & { return syntaxFlags['template']; } ('|' / '}}') { return true; }

// General inline content: merges consecutive string matches into TEXT
// tokens and passes token objects through unchanged.
inline
  = c:(urltext / (! inline_breaks (inline_element / . )))+ {
        var out = [];
        var text = [];
        c = flatten(c);
        for (var i = 0, l = c.length; i < l; i++) {
            var ci = c[i];
            if (typeof ci === 'string') {
                if(ci !== '') {
                    text.push(ci);
                }
            } else {
                // flush collected text before a structured token
                if (text.length) {
                    out.push({ type: "TEXT", value: text.join('') });
                    text = [];
                }
                out.push(ci);
            }
        }
        if (text.length) {
            out.push({ type: 'TEXT', value: text.join('') });
        }
        //dp('inline out:' + pp(out));
        return out;
    }

// Like inline, but confined to a single line ([^\n] instead of '.').
inlineline
  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
        var out = [];
        var text = [];
        c = flatten(c);
        for (var i = 0; i < c.length; i++) {
            var ci = c[i]
            if (typeof ci == 'string') {
                if(ci !== '') {
                    text.push(ci);
                }
            } else {
                if (text.length) {
                    out.push({type: 'TEXT', value: text.join('')});
                    text = [];
                }
                out.push(ci);
            }
        }
        if (text.length) {
            out.push({type: 'TEXT', value: text.join('')});
        }
        //dp('inlineline out:' + pp(out));
        return out;
    }

// Dispatch on the first character to the matching inline structure.
inline_element
  = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
    & '<' ( comment / xmlish_tag )
  / & '{' ( & '{{{{{' template / tplarg / template )
  // Eat three opening brackets as text.
  / '[[[' { return { type: 'TEXT', value: '[[[' } }
  / & '[' ( wikilink / extlink )
  / & "'" quote

/* Headings */
h = & "=" // guard, to make sure '='+ will match.
          // XXX: Also check to end to avoid inline parsing?
    r:(
        s:'='+ // moved in here to make s accessible to inner action
        & { return setFlag('h'); }
        c:inlineline
        e:'='+
        spc:(sp:space+ { return {type: 'TEXT', value: sp.join('') } } / comment)*
        &eolf {
            clearFlag('h');
            // heading level = the shorter of the two '=' runs
            var level = Math.min(s.length, e.length);
            // convert surplus equals into text
            if(s.length > level) {
                var extras = s.substr(0, s.length - level);
                if(c[0].type == 'TEXT') {
                    c[0].value = extras + c[0].value;
                } else {
                    c.unshift({type: 'TEXT', value: extras});
                }
            }
            if(e.length > level) {
                var extras = e.substr(0, e.length - level),
                    lastElem = c[c.length - 1];
                if(lastElem.type == 'TEXT') {
                    lastElem.value = lastElem.value + extras;
                } else {
                    c.push({type: 'TEXT', value: extras});
                }
            }
            return [{type: 'TAG', name: 'h' + level}]
                .concat(c, [{type: 'ENDTAG', name: 'h' + level}, spc]);
        }
        // reset the flag if the heading did not match after all
        / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
    ) { return r }

// Indent-pre: lines starting with a space become a single pre block.
pre_indent
  = l:pre_indent_line ls:(sol pre_indent_line)* {
        return [{type: 'TAG', name: 'pre'}]
            .concat( [l], ls , [{type: 'ENDTAG', name: 'pre'}]);
    }

pre_indent_line = space l:inlineline {
        return [{type: 'TEXT', value: '\n'}].concat(l);
    }

// An HTML comment, plus any comment-only continuation lines.
// NOTE(review): the comment-start/-end literals of this rule had been
// swallowed by an HTML-unaware extraction step, leaving the invalid
// `comment = '' / eof)` with an unbalanced ')' and an action referencing
// an undefined label c. Restored from the evidence in the rule itself:
// the dangling ')', the c.join('') reference, and the otherwise-unused
// comment_chars production below.
comment = '<!--' c:comment_chars* ('-->' / eof)
    cs:(space* newline space* cn:comment { return cn })* {
        return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
    }

// Characters allowed inside a comment: anything but a closing '-->'.
comment_chars = c:[^-] { return c; }
    / c:'-' !'->' { return c; }

// A bare URL in running text: linked with the URL itself as label.
urllink = target:url {
        return [ { type: 'TAG', name: 'a', attribs: [['href', target]] }
               , {type: 'TEXT', value: target}
               , {type: 'ENDTAG', name: 'a'}
        ];
    }

// Bracketed external link: [proto://target optional label]
extlink
  = "[" & { return setFlag('extlink'); } target:url space* text:inlineline? "]" {
        clearFlag('extlink');
        if ( text == '' ) {
            // numbered link: no label given
            // XXX: Link numbering should be implemented in post-processor.
            text = [{type: 'TEXT', value: "[" + linkCount + "]"}];
            linkCount++;
        }
        return [ { type: 'TAG'
                 , name: 'a'
                 , attribs: [ ['href', target], ['data-type', 'external'] ], } ]
            .concat( text
                   , [{type: 'ENDTAG', name: 'a'}]);
    }
    // fall-back that only resets the flag
  / "[" & { clearFlag('extlink'); return false; }

/* Default URL protocols in MediaWiki (see DefaultSettings).
Normally these can
 * be configured dynamically. */
url_protocol
  = '//' // for protocol-relative URLs
  / 'ftp://'
  / 'git://'
  / 'gopher://'
  / 'http://'
  / 'https://'
  / 'irc://'
  / 'ircs://' // @bug 28503
  / 'mailto:'
  / 'mms://'
  / 'news:'
  / 'nntp://' // @bug 3808 RFC 1738
  / 'svn://'
  / 'telnet://' // Well if we're going to support the above.. -ævar
  / 'worldwind://'

// javascript does not support unicode features..
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]

// A %XX percent-escape, decoded to its character.
urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
        return decodeURI("%" + c0 + c1)
    }

//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]

// A full URL: protocol, optional IP address, then a run of URL characters.
// Inner [.:,] punctuation is accepted only when not followed by
// space/end-of-line, so trailing punctuation stays out of the link.
url = proto:url_protocol
      addr:( ipv6_address / ipv4_address )?
      rest:( ( !inline_breaks c:no_punctuation_char { return c } )
           / s:[.:,] !(space / eolf) { return s }
           / htmlentity
           / urlencoded_char
           / [&%] )+ {
        return proto + addr + rest.join('');
    }

ipv4_address
  = a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*) {
        return flatten( a ).join('');
    }

ipv6_address
  = a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']') {
        return flatten( a ).join('');
    }

// Template transclusion: {{target|positional|name=value|...}}
template
  = "{{" target:template_param_text
    params:(newline? "|" newline? p:template_param { return p })*
    "}}" {
        target = flatten( target );
        var obj = { type: 'SELFCLOSINGTAG',
                    name: 'template',
                    attribs: [['data-target', JSON.stringify(target)]],
                    orderedArgs: params,
                    args: {},
                    target: target };
        if (params && params.length) {
            // unnamed args are numbered from 1
            var position = 1;
            for ( var i = 0, l = params.length; i < l; i++ ) {
                var param = params[i];
                if ( param[0] === null ) {
                    obj.args[position] = param[1];
                    position++;
                } else {
                    // Last value wins for duplicates.
                    obj.args[param[0]] = param[1];
                }
            }
            // HACK: temporarily also push the args into an attribute
            // (just for debugging)
            obj.attribs.push(['data-json-args', JSON.stringify(obj.args)]);
        }
        // Should actually use a self-closing tag here, but the Node HTML5
        // parser only recognizes known self-closing tags for now, so use an
        // explicit end tag for now.
        //console.log(pp(obj));
        return obj;
    }

// XXX: support template and args in target!
template_target
  = h:( !"}}" x:([^|\n]) { return x } )* {
        return { type: 'TEXT', value: h.join('') }
    }

// A single template parameter, either name=value or positional.
template_param
  = name:template_param_name space* "=" space* c:template_param_text {
        return [[{ type: 'TEXT', value: name }], flatten( c )];
    }
  / c:template_param_text {
        return [[], flatten( c ) ];
    }

// Template argument: {{{name|default}}}
tplarg
  = "{{{" name:template_param_text
    params:("|" p:template_param { return p })* "}}}" {
        name = flatten( name );
        var obj = { type: 'SELFCLOSINGTAG',
                    name: 'templatearg',
                    attribs: [],
                    argname: name,
                    defaultvalue: [] };
        if (params && params.length) {
            // HACK, not final.
            obj.attribs.push(['data-defaultvalue', params[0][1]]);
            obj.attribs.push(['data-json-args', JSON.stringify(params)]);
            obj.defaultvalue = params[0][1];
        }
        //console.log( 'tokenizer templatearg ' + JSON.stringify( obj ));
        return obj;
    }

// FIXME: handle template args and templates in key! (or even parser functions?)
template_param_name
  = h:( !"}}" x:([^=|\n]) { return x } )* {
        return h.join('');
    }

// Parameter text is inline content parsed with the 'template' stop flag set.
template_param_text
  = & { return setFlag('template') } il:inline+ {
        clearFlag('template');
        return il;
    }
  / & { clearFlag('template'); return false; }

// TODO: handle link prefixes as in al[[Razi]]
wikilink
  = "[[" ! url target:link_target
    lcontent:( "|" lt:link_text { return lt } )*
    "]]"
    // XXX In real MediaWiki, this is a language-dependent positive character
    // class. Can we work out a static negative class instead?
    // XXX: Exclude uppercase chars from non-latin languages too!
    trail:(! [A-Z \t(),.:-] tc:text_char { return tc })* {
        var obj = { type: 'TAG',
                    name: 'a',
                    attribs: [ ['data-type', 'internal'] ] },
            textTokens = [];
        obj.attribs.push(['href', target]);
        if (lcontent && lcontent.length) {
            textTokens = lcontent;
            if (trail) {
                textTokens.push( { type: 'TEXT', value: trail.join('') } );
            }
        } else {
            if (trail) {
                // no explicit label: linktrail glues onto the target text
                target += trail.join('');
            }
            textTokens = [{type: 'TEXT', value: target}];
        }
        return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
    }

link_target
  = h:( c:[^|%\n\]]+ { return c.join('') } // quickly eat anything unsuspicious
      / !"]]" hi:( [^|%\n] / urlencoded_char / '%' ) { return hi }
    )* {
        return h.join('');
    }

// Link description, parsed with the 'linkdesc' stop flag set.
link_text
  = & { return setFlag('linkdesc'); } h:inlineline {
        clearFlag('linkdesc');
        return h;
    }
  / & { clearFlag('linkdesc'); return false }

link_end = "]]"

/* Generic quote production for italic and bold, further processed in a token
 * stream transformation in doQuotes. Relies on NEWLINE tokens being emitted
 * for each line of text to balance quotes per line.
 *
 * We are not using a simple pair rule here as we need to support mis-nested
 * bolds/italics and MediaWiki's special heuristics for apostrophes, which are
 * all not context free. */
quote = "''" x:"'"* {
        return {
            type: 'TAG',
            name : 'mw-quote', // Will be consumed in token transforms
            value: "''" + x.join('')
        }
    }

/* XXX: Extension tags can require a change in the tokenizer mode, which
 * returns any text between extension tags verbatim. For now, we simply
 * continue to parse the contained text and return the tokens. The original
 * input source can be recovered from the source positions added on tag
 * tokens. This won't however work in all cases. For example, a comment start
 * (