tags in the source and the like. Instead, we perform
 * some paragraph wrapping on the DOM.
 */

/* A paragraph: two start-of-line matches followed by inline content. The
 * paragraph TagTk itself is currently added later (see the commented-out
 * token below). */
para
  = s1:sol s2:sol c:inlineline {
      return s1.concat(s2, /* [new TagTk('p')],*/ c);
  }

/* Trailing whitespace before a newline becomes an explicit br token. The
 * newline itself is only look-ahead and is not consumed here. */
br
  = space* &newline {
      return new SelfclosingTagTk( 'br' );
  }

/*
 * Syntax stops: Avoid eating significant tokens for higher-level productions
 * in nested inline productions.
 *
 * XXX: Repeated testing of flags is not terribly efficient.
 */
inline_breaks
  = & [=|!}:\r\n\]<] // don't check further if char cannot match
    res:(
        & {
            // Important hack: disable caching for this production, as the
            // default cache key does not take into account flag states!
            cacheKey = '';
            console.warn('ilb: ' + input.substr(pos, 5) );
            return true;
        }
        // Table delimiters: only significant while the 'table' flag is set.
        & { return syntaxFlags['table']; }
        (
            a:(newline [!|] / '||' / '!!' / '|}') {
                //console.warn("table break" + pp(a) + pos);
                return true;
            }
          / & { return syntaxFlags['tableCellArg'] }
            "|" { return true }
        )
      / & {
            return (syntaxFlags['colon'] &&
                    ! syntaxFlags.extlink &&
                    // example: ; [[Link:Term]] : Definition
                    ! syntaxFlags.linkdesc);
        }
        ":" { return true; }
      / & { return syntaxFlags['extlink']; } "]" { return true; }
      / & { return syntaxFlags['linkdesc']; } link_end { return true; }
      / & { return syntaxFlags['h']; } '='+ space* newline { return true; }
      / & { return syntaxFlags['template']; }
        ('|' / '}}' ) {
            //console.warn( 'template break @' + pos + input.substr(pos-1, 4) );
            return true;
        }
      / & { return syntaxFlags['equal']; }
        '=' {
            //console.warn( 'equal stop @' + pos + input.substr(pos-1, 4) );
            return true;
        }
      // FIX(garbled source): the stop literal here was stripped from this
      // dump (it matched an empty string, which would always succeed).
      // Restored to the closing pre tag, which also explains the '<' in the
      // first-character guard class above. TODO: confirm against the
      // original grammar.
      / & { return syntaxFlags['pre']; }
        '</pre>' {
            //console.warn( 'pre stop @' + pos + input.substr(pos-1, 4) );
            return true;
        }
    ) { return res }

/* Experimental variant that defers the stop decision to an external
 * inline_breaks() implementation supplied through __parseArgs. Returning
 * null from the action rejects the match. */
inline_breaks_experiment
  = & [=|!}:\r\n\]<]
    & {
        // Important hack: disable caching for this production, as the default
        // cache key does not take into account flag states!
        cacheKey = '';
        //console.warn('ilbf: ' + input.substr(pos, 5) );
        return true;
    }
    . {
        return __parseArgs[3].inline_breaks( input, pos - 1, syntaxFlags )
            && true || null ;
    }

inline
  = c:(urltext / (! inline_breaks (inline_element / . )))+ {
      //console.warn('inline out:' + pp(out));
      return flatten_stringlist( c );
  }

inlineline
  = c:(urltext / !inline_breaks (inline_element / [^\n]))+ {
      return flatten_stringlist( c );
  }

inline_element
  = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
    & '<' ( comment / xmlish_tag )
  /// & '{' ( & '{{{{{' template / tplarg / template )
  / & '{' tplarg_or_template
  /// & '{' ( tplarg / template )
  // Eat three opening brackets as text.
  / '[[[' { return '[[[' }
  / & '[' ( wikilink / extlink )
  / & "'" quote

/* Headings */
h = & "=" // guard, to make sure '='+ will match.
          // XXX: Also check to end to avoid inline parsing?
    r:(
        s:'='+ // moved in here to make s accessible to inner action
        & { return setFlag('h'); }
        c:inlineline
        e:'='+
        spc:(sp:space+ { return sp.join('') } / comment)*
        &eolf
        {
            clearFlag('h');
            var level = Math.min(s.length, e.length);
            // convert surplus equals into text
            if (s.length > level) {
                var extras = s.substr(0, s.length - level);
                if (c[0].constructor === String) {
                    c[0] = extras + c[0];
                } else {
                    c.unshift( extras );
                }
            }
            if (e.length > level) {
                var extras = e.substr(0, e.length - level),
                    lastElem = c[c.length - 1];
                if (lastElem.constructor === String) {
                    // BUG FIX: 'lastElem += extras' only updated the local
                    // copy of the string; write the result back into c so the
                    // surplus '=' actually appear in the heading content.
                    c[c.length - 1] = lastElem + extras;
                } else {
                    c.push( extras );
                }
            }
            return [new TagTk( 'h' + level )]
                .concat(c, [new EndTagTk( 'h' + level ), spc]);
        }
      / & { dp('nomatch exit h'); clearFlag('h'); return false } { return null }
    ) { return r }

/* HTML comment. FIX(garbled source): the '<!--' / '-->' literals and the
 * comment_chars capture were stripped from this dump; restored — the action
 * below requires c (an array of comment_chars), and comment_chars itself
 * guards against '-->' via !'->' after '-'. */
comment
  = '<!--' c:comment_chars* ('-->' / eof)
    cs:(space* newline space* cn:comment { return cn })* {
        return [{ type: 'COMMENT', value: c.join('') }].concat(cs);
    }

comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }

/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

urllink
  = !
{ return syntaxFlags['extlink'] }
    target:url {
        return [ new TagTk( 'a', [new KV('href', target)] ),
                 target,
                 new EndTagTk( 'a' ) ];
    }

/* Bracketed external link: "[" url (optional whitespace + link text) "]". */
extlink
  = ! { return syntaxFlags['extlink'] } // extlink cannot be nested
    (
        "["
        & { return setFlag('extlink'); }
        //target:urllink
        target:extlink_preprocessor_text
        text:(( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
              t:inlineline { return t } )?
        "]" {
            clearFlag('extlink');
            if ( text === '' ) {
                // XXX: Link numbering should be implemented in post-processor.
                text = [ "[" + linkCount + "]" ];
                linkCount++;
            }
            var res = [
                new TagTk( 'a', [
                    new KV('href', target),
                    new KV('data-mw-type', 'external')
                ] ),
            ].concat( text, [ new EndTagTk( 'a' )]);
            //console.warn( JSON.stringify( res, null, 2 ) );
            return res;
        }
      // Failure path: make sure the flag is cleared before rejecting.
      / "[" & { clearFlag('extlink'); return false; }
    )

/* Default URL protocols in MediaWiki (see DefaultSettings). Normally these
 * can be configured dynamically. */

// First characters of the protocols below (cheap guard class).
url_chars = [/fghimnstw]

url_protocol
  = '//'        // for protocol-relative URLs
  / 'ftp://'
  / 'git://'
  / 'gopher://'
  / 'http://'
  / 'https://'
  / 'irc://'
  / 'ircs://'   // @bug 28503
  / 'mailto:'
  / 'mms://'
  / 'news:'
  / 'nntp://'   // @bug 3808 RFC 1738
  / 'svn://'
  / 'telnet://' // Well if we're going to support the above.. -ævar
  / 'worldwind://'

// javascript does not support unicode features..
unicode_separator_space
  = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]

urlencoded_char
  = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] {
      try {
          return decodeURI("%" + c0 + c1);
      } catch ( e ) {
          // Reject the match, and allow other fall-back productions to have a
          // go at it.
          return null;
      }
  }

//[^][<>"\\x00-\\x20\\x7F\p{Zs}]
no_punctuation_char
  = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]

url
  = proto:url_protocol
    addr:( ipv6_address / ipv4_address )?
rest:( ( !inline_breaks c:no_punctuation_char { return c } )
         / s:[.:,] !(space / eolf) { return s }
         / htmlentity
         / urlencoded_char
         / [&%] )+ {
        return proto + addr + rest.join('');
    }

ipv4_address
  = a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*) {
      return flatten( a ).join('');
  }

ipv6_address
  = a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']') {
      return flatten( a ).join('');
  }

/**************************************************************
 * Templates, -arguments and wikilinks
 **************************************************************/

tplarg_or_template
  = & '{{{{{' template // five or more braces: try the template first
  / tplarg
  / template

template
  = "{{" target:template_param_text
    params:(newline? "|" newline? p:template_param { return p })*
    "}}" {
        // Insert target as first positional attribute, so that it can be
        // generically expanded. The TemplateHandler then needs to shift it out
        // again.
        params.unshift( { k: '', v: flatten( target ) } );
        var obj = new SelfclosingTagTk( 'template', params );
        //console.warn( 'tokenizer template ' + JSON.stringify( target ));
        return obj;
    }

// XXX: support template and args in target!
//template_target
// = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }

tplarg
  = "{{{" name:template_param_text
    params:( newline? "|" newline? p:template_param { return p })*
    "}}}" {
        name = flatten( name );
        params.unshift( { k: '', v: name } );
        var obj = new SelfclosingTagTk( 'templatearg', params );
        //console.warn( 'tokenizer tplarg ' + JSON.stringify( obj, null, 2 ));
        return obj;
    }

template_param
  = name:template_param_name s0:space* eq:"="? s1:space* value:template_param_text? {
      //console.warn( 'named template_param matched' + pp([name, value ]) );
      if ( value !== '' ) {
          return new KV( name, flatten( value ) );
      } else if ( eq !== '' ) {
          return new KV(flatten( name ), []);
      } else {
          return new KV([], flatten(name));
      }
  }
  // empty parameter
  / & [|}] { return new KV([], []); }

// FIXME: handle template args and templates in key! (or even parser functions?)
template_param_name
  = & { return setFlag( 'equal' ) }
    tpt:template_param_text {
        clearFlag( 'equal' );
        //console.warn( 'template param name matched: ' + pp( tpt ) );
        return tpt;
    }
  // Failure path: clearFlag returns a falsy value, so this always rejects
  // while resetting the flag.
  / & { return clearFlag( 'equal' ) }
  //= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }

template_param_text
  = & { return setFlag('template') }
    il:inline {
        clearFlag('template');
        //console.warn( 'tpt match: ' + pp (il));
        return il;
    }
  / & { return clearFlag('template'); }

// TODO: handle link prefixes as in al[[Razi]]
wikilink
  = "[[" ! url
    //target:link_target
    target:wikilink_preprocessor_text
    lcontent:( "|" lt:link_text { return lt } )*
    "]]"
    // XXX In real MediaWiki, this is a language-dependent positive character
    // class. Can we work out a static negative class instead?
    // XXX: Exclude uppercase chars from non-latin languages too!
    trail:(! [A-Z \t(),.:-] tc:text_char { return tc })* {
        var obj = new TagTk( 'a', [ new KV('data-mw-type', 'internal') ] ),
            textTokens = [];
        obj.attribs.push( new KV('href', target) );
        if (lcontent && lcontent.length) {
            textTokens = lcontent;
            if (trail) {
                textTokens.push( trail.join('') );
            }
        } else {
            if (trail) {
                textTokens = $.extend(true, [], target).concat( [ trail.join('') ] );
            } else {
                // copy list
                textTokens = $.extend(true, [], target);
            }
        }
        //console.warn( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a' )])) );
        return [obj].concat(textTokens, [new EndTagTk( 'a' )]);
    }

link_text
  = & { return setFlag('linkdesc'); }
    h:inline
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline)? {
        //console.warn('link_text' + pp(h) + pp(hs));
        clearFlag('linkdesc');
        if( hs !== '' ) {
            return h.concat(hs);
        } else {
            return h;
        }
    }
  / & { return clearFlag('linkdesc'); }

link_end = "]]"

/* Generic quote production for italic and bold, further processed in a token
 * stream transformation in doQuotes. Relies on NlTk tokens being emitted
 * for each line of text to balance quotes per line.
* * We are not using a simple pair rule here as we need to support mis-nested * bolds/italics and MediaWiki's special heuristics for apostrophes, which are * all not context free. */ quote = "''" x:"'"* { var res = new TagTk( 'mw-quote' ); // Will be consumed in token transforms res.value = "''" + x.join(''); return res; } /*********************************************************** * Pre and xmlish tags ***********************************************************/ // Indented pre blocks differ from their non-indented (purely tag-based) // cousins by having their contents parsed. pre_indent = pre_indent_in_tags / l:pre_indent_line ls:(sol pre_indent_line)* { return [new TagTk( 'pre' )] .concat( [l], ls , [new EndTagTk( 'pre' )]); } // An indented pre block that is surrounded with pre tags. The pre tags are // used directly. pre_indent_in_tags = space+ // XXX: capture space for round-tripping "
" & { return setFlag('pre'); } l:inlineline ls:(sol pre_indent_line)* "" { clearFlag('pre'); return [ new TagTk( 'pre', attribs ) ] .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] ); } / & { return clearFlag('pre'); } pre_indent_line = space l:inlineline { return [ '\n' ].concat(l); } /* * Pre blocks defined using non-indented HTML tags only parse nowiki tags * inside them, and convert other content to verbatim text. Nowiki inside pre * is not functionally needed, but supported for backwards compatibility. */ pre = "
" // MediaWiki" / eof) { // return nowiki tags as well? //console.warn('inpre'); return [ new TagTk( 'pre', attribs ) ] .concat(ts, [ new EndTagTk( 'pre' ) ]); } / "" { return ""; } /* XXX: Extension tags can require a change in the tokenizer mode, which * returns any text between extension tags verbatim. For now, we simply * continue to parse the contained text and return the tokens. The original * input source can be recovered from the source positions added on tag * tokens. This won't however work in all cases. For example, a comment start * (is special in that it converts all pre content to plain // text. ts:(t1:[^<]+ { return t1.join('') } / nowiki / !"" t2:. { return t2 })+ ("