tags in the source and the like. Instead, we perform
 * some paragraph wrapping on the token stream and the DOM. */

// NOTE(review): this chunk reached review with its newlines collapsed and all
// angle-bracketed literals (<pre ...>, <!--, -->) stripped by an HTML-like
// extraction pass. That broke the grammar: '//' comments swallowed the code
// that followed them on the collapsed lines, and the tag/comment literals in
// the 'comment' and 'pre' productions vanished. Line structure and the
// stripped literals are restored below; places where the original wording
// could not be fully recovered are marked with NOTE(review).

// A paragraph: two start-of-line productions followed by inline content.
paragraph
  = s1:sol s2:sol c:inlineline
    {
        return s1.concat(s2, /* [new TagTk('p')],*/ c);
    }

// Trailing space before a newline is emitted as an explicit <br/> token.
br = s:optionalSpaceToken &newline
    { return s.concat( [ new SelfclosingTagTk( 'br', [], {tsr: [pos0, pos]} ) ] ); }

/*
 * Syntax stops: Avoid eating significant tokens for higher-level productions
 * in nested inline productions.
 *
 * Repeated testing of flags is not terribly efficient. See new and faster
 * version below.
 */

/*
 * Syntax stops: Avoid eating significant tokens for higher-level productions
 * in nested inline productions.
 */
inline_breaks
  = & [=|!}{:\r\n\]<]
    //& {
    //    // Important hack: disable caching for this production, as the default
    //    // cache key does not take into account flag states!
    //    cacheKey += 'foo';
    //    return true;
    //}
    & {
        //console.warn('ilbf: ' + input.substr(pos, 5) );
        //if ( null !== __parseArgs[3].inline_breaks( input, pos, stops ) ) {
        //    console.warn('ilb break: ' + pp(input.substr(pos, 5)) );
        //} else {
        //    console.warn('ilb no break: ' + pp(input.substr(pos, 5)) );
        //}
        return null !== __parseArgs[3].inline_breaks( input, pos, stops )
    }

inline
  = c:(urltext / (! inline_breaks (inline_element / . )))+
    {
        //console.warn('inline out:' + pp(c));
        return flatten_stringlist( c );
    }

inlineline
  = c:(urltext / !inline_breaks (inline_element / [^\r\n]))+
    {
        //console.warn('inlineline out:' + pp(c) + input.substr(pos0, pos));
        return flatten_stringlist( c );
    }

inline_element
  = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
    & '<' ( comment / xmlish_tag )
  /// & '{' ( & '{{{{{' template / tplarg / template )
  / & '{' tplarg_or_template
  /// & '{' ( tplarg / template )
  // Eat three opening brackets as text.
  / '[[[' { return '[[[' }
  / & '[' ( wikilink / extlink / autolink )
  / & "'" quote

/* Headings */
h = & "=" // guard, to make sure '='+ will match.
          // XXX: Also check to end to avoid inline parsing?
    r:(
        s:'='+ // moved in here to make s accessible to inner action
        & { return stops.inc('h'); }
        c:inlineline
        endTPos:({ return pos })
        e:'='+
        spc:(sp:space+ { return sp.join('') } / comment)*
        &eolf
        {
            stops.dec('h');
            var level = Math.min(s.length, e.length);
            // convert surplus equals into text
            if(s.length > level) {
                s = s.join('');
                var extras = s.substr(0, s.length - level);
                if(c[0].constructor === String) {
                    c[0] = extras + c[0];
                } else {
                    c.unshift( extras );
                }
            }
            if(e.length > level) {
                e = e.join('');
                var extras = e.substr(0, e.length - level),
                    lastElem = c[c.length - 1];
                if(lastElem.constructor === String) {
                    // FIX(review): was 'lastElem += extras', which only rebound
                    // the local (JS strings are immutable) and silently dropped
                    // the surplus '=' characters; write back into the array.
                    c[c.length - 1] = lastElem + extras;
                } else {
                    c.push( extras );
                }
            }

            return [new TagTk( 'h' + level, [], { tsr: [pos0, pos0+level] } )]
                .concat(c, [
                    new EndTagTk( 'h' + level, [],
                        { tsr: [endTPos, endTPos + level]} ),
                    spc
                ]);
        }
      / & { /* dp('nomatch exit h'); */ stops.dec('h'); return false } { return null }
    ) { return r }

/* HTML comments.
 * NOTE(review): the '<!--' and '-->' literals here were eaten by the
 * stripping pass (only the flanking quotes survived as '' in the residue);
 * restored to match the comment_chars helper below — confirm against the
 * upstream pegTokenizer grammar. */
comment
  = '<!--' c:comment_chars* ('-->' / eof)
    { return [new CommentTk( c.join(''), { tsr: [pos0, pos] } )]; }

comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }

// Behavior switches
behavior_switch
  = '__'
    behavior:(
        'NOTOC'
      / 'FORCETOC'
      / 'TOC'
      / 'NOEDITSECTION'
      / 'NEWSECTIONLINK'
      / 'NONEWSECTIONLINK'
      / 'NOGALLERY'
      / 'HIDDENCAT'
      / 'NOCONTENTCONVERT'
      / 'NOCC'
      / 'NOTITLECONVERT'
      / 'NOTC'
      / 'START'
      / 'END' // XXX: Removed in 19213. Should we keep this?
      / 'INDEX'
      / 'NOINDEX'
      / 'STATICREDIRECT'
    )
    '__'
    {
        return [
            new TagTk( 'behavior-switch', [new KV('word', behavior)],
                { tsr: [pos0, pos], src: input.substr(pos0, pos - pos0) } )
        ];
    }

/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

autolink
  = ! { return stops.onStack('extlink') }
    (urllink / autoref / isbn)

urllink
  = target:url
    {
        return [ new TagTk( 'urllink', [new KV('href', target)],
                 { tsr: [pos0, pos] } ) ];
    }

extlink
  = ! { return stops.onStack('extlink') } // extlink cannot be nested
    (
        "["
        & { return stops.push('extlink', true); }
        //target:urllink
        target:extlink_preprocessor_text
        text:(( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
              t:inlineline { return t } )?
        "]"
        {
            stops.pop('extlink');
            //if ( text === '' ) {
            //    // XXX: Link numbering should be implemented in post-processor.
            //    text = [ "[" + linkCount + "]" ];
            //    linkCount++;
            //}
            //console.warn( 'extlink text: ' + pp( text ) );
            var dataAttribs = {};
            if ( text === '' ) {
                dataAttribs.gc = 1;
            }
            dataAttribs.tsr = [pos0, pos];
            return [
                new SelfclosingTagTk( 'extlink', [
                    new KV('href', target),
                    new KV('content', text)
                ], dataAttribs)
            ];
        }
      / "[" & { return stops.pop('extlink'); }
    )

// Magic links: bare 'RFC 1234' / 'PMID 1234' references.
autoref
  = ref:('RFC' / 'PMID') (newline / space)+ identifier:[0-9]+
    {
        identifier = identifier.join('');
        var base_urls = {
                // FIX(review): was '//tools.ietfs.org/...' — typo; the IETF
                // RFC mirror lives at tools.ietf.org.
                'RFC'  : '//tools.ietf.org/html/rfc%s',
                'PMID' : '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract'
            },
            url = sprintf(base_urls[ref], identifier);
        return [
            new SelfclosingTagTk( 'extlink', [
                // reuse the url computed above instead of re-running sprintf
                new KV('href', url),
                new KV('content', [ref, identifier].join(' '))
            ], {tsr: [pos0, pos]})
        ];
    }

// Magic link: bare 'ISBN 978-...' reference.
isbn
  = 'ISBN' (newline / space)+ head:[0-9] digits:([- ]? [0-9])+ tail:'X'?
    {
        // digits is a nested array; Array.prototype.join stringifies it with
        // commas, which the replace strips again.
        var isbn = [head, digits, tail].join('').replace(/,/g, '');
        if (!isValidISBN(isbn)) {
            return null;
        }
        return [
            new SelfclosingTagTk( 'wikilink', [
                new KV('href', 'Special:BookSources/' + isbn.replace(/[^\d]/g, '')),
                new KV('tail', ''),
                new KV('content', 'ISBN ' + isbn)
            ], {tsr: [pos0, pos]})
        ];
    }

/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
 * these can be configured dynamically. */
url_chars = [/fghimnstwIPR]

url_protocol
  = '//' // for protocol-relative URLs
  / 'ftp://'
  / 'git://'
  / 'gopher://'
  / 'http://'
  / 'https://'
  / 'irc://'
  / 'ircs://' // @bug 28503
  / 'mailto:'
  / 'mms://'
  / 'news:'
  / 'nntp://' // @bug 3808 RFC 1738
  / 'svn://'
  / 'telnet://' // Well if we're going to support the above.. -ævar
  / 'worldwind://'

// javascript does not support unicode features..
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]

urlencoded_char
  = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F]
    {
        try {
            return decodeURI("%" + c0 + c1)
        } catch ( e ) {
            // Reject the match, and allow other fall-back productions to have a
            // go at it.
            return null;
        }
    }

//[^][<>"\\x00-\\x20\\x7F\p{Zs}]

// no punctuation, and '{<' to trigger directives
no_punctuation_char = [^ :\]\[\r\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{]

url
  = proto:url_protocol
    addr:( ipv6_address / ipv4_address )?
    path:(  ( !inline_breaks
              c:no_punctuation_char
              { return c }
            )
          / s:[.:,] !(space / eolf) { return s }
          / comment
          / tplarg_or_template
          / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" )
            r:(
                htmlentity
              / [&%{]
            ) { return r }
    )+
    {
        //console.warn( "path: " + pp( flatten_stringlist( [proto + addr].concat( path ) ) ) );
        return flatten_string( [proto + addr].concat( path ) );
    }

ipv4_address
  = a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*)
    {
        return flatten( a ).join('');
    }

ipv6_address
  = a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']')
    {
        return flatten( a ).join('');
    }

/**************************************************************
 * Templates, -arguments and wikilinks
 **************************************************************/

/*
 * Precedence: template arguments win over templates. See
 * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence
 * 4: {{{{·}}}} → {·{{{·}}}·}
 * 5: {{{{{·}}}}} → {{·{{{·}}}·}}
 * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}}
 * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·}
 */
tplarg_or_template
  = & '{{{{{{{' '{' tplarg_or_template '}'
  / & ( '{{{' &'{{{' tplarg ) tplarg // tplarg in template
  / & ( '{{' &'{{{' tplarg ) template
  / tplarg
  / template

template
  = "{{" nl_comment_space*
    target:template_param_value
    params:(nl_comment_space* "|"
        r:( nl_comment_space* &"|" { return new KV( '', '') } // empty argument
          / p:template_param { return p }
        ) { return r }
    )*
    nl_comment_space*
    "}}"
    {
        // Insert target as first positional attribute, so that it can be
        // generically expanded. The TemplateHandler then needs to shift it out
        // again.
        params.unshift( { k: flatten( target ), v: [] } );
        var obj = new SelfclosingTagTk( 'template', params, {tsr: [pos0, pos]} );
        //console.warn( 'tokenizer template ' + pos + );
        //console.warn('template @' + pos + '::' + input.substr(pos, 40) );
        return obj;
    }

// XXX: support template and args in target!
//template_target
//  = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') }

tplarg
  = "{{{"
    name:template_param_value?
    params:(
        nl_comment_space* '|' nl_comment_space*
        r:( &'}}}' { return new KV( '', '') }
          / p:template_param { return p }
        ) { return r }
    )*
    nl_comment_space*
    "}}}"
    {
        name = flatten( name );
        params.unshift( new KV( name, '' ) );
        var obj = new SelfclosingTagTk( 'templatearg', params, {tsr: [pos0, pos]} );
        //console.warn( 'tokenizer tplarg ' + JSON.stringify( obj, null, 2 ));
        //console.warn('template arg @' + pos + '::' + input.substr(pos, 40) );
        return obj;
    }

template_param
  = name:template_param_name
    // MW accepts |foo | = bar | as a single param..
    ('|' (space / newline)* &'=')?
    val:(
        s0:optionalSpaceToken
        "="
        s1:optionalSpaceToken
        value:template_param_value?
        { return { s0: s0, s1: s1, value: value }; }
    )?
    {
        //console.warn( 'named template_param matched' + pp([name, value ]) );
        if ( val !== '' ) {
            if ( val.value !== '' ) {
                return new KV( name, flatten( val.value ) );
            } else {
                return new KV(flatten( name ), []);
            }
        } else {
            return new KV([], flatten(name));
        }
    }
  // empty parameter
  / & [|}] { return new KV([], []); }

// FIXME: handle template args and templates in key! (or even parser functions?)
template_param_name
  = & { return stops.push( 'equal', true ) }
    tpt:(template_param_text / &'=' { return '' })
    {
        stops.pop( 'equal' );
        //console.warn( 'template param name matched: ' + pp( tpt ) );
        return tpt;
    }
  / & { return stops.pop( 'equal' ) }
  //= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); }

template_param_value
  = & { stops.inc( 'nopre' ); return stops.push( 'equal', false ) }
    tpt:template_param_text
    {
        stops.dec( 'nopre' );
        stops.pop( 'equal' );
        //console.warn( 'template param value matched: ' + pp( tpt ) );
        return tpt;
    }
  / & { stops.dec( 'nopre' ); return stops.pop( 'equal' ) }

template_param_text
  = & { /*console.warn( 'tpt: ' +
            input.substr( pos - 10, 9) +
            input[pos].green +
            input.substr( pos + 1, 9) ); */
        // re-enable tables within template parameters
        stops.push('table', false );
        stops.push('extlink', false);
        return stops.inc('template')
    }
    il:nested_block+
    {
        stops.pop('table');
        stops.pop('extlink');
        stops.dec('template');
        //console.warn( 'tpt match: ' + pp (il) + " stops: " + pp(stops));
        var r = flatten( il ),
            firstToken = r[0];
        // Strip leading newlines and space-only text tokens.
        // XXX: Will need to round-trip these eventually.
        while ( firstToken !== undefined &&
                ( firstToken.constructor === NlTk ||
                  firstToken.constructor === String &&
                  firstToken.match( /^[\t ]+$/ ) ) )
        {
            r.shift();
            firstToken = r[0];
        }
        //console.warn('tpl param name' + pos + r[0] + '::' + input.substr(pos, 40) );
        if ( r.length === 1 && r[0].constructor === String ) {
            r = r[0];
        }
        return r;
    }
  / & { stops.pop('table'); stops.pop('extlink'); return stops.dec('template'); }

wikilink_content
  = lcs:( pipe lt:link_text { return new KV( '', lt ); } )+
    {
        return { pos: [pos0, pos], content: lcs };
    }
  / { return { pos: [pos0, pos], content: [] }; }

// TODO: handle link prefixes as in al[[Razi]]
wikilink
  = "[["
    ! url
    //target:link_target
    // XXX: disallow pipe!
    target:wikilink_preprocessor_text
    lcontent:wikilink_content
    "]]"
    // XXX In real MediaWiki, this is a language-dependent positive character
    // class. Can we work out a static negative class instead?
    // XXX: Exclude uppercase chars from non-latin languages too!
    tail:( ![A-Z \t(),.:\n\r-] tc:text_char { return tc } )*
    {
        var obj = new SelfclosingTagTk( 'wikilink' ),
            textTokens = [];
        obj.attribs.push( new KV('href', target) );
        obj.dataAttribs = {
            tsr: [pos0, pos],
            contentPos: lcontent.pos,
            src: input.substr(pos0, pos - pos0)
        };
        // XXX: Point to object with path, revision and input information
        //obj.source = input;
        //console.warn('lcontent: ' + JSON.stringify( lcontent, null, 2 ) );
        // Deal with content. XXX: Properly support pipe-trick etc
        //lcontent.tail = tail && tail.join('') || '';
        // We store the tail and content(s) in attributes, so that it is
        // expanded by the AttributeExpander pass.
        obj.attribs.push( new KV( 'tail', tail && tail.join('') || '') );
        obj.attribs = obj.attribs.concat( lcontent.content );
        //console.warn( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a' )])) );
        return [obj];
    }

link_text
  = & { return stops.inc('linkdesc'); }
    h:inline
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline?)?
    {
        //console.warn('link_text' + pp(h) + pp(hs));
        stops.dec('linkdesc');
        if( hs !== '' ) {
            return h.concat(hs);
        } else {
            return h;
        }
    }
  / & { return stops.dec('linkdesc'); }

link_option
  = & { stops.inc('pipe'); return stops.inc('linkdesc'); }
    h:inline
    // 'equal' syntaxFlag is set for links in template parameters. Consume the
    // '=' here.
    hs:( '=' inline)?
    {
        //console.warn('link_text' + pp(h) + pp(hs));
        stops.dec('pipe');
        stops.dec('linkdesc');
        if( hs !== '' ) {
            return h.concat(hs);
        } else {
            return h;
        }
    }
  / & { stops.dec('pipe'); return stops.dec('linkdesc'); }

link_end = "]]"

/* Generic quote production for italic and bold, further processed in a token
 * stream transformation in doQuotes. Relies on NlTk tokens being emitted
 * for each line of text to balance quotes per line.
 *
 * We are not using a simple pair rule here as we need to support mis-nested
 * bolds/italics and MediaWiki's special heuristics for apostrophes, which are
 * all not context free. */
quote = "''" x:"'"*
    {
        var res = new TagTk( 'mw-quote', [], { tsr: [pos0, pos] } );
        // Will be consumed in token transforms
        res.value = "''" + x.join('');
        return res;
    }

/**
 * Image option productions, only called from the LinkHandler token stream
 * transformer, and only for images.
 */
img_options
  = & { return stops.inc( 'pipe' ); }
    os:img_option*
    {
        stops.dec( 'pipe' );
        var options = {};
        os = flatten( os );
        for ( var i = 0, l = os.length; i < l; i++ ) {
            var o = os[i];
            options[o.k] = o.v;
        }
        options._options = os;
        return options;
    }
  / & { return stops.dec( 'pipe' ); }

img_option
  = pipe space*
    o:(
        img_attribute
      / img_format
      / img_dimensions
      / img_halign
      / img_valign
      / img_link
      / lt:link_text { return new KV('caption', lt) }
    )
    space*
    { return o };

img_format
  = f:( 'border' / 'frameless' / 'frame' / 'thumbnail' / 'thumb' )
    { return new KV( 'format', f ); }

img_dimensions
  = x:(n:[0-9]+ { return n.join('') })?
    y:('x' n:[0-9]+ { return n.join('') })?
    'px'
    {
        if ( x === '' && y ) {
            return new KV( 'height', y );
        } else if ( y === '' && x ) {
            return new KV( 'width', x );
        } else {
            return [ new KV( 'width', x ), new KV( 'height', y ) ];
        }
    }
  / 'upright' { return [ new KV( 'width', 'upright' ) ] }

img_halign
  = a:( 'left' / 'right' / 'center' / 'none' )
    { return new KV( 'halign', a ); }

img_valign
  = a:( 'baseline' / 'sub' / 'super' / 'top' / 'text-top'
      / 'middle' / 'bottom' / 'text-bottom' )
    { return new KV( 'valign', a ); }

// External link targets are already parsed as link tokens. If we can make
// sure that those cleanly convert back to the original text, then we could
// re-parse them here.
img_link
  = 'link=' space*
    u:(
        t:url { stops.dec( 'pipe' ); return t; }
      / & { return stops.dec( 'pipe' ); }
    )
    { return new KV( 'link', u ); }

img_attribute
  = k:( 'page' / 'alt' / 'thumbnail' / 'thumb' ) '=' t:preprocessor_text?
    { return new KV( k, t ); }

/***********************************************************
 * Pre and xmlish tags
 ***********************************************************/

// Indented pre blocks differ from their non-indented (purely tag-based)
// cousins by having their contents parsed.
pre_indent
  = pre_indent_in_tags
  / l:pre_indent_line ls:(sol pre_indent_line)*
    {
        return [new TagTk( 'pre', [], { tsr: [pos0, pos0 + 1] } )]
            .concat( [l], ls, [new EndTagTk( 'pre', [], {tsr: [pos, pos]} )]);
    }

// An indented pre block that is surrounded with pre tags. The pre tags are
// used directly.
// XXX gwicke: check if the first line is not indented, and round-trip spaces;
// possibly merge with the regular 'pre' production.
// FIXME: fix tag end position
// NOTE(review): the "<pre"/">"/"</pre>" literals below were stripped by the
// extraction pass (only the flanking quotes survived); restored to match the
// surviving 'attribs' reference in the action — confirm against upstream.
pre_indent_in_tags
  = space+ // XXX: capture space for round-tripping
    "<pre" attribs:generic_attribute* ">"
    & { return stops.inc('pre'); }
    l:inlineline
    ls:(sol pre_indent_line)*
    "</pre>"
    {
        stops.dec('pre');
        return [ new TagTk( 'pre', attribs, { tsr: [pos0, pos0] } ) ]
            .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] );
    }
  / & { return stops.dec('pre'); }

pre_indent_line = space l:inlineline
    {
        //console.warn( JSON.stringify( [s, l] ) );
        return l;
    }

/*
 * Pre blocks defined using non-indented HTML tags only parse nowiki tags and
 * html entities inside them, and convert other content to verbatim text.
 * Nowiki inside pre is not functionally needed, but supported for backwards
 * compatibility.
 *
 * TODO: add entity support!
 */
// NOTE(review): this production arrived both tag-stripped and with its pieces
// reordered by the extraction pass; reassembled from the surviving fragments
// ('/ eof)', the action body, '/ "" { return ""; }' and the displaced ts:
// alternative) — confirm against the upstream pegTokenizer grammar.
pre
  = "<pre" attribs:generic_attribute* ">"
    // MediaWiki <pre> is special in that it converts all pre content to plain
    // text.
    ts:(t1:[^<]+ { return t1.join('') }
        / nowiki
        / !"</pre>" t2:. { return t2 })+
    ("</pre>" / eof)
    {
        // return nowiki tags as well?
        //console.warn('inpre');
        // FIXME: end pos of start tag
        return [ new TagTk( 'pre', attribs, { stx: 'html', tsr: [pos0, pos0 + 5] } ) ]
            .concat(ts, [ new EndTagTk( 'pre', [], { tsr: [pos - 6, pos] } ) ]);
    }
  / "</pre>" { return "</pre>"; }

/* XXX: Extension tags can require a change in the tokenizer mode, which
 * returns any text between extension tags verbatim. For now, we simply
 * continue to parse the contained text and return the tokens. The original
 * input source can be recovered from the source positions added on tag
 * tokens. This won't however work in all cases. For example, a comment start
 * (<!-- ...)
 * NOTE(review): the remainder of this sentence was lost when the stripping
 * pass consumed the '<!--' example as a real comment opener; recover the
 * wording from the upstream pegTokenizer grammar. */