").html(entity).text(); }; // Debug print with global switch var dp = function ( msg ) { if ( false ) { console.warn(msg); } }; var pp = function ( s ) { return JSON.stringify(s, null, 2); }; // Simple string formatting using '%s' var sprintf = function ( format ) { var args = Array.prototype.slice.call(arguments, 1); return format.replace(/%s/g, function () { return args.length ? args.shift() : ''; }); }; /** * Determine if a string represents a valid ISBN-10 or ISBN-13 identifier * * @static * @method * @param {String} isbn: The string to check * @returns {Boolean}: True if valid ISBN, false otherwise. */ var isValidISBN = function ( isbn ) { var i = 0, checksum = 0; isbn = isbn.toUpperCase().replace(/[^\dX]/g, ''); return [10, 13].indexOf(isbn.length) !== -1; // XXX: The following code is short-circuited because it is stricter // than the standard parser: switch (isbn.length) { case 10: for (i = 0; i < 9; i++) { checksum += parseInt(isbn[i], 10) * (10 - i); } checksum += '0123456789X'.indexOf(isbn[9]); return (checksum % 11 === 0); case 13: for (i = 0; i < 13; i++) { checksum += parseInt(isbn[i], 10) * (i & 1 ? 3 : 1); } return (checksum % 10 === 0) && (/^97[89]/.test(isbn)); } return false; }; /* End static utilities */ /* * Flags for specific parse environments (inside tables, links etc). Flags * trigger syntactic stops in the inline_breaks production, which * terminates inline and attribute matches. Flags merely reduce the number * of productions needed: The grammar is still context-free as the * productions can just be unrolled for all combinations of environments * at the cost of a much larger grammar. 
*/ function SyntaxStops () { this.counters = {}; this.stacks = {}; } SyntaxStops.prototype.inc = function(flag) { if (this.counters[flag] !== undefined) { this.counters[flag]++; } else { this.counters[flag] = 1; } return true; }; SyntaxStops.prototype.dec = function(flag) { this.counters[flag]--; return false; }; SyntaxStops.prototype.onCount = function ( name ) { return this.counters[name]; }; /** * A stack for nested, but not cumulative syntactic stops. * Example: '=' is allowed in values of template arguments, even if those * are nested in attribute names. */ SyntaxStops.prototype.push = function ( name, value ) { if( this.stacks[name] === undefined ) { this.stacks[name] = [value]; } else { this.stacks[name].push( value ); } return true; }; SyntaxStops.prototype.pop = function ( name ) { if( this.stacks[name] !== undefined ) { this.stacks[name].pop(); } else { throw "SyntaxStops.pop: unknown stop for " + name; } return false; }; SyntaxStops.prototype.onStack = function ( name ) { var stack = this.stacks[name]; if ( stack === undefined || stack.length === 0 ) { return false; } else { return stack[stack.length - 1]; } }; var stops = new SyntaxStops(); // Start position of top-level block // Could also provide positions for lower-level blocks using a stack. var blockStart = 0; // Start position of generic tag production var tagStartPos = 0; // Stack of source positions var posStack = { positions: {}, push: function( key, pos ) { if ( this.positions[key] === undefined ) { this.positions[key] = [pos]; } else { this.positions[key].push( pos ); } return true; }, pop: function( key, pos ) { var pk = this.positions[key]; if ( pk === undefined || ! 
pk.length ) {
                throw "Tried to pop unknown position for " + key;
            } else {
                return [ pk.pop(), pos ];
            }
        }
    };

    // Length of the input, read once up front
    var inputLength = input.length;

    // Pseudo-production: true exactly when pos is the end of the input
    var isEOF = function (pos) {
        return inputLength === pos;
    };

    // text start position
    var textStart = 0;

    // hack to support numbered external links ([http://example.com]).
    // XXX: Move to token stream transform after templates are expanded!
    var linkCount = 1;

    // Define block-level tags in JS, so we can use toLowerCase to match tags
    // case-independently. This would be quite ugly (and possibly slower) if
    // done manually in the grammar.
    // The result maps each block-level tag name to true.
    var block_names = (function () {
        var bnames = {};
        [ "p", "table", "td", "tr", "ul", "ol"
        , "li", "dl", "dt", "dd", "div", "center"
        , "blockquote" ].forEach(function ( tagName ) {
            bnames[tagName] = true;
        });
        return bnames;
    })();

    var self = this;

}

/*********************************************************
 * The top-level production
 *********************************************************/

start
  = e:toplevelblock* newline* {
      // end is passed inline as a token, as well as a separate event for now.
      __parseArgs[2]( [ new EOFTk( ) ] );
      return []; //flatten(e);
  }

/*
 * A document (start production) is a sequence of toplevelblocks. Tokens are
 * emitted in chunks per toplevelblock to avoid buffering the full document.
 */
toplevelblock
  = & { blockStart = pos; return true; }
    b:block {
        b = flatten(b);
        // Add source offsets for round-tripping. XXX: Add these not just for
        // toplevelblocks!
        if ( b.length ) {
            var bs = b[0];
            // A primitive string cannot carry properties, so it is wrapped in
            // a String object before dataAttribs is attached.
            if ( bs.constructor === String && bs.attribs === undefined ) {
                b[0] = new String( bs );
                bs = b[0];
            }
            if (bs.dataAttribs === undefined) {
                bs.dataAttribs = {};
            }
            bs.dataAttribs.sourcePos = [blockStart, pos];
        }
        // Emit tokens for this toplevelblock. This feeds a chunk to the parser
        // pipeline.
        __parseArgs[2]( flatten( b ) );

        // We don't return any tokens to the start production to save memory.
We // just emitted them already to our consumers. return true; } /* * The actual contents of each block. */ block = block_lines / & '<' r:( pre // tag variant can start anywhere / comment &eolf / nowiki // avoid a paragraph if we know that the line starts with a block tag / bt:block_tag { return [bt] } ) { return r; } / paragraph // Inlineline includes generic tags; wrapped into paragraphs in token // transform and DOM postprocessor / inlineline / sol /* * A block nested in other constructs. Avoid eating end delimiters for other * constructs by checking against inline_breaks first. */ nested_block = !inline_breaks b:block { return b } /* * Line-based block constructs. */ block_lines = s:sol // eat an empty line before the block s2:(os:optionalSpaceToken so:sol { return os.concat(so) })? bl:block_line { var s2_ = (s2 !== '') ? s2 : []; return s.concat(s2_, bl); } /* * Block structures with start-of-line wiki syntax */ block_line = h / lists / st:optionalSpaceToken r:( & [{}|!] tl:table_lines { return tl; } // tag-only lines should not trigger pre either / bts:(bt:block_tag stl:optionalSpaceToken { return bt.concat(stl) })+ &eolf { return bts } ) { return st.concat(r); } / ! { return stops.counters.nopre } pre_indent / pre /* * A paragraph. We don't emit 'p' tokens to avoid issues with template * transclusions,

tags in the source and the like. Instead, we perform * some paragraph wrapping on the token stream and the DOM. */ paragraph = s1:sol s2:sol c:inlineline { return s1.concat(s2, /* [new TagTk('p')],*/ c); } br = space* &newline { return new SelfclosingTagTk( 'br' ) } /* * Syntax stops: Avoid eating significant tokens for higher-level productions * in nested inline productions. * * Repeated testing of flags is not terribly efficient. See new and faster * version below. */ /* * Syntax stops: Avoid eating significant tokens for higher-level productions * in nested inline productions. */ inline_breaks = & [=|!}{:\r\n\]<] & { // Important hack: disable caching for this production, as the default // cache key does not take into account flag states! cacheKey = ''; //console.warn('ilbf: ' + input.substr(pos, 5) ); return null !== __parseArgs[3].inline_breaks( input, pos, stops ) } inline = c:(urltext / (! inline_breaks (inline_element / . )))+ { //console.warn('inline out:' + pp(c)); return flatten_stringlist( c ); } inlineline = c:(urltext / !inline_breaks (inline_element / [^\n]))+ { return flatten_stringlist( c ); } inline_element = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; } & '<' ( comment / xmlish_tag ) /// & '{' ( & '{{{{{' template / tplarg / template ) / & '{' tplarg_or_template /// & '{' ( tplarg / template ) // Eat three opening brackets as text. / '[[[' { return '[[[' } / & '[' ( wikilink / extlink / autolink ) / & "'" quote /* Headings */ h = & "=" // guard, to make sure '='+ will match. // XXX: Also check to end to avoid inline parsing? 
r:(
     s:'='+ // moved in here to make s accessible to inner action
     & { return stops.inc('h'); }
     c:inlineline
     e:'='+
     spc:(sp:space+ { return sp.join('') } / comment)*
     &eolf
     {
        // Heading matched: the level is the shorter of the opening and
        // closing runs of '='; any surplus '=' on the longer side is kept
        // as literal text inside the heading content.
        stops.dec('h');
        var level = Math.min(s.length, e.length),
            extras,
            lastIndex;
        // convert surplus equals into text
        if(s.length > level) {
            // Sibling actions treat the result of `x+` as an array (see
            // sp.join('') above), so the surplus is rebuilt from the length
            // of the run instead of calling string methods on it.
            extras = new Array( s.length - level + 1 ).join( '=' );
            if(c[0].constructor === String) {
                c[0] = extras + c[0];
            } else {
                c.unshift( extras );
            }
        }
        if(e.length > level) {
            extras = new Array( e.length - level + 1 ).join( '=' );
            lastIndex = c.length - 1;
            if(c[lastIndex].constructor === String) {
                // Strings are immutable: the extended text has to be written
                // back into c, otherwise the surplus equals are lost.
                c[lastIndex] = c[lastIndex] + extras;
            } else {
                c.push( extras );
            }
        }

        return [new TagTk( 'h' + level )]
                .concat(c, [new EndTagTk( 'h' + level ), spc]);
     }
     // No heading after all: undo the 'h' counter and fail this alternative.
   / & { /* dp('nomatch exit h'); */ stops.dec('h'); return false } { return null }
    ) { return r }

// NOTE(review): the two rules below look truncated in this file (unbalanced
// ')' and `c` is never bound) -- confirm against the upstream grammar.
comments
    = '' / eof)
    cs:(space* newline space* cn:comment { return cn })* {
        return [new CommentTk( c.join('') )].concat(cs);
    }

comment
    = '' / eof) {
        return [new CommentTk( c.join('') )];
    }

// Any character other than '-', or a '-' that is not followed by '->'.
comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }

// Behavior switches: '__WORD__' becomes a 'behavior-switch' tag token whose
// 'word' attribute holds the matched word.
behavior_switch
  = '__'
    behavior:(
        'NOTOC'
      / 'FORCETOC'
      / 'TOC'
      / 'NOEDITSECTION'
      / 'NEWSECTIONLINK'
      / 'NONEWSECTIONLINK'
      / 'NOGALLERY'
      / 'HIDDENCAT'
      / 'NOCONTENTCONVERT'
      / 'NOCC'
      / 'NOTITLECONVERT'
      / 'NOTC'
      / 'START'
      / 'END' // XXX: Removed in 19213. Should we keep this?
      / 'INDEX'
      / 'NOINDEX'
      / 'STATICREDIRECT'
    )
    '__'
    {
        return [
            new TagTk(
                'behavior-switch',
                [new KV('word', behavior)]
            )
        ];
    }

/**************************************************************
 * External (bracketed and autolinked) links
 **************************************************************/

// Bare links are only tried while the top of the 'extlink' stop stack is falsy.
autolink
  = ! { return stops.onStack('extlink') }
    (urllink / autoref / isbn)

urllink
  = target:url {
      return [ new TagTk( 'urllink', [new KV('href', target)] ) ];
  }

extlink
  = !
{ return stops.onStack('extlink') } // extlink cannot be nested
    (
        "["
        & { return stops.push('extlink', true); }
        //target:urllink
        target:extlink_preprocessor_text
        text:(( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
                t:inlineline { return t } )?
        "]" {
            stops.pop('extlink');
            if ( text === '' ) {
                // XXX: Link numbering should be implemented in post-processor.
                text = [ "[" + linkCount + "]" ];
                linkCount++;
            }
            //console.warn( 'extlink text: ' + pp( text ) );
            return [
                new SelfclosingTagTk( 'extlink', [
                    new KV('href', target),
                    new KV('content', text)
                ],
                { type: 'extlink' })
            ];
        }
        // Bracket opened but no link followed: drop the stop pushed above.
      / "[" & { return stops.pop('extlink'); }
    )

/*
 * Magic references: 'RFC 1234' and 'PMID 1234' become external links. The
 * link text is the keyword and the number joined by a single space.
 */
autoref
  = ref:('RFC' / 'PMID') (newline / space)+ identifier:[0-9]+
    {
        identifier = identifier.join('');

        var base_urls = {
                // Host is tools.ietf.org (the previous value had a typo).
                'RFC' : '//tools.ietf.org/html/rfc%s',
                'PMID' : '//www.ncbi.nlm.nih.gov/pubmed/%s?dopt=Abstract'
            },
            // Format the target once and reuse it below.
            url = sprintf(base_urls[ref], identifier);

        return [
            new SelfclosingTagTk( 'extlink', [
               new KV('href', url),
               new KV('content', [ref, identifier].join(' '))
            ] )
        ];
    }

/*
 * Magic ISBN references: 'ISBN' followed by digits that may be separated by
 * single spaces or dashes, with an optional trailing 'X'. Produces a
 * wikilink token pointing at Special:BookSources/<digits only>; numbers
 * rejected by isValidISBN make the action return null.
 */
isbn
  = 'ISBN' (newline / space)+
    head:[0-9]
    digits:([- ]? [0-9])+
    tail:'X'?
    {
        // Joining the nested match arrays leaves commas behind; strip them.
        var isbn = [head, digits, tail].join('').replace(/,/g, '');

        if (!isValidISBN(isbn)) {
            return null;
        }

        return [
            new SelfclosingTagTk( 'wikilink', [
               new KV('', 'Special:BookSources/' + isbn.replace(/[^\d]/g, '')),
               new KV('', 'ISBN ' + isbn),
               new KV('tail', ''),
            ] )
        ];
    }

/* Default URL protocols in MediaWiki (see DefaultSettings). Normally
 * these can be configured dynamically. */

url_chars = [/fghimnstwIPR]

url_protocol
  = '//' // for protocol-relative URLs
  / 'ftp://'
  / 'git://'
  / 'gopher://'
  / 'http://'
  / 'https://'
  / 'irc://'
  / 'ircs://'  // @bug 28503
  / 'mailto:'
  / 'mms://'
  / 'news:'
  / 'nntp://' // @bug 3808 RFC 1738
  / 'svn://'
  / 'telnet://' // Well if we're going to support the above.. -ævar
  / 'worldwind://'

// javascript does not support unicode features..
unicode_separator_space = [ \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] urlencoded_char = "%" c0:[0-9a-fA-F] c1:[0-9a-fA-F] { try { return decodeURI("%" + c0 + c1) } catch ( e ) { // Reject the match, and allow other fall-back productions to have a // go at it. return null; } } //[^][<>"\\x00-\\x20\\x7F\p{Zs}] // no punctiation, and '{<' to trigger directives no_punctuation_char = [^ :\]\[\n"'<>\x00-\x20\x7f,.&%\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000{] url = proto:url_protocol addr:( ipv6_address / ipv4_address )? path:( ( !inline_breaks c:no_punctuation_char { return c } ) / s:[.:,] !(space / eolf) { return s } / comment / tplarg_or_template / ! ( "&" ( [lL][tT] / [gG][tT] ) ";" ) r:( htmlentity / [&%{] ) { return r } )+ { //console.warn( "path: " + pp( flatten_stringlist( [proto + addr].concat( path ) ) ) ); return flatten_string( [proto + addr].concat( path ) ); } ipv4_address = a:([0-9]* '.' [0-9]* '.' [0-9]* '.' [0-9]*) { return flatten( a ).join(''); } ipv6_address = a:('[' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ':' [0-9]* ']') { return flatten( a ).join(''); } /************************************************************** * Templates, -arguments and wikilinks **************************************************************/ /* * Precedence: template arguments win over templates. 
See * http://www.mediawiki.org/wiki/Preprocessor_ABNF#Ideal_precedence * 4: {{{{·}}}} → {·{{{·}}}·} * 5: {{{{{·}}}}} → {{·{{{·}}}·}} * 6: {{{{{{·}}}}}} → {{{·{{{·}}}·}}} * 7: {{{{{{{·}}}}}}} → {·{{{·{{{·}}}·}}}·} */ tplarg_or_template = & '{{{{{{{' '{' tplarg_or_template '}' / & ( '{{{' &'{{{' tplarg ) tplarg // tplarg in template / & ( '{{' &'{{{' tplarg ) template / tplarg / template template = "{{" nl_comment_space* target:template_param_value params:(nl_comment_space* "|" r:( nl_comment_space* &"|" { return new KV( '', '') } // empty argument / p:template_param { return p } ) { return r } )* nl_comment_space* "}}" { // Insert target as first positional attribute, so that it can be // generically expanded. The TemplateHandler then needs to shift it out // again. params.unshift( { k: flatten( target ), v: [] } ); var obj = new SelfclosingTagTk( 'template', params ); //console.warn( 'tokenizer template ' + pos + ); //console.warn('template @' + pos + '::' + input.substr(pos, 40) ); return obj; } // XXX: support template and args in target! //template_target // = h:( !"}}" x:([^|\n]) { return x } )* { return h.join('') } tplarg = "{{{" name:template_param_value? params:( nl_comment_space* '|' nl_comment_space* r:( &'}}}' { return new KV( '', '') } / p:template_param { return p } ) { return r } )* nl_comment_space* "}}}" { name = flatten( name ); params.unshift( new KV( name, '' ) ); var obj = new SelfclosingTagTk( 'templatearg', params ); //console.warn( 'tokenizer tplarg ' + JSON.stringify( obj, null, 2 )); //console.warn('template arg @' + pos + '::' + input.substr(pos, 40) ); return obj; } template_param = name:template_param_name // MW accepts |foo | = bar | as a single param.. ('|' (space / newline)* &'=')? val:( s0:space* "=" s1:space* value:template_param_value? { return { s0: s0, s1: s1, value: value }; } )? 
{ //console.warn( 'named template_param matched' + pp([name, value ]) ); if ( val !== '' ) { if ( val.value !== '' ) { return new KV( name, flatten( val.value ) ); } else { return new KV(flatten( name ), []); } } else { return new KV([], flatten(name)); } } // empty parameter / & [|}] { return new KV([], []); } // FIXME: handle template args and templates in key! (or even parser functions?) template_param_name = & { return stops.push( 'equal', true ) } tpt:(template_param_text / &'=' { return '' }) { stops.pop( 'equal' ); //console.warn( 'template param name matched: ' + pp( tpt ) ); return tpt; } / & { return stops.pop( 'equal' ) } //= h:( !"}}" x:([^=|\n]) { return x } )* { return h.join(''); } template_param_value = & { stops.inc( 'nopre' ); return stops.push( 'equal', false ) } tpt:template_param_text { stops.dec( 'nopre' ); stops.pop( 'equal' ); //console.warn( 'template param value matched: ' + pp( tpt ) ); return tpt; } / & { stops.dec( 'nopre' ); return stops.pop( 'equal' ) } template_param_text = & { /*console.warn( 'tpt: ' + input.substr( pos - 10, 9) + input[pos].green + input.substr( pos +1, 9) ); */ // re-enable tables within template parameters stops.push('table', false ); stops.push('extlink', false); return stops.inc('template') } il:nested_block+ { stops.pop('table'); stops.pop('extlink'); stops.dec('template'); //console.warn( 'tpt match: ' + pp (il) + " stops: " + pp(stops)); var r = flatten( il ), firstToken = r[0]; // Strip leading newlines and space-only text tokens. // XXX: Will need to round-trip these eventually. 
while ( firstToken !== undefined && ( firstToken.constructor === NlTk || firstToken.constructor === String && firstToken.match( /^[\t ]+$/ ) ) ) { r.shift(); firstToken = r[0]; } //console.warn('tpl param name' + pos + r[0] + '::' + input.substr(pos, 40) ); if ( r.length === 1 && r[0].constructor === String ) { r = r[0]; } return r; } / & { stops.pop('table'); stops.pop('extlink'); return stops.dec('template'); } // TODO: handle link prefixes as in al[[Razi]] wikilink = & { return posStack.push('wikilink' , pos); } "[[" ! url //target:link_target // XXX: disallow pipe! target:wikilink_preprocessor_text lcontent:( & { return posStack.push('lcontent' , pos); } lcs:( pipe lt:link_text { return new KV( '', lt ); } )+ { return { pos: posStack.pop('lcontent' , pos), content: lcs }; } / { return { pos: posStack.pop('lcontent' , pos), content: [] }; } ) "]]" // XXX In real MediaWiki, this is a language-dependent positive character // class. Can we work out a static negative class instead? // XXX: Exclude uppercase chars from non-latin languages too! tail:( ![A-Z \t(),.:-] tc:text_char { return tc } )* { var obj = new SelfclosingTagTk( 'wikilink' ), textTokens = []; obj.attribs.push( new KV('', target) ); obj.dataAttribs = { sourcePos: posStack.pop( 'wikilink', pos ), contentPos: lcontent.pos }; // XXX: Point to object with path, revision and input information //obj.source = input; //console.warn('lcontent: ' + JSON.stringify( lcontent, null, 2 ) ); // Deal with content. XXX: Properly support pipe-trick etc //lcontent.tail = tail && tail.join('') || ''; obj.attribs = obj.attribs.concat( lcontent.content ); obj.attribs.push( new KV( 'tail', tail && tail.join('') || '' ) ); //console.warn( "XXX:" + pp([obj].concat(textTokens, [new EndTagTk( 'a' )])) ); return [obj]; } / ! { return posStack.pop( 'wikilink', pos ); } link_text = & { return stops.inc('linkdesc'); } h:inline // 'equal' syntaxFlag is set for links in template parameters. Consume the // '=' here. hs:( '=' inline?)? 
{ //console.warn('link_text' + pp(h) + pp(hs)); stops.dec('linkdesc'); if( hs !== '' ) { return h.concat(hs); } else { return h; } } / & { return stops.dec('linkdesc'); } link_option = & { stops.inc('pipe'); return stops.inc('linkdesc'); } h:inline // 'equal' syntaxFlag is set for links in template parameters. Consume the // '=' here. hs:( '=' inline)? { //console.warn('link_text' + pp(h) + pp(hs)); stops.dec('pipe'); stops.dec('linkdesc'); if( hs !== '' ) { return h.concat(hs); } else { return h; } } / & { stops.dec('pipe'); return stops.dec('linkdesc'); } link_end = "]]" /* Generic quote production for italic and bold, further processed in a token * stream transformation in doQuotes. Relies on NlTk tokens being emitted * for each line of text to balance quotes per line. * * We are not using a simple pair rule here as we need to support mis-nested * bolds/italics and MediaWiki's special heuristics for apostrophes, which are * all not context free. */ quote = "''" x:"'"* { var res = new TagTk( 'mw-quote' ); // Will be consumed in token transforms res.value = "''" + x.join(''); return res; } /** * Image option productions, only called from the LinkHandler token stream * transformer, and only for images. */ img_options = & { return stops.inc( 'pipe' ); } os:img_option* { stops.dec( 'pipe' ); var options = {}; os = flatten( os ); for ( var i = 0, l = os.length; i < l; i++ ) { var o = os[i]; options[o.k] = o.v; } options._options = os; return options; } / & { return stops.dec( 'pipe' ); } img_option = pipe space* o:( img_attribute / img_format / img_dimensions / img_halign / img_valign / img_link / lt:link_text { return new KV('caption', lt) } ) space* { return o }; img_format = f:( 'border' / 'frameless' / 'frame' / 'thumbnail' / 'thumb' ) { return new KV( 'format', f ); } img_dimensions = x:(n:[0-9]+ { return n.join('') })? y:('x' n:[0-9]+ { return n.join('') })? 
'px' { if ( x === '' && y ) { return new KV( 'height', y ); } else if ( y === '' && x ) { return new KV( 'width', x ); } else { return [ new KV( 'width', x ), new KV( 'height', y ) ]; } } / 'upright' { return [ new KV( 'width', 'upright' ) ] } img_halign = a:( 'left' / 'right' / 'center' / 'none' ) { return new KV( 'halign', a ); } img_valign = a:( 'baseline' / 'sub' / 'super' / 'top' / 'text-top' / 'middle' / 'bottom' / 'text-bottom' ) { return new KV( 'valign', a ); } // External link targets are already parsed as link tokens. If we can make // sure that those cleanly convert back to the original text, then we could // re-parse them here. img_link = 'link=' space* u:( t:url { stops.dec( 'pipe' ); return t; } / & { return stops.dec( 'pipe' ); } ) { return new KV( 'link', u ); } img_attribute = k:( 'page' / 'alt' / 'thumbnail' / 'thumb' ) '=' t:preprocessor_text? { return new KV( k, t ); } /*********************************************************** * Pre and xmlish tags ***********************************************************/ // Indented pre blocks differ from their non-indented (purely tag-based) // cousins by having their contents parsed. pre_indent = pre_indent_in_tags / l:pre_indent_line ls:(sol pre_indent_line)* { return [new TagTk( 'pre' )] .concat( [l], ls , [new EndTagTk( 'pre' )]); } // An indented pre block that is surrounded with pre tags. The pre tags are // used directly. pre_indent_in_tags = space+ // XXX: capture space for round-tripping "" & { return stops.inc('pre'); } l:inlineline ls:(sol pre_indent_line)* "" { stops.dec('pre'); return [ new TagTk( 'pre', attribs ) ] .concat( l, flatten( ls ), [ new EndTagTk( 'pre' ) ] ); } / & { return stops.dec('pre'); } pre_indent_line = space l:inlineline { return [ '\n' ].concat(l); } /* * Pre blocks defined using non-indented HTML tags only parse nowiki tags * inside them, and convert other content to verbatim text. 
Nowiki inside pre * is not functionally needed, but supported for backwards compatibility. */ pre = "" // MediaWiki

 is special in that it converts all pre content to plain
    // text.
    ts:(t1:[^<]+ { return t1.join('') } 
                / nowiki 
                / !"

" t2:. { return t2 })+ ("" / eof) { // return nowiki tags as well? //console.warn('inpre'); return [ new TagTk( 'pre', attribs ) ] .concat(ts, [ new EndTagTk( 'pre' ) ]); } / "" { return ""; } /* XXX: Extension tags can require a change in the tokenizer mode, which * returns any text between extension tags verbatim. For now, we simply * continue to parse the contained text and return the tokens. The original * input source can be recovered from the source positions added on tag * tokens. This won't however work in all cases. For example, a comment start * (