/* Produces output more or less compatible with FakeParser; plug it into FP's output and see */

{
    /* Debug print helper. Output is disabled by the constant condition. */
    var dp = function ( msg ) {
        if ( false ) {
            console.log(msg);
        }
    };

    /*
     * Flags for specific parse environments (inside tables, links etc). Flags
     * trigger syntactic stops in the inline_breaks production, which
     * terminates inline and attribute matches. Flags merely reduce the number
     * of productions needed: The grammar is still context-free as the
     * productions can just be unrolled for all combinations of environments
     * at the cost of a much larger grammar.
     */
    var syntaxFlags = {};

    /* Increment the counter for `flag`. Always returns true so that it can
     * be used as the result of a semantic predicate. */
    var setFlag = function(flag) {
        if (syntaxFlags[flag] !== undefined) {
            syntaxFlags[flag]++;
        } else {
            syntaxFlags[flag] = 1;
        }
        return true;
    };

    /* Decrement the counter for `flag`. Returns nothing. */
    var clearFlag = function(flag) {
        syntaxFlags[flag]--;
    };

    /* Pretty-print a value as indented JSON (used for debug output). */
    var pp = function ( s ) {
        return JSON.stringify(s, null, 2);
    };

    // Convert list prefixes to a list of WikiDom list styles.
    // Characters other than * # ; : are skipped.
    var bulletsToTypes = function (bullets) {
        var bTypes = [];
        for (var i = 0; i < bullets.length; i++) {
            switch (bullets[i]) {
                case '*':
                    bTypes.push('bullet'); break;
                case '#':
                    bTypes.push('number'); break;
                case ';':
                    bTypes.push('term'); break;
                case ':':
                    bTypes.push('description'); break;
            }
        }
        return bTypes;
    };

    /*var extractInline = function ( node ) {
        return { text: extractText(node, 0) };
    };

    // return [text [annotations]]
    var extractText = function ( node, offset ) {
        dp("extract: " + pp(node));
        if (typeof node === 'string') {
            return [node, []];
        } else if ($.isArray(node)) {
            var texts = [],
                annotations = [];
            for (var i = 0, length = node.length; i < length; i++) {
                var res = extractText(node[i], offset);
                texts.push(res[0]);
                annotations.concat(res[1]);
                offset += res[0].length;
            }
            return [texts.join(''), annotations];
        } else if ( 'text' in node ) {
            var res = extractText(node, offset);
            if ('annotations' in node) {
                return [res[0], node.annotations.concat(res[1])];
            } else {
                return res;
            }
        } else if ( 'content' in node ) {
            return extractText(node.content, offset);
        } else if ( 'children' in node ) {
            var texts = [];
            for (var i = 0, length = node.children.length; i < length; i++) {
                texts.push(extractText(node.children[i]));
            }
            return texts.join('');
        } else {
            throw ("extract failed: " + pp(node));
        }
    };
    */

    // Start position of top-level block
    // Could also provide positions for lower-level blocks using a stack.
    var blockStart = 0;

    // Replace every backslash-escaped quote character in `text` with the bare
    // quote character. split/join is used because String.replace with a
    // string pattern only replaces the first occurrence.
    var unquote = function (quotec, text) {
        return text.split('\\' + quotec).join(quotec);
    };

    // Recursively flatten nested arrays into a single flat array.
    var flatten = function ( e ) {
        var es = [];
        // flatten sub-arrays
        for (var i = 0, length = e.length; i < length; i++) {
            var ei = e[i];
            if (Array.isArray(ei)) {
                es = es.concat(flatten(ei));
            } else {
                es.push(ei);
            }
        }
        return es;
    };
}

// Entry point: a flat list of tokens for all top-level blocks.
start
  = e:toplevelblock* newline* {
        return flatten(e);
    }

anyblock = block / inline
anyblockline = block / inlineline


// All chars that cannot start syntactic structures
text = t:[A-Za-z0-9,._ -]+ { return t.join('') }

space
  = s:[ \t]+ { return s.join(''); }


// Start of line
sol = (newline / & { return pos === 0; } { return true; })
      cn:(c:comment n:newline? { return [c, n] })? {
          return cn;
      }

newline
  = '\n' / '\r\n'

// A block at the top level. Records the source range of the block as a
// 'data-sourcePos' attribute ("start:end") on the block's first token.
toplevelblock
  = & { blockStart = pos; return true; } b:block {
        b = flatten(b);
        var bs = b[0];
        dp('toplevelblock:' + pp(b));
        if (bs.attribs === undefined) {
            bs.attribs = [];
        }
        bs.attribs.push(['data-sourcePos', blockStart + ':' + pos]);
        return b;
    }

block
  = (sol space* &newline)? bl:block_lines { return bl; }
  / para
  / comment
  / (s:sol {
        if (s) {
            return [s, {type: 'NEWLINE'}];
        } else {
            return [{type: 'NEWLINE'}];
        }
    }
    )

// Block structures with start-of-line wiki syntax
block_lines
  = h
  / table
  / lists
  / pre_indent


/* Headings */

// The longest marker is tried first. h1 also accepts a line such as
// "== x ==" (with the content "= x ="), so trying h1 first would make the
// deeper heading levels unreachable.
h = h6 / h5 / h4 / h3 / h2 / h1

// Each hN rule sets the 'h' and 'hN' flags before matching its content so
// that inline_breaks stops at the closing marker. Both outcomes clear the
// flags again; the fallback alternative returns null from its action (the
// same rejection idiom the dtdd rule documents).
h1 = sol '='
    (
        & { setFlag('h'); return setFlag('h1') }
        c:inlineline '=' comment? &newline {
            clearFlag('h');
            clearFlag('h1');
            return [{type: 'TAG', name: 'h1'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h1'}]);
        }
      / { clearFlag('h'); clearFlag('h1'); return null }
    )

h2 = sol '=='
    (
        & { setFlag('h'); return setFlag('h2') }
        c:inlineline '==' comment? &newline {
            clearFlag('h');
            clearFlag('h2');
            return [{type: 'TAG', name: 'h2'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h2'}]);
        }
      / { clearFlag('h'); clearFlag('h2'); return null }
    )

h3 = sol '==='
    (
        & { setFlag('h'); return setFlag('h3') }
        c:inlineline '===' comment? &newline {
            clearFlag('h');
            clearFlag('h3');
            return [{type: 'TAG', name: 'h3'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h3'}]);
        }
      / { clearFlag('h'); clearFlag('h3'); return null }
    )

h4 = sol '===='
    (
        & { setFlag('h'); return setFlag('h4') }
        c:inlineline '====' comment? &newline {
            clearFlag('h');
            clearFlag('h4');
            return [{type: 'TAG', name: 'h4'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h4'}]);
        }
      / { clearFlag('h'); clearFlag('h4'); return null }
    )

h5 = sol '====='
    (
        & { setFlag('h'); return setFlag('h5') }
        c:inlineline '=====' comment? &newline {
            clearFlag('h');
            clearFlag('h5');
            return [{type: 'TAG', name: 'h5'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h5'}]);
        }
      / { clearFlag('h'); clearFlag('h5'); return null }
    )

h6 = sol '======'
    (
        & { setFlag('h'); return setFlag('h6') }
        c:inlineline '======' comment? &newline {
            clearFlag('h');
            clearFlag('h6');
            return [{type: 'TAG', name: 'h6'}]
                    .concat(c, [{type: 'ENDTAG', name: 'h6'}]);
        }
      / { clearFlag('h'); clearFlag('h6'); return null }
    )

heading_marker
  = '=' '='*

heading_text
  = h:( !(heading_marker newline) x:inlineline { return x } )* {
        return h.join('');
    }


// TODO: convert inline content to annotations!
para
  = (sol br)? pl:para_lines { return pl; }

para_lines
  = s:sol c:inlineline cs:(!block_lines para_lines)* {
        var res = [{type: 'TAG', name: 'p'}];
        if (s !== '') {
            res.push(s)
        }
        //console.log('paralines' + pp(res.concat(c, cs, [{type: 'ENDTAG', name: 'p'}])));
        return res.concat(c, cs, [{type: 'ENDTAG', name: 'p'}]);
    }

br = space* &newline { return {type: 'SELFCLOSINGTAG', name: 'br'} }

pre_indent
  = l:pre_indent_line+ {
        return [{type: 'TAG', name: 'pre'}]
                .concat( l
                       , [{type: 'ENDTAG', name: 'pre'}]);
    }

pre_indent_line = sol space l:inlineline {
    return l
}


// Syntax that stops inline expansion. Each alternative is guarded by the
// flag of the environment it belongs to (see syntaxFlags).
inline_breaks
  = //& { console.log(pp(syntaxFlags)); return true; }
    & { return syntaxFlags['table']; }
    a:(newline [!|] / '||' / '!!' / '|}') { dp("table break" + pp(a)); return true; }
  / & { return syntaxFlags['italic']; } italic_marker { return true; }
  / & { return syntaxFlags['bold']; } bold_marker { return true; }
  / & { return syntaxFlags['linkdesc']; } link_end { return true; }
  / & { return syntaxFlags['h']; }
    ( & { return syntaxFlags['h1'] } '=' newline { return true; }
    / & { return syntaxFlags['h2'] } '==' newline { return true; }
    / & { return syntaxFlags['h3'] } '===' newline { return true; }
    / & { return syntaxFlags['h4'] } '====' newline { return true; }
    / & { return syntaxFlags['h5'] } '=====' newline { return true; }
    / & { return syntaxFlags['h6'] } '======' newline { return true; }
    )

// Inline content that may span newlines. Consecutive string results are
// merged into single TEXT tokens; other tokens are passed through in order.
inline
  = c:(text / inline_element / (!inline_breaks ch:. { return ch; }))+ {
        var out = [];
        var text = [];
        c = flatten(c);
        for (var i = 0; i < c.length; i++) {
            if (typeof c[i] == 'string') {
                text.push(c[i]);
            } else {
                if (text.length) {
                    out.push({ type: "TEXT", value: text.join('') });
                    text = [];
                }
                // push, not concat: concat returns a new array and would
                // silently drop the token.
                out.push(c[i]);
            }
        }
        if (text.length) {
            out.push({ type: 'TEXT', value: text.join('') });
        }
        return out;
    }

// Same as inline, but never consumes a newline.
inlineline
  = c:(text / !inline_breaks (inline_element / [^\n]))+ {
        var out = [];
        var text = [];
        c = flatten(c);
        for (var i = 0; i < c.length; i++) {
            if (typeof c[i] == 'string') {
                text.push(c[i]);
            } else {
                if (text.length) {
                    out.push({type: 'TEXT', value: text.join('')});
                    text = [];
                }
                out.push(c[i]);
            }
        }
        if (text.length) {
            out.push({type: 'TEXT', value: text.join('')});
        }
        //dp('inlineline out:', pp(out));
        return out;
    }

/* TODO: convert all these to annotations!
 * -> need (start, end) offsets within block
 */

inline_element
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold
  / italic

// NOTE(review): the comment delimiters below were reconstructed; they are
// implied by comment_chars (which rejects '-' followed by '->') and by the
// action's use of `c` - confirm against the upstream grammar.
comment
  = '<!--' c:comment_chars* '-->'
    (space* newline space* comment)* {
        return [{ type: 'COMMENT', value: c.join('') }];
    }

comment_chars
  = c:[^-] { return c; }
  / c:'-' !'->' { return c; }

extlink
  = "[" target:url " " text:extlink_text "]" {
        return [ { type: 'TAG',
                   name: 'a',
                   attribs: [['href', target]] }
               , {type: 'TEXT', value: text}
               , {type: 'ENDTAG', name: 'a'}];
    }

//  = "[" target:url text:extlink_text "]" { return { type: 'extlink', target: target, text: text } }

url
  = proto:"http:" rest:([^ \]]+) { return proto + rest.join(''); }

extlink_text
  = c:[^\]]+ { return c.join(''); }

template
  = "{{" target:link_target params:("|" p:template_param { return p })* "}}" {
        var obj = { type: 'SELFCLOSINGTAG', name: 'template', attribs: [['target', target]] };
        if (params && params.length) {
            obj.attribs.push(params);
        }
        return obj;
    }

template_param
  = name:template_param_name "=" c:template_param_text { return [name, c]; }
  / c:template_param_text { return [null, c]; }

tplarg
  = "{{{" name:link_target params:("|" p:template_param { return p })* "}}}" {
        var obj = { type: 'SELFCLOSINGTAG', name: 'templatearg', attribs: [['argname', name]] };
        if (params && params.length) {
            obj.attribs.push(params);
        }
        return obj;
    }

template_param_name
  = h:( !"}}" x:([^=|]) { return x } )* { return h.join(''); }

template_param_text
  = template_param_text_chunk*
/*  = h:( !"}}" x:([^|]) { return x } )* { return h.join(''); }*/

template_param_text_chunk
  = comment
  / xmlish_tag
  / extlink
  / template
  / link
  / bold
  / italic
  / !"}}" x:([^|]) { return x }

// Internal link. Without a description the target doubles as the link text.
link
  = "[[" target:link_target text:("|" link_text)* "]]" {
        var obj = {
            type: 'TAG',
            name: 'a',
            attribs: [['data-type', 'internal']]
        };
        obj.attribs.push(['href', target]);
        var textTokens;
        if (text && text.length) {
            textTokens = text[0][1]; // XXX
        } else {
            textTokens = [{type: 'TEXT', value: target}];
        }
        return [obj].concat(textTokens, [{type: 'ENDTAG', name: 'a'}]);
    }

link_target
  = h:( !"]]" x:([^|]) { return x } )* { return h.join(''); }

// NOTE(review): the setFlag predicate runs once per attempted repetition
// while clearFlag runs once per match - confirm the 'linkdesc' counter
// returns to zero after a link.
link_text
  = h:( & { return setFlag('linkdesc'); } x:inlineline { return x } )* {
        clearFlag('linkdesc');
        return h;
    }
  / & { clearFlag('linkdesc'); } { return null; }

link_end = "]]"

bold
  = bold_marker
    & { dp('benter:' + pos); return setFlag('bold'); }
    c:inlineline
    bold_marker {
        clearFlag('bold');
        return [{ type: 'TAG', name: 'b' }]
                .concat(c, [{type: 'ENDTAG', name: 'b'}]);
    }
  / bold_marker { clearFlag('bold'); return null }

bold_marker
  = "'''"


italic
  = italic_marker
    & { dp('ienter:' + pos); return setFlag('italic'); }
    c:inlineline
    italic_marker {
        clearFlag('italic');
        dp('ileave:' + pos);
        return [{ type: 'TAG', name: 'i' }]
                .concat(c, [{ type: 'ENDTAG', name: 'i'}]);
    }
  / italic_marker { clearFlag('italic'); return null }

italic_marker
  = "''"

/* Will need to check anything xmlish against known/allowed HTML tags and
 * registered extensions, otherwise fail the match. Should ref be treated as a
 * regular extension? */
xmlish_tag = ref / references

ref = ref_full / ref_empty

/* Can we do backreferences to genericize this? */
ref_full
  = start:ref_start ">" content:ref_content* close:ref_end {
        return [
            { type: 'TAG',
              name: 'ext',
              attribs: [['data-extname', 'ref']]
                       .concat(start.params, [['data-startws', start.ws]])},
            content,
            // The end tag uses the same name as the start tag ('ext'),
            // matching references_full.
            {type: 'ENDTAG', name: 'ext'}
        ];
    }

ref_empty
  = start:ref_start close:(space* "/>") {
        return [{ type: 'SELFCLOSINGTAG',
                  name: 'ext',
                  attribs: [['data-extname', 'ref']]
                           .concat(start.params
                                  ,[['data-startws', start.ws]])
                }];
    }

// NOTE(review): ref_start / ref_end were reconstructed; the {params, ws}
// result shape is what ref_full and ref_empty read from `start` - confirm
// against the upstream grammar.
ref_start
  = "<ref" params:ext_param* ws:space* {
        return { params: params, ws: ws };
    }

ref_end
  = all:("</ref" space* ">") {
        return all.join('');
    }

ref_content
  = !ref_end a:inline { // XXX: ineffective syntactic stop
        return a;
    }

/* fixme probably have to programmatically add these */
references = references_full / references_empty

references_full
  = start:references_start ">" content:references_content* close:references_end {
        return [
            { type: 'TAG',
              name: 'ext',
              attribs: [['data-extname', 'references']]
                       .concat(start.params
                              ,[['data-startws', start.ws]])
            },
            content,
            { type: 'ENDTAG', name: 'ext' }
        ];
    }

references_empty
  = start:references_start close:(space* "/>") {
        return [{ type: 'SELFCLOSINGTAG',
                  name: 'ext',
                  attribs: [['data-extname', 'references']]
                           .concat(start.params
                                  ,[['data-startws', start.ws]])
                }];
    }

// NOTE(review): references_start / references_end were reconstructed in the
// same way as ref_start / ref_end - confirm against the upstream grammar.
references_start
  = "<references" params:ext_param* ws:space* {
        return { params: params, ws: ws };
    }

references_end
  = all:("</references" space* ">") {
        return all.join('');
    }

references_content
  = !references_end a:inline {
        return a;
    }

// A name=value attribute of an extension tag, returned as [name, value].
ext_param
  = space* name:ext_param_name "=" val:ext_param_val {
        val[0] = name;
        return val;
    }

ext_param_name
  = name:[a-zA-Z0-9-]+ {
        return name.join('');
    }

ext_param_val
  = t:[0-9A-Za-z]+ { return [null, t.join('')]; }
  / "'" t:[^'>]+ "'" { return [null, unquote("'", t.join(''))]; }
  / '"' t:[^">]+ '"' { return [null, unquote('"', t.join(''))]; }

lists
  = es:(dtdd / li)+ {
        return [ { type: 'TAG', name: 'ul'} ] // XXX!!
                .concat(flatten(es)
                       ,[{ type: 'ENDTAG', name: 'ul' }]);
    }

li
  = sol bullets:list_char+ c:inlineline &newline {
        return [ { type: 'TAG',
                   name: 'li',
                   attribs: [['data-styles', bullets]] }
               , c
               , { type: 'ENDTAG', name: 'li' }
               ];
    }

dtdd
  = sol bullets:list_char+
    c:(inline_element / (n:[^:\n] { return {type: 'TEXT', value: n}; }))+
    ":"
    d:(inline_element / (n:[^\n] { return {type: 'TEXT', value: n}; }))+
    &newline {
        // reject rule if bullets do not end in semicolon
        if (bullets[bullets.length - 1] != ';') {
            return null;
        } else {
            return [ { type: 'TAG', name: 'dl', attribs: [['data-styles', bullets]] }
                   , { type: 'TAG', name: 'dt' } ]
                    .concat( c
                           , [ {type: 'ENDTAG', name: 'dt'}
                             , {type: 'TAG', name: 'dd'} ]
                           , d
                           , [ {type: 'ENDTAG', name: 'dd'}
                             , {type: 'ENDTAG', name: 'dl'} ]);
        }
    }

list_char = [*#:;]


/* Tables */

table
  = tas:table_start c:table_caption? b:table_body? table_end {
        var res = {type: 'TAG', name: 'table'};
        var body = b !== '' ? b : [];
        dp("body: " + pp(body));
        if (tas.length > 0) {
            // FIXME: actually parse and build structure
            res.attribs = [['data-unparsed', tas.join('')]];
        }

        var caption;
        if (c != '') {
            caption = [{type: 'TAG', name: 'caption'}]
                       .concat(c, [{type: 'ENDTAG', name: 'caption'}]);
        } else {
            caption = [];
        }
        //dp(pp(res));
        return [res].concat(caption, body, [{type: 'ENDTAG', name: 'table'}]);
    }

table_start
  = sol "{|" & { setFlag('table'); return true; } ta:table_attribs* space* {
        //dp("table_start " + pp(ta) + ", pos:" + pos);
        return ta;
    }
  / sol "{|" { clearFlag('table'); return null; }

table_attribs
  = text
  / ! inline_breaks !newline .

table_caption
  = newline "|+" c:inline* {
        return c;
    }

table_body
  = & { dp("table_body enter"); return true; }
    firstrow:table_firstrow otherrows:table_row* {
        /* dp('table first and otherrows: '
         * + pp([firstrow].concat(otherrows))); */
        return [firstrow].concat(otherrows);
    }
  / otherrows:table_row* {
        //dp('table otherrows: ' + pp(otherrows));
        return otherrows;
    }

table_firstrow
  = td:table_data+ {
        return [{ type: 'TAG', name: 'tr' }]
                .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
    }

table_row
  = & { dp("table row enter"); return true; }
    newline "|-" thtd_attribs? space* td:(table_data / table_header)* {
        return [{type: 'TAG', name: 'tr'}]
                .concat(td, [{type: 'ENDTAG', name: 'tr'}]);
    }

table_data
  = & { dp("table_data enter, pos=" + pos); return true; }
    ("||" / newline "|")
    ! [}+-]
    a:thtd_attribs?
    td:(!inline_breaks anyblock)* {
        dp("table data result: " + pp(td) + ", attribts: " + pp(a));
        return [{ type: 'TAG', name: 'td', attribs: [['data-unparsed', a]]}]
                .concat(td, [{type: 'ENDTAG', name: 'td'}]);
    }

table_header
  = ("!!" / newline "!")
    a:thtd_attribs?
    c:inline {
        return [{type: 'TAG', name: 'th', attribs: [['data-unparsed', a]]}]
                .concat(c, [{type: 'ENDTAG', name: 'th'}]);
    }

thtd_attribs
  // In particular, do not match [|\n]
  = a:(text / ! inline_breaks [="':;/,.-] )+ "|" ! [|}+-] {
        return a;
    }

table_end = newline? "|}" { clearFlag('table'); }


/* Wikidom TODO:
 * split off text into content nodes
 * convert inlines into annotations
 * change contents into children
 *
 * { text: text,
 *   annotations: [(normal annotations)],
 *   maybeannotations: [
 *      { type: 'something',
 *        side: MA_START,
 *        tag: { start: x, length: y }
 *      }
 *   ]
 * }
 * offsets in annotations: presume maybeannotations are actually text
 * -> need to transform annotations if match found
 * -> format annotations, comments can run to the end (re-opened after
 *    block-level tags); only closed on table cells, object,?
 * -> other annotations (images, templates etc) are limited by block-level
 *    elements, tightly bound
 *
 * Block-level elements
 * --------------------
 * - Need some early clean-up to provide structure and offsets
 * - Establish scope limits for some inlines
 * - Line-based balanced by construction
 * - HTML tags need balancing/ matching / implicit close
 * - content in illegal places (e.g. between table and td tags) needs foster
 *   parenting
 * - grammar will match outermost pair if unmatched pairs are recognized as
 *   tokens (or as text)
 * - post-processing needed, but has to be limited by scope
 */

/* Tabs do not mix well with the hybrid production syntax */
/* vim: et:ts=4:sw=4:cindent */