mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-29 00:30:44 +00:00
59fc634cce
Change-Id: I321d9a58ea1af33842a606fc8706938093a8330f
936 lines
28 KiB
JavaScript
936 lines
28 KiB
JavaScript
require('../core-upgrade');
|
|
var HTML5 = require('../html5');
|
|
var events = require('events');
|
|
var Buffer = require('./buffer').Buffer;
|
|
var Models = HTML5.Models;
|
|
|
|
/**
 * Collect every enumerable property name of an object, inherited names
 * included (this is `for...in` enumeration, not `Object.keys`).
 *
 * @param {Object} h - object to inspect
 * @returns {string[]} property names in enumeration order
 */
function keys(h) {
  var names = [];
  var name;
  for (name in h) {
    names[names.length] = name;
  }
  return names;
}
|
|
|
|
var t = HTML5.Tokenizer = function HTML5Tokenizer(input, document) {
|
|
// Current state function; assigned by newState() and called in a loop by the
// 'data' and 'end' listeners.
var state;
|
|
// Input buffer that every state function reads from.
var buffer = new Buffer();
|
|
// Set by data_state when "<!--" is seen in CDATA/RCDATA/SCRIPT_CDATA content
// and cleared again on "-->".
var escapeFlag = false;
|
|
// Trailing characters recorded by data_state to detect "<!--" and "-->".
var lastFourChars = '';
|
|
// Token (tag, comment or doctype) currently being assembled.
var current_token = null;
|
|
// Character data collected by emitToken while the content model is
// SCRIPT_CDATA; flushed by emit_current_token on the closing script tag.
var script_buffer = null;
|
|
// Active content model; also exposed through the content_model accessor.
var content_model = Models.PCDATA;
|
|
|
|
/**
 * Data state: consumes character data and dispatches to the entity and
 * tag-open states depending on the active content model.
 *
 * @param buffer - input buffer (uses char, matchWhile, matchUntil, commit)
 * @returns {boolean} false once the EOF token has been emitted, true otherwise
 */
function data_state(buffer) {
  var c = buffer.char();
  // Only these content models can contain "<!--" ... "-->" escapes.
  var tracksEscapes = content_model == Models.CDATA
    || content_model == Models.RCDATA
    || content_model == Models.SCRIPT_CDATA;

  if (c !== HTML5.EOF && tracksEscapes) {
    lastFourChars = (lastFourChars + c).slice(-4);
  }

  if (content_model == Models.SCRIPT_CDATA && script_buffer == null) {
    script_buffer = '';
  }

  if (c === HTML5.EOF) {
    emitToken(HTML5.EOF_TOK);
    buffer.commit();
    return false;
  } else if (c === '\0' && (content_model == Models.SCRIPT_CDATA || content_model == Models.PLAINTEXT || content_model == Models.RAWTEXT || content_model == Models.RCDATA)) {
    emitToken({type: 'Characters', data: "\ufffd"});
    buffer.commit();
  } else if (c == '&' && (content_model == Models.PCDATA || content_model == Models.RCDATA) && !escapeFlag) {
    newState(entity_data_state);
  } else if (c == '-' && tracksEscapes && !escapeFlag && lastFourChars == '<!--') {
    escapeFlag = true;
    emitToken({type: 'Characters', data: c});
    buffer.commit();
  } else if (c == '<' && !escapeFlag && (content_model == Models.PCDATA || content_model == Models.RCDATA || content_model == Models.CDATA || content_model == Models.SCRIPT_CDATA)) {
    newState(tag_open_state);
  } else if (c == '>' && escapeFlag && tracksEscapes && lastFourChars.match(/-->$/)) {
    escapeFlag = false;
    emitToken({type: 'Characters', data: c});
    buffer.commit();
  } else if (HTML5.SPACE_CHARACTERS_R.test(c)) {
    var spaces = buffer.matchWhile(HTML5.SPACE_CHARACTERS);
    // Never concatenate the EOF sentinel into token data.
    emitToken({type: 'SpaceCharacters', data: spaces !== HTML5.EOF ? c + spaces : c});
    buffer.commit();
  } else {
    var o = buffer.matchUntil("[&<>-]");
    var run = (o !== HTML5.EOF) ? o : '';
    emitToken({type: 'Characters', data: c + run});
    // `c` was already recorded above when escapes are tracked; recording it
    // again would corrupt the "<!--" / "-->" detection window.
    lastFourChars = (lastFourChars + (tracksEscapes ? run : c + run)).slice(-4);
    buffer.commit();
  }
  return true;
}
|
|
|
|
/**
 * Entity data state: entered after "&" in character data. Emits the decoded
 * entity, or a literal "&" when nothing could be decoded, then returns to the
 * data state.
 *
 * @param buffer - input buffer handed on to consume_entity
 * @returns {boolean} always true
 */
var entity_data_state = function entity_data_state(buffer) {
  var decoded = consume_entity(buffer);
  emitToken({type: 'Characters', data: decoded ? decoded : '&'});
  newState(data_state);
  return true;
}
|
|
|
|
/**
 * Start tokenizing. Only has an effect when a `pump` function is set, which
 * is the case when the input was not an EventEmitter.
 */
this.tokenize = function() {
  if (!this.pump) return;
  this.pump();
}
|
|
|
|
/**
 * Normalize a token and deliver it. While the content model is SCRIPT_CDATA
 * and the buffer is not at EOF, character tokens are appended to
 * script_buffer instead of being emitted; everything else is emitted as a
 * 'token' event on the tokenizer.
 *
 * @param {Object} tok - token to emit
 */
var emitToken = function emitToken(tok) {
  tok = normalize_token(tok);
  var bufferingScript = content_model == Models.SCRIPT_CDATA
    && (tok.type == 'Characters' || tok.type == 'SpaceCharacters')
    && !buffer.eof;
  if (bufferingScript) {
    HTML5.debug('tokenizer.addScriptData', tok)
    script_buffer += tok.data;
    return;
  }
  HTML5.debug('tokenizer.token', tok)
  this.emit('token', tok);
}.bind(this);
|
|
|
|
/**
 * Try to consume a character reference; the leading "&" has already been
 * read by the caller.
 *
 * @param buffer - input buffer (uses char, shift, unget)
 * @param {boolean} [from_attr] - true when called for an attribute value. An
 *   unterminated named entity directly followed by a letter or digit is then
 *   left undecoded and "&" is returned.
 * @returns {string|null|false} the decoded character(s); null when no entity
 *   was recognised (consumed input is pushed back); false at EOF
 */
function consume_entity(buffer, from_attr) {
  var char = null;
  var chars = buffer.char();
  if (chars === HTML5.EOF) return false;

  if (chars.match(HTML5.SPACE_CHARACTERS) || chars == '<' || chars == '&') {
    buffer.unget(chars);
  } else if (chars[0] == '#') { // Maybe a numeric entity
    var c = buffer.shift(2);
    if (c === HTML5.EOF) {
      buffer.unget(chars);
      return false;
    }
    chars += c;
    if (chars[1] && chars[1].toLowerCase() == 'x' && chars.length > 2 && HTML5.HEX_DIGITS_R.test(chars[2])) {
      // Hex entity: push the first digit back for consume_numeric_entity
      buffer.unget(chars[2]);
      char = consume_numeric_entity(buffer, true);
    } else if (chars[1] && HTML5.DIGITS_R.test(chars[1])) {
      // Decimal entity: push back everything after "#"
      buffer.unget(chars.slice(1));
      char = consume_numeric_entity(buffer, false);
    } else {
      // Not numeric
      buffer.unget(chars);
      parse_error("expected-numeric-entity");
    }
  } else {
    var isCandidate = function(e) {
      return e.indexOf(chars) == 0;
    };
    var filteredEntityList = keys(HTML5.ENTITIES).filter(function(e) {
      return e[0] == chars[0];
    });
    var entityName = null;

    // Extend `chars` one character at a time while it is still a prefix of
    // some entity name, remembering the longest complete name seen so far.
    while (true) {
      if (!filteredEntityList.some(isCandidate)) break;
      filteredEntityList = filteredEntityList.filter(isCandidate);
      var next = buffer.char();
      if (next === HTML5.EOF) break;
      chars += next;

      if (HTML5.ENTITIES[chars]) {
        entityName = chars;
        if (entityName[entityName.length - 1] == ';') break;
      }
    }

    if (entityName) {
      char = HTML5.ENTITIES[entityName];
      var following = chars.substr(entityName.length, 1);
      var unterminated = entityName[entityName.length - 1] != ';';
      var followedByAlnum = following !== ''
        && (HTML5.ASCII_LETTERS_R.test(following) || HTML5.DIGITS_R.test(following));

      if (unterminated && from_attr && followedByAlnum) {
        // e.g. href="?a=1&copyright=2": leave the text untouched
        buffer.unget(chars);
        char = '&';
      } else {
        buffer.unget(chars.slice(entityName.length));
      }
    } else {
      parse_error("expected-named-entity");
      buffer.unget(chars);
    }
  }

  return char;
}
|
|
|
|
/**
 * Map a numeric character reference value to its replacement code point.
 *
 * @param {number} c - code point parsed from a numeric character reference
 * @returns {number|undefined} the code point to use instead (possibly equal
 *   to `c` for disallowed values that are reported but kept), or undefined
 *   when `c` needs no replacement and no parse error
 */
function replaceEntityNumbers(c) {
  switch(c) {
    case 0x00: return 0xFFFD; // REPLACEMENT CHARACTER
    case 0x0D: return 0x000D; // CARRIAGE RETURN
    case 0x80: return 0x20AC; // EURO SIGN
    case 0x81: return 0x0081; // <control>
    case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK
    case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK
    case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
    case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS
    case 0x86: return 0x2020; // DAGGER
    case 0x87: return 0x2021; // DOUBLE DAGGER
    case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
    case 0x89: return 0x2030; // PER MILLE SIGN
    case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
    case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE
    case 0x8D: return 0x008D; // <control>
    case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
    case 0x8F: return 0x008F; // <control>
    case 0x90: return 0x0090; // <control>
    case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK
    case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK
    case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK
    case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK
    case 0x95: return 0x2022; // BULLET
    case 0x96: return 0x2013; // EN DASH
    case 0x97: return 0x2014; // EM DASH
    case 0x98: return 0x02DC; // SMALL TILDE
    case 0x99: return 0x2122; // TRADE MARK SIGN
    case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON
    case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE
    case 0x9D: return 0x009D; // <control>
    case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON
    case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS
    default:
      // Surrogates and values beyond the Unicode range become U+FFFD.
      if ((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) {
        return 0xFFFD;
      }
      // Control characters and noncharacters are kept, but returning them
      // makes the caller report a parse error. The last test matches
      // U+xFFFE / U+xFFFF in every plane.
      if ((c >= 0x0001 && c <= 0x0008) || (c >= 0x000E && c <= 0x001F)
        || (c >= 0x007F && c <= 0x009F) || (c >= 0xFDD0 && c <= 0xFDEF)
        || c == 0x000B || (c & 0xFFFE) == 0xFFFE) {
        return c;
      }
  }
}
|
|
|
|
/**
 * Consume the digits of a numeric character reference and decode them.
 *
 * @param buffer - input buffer positioned at the first digit (uses char, unget)
 * @param {boolean} hex - true for "&#x..." references, false for decimal ones
 * @returns {string} the decoded character; code points above U+FFFF are
 *   returned as a UTF-16 surrogate pair
 */
function consume_numeric_entity(buffer, hex) {
  var allowed = hex ? HTML5.HEX_DIGITS_R : HTML5.DIGITS_R;
  var radix = hex ? 16 : 10;

  var chars = '';
  var c = buffer.char();
  while (c !== HTML5.EOF && allowed.test(c)) {
    chars = chars + c;
    c = buffer.char();
  }

  var charAsInt = parseInt(chars, radix);

  var replacement = replaceEntityNumbers(charAsInt);
  if (replacement) {
    parse_error("invalid-numeric-entity-replaced", {old: charAsInt, new: replacement});
    charAsInt = replacement;
  }

  var char;
  if (charAsInt > 0xFFFF && charAsInt <= 0x10FFFF) {
    // String.fromCharCode truncates to 16 bits, so build the pair by hand.
    var offset = charAsInt - 0x10000;
    char = String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
  } else {
    char = String.fromCharCode(charAsInt);
  }

  if (c !== ';') {
    // The terminator is optional; give the character back to the caller.
    parse_error("numeric-entity-without-semicolon");
    buffer.unget(c);
  }

  return char;
}
|
|
|
|
/**
 * Decode a character reference found inside an attribute value and append
 * the result (or a literal "&" when nothing was decoded) to the value of the
 * attribute currently being built.
 *
 * @param buffer - input buffer positioned just after the "&"
 */
function process_entity_in_attribute(buffer) {
  // Second argument is consume_entity's from_attr parameter: this call
  // originates from an attribute value.
  var entity = consume_entity(buffer, true);
  current_token.data.last().nodeValue += entity ? entity : '&';
}
|
|
|
|
/**
 * Handle a "/" inside a tag. When the current token is a start tag and the
 * next character is ">", the token becomes an 'EmptyTag'; anything else is
 * reported as a misplaced solidus.
 *
 * @param buffer - input buffer (peeked, not consumed)
 * @returns {boolean} true when the token was turned into an EmptyTag
 */
function process_solidus_in_tag(buffer) {
  var upcoming = buffer.peek(1);
  var closesStartTag = current_token.type == 'StartTag' && upcoming == '>';
  if (!closesStartTag) {
    parse_error("incorrectly-placed-solidus");
    return false;
  }
  current_token.type = 'EmptyTag';
  return true;
}
|
|
|
|
/**
 * Tag open state: entered after "<". Decides whether a start tag, end tag,
 * markup declaration or bogus comment follows, or whether the "<" was plain
 * text.
 *
 * @param buffer - input buffer (uses char, unget)
 * @returns {boolean} always true
 */
function tag_open_state(buffer) {
  var data = buffer.char();

  if (content_model != Models.PCDATA) {
    // The content model is RCDATA, CDATA or SCRIPT_CDATA here (this state is
    // never entered with PLAINTEXT), so only "</" can start markup.
    if (data === '/') {
      newState(close_tag_open_state);
    } else {
      emitToken({type: 'Characters', data: "<"});
      buffer.unget(data);
      newState(data_state);
    }
    return true
  }

  if (data === HTML5.EOF) {
    parse_error("bare-less-than-sign-at-eof");
    emitToken({type: 'Characters', data: '<'});
    newState(data_state);
  } else if (HTML5.ASCII_LETTERS_R.test(data)) {
    current_token = {type: 'StartTag', name: data, data: []};
    newState(tag_name_state);
  } else {
    switch (data) {
      case '!':
        newState(markup_declaration_open_state);
        break;
      case '/':
        newState(close_tag_open_state);
        break;
      case '>':
        // XXX In theory it could be something besides a tag name. But
        // do we really care?
        parse_error("expected-tag-name-but-got-right-bracket");
        emitToken({type: 'Characters', data: "<>"});
        newState(data_state);
        break;
      case '?':
        // XXX In theory it could be something besides a tag name. But
        // do we really care?
        parse_error("expected-tag-name-but-got-question-mark");
        buffer.unget(data);
        newState(bogus_comment_state);
        break;
      default:
        // XXX
        parse_error("expected-tag-name");
        emitToken({type: 'Characters', data: "<"});
        buffer.unget(data);
        newState(data_state);
    }
  }
  return true
}
|
|
|
|
/**
 * Close tag open state: entered after "</". In RCDATA/CDATA/SCRIPT_CDATA
 * content the end tag is only honoured when its name matches the current
 * token's name; otherwise "</" is emitted as text.
 *
 * @param buffer - input buffer (uses char, unget)
 * @returns {boolean} always true
 */
function close_tag_open_state(buffer) {
  var rawContent = content_model == Models.RCDATA
    || content_model == Models.CDATA
    || content_model == Models.SCRIPT_CDATA;

  if (rawContent) {
    var lookahead = '';
    var matchesOpenTag = false;

    if (current_token) {
      // Read the expected name plus one delimiter character, then push it
      // all back: this is look-ahead only.
      for (var i = 0; i <= current_token.name.length; i++) {
        var ch = buffer.char();
        if (ch === HTML5.EOF) break;
        lookahead += ch;
      }
      buffer.unget(lookahead);

      var expected = current_token.name;
      matchesOpenTag = expected.toLowerCase() == lookahead.slice(0, expected.length).toLowerCase()
        && (lookahead.length > expected.length
          ? new RegExp('[' + HTML5.SPACE_CHARACTERS_IN + '></\0]').test(lookahead.substr(-1))
          : true);
    }

    if (!matchesOpenTag) {
      emitToken({type: 'Characters', data: '</'});
      newState(data_state);
      return true
    }
    content_model = Models.PCDATA;
  }

  var data = buffer.char()
  if (data === HTML5.EOF) {
    parse_error("expected-closing-tag-but-got-eof");
    emitToken({type: 'Characters', data: '</'});
    buffer.unget(data);
    newState(data_state);
  } else if (HTML5.ASCII_LETTERS_R.test(data)) {
    current_token = {type: 'EndTag', name: data, data: []}
    newState(tag_name_state);
  } else if (data == '>') {
    parse_error("expected-closing-tag-but-got-right-bracket");
    newState(data_state);
  } else {
    parse_error("expected-closing-tag-but-got-char", {data: data}); // param 1 is datavars:
    buffer.unget(data);
    newState(bogus_comment_state);
  }
  return true;
}
|
|
|
|
/**
 * Tag name state: accumulates the name of the current tag token until
 * whitespace, "/", ">" or EOF is reached.
 *
 * @param buffer - input buffer (uses char, matchWhile, unget, commit)
 * @returns {boolean} always true
 */
function tag_name_state(buffer) {
  var ch = buffer.char();

  if (ch === HTML5.EOF) {
    parse_error('eof-in-tag-name');
    emit_current_token();
  } else if (HTML5.SPACE_CHARACTERS_R.test(ch)) {
    newState(before_attribute_name_state);
  } else if (HTML5.ASCII_LETTERS_R.test(ch)) {
    var letters = buffer.matchWhile(HTML5.ASCII_LETTERS)
    if (letters === HTML5.EOF) {
      current_token.name += ch
      buffer.unget(letters)
      newState(data_state);
    } else {
      current_token.name += ch + letters
    }
  } else {
    switch (ch) {
      case '>':
        emit_current_token();
        break;
      case '/':
        process_solidus_in_tag(buffer)
        newState(self_closing_tag_state);
        break;
      default:
        current_token.name += ch;
    }
  }

  buffer.commit();
  return true;
}
|
|
|
|
/**
 * Before attribute name state: skips whitespace inside a tag and starts a
 * new attribute, ends the tag, or moves to the self-closing state.
 *
 * @param buffer - input buffer (uses shift, matchWhile)
 * @returns {boolean} always true
 */
function before_attribute_name_state(buffer) {
  var data = buffer.shift(1);

  // Open a new attribute whose name starts with the character just read.
  function beginAttribute() {
    current_token.data.push({nodeName: data, nodeValue: ""});
    newState(attribute_name_state);
  }

  if (data === HTML5.EOF) {
    parse_error("expected-attribute-name-but-got-eof");
    emit_current_token();
  } else if (HTML5.SPACE_CHARACTERS_R.test(data)) {
    buffer.matchWhile(HTML5.SPACE_CHARACTERS);
  } else if (HTML5.ASCII_LETTERS_R.test(data)) {
    beginAttribute();
  } else if (data == '>') {
    emit_current_token();
  } else if (data == '/') {
    newState(self_closing_tag_state);
  } else {
    // Quotes and "=" are reported but still start an attribute name.
    if (data == "'" || data == '"' || data == '=') {
      parse_error("invalid-character-in-attribute-name");
    }
    beginAttribute();
  }
  return true;
}
|
|
|
|
/**
 * Attribute name state: extends the name of the attribute being built and,
 * when the name is complete, reports a duplicate of an earlier attribute on
 * the same tag.
 *
 * @param buffer - input buffer (uses shift, matchWhile, commit)
 * @returns {boolean} always true
 */
function attribute_name_state(buffer) {
  var data = buffer.shift(1);
  var leavingThisState = true;
  // Named shouldEmit so it does not shadow the emitToken() function.
  var shouldEmit = false;
  var attrs = current_token.data;
  var attr = attrs[attrs.length - 1];

  if (data === HTML5.EOF) {
    parse_error("eof-in-attribute-name");
    newState(data_state);
    shouldEmit = true;
  } else if (data == '=') {
    newState(before_attribute_value_state);
  } else if (HTML5.ASCII_LETTERS_R.test(data)) {
    var letters = buffer.matchWhile(HTML5.ASCII_LETTERS);
    // Never concatenate the EOF sentinel into the name.
    attr.nodeName += (letters !== HTML5.EOF) ? data + letters : data;
    leavingThisState = false;
  } else if (data == '>') {
    // XXX If we emit here the attributes are converted to a dict
    // without being checked and when the code below runs we error
    // because data is a dict not a list
    shouldEmit = true;
  } else if (HTML5.SPACE_CHARACTERS_R.test(data)) {
    newState(after_attribute_name_state);
  } else if (data == '/') {
    if (!process_solidus_in_tag(buffer)) {
      newState(before_attribute_name_state);
    }
  } else if (data == "'" || data == '"') {
    parse_error("invalid-character-in-attribute-name");
    attr.nodeName += data;
    leavingThisState = false;
  } else {
    attr.nodeName += data;
    leavingThisState = false;
  }

  if (leavingThisState) {
    // Attributes are not dropped at this stage. That happens when the
    // start tag token is emitted so values can still be safely appended
    // to attributes, but we do want to report the parse error in time.
    // NOTE(review): `this` is not the tokenizer in a plain function call, so
    // this option looks unreachable; guarded so it cannot throw - confirm.
    if (this && this.lowercase_attr_name) {
      attr.nodeName = attr.nodeName.toLowerCase();
    }
    // Names are lower-cased when the token is normalized, so compare
    // case-insensitively.
    var finishedName = attr.nodeName.toLowerCase();
    for (var i = 0; i < attrs.length - 1; i++) {
      if (attrs[i].nodeName.toLowerCase() == finishedName) {
        parse_error("duplicate-attribute");
        break; // Don't emit more than one of these errors
      }
    }
    if (shouldEmit) emit_current_token();
  } else {
    buffer.commit()
  }
  return true;
}
|
|
|
|
/**
 * After attribute name state: whitespace followed an attribute name. Decides
 * between "=", the end of the tag, a self-closing solidus, or the start of
 * another attribute.
 *
 * @param buffer - input buffer (uses shift, matchWhile)
 * @returns {boolean} always true
 */
function after_attribute_name_state(buffer) {
  var data = buffer.shift(1);

  // Open a new attribute whose name starts with the character just read.
  function beginAttribute() {
    current_token.data.push({nodeName: data, nodeValue: ""});
    newState(attribute_name_state);
  }

  if (data === HTML5.EOF) {
    parse_error("expected-end-of-tag-but-got-eof");
    emit_current_token();
  } else if (HTML5.SPACE_CHARACTERS_R.test(data)) {
    buffer.matchWhile(HTML5.SPACE_CHARACTERS);
  } else if (data == '=') {
    newState(before_attribute_value_state);
  } else if (data == '>') {
    emit_current_token();
  } else if (HTML5.ASCII_LETTERS_R.test(data)) {
    beginAttribute();
  } else if (data == '/') {
    newState(self_closing_tag_state);
  } else {
    beginAttribute();
  }
  return true;
}
|
|
|
|
/**
 * Before attribute value state: entered after "=". Picks the quoted or
 * unquoted value state, or ends the tag.
 *
 * @param buffer - input buffer (uses shift, matchWhile, unget)
 * @returns {boolean} always true
 */
function before_attribute_value_state(buffer) {
  var data = buffer.shift(1);

  if (data === HTML5.EOF) {
    parse_error("expected-attribute-value-but-got-eof");
    // emit_current_token() already returns to the data state. Switching to
    // the unquoted-value state afterwards would emit the same token again.
    emit_current_token();
  } else if (HTML5.SPACE_CHARACTERS_R.test(data)) {
    buffer.matchWhile(HTML5.SPACE_CHARACTERS);
  } else if (data == '"') {
    newState(attribute_value_double_quoted_state);
  } else if (data == '&') {
    // Let the unquoted-value state see the "&" and decode the entity.
    newState(attribute_value_unquoted_state);
    buffer.unget(data);
  } else if (data == "'") {
    newState(attribute_value_single_quoted_state);
  } else if (data == '>') {
    emit_current_token();
  } else {
    if (data == '=') {
      parse_error("equals-in-unquoted-attribute-value");
    }
    current_token.data.last().nodeValue += data;
    newState(attribute_value_unquoted_state);
  }

  return true;
}
|
|
|
|
/**
 * Attribute value (double-quoted) state: appends text and decoded entities
 * to the current attribute value until the closing double quote.
 *
 * @param buffer - input buffer (uses shift, matchUntil)
 * @returns {boolean} always true
 */
function attribute_value_double_quoted_state(buffer) {
  var ch = buffer.shift(1);

  if (ch === HTML5.EOF) {
    parse_error("eof-in-attribute-value-double-quote");
    newState(data_state);
    return true;
  }
  if (ch == '"') {
    newState(after_attribute_value_state);
    return true;
  }
  if (ch == '&') {
    process_entity_in_attribute(buffer);
    return true;
  }

  // Plain text: take everything up to the next quote or entity.
  var rest = buffer.matchUntil('["&]');
  var text = (rest !== HTML5.EOF) ? ch + rest : ch;
  current_token.data.last().nodeValue += text;
  return true;
}
|
|
|
|
/**
 * Attribute value (single-quoted) state: appends text and decoded entities
 * to the current attribute value until the closing single quote.
 *
 * @param buffer - input buffer (uses shift, matchUntil)
 * @returns {boolean} always true
 */
function attribute_value_single_quoted_state(buffer) {
  var data = buffer.shift(1);

  if (data === HTML5.EOF) {
    parse_error("eof-in-attribute-value-single-quote");
    emit_current_token();
  } else if (data == "'") {
    newState(after_attribute_value_state);
  } else if (data == '&') {
    process_entity_in_attribute(buffer);
  } else {
    var rest = buffer.matchUntil("['&]");
    // Same guard as the double-quoted state: never append the EOF sentinel.
    if (rest !== HTML5.EOF) data = data + rest;
    current_token.data.last().nodeValue += data;
  }
  return true;
}
|
|
|
|
/**
 * Attribute value (unquoted) state: appends text and decoded entities to the
 * current attribute value until whitespace, ">" or EOF.
 *
 * @param buffer - input buffer (uses shift, matchUntil, commit)
 * @returns {boolean} always true
 */
function attribute_value_unquoted_state(buffer) {
  var data = buffer.shift(1);

  if (data === HTML5.EOF) {
    parse_error("eof-in-attribute-value-no-quotes");
    buffer.commit();
    emit_current_token();
  } else if (HTML5.SPACE_CHARACTERS_R.test(data)) {
    newState(before_attribute_name_state);
  } else if (data == '&') {
    process_entity_in_attribute(buffer);
  } else if (data == '>') {
    emit_current_token();
  } else if (data == '"' || data == "'" || data == '=') {
    parse_error("unexpected-character-in-unquoted-attribute-value");
    current_token.data.last().nodeValue += data;
  } else {
    var o = buffer.matchUntil("["+ HTML5.SPACE_CHARACTERS_IN + '&<>' +"]")
    var hitEof = (o === HTML5.EOF);
    // Complete the value before the token can be emitted, and keep the EOF
    // sentinel out of it.
    current_token.data.last().nodeValue += hitEof ? data : data + o;
    if (hitEof) {
      parse_error("eof-in-attribute-value-no-quotes");
      emit_current_token();
    }
    // Commit here since this state is re-enterable and its outcome won't change with more data.
    buffer.commit();
  }
  return true;
}
|
|
|
|
/**
 * After attribute value state: a quoted attribute value has just been
 * closed. Expects whitespace, "/", ">" or EOF; any other character is
 * reported and re-read as the start of a new attribute.
 *
 * @param buffer - input buffer (uses shift, unget)
 * @returns {boolean} always true
 */
function after_attribute_value_state(buffer) {
  var ch = buffer.shift(1);

  if (ch === HTML5.EOF) {
    parse_error( "unexpected-EOF-after-attribute-value");
    emit_current_token();
    buffer.unget(ch);
    newState(data_state);
    return true;
  }
  if (HTML5.SPACE_CHARACTERS_R.test(ch)) {
    newState(before_attribute_name_state);
    return true;
  }
  if (ch == '>') {
    emit_current_token();
    newState(data_state);
    return true;
  }
  if (ch == '/') {
    newState(self_closing_tag_state);
    return true;
  }

  emitToken({type: 'ParseError', data: "unexpected-character-after-attribute-value"});
  buffer.unget(ch);
  newState(before_attribute_name_state);
  return true;
}
|
|
|
|
/**
 * Self-closing tag state: entered after "/" inside a tag. A following ">"
 * marks the token as self-closing and emits it; anything else is re-read as
 * the start of an attribute.
 *
 * @param buffer - input buffer (uses shift, unget)
 * @returns {boolean} always true
 */
function self_closing_tag_state(buffer) {
  var next = buffer.shift(1);

  if (next == '>' && next !== HTML5.EOF) {
    current_token.self_closing = true;
    emit_current_token();
    newState(data_state);
    return true;
  }

  var atEof = (next === HTML5.EOF);
  parse_error(atEof ? "eof-in-tag-name" : "expected-self-closing-tag");
  buffer.unget(next);
  newState(atEof ? data_state : before_attribute_name_state);
  return true;
}
|
|
|
|
/**
 * Bogus comment state: turns everything up to the next ">" into a Comment
 * token and returns to the data state.
 *
 * @param buffer - input buffer (uses matchUntil, char)
 * @returns {boolean} always true
 */
function bogus_comment_state(buffer) {
  var text = buffer.matchUntil('>');
  var comment = {type: 'Comment', data: (text === HTML5.EOF) ? '' : text};
  // Consume the terminating character before emitting.
  buffer.char()
  emitToken(comment);
  newState(data_state);
  return true;
}
|
|
|
|
/**
 * Markup declaration open state: entered after "<!". Recognises "--"
 * (comment) and "DOCTYPE" (case-insensitive); anything else becomes a bogus
 * comment.
 *
 * @param buffer - input buffer (uses shift, unget)
 * @returns {boolean} always true
 */
function markup_declaration_open_state(buffer) {
  var lead = buffer.shift(2);

  if (lead === '--') {
    current_token = {type: 'Comment', data: ''};
    newState(comment_start_state);
    return true;
  }

  var tail = buffer.shift(5);
  if (tail === HTML5.EOF || lead === HTML5.EOF) {
    parse_error("expected-dashes-or-doctype");
    newState(bogus_comment_state);
    buffer.unget(lead);
    return true;
  }

  var keyword = lead + tail;
  if (keyword.toUpperCase() == 'DOCTYPE') {
    current_token = {type: 'Doctype', name: '', publicId: null, systemId: null, correct: true};
    newState(doctype_state);
    return true;
  }

  parse_error("expected-dashes-or-doctype");
  buffer.unget(keyword);
  newState(bogus_comment_state);
  return true;
}
|
|
|
|
/**
 * Comment start state: entered right after "<!--".
 *
 * @param buffer - input buffer (uses shift, matchUntil)
 * @returns {boolean} always true
 */
function comment_start_state(buffer) {
  var data = buffer.shift(1);

  if (data === HTML5.EOF) {
    parse_error("eof-in-comment");
    emitToken(current_token);
    newState(data_state);
  } else if (data == '-') {
    newState(comment_start_dash_state);
  } else if (data == '>') {
    // Same error code as comment_start_dash_state uses for this case.
    parse_error("incorrect-comment");
    emitToken(current_token);
    newState(data_state);
  } else {
    var rest = buffer.matchUntil('-');
    // Never append the EOF sentinel to the comment text.
    if (rest !== HTML5.EOF) data = data + rest;
    current_token.data += data;
    newState(comment_state);
  }
  return true;
}
|
|
|
|
/**
 * Comment start dash state: entered after "<!---".
 *
 * @param buffer - input buffer (uses shift, matchUntil)
 * @returns {boolean} always true
 */
function comment_start_dash_state(buffer) {
  var ch = buffer.shift(1);

  if (ch === HTML5.EOF || ch == '>') {
    // Both cases end the comment here; only the error code differs.
    parse_error(ch === HTML5.EOF ? "eof-in-comment" : "incorrect-comment");
    emitToken(current_token);
    newState(data_state);
    return true;
  }
  if (ch == '-') {
    newState(comment_end_state);
    return true;
  }

  var rest = buffer.matchUntil('-');
  var text = (rest !== HTML5.EOF) ? ch + rest : ch;
  // The single dash that brought us here turned out to be comment text.
  current_token.data += '-' + text;
  newState(comment_state);
  return true;
}
|
|
|
|
/**
 * Comment state: accumulates comment text up to the next "-".
 *
 * @param buffer - input buffer (uses shift, matchUntil)
 * @returns {boolean} always true
 */
function comment_state(buffer) {
  var data = buffer.shift(1);

  if (data === HTML5.EOF) {
    parse_error("eof-in-comment");
    emitToken(current_token);
    newState(data_state);
  } else if (data == '-') {
    newState(comment_end_dash_state);
  } else {
    var rest = buffer.matchUntil('-');
    // Never append the EOF sentinel to the comment text.
    if (rest !== HTML5.EOF) data = data + rest;
    current_token.data += data;
  }
  return true;
}
|
|
|
|
/**
 * Comment end dash state: a single "-" was seen inside a comment.
 *
 * @param buffer - input buffer (uses char, matchUntil)
 * @returns {boolean} always true
 */
function comment_end_dash_state(buffer) {
  var data = buffer.char();

  if (data === HTML5.EOF) {
    parse_error("eof-in-comment-end-dash");
    emitToken(current_token);
    newState(data_state);
  } else if (data == '-') {
    newState(comment_end_state);
  } else {
    var rest = buffer.matchUntil('-');
    // The dash was ordinary text; never append the EOF sentinel after it.
    current_token.data += '-' + data + (rest !== HTML5.EOF ? rest : '');
    // Consume the next character which is either a "-" or an :EOF as
    // well so if there's a "-" directly after the "-" we go nicely to
    // the "comment end state" without emitting a ParseError there.
    buffer.char();
  }
  return true;
}
|
|
|
|
/**
 * Comment end state: "--" was seen inside a comment; ">" closes it.
 *
 * @param buffer - input buffer (uses shift)
 * @returns {boolean} always true
 */
function comment_end_state(buffer) {
  var ch = buffer.shift(1);

  if (ch === HTML5.EOF) {
    parse_error("eof-in-comment-double-dash");
    emitToken(current_token);
    newState(data_state);
    return true;
  }

  switch (ch) {
    case '>':
      emitToken(current_token);
      newState(data_state);
      break;
    case '-':
      // A third dash: keep it as text and stay in this state.
      parse_error("unexpected-dash-after-double-dash-in-comment");
      current_token.data += ch;
      break;
    default:
      // XXX
      parse_error("unexpected-char-in-comment");
      current_token.data += '--' + ch;
      newState(comment_state);
  }
  return true;
}
|
|
|
|
/**
 * Doctype state: entered after "<!DOCTYPE". Whitespace is expected; any
 * other character is reported and pushed back before moving on.
 *
 * @param buffer - input buffer (uses shift, unget)
 * @returns {boolean} always true
 */
function doctype_state(buffer) {
  var ch = buffer.shift(1);
  if (!HTML5.SPACE_CHARACTERS_R.test(ch)) {
    parse_error("need-space-after-doctype");
    buffer.unget(ch);
  }
  newState(before_doctype_name_state);
  return true;
}
|
|
|
|
/**
 * Before doctype name state: skips whitespace, then starts the doctype name.
 * EOF or ">" before any name yields an incorrect doctype token.
 *
 * @param buffer - input buffer (uses shift)
 * @returns {boolean} always true
 */
function before_doctype_name_state(buffer) {
  var ch = buffer.shift(1);
  var atEof = (ch === HTML5.EOF);

  if (!atEof && HTML5.SPACE_CHARACTERS_R.test(ch)) {
    // Leading whitespace is skipped; stay in this state.
    return true
  }

  if (atEof || ch == '>') {
    parse_error(atEof ? "expected-doctype-name-but-got-eof" : "expected-doctype-name-but-got-right-bracket");
    current_token.correct = false;
    emit_current_token();
    newState(data_state);
    return true
  }

  current_token.name = ch;
  newState(doctype_name_state);
  return true
}
|
|
|
|
/**
 * Doctype name state: accumulates the doctype name until whitespace, ">" or
 * EOF.
 *
 * @param buffer - input buffer (uses shift, unget)
 * @returns {boolean} always true
 */
function doctype_name_state(buffer) {
  var ch = buffer.shift(1);

  if (ch === HTML5.EOF) {
    current_token.correct = false;
    buffer.unget(ch);
    parse_error("eof-in-doctype");
    emit_current_token();
    newState(data_state);
    return true;
  }
  if (HTML5.SPACE_CHARACTERS_R.test(ch)) {
    // Anything after the name is handled by the bogus doctype state.
    newState(bogus_doctype_state);
    return true;
  }
  if (ch == '>') {
    emit_current_token();
    newState(data_state);
    return true;
  }

  current_token.name += ch;
  return true;
}
|
|
|
|
/**
 * Bogus doctype state: skips the rest of a doctype, which is marked as
 * incorrect, until ">" or EOF.
 *
 * @param buffer - input buffer (uses shift, unget)
 * @returns {boolean} always true
 */
function bogus_doctype_state(buffer) {
  var data = buffer.shift(1);
  current_token.correct = false;

  if (data === HTML5.EOF) {
    // Finish the doctype the same way doctype_name_state does at EOF,
    // instead of throwing and aborting tokenization.
    buffer.unget(data);
    parse_error("eof-in-doctype");
    emit_current_token();
    newState(data_state);
  } else if (data == '>') {
    emit_current_token();
    newState(data_state);
  }
  return true;
}
|
|
|
|
/**
 * Report a parse error: emits a 'ParseError' token carrying the error code
 * and logs the code together with optional context.
 *
 * @param {string} message - error code
 * @param {Object} [context] - extra details, passed to the debug log only
 */
function parse_error(message, context) {
  var errorToken = {type: 'ParseError', data: message};
  emitToken(errorToken);
  HTML5.debug('tokenizer.parseError', message, context);
}
|
|
|
|
/**
 * Emit the token currently being built and return to the data state. When
 * the token is the end tag of a script element, buffered script text is
 * emitted first as one Characters token.
 */
function emit_current_token() {
  var tok = current_token;

  if (tok.type == 'EndTag' && tok.self_closing) {
    parse_error('self-closing-end-tag');
  }

  var closesScript = current_token.name.toLowerCase() == "script" && tok.type == 'EndTag';
  if (closesScript && script_buffer) {
    emitToken({ type: 'Characters', data: script_buffer });
    script_buffer = null;
  }

  emitToken(tok);
  newState(data_state);
}
|
|
|
|
/**
 * Bring a token into canonical form before it is emitted. The token is
 * modified in place.
 *
 * - 'EmptyTag' becomes 'StartTag'; a parse error is reported when the
 *   element is not a void element.
 * - Start tag names and attribute names are lower-cased; of several
 *   attributes with the same name only the first is kept, in source order.
 * - End tag names are lower-cased and attributes on end tags are reported.
 *
 * @param {Object} token - token to normalize
 * @returns {Object} the same token object
 */
function normalize_token(token) {
  if (token.type == 'EmptyTag') {
    // Compare case-insensitively: the name is not lower-cased yet.
    if (HTML5.VOID_ELEMENTS.indexOf(token.name.toLowerCase()) == -1) {
      parse_error('incorrectly-placed-solidus');
    }
    token.type = 'StartTag';
  }

  if (token.type == 'StartTag') {
    token.name = token.name.toLowerCase();
    if (token.data.length != 0) {
      // Prototype-less map so names such as "constructor" or "__proto__"
      // are ordinary keys.
      var seen = Object.create(null);
      var unique = [];
      token.data.forEach(function(attr) {
        var name = attr.nodeName.toLowerCase();
        if (!seen[name]) {
          seen[name] = true;
          unique.push({nodeName: name, nodeValue: attr.nodeValue});
        }
      });
      token.data = unique;
    }
  } else if (token.type == 'EndTag') {
    if (token.data.length != 0) parse_error('attributes-in-end-tag');
    token.name = token.name.toLowerCase();
  }

  return token;
}
|
|
|
|
// An input (a string-like value or an EventEmitter) is mandatory.
if (typeof input === 'undefined') {
  throw(new Error("No input given"));
}
var content_model;
this.document = document;
// Expose the closure variable as tokenizer.content_model; writes are logged.
Object.defineProperty(this, 'content_model', {
  enumerable: true,
  configurable: true,
  get: function() {
    return content_model
  },
  set: function(model) {
    HTML5.debug('tokenizer.content_model=', model)
    content_model = model
  }
})
|
|
/**
 * Switch the tokenizer to another state function and commit the buffer.
 *
 * @param {Function} newstate - state function to run next
 */
function newState(newstate) {
  var label = newstate.name;
  HTML5.debug('tokenizer.state=', label)
  state = newstate;
  buffer.commit();
}
|
|
|
|
// Tokenizing always starts in the data state.
newState(data_state);

// Use an EventEmitter input directly as the event source. Any other input is
// wrapped: pump() replays it as a single 'data' event followed by 'end'.
var source;
var inputIsEmitter = input instanceof events.EventEmitter;
if (inputIsEmitter) {
  source = input;
  this.pump = null;
} else {
  source = new events.EventEmitter();
  this.pump = function() {
    source.emit('data', input);
    source.emit('end');
  }
}
|
|
|
|
// Feed each incoming chunk to the buffer and run the state machine until a
// state returns false. HTML5.DRAIN is thrown when more input is needed; the
// buffer is then rewound. Any other exception is rethrown.
source.addListener('data', function(data) {
  var chunk = (typeof data !== 'string') ? data.toString() : data;
  buffer.append(chunk);
  try {
    while (state(buffer)) {
      // every step is done inside the state function
    }
  } catch (e) {
    if (e == HTML5.DRAIN) {
      HTML5.debug('tokenizer.drain', 'Drain')
      buffer.undo();
      return;
    }
    throw(e);
  }
});
|
|
// End of input: flag the buffer as complete, run the state machine until a
// state returns false, then signal 'end' on the tokenizer.
source.addListener('end', function() {
  buffer.eof = true;
  var running = true;
  while (running) {
    running = state(buffer);
  }
  this.emit('end');
}.bind(this));
|
|
|
|
}
|
|
|
|
// Inherit from EventEmitter without instantiating one as the prototype: an
// EventEmitter instance carries its own listener table, which every
// tokenizer would then share through the prototype chain.
t.prototype = Object.create(events.EventEmitter.prototype, {
  constructor: {value: t, enumerable: false, writable: true, configurable: true}
});
|