mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-16 02:51:50 +00:00
b750ce38b8
Builds a DOM tree (jsdom) from the tokens and then serializes that using document.innerHTML. This is all very experimental, so don't be surprised by rough edges.
866 lines
25 KiB
JavaScript
866 lines
25 KiB
JavaScript
//require('../core-upgrade');
|
|
var HTML5 = require('../html5');
|
|
var events = require('events');
|
|
var Buffer = require('./buffer').Buffer;
|
|
var Models = HTML5.Models;
|
|
|
|
// Collect every enumerable property name of `h` (inherited ones included,
// because for..in is used) into a new array.
function keys(h) {
	var names = [];
	var name;
	for(name in h) {
		names[names.length] = name;
	}
	return names;
}
|
|
|
|
// Tokenizer constructor.
//
// input    - either a string (or anything with toString()) holding the whole
//            document, or an EventEmitter that fires 'data' and 'end' events.
// document - stored on the instance as `this.document`; not used here.
//
// Throws an Error when `input` is falsy.
//
// `t` is declared with `var` so it no longer leaks as an implicit global, and
// the accessors use Object.defineProperty instead of the deprecated
// __defineSetter__/__defineGetter__ helpers.
var t = HTML5.Tokenizer = function HTML5Tokenizer(input, document) {
	if(!input) throw(new Error("No input given"));

	var tokenizer = this;
	var content_model;
	var state;
	var source;

	this.document = document;

	Object.defineProperty(this, 'content_model', {
		enumerable: true,
		configurable: true,
		get: function() {
			return content_model;
		},
		set: function(model) {
			HTML5.debug('tokenizer.content_model=', model);
			content_model = model;
		}
	});
	this.content_model = Models.PCDATA;

	var buffer = this.buffer = new Buffer();

	Object.defineProperty(this, 'state', {
		enumerable: true,
		configurable: true,
		// The original only defined a setter, so reading `state` gave undefined.
		get: function() {
			return state;
		},
		// Every state change commits the buffer.
		set: function(newstate) {
			HTML5.debug('tokenizer.state=', newstate);
			state = newstate;
			buffer.commit();
		}
	});
	this.state = 'data_state';

	this.escapeFlag = false;
	this.lastFourChars = '';
	this.current_token = null;

	if(input instanceof events.EventEmitter) {
		source = input;
		this.pump = null;
	} else {
		// Wrap plain input in an emitter; tokenize() calls pump() to replay it.
		source = new events.EventEmitter();
		this.pump = function() {
			source.emit('data', input);
			source.emit('end');
		};
	}

	this.commit = function() {
		buffer.commit();
	};

	source.addListener('data', function(data) {
		if(typeof data !== 'string') data = data.toString();
		buffer.append(data);
		try {
			// Run state handlers until one returns a falsy value.
			while(tokenizer[state](buffer));
		} catch(e) {
			if(e != HTML5.DRAIN) {
				throw(e);
			} else {
				// HTML5.DRAIN was thrown: undo the buffer and wait for more data.
				HTML5.debug('tokenizer.drain', 'Drain');
				buffer.undo();
			}
		}
	});

	source.addListener('end', function() {
		buffer.eof = true;
		while(tokenizer[state](buffer));
		tokenizer.emit('end');
	});
};
|
|
|
|
// Tokenizer instances inherit emit()/addListener() from EventEmitter.
t.prototype = new events.EventEmitter();
|
|
|
|
// Start tokenizing. Only has an effect when the constructor installed a
// `pump` (i.e. the input was not an EventEmitter).
t.prototype.tokenize = function() {
	if(!this.pump) return;
	this.pump();
};
|
|
|
|
// Normalize a token and hand it to listeners as a 'token' event. While the
// content model is SCRIPT_CDATA, character and whitespace tokens are appended
// to `script_buffer` instead of being emitted.
t.prototype.emitToken = function(tok) {
	var token = this.normalize_token(tok);
	HTML5.debug('tokenizer.token', token);

	var buffering_script = this.content_model == Models.SCRIPT_CDATA
		&& (token.type == 'Characters' || token.type == 'SpaceCharacters');
	if(buffering_script) {
		this.script_buffer += token.data;
		return;
	}
	this.emit('token', token);
};
|
|
|
|
// Try to consume a character reference from `buffer`.
//
// buffer    - the tokenizer input buffer.
// from_attr - truthy when the reference appears inside an attribute value.
//             `this.from_attribute` is still honoured as a fallback because
//             the original code read that property instead of this parameter.
//
// Returns the decoded character(s), `null` when no entity could be decoded
// (the consumed characters are pushed back), or `false` when EOF is hit
// before an entity could start.
//
// Fixes: `from_attr` is now used; the letter/digit look-ahead is parenthesized
// correctly (the digit test used to sit inside the argument of the letter
// test) and uses HTML5.DIGITS_R like the rest of the file.
t.prototype.consume_entity = function(buffer, from_attr) {
	var char = null;
	var chars = buffer.char();
	if(chars == HTML5.EOF) return false;

	if(chars.match(HTML5.SPACE_CHARACTERS) || chars == '<' || chars == '&') {
		// Not an entity at all; leave the character for the caller.
		buffer.unget(chars);
	} else if(chars[0] == '#') { // Maybe a numeric entity
		var c = buffer.shift(2);
		if(c == HTML5.EOF) {
			buffer.unget(chars);
			return false;
		}
		chars += c;
		if(chars[1] && chars[1].toLowerCase() == 'x' && HTML5.HEX_DIGITS_R.test(chars[2])) {
			// Hex entity: push back the first digit, the 'x' stays consumed.
			buffer.unget(chars[2]);
			char = this.consume_numeric_entity(buffer, true);
		} else if(chars[1] && HTML5.DIGITS_R.test(chars[1])) {
			// Decimal entity: push back everything after the '#'.
			buffer.unget(chars.slice(1));
			char = this.consume_numeric_entity(buffer, false);
		} else {
			// Not numeric
			buffer.unget(chars);
			this.parse_error("expected-numeric-entity");
		}
	} else {
		// Named entity: narrow the candidate list one character at a time and
		// remember the longest complete name seen so far.
		var filteredEntityList = keys(HTML5.ENTITIES).filter(function(e) {
			return e[0] == chars[0];
		});
		var hasPrefix = function(e) {
			return e.indexOf(chars) == 0;
		};
		var entityName = null;

		while(true) {
			if(filteredEntityList.some(hasPrefix)) {
				filteredEntityList = filteredEntityList.filter(hasPrefix);
				// NOTE(review): buffer.char() may return HTML5.EOF here and it
				// is concatenated as-is; confirm how Buffer reports EOF.
				chars += buffer.char();
			} else {
				break;
			}

			if(HTML5.ENTITIES[chars]) {
				entityName = chars;
				if(entityName[entityName.length - 1] == ';') break;
			}
		}

		if(entityName) {
			char = HTML5.ENTITIES[entityName];

			var in_attribute = from_attr || this.from_attribute;
			var following = chars.substr(entityName.length, 1);
			var followed_by_alnum = HTML5.ASCII_LETTERS_R.test(following) || HTML5.DIGITS_R.test(following);

			if(entityName[entityName.length - 1] != ';' && in_attribute && followed_by_alnum) {
				// Inside an attribute, an unterminated name followed by a
				// letter or digit is kept as literal text.
				buffer.unget(chars);
				char = '&';
			} else {
				// Push back whatever was read past the entity name.
				buffer.unget(chars.slice(entityName.length));
			}
		} else {
			this.parse_error("expected-named-entity");
			buffer.unget(chars);
		}
	}

	return char;
};
|
|
|
|
// Consume the digits of a numeric character reference and decode them.
//
// buffer - input buffer positioned at the first digit.
// hex    - truthy for hexadecimal references, falsy for decimal.
//
// Returns the decoded string; U+FFFD when the value is out of range, a
// surrogate, zero or unparsable. A missing trailing ';' is reported as a parse
// error and the character read in its place is pushed back.
//
// Fixes: `chars` and `char` were implicit globals; code points above U+FFFF
// are now encoded as a surrogate pair (String.fromCharCode alone truncates
// them to 16 bits).
t.prototype.consume_numeric_entity = function(buffer, hex) {
	var allowed = hex ? HTML5.HEX_DIGITS_R : HTML5.DIGITS_R;
	var radix = hex ? 16 : 10;
	var chars = '';
	var char;

	var c = buffer.char();
	while(allowed.test(c)) {
		chars = chars + c;
		c = buffer.char();
	}

	var charAsInt = parseInt(chars, radix);

	if(charAsInt == 13) {
		this.parse_error("incorrect-cr-newline-entity");
		charAsInt = 10;
	} else if(charAsInt >= 128 && charAsInt <= 159) {
		this.parse_error("illegal-windows-1252-entity");
		// NOTE(review): assumes the table holds numeric code points - confirm.
		charAsInt = HTML5.ENTITIES_WINDOWS1252[charAsInt - 128];
	}

	if(0 < charAsInt && charAsInt <= 1114111 && !(55296 <= charAsInt && charAsInt <= 57343)) {
		if(charAsInt > 0xFFFF) {
			var offset = charAsInt - 0x10000;
			char = String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
		} else {
			char = String.fromCharCode(charAsInt);
		}
	} else {
		// Also reached when parseInt produced NaN (no digits were read).
		char = String.fromCharCode(0xFFFD);
		this.parse_error("cant-convert-numeric-entity");
	}

	if(c != ';') {
		this.parse_error("numeric-entity-without-semicolon");
		buffer.unget(c);
	}

	return char;
};
|
|
|
|
// Decode a character reference inside an attribute value and append the
// result (or a literal '&' when nothing was decoded) to the value of the
// attribute currently being built.
//
// Fix: consume_entity declares a second `from_attr` parameter; pass `true`
// since this helper is only used for attribute values.
t.prototype.process_entity_in_attribute = function(buffer) {
	var entity = this.consume_entity(buffer, true);
	var attribute = this.current_token.data.last();
	if(entity) {
		attribute.nodeValue += entity;
	} else {
		attribute.nodeValue += '&';
	}
};
|
|
|
|
// Handle a '/' seen inside a tag. When the current token is a start tag and
// the next character is '>', the token becomes an 'EmptyTag' and true is
// returned; otherwise a parse error is reported and false is returned.
t.prototype.process_solidus_in_tag = function(buffer) {
	var upcoming = buffer.peek(1);
	var closes_start_tag = this.current_token.type == 'StartTag' && upcoming == '>';

	if(!closes_start_tag) {
		this.parse_error("incorrectly-placed-solidus");
		return false;
	}

	this.current_token.type = 'EmptyTag';
	return true;
};
|
|
|
|
// Data state: emits character/whitespace tokens and dispatches to the entity
// and tag-open states depending on the content model and the escape flag.
// Returns false once EOF has been emitted, true otherwise.
//
// Fix: the first condition lacked parentheses, so `c != EOF` only guarded the
// CDATA case and the EOF marker could be appended to `lastFourChars` in the
// RCDATA and SCRIPT_CDATA models.
t.prototype.data_state = function(buffer) {
	var c = buffer.char();
	var model = this.content_model;
	var raw_text = model == Models.CDATA || model == Models.RCDATA || model == Models.SCRIPT_CDATA;

	if(c != HTML5.EOF && raw_text) {
		// Track the last four characters to detect "<!--" and "-->".
		this.lastFourChars += c;
		if(this.lastFourChars.length >= 4) {
			this.lastFourChars = this.lastFourChars.substr(-4);
		}
	}

	if(model == Models.SCRIPT_CDATA) {
		if(this.script_buffer == null) {
			this.script_buffer = '';
		}
	}

	if(c == HTML5.EOF) {
		this.emitToken(HTML5.EOF_TOK);
		this.commit();
		return false;
	} else if(c == '&' && (model == Models.PCDATA || model == Models.RCDATA) && !this.escapeFlag) {
		this.state = 'entity_data_state';
	} else if(c == '-' && raw_text && !this.escapeFlag && this.lastFourChars == '<!--') {
		this.escapeFlag = true;
		this.emitToken({type: 'Characters', data: c});
		this.commit();
	} else if(c == '<' && !this.escapeFlag && (model == Models.PCDATA || raw_text)) {
		this.state = 'tag_open_state';
	} else if(c == '>' && this.escapeFlag && raw_text && this.lastFourChars.match(/-->$/)) {
		this.escapeFlag = false;
		this.emitToken({type: 'Characters', data: c});
		this.commit();
	} else if(HTML5.SPACE_CHARACTERS_R.test(c)) {
		this.emitToken({type: 'SpaceCharacters', data: c + buffer.matchWhile(HTML5.SPACE_CHARACTERS)});
		this.commit();
	} else {
		var o = buffer.matchUntil("[&<>-]");
		this.emitToken({type: 'Characters', data: c + o});
		this.lastFourChars += c + o;
		this.lastFourChars = this.lastFourChars.slice(-4);
		this.commit();
	}
	return true;
};
|
|
|
|
// Entity-in-data state: emit the decoded entity, or a literal '&' when
// consume_entity returned nothing truthy, then return to the data state.
t.prototype.entity_data_state = function(buffer) {
	var decoded = this.consume_entity(buffer);
	this.emitToken({type: 'Characters', data: decoded ? decoded : '&'});
	this.state = 'data_state';
	return true;
};
|
|
|
|
// Tag-open state, entered after a '<' in the data state.
t.prototype.tag_open_state = function(buffer) {
	var ch = buffer.char();

	if(this.content_model != Models.PCDATA) {
		// We know the content model flag is set to either RCDATA or CDATA or
		// SCRIPT_CDATA now because this state can never be entered with the
		// PLAINTEXT flag. Only a close tag is recognised in these models.
		if(ch == '/') {
			this.state = 'close_tag_open_state';
		} else {
			this.emitToken({type: 'Characters', data: "<"});
			buffer.unget(ch);
			this.state = 'data_state';
		}
		return true;
	}

	if(ch == '!') {
		this.state = 'markup_declaration_open_state';
		return true;
	}

	if(ch == '/') {
		this.state = 'close_tag_open_state';
		return true;
	}

	if(ch != HTML5.EOF && HTML5.ASCII_LETTERS_R.test(ch)) {
		this.current_token = {type: 'StartTag', name: ch, data: []};
		this.state = 'tag_name_state';
		return true;
	}

	if(ch == '>') {
		// XXX In theory it could be something besides a tag name. But
		// do we really care?
		this.parse_error("expected-tag-name-but-got-right-bracket");
		this.emitToken({type: 'Characters', data: "<>"});
		this.state = 'data_state';
		return true;
	}

	if(ch == '?') {
		// XXX In theory it could be something besides a tag name. But
		// do we really care?
		this.parse_error("expected-tag-name-but-got-question-mark");
		buffer.unget(ch);
		this.state = 'bogus_comment_state';
		return true;
	}

	// XXX
	this.parse_error("expected-tag-name");
	this.emitToken({type: 'Characters', data: "<"});
	buffer.unget(ch);
	this.state = 'data_state';
	return true;
};
|
|
|
|
// Close-tag-open state, entered after "</".
//
// In the RCDATA, CDATA and SCRIPT_CDATA models the end tag is only honoured
// when its name matches the current token's name (case-insensitively) and is
// followed by whitespace, '>', '/', '<' or NUL; otherwise "</" is emitted as
// text.
//
// Fix: `data` was assigned without `var` and leaked as a global.
t.prototype.close_tag_open_state = function(buffer) {
	var data;

	if(this.content_model == Models.RCDATA || this.content_model == Models.CDATA || this.content_model == Models.SCRIPT_CDATA) {
		var chars = '';
		if(this.current_token) {
			// Look ahead one character past the tag name (hence `<=`), then
			// push everything back.
			for(var i = 0; i <= this.current_token.name.length; i++) {
				var c = buffer.char();
				if(c == HTML5.EOF) break;
				chars += c;
			}
			buffer.unget(chars);
		}

		if(this.current_token
			&& this.current_token.name.toLowerCase() == chars.slice(0, this.current_token.name.length).toLowerCase()
			&& (chars.length > this.current_token.name.length ? new RegExp('[' + HTML5.SPACE_CHARACTERS_IN + '></\0]').test(chars.substr(-1)) : true)
		) {
			this.content_model = Models.PCDATA;
		} else {
			this.emitToken({type: 'Characters', data: '</'});
			this.state = 'data_state';
			return true;
		}
	}

	data = buffer.char();
	if(data == HTML5.EOF) {
		this.parse_error("expected-closing-tag-but-got-eof");
		this.emitToken({type: 'Characters', data: '</'});
		buffer.unget(data);
		this.state = 'data_state';
	} else if(HTML5.ASCII_LETTERS_R.test(data)) {
		this.current_token = {type: 'EndTag', name: data, data: []};
		this.state = 'tag_name_state';
	} else if(data == '>') {
		this.parse_error("expected-closing-tag-but-got-right-bracket");
		this.state = 'data_state';
	} else {
		// parse_error only declares one parameter; the second argument is ignored.
		this.parse_error("expected-closing-tag-but-got-char", {data: data}); // param 1 is datavars:
		buffer.unget(data);
		this.state = 'bogus_comment_state';
	}
	return true;
};
|
|
|
|
// Tag-name state: accumulates the name of the current start/end tag token.
//
// Fix: `data` was assigned without `var` and leaked as a global.
t.prototype.tag_name_state = function(buffer) {
	var data = buffer.char();
	if(data == HTML5.EOF) {
		this.parse_error('eof-in-tag-name');
		this.emit_current_token();
	} else if(HTML5.SPACE_CHARACTERS_R.test(data)) {
		this.state = 'before_attribute_name_state';
	} else if(HTML5.ASCII_LETTERS_R.test(data)) {
		// Grab the whole run of letters in one go.
		this.current_token.name += data + buffer.matchWhile(HTML5.ASCII_LETTERS);
	} else if(data == '>') {
		this.emit_current_token();
	} else if(data == '/') {
		this.process_solidus_in_tag(buffer);
		this.state = 'self_closing_tag_state';
	} else {
		this.current_token.name += data;
	}
	this.commit();

	return true;
};
|
|
|
|
// Before-attribute-name state: skips whitespace, ends the tag on '>' or EOF,
// and otherwise starts a new attribute with the character read.
t.prototype.before_attribute_name_state = function(buffer) {
	var ch = buffer.shift(1);

	if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
		buffer.matchWhile(HTML5.SPACE_CHARACTERS);
		return true;
	}

	if(ch == HTML5.EOF) {
		this.parse_error("expected-attribute-name-but-got-eof");
		this.emit_current_token();
		return true;
	}

	if(!HTML5.ASCII_LETTERS_R.test(ch)) {
		if(ch == '>') {
			this.emit_current_token();
			return true;
		}
		if(ch == '/') {
			this.state = 'self_closing_tag_state';
			return true;
		}
		if(ch == "'" || ch == '"' || ch == '=') {
			// Reported, but the character still starts an attribute name.
			this.parse_error("invalid-character-in-attribute-name");
		}
	}

	this.current_token.data.push({nodeName: ch, nodeValue: ""});
	this.state = 'attribute_name_state';
	return true;
};
|
|
|
|
// Attribute-name state: accumulates the name of the attribute being built
// and, when the name is complete, checks it against the attributes already
// on the token.
//
// Fixes: the duplicate check compared the last attribute *object* with a
// non-existent `.name` property of the earlier ones, so "duplicate-attribute"
// could never be reported; it also iterated an array with for..in through an
// implicit global `k`. Names are compared case-insensitively because
// normalize_token lower-cases attribute names when it merges them.
t.prototype.attribute_name_state = function(buffer) {
	var data = buffer.shift(1);
	var leavingThisState = true;
	var emitToken = false;

	if(data == '=') {
		this.state = 'before_attribute_value_state';
	} else if(data == HTML5.EOF) {
		this.parse_error("eof-in-attribute-name");
		this.state = 'data_state';
		emitToken = true;
	} else if(HTML5.ASCII_LETTERS_R.test(data)) {
		this.current_token.data.last().nodeName += data + buffer.matchWhile(HTML5.ASCII_LETTERS);
		leavingThisState = false;
	} else if(data == '>') {
		// XXX If we emit here the attributes are converted to a dict
		// without being checked and when the code below runs we error
		// because data is a dict not a list
		emitToken = true;
	} else if(HTML5.SPACE_CHARACTERS_R.test(data)) {
		this.state = 'after_attribute_name_state';
	} else if(data == '/') {
		if(!this.process_solidus_in_tag(buffer)) {
			this.state = 'before_attribute_name_state';
		}
	} else if(data == "'" || data == '"') {
		this.parse_error("invalid-character-in-attribute-name");
		this.current_token.data.last().nodeName += data;
		leavingThisState = false;
	} else {
		this.current_token.data.last().nodeName += data;
		leavingThisState = false;
	}

	if(leavingThisState) {
		// Attributes are not dropped at this stage. That happens when the
		// start tag token is emitted so values can still be safely appended
		// to attributes, but we do want to report the parse error in time.
		var attributes = this.current_token.data;
		var current = attributes[attributes.length - 1];
		if(this.lowercase_attr_name) {
			current.nodeName = current.nodeName.toLowerCase();
		}
		var current_name = current.nodeName.toLowerCase();
		for(var i = 0; i < attributes.length - 1; i++) {
			if(attributes[i].nodeName.toLowerCase() == current_name) {
				this.parse_error("duplicate-attribute");
				break; // Don't emit more than one of these errors
			}
		}
		if(emitToken) this.emit_current_token();
	} else {
		this.commit();
	}
	return true;
};
|
|
|
|
// After-attribute-name state: whitespace has followed an attribute name.
t.prototype.after_attribute_name_state = function(buffer) {
	var ch = buffer.shift(1);

	if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
		buffer.matchWhile(HTML5.SPACE_CHARACTERS);
		return true;
	}

	if(ch == '=') {
		this.state = 'before_attribute_value_state';
		return true;
	}

	if(ch == '>') {
		this.emit_current_token();
		return true;
	}

	if(ch == HTML5.EOF) {
		this.parse_error("expected-end-of-tag-but-got-eof");
		this.emit_current_token();
		return true;
	}

	// A letter, or anything else that is not '/', starts a new attribute.
	if(HTML5.ASCII_LETTERS_R.test(ch) || ch != '/') {
		this.current_token.data.push({nodeName: ch, nodeValue: ""});
		this.state = 'attribute_name_state';
	} else {
		this.state = 'self_closing_tag_state';
	}
	return true;
};
|
|
|
|
// Before-attribute-value state: decides whether the value that follows '='
// is double-quoted, single-quoted or unquoted.
//
// Fix: on EOF the token was emitted and the state was then forced back to
// 'attribute_value_unquoted_state', whose own EOF branch emits the current
// token again. emit_current_token already switches to 'data_state', so the
// override is removed.
t.prototype.before_attribute_value_state = function(buffer) {
	var data = buffer.shift(1);
	if(HTML5.SPACE_CHARACTERS_R.test(data)) {
		buffer.matchWhile(HTML5.SPACE_CHARACTERS);
	} else if(data == '"') {
		this.state = 'attribute_value_double_quoted_state';
	} else if(data == '&') {
		// Let the unquoted state handle the entity itself.
		this.state = 'attribute_value_unquoted_state';
		buffer.unget(data);
	} else if(data == "'") {
		this.state = 'attribute_value_single_quoted_state';
	} else if(data == '>') {
		this.emit_current_token();
	} else if(data == '=') {
		this.parse_error("equals-in-unquoted-attribute-value");
		this.current_token.data.last().nodeValue += data;
		this.state = 'attribute_value_unquoted_state';
	} else if(data == HTML5.EOF) {
		this.parse_error("expected-attribute-value-but-got-eof");
		this.emit_current_token();
	} else {
		this.current_token.data.last().nodeValue += data;
		this.state = 'attribute_value_unquoted_state';
	}

	return true;
};
|
|
|
|
// Attribute value inside double quotes.
t.prototype.attribute_value_double_quoted_state = function(buffer) {
	var ch = buffer.shift(1);

	if(ch == '"') {
		this.state = 'after_attribute_value_state';
		return true;
	}

	if(ch == '&') {
		this.process_entity_in_attribute(buffer);
		return true;
	}

	if(ch == HTML5.EOF) {
		this.parse_error("eof-in-attribute-value-double-quote");
		this.emit_current_token();
		return true;
	}

	// Append everything up to the next quote or ampersand.
	var attribute = this.current_token.data.last();
	attribute.nodeValue += ch + buffer.matchUntil('["&]');
	return true;
};
|
|
|
|
// Attribute value inside single quotes.
t.prototype.attribute_value_single_quoted_state = function(buffer) {
	var ch = buffer.shift(1);

	if(ch == "'") {
		this.state = 'after_attribute_value_state';
		return true;
	}

	if(ch == '&') {
		this.process_entity_in_attribute(buffer);
		return true;
	}

	if(ch == HTML5.EOF) {
		this.parse_error("eof-in-attribute-value-single-quote");
		this.emit_current_token();
		return true;
	}

	// Append everything up to the next quote or ampersand.
	var attribute = this.current_token.data.last();
	attribute.nodeValue += ch + buffer.matchUntil("['&]");
	return true;
};
|
|
|
|
// Attribute value without quotes; ends at whitespace or '>'.
t.prototype.attribute_value_unquoted_state = function(buffer) {
	var ch = buffer.shift(1);

	if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
		this.state = 'before_attribute_name_state';
		return true;
	}

	if(ch == '&') {
		this.process_entity_in_attribute(buffer);
		return true;
	}

	if(ch == '>') {
		this.emit_current_token();
		return true;
	}

	if(ch == '"' || ch == "'" || ch == '=') {
		// Reported, but the character is still kept as part of the value.
		this.parse_error("unexpected-character-in-unquoted-attribute-value");
		this.current_token.data.last().nodeValue += ch;
		return true;
	}

	if(ch == HTML5.EOF) {
		this.parse_error("eof-in-attribute-value-no-quotes");
		this.emit_current_token();
		return true;
	}

	var tail = buffer.matchUntil("[" + HTML5.SPACE_CHARACTERS_IN + '&<>' + "]");
	this.current_token.data.last().nodeValue += ch + tail;
	return true;
};
|
|
|
|
// After a quoted attribute value has been closed.
t.prototype.after_attribute_value_state = function(buffer) {
	var ch = buffer.shift(1);

	if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
		this.state = 'before_attribute_name_state';
		return true;
	}

	if(ch == '>') {
		this.emit_current_token();
		this.state = 'data_state';
		return true;
	}

	if(ch == '/') {
		this.state = 'self_closing_tag_state';
		return true;
	}

	if(ch == HTML5.EOF) {
		this.parse_error( "unexpected-EOF-after-attribute-value");
		this.emit_current_token();
		buffer.unget(ch);
		this.state = 'data_state';
		return true;
	}

	// Anything else: report it and reprocess the character as the start of
	// the next attribute.
	this.emitToken({type: 'ParseError', data: "unexpected-character-after-attribute-value"});
	buffer.unget(ch);
	this.state = 'before_attribute_name_state';
	return true;
};
|
|
|
|
// Self-closing-tag state, entered after a '/' inside a tag.
t.prototype.self_closing_tag_state = function(buffer) {
	var ch = buffer.shift(1);

	if(ch == '>') {
		this.current_token.self_closing = true;
		this.emit_current_token();
		this.state = 'data_state';
		return true;
	}

	// Both remaining cases report an error and push the character back;
	// they differ only in the message and the state to continue in.
	var at_eof = ch == HTML5.EOF;
	this.parse_error(at_eof ? "eof-in-tag-name" : "expected-self-closing-tag");
	buffer.unget(ch);
	this.state = at_eof ? 'data_state' : 'before_attribute_name_state';
	return true;
};
|
|
|
|
// Bogus-comment state: everything up to the next '>' becomes a comment token.
t.prototype.bogus_comment_state = function(buffer) {
	var text = buffer.matchUntil('>');
	// Read one more character after the matched text and discard it.
	buffer.char();
	this.emitToken({type: 'Comment', data: text});
	this.state = 'data_state';
	return true;
};
|
|
|
|
// Markup-declaration-open state, entered after "<!": recognises "--"
// (comment) and "DOCTYPE" (case-insensitive); anything else becomes a bogus
// comment.
t.prototype.markup_declaration_open_state = function(buffer) {
	var lead = buffer.shift(2);

	if(lead == '--') {
		this.current_token = {type: 'Comment', data: ''};
		this.state = 'comment_start_state';
		return true;
	}

	var rest = buffer.shift(5);
	if(rest == HTML5.EOF || lead == HTML5.EOF) {
		this.parse_error("expected-dashes-or-doctype");
		this.state = 'bogus_comment_state';
		if(lead != HTML5.EOF) buffer.unget(lead);
		return true;
	}

	// Check for EOF better -- FIXME
	var keyword = lead + rest;
	if(keyword.toUpperCase() == 'DOCTYPE') {
		this.current_token = {type: 'Doctype', name: '', publicId: null, systemId: null, correct: true};
		this.state = 'doctype_state';
		return true;
	}

	this.parse_error("expected-dashes-or-doctype");
	buffer.unget(keyword);
	this.state = 'bogus_comment_state';
	return true;
};
|
|
|
|
// Comment-start state, entered right after "<!--".
//
// Fix: the error code for "<!-->" was spelled "incorrect comment" here but
// "incorrect-comment" in comment_start_dash_state; every other code in this
// file is hyphenated, so the hyphenated spelling is used.
t.prototype.comment_start_state = function(buffer) {
	var data = buffer.shift(1);
	if(data == '-') {
		this.state = 'comment_start_dash_state';
	} else if(data == '>') {
		this.parse_error("incorrect-comment");
		this.emitToken(this.current_token);
		this.state = 'data_state';
	} else if(data == HTML5.EOF) {
		this.parse_error("eof-in-comment");
		this.emitToken(this.current_token);
		this.state = 'data_state';
	} else {
		// Append everything up to the next '-'.
		this.current_token.data += data + buffer.matchUntil('-');
		this.state = 'comment_state';
	}
	return true;
};
|
|
|
|
// Comment-start-dash state, entered after "<!---".
t.prototype.comment_start_dash_state = function(buffer) {
	var ch = buffer.shift(1);

	if(ch == '-') {
		this.state = 'comment_end_state';
		return true;
	}

	if(ch == '>' || ch == HTML5.EOF) {
		// Either way the comment ends here; only the error code differs.
		this.parse_error(ch == '>' ? "incorrect-comment" : "eof-in-comment");
		this.emitToken(this.current_token);
		this.state = 'data_state';
		return true;
	}

	// The dash that led here belongs to the comment text after all.
	this.current_token.data += '-' + ch + buffer.matchUntil('-');
	this.state = 'comment_state';
	return true;
};
|
|
|
|
// Comment state: accumulates comment text until a '-' or EOF.
t.prototype.comment_state = function(buffer) {
	var ch = buffer.shift(1);

	if(ch == '-') {
		this.state = 'comment_end_dash_state';
		return true;
	}

	if(ch == HTML5.EOF) {
		this.parse_error("eof-in-comment");
		this.emitToken(this.current_token);
		this.state = 'data_state';
		return true;
	}

	this.current_token.data += ch + buffer.matchUntil('-');
	return true;
};
|
|
|
|
// Comment-end-dash state: one '-' has been seen inside a comment.
t.prototype.comment_end_dash_state = function(buffer) {
	var ch = buffer.char();

	if(ch == '-') {
		this.state = 'comment_end_state';
		return true;
	}

	if(ch == HTML5.EOF) {
		this.parse_error("eof-in-comment-end-dash");
		this.emitToken(this.current_token);
		this.state = 'data_state';
		return true;
	}

	this.current_token.data += '-' + ch + buffer.matchUntil('-');
	// Consume the next character which is either a "-" or an :EOF as
	// well so if there's a "-" directly after the "-" we go nicely to
	// the "comment end state" without emitting a ParseError there.
	buffer.char();
	return true;
};
|
|
|
|
// Comment-end state: "--" has been seen inside a comment.
t.prototype.comment_end_state = function(buffer) {
	var ch = buffer.shift(1);

	if(ch == '>') {
		this.emitToken(this.current_token);
		this.state = 'data_state';
		return true;
	}

	if(ch == '-') {
		// A third dash: keep it as comment text and stay in this state.
		this.parse_error("unexpected-dash-after-double-dash-in-comment");
		this.current_token.data += ch;
		return true;
	}

	if(ch == HTML5.EOF) {
		this.parse_error("eof-in-comment-double-dash");
		this.emitToken(this.current_token);
		this.state = 'data_state';
		return true;
	}

	// XXX
	this.parse_error("unexpected-char-in-comment");
	this.current_token.data += '--' + ch;
	this.state = 'comment_state';
	return true;
};
|
|
|
|
// Doctype state, entered right after "<!DOCTYPE". A missing space is reported
// and the character is pushed back; either way the name state comes next.
t.prototype.doctype_state = function(buffer) {
	var ch = buffer.shift(1);
	if(!HTML5.SPACE_CHARACTERS_R.test(ch)) {
		this.parse_error("need-space-after-doctype");
		buffer.unget(ch);
	}
	this.state = 'before_doctype_name_state';
	return true;
};
|
|
|
|
// Before-doctype-name state: skips whitespace and starts the doctype name.
t.prototype.before_doctype_name_state = function(buffer) {
	var ch = buffer.shift(1);

	if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
		// Stay in this state until something other than whitespace shows up.
		return true;
	}

	if(ch == '>' || ch == HTML5.EOF) {
		// The doctype ended before a name was seen.
		this.parse_error(ch == '>'
			? "expected-doctype-name-but-got-right-bracket"
			: "expected-doctype-name-but-got-eof");
		this.current_token.correct = false;
		this.emit_current_token();
		this.state = 'data_state';
		return true;
	}

	this.current_token.name = ch;
	this.state = 'doctype_name_state';
	return true;
};
|
|
|
|
// Doctype-name state: accumulates the doctype name one character at a time.
t.prototype.doctype_name_state = function(buffer) {
	var ch = buffer.shift(1);

	if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
		// Whatever follows the name is handled by the bogus-doctype state.
		this.state = 'bogus_doctype_state';
		return true;
	}

	if(ch == '>') {
		this.emit_current_token();
		this.state = 'data_state';
		return true;
	}

	if(ch == HTML5.EOF) {
		this.current_token.correct = false;
		buffer.unget(ch);
		this.parse_error("eof-in-doctype");
		this.emit_current_token();
		this.state = 'data_state';
		return true;
	}

	this.current_token.name += ch;
	return true;
};
|
|
/*
|
|
data += buffer.shift(5);
|
|
var token = data.toLowerCase();
|
|
if(token == 'public') {
|
|
this.state = 'before_doctype_public_identifier_state';
|
|
} else if(token == 'system') {
|
|
this.state = 'before_doctype_system_identifier_state';
|
|
} else {
|
|
buffer.unget(data);
|
|
this.parse_error("expected-space-or-right-bracket-in-doctype", {data: data});
|
|
this.state = 'bogus_doctype_state';
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
*/
|
|
|
|
// Bogus-doctype state: marks the doctype as incorrect and skips characters
// until '>' or EOF.
//
// Fix: EOF used to throw Error("Unimplemented!"), aborting tokenization of any
// document that ends inside a doctype. It is now handled the same way
// doctype_name_state handles EOF.
t.prototype.bogus_doctype_state = function(buffer) {
	var data = buffer.shift(1);
	this.current_token.correct = false;
	if(data == '>') {
		this.emit_current_token();
		this.state = 'data_state';
	} else if(data == HTML5.EOF) {
		buffer.unget(data);
		this.parse_error("eof-in-doctype");
		this.emit_current_token();
		this.state = 'data_state';
	}
	// Any other character is dropped; stay in this state.
	return true;
};
|
|
|
|
// Report a parse error by emitting a 'ParseError' token whose data is the
// given message code.
t.prototype.parse_error = function(message) {
	var error_token = {type: 'ParseError', data: message};
	this.emitToken(error_token);
};
|
|
|
|
// Emit the token currently being built and return to the data state.
//
// For a "script" end tag, the text collected in `script_buffer` is emitted as
// a single Characters token first.
//
// Fixes: the tag name was compared with "script" before being lower-cased
// (normalize_token lower-cases it only later), so `</SCRIPT>` dropped the
// buffered script text; and a null/undefined buffer was emitted as character
// data.
t.prototype.emit_current_token = function() {
	var tok = this.current_token;
	var is_end_tag = tok.type == 'EndTag';

	if(is_end_tag && tok.self_closing) {
		this.parse_error('self-closing-end-tag');
	}

	if(is_end_tag && tok.name.toLowerCase() == "script") {
		if(this.script_buffer != null) {
			this.emitToken({ type: 'Characters', data: this.script_buffer });
		}
		this.script_buffer = null;
	}

	this.emitToken(tok);
	this.state = 'data_state';
};
|
|
|
|
// Normalize a token in place before it is emitted and return it:
//  - 'EmptyTag' tokens become 'StartTag' (with a parse error when the element
//    is not listed in HTML5.VOID_ELEMENTS);
//  - tag names and attribute names are lower-cased;
//  - attributes with the same (lower-cased) name are merged, the first
//    occurrence winning;
//  - attributes on an end tag are reported as a parse error.
//
// Fix: the void-element lookup ran before the name was lower-cased, so
// `<BR/>` was reported as "incorrectly-placed-solidus".
// NOTE(review): assumes HTML5.VOID_ELEMENTS holds lower-case names - confirm.
t.prototype.normalize_token = function(token) {
	if(token.type == 'EmptyTag') {
		if(HTML5.VOID_ELEMENTS.indexOf(token.name.toLowerCase()) == -1) {
			this.parse_error('incorrectly-placed-solidus');
		}
		token.type = 'StartTag';
	}

	if(token.type == 'StartTag') {
		token.name = token.name.toLowerCase();
		if(token.data.length != 0) {
			var data = {};
			// Walk the attributes last-to-first so that an earlier duplicate
			// overwrites a later one.
			token.data.reverse();
			token.data.forEach(function(e) {
				data[e.nodeName.toLowerCase()] = e.nodeValue;
			});
			token.data = [];
			for(var k in data) {
				token.data.push({nodeName: k, nodeValue: data[k]});
			}
		}
	} else if(token.type == 'EndTag') {
		if(token.data.length != 0) this.parse_error('attributes-in-end-tag');
		token.name = token.name.toLowerCase();
	}

	return token;
};
|
|
|