mediawiki-extensions-Visual.../modules/parser/html5/tokenizer.js

require('../core-upgrade');
var HTML5 = require('../html5');
var events = require('events');
var Buffer = require('./buffer').Buffer;
var Models = HTML5.Models;

/**
 * Collect the enumerable property names of `h` (inherited ones included)
 * in `for...in` enumeration order.
 *
 * @param {Object} h object whose property names are wanted
 * @returns {Array} the property names as strings
 */
function keys(h) {
	var names = [];
	var name;
	for(name in h) {
		names[names.length] = name;
	}
	return names;
}

/**
 * HTML5 tokenizer. Reads text from `input` and emits 'token' events followed
 * by a final 'end' event.
 *
 * @param input     either an EventEmitter delivering 'data'/'end' events, or a
 *                  value that is pushed through as a single 'data' event when
 *                  tokenize() is called
 * @param document  stored on the instance as `this.document`
 */
var t = HTML5.Tokenizer = function HTML5Tokenizer(input, document) {
	// State function currently driving tokenization; replaced through newState().
	var state;
	// Input buffer that every state function reads from.
	var buffer = new Buffer();
	// True while inside a "<!--" ... "-->" escape in CDATA/RCDATA/SCRIPT_CDATA content.
	var escapeFlag = false;
	// Tail of the recently consumed characters; data_state uses it to spot "<!--" and "-->".
	var lastFourChars = '';
	// Tag, comment or doctype token currently under construction.
	var current_token = null;
	// Text accumulated while content_model is SCRIPT_CDATA; null when nothing is buffered.
	var script_buffer = null;
	// Content model flag; also exposed on the instance through the content_model accessor.
	var content_model = Models.PCDATA;
	/**
	 * Data state: consumes one character (plus, for plain text, the run that
	 * follows it) and either emits a character token or switches to the state
	 * that handles markup.
	 *
	 * While the content model is CDATA, RCDATA or SCRIPT_CDATA the last four
	 * consumed characters are tracked so that "<!--" sets the escape flag and
	 * "-->" clears it.
	 *
	 * @param buffer input buffer to read from
	 * @returns {boolean} false once the EOF token has been emitted, true otherwise
	 */
	function data_state(buffer) {
		var c = buffer.char();
		// Only these content models can contain "<!--" ... "-->" escapes.
		var tracksEscapes = content_model == Models.CDATA || content_model == Models.RCDATA || content_model == Models.SCRIPT_CDATA;

		if(c !== HTML5.EOF && tracksEscapes) {
			lastFourChars += c;
			if(lastFourChars.length >= 4) {
				lastFourChars = lastFourChars.substr(-4);
			}
		}

		// Make sure script text has somewhere to accumulate (see emitToken).
		if(content_model == Models.SCRIPT_CDATA && script_buffer == null) {
			script_buffer = '';
		}

		if(c === HTML5.EOF) {
			emitToken(HTML5.EOF_TOK);
			buffer.commit();
			return false;
		} else if(c === '\0' && (content_model == Models.SCRIPT_CDATA || content_model == Models.PLAINTEXT || content_model == Models.RAWTEXT || content_model == Models.RCDATA)) {
			// NUL becomes U+FFFD REPLACEMENT CHARACTER in these content models.
			emitToken({type: 'Characters', data: "\ufffd"});
			buffer.commit();
		} else if(c == '&' && (content_model == Models.PCDATA || content_model == Models.RCDATA) && !escapeFlag) {
			newState(entity_data_state);
		} else if(c == '-' && tracksEscapes && !escapeFlag && lastFourChars == '<!--') {
			escapeFlag = true;
			emitToken({type: 'Characters', data: c});
			buffer.commit();
		} else if(c == '<' && !escapeFlag && (content_model == Models.PCDATA || content_model == Models.RCDATA || content_model == Models.CDATA || content_model == Models.SCRIPT_CDATA)) {
			newState(tag_open_state);
		} else if(c == '>' && escapeFlag && tracksEscapes && lastFourChars.match(/-->$/)) {
			escapeFlag = false;
			emitToken({type: 'Characters', data: c});
			buffer.commit();
		} else if(HTML5.SPACE_CHARACTERS_R.test(c)) {
			emitToken({type: 'SpaceCharacters', data: c + buffer.matchWhile(HTML5.SPACE_CHARACTERS)});
			buffer.commit();
		} else {
			var o = buffer.matchUntil("[&<>-]");
			var run = (o !== HTML5.EOF) ? o : '';
			emitToken({type: 'Characters', data: c + run});
			// When escapes are tracked, c was already recorded at the top of this
			// function. Recording it again here would turn "<!-" into "<!!-" and
			// "<!--" would never be recognised, so only the run that follows c
			// is appended in that case.
			lastFourChars += tracksEscapes ? run : c + run;
			lastFourChars = lastFourChars.slice(-4);
			buffer.commit();
		}
		return true;
	}

	// Entity data state: entered from data_state after an "&". Emits the
	// decoded entity, or a literal "&" when consume_entity() yields nothing,
	// and then hands control back to data_state.
	var entity_data_state = function entity_data_state(buffer) {
		var decoded = consume_entity(buffer);
		emitToken({type: 'Characters', data: decoded ? decoded : '&'});
		newState(data_state);
		return true;
	};

	// Start tokenizing. Only does something when a pump function was installed,
	// i.e. when the input was not an EventEmitter that delivers data by itself.
	this.tokenize = function() {
		if(!this.pump) return;
		this.pump();
	};

	// Normalizes a token and either emits it as a 'token' event or, for text
	// tokens seen in SCRIPT_CDATA before the buffer has reached EOF, appends
	// its data to script_buffer instead.
	var emitToken = function emitToken(tok) {
		tok = normalize_token(tok);

		var bufferAsScript = content_model == Models.SCRIPT_CDATA;
		if(bufferAsScript) {
			bufferAsScript = tok.type == 'Characters' || tok.type == 'SpaceCharacters';
		}
		if(bufferAsScript) {
			bufferAsScript = !buffer.eof;
		}

		if(bufferAsScript) {
			HTML5.debug('tokenizer.addScriptData', tok);
			script_buffer += tok.data;
			return;
		}

		HTML5.debug('tokenizer.token', tok);
		this.emit('token', tok);
	}.bind(this);

	/**
	 * Try to consume a character reference; the leading "&" has already been
	 * read by the caller.
	 *
	 * @param buffer     input buffer positioned just after the "&"
	 * @param from_attr  truthy when the reference occurs inside an attribute
	 *                   value; a named entity without a trailing ";" is then
	 *                   left undecoded if a letter or digit follows it
	 * @returns the decoded text, null when nothing was decoded (the consumed
	 *          characters are pushed back), or false when the buffer is at EOF
	 */
	function consume_entity(buffer, from_attr) {
		var result = null;
		var chars = buffer.char();
		if(chars === HTML5.EOF) return false;
		if(chars.match(HTML5.SPACE_CHARACTERS) || chars == '<' || chars == '&') {
			// Not a character reference at all.
			buffer.unget(chars);
		} else if(chars[0] == '#') { // Maybe a numeric entity
			var c = buffer.shift(2);
			if(c === HTML5.EOF) {
				buffer.unget(chars);
				return false;
			}
			chars += c;
			if(chars[1] && chars[1].toLowerCase() == 'x' && HTML5.HEX_DIGITS_R.test(chars[2])) {
				// Hex entity: give back the first digit, the "#x" prefix stays consumed
				buffer.unget(chars[2]);
				result = consume_numeric_entity(buffer, true);
			} else if(chars[1] && HTML5.DIGITS_R.test(chars[1])) {
				// Decimal entity: give back everything after the "#"
				buffer.unget(chars.slice(1));
				result = consume_numeric_entity(buffer, false);
			} else {
				// Not numeric
				buffer.unget(chars);
				parse_error("expected-numeric-entity");
			}
		} else {
			// Named entity: keep reading while the text is still a prefix of
			// some known entity name, remembering the longest complete name.
			var candidates = keys(HTML5.ENTITIES).filter(function(e) {
				return e[0] == chars[0];
			});
			var entityName = null;
			while(true) {
				var narrowed = candidates.filter(function(e) {
					return e.indexOf(chars) == 0;
				});
				if(narrowed.length == 0) break;
				candidates = narrowed;

				var next = buffer.char();
				if(next === HTML5.EOF) break;
				chars += next;

				if(HTML5.ENTITIES[chars]) {
					entityName = chars;
					if(entityName[entityName.length - 1] == ';') break;
				}
			}

			if(entityName) {
				result = HTML5.ENTITIES[entityName];

				// First character read past the entity name ('' if there is none).
				var following = chars.substr(entityName.length, 1);
				var unterminated = entityName[entityName.length - 1] != ';';
				if(unterminated && from_attr && (HTML5.ASCII_LETTERS_R.test(following) || HTML5.DIGITS_R.test(following))) {
					// In an attribute value, "&name" followed by an alphanumeric
					// is ordinary text: push everything back and keep the "&".
					buffer.unget(chars);
					result = '&';
				} else {
					buffer.unget(chars.slice(entityName.length));
				}
			} else {
				parse_error("expected-named-entity");
				buffer.unget(chars);
			}
		}

		return result;
	}

	/**
	 * Map the code point of a numeric character reference to the code point
	 * that must be used in its place.
	 *
	 * @param {number} c code point parsed from the reference
	 * @returns {number|undefined} the code point to use when `c` is one that
	 *          triggers a parse error (this may be `c` itself), or undefined
	 *          when `c` needs no special treatment
	 */
	function replaceEntityNumbers(c) {
		switch(c) {
			case 0x00: return 0xFFFD; // REPLACEMENT CHARACTER
			case 0x0D: return 0x000D; // CARRIAGE RETURN (CR)
			case 0x80: return 0x20AC; // EURO SIGN
			case 0x81: return 0x0081; // <control>
			case 0x82: return 0x201A; // SINGLE LOW-9 QUOTATION MARK
			case 0x83: return 0x0192; // LATIN SMALL LETTER F WITH HOOK
			case 0x84: return 0x201E; // DOUBLE LOW-9 QUOTATION MARK
			case 0x85: return 0x2026; // HORIZONTAL ELLIPSIS
			case 0x86: return 0x2020; // DAGGER
			case 0x87: return 0x2021; // DOUBLE DAGGER
			case 0x88: return 0x02C6; // MODIFIER LETTER CIRCUMFLEX ACCENT
			case 0x89: return 0x2030; // PER MILLE SIGN
			case 0x8A: return 0x0160; // LATIN CAPITAL LETTER S WITH CARON
			case 0x8B: return 0x2039; // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
			case 0x8C: return 0x0152; // LATIN CAPITAL LIGATURE OE
			case 0x8D: return 0x008D; // <control>
			case 0x8E: return 0x017D; // LATIN CAPITAL LETTER Z WITH CARON
			case 0x8F: return 0x008F; // <control>
			case 0x90: return 0x0090; // <control>
			case 0x91: return 0x2018; // LEFT SINGLE QUOTATION MARK
			case 0x92: return 0x2019; // RIGHT SINGLE QUOTATION MARK
			case 0x93: return 0x201C; // LEFT DOUBLE QUOTATION MARK
			case 0x94: return 0x201D; // RIGHT DOUBLE QUOTATION MARK
			case 0x95: return 0x2022; // BULLET
			case 0x96: return 0x2013; // EN DASH
			case 0x97: return 0x2014; // EM DASH
			case 0x98: return 0x02DC; // SMALL TILDE
			case 0x99: return 0x2122; // TRADE MARK SIGN
			case 0x9A: return 0x0161; // LATIN SMALL LETTER S WITH CARON
			case 0x9B: return 0x203A; // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
			case 0x9C: return 0x0153; // LATIN SMALL LIGATURE OE
			case 0x9D: return 0x009D; // <control>
			case 0x9E: return 0x017E; // LATIN SMALL LETTER Z WITH CARON
			case 0x9F: return 0x0178; // LATIN CAPITAL LETTER Y WITH DIAERESIS
		}

		// Surrogates and anything beyond the last Unicode code point are
		// replaced; U+10FFFF itself is still a (non-character) code point.
		if((c >= 0xD800 && c <= 0xDFFF) || c > 0x10FFFF) {
			return 0xFFFD;
		}

		// Control characters and non-characters are kept but reported. The
		// mask matches U+nFFFE and U+nFFFF in every plane.
		var isControl = (c >= 0x0001 && c <= 0x0008) || c == 0x000B || (c >= 0x000E && c <= 0x001F) || (c >= 0x007F && c <= 0x009F);
		var isNonCharacter = (c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE;
		if(isControl || isNonCharacter) {
			return c;
		}
		// Everything else needs no replacement: fall through and return undefined.
	}

	/**
	 * Consume the digits of a numeric character reference and return the
	 * character it denotes.
	 *
	 * @param buffer         input buffer positioned at the first digit
	 * @param {boolean} hex  true for a hexadecimal reference, false for decimal
	 * @returns {string} the referenced character; code points above U+FFFF are
	 *          returned as a UTF-16 surrogate pair
	 */
	function consume_numeric_entity(buffer, hex) {
		var allowed = hex ? HTML5.HEX_DIGITS_R : HTML5.DIGITS_R;
		var radix = hex ? 16 : 10;

		var digits = '';
		var c = buffer.char();
		while(c !== HTML5.EOF && allowed.test(c)) {
			digits += c;
			c = buffer.char();
		}

		var codePoint = parseInt(digits, radix);

		var replacement = replaceEntityNumbers(codePoint);
		if(replacement) {
			parse_error("invalid-numeric-entity-replaced", {old: codePoint, new: replacement});
			codePoint = replacement;
		}

		var result;
		if(codePoint > 0xFFFF) {
			// String.fromCharCode keeps only the low 16 bits of each argument,
			// so astral code points have to be split into a surrogate pair.
			var offset = codePoint - 0x10000;
			result = String.fromCharCode(0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF));
		} else {
			result = String.fromCharCode(codePoint);
		}

		// c is the first character after the digits; anything but ";" belongs
		// to whatever follows the reference and is pushed back.
		if(c !== ';') {
			parse_error("numeric-entity-without-semicolon");
			buffer.unget(c);
		}

		return result;
	}

	/**
	 * Handle an "&" inside an attribute value: append the decoded entity, or a
	 * literal "&" when nothing could be decoded, to the value of the attribute
	 * currently being built.
	 *
	 * @param buffer input buffer positioned just after the "&"
	 */
	function process_entity_in_attribute(buffer) {
		// Pass the from_attr flag of consume_entity so that it knows the
		// reference sits inside an attribute value.
		var entity = consume_entity(buffer, true);
		current_token.data.last().nodeValue += entity ? entity : '&';
	}

	/**
	 * Handle a "/" met inside a tag. When the tag under construction is a
	 * start tag and the next character is ">", the token is turned into an
	 * 'EmptyTag'; any other placement is reported as a parse error.
	 *
	 * @param buffer input buffer; only peeked at, nothing is consumed
	 * @returns {boolean} true when the solidus closes a start tag
	 */
	function process_solidus_in_tag(buffer) {
		var upcoming = buffer.peek(1);
		var closesStartTag = current_token.type == 'StartTag' && upcoming == '>';
		if(!closesStartTag) {
			parse_error("incorrectly-placed-solidus");
			return false;
		}
		current_token.type = 'EmptyTag';
		return true;
	}

	/**
	 * Tag open state: decides what the character after a "<" starts, namely a
	 * start tag, an end tag, a markup declaration, a bogus comment or plain text.
	 *
	 * @param buffer input buffer positioned just after the "<"
	 * @returns {boolean} always true
	 */
	function tag_open_state(buffer) {
		var ch = buffer.char();

		if(content_model != Models.PCDATA) {
			// The content model is RCDATA, CDATA or SCRIPT_CDATA here (this state
			// is never entered with the PLAINTEXT flag), so only "</" is special.
			if(ch === '/') {
				newState(close_tag_open_state);
			} else {
				emitToken({type: 'Characters', data: "<"});
				buffer.unget(ch);
				newState(data_state);
			}
			return true;
		}

		if(ch === HTML5.EOF) {
			parse_error("bare-less-than-sign-at-eof");
			emitToken({type: 'Characters', data: '<'});
			newState(data_state);
			return true;
		}

		if(ch !== HTML5.EOF && HTML5.ASCII_LETTERS_R.test(ch)) {
			current_token = {type: 'StartTag', name: ch, data: []};
			newState(tag_name_state);
			return true;
		}

		if(ch == '!') {
			newState(markup_declaration_open_state);
			return true;
		}

		if(ch == '/') {
			newState(close_tag_open_state);
			return true;
		}

		if(ch == '>') {
			// XXX In theory it could be something besides a tag name. But
			// do we really care?
			parse_error("expected-tag-name-but-got-right-bracket");
			emitToken({type: 'Characters', data: "<>"});
			newState(data_state);
			return true;
		}

		if(ch == '?') {
			// XXX In theory it could be something besides a tag name. But
			// do we really care?
			parse_error("expected-tag-name-but-got-question-mark");
			buffer.unget(ch);
			newState(bogus_comment_state);
			return true;
		}

		// XXX Anything else: the "<" was just text.
		parse_error("expected-tag-name");
		emitToken({type: 'Characters', data: "<"});
		buffer.unget(ch);
		newState(data_state);
		return true;
	}

	/**
	 * Close tag open state: handles what follows "</".
	 *
	 * In RCDATA, CDATA and SCRIPT_CDATA content the "</" only starts an end
	 * tag when it is followed by the name of the current token and then a
	 * space character, ">", "/", "<", NUL or the end of the available input.
	 * Otherwise "</" is emitted as text.
	 *
	 * @param buffer input buffer positioned just after the "</"
	 * @returns {boolean} always true
	 */
	function close_tag_open_state(buffer) {
		var restricted = content_model == Models.RCDATA || content_model == Models.CDATA || content_model == Models.SCRIPT_CDATA;
		if(restricted) {
			// Look ahead one character further than the tag name is long, then
			// put everything back.
			var lookahead = '';
			if(current_token) {
				for(var remaining = current_token.name.length + 1; remaining > 0; remaining--) {
					var c = buffer.char();
					if(c === HTML5.EOF) break;
					lookahead += c;
				}
				buffer.unget(lookahead);
			}

			var isMatchingEndTag = false;
			if(current_token) {
				var nameLength = current_token.name.length;
				if(current_token.name.toLowerCase() == lookahead.slice(0, nameLength).toLowerCase()) {
					isMatchingEndTag = lookahead.length > nameLength
						? new RegExp('[' + HTML5.SPACE_CHARACTERS_IN + '></\0]').test(lookahead.substr(-1))
						: true;
				}
			}

			if(!isMatchingEndTag) {
				emitToken({type: 'Characters', data: '</'});
				newState(data_state);
				return true;
			}
			content_model = Models.PCDATA;
		}

		var ch = buffer.char();
		if(ch === HTML5.EOF) {
			parse_error("expected-closing-tag-but-got-eof");
			emitToken({type: 'Characters', data: '</'});
			buffer.unget(ch);
			newState(data_state);
			return true;
		}
		if(HTML5.ASCII_LETTERS_R.test(ch)) {
			current_token = {type: 'EndTag', name: ch, data: []};
			newState(tag_name_state);
			return true;
		}
		if(ch == '>') {
			parse_error("expected-closing-tag-but-got-right-bracket");
			newState(data_state);
			return true;
		}

		// Anything else turns the rest of the construct into a bogus comment.
		parse_error("expected-closing-tag-but-got-char", {data: ch}); // param 1 is datavars:
		buffer.unget(ch);
		newState(bogus_comment_state);
		return true;
	}

	/**
	 * Tag name state: extends the name of the tag token under construction
	 * until a space character, "/", ">" or EOF ends it. The buffer is
	 * committed at the end of every call.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function tag_name_state(buffer) {
		var ch = buffer.char();

		if(ch === HTML5.EOF) {
			parse_error('eof-in-tag-name');
			emit_current_token();
		} else if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
			newState(before_attribute_name_state);
		} else if(HTML5.ASCII_LETTERS_R.test(ch)) {
			// Take the whole run of letters in one go.
			var letters = buffer.matchWhile(HTML5.ASCII_LETTERS);
			if(letters === HTML5.EOF) {
				current_token.name += ch;
				buffer.unget(letters);
				newState(data_state);
			} else {
				current_token.name += ch + letters;
			}
		} else if(ch == '>') {
			emit_current_token();
		} else if(ch == '/') {
			process_solidus_in_tag(buffer);
			newState(self_closing_tag_state);
		} else {
			current_token.name += ch;
		}

		buffer.commit();
		return true;
	}

	/**
	 * Before attribute name state: skips space characters and then either
	 * ends the tag or starts a new attribute with the character just read.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function before_attribute_name_state(buffer) {
		var ch = buffer.shift(1);

		// Open a new, still valueless attribute whose name starts with ch.
		function beginAttribute() {
			current_token.data.push({nodeName: ch, nodeValue: ""});
			newState(attribute_name_state);
		}

		if(ch === HTML5.EOF) {
			parse_error("expected-attribute-name-but-got-eof");
			emit_current_token();
		} else if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
			buffer.matchWhile(HTML5.SPACE_CHARACTERS);
		} else if(HTML5.ASCII_LETTERS_R.test(ch)) {
			beginAttribute();
		} else if(ch == '>') {
			emit_current_token();
		} else if(ch == '/') {
			newState(self_closing_tag_state);
		} else {
			// Quotes and "=" are reported, but like every other character they
			// still become the first character of an attribute name.
			if(ch == "'" || ch == '"' || ch == '=') {
				parse_error("invalid-character-in-attribute-name");
			}
			beginAttribute();
		}
		return true;
	}

	/**
	 * Attribute name state: extends the name of the attribute currently being
	 * built. When the name is complete, a "duplicate-attribute" parse error is
	 * reported if an earlier attribute of the same tag has the same name.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function attribute_name_state(buffer) {
		var data = buffer.shift(1);
		var leavingThisState = true;
		// Whether the token has to be emitted once the duplicate check is done.
		var shouldEmit = false;
		if(data === HTML5.EOF) {
			parse_error("eof-in-attribute-name");
			newState(data_state);
			shouldEmit = true;
		} else if(data == '=') {
			newState(before_attribute_value_state);
		} else if(HTML5.ASCII_LETTERS_R.test(data)) {
			current_token.data.last().nodeName += data + buffer.matchWhile(HTML5.ASCII_LETTERS);
			leavingThisState = false;
		} else if(data == '>') {
			// Emitting is deferred until after the duplicate check below,
			// because emitting rewrites the attribute list.
			shouldEmit = true;
		} else if(HTML5.SPACE_CHARACTERS_R.test(data)) {
			newState(after_attribute_name_state);
		} else if(data == '/') {
			if(!process_solidus_in_tag(buffer)) {
				newState(before_attribute_name_state);
			}
		} else if(data == "'" || data == '"') {
			parse_error("invalid-character-in-attribute-name");
			current_token.data.last().nodeName += data;
			leavingThisState = false;
		} else {
			current_token.data.last().nodeName += data;
			leavingThisState = false;
		}

		if(leavingThisState) {
			// Attributes are not dropped at this stage. That happens when the
			// start tag token is emitted so values can still be safely appended
			// to attributes, but we do want to report the parse error in time.
			var attributes = current_token.data;
			var finished = attributes.last();
			// NOTE(review): state functions are invoked as plain functions, so
			// `this` is presumably not the tokenizer here and the flag is never
			// set; the check is kept (guarded) to preserve existing behaviour.
			if(this && this.lowercase_attr_name) {
				finished.nodeName = finished.nodeName.toLowerCase();
			}
			// Compare names case-insensitively: normalize_token lower-cases
			// attribute names when it collapses duplicates.
			var finishedName = finished.nodeName.toLowerCase();
			for(var i = 0; i < attributes.length - 1; i++) {
				if(attributes[i].nodeName.toLowerCase() == finishedName) {
					parse_error("duplicate-attribute");
					break; // Don't emit more than one of these errors
				}
			}
			if(shouldEmit) emit_current_token();
		} else {
			buffer.commit();
		}
		return true;
	}

	/**
	 * After attribute name state: after the space characters that follow an
	 * attribute name, decides whether a value, another attribute or the end
	 * of the tag comes next.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function after_attribute_name_state(buffer) {
		var ch = buffer.shift(1);

		// Start another attribute whose name begins with ch.
		function beginAttribute() {
			current_token.data.push({nodeName: ch, nodeValue: ""});
			newState(attribute_name_state);
		}

		if(ch === HTML5.EOF) {
			parse_error("expected-end-of-tag-but-got-eof");
			emit_current_token();
			return true;
		}
		if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
			buffer.matchWhile(HTML5.SPACE_CHARACTERS);
			return true;
		}
		if(ch == '=') {
			newState(before_attribute_value_state);
			return true;
		}
		if(ch == '>') {
			emit_current_token();
			return true;
		}
		if(HTML5.ASCII_LETTERS_R.test(ch)) {
			beginAttribute();
			return true;
		}
		if(ch == '/') {
			newState(self_closing_tag_state);
			return true;
		}
		beginAttribute();
		return true;
	}

	/**
	 * Before attribute value state: entered after the "=" of an attribute.
	 * Skips space characters and selects the quoted or unquoted value state.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function before_attribute_value_state(buffer) {
		var data = buffer.shift(1);
		if(data === HTML5.EOF) {
			parse_error("expected-attribute-value-but-got-eof");
			// emit_current_token() already switches back to data_state. Moving
			// on to the unquoted value state afterwards would make that state
			// see EOF too and emit the very same token a second time.
			emit_current_token();
		} else if(HTML5.SPACE_CHARACTERS_R.test(data)) {
			buffer.matchWhile(HTML5.SPACE_CHARACTERS);
		} else if(data == '"') {
			newState(attribute_value_double_quoted_state);
		} else if(data == '&') {
			// Let the unquoted value state process the entity.
			newState(attribute_value_unquoted_state);
			buffer.unget(data);
		} else if(data == "'") {
			newState(attribute_value_single_quoted_state);
		} else if(data == '>') {
			emit_current_token();
		} else if(data == '=') {
			parse_error("equals-in-unquoted-attribute-value");
			current_token.data.last().nodeValue += data;
			newState(attribute_value_unquoted_state);
		} else {
			current_token.data.last().nodeValue += data;
			newState(attribute_value_unquoted_state);
		}

		return true;
	}

	/**
	 * Attribute value (double-quoted) state: appends text and entities to the
	 * value of the current attribute until the closing '"'.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function attribute_value_double_quoted_state(buffer) {
		var ch = buffer.shift(1);

		if(ch === HTML5.EOF) {
			parse_error("eof-in-attribute-value-double-quote");
			newState(data_state);
			return true;
		}
		if(ch == '"') {
			newState(after_attribute_value_state);
			return true;
		}
		if(ch == '&') {
			process_entity_in_attribute(buffer);
			return true;
		}

		// Plain text: take everything up to the next quote or entity.
		var run = buffer.matchUntil('["&]');
		var text = (run !== HTML5.EOF) ? ch + run : ch;
		current_token.data.last().nodeValue += text;
		return true;
	}

	/**
	 * Attribute value (single-quoted) state: appends text and entities to the
	 * value of the current attribute until the closing "'".
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function attribute_value_single_quoted_state(buffer) {
		var data = buffer.shift(1);
		if(data === HTML5.EOF) {
			parse_error("eof-in-attribute-value-single-quote");
			emit_current_token();
		} else if(data == "'") {
			newState(after_attribute_value_state);
		} else if(data == '&') {
			process_entity_in_attribute(buffer);
		} else {
			// Same guard as the double-quoted state: never concatenate the EOF
			// sentinel into the attribute value.
			var s = buffer.matchUntil("['&]");
			if(s !== HTML5.EOF) data = data + s;
			current_token.data.last().nodeValue += data;
		}
		return true;
	}

	/**
	 * Attribute value (unquoted) state: appends text and entities to the
	 * value of the current attribute until a space character or ">" ends it.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function attribute_value_unquoted_state(buffer) {
		var data = buffer.shift(1);
		if(data === HTML5.EOF) {
			parse_error("eof-in-attribute-value-no-quotes");
			buffer.commit();
			emit_current_token();
		} else if(HTML5.SPACE_CHARACTERS_R.test(data)) {
			newState(before_attribute_name_state);
		} else if(data == '&') {
			process_entity_in_attribute(buffer);
		} else if(data == '>') {
			emit_current_token();
		} else if(data == '"' || data == "'" || data == '=') {
			parse_error("unexpected-character-in-unquoted-attribute-value");
			current_token.data.last().nodeValue += data;
		} else {
			var o = buffer.matchUntil("["+ HTML5.SPACE_CHARACTERS_IN + '&<>' +"]");
			var reachedEof = (o === HTML5.EOF);
			// Append before emitting, so the emitted token carries the complete
			// value, and never concatenate the EOF sentinel itself.
			current_token.data.last().nodeValue += reachedEof ? data : data + o;
			// Commit here since this state is re-enterable and its outcome won't change with more data.
			buffer.commit();
			if(reachedEof) {
				parse_error("eof-in-attribute-value-no-quotes");
				emit_current_token();
			}
		}
		return true;
	}

	/**
	 * After attribute value state: entered after the closing quote of an
	 * attribute value; every branch ends by selecting the next state.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function after_attribute_value_state(buffer) {
		var ch = buffer.shift(1);
		var next;

		if(ch === HTML5.EOF) {
			parse_error( "unexpected-EOF-after-attribute-value");
			emit_current_token();
			buffer.unget(ch);
			next = data_state;
		} else if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
			next = before_attribute_name_state;
		} else if(ch == '>') {
			emit_current_token();
			next = data_state;
		} else if(ch == '/') {
			next = self_closing_tag_state;
		} else {
			// Report the stray character, then treat it as the start of the
			// next attribute.
			emitToken({type: 'ParseError', data: "unexpected-character-after-attribute-value"});
			buffer.unget(ch);
			next = before_attribute_name_state;
		}

		newState(next);
		return true;
	}

	/**
	 * Self-closing tag state: entered after a "/" inside a tag. Only ">" is
	 * valid here; it marks the token as self-closing and emits it.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function self_closing_tag_state(buffer) {
		var ch = buffer.shift(1);
		var next = data_state;

		if(ch === HTML5.EOF) {
			parse_error("eof-in-tag-name");
			buffer.unget(ch);
		} else if(ch == '>') {
			current_token.self_closing = true;
			emit_current_token();
		} else {
			// The "/" was not followed by ">": go on reading attributes.
			parse_error("expected-self-closing-tag");
			buffer.unget(ch);
			next = before_attribute_name_state;
		}

		newState(next);
		return true;
	}

	/**
	 * Bogus comment state: turns everything up to the next ">" into a Comment
	 * token, reads one more character from the buffer and returns to
	 * data_state.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function bogus_comment_state(buffer) {
		var body = buffer.matchUntil('>');
		// Read past the character that ended the match; its value is not needed.
		buffer.char();
		emitToken({type: 'Comment', data: (body === HTML5.EOF) ? '' : body});
		newState(data_state);
		return true;
	}

	/**
	 * Markup declaration open state: entered after "<!". Recognises "--"
	 * (comment) and "DOCTYPE" in any letter case; everything else is handed
	 * to bogus_comment_state.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function markup_declaration_open_state(buffer) {
		var head = buffer.shift(2);
		if(head === '--') {
			current_token = {type: 'Comment', data: ''};
			newState(comment_start_state);
			return true;
		}

		// "DOCTYPE" has seven characters: the two already read plus five more.
		var tail = buffer.shift(5);
		if(tail === HTML5.EOF || head === HTML5.EOF) {
			parse_error("expected-dashes-or-doctype");
			newState(bogus_comment_state);
			buffer.unget(head);
			return true;
		}

		var keyword = head + tail;
		if(keyword.toUpperCase() == 'DOCTYPE') {
			current_token = {type: 'Doctype', name: '', publicId: null, systemId: null, correct: true};
			newState(doctype_state);
		} else {
			parse_error("expected-dashes-or-doctype");
			buffer.unget(keyword);
			newState(bogus_comment_state);
		}
		return true;
	}

	/**
	 * Comment start state: entered right after "<!--".
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function comment_start_state(buffer) {
		var data = buffer.shift(1);
		if(data === HTML5.EOF) {
			parse_error("eof-in-comment");
			emitToken(current_token);
			newState(data_state);
		} else if(data == '-') {
			newState(comment_start_dash_state);
		} else if(data == '>') {
			// Same error code as comment_start_dash_state uses for "<!--->".
			parse_error("incorrect-comment");
			emitToken(current_token);
			newState(data_state);
		} else {
			// Take the text up to the next "-", but never concatenate the EOF
			// sentinel into the comment.
			var s = buffer.matchUntil('-');
			if(s !== HTML5.EOF) data = data + s;
			current_token.data += data;
			newState(comment_state);
		}
		return true;
	}

	/**
	 * Comment start dash state: entered after "<!---", i.e. when the comment
	 * text begins with a dash.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function comment_start_dash_state(buffer) {
		var ch = buffer.shift(1);
		var error = null;

		if(ch === HTML5.EOF) {
			error = "eof-in-comment";
		} else if(ch == '-') {
			newState(comment_end_state);
			return true;
		} else if(ch == '>') {
			error = "incorrect-comment";
		}

		if(error) {
			// Both failure cases end the comment right here.
			parse_error(error);
			emitToken(current_token);
			newState(data_state);
			return true;
		}

		// Ordinary text: the dash seen earlier belongs to the comment.
		var run = buffer.matchUntil('-');
		var text = (run !== HTML5.EOF) ? ch + run : ch;
		current_token.data += '-' + text;
		newState(comment_state);
		return true;
	}

	/**
	 * Comment state: accumulates comment text up to the next "-".
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function comment_state(buffer) {
		var data = buffer.shift(1);
		if(data === HTML5.EOF) {
			parse_error("eof-in-comment");
			emitToken(current_token);
			newState(data_state);
		} else if(data == '-') {
			newState(comment_end_dash_state);
		} else {
			// Same guard as comment_start_dash_state: never concatenate the EOF
			// sentinel into the comment text.
			var s = buffer.matchUntil('-');
			if(s !== HTML5.EOF) data = data + s;
			current_token.data += data;
		}
		return true;
	}

	/**
	 * Comment end dash state: entered after a single "-" inside a comment.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function comment_end_dash_state(buffer) {
		var data = buffer.char();
		if (data === HTML5.EOF) {
			parse_error("eof-in-comment-end-dash");
			emitToken(current_token);
			newState(data_state);
		} else if(data == '-') {
			newState(comment_end_state);
		} else {
			// The dash was ordinary text. Take everything up to the next "-",
			// without ever concatenating the EOF sentinel into the comment.
			var s = buffer.matchUntil('-');
			if(s !== HTML5.EOF) data = data + s;
			current_token.data += '-' + data;
			// Consume the next character which is either a "-" or an :EOF as
			// well so if there's a "-" directly after the "-" we go nicely to
			// the "comment end state" without emitting a ParseError there.
			buffer.char();
		}
		return true;
	}

	/**
	 * Comment end state: entered after "--" inside a comment; ">" finishes
	 * the comment.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function comment_end_state(buffer) {
		var ch = buffer.shift(1);

		if(ch === HTML5.EOF) {
			parse_error("eof-in-comment-double-dash");
			emitToken(current_token);
			newState(data_state);
			return true;
		}
		if(ch == '>') {
			emitToken(current_token);
			newState(data_state);
			return true;
		}
		if(ch == '-') {
			// A third dash: keep it and stay in this state.
			parse_error("unexpected-dash-after-double-dash-in-comment");
			current_token.data += ch;
			return true;
		}

		// XXX The "--" turned out to be part of the comment text.
		parse_error("unexpected-char-in-comment");
		current_token.data += '--' + ch;
		newState(comment_state);
		return true;
	}

	/**
	 * DOCTYPE state: entered after "<!DOCTYPE". A space character is expected;
	 * anything else is reported and pushed back. Either way the tokenizer
	 * moves on to before_doctype_name_state.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function doctype_state(buffer) {
		var ch = buffer.shift(1);
		if(!HTML5.SPACE_CHARACTERS_R.test(ch)) {
			parse_error("need-space-after-doctype");
			buffer.unget(ch);
		}
		newState(before_doctype_name_state);
		return true;
	}

	/**
	 * Before DOCTYPE name state: skips space characters, then starts the
	 * doctype name with the first other character.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function before_doctype_name_state(buffer) {
		var ch = buffer.shift(1);
		var error = null;

		if(ch === HTML5.EOF) {
			error = "expected-doctype-name-but-got-eof";
		} else if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
			// Leading whitespace is skipped; stay in this state.
			return true;
		} else if(ch == '>') {
			error = "expected-doctype-name-but-got-right-bracket";
		}

		if(error) {
			// A doctype without a name is emitted flagged as incorrect.
			parse_error(error);
			current_token.correct = false;
			emit_current_token();
			newState(data_state);
		} else {
			current_token.name = ch;
			newState(doctype_name_state);
		}
		return true;
	}

	/**
	 * DOCTYPE name state: extends the doctype name one character at a time
	 * until a space character, ">" or EOF.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function doctype_name_state(buffer) {
		var ch = buffer.shift(1);

		if(ch === HTML5.EOF) {
			current_token.correct = false;
			buffer.unget(ch);
			parse_error("eof-in-doctype");
			emit_current_token();
			newState(data_state);
			return true;
		}
		if(HTML5.SPACE_CHARACTERS_R.test(ch)) {
			// Whatever follows the name is handled by the bogus doctype state.
			newState(bogus_doctype_state);
			return true;
		}
		if(ch == '>') {
			emit_current_token();
			newState(data_state);
			return true;
		}

		current_token.name += ch;
		return true;
	}

	/**
	 * Bogus DOCTYPE state: marks the doctype token as incorrect and skips
	 * characters until ">" or EOF, at which point the token is emitted.
	 *
	 * @param buffer input buffer
	 * @returns {boolean} always true
	 */
	function bogus_doctype_state(buffer) {
		var data = buffer.shift(1);
		current_token.correct = false;
		if(data === HTML5.EOF) {
			// Input ended inside the doctype: finish the token the same way
			// doctype_name_state does instead of aborting with an exception.
			buffer.unget(data);
			parse_error("eof-in-doctype");
			emit_current_token();
			newState(data_state);
		} else if(data == '>') {
			emit_current_token();
			newState(data_state);
		}
		return true;
	}

	/**
	 * Report a parse error: emits a 'ParseError' token carrying `message` and
	 * logs the message together with `context` through HTML5.debug.
	 *
	 * @param {string} message error code
	 * @param context          optional extra data, used for the debug log only
	 */
	function parse_error(message, context) {
		var errorToken = {type: 'ParseError', data: message};
		emitToken(errorToken);
		HTML5.debug('tokenizer.parseError', message, context);
	}

	/**
	 * Emit the token under construction and return to data_state. Script
	 * text that was buffered is flushed as one Characters token just before
	 * the closing "script" end tag.
	 */
	function emit_current_token() {
		var tok = current_token;

		// Only end tags can be wrongly marked as self-closing.
		if(tok.type == 'EndTag' && tok.self_closing) {
			parse_error('self-closing-end-tag');
		}

		var closesScript = current_token.name.toLowerCase() == "script" && tok.type == 'EndTag' && script_buffer;
		if(closesScript) {
			emitToken({ type: 'Characters', data: script_buffer });
			script_buffer = null;
		}

		emitToken(tok);
		newState(data_state);
	}

	/**
	 * Bring a token into its canonical form, modifying it in place:
	 * 'EmptyTag' becomes 'StartTag' (with a parse error for non-void
	 * elements), tag names are lower-cased, and the attributes of a start tag
	 * are lower-cased and de-duplicated so that the first value given for a
	 * name wins. Attributes on an end tag are reported as a parse error.
	 *
	 * @param token token to normalize
	 * @returns the same token object
	 */
	function normalize_token(token) {
		if(token.type == 'EmptyTag') {
			if(HTML5.VOID_ELEMENTS.indexOf(token.name) == -1) {
				parse_error('incorrectly-placed-solidus');
			}
			token.type = 'StartTag';
		}

		if(token.type == 'StartTag') {
			token.name = token.name.toLowerCase();
			if(token.data.length != 0) {
				// Walk the attributes back to front so that an earlier
				// occurrence overwrites the value of a later one.
				var valueByName = {};
				for(var i = token.data.length - 1; i >= 0; i--) {
					var attribute = token.data[i];
					valueByName[attribute.nodeName.toLowerCase()] = attribute.nodeValue;
				}

				var unique = [];
				for(var attrName in valueByName) {
					unique.push({nodeName: attrName, nodeValue: valueByName[attrName]});
				}
				// The names were collected back to front; flip them again.
				unique.reverse();
				token.data = unique;
			}
		} else if(token.type == 'EndTag') {
			if(token.data.length != 0) parse_error('attributes-in-end-tag');
			token.name = token.name.toLowerCase();
		}

		return token;
	}

	// Refuse to construct a tokenizer without any input.
	if(typeof input === 'undefined') throw(new Error("No input given"));
	this.document = document;
	// Expose the closure-private content model flag as an instance property.
	// Object.defineProperty replaces the deprecated __defineSetter__ /
	// __defineGetter__ pair; enumerable and configurable are set to true to
	// match the property those legacy methods create.
	Object.defineProperty(this, 'content_model', {
		get: function() {
			return content_model;
		},
		set: function(model) {
			HTML5.debug('tokenizer.content_model=', model);
			content_model = model;
		},
		enumerable: true,
		configurable: true
	});
	/**
	 * Switch the tokenizer to another state function and commit the buffer.
	 *
	 * @param {Function} newstate state function to run next
	 */
	function newState(newstate) {
		var label = newstate.name;
		HTML5.debug('tokenizer.state=', label);
		state = newstate;
		buffer.commit();
	}

	// Tokenizing always starts in the data state.
	newState(data_state);

	// An EventEmitter input delivers its own 'data'/'end' events. Any other
	// input is wrapped in a private emitter that tokenize() pumps once.
	var source;
	if(input instanceof events.EventEmitter) {
		source = input;
		this.pump = null;
	} else {
		source = new events.EventEmitter();
		this.pump = function() {
			source.emit('data', input);
			source.emit('end');
		};
	}

	// Feed each chunk to the buffer and run state functions until one returns
	// false. A thrown HTML5.DRAIN rolls the buffer back through undo(); any
	// other exception is rethrown.
	source.addListener('data', function(chunk) {
		buffer.append(typeof chunk !== 'string' ? chunk.toString() : chunk);
		try {
			while(state(buffer)) {}
		} catch(e) {
			if(e != HTML5.DRAIN) {
				throw(e);
			}
			HTML5.debug('tokenizer.drain', 'Drain');
			buffer.undo();
		}
	});

	// At the end of the input, mark the buffer as finished, run the state
	// functions until one returns false, then announce the end.
	var tokenizer = this;
	source.addListener('end', function() {
		buffer.eof = true;
		while(state(buffer)) {}
		tokenizer.emit('end');
	});

}

// Tokenizer instances inherit from an EventEmitter instance, which is where
// the emit() calls inside the constructor come from.
t.prototype = new events.EventEmitter();