mediawiki-extensions-Visual.../tests/parser/dumpReader.js
Gabriel Wicke 0d30a5528e First combination of WikiDom serializers with existing parser in
tests/parser/parserTests.js.

* Removed var from es in es.js to allow node.js to access it as global. Only
  alternative solution appears to be a node-specific 'exports' construct:
  http://nodejs.org/docs/v0.3.1/api/modules.html
* Added es.Document.js and es.Document.Serializer.js in es/bases. Not sure if
  this is the desired location.
* Changed es.extend to es.extendClass in the serializers
* Modified the first parser test to include the WikiDom modules and call the
  new HTML serializer
2011-11-03 13:55:48 +00:00

133 lines
3.1 KiB
JavaScript

var events = require('events'),
util = require('util'),
libxml = require('libxmljs'); // npm install libxmljs
/**
 * Event-emitting SAX reader for MediaWiki XML dumps.
 *
 * Events emitted by read():
 *  - 'page'     (object) when a <page> element closes
 *  - 'revision' (object) when a <revision> element closes; its `page`
 *               property references the enclosing page object
 *  - 'end'      ({})     once, when </mediawiki> or the end of the document
 *               is reached
 *  - 'error'    (err)    for parser errors and input stream errors
 *
 * @constructor
 */
function DumpReader() {
    events.EventEmitter.call(this);
}
util.inherits(DumpReader, events.EventEmitter);

/**
 * Parse an XML dump from the given stream, emitting the events described on
 * the constructor as elements are completed.
 *
 * @param {Stream} stream input stream to read XML from
 */
DumpReader.prototype.read = function(stream) {
    var self = this;
    var complete = false;
    // `workspace` is the object currently being filled in; `stack` holds the
    // enclosing objects. `buffer` accumulates character data of the current
    // text element.
    var stack = [{}],
        workspace = {},
        buffer = '';

    /**
     * Turn a list of names into a lookup table usable with `in`.
     * The table has no prototype, so element names such as 'constructor' or
     * 'toString' are not matched by accident.
     */
    function flip(arr) {
        var obj = Object.create(null);
        arr.forEach(function(val) {
            obj[val] = true;
        });
        return obj;
    }

    // 'minor' is a flag element and therefore only listed in boolNodes;
    // listing it in textNodes as well made it resolve to '' instead of true.
    var textNodes = flip(['id', 'text', 'title', 'comment', 'username', 'timestamp']),
        boolNodes = flip(['minor', 'redirect']),
        ignoreNodes = flip(['mediawiki', 'siteinfo', 'upload', 'thread']);

    /**
     * Mark the dump as fully read, pause the input and emit 'end'.
     * Guarded so that 'end' fires only once even though both </mediawiki>
     * and the end-of-document callback lead here.
     */
    function finish() {
        if (complete) {
            return;
        }
        complete = true;
        self.complete = true; // kept for callers inspecting reader.complete
        stream.pause();
        self.emit('end', {});
    }

    var parser = new libxml.SaxPushParser(function(cb) {
        cb.onStartElementNS(function(elem, attrs, prefix, uri, namespaces) {
            if (elem in ignoreNodes) {
                // No object is opened for these wrappers. Their children are
                // still processed against the current workspace.
            } else if (elem === 'page') {
                stack = [];
                workspace = {};
            } else if (elem === 'revision') {
                stack.push(workspace);
                workspace = {
                    page: workspace
                };
            } else if (elem in textNodes || elem in boolNodes) {
                buffer = '';
            } else {
                // Any other element becomes a nested object.
                stack.push(workspace);
                workspace = {};
            }
        });

        cb.onEndElementNS(function(elem, prefix, uri) {
            if (elem === 'mediawiki') {
                finish();
                return;
            }
            if (elem in ignoreNodes) {
                // Nothing was pushed when this element opened, so nothing
                // may be popped here.
                return;
            }
            if (elem === 'page') {
                self.emit('page', workspace);
                // Start from a clean slate so elements following the page
                // never see an undefined workspace.
                stack = [];
                workspace = {};
            } else if (elem === 'revision') {
                self.emit('revision', workspace);
                workspace = stack.pop();
            } else if (elem in boolNodes) {
                workspace[elem] = true;
            } else if (elem in textNodes) {
                workspace[elem] = buffer;
            } else {
                var current = workspace;
                workspace = stack.pop();
                workspace[elem] = current;
            }
        });

        cb.onCharacters(function(chars) {
            buffer += chars;
        });

        cb.onCdata(function(cdata) {
            buffer += cdata;
        });

        cb.onEndDocument(function() {
            // May not be invoked by the parser; </mediawiki> also finishes.
            finish();
        });

        cb.onError(function(err) {
            self.emit('error', err);
            // Should we.... stop reading now or what?
        });
    });

    // Now, start reading the file in. The handlers are registered once the
    // parser object exists, instead of from inside its setup callback.
    stream.on('data', function(chunk) {
        parser.push(chunk); // @fixme does this want bytes or chars?
    });
    // NOTE(review): an input that ends before the XML document is complete is
    // not reported; no 'error' is emitted for it, matching prior behaviour.
    stream.on('error', function(err) {
        self.emit('error', err);
    });
};
module.exports.DumpReader = DumpReader;

// When this file is executed directly (rather than loaded via require()),
// read a dump from stdin and log every page and revision that is emitted.
if (module === require.main) {
    var reader = new DumpReader();
    reader.on('end', function() {
        console.log('done!');
        process.exit();
    });
    reader.on('error', function(err) {
        console.log('error!', err);
        process.exit(1);
    });
    reader.on('page', function(page) {
        console.log('page', page);
    });
    reader.on('revision', function(revision) {
        // Truncate the text to keep the log readable. A revision only has a
        // `text` property if a <text> element was seen, so check first.
        if (typeof revision.text === 'string') {
            revision.text = revision.text.substring(0, 40);
        }
        console.log('revision', revision);
    });
    console.log('Reading!');
    process.stdin.setEncoding('utf8');
    process.stdin.resume();
    reader.read(process.stdin);
}