2012-02-28 14:24:37 +00:00
|
|
|
/**
|
|
|
|
* A simple dump grepper based on the DumpReader module.
|
|
|
|
*
|
|
|
|
* @author Gabriel Wicke <gwicke@wikimedia.org>
|
|
|
|
*/
|
|
|
|
|
2012-02-27 16:40:01 +00:00
|
|
|
var dumpReader = require('./dumpReader.js'),
|
|
|
|
events = require('events'),
|
2012-02-28 13:21:01 +00:00
|
|
|
optimist = require('optimist'),
|
|
|
|
colors = require('colors');
|
2012-02-27 16:40:01 +00:00
|
|
|
|
|
|
|
/**
 * Greps the text of dump revisions for a regular expression and reports
 * the hits through a 'match' event.
 *
 * @constructor
 * @param {RegExp} regexp Pattern applied to each revision's text. With the
 *   'g' flag every occurrence is collected; without it only the first
 *   occurrence per revision is reported.
 */
function DumpGrepper ( regexp ) {
	// Run the EventEmitter constructor so that each instance owns its
	// listener state instead of sharing state that lives on the prototype.
	events.EventEmitter.call( this );
	this.re = regexp;
}

// Inherit from EventEmitter without instantiating one as the prototype:
// an instantiated prototype would carry listener state visible to every
// DumpGrepper.
DumpGrepper.prototype = Object.create( events.EventEmitter.prototype );
DumpGrepper.prototype.constructor = DumpGrepper;

/**
 * Search a single revision and emit 'match' if the pattern occurs in it.
 *
 * Emits 'match' at most once per call, with the revision and an array of
 * the exec() result arrays (each carries .index and the matched string at
 * [0]). Nothing is emitted when there is no match.
 *
 * @param {Object} revision Object whose `text` property is searched.
 */
DumpGrepper.prototype.grepRev = function ( revision ) {
	var re = this.re,
		text = revision.text,
		matches = [],
		result;

	// Always scan from the beginning of this revision's text.
	re.lastIndex = 0;

	while ( ( result = re.exec( text ) ) !== null ) {
		matches.push( result );

		if ( !re.global ) {
			// exec() on a non-global regexp does not advance and would
			// return the same first match forever.
			break;
		}

		if ( result[0].length === 0 ) {
			// A zero-length match leaves lastIndex where it was; step over
			// it manually so the loop cannot get stuck.
			re.lastIndex++;
		}
	}

	// Leave the shared regexp in a clean state for the next revision.
	re.lastIndex = 0;

	if ( matches.length ) {
		this.emit( 'match', revision, matches );
	}
};
|
2012-02-27 16:40:01 +00:00
|
|
|
|
|
|
|
// Expose the DumpGrepper constructor to modules that require() this file.
module.exports.DumpGrepper = DumpGrepper;
|
|
|
|
|
|
|
|
// Command-line entry point: only runs when this file is executed directly,
// not when it is require()d as a library. Reads a dump from stdin, greps
// every revision for the regexp given as the first positional argument and
// prints each hit with some surrounding context, followed by statistics.
if (module === require.main) {
	var argv = optimist.usage( 'Usage: zcat dump.xml.gz | $0 <regexp>', {
		'i': {
			description: 'Case-insensitive matching',
			'boolean': true,
			'default': false
		},
		'color': {
			description: 'Highlight matched substring using color. Use --no-color to disable.',
			'boolean': true,
			'default': true
		}
	} ).argv;

	if ( argv.help ) {
		optimist.showHelp();
		process.exit( 0 );
	}

	// Without a pattern, new RegExp( undefined ) would silently become an
	// empty regexp that matches everywhere; treat it as a usage error.
	if ( argv._.length === 0 ) {
		optimist.showHelp();
		process.exit( 1 );
	}

	// 'g' so that every occurrence within a revision is found.
	var flags = 'g';
	if ( argv.i ) {
		flags += 'i';
	}
	var re = new RegExp( argv._[0], flags );

	// Number of characters of context shown on each side of a match.
	var contextChars = 40;

	var reader = new dumpReader.DumpReader(),
		grepper = new DumpGrepper( re ),
		stats = {
			revisions: 0,
			// Counts 'match' events, i.e. revisions with at least one hit.
			matches: 0
		};

	reader.on( 'revision', function ( revision ) {
		stats.revisions++;
		grepper.grepRev( revision );
	} );

	grepper.on( 'match', function ( revision, matches ) {
		stats.matches++;
		for ( var i = 0, l = matches.length; i < l; i++ ) {
			console.log( '== Match: [[' + revision.page.title + ']] ==' );
			var m = matches[i],
				text = revision.text,
				// Clamp the start at 0: a negative start would make the
				// string methods count from the end of the text instead.
				before = text.substring( Math.max( 0, m.index - contextChars ), m.index ),
				after = text.substr( m.index + m[0].length, contextChars ),
				// .green is presumably the String getter installed by the
				// 'colors' module required at the top of the file.
				hit = argv.color ? m[0].green : m[0];
			console.log( before + hit + after );
		}
	} );

	process.stdin.on( 'end', function () {
		// Print some stats
		var ratio = stats.revisions ?
			( stats.matches / stats.revisions * 100 ) :
			0; // avoid printing NaN% for an empty input
		console.log( '################################################' );
		console.log( 'Total revisions: ' + stats.revisions );
		console.log( 'Total matches: ' + stats.matches );
		console.log( 'Ratio: ' + ratio + '%' );
		console.log( '################################################' );
	} );

	// Feed the decoded stdin chunks into the dump reader.
	process.stdin.on( 'data', reader.push.bind( reader ) );
	process.stdin.setEncoding( 'utf8' );
	process.stdin.resume();
}
|
|
|
|
|