Make Linter category counts more accurate when counts are low

* The code now produces an accurate count if the number of
   errors for a category is below the threshold set by a
   public constant MAX_ACCURATE_COUNT (currently 20).
   The database record count limit was originally set to 1,
   to determine accurately, if there were actually 0 errors
   in a category as the estimate code would never report 0.
   If not 0, it would use the estimated count which does not
   produce an accurate count for any other number of errors.
   For low error counts this is annoying to editors and
   unnecessary. The additional CPU/disk activity to accurately
   check for low error counts is not significantly more than
   checking for 0 or 1, as checking for 0 likely requires
   a complete table scan which is probably expensive compared
   to a low count that early outs when it hits to record limit.

 * An improvement to consider is recording the accurate count in
   a separate tiny table, and maintaining an accurate count there
   which is used in preference to doing the select with row limit
   based on say a 30 second TTL, to prevent a stampede of requests
   from doing extraneous database operations.

 * Added unit test coverage for accurately counting low error
   conditions that are lower than the threshold and also verify
   that the estimate is inaccurate beyond the error count
   threshold.

Bug: T194872
Change-Id: I4f74cfe3bf9601baa0dc8fa6464a68030ac2bc4b
This commit is contained in:
sbailey 2021-04-07 12:33:16 -07:00
parent 077c30220e
commit 201b47e01d
2 changed files with 63 additions and 9 deletions

View file

@ -33,6 +33,7 @@ class Database {
* for a page, the rest are just dropped
*/
public const MAX_PER_CAT = 20;
public const MAX_ACCURATE_COUNT = 20;
/**
* @var int
@ -246,21 +247,25 @@ class Database {
*/
private function getTotalsEstimate( $catId ) {
$dbr = wfGetDB( DB_REPLICA );
// First see if there are no rows, since the distinction
// between 0 and 1 is important. And estimateRowCount seems
// to never return 0.
// First see if there are no rows, or a moderate number
// within the limit specified by the MAX_ACCURATE_COUNT.
// The distinction between 0, a few and a lot is important
// to determine first, as estimateRowCount seem to never
// return 0 or accurate low error counts.
$rows = $dbr->selectRowCount(
'linter',
'*',
[ 'linter_cat' => $catId ],
__METHOD__,
[ 'LIMIT' => 1 ]
[ 'LIMIT' => self::MAX_ACCURATE_COUNT ]
);
if ( $rows === 0 ) {
return 0;
// Return an accurate count if the number of errors is
// below the maximum accurate count limit
if ( $rows < self::MAX_ACCURATE_COUNT ) {
return $rows;
}
// Now we can just estimate
// Now we can just estimate if the maximum accurate count limit
// was returned, which isn't the actual count but the limit reached
return $dbr->estimateRowCount(
'linter',
'*',

View file

@ -29,7 +29,6 @@ use MediaWikiTestCase;
* @covers MediaWiki\Linter\Database
*/
class DatabaseTest extends MediaWikiTestCase {
public function testConstructor() {
$this->assertInstanceOf( Database::class, new Database( 5 ) );
}
@ -62,6 +61,16 @@ class DatabaseTest extends MediaWikiTestCase {
$this->assertArrayEquals( $expectedIds, $actualIds );
}
private function createManyLintErrors( $lintDb, $errorCount ) {
$manyLintErrors = [];
for ( $i = 0; $i < $errorCount; $i++ ) {
$manyLintErrors[] = new LintError(
'obsolete-tag', [ 15, 20 + $i ], [ 'name' => 'big' ]
);
}
$lintDb->setForPage( $manyLintErrors );
}
public function testSetForPage() {
$lintDb = new Database( 5 );
$dummyErrors = $this->getDummyLintErrors();
@ -69,11 +78,21 @@ class DatabaseTest extends MediaWikiTestCase {
$this->assertSetForPageResult( $result, [], [ 'fostered' => 1, 'obsolete-tag' => 1 ] );
$this->assertLintErrorsEqual( $dummyErrors, $lintDb->getForPage() );
// Accurate low error count values should match for both methods
$resultTotals = $lintDb->getTotalsForPage();
$resultEstimatedTotals = $lintDb->getTotals();
$this->assertEquals( $resultTotals, $resultEstimatedTotals );
// Should delete the second error
$result2 = $lintDb->setForPage( [ $dummyErrors[0] ] );
$this->assertSetForPageResult( $result2, [ 'obsolete-tag' => 1 ], [] );
$this->assertLintErrorsEqual( [ $dummyErrors[0] ], $lintDb->getForPage() );
// Accurate low error count values should match for both methods
$resultTotals = $lintDb->getTotalsForPage();
$resultEstimatedTotals = $lintDb->getTotals();
$this->assertEquals( $resultTotals, $resultEstimatedTotals );
// Insert the second error, delete the first
$result3 = $lintDb->setForPage( [ $dummyErrors[1] ] );
$this->assertSetForPageResult( $result3, [ 'fostered' => 1 ], [ 'obsolete-tag' => 1 ] );
@ -83,6 +102,36 @@ class DatabaseTest extends MediaWikiTestCase {
$result4 = $lintDb->setForPage( [] );
$this->assertSetForPageResult( $result4, [ 'obsolete-tag' => 1 ], [] );
$this->assertLintErrorsEqual( [], $lintDb->getForPage() );
// Accurate low error count values should match for both methods
$resultTotals = $lintDb->getTotalsForPage();
$resultEstimatedTotals = $lintDb->getTotals();
$this->assertEquals( $resultTotals, $resultEstimatedTotals );
// For error counts below the MAX_ACCURATE_COUNT, both error
// count methods should return the same count.
self::createManyLintErrors( $lintDb, Database::MAX_ACCURATE_COUNT - 1 );
$resultTotals = $lintDb->getTotalsForPage();
$resultEstimatedTotals = $lintDb->getTotals();
$this->assertEquals( $resultTotals, $resultEstimatedTotals );
// For error counts equal to or above the MAX_ACCURATE_COUNT, both error
// count methods should NOT return the same count in this test scenario
// because previously added and deleted records will be included
// in the estimated count which is normal.
self::createManyLintErrors( $lintDb, Database::MAX_ACCURATE_COUNT );
$resultTotals = $lintDb->getTotalsForPage();
$resultEstimatedTotals = $lintDb->getTotals();
$this->assertNotEquals( $resultTotals, $resultEstimatedTotals );
// For error counts greatly above the MAX_ACCURATE_COUNT, the estimated
// count method should return a greater count in this test scenario
// because previously added and deleted records will be included
// in the estimated count which is normal.
self::createManyLintErrors( $lintDb, Database::MAX_ACCURATE_COUNT * 10 );
$resultTotals = $lintDb->getTotalsForPage();
$resultEstimatedTotals = $lintDb->getTotals();
$this->assertGreaterThan( $resultTotals, $resultEstimatedTotals );
}
}