2014-12-31 23:02:05 +00:00
|
|
|
|
<?php
|
|
|
|
|
|
2024-07-12 22:40:54 +00:00
|
|
|
|
namespace MediaWiki\Extension\TextExtracts;
|
2014-12-31 23:02:05 +00:00
|
|
|
|
|
2024-06-26 21:56:41 +00:00
|
|
|
|
use Generator;
|
2024-10-20 11:21:15 +00:00
|
|
|
|
use MediaWiki\Api\ApiMain;
|
|
|
|
|
use MediaWiki\Api\ApiResult;
|
2023-08-14 17:49:09 +00:00
|
|
|
|
use MediaWiki\Api\Hook\ApiOpenSearchSuggestHook;
|
2024-06-26 21:56:41 +00:00
|
|
|
|
use MediaWiki\Config\Config;
|
|
|
|
|
use MediaWiki\Config\ConfigFactory;
|
2023-05-19 10:25:20 +00:00
|
|
|
|
use MediaWiki\Request\FauxRequest;
|
2024-06-26 21:56:41 +00:00
|
|
|
|
use MediaWiki\Rest\Hook\SearchResultProvideDescriptionHook;
|
2014-12-31 23:02:05 +00:00
|
|
|
|
|
2019-04-24 16:26:53 +00:00
|
|
|
|
/**
 * Hook handlers that add TextExtracts page extracts to OpenSearch suggestions
 * and to REST search result descriptions.
 *
 * @license GPL-2.0-or-later
 */
class Hooks implements
	ApiOpenSearchSuggestHook,
	SearchResultProvideDescriptionHook
{

	/** Number of page IDs sent to the extracts API per internal request (its exlimit maximum). */
	private const EXTRACTS_BATCH_SIZE = 20;

	/** Target length handed to trimExtract() for REST search descriptions. */
	private const DESCRIPTION_LENGTH = 150;

	/**
	 * PCRE group matching one sentence terminator; intended for use with the /u modifier.
	 *
	 * Non-ASCII terminators are written as \x{...} escapes so that the pattern does not
	 * depend on how this file's own bytes are encoded or normalised.
	 */
	private const SENTENCE_END = '(?:'
		// ASCII full stop preceded by a non-digit character and followed by whitespace
		. '[^\d]\.\s'
		// ASCII exclamation or question mark followed by whitespace
		. '|!\s|\?\s'
		// U+3002 ideographic full stop
		. '|\x{3002}'
		// U+FF0E, U+FF01, U+FF1F: fullwidth full stop, exclamation mark, question mark
		. '|\x{FF0E}|\x{FF01}|\x{FF1F}'
		// U+FF61 halfwidth ideographic full stop
		. '|\x{FF61}'
		. ')';

	private Config $config;

	/**
	 * @param ConfigFactory $configFactory Used to obtain the 'textextracts' configuration
	 */
	public function __construct(
		ConfigFactory $configFactory
	) {
		$this->config = $configFactory->makeConfig( 'textextracts' );
	}

	/**
	 * Trim an extract to a sensible length.
	 *
	 * Adapted from Extension:OpenSearchXml, which adapted it from
	 * Extension:ActiveAbstract.
	 *
	 * @param string $text
	 * @param int $length Target length; actual result will continue to the end of a sentence.
	 *  Negative values are treated as 0.
	 * @return string The leading sentence(s) of $text, or its first line when no sentence
	 *  terminator can be matched.
	 */
	private static function trimExtract( string $text, int $length ): string {
		// The pattern is assembled on every call because it embeds $length; caching a
		// single pattern would make later calls reuse the first caller's length.
		$minChars = max( 0, $length );
		$regex = '/^(.{' . $minChars . ',}?' . self::SENTENCE_END . '+)/u';

		$matches = [];
		if ( preg_match( $regex, $text, $matches ) === 1 ) {
			return trim( $matches[1] );
		}

		// No match (or preg_match() failed): just return the first line.
		return trim( explode( "\n", $text )[0] );
	}

	/**
	 * Retrieves extracts data for the given page IDs from the TextExtract API.
	 * The page IDs are chunked into the max limit of exlimit of the TextExtract API.
	 *
	 * @param array $pageIds An array of page IDs to retrieve extracts for
	 * @return Generator Yields, once per chunk, the result data of the API request:
	 *  $data = [
	 *    'pageId' => [
	 *      'ns' => int of the namespace
	 *      'title' => string of the title of the page
	 *      'extract' => string of the text extracts of the page
	 *    ]
	 *  ]
	 */
	private function getExtractsData( array $pageIds ): Generator {
		foreach ( array_chunk( $pageIds, self::EXTRACTS_BATCH_SIZE ) as $batch ) {
			$request = new FauxRequest( [
				'action' => 'query',
				'prop' => 'extracts',
				'explaintext' => true,
				'exintro' => true,
				'exlimit' => count( $batch ),
				'pageids' => implode( '|', $batch ),
			] );

			$api = new ApiMain( $request );
			$api->execute();

			yield $api->getResult()->getResultData( [ 'query', 'pages' ] );
		}
	}

	/**
	 * Look up the extract text of one page in a chunk yielded by getExtractsData().
	 *
	 * The lookup tolerates a missing entry at any level of the structure.
	 *
	 * @param mixed $pages One chunk of result data, keyed by page ID
	 * @param int|string $pageId
	 * @return mixed|null The stored extract content, or null when there is none for this page
	 */
	private static function getExtractContent( $pages, $pageId ) {
		// The key holding the content is named by the META_CONTENT entry; '*' when absent.
		$contentKey = $pages[$pageId]['extract'][ApiResult::META_CONTENT] ?? '*';

		return $pages[$pageId]['extract'][$contentKey] ?? null;
	}

	/**
	 * ApiOpenSearchSuggest hook handler
	 *
	 * Adds an 'extract' entry (and sets 'extract trimmed' to false) on every result for
	 * which the extracts API returned content. Does nothing when the
	 * 'ExtractsExtendOpenSearchXml' setting is off or there are no results.
	 *
	 * @param array &$results Array of search results, keyed by page ID
	 */
	public function onApiOpenSearchSuggest( &$results ) {
		if ( !$this->config->get( 'ExtractsExtendOpenSearchXml' ) || $results === [] ) {
			return;
		}

		$pageIds = array_keys( $results );
		foreach ( $this->getExtractsData( $pageIds ) as $pages ) {
			foreach ( $pageIds as $id ) {
				$extract = self::getExtractContent( $pages, $id );
				if ( $extract !== null ) {
					$results[$id]['extract'] = $extract;
					$results[$id]['extract trimmed'] = false;
				}
			}
		}
	}

	/**
	 * Used to update Search Results with descriptions for Search Engine.
	 *
	 * Does nothing when the 'ExtractsExtendRestSearch' setting is off or no pages are given.
	 *
	 * @param array $pageIdentities Array (string=>SearchResultPageIdentity) where key is pageId
	 * @param array &$descriptions Output array (string=>string|null)
	 * where key is pageId and value is either a description for given page or null
	 */
	public function onSearchResultProvideDescription(
		array $pageIdentities,
		&$descriptions
	): void {
		if ( !$this->config->get( 'ExtractsExtendRestSearch' ) || $pageIdentities === [] ) {
			return;
		}

		$pageIds = array_map(
			static fn ( $identity ) => $identity->getId(),
			$pageIdentities
		);

		foreach ( $this->getExtractsData( $pageIds ) as $pages ) {
			foreach ( $pageIds as $id ) {
				$extract = self::getExtractContent( $pages, $id );
				if ( $extract !== null ) {
					$descriptions[$id] = self::trimExtract( $extract, self::DESCRIPTION_LENGTH );
				}
			}
		}
	}

}
|