From dcfd6ea6a8f99e18aa644c3e4e70eeac83b3f2b3 Mon Sep 17 00:00:00 2001 From: alex4401 Date: Tue, 2 Jan 2024 18:33:05 +0100 Subject: [PATCH] Implement a RemexHtml-based provider (requires 1.38) This patch has been in testing on ark.wiki.gg (with the platform's approval) for a few months now, and has yielded us significantly better extracts than the live algorithm. It brings Description2 closer to TextExtracts' level, but without its constraints. TextExtracts does not enable us to override descriptions as we see fit, requires internal API calls, and uses HtmlFormatter, which has its own slew of problems. I'm also raising the MW requirement to 1.38: this change has not been tested with older versions, and I removed uses of the old `getProperty` in favour of `getPageProperty` to clean up the code. I doubt this really matters much: support for 1.35 is about to end (if it hasn't already), and 1.38 itself is already EoL. It also appears that lately this extension has only received forward-compatibility patches. The new provider leverages RemexHtml (used in core MediaWiki and Parsoid) to parse and process a page's HTML. For performance reasons (but also a bit of practicality - it's unlikely that description derivation needs the full page text...) any HTML after the first heading is dropped and not parsed. However, this is just a precaution; on ARK we haven't noticed any performance degradation. Two new configuration variables are added: - `$wgDescriptionAlgorithm` (proposed default: `MediaWiki\Extension\Description2\RemexDescriptionProvider`) is the fully qualified class name of the extract provider to use. `MediaWiki\Extension\Description2\SimpleDescriptionProvider` is the previous algorithm. `MediaWiki\Extension\Description2\RemexDescriptionProvider` is the new Remex implementation. - `$wgDescriptionRemoveElements` is an array of tag names and/or class names (optionally combined as `tag.class`) to strip out when generating extracts. This is only supported in the Remex provider, and would be hard to retrofit into the previous algorithm. 
Depends-On: I04b00f99085f07f773212ee3eca8470eece34e9e Change-Id: I8e6bf1f17443feac89f07e728d395e5a525bd4d1 --- extension.json | 25 ++++- includes/Description2.php | 14 +-- includes/RemexDescriptionProvider.php | 133 +++++++++++++++++++++++++ includes/ServiceWiring.php | 5 +- includes/SimpleDescriptionProvider.php | 7 ++ 5 files changed, 169 insertions(+), 15 deletions(-) create mode 100644 includes/RemexDescriptionProvider.php diff --git a/extension.json b/extension.json index e89cdbc..42d5432 100644 --- a/extension.json +++ b/extension.json @@ -9,11 +9,32 @@ "license-name": "GPL-2.0-or-later", "type": "other", "requires": { - "MediaWiki": ">= 1.35.0" + "MediaWiki": ">= 1.38.0" }, "config": { "EnableMetaDescriptionFunctions": false, - "DescriptionMaxChars": 300 + "DescriptionMaxChars": 300, + "DescriptionAlgorithm": "MediaWiki\\Extension\\Description2\\RemexDescriptionProvider", + "DescriptionRemoveElements": [ + "table", + "div", + "script", + "style", + "figure", + "input", + "div.infobox", + "div.ambox", + "div.dmbox", + "ul.gallery", + ".mw-editsection", + "sup.reference", + "ul.references", + ".error", + ".nomobile", + ".noprint", + ".noexcerpt", + ".sortkey" + ] }, "ConfigRegistry": { "Description2": "GlobalVarConfig::newInstance" diff --git a/includes/Description2.php b/includes/Description2.php index 6794e5d..3b6ce92 100644 --- a/includes/Description2.php +++ b/includes/Description2.php @@ -23,18 +23,10 @@ class Description2 { */ public static function setDescription( Parser $parser, $desc ) { $parserOutput = $parser->getOutput(); - if ( method_exists( $parserOutput, 'getPageProperty' ) ) { - // MW 1.38+ - if ( $parserOutput->getPageProperty( 'description' ) !== null ) { - return; - } - $parserOutput->setPageProperty( 'description', $desc ); - } else { - if ( $parserOutput->getProperty( 'description' ) !== false ) { - return; - } - $parserOutput->setProperty( 'description', $desc ); + if ( $parserOutput->getPageProperty( 'description' ) !== null ) { + 
return; } + $parserOutput->setPageProperty( 'description', $desc ); } /** diff --git a/includes/RemexDescriptionProvider.php b/includes/RemexDescriptionProvider.php new file mode 100644 index 0000000..1fc42fa --- /dev/null +++ b/includes/RemexDescriptionProvider.php @@ -0,0 +1,133 @@ +toRemove = $config->get( 'DescriptionRemoveElements' ); + } + + /** + * Extracts description from the HTML representation of a page. + * + * This algorithm: + * 1. Looks for the first heading (potentially included in the ToC) and cuts the text to avoid unnecessary + * server load. + * 2. Invokes RemexHTML to parse and reserialise the HTML representation. + * 3. Comments are excluded. + * 4. HTML elements are filtered by tag name and the 'class' attribute. Removals are dictated through the + * $wgDescriptionRemoveElements config variable. + * 5. HTML tags are stripped, and only text is preserved. + * 6. Strips white-space around the extract. + * + * This is more costly than the SimpleDescriptionProvider but is far more flexible and easier to manipulate by + * editors. + * + * @param string $text + * @return string + */ + public function derive( string $text ): ?string { + $formatter = new class( $options = [], $this->toRemove ) extends HtmlFormatter { + + /** @var string[] */ + private array $toRemove; + + /** + * @param array $options + * @param array $toRemove + */ + public function __construct( $options, array $toRemove ) { + parent::__construct( $options ); + $this->toRemove = $toRemove; + } + + /** + * Skips comments. + * + * @param SerializerNode $parent + * @param string $text + * @return void + */ + public function comment( SerializerNode $parent, $text ) { + return ''; + } + + /** + * Strips out HTML tags leaving bare text, and strips out undesirable elements per configuration. 
+ * + * @param SerializerNode $parent + * @param SerializerNode $node + * @param string $contents + * @return void + */ + public function element( SerializerNode $parent, SerializerNode $node, $contents ) { + // Read CSS classes off the node into an array for later + $nodeClasses = $node->attrs->getValues()['class'] ?? null; + if ( $nodeClasses ) { + $nodeClasses = explode( ' ', $nodeClasses ); + } + + // Strip away elements matching our removal list. This only supports tags and classes. + foreach ( $this->toRemove as $selectorish ) { + $split = explode( '.', $selectorish ); + $tagName = array_shift( $split ); + + if ( $tagName !== '' && $node->name !== $tagName ) { + continue; + } + + if ( $split && ( !$nodeClasses || array_diff( $split, $nodeClasses ) ) ) { + continue; + } + + return ''; + } + + return $contents; + } + + /** + * Skips document starter tags. + * + * @param string $fragmentNamespace + * @param string $fragmentName + * @return void + */ + public function startDocument( $fragmentNamespace, $fragmentName ) { + return ''; + } + }; + + // Preserve only the first section + if ( preg_match( '/^.*?(?=execute( [ + 'fragmentNamespace' => HTMLData::NS_HTML, + 'fragmentName' => 'body', + ] ); + + return trim( $serializer->getResult() ); + } +} diff --git a/includes/ServiceWiring.php b/includes/ServiceWiring.php index 3358d00..26ff129 100644 --- a/includes/ServiceWiring.php +++ b/includes/ServiceWiring.php @@ -1,13 +1,14 @@ static function ( MediaWikiServices $services ): DescriptionProvider { - return new SimpleDescriptionProvider(); + // The provider implementation is determined by the $wgDescriptionAlgorithm variable. 
+ $class = $services->getMainConfig()->get( 'DescriptionAlgorithm' ); + return new $class( $services->getMainConfig() ); }, ]; diff --git a/includes/SimpleDescriptionProvider.php b/includes/SimpleDescriptionProvider.php index c94f08b..be9548f 100644 --- a/includes/SimpleDescriptionProvider.php +++ b/includes/SimpleDescriptionProvider.php @@ -2,7 +2,14 @@ namespace MediaWiki\Extension\Description2; +use Config; + class SimpleDescriptionProvider implements DescriptionProvider { + /** + * @param Config $config + */ + public function __construct( Config $config ) { + } /** * Extracts description from the HTML representation of a page.