diff --git a/extension.json b/extension.json index e89cdbc..42d5432 100644 --- a/extension.json +++ b/extension.json @@ -9,11 +9,32 @@ "license-name": "GPL-2.0-or-later", "type": "other", "requires": { - "MediaWiki": ">= 1.35.0" + "MediaWiki": ">= 1.38.0" }, "config": { "EnableMetaDescriptionFunctions": false, - "DescriptionMaxChars": 300 + "DescriptionMaxChars": 300, + "DescriptionAlgorithm": "MediaWiki\\Extension\\Description2\\RemexDescriptionProvider", + "DescriptionRemoveElements": [ + "table", + "div", + "script", + "style", + "figure", + "input", + "div.infobox", + "div.ambox", + "div.dmbox", + "ul.gallery", + ".mw-editsection", + "sup.reference", + "ul.references", + ".error", + ".nomobile", + ".noprint", + ".noexcerpt", + ".sortkey" + ] }, "ConfigRegistry": { "Description2": "GlobalVarConfig::newInstance" diff --git a/includes/Description2.php b/includes/Description2.php index 6794e5d..3b6ce92 100644 --- a/includes/Description2.php +++ b/includes/Description2.php @@ -23,18 +23,10 @@ class Description2 { */ public static function setDescription( Parser $parser, $desc ) { $parserOutput = $parser->getOutput(); - if ( method_exists( $parserOutput, 'getPageProperty' ) ) { - // MW 1.38+ - if ( $parserOutput->getPageProperty( 'description' ) !== null ) { - return; - } - $parserOutput->setPageProperty( 'description', $desc ); - } else { - if ( $parserOutput->getProperty( 'description' ) !== false ) { - return; - } - $parserOutput->setProperty( 'description', $desc ); + if ( $parserOutput->getPageProperty( 'description' ) !== null ) { + return; } + $parserOutput->setPageProperty( 'description', $desc ); } /** diff --git a/includes/RemexDescriptionProvider.php b/includes/RemexDescriptionProvider.php new file mode 100644 index 0000000..1fc42fa --- /dev/null +++ b/includes/RemexDescriptionProvider.php @@ -0,0 +1,133 @@ +toRemove = $config->get( 'DescriptionRemoveElements' ); + } + + /** + * Extracts description from the HTML representation of a page. 
+ * + * This algorithm: + * 1. Looks for the first heading (potentially included in the ToC) and cuts the text to avoid unnecessary + * server load. + * 2. Invokes RemexHTML to parse and reserialise the HTML representation. + * 3. Comments are excluded. + * 4. HTML elements are filtered by tag name and the 'class' attribute. Removals are dictated through the + * $wgDescriptionRemoveElements config variable. + * 5. HTML tags are stripped, and only text is preserved. + * 6. Strips whitespace around the extract. + * + * This is more costly than the SimpleDescriptionProvider but is far more flexible and easier to manipulate by + * editors. + * + * @param string $text + * @return string|null + */ + public function derive( string $text ): ?string { + $formatter = new class( $options = [], $this->toRemove ) extends HtmlFormatter { + + /** @var string[] */ + private array $toRemove; + + /** + * @param array $options + * @param array $toRemove + */ + public function __construct( $options, array $toRemove ) { + parent::__construct( $options ); + $this->toRemove = $toRemove; + } + + /** + * Skips comments. + * + * @param SerializerNode $parent + * @param string $text + * @return string + */ + public function comment( SerializerNode $parent, $text ) { + return ''; + } + + /** + * Strips out HTML tags leaving bare text, and strips out undesirable elements per configuration. + * + * @param SerializerNode $parent + * @param SerializerNode $node + * @param string $contents + * @return string + */ + public function element( SerializerNode $parent, SerializerNode $node, $contents ) { + // Read CSS classes off the node into an array for later + $nodeClasses = $node->attrs->getValues()['class'] ?? null; + if ( $nodeClasses ) { + $nodeClasses = explode( ' ', $nodeClasses ); + } + + // Strip away elements matching our removal list. This only supports tags and classes. 
+ foreach ( $this->toRemove as $selectorish ) { + $split = explode( '.', $selectorish ); + $tagName = array_shift( $split ); + + if ( $tagName !== '' && $node->name !== $tagName ) { + continue; + } + + if ( $split && ( !$nodeClasses || array_diff( $split, $nodeClasses ) ) ) { + continue; + } + + return ''; + } + + return $contents; + } + + /** + * Skips document starter tags. + * + * @param string $fragmentNamespace + * @param string $fragmentName + * @return string + */ + public function startDocument( $fragmentNamespace, $fragmentName ) { + return ''; + } + }; + + // Preserve only the first section + if ( preg_match( '/^.*?(?=execute( [ + 'fragmentNamespace' => HTMLData::NS_HTML, + 'fragmentName' => 'body', + ] ); + + return trim( $serializer->getResult() ); + } +} diff --git a/includes/ServiceWiring.php b/includes/ServiceWiring.php index 3358d00..26ff129 100644 --- a/includes/ServiceWiring.php +++ b/includes/ServiceWiring.php @@ -1,13 +1,14 @@ static function ( MediaWikiServices $services ): DescriptionProvider { - return new SimpleDescriptionProvider(); + // The provider implementation is determined by the $wgDescriptionAlgorithm variable. + $class = $services->getMainConfig()->get( 'DescriptionAlgorithm' ); + return new $class( $services->getMainConfig() ); }, ]; diff --git a/includes/SimpleDescriptionProvider.php b/includes/SimpleDescriptionProvider.php index c94f08b..be9548f 100644 --- a/includes/SimpleDescriptionProvider.php +++ b/includes/SimpleDescriptionProvider.php @@ -2,7 +2,14 @@ namespace MediaWiki\Extension\Description2; +use Config; + class SimpleDescriptionProvider implements DescriptionProvider { + /** + * @param Config $config Unused in this provider; ServiceWiring passes the main config to whichever provider class is configured + */ + public function __construct( Config $config ) { + } /** * Extracts description from the HTML representation of a page.