Extract description algorithm into a new class

Separate the description extraction algorithm from the integration code. This results in a slightly cleaner code structure (at least in my opinion) and enables adding alternate algorithms without devolving into spaghetti.

The DescriptionProvider (name of the new base interface) is exposed as a service through dependency injection to avoid factories. The implementation can be swapped at service instantiation time.

Depends-On: I73c61ce045dcf31ac1ca5888f1548de8fd8b56ff
Change-Id: I97fd065c9554837747021ba9fff26005e33270f4
This commit is contained in:
alex4401 2023-12-04 20:54:13 +01:00 committed by Alex44019
parent 43eb47183e
commit b73fe26c29
6 changed files with 75 additions and 19 deletions

View file

@ -17,6 +17,9 @@
"ConfigRegistry": { "ConfigRegistry": {
"Description2": "GlobalVarConfig::newInstance" "Description2": "GlobalVarConfig::newInstance"
}, },
"ServiceWiringFiles": [
"includes/ServiceWiring.php"
],
"AutoloadNamespaces": { "AutoloadNamespaces": {
"MediaWiki\\Extension\\Description2\\": "includes/" "MediaWiki\\Extension\\Description2\\": "includes/"
}, },
@ -32,7 +35,8 @@
"Description2": { "Description2": {
"class": "MediaWiki\\Extension\\Description2\\Hooks", "class": "MediaWiki\\Extension\\Description2\\Hooks",
"services": [ "services": [
"ConfigFactory" "ConfigFactory",
"Description2.DescriptionProvider"
] ]
} }
}, },

View file

@ -17,7 +17,6 @@ use PPFrame;
*/ */
class Description2 { class Description2 {
/** /**
* @param Parser $parser The parser. * @param Parser $parser The parser.
* @param string $desc The description text. * @param string $desc The description text.

View file

@ -0,0 +1,13 @@
<?php
namespace MediaWiki\Extension\Description2;
/**
 * Contract for algorithms that derive a page description from rendered HTML.
 */
interface DescriptionProvider {
	/**
	 * Extracts description from the HTML representation of a page.
	 *
	 * @param string $text HTML to extract the description from.
	 * @return ?string The extracted description, or null when none could be derived.
	 */
	public function derive( string $text ): ?string;
}

View file

@ -25,15 +25,21 @@ class Hooks implements
\MediaWiki\Hook\OutputPageParserOutputHook \MediaWiki\Hook\OutputPageParserOutputHook
{ {
/** @var Config */
private Config $config; private Config $config;
/** @var DescriptionProvider */
private DescriptionProvider $descriptionProvider;
/** /**
* @param ConfigFactory $configFactory * @param ConfigFactory $configFactory
*/ */
public function __construct( public function __construct(
ConfigFactory $configFactory ConfigFactory $configFactory,
DescriptionProvider $descriptionProvider
) { ) {
$this->config = $configFactory->makeConfig( 'Description2' ); $this->config = $configFactory->makeConfig( 'Description2' );
$this->descriptionProvider = $descriptionProvider;
} }
/** /**
@ -43,22 +49,7 @@ class Hooks implements
* @return bool * @return bool
*/ */
public function onParserAfterTidy( $parser, &$text ) { public function onParserAfterTidy( $parser, &$text ) {
$desc = ''; $desc = $this->descriptionProvider->derive( $text );
$pattern = '%<table\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?table\b)<[^<]*+)*+)*+</table>%i';
$myText = preg_replace( $pattern, '', $text );
$paragraphs = [];
if ( preg_match_all( '#<p>.*?</p>#is', $myText, $paragraphs ) ) {
foreach ( $paragraphs[0] as $paragraph ) {
$paragraph = trim( strip_tags( $paragraph ) );
if ( !$paragraph ) {
continue;
}
$desc = $paragraph;
break;
}
}
if ( $desc ) { if ( $desc ) {
Description2::setDescription( $parser, $desc ); Description2::setDescription( $parser, $desc );

View file

@ -0,0 +1,13 @@
<?php
use MediaWiki\Extension\Description2\DescriptionProvider;
use MediaWiki\Extension\Description2\SimpleDescriptionProvider;
use MediaWiki\MediaWikiServices;
// Service wiring: maps each service name to the factory callback that instantiates it.
return [
	// The description extraction algorithm; swap the class instantiated here to change it.
	'Description2.DescriptionProvider' =>
		static fn ( MediaWikiServices $services ): DescriptionProvider => new SimpleDescriptionProvider(),
];

View file

@ -0,0 +1,36 @@
<?php
namespace MediaWiki\Extension\Description2;
/**
 * Description provider that picks the first non-empty paragraph of a page.
 */
class SimpleDescriptionProvider implements DescriptionProvider {
	/**
	 * Extracts description from the HTML representation of a page.
	 *
	 * The algorithm:
	 * 1. Removes all <table> elements and their contents.
	 * 2. Selects all <p> elements.
	 * 3. Iterates over those paragraphs, strips out all HTML tags and trims white-space around.
	 * 4. Then the first non-empty paragraph is picked as the result.
	 *
	 * @param string $text HTML to extract the description from.
	 * @return ?string Plain text of the first non-empty paragraph, or null if there is none.
	 */
	public function derive( string $text ): ?string {
		// The pattern recurses via (?R) so that nested tables are removed along with their parent.
		$pattern = '%<table\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?table\b)<[^<]*+)*+)*+</table>%i';
		$myText = preg_replace( $pattern, '', $text );
		if ( $myText === null ) {
			// preg_replace() returns null on a PCRE error (e.g. a backtrack or recursion limit).
			// Bail out explicitly instead of handing null to preg_match_all().
			return null;
		}

		$paragraphs = [];
		if ( !preg_match_all( '#<p>.*?</p>#is', $myText, $paragraphs ) ) {
			return null;
		}

		foreach ( $paragraphs[0] as $paragraph ) {
			$plainText = trim( strip_tags( $paragraph ) );
			// Compare strictly against the empty string: a truthiness check would also
			// discard a paragraph whose entire text is "0".
			if ( $plainText !== '' ) {
				return $plainText;
			}
		}

		return null;
	}
}