Extract description algorithm into a new class

Separate the description extraction algorithm from the integration code. This results in a slightly cleaner code structure (at least in my opinion) and enables adding alternative algorithms without devolving into spaghetti. The DescriptionProvider (the name of the new base interface) is exposed as a service through dependency injection to avoid factories. The implementation can be swapped at service instantiation time. Depends-On: I73c61ce045dcf31ac1ca5888f1548de8fd8b56ff Change-Id: I97fd065c9554837747021ba9fff26005e33270f4
2023-12-04 20:54:13 +01:00 · 2023-12-04 20:54:13 +01:00 · b73fe26c29
parent 43eb47183e
commit b73fe26c29
6 changed files with 75 additions and 19 deletions
--- a/extension.json
+++ b/extension.json
@ -17,6 +17,9 @@
 	"ConfigRegistry": {
 		"Description2": "GlobalVarConfig::newInstance"
 	},
 	"ServiceWiringFiles": [
 		"includes/ServiceWiring.php"
 	],
 	"AutoloadNamespaces": {
 		"MediaWiki\\Extension\\Description2\\": "includes/"
 	},
@ -32,7 +35,8 @@
 		"Description2": {
 			"class": "MediaWiki\\Extension\\Description2\\Hooks",
 			"services": [
-				"ConfigFactory"
+				"ConfigFactory",
 				"Description2.DescriptionProvider"
 			]
 		}
 	},
--- a/includes/Description2.php
+++ b/includes/Description2.php
@ -17,7 +17,6 @@ use PPFrame;
 */
 class Description2 {
 	/**
 	 * @param Parser $parser The parser.
 	 * @param string $desc The description text.
--- a/includes/DescriptionProvider.php
+++ b/includes/DescriptionProvider.php
@ -0,0 +1,13 @@
 <?php
 namespace MediaWiki\Extension\Description2;
 /**
  * Contract for algorithms that derive a page description from the page's HTML.
  *
  * An implementation is registered as the 'Description2.DescriptionProvider' service.
  */
 interface DescriptionProvider {
 	/**
 	 * Extracts description from the HTML representation of a page.
 	 *
 	 * @param string $text HTML to extract the description from.
 	 * @return ?string The extracted description, or null when none could be derived.
 	 */
 	public function derive( string $text ): ?string;
 }
--- a/includes/Hooks.php
+++ b/includes/Hooks.php
@ -25,15 +25,21 @@ class Hooks implements
 	\MediaWiki\Hook\OutputPageParserOutputHook
 {
 	/** @var Config */
 	private Config $config;
 	/** @var DescriptionProvider */
 	private DescriptionProvider $descriptionProvider;
 	/**
 	 * @param ConfigFactory $configFactory Used to build the 'Description2' config.
 	 * @param DescriptionProvider $descriptionProvider Algorithm used to derive a description from page HTML.
 	 */
 	public function __construct(
-		ConfigFactory $configFactory
+		ConfigFactory $configFactory,
 		DescriptionProvider $descriptionProvider
 	) {
 		$this->config = $configFactory->makeConfig( 'Description2' );
 		$this->descriptionProvider = $descriptionProvider;
 	}
 	/**
@ -43,22 +49,7 @@ class Hooks implements
 	 * @return bool
 	 */
 	public function onParserAfterTidy( $parser, &$text ) {
-		$desc = '';
+		$desc = $this->descriptionProvider->derive( $text );
 		$pattern = '%<table\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?table\b)<[^<]*+)*+)*+</table>%i';
 		$myText = preg_replace( $pattern, '', $text );
 		$paragraphs = [];
 		if ( preg_match_all( '#<p>.*?</p>#is', $myText, $paragraphs ) ) {
 			foreach ( $paragraphs[0] as $paragraph ) {
 				$paragraph = trim( strip_tags( $paragraph ) );
 				if ( !$paragraph ) {
 					continue;
 				}
 				$desc = $paragraph;
 				break;
 			}
 		}
 		if ( $desc ) {
 			Description2::setDescription( $parser, $desc );
--- a/includes/ServiceWiring.php
+++ b/includes/ServiceWiring.php
@ -0,0 +1,13 @@
 <?php
 use MediaWiki\Extension\Description2\DescriptionProvider;
 use MediaWiki\Extension\Description2\SimpleDescriptionProvider;
 use MediaWiki\MediaWikiServices;
 // Service wiring for the Description2 extension: maps each service name to its factory.
 return [
 	// The description extraction algorithm; swap the instantiated class here to change it.
 	'Description2.DescriptionProvider' =>
 		static fn ( MediaWikiServices $services ): DescriptionProvider => new SimpleDescriptionProvider(),
 ];
--- a/includes/SimpleDescriptionProvider.php
+++ b/includes/SimpleDescriptionProvider.php
@ -0,0 +1,36 @@
 <?php
 namespace MediaWiki\Extension\Description2;
 /**
  * Description provider that uses the first non-empty paragraph outside of any table.
  */
 class SimpleDescriptionProvider implements DescriptionProvider {
 	/**
 	 * Extracts description from the HTML representation of a page.
 	 *
 	 * The algorithm:
 	 * 1. Removes all <table> elements and their contents.
 	 * 2. Selects all <p> elements (only plain `<p>` tags without attributes are matched).
 	 * 3. Iterates over those paragraphs, strips out all HTML tags and trims white-space around.
 	 * 4. Then the first non-empty paragraph is picked as the result.
 	 *
 	 * @param string $text HTML to extract the description from.
 	 * @return ?string Plain text of the first non-empty paragraph, or null if there is none
 	 *  (or if the table-stripping regex fails).
 	 */
 	public function derive( string $text ): ?string {
 		// The pattern recurses via (?R) so nested tables are removed along with their parent.
 		$pattern = '%<table\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?table\b)<[^<]*+)*+)*+</table>%i';
 		$myText = preg_replace( $pattern, '', $text );
 		if ( $myText === null ) {
 			// preg_replace() returns null on a PCRE error (e.g. a backtrack/recursion limit).
 			// Bail out explicitly instead of passing null on to preg_match_all().
 			return null;
 		}

 		$paragraphs = [];
 		if ( preg_match_all( '#<p>.*?</p>#is', $myText, $paragraphs ) ) {
 			foreach ( $paragraphs[0] as $paragraph ) {
 				$paragraph = trim( strip_tags( $paragraph ) );
 				// Truthiness check on purpose: both '' and '0' are skipped, matching the
 				// truthiness test the caller applies to the result.
 				if ( !$paragraph ) {
 					continue;
 				}
 				return $paragraph;
 			}
 		}

 		return null;
 	}
 }