2023-12-04 19:54:13 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace MediaWiki\Extension\Description2;
|
|
|
|
|
2024-01-02 17:33:05 +00:00
|
|
|
use Config;
|
|
|
|
|
2023-12-04 19:54:13 +00:00
|
|
|
class SimpleDescriptionProvider implements DescriptionProvider {
	/**
	 * @param Config $config Not read by this provider.
	 */
	public function __construct( Config $config ) {
	}

	/**
	 * Extracts description from the HTML representation of a page.
	 *
	 * The algorithm:
	 * 1. Removes all <style> and <table> elements and their contents.
	 * 2. Selects all <p> elements (only bare "<p>" opening tags, without attributes, are matched).
	 * 3. Iterates over those paragraphs, strips out all HTML tags and trims white-space around.
	 * 4. Then the first non-empty paragraph is picked as the result.
	 *
	 * @param string $text HTML to derive the description from.
	 * @return string|null Text of the first non-empty paragraph, or null when there is none
	 *  (or when the element stripping fails with a PCRE error).
	 */
	public function derive( string $text ): ?string {
		$myText = $text;
		$stripTags = [ 'style', 'table' ];
		foreach ( $stripTags as $tag ) {
			// The (?R) recursion lets an element containing nested elements of the
			// same tag be matched, and therefore removed, as a single unit.
			$pattern = "%<$tag\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?$tag\b)<[^<]*+)*+)*+</$tag>%i";
			$myText = preg_replace( $pattern, '', $myText );
			if ( $myText === null ) {
				// preg_replace() yields null on a PCRE error (e.g. a backtrack or
				// recursion limit). Bail out explicitly instead of handing null to
				// preg_match_all(), which is deprecated as of PHP 8.1.
				return null;
			}
		}

		$paragraphs = [];
		if ( preg_match_all( '#<p>.*?</p>#is', $myText, $paragraphs ) ) {
			foreach ( $paragraphs[0] as $paragraph ) {
				$paragraph = trim( strip_tags( $paragraph ) );
				// Compare strictly against the empty string: a loose truthiness check
				// would also discard a paragraph whose whole text is "0".
				if ( $paragraph === '' ) {
					continue;
				}
				return $paragraph;
			}
		}

		return null;
	}
}
|