Merge "Remove style tags in description"

This commit is contained in:
jenkins-bot 2024-04-05 10:07:08 +00:00 committed by Gerrit Code Review
commit 859b31417f

View file

@ -8,7 +8,7 @@ class SimpleDescriptionProvider implements DescriptionProvider {
* Extracts description from the HTML representation of a page.
*
* The algorithm:
* 1. Removes all <table> elements and their contents.
* 1. Removes all <style> and <table> elements and their contents.
* 2. Selects all <p> elements.
* 3. Iterates over those paragraphs, strips out all HTML tags and trims surrounding whitespace.
* 4. Then the first non-empty paragraph is picked as the result.
@ -17,8 +17,12 @@ class SimpleDescriptionProvider implements DescriptionProvider {
* @return string
*/
public function derive( string $text ): ?string {
$pattern = '%<table\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?table\b)<[^<]*+)*+)*+</table>%i';
$myText = preg_replace( $pattern, '', $text );
$myText = $text;
$stripTags = [ 'style', 'table' ];
foreach ( $stripTags as $tag ) {
$pattern = "%<$tag\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?$tag\b)<[^<]*+)*+)*+</$tag>%i";
$myText = preg_replace( $pattern, '', $myText );
}
$paragraphs = [];
if ( preg_match_all( '#<p>.*?</p>#is', $myText, $paragraphs ) ) {