feat: remove style tags

This commit is contained in:
OfficialCRUGG 2024-02-02 15:08:26 +01:00
parent cdb35a545d
commit 1499b9aa3c

View file

@@ -8,7 +8,7 @@ class SimpleDescriptionProvider implements DescriptionProvider {
* Extracts description from the HTML representation of a page. * Extracts description from the HTML representation of a page.
* *
* The algorithm: * The algorithm:
* 1. Removes all <table> elements and their contents. * 1. Removes all <table> and <style> elements and their contents.
* 2. Selects all <p> elements. * 2. Selects all <p> elements.
* 3. Iterates over those paragraphs, strips out all HTML tags and trims white-space around. * 3. Iterates over those paragraphs, strips out all HTML tags and trims white-space around.
* 4. Then the first non-empty paragraph is picked as the result. * 4. Then the first non-empty paragraph is picked as the result.
@@ -17,7 +17,7 @@ class SimpleDescriptionProvider implements DescriptionProvider {
* @return string * @return string
*/ */
public function derive( string $text ): ?string { public function derive( string $text ): ?string {
$pattern = '%<table\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?table\b)<[^<]*+)*+)*+</table>%i'; $pattern = '%<(table|style)\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?(table|style)\b)<[^<]*+)*+)*+</(table|style)>%i';
$myText = preg_replace( $pattern, '', $text ); $myText = preg_replace( $pattern, '', $text );
$paragraphs = []; $paragraphs = [];