feat: remove style tags
This commit is contained in:
parent
cdb35a545d
commit
1499b9aa3c
|
@ -8,7 +8,7 @@ class SimpleDescriptionProvider implements DescriptionProvider {
|
||||||
* Extracts description from the HTML representation of a page.
|
* Extracts description from the HTML representation of a page.
|
||||||
*
|
*
|
||||||
* The algorithm:
|
* The algorithm:
|
||||||
* 1. Removes all <table> elements and their contents.
|
* 1. Removes all <table> and <style> elements and their contents.
|
||||||
* 2. Selects all <p> elements.
|
* 2. Selects all <p> elements.
|
||||||
* 3. Iterates over those paragraphs, strips out all HTML tags and trims surrounding whitespace.
|
* 3. Iterates over those paragraphs, strips out all HTML tags and trims surrounding whitespace.
|
||||||
* 4. Then the first non-empty paragraph is picked as the result.
|
* 4. Then the first non-empty paragraph is picked as the result.
|
||||||
|
@ -17,7 +17,7 @@ class SimpleDescriptionProvider implements DescriptionProvider {
|
||||||
* @return string|null
|
* @return string|null
|
||||||
*/
|
*/
|
||||||
public function derive( string $text ): ?string {
|
public function derive( string $text ): ?string {
|
||||||
$pattern = '%<table\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?table\b)<[^<]*+)*+)*+</table>%i';
|
$pattern = '%<(table|style)\b[^>]*+>(?:(?R)|[^<]*+(?:(?!</?(table|style)\b)<[^<]*+)*+)*+</(table|style)>%i';
|
||||||
$myText = preg_replace( $pattern, '', $text );
|
$myText = preg_replace( $pattern, '', $text );
|
||||||
|
|
||||||
$paragraphs = [];
|
$paragraphs = [];
|
||||||
|
|
Loading…
Reference in a new issue