2013-12-17 21:20:13 +00:00
|
|
|
<?php
|
|
|
|
|
2014-12-31 23:02:05 +00:00
|
|
|
namespace TextExtracts;
|
2016-04-13 04:23:01 +00:00
|
|
|
|
2016-09-23 01:51:12 +00:00
|
|
|
use DOMElement;
|
2016-04-13 04:23:01 +00:00
|
|
|
use HtmlFormatter\HtmlFormatter;
|
2014-12-31 23:02:05 +00:00
|
|
|
|
2013-12-17 21:20:13 +00:00
|
|
|
/**
 * Provides text-only or limited-HTML extracts of page HTML
 *
 * The input HTML is handed to the parent HtmlFormatter; this subclass then
 * post-processes the result: in plaintext mode entities are decoded,
 * whitespace is normalised and headings are prefixed with section markers,
 * and in both modes `class` and `style` attributes are stripped from spans.
 *
 * @license GPL-2.0-or-later
 */
class ExtractFormatter extends HtmlFormatter {
	/**
	 * Control-character sequence (bytes 0x01 0x02) that onHtmlReady() inserts
	 * immediately before the heading level digit in plaintext mode.
	 */
	public const SECTION_MARKER_START = "\1\2";

	/**
	 * Control-character sequence (bytes 0x02 0x01) that onHtmlReady() inserts
	 * immediately after the heading level digit in plaintext mode.
	 */
	public const SECTION_MARKER_END = "\2\1";

	/**
	 * Whether the extract is produced as plaintext rather than limited HTML.
	 *
	 * @var bool
	 */
	private $plainText;

	/**
	 * @param string $text Text to convert
	 * @param bool $plainText Whether extract should be plaintext
	 */
	public function __construct( $text, $plainText ) {
		parent::__construct( HtmlFormatter::wrapHTML( $text ) );
		$this->plainText = $plainText;

		$this->setRemoveMedia( true );

		if ( $plainText ) {
			// Plaintext extracts keep no markup at all.
			$this->flattenAllTags();
		} else {
			// Limited-HTML extracts only have their links flattened.
			$this->flatten( [ 'a' ] );
		}
	}

	/**
	 * Performs final transformations (such as newline replacement for plaintext
	 * option) and returns resulting HTML.
	 *
	 * In plaintext mode the text is entity-decoded, non-breaking spaces become
	 * ordinary spaces, every carriage return becomes a newline and runs of
	 * three or more newlines are collapsed to two. The result is trimmed in
	 * both modes.
	 *
	 * @param DOMElement|string|null $element ID of element to get HTML from.
	 * Ignored
	 * @return string Processed HTML
	 */
	public function getText( $element = null ): string {
		$this->filterContent();
		$text = parent::getText();
		if ( $this->plainText ) {
			$text = html_entity_decode( $text );
			// replace nbsp with space
			$text = str_replace( "\u{00A0}", ' ', $text );
			// for Windows
			$text = str_replace( "\r", "\n", $text );
			// normalise newlines; preg_replace() returns null on a PCRE
			// failure, in which case the un-collapsed text is kept rather
			// than discarding the whole extract
			$text = preg_replace( "/\n{3,}/", "\n\n", $text ) ?? $text;
		}
		return trim( $text );
	}

	/**
	 * In plaintext mode, prefixes every heading with a section marker.
	 *
	 * Any whitespace directly before the start of an `<h1>`..`<h6>` tag
	 * (matched case-insensitively) is replaced by two newlines followed by
	 * SECTION_MARKER_START, the heading level digit and SECTION_MARKER_END;
	 * the tag itself is left in place. In HTML mode the input is returned
	 * unchanged.
	 *
	 * @param string $html HTML string to process
	 * @return string Processed HTML
	 */
	public function onHtmlReady( string $html ): string {
		if ( $this->plainText ) {
			$marked = preg_replace( '/\s*(<h([1-6])\b)/i',
				"\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
				$html
			);
			// preg_replace() returns null on a PCRE failure; returning that
			// would violate the declared string return type, so fall back to
			// the unmodified HTML instead.
			if ( $marked !== null ) {
				$html = $marked;
			}
		}
		return $html;
	}

	/**
	 * Removes content we've chosen to remove then removes class and style
	 * attributes from the remaining span elements.
	 *
	 * @return array Array of removed DOMElements
	 */
	public function filterContent(): array {
		$removed = parent::filterContent();

		$doc = $this->getDoc();
		$spans = $doc->getElementsByTagName( 'span' );

		/** @var DOMElement $span */
		foreach ( $spans as $span ) {
			$span->removeAttribute( 'class' );
			$span->removeAttribute( 'style' );
		}

		return $removed;
	}
}
|