mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/TextExtracts
synced 2024-11-30 19:04:38 +00:00
cd8fa3284d
Bug: T330528
Change-Id: I496d84fab3ee8feee5891ca984f37e90837d857c
(cherry picked from commit 47172479bb)
97 lines
2.2 KiB
PHP
97 lines
2.2 KiB
PHP
<?php
|
|
|
|
namespace TextExtracts;
|
|
|
|
use DOMElement;
|
|
use HtmlFormatter\HtmlFormatter;
|
|
|
|
/**
 * Provides text-only or limited-HTML extracts of page HTML
 *
 * In plain-text mode, every heading tag (h1-h6) is preceded by a marker of the
 * form SECTION_MARKER_START . <heading level> . SECTION_MARKER_END.
 *
 * @license GPL-2.0-or-later
 */
class ExtractFormatter extends HtmlFormatter {
	/** Byte sequence (0x01 0x02) emitted before the heading level in plain-text output */
	public const SECTION_MARKER_START = "\1\2";
	/** Byte sequence (0x02 0x01) emitted after the heading level in plain-text output */
	public const SECTION_MARKER_END = "\2\1";

	/**
	 * @var bool Whether the extract is produced as plain text
	 */
	private $plainText;

	/**
	 * @param string $text Text to convert
	 * @param bool $plainText Whether extract should be plaintext
	 */
	public function __construct( $text, $plainText ) {
		parent::__construct( HtmlFormatter::wrapHTML( $text ) );
		$this->plainText = $plainText;

		$this->setRemoveMedia( true );

		// Plain text flattens every tag; otherwise only links are flattened.
		if ( $plainText ) {
			$this->flattenAllTags();
		} else {
			$this->flatten( [ 'a' ] );
		}
	}

	/**
	 * Performs final transformations (such as newline replacement for plaintext
	 * option) and returns resulting HTML.
	 *
	 * @param DOMElement|string|null $element ID of element to get HTML from.
	 * Ignored
	 * @return string Processed HTML, trimmed of surrounding whitespace
	 */
	public function getText( $element = null ): string {
		$this->filterContent();
		$text = parent::getText();
		if ( $this->plainText ) {
			// The default flags of html_entity_decode() changed in PHP 8.1; pin them (and the
			// charset) so that the output does not depend on the PHP version or ini settings.
			$text = html_entity_decode( $text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML401, 'UTF-8' );
			// replace nbsp with space
			$text = str_replace( "\u{00A0}", ' ', $text );
			// for Windows
			$text = str_replace( "\r", "\n", $text );
			// normalise newlines; preg_replace() returns null on a PCRE error, in which
			// case the un-normalised text is kept rather than discarding the extract
			$text = preg_replace( "/\n{3,}/", "\n\n", $text ) ?? $text;
		}
		return trim( $text );
	}

	/**
	 * In plain-text mode, replaces any whitespace in front of an opening heading
	 * tag (h1-h6) with two newlines followed by a section marker that carries
	 * the heading level. Returns the HTML unchanged otherwise.
	 *
	 * @param string $html HTML string to process
	 * @return string Processed HTML
	 */
	public function onHtmlReady( string $html ): string {
		if ( $this->plainText ) {
			$marked = preg_replace( '/\s*(<h([1-6])\b)/i',
				"\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
				$html
			);
			// preg_replace() returns null on a PCRE error; returning that would violate the
			// declared string return type, so fall back to the unmodified HTML.
			$html = $marked ?? $html;
		}
		return $html;
	}

	/**
	 * Removes content we've chosen to remove then removes class and style
	 * attributes from the remaining span elements.
	 *
	 * @return array Array of removed DOMElements
	 */
	public function filterContent(): array {
		$removed = parent::filterContent();

		$doc = $this->getDoc();
		$spans = $doc->getElementsByTagName( 'span' );

		/** @var DOMElement $span */
		foreach ( $spans as $span ) {
			$span->removeAttribute( 'class' );
			$span->removeAttribute( 'style' );
		}

		return $removed;
	}
}
|