mediawiki-extensions-TextEx.../includes/ExtractFormatter.php
libraryupgrader 7e548ce1b4 build: Updating mediawiki/mediawiki-codesniffer to 0.12.0
Change-Id: I3e1260e19de4a12c995b51a1a4416dbdf87829cf
2017-09-01 04:58:00 +00:00

154 lines
4.3 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace TextExtracts;
use Config;
use DOMElement;
use HtmlFormatter\HtmlFormatter;
/**
* Provides text-only or limited-HTML extracts of page HTML
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*/
class ExtractFormatter extends HtmlFormatter {
const SECTION_MARKER_START = "\1\2";
const SECTION_MARKER_END = "\2\1";
private $plainText;
/**
* @param string $text Text to convert
* @param bool $plainText Whether extract should be plaintext
* @param Config $config
*/
public function __construct( $text, $plainText, Config $config ) {
parent::__construct( HtmlFormatter::wrapHTML( $text ) );
$this->plainText = $plainText;
$this->setRemoveMedia( true );
$this->remove( $config->get( 'ExtractsRemoveClasses' ) );
if ( $plainText ) {
$this->flattenAllTags();
} else {
$this->flatten( [ 'a' ] );
}
}
public function getText( $dummy = null ) {
$this->filterContent();
$text = parent::getText();
if ( $this->plainText ) {
$text = html_entity_decode( $text );
$text = str_replace( "\xC2\xA0", ' ', $text ); // replace nbsp with space
$text = str_replace( "\r", "\n", $text ); // for Windows
$text = preg_replace( "/\n{3,}/", "\n\n", $text ); // normalise newlines
}
return $text;
}
public function onHtmlReady( $html ) {
if ( $this->plainText ) {
$html = preg_replace( '/\s*(<h([1-6])\b)/i',
"\n\n" . self::SECTION_MARKER_START . '$2' . self::SECTION_MARKER_END . '$1',
$html
);
}
return $html;
}
/**
* Returns no more than the given number of sentences
*
* @param string $text
* @param int $requestedSentenceCount
* @return string
*/
public static function getFirstSentences( $text, $requestedSentenceCount ) {
if ( $requestedSentenceCount <= 0 ) {
return '';
}
// Based on code from OpenSearchXml by Brion Vibber
$endchars = [
'[^\p{Lu}]\.(?:[ \n]|$)', '[\!\?](?:[ \n]|$)', // regular ASCII
'。', // full-width ideographic full-stop
'', '', '', // double-width roman forms
'。', // half-width ideographic full stop
];
$endgroup = implode( '|', $endchars );
$regexp = "/($endgroup)+/u";
$matches = [];
$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );
if ( $res ) {
$index = min( $requestedSentenceCount, $res ) - 1;
list( $tail, $length ) = $matches[0][ $index ];
// PCRE returns raw offsets, so using substr() instead of mb_substr()
$text = substr( $text, 0, $length ) . trim( $tail );
} else {
// Just return the first line
$lines = explode( "\n", $text, 2 );
$text = trim( $lines[0] );
}
return $text;
}
/**
* Returns no more than a requested number of characters, preserving words
*
* @param string $text
* @param int $requestedLength
* @return string
*/
public static function getFirstChars( $text, $requestedLength ) {
if ( $requestedLength <= 0 ) {
return '';
}
$length = mb_strlen( $text );
if ( $length <= $requestedLength ) {
return $text;
}
$pattern = "#^[\\w/]*>?#su";
preg_match( $pattern, mb_substr( $text, $requestedLength ), $m );
return mb_substr( $text, 0, $requestedLength ) . $m[0];
}
/**
* Removes content we've chosen to remove then removes class and style
* attributes from the remaining span elements.
*
* @return array Array of removed DOMElements
*/
public function filterContent() {
$removed = parent::filterContent();
$doc = $this->getDoc();
$spans = $doc->getElementsByTagName( 'span' );
/** @var DOMElement $span */
foreach ( $spans as $span ) {
$span->removeAttribute( 'class' );
$span->removeAttribute( 'style' );
}
return $removed;
}
}