mediawiki-extensions-TextEx.../includes/TextTruncator.php
C. Scott Ananian c1397847c0 Tidy is no longer configurable in MW 1.35
Remove use of deprecated MWTidy::isEnabled() and internal
MWTidy::singleton() methods.  See I3584181070da7ed4888beaaf04e083114aca1eab
for context.

Bug: T198214
Change-Id: I511068cc7b2398773a837f66e08def206cbb5626
2020-05-02 01:31:21 -04:00

109 lines
2.5 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace TextExtracts;
use MWTidy;
/**
* This class needs to understand HTML as well as plain text. It tries to not break HTML tags, but
* might break pairs of tags, leaving unclosed tags behind. We can tidy the output to fix
* this.
*
* @license GPL-2.0-or-later
*/
class TextTruncator {
/**
* @var bool Whether to tidy the output
*/
private $useTidy;
/**
* @param bool $useTidy
*/
public function __construct( bool $useTidy ) {
$this->useTidy = $useTidy;
}
/**
* Returns no more than the given number of sentences
*
* @param string $text Source text to extract from
* @param int $requestedSentenceCount Maximum number of sentences to extract
* @return string
*/
public function getFirstSentences( $text, $requestedSentenceCount ) {
if ( $requestedSentenceCount <= 0 ) {
return '';
}
// Based on code from OpenSearchXml by Brion Vibber
$endchars = [
// regular ASCII
'\P{Lu}\.(?=[ \n]|$)',
'[!?](?=[ \n]|$)',
// full-width ideographic full-stop
'。',
// double-width roman forms
'',
'',
'',
// half-width ideographic full stop
'。',
];
$regexp = '/(?:' . implode( '|', $endchars ) . ')+/u';
$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );
if ( !$res ) {
// Just return the first line
$lines = explode( "\n", $text, 2 );
return trim( $lines[0] );
}
$index = min( $requestedSentenceCount, $res ) - 1;
list( $tail, $length ) = $matches[0][$index];
// PCRE returns raw offsets, so using substr() instead of mb_substr()
$text = substr( $text, 0, $length ) . $tail;
return $this->tidy( $text );
}
/**
* Returns no more than a requested number of characters, preserving words
*
* @param string $text Source text to extract from
* @param int $requestedLength Maximum number of characters to return
* @return string
*/
public function getFirstChars( $text, $requestedLength ) {
if ( $requestedLength <= 0 ) {
return '';
}
$length = mb_strlen( $text );
if ( $length <= $requestedLength ) {
return $text;
}
// This ungreedy pattern always matches, just might return an empty string
$pattern = '/^[\w\/]*>?/su';
preg_match( $pattern, mb_substr( $text, $requestedLength ), $m );
$text = mb_substr( $text, 0, $requestedLength ) . $m[0];
return $this->tidy( $text );
}
/**
* @param string $text
* @return string
*/
private function tidy( $text ) {
if ( $this->useTidy ) {
$text = MWTidy::tidy( $text );
}
return trim( $text );
}
}