2019-03-19 11:43:25 +00:00
|
|
|
|
<?php
|
|
|
|
|
|
|
|
|
|
namespace TextExtracts;
|
|
|
|
|
|
2021-10-20 18:33:06 +00:00
|
|
|
|
use MediaWiki\MediaWikiServices;
|
2019-03-19 11:21:41 +00:00
|
|
|
|
|
2019-03-19 11:43:25 +00:00
|
|
|
|
/**
 * This class needs to understand HTML as well as plain text. It tries to not break HTML tags, but
 * might break pairs of tags, leaving unclosed tags behind. We can tidy the output to fix
 * this.
 *
 * @license GPL-2.0-or-later
 */
class TextTruncator {
	/**
	 * @var bool Whether to tidy the output
	 */
	private $useTidy;

	/**
	 * @param bool $useTidy Whether truncated output should be passed through the tidy service
	 */
	public function __construct( bool $useTidy ) {
		$this->useTidy = $useTidy;
	}

	/**
	 * Returns no more than the given number of sentences
	 *
	 * When no sentence terminator is found at all, the first line of the text is returned
	 * (trimmed, and without tidying).
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedSentenceCount Maximum number of sentences to extract
	 * @return string
	 */
	public function getFirstSentences( $text, $requestedSentenceCount ) {
		if ( $requestedSentenceCount <= 0 ) {
			return '';
		}

		// Based on code from OpenSearchXml by Brion Vibber
		//
		// The non-ASCII terminators are written as code point escapes on purpose. A bare ASCII
		// "." or "?" in this list would be interpreted as regex syntax (any character, or a
		// dangling quantifier that makes the whole pattern fail to compile), and the full-width
		// and half-width forms are easy to lose when the file is re-encoded or normalized.
		$endchars = [
			// regular ASCII: a full stop not preceded by an uppercase letter (skips initials
			// such as "P.G."), or "!" / "?", each followed by a space, a newline or the end
			'\P{Lu}\.(?=[ \n]|$)',
			'[!?](?=[ \n]|$)',
			// full-width ideographic full-stop (U+3002)
			'\x{3002}',
			// double-width roman forms (U+FF0E, U+FF01, U+FF1F)
			'\x{FF0E}',
			'\x{FF01}',
			'\x{FF1F}',
			// half-width ideographic full stop (U+FF61)
			'\x{FF61}',
		];

		// The trailing "+" folds runs of consecutive terminators ("?!") into a single match
		$regexp = '/(?:' . implode( '|', $endchars ) . ')+/u';
		$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );

		// Both "no match" (0) and "matching failed" (false) end up here
		if ( !$res ) {
			// Just return the first line
			$lines = explode( "\n", $text, 2 );
			return trim( $lines[0] );
		}

		// Pick the terminator that ends the last requested sentence, or the last one available
		$index = min( $requestedSentenceCount, $res ) - 1;
		list( $tail, $offset ) = $matches[0][$index];
		// PCRE returns raw offsets, so using substr() instead of mb_substr()
		$text = substr( $text, 0, $offset ) . $tail;

		return $this->tidy( $text );
	}

	/**
	 * Returns no more than a requested number of characters, preserving words
	 *
	 * The result can be longer than requested: a word (or HTML tag name, including a closing
	 * ">") that straddles the cut-off position is completed instead of being split.
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedLength Maximum number of characters to return
	 * @return string
	 */
	public function getFirstChars( $text, $requestedLength ) {
		if ( $requestedLength <= 0 ) {
			return '';
		}

		$length = mb_strlen( $text );
		if ( $length <= $requestedLength ) {
			return $text;
		}

		// This pattern always matches, just might return an empty string. It picks up the rest
		// of a word or tag name that was cut in half, plus an optional closing ">".
		$pattern = '/^[\w\/]*>?/su';
		preg_match( $pattern, mb_substr( $text, $requestedLength ), $m );
		// Should preg_match() fail, $m has no index 0; fall back to cutting exactly at the limit
		$truncatedText = mb_substr( $text, 0, $requestedLength ) . ( $m[0] ?? '' );
		if ( $truncatedText === $text ) {
			// Nothing was cut off after all, return the input untouched
			return $text;
		}

		return $this->tidy( $truncatedText );
	}

	/**
	 * Passes the text through MediaWiki's tidy service when tidying is enabled, then trims it
	 *
	 * @param string $text
	 * @return string
	 */
	private function tidy( $text ) {
		if ( $this->useTidy ) {
			$text = MediaWikiServices::getInstance()->getTidy()->tidy( $text );
		}

		return trim( $text );
	}

}
|