mediawiki-extensions-TextEx.../includes/TextTruncator.php
alistair3149 cc8cf471fc
Use extension namespace for TextExtracts
Change-Id: I01177e9bef0f25b6245ee3e93f605dc771642273
2024-07-12 18:46:33 -04:00

112 lines
2.6 KiB
PHP
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
namespace MediaWiki\Extension\TextExtracts;
use MediaWiki\MediaWikiServices;
/**
 * Truncates text to a maximum number of sentences or characters.
 *
 * This class needs to understand HTML as well as plain text. It tries to not break HTML tags, but
 * might break pairs of tags, leaving unclosed tags behind. We can tidy the output to fix
 * this.
 *
 * @license GPL-2.0-or-later
 */
class TextTruncator {

	/**
	 * PCRE alternatives (without delimiters) that each match one sentence terminator.
	 *
	 * Non-ASCII terminators are written as \x{...} escapes instead of literal characters so
	 * that look-alike code points cannot be confused or silently lost when the file is
	 * copied or re-encoded. An empty alternative must never appear here: it would match the
	 * empty string at every offset and make every "sentence" end at offset 0.
	 *
	 * Based on code from OpenSearchXml by Brion Vibber
	 */
	private const SENTENCE_END_PATTERNS = [
		// regular ASCII: a full stop not preceded by an uppercase letter (so initials such
		// as "J. Smith" do not end a sentence), or "!" / "?", each followed by a space,
		// a newline or the end of the text
		'\P{Lu}\.(?=[ \n]|$)',
		'[!?](?=[ \n]|$)',
		// full-width ideographic full-stop (U+3002)
		'\x{3002}',
		// double-width roman forms: fullwidth full stop (U+FF0E),
		// fullwidth exclamation mark (U+FF01), fullwidth question mark (U+FF1F)
		'\x{FF0E}',
		'\x{FF01}',
		'\x{FF1F}',
		// half-width ideographic full stop (U+FF61)
		'\x{FF61}',
	];

	/**
	 * @var bool Whether to tidy the output
	 */
	private $useTidy;

	/**
	 * @param bool $useTidy Whether truncated output is passed through the tidy service
	 */
	public function __construct( bool $useTidy ) {
		$this->useTidy = $useTidy;
	}

	/**
	 * Returns no more than the given number of sentences
	 *
	 * When the text contains no sentence terminator at all, the first line of the text is
	 * returned (trimmed, not tidied).
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedSentenceCount Maximum number of sentences to extract
	 * @return string
	 */
	public function getFirstSentences( $text, $requestedSentenceCount ) {
		if ( $requestedSentenceCount <= 0 ) {
			return '';
		}

		// The trailing "+" merges directly adjacent terminator matches into a single match
		$regexp = '/(?:' . implode( '|', self::SENTENCE_END_PATTERNS ) . ')+/u';
		$res = preg_match_all( $regexp, $text, $matches, PREG_OFFSET_CAPTURE );
		if ( !$res ) {
			// No terminator found (0) or matching failed (false): just return the first line
			$lines = explode( "\n", $text, 2 );
			return trim( $lines[0] );
		}

		// Pick the terminator of the last wanted sentence, or the last one available
		$index = min( $requestedSentenceCount, $res ) - 1;
		[ $tail, $offset ] = $matches[0][$index];

		// PCRE returns raw byte offsets, so using substr() instead of mb_substr()
		$text = substr( $text, 0, $offset ) . $tail;

		return $this->tidy( $text );
	}

	/**
	 * Returns no more than a requested number of characters, preserving words
	 *
	 * The cut is extended to the right over any word characters and slashes, plus one
	 * optional ">", so that a word or an HTML tag that straddles the limit is kept whole.
	 * The result can therefore be longer than $requestedLength.
	 *
	 * @param string $text Source text to extract from
	 * @param int $requestedLength Maximum number of characters to return
	 * @return string
	 */
	public function getFirstChars( $text, $requestedLength ) {
		if ( $requestedLength <= 0 ) {
			return '';
		}

		$length = mb_strlen( $text );
		if ( $length <= $requestedLength ) {
			return $text;
		}

		// Every part of this pattern is optional, so on valid input it always matches,
		// possibly with an empty string
		$pattern = '/^[\w\/]*>?/su';
		preg_match( $pattern, mb_substr( $text, $requestedLength ), $m );

		// $m stays empty when preg_match() fails (e.g. on invalid UTF-8); fall back to
		// cutting exactly at the requested length instead of reading an undefined index
		$truncatedText = mb_substr( $text, 0, $requestedLength ) . ( $m[0] ?? '' );
		if ( $truncatedText === $text ) {
			// Nothing was actually cut off, return the input untouched
			return $text;
		}

		return $this->tidy( $truncatedText );
	}

	/**
	 * Trims the text, after running it through the tidy service when tidying is enabled
	 *
	 * @param string $text
	 * @return string
	 */
	private function tidy( $text ) {
		if ( $this->useTidy ) {
			$text = MediaWikiServices::getInstance()->getTidy()->tidy( $text );
		}
		return trim( $text );
	}
}