2020-09-29 18:48:51 +00:00
|
|
|
<?php
|
|
|
|
|
|
|
|
namespace MediaWiki\Extension\AbuseFilter;
|
|
|
|
|
|
|
|
use Content;
|
|
|
|
use MediaWiki\Extension\AbuseFilter\Hooks\AbuseFilterHookRunner;
|
2021-02-01 19:03:11 +00:00
|
|
|
use MediaWiki\Permissions\Authority;
|
2020-09-29 18:48:51 +00:00
|
|
|
use MediaWiki\Revision\RevisionRecord;
|
|
|
|
use TextContent;
|
|
|
|
|
|
|
|
/**
 * Service that turns RevisionRecord and Content objects into plain text
 * that abuse filters can be run against.
 *
 * @internal No external code should rely on this representation
 */
class TextExtractor {
	public const SERVICE_NAME = 'AbuseFilterTextExtractor';

	/** @var AbuseFilterHookRunner Runner used to let hook handlers supply the text of a Content */
	private $hookRunner;

	/**
	 * @param AbuseFilterHookRunner $hookRunner
	 */
	public function __construct( AbuseFilterHookRunner $hookRunner ) {
		$this->hookRunner = $hookRunner;
	}

	/**
	 * Build some textual representation of a revision.
	 *
	 * This is really only *some* text: there is no guarantee that it
	 * resembles what the user actually sees, nor that its form is fit
	 * for any particular purpose.
	 *
	 * If $revision is null, an empty string is returned.
	 *
	 * Currently the text of every slot of the revision is produced with
	 * contentToString() and the results are joined, in the order given by
	 * getSlotRoles(), with a blank line between them. Slots for which
	 * getContent() returns null are left out. A better solution is meant
	 * to replace this in the future; see T208769 for discussion.
	 *
	 * @param RevisionRecord|null $revision the revision to read, or null
	 * @param Authority $performer passed to getContent() together with
	 *   RevisionRecord::FOR_THIS_USER as the audience
	 * @return string the content of the revision as some kind of string,
	 *   or an empty string if there is no revision
	 * @return-taint none
	 */
	public function revisionToString( ?RevisionRecord $revision, Authority $performer ): string {
		if ( $revision === null ) {
			return '';
		}

		$textByRole = [];
		foreach ( $revision->getSlotRoles() as $slotRole ) {
			$slotContent = $revision->getContent( $slotRole, RevisionRecord::FOR_THIS_USER, $performer );
			// Only slots that actually yield a Content object contribute text.
			if ( $slotContent !== null ) {
				$textByRole[$slotRole] = $this->contentToString( $slotContent );
			}
		}

		return implode( "\n\n", $textByRole );
	}

	/**
	 * Produce a string representation of a Content object.
	 *
	 * The hook is run first, via
	 * AbuseFilterHookRunner::onAbuseFilter_contentToString(). When that call
	 * returns a truthy value, the default is used: TextContent::getText()
	 * for TextContent instances and Content::getTextForSearchIndex() for
	 * everything else. Otherwise whatever value $text holds after the hook
	 * call is used instead of the default.
	 *
	 * In both cases the result is cast to string and passed through
	 * TextContent::normalizeLineEndings().
	 *
	 * @param Content $content
	 *
	 * @return string a suitable string representation of the content.
	 */
	public function contentToString( Content $content ): string {
		$text = null;

		$useDefaultText = $this->hookRunner->onAbuseFilter_contentToString( $content, $text );
		if ( $useDefaultText ) {
			if ( $content instanceof TextContent ) {
				$text = $content->getText();
			} else {
				$text = $content->getTextForSearchIndex();
			}
		}

		// T22310
		$rawText = (string)$text;
		return TextContent::normalizeLineEndings( $rawText );
	}
}
|