mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/TextExtracts
synced 2024-11-23 15:56:52 +00:00
ApiQueryExtracts: Use raw text from ParserOutput
Use ParserOutput::getRawText() instead of the deprecated getText(). This avoids a dependency on the MediaWiki\OutputTransform classes, which call into Skin and lots of other scary code. Small updates are needed so that we continue stripping the TOC and section edit links in this format. This simultaneously reverts two patches that were only needed to work around problems caused by markup added by OutputTransforms:
* Revert "Skip <h2> in TOC when extracting first section". This reverts commit 60e1c5ad83.
* Revert "ExtractFormatter: Rescue headings from being removed". This reverts commit 0fafa44a20.
Change-Id: Ie436576a356d05f2c4c4b84c8c1d82ba70d357d4
This commit is contained in:
parent
173012d395
commit
207242f93d
|
@ -56,7 +56,9 @@
|
|||
"input",
|
||||
"style",
|
||||
"ul.gallery",
|
||||
".mw-editsection",
|
||||
"mw\\:editsection",
|
||||
"editsection",
|
||||
"meta",
|
||||
"sup.reference",
|
||||
"ol.references",
|
||||
".error",
|
||||
|
|
|
@ -26,7 +26,7 @@ class ApiQueryExtracts extends ApiQueryBase {
|
|||
/**
|
||||
* Bump when memcache needs clearing
|
||||
*/
|
||||
private const CACHE_VERSION = 2;
|
||||
private const CACHE_VERSION = 3;
|
||||
|
||||
private const PREFIX = 'ex';
|
||||
|
||||
|
@ -241,10 +241,9 @@ class ApiQueryExtracts extends ApiQueryBase {
|
|||
*/
|
||||
private function getFirstSection( $text, $plainText ) {
|
||||
if ( $plainText ) {
|
||||
$regexp = '/^.*?(?=' . ExtractFormatter::SECTION_MARKER_START .
|
||||
'(?!.' . ExtractFormatter::SECTION_MARKER_END . '<h2 id="mw-toc-heading"))/s';
|
||||
$regexp = '/^(.*?)(?=' . ExtractFormatter::SECTION_MARKER_START . ')/s';
|
||||
} else {
|
||||
$regexp = '/^.*?(?=<h[1-6]\b(?! id="mw-toc-heading"))/s';
|
||||
$regexp = '/^(.*?)(?=<h[1-6]\b)/s';
|
||||
}
|
||||
if ( preg_match( $regexp, $text, $matches ) ) {
|
||||
$text = $matches[0];
|
||||
|
@ -266,7 +265,7 @@ class ApiQueryExtracts extends ApiQueryBase {
|
|||
);
|
||||
if ( $status->isOK() ) {
|
||||
$pout = $status->getValue();
|
||||
$text = $pout->getText( [ 'unwrap' => true ] );
|
||||
$text = $pout->getRawText();
|
||||
if ( $this->params['intro'] ) {
|
||||
$text = $this->getFirstSection( $text, false );
|
||||
}
|
||||
|
|
|
@ -4,7 +4,6 @@ namespace TextExtracts;
|
|||
|
||||
use DOMElement;
|
||||
use HtmlFormatter\HtmlFormatter;
|
||||
use Wikimedia\Parsoid\Utils\DOMCompat;
|
||||
|
||||
/**
|
||||
* Provides text-only or limited-HTML extracts of page HTML
|
||||
|
@ -81,21 +80,9 @@ class ExtractFormatter extends HtmlFormatter {
|
|||
* @return array Array of removed DOMElements
|
||||
*/
|
||||
public function filterContent(): array {
|
||||
$doc = $this->getDoc();
|
||||
|
||||
// Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses,
|
||||
// move it outside the header to rescue it (T363445)
|
||||
// https://www.mediawiki.org/wiki/Heading_HTML_changes
|
||||
$headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' );
|
||||
foreach ( $headings as $heading ) {
|
||||
// @phan-suppress-next-line PhanTypeMismatchArgumentSuperType
|
||||
if ( DOMCompat::getClassList( $heading->parentNode )->contains( 'mw-heading' ) ) {
|
||||
$heading->parentNode->parentNode->insertBefore( $heading, $heading->parentNode );
|
||||
}
|
||||
}
|
||||
|
||||
$removed = parent::filterContent();
|
||||
|
||||
$doc = $this->getDoc();
|
||||
$spans = $doc->getElementsByTagName( 'span' );
|
||||
|
||||
/** @var DOMElement $span */
|
||||
|
|
|
@ -154,16 +154,6 @@ class ApiQueryExtractsTest extends \MediaWikiIntegrationTestCase {
|
|||
false,
|
||||
'Example <h11>...',
|
||||
],
|
||||
'__TOC__ before intro (HTML)' => [
|
||||
'<h2 id="mw-toc-heading">Contents</h2>Intro<h2>Actual heading</h2>...',
|
||||
false,
|
||||
'<h2 id="mw-toc-heading">Contents</h2>Intro',
|
||||
],
|
||||
'__TOC__ before intro (plaintext)' => [
|
||||
"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro\1\2_\2\1<h2>Actual heading</h2>...",
|
||||
true,
|
||||
"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro",
|
||||
],
|
||||
];
|
||||
}
|
||||
|
||||
|
|
|
@ -67,17 +67,6 @@ class ExtractFormatterTest extends MediaWikiIntegrationTestCase {
|
|||
'<span class="foo"><span style="bar: baz;" lang="qux">quux</span></span>',
|
||||
false,
|
||||
],
|
||||
[
|
||||
// New heading markup from T13555 (T363445)
|
||||
"<p>hello</p><h2 id=\"heading\">heading</h2><p>bye</p>",
|
||||
'<p>hello</p>' .
|
||||
'<div class="mw-heading mw-heading2">' .
|
||||
'<h2 id="heading">heading</h2>' .
|
||||
'<span class="mw-editsection">xxx</span>' .
|
||||
'</div>' .
|
||||
'<p>bye</p>',
|
||||
false,
|
||||
],
|
||||
[
|
||||
// Verify that TOC is properly removed (HTML mode)
|
||||
"Lead\n<h1>Section</h1>\n<p>Section text</p>",
|
||||
|
|
Loading…
Reference in a new issue