mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/TextExtracts
synced 2024-11-23 15:56:52 +00:00
Merge "ApiQueryExtracts: Use raw text from ParserOutput"
This commit is contained in:
commit
bc13671078
|
@ -56,7 +56,9 @@
|
||||||
"input",
|
"input",
|
||||||
"style",
|
"style",
|
||||||
"ul.gallery",
|
"ul.gallery",
|
||||||
".mw-editsection",
|
"mw\\:editsection",
|
||||||
|
"editsection",
|
||||||
|
"meta",
|
||||||
"sup.reference",
|
"sup.reference",
|
||||||
"ol.references",
|
"ol.references",
|
||||||
".error",
|
".error",
|
||||||
|
|
|
@ -27,7 +27,7 @@ class ApiQueryExtracts extends ApiQueryBase {
|
||||||
/**
|
/**
|
||||||
* Bump when memcache needs clearing
|
* Bump when memcache needs clearing
|
||||||
*/
|
*/
|
||||||
private const CACHE_VERSION = 2;
|
private const CACHE_VERSION = 3;
|
||||||
|
|
||||||
private const PREFIX = 'ex';
|
private const PREFIX = 'ex';
|
||||||
|
|
||||||
|
@ -221,10 +221,9 @@ class ApiQueryExtracts extends ApiQueryBase {
|
||||||
*/
|
*/
|
||||||
private function getFirstSection( $text, $plainText ) {
|
private function getFirstSection( $text, $plainText ) {
|
||||||
if ( $plainText ) {
|
if ( $plainText ) {
|
||||||
$regexp = '/^.*?(?=' . ExtractFormatter::SECTION_MARKER_START .
|
$regexp = '/^(.*?)(?=' . ExtractFormatter::SECTION_MARKER_START . ')/s';
|
||||||
'(?!.' . ExtractFormatter::SECTION_MARKER_END . '<h2 id="mw-toc-heading"))/s';
|
|
||||||
} else {
|
} else {
|
||||||
$regexp = '/^.*?(?=<h[1-6]\b(?! id="mw-toc-heading"))/s';
|
$regexp = '/^(.*?)(?=<h[1-6]\b)/s';
|
||||||
}
|
}
|
||||||
if ( preg_match( $regexp, $text, $matches ) ) {
|
if ( preg_match( $regexp, $text, $matches ) ) {
|
||||||
$text = $matches[0];
|
$text = $matches[0];
|
||||||
|
@ -246,7 +245,7 @@ class ApiQueryExtracts extends ApiQueryBase {
|
||||||
);
|
);
|
||||||
if ( $status->isOK() ) {
|
if ( $status->isOK() ) {
|
||||||
$pout = $status->getValue();
|
$pout = $status->getValue();
|
||||||
$text = $pout->getText( [ 'unwrap' => true ] );
|
$text = $pout->getRawText();
|
||||||
if ( $this->params['intro'] ) {
|
if ( $this->params['intro'] ) {
|
||||||
$text = $this->getFirstSection( $text, false );
|
$text = $this->getFirstSection( $text, false );
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,6 @@ namespace MediaWiki\Extension\TextExtracts;
|
||||||
|
|
||||||
use DOMElement;
|
use DOMElement;
|
||||||
use HtmlFormatter\HtmlFormatter;
|
use HtmlFormatter\HtmlFormatter;
|
||||||
use Wikimedia\Parsoid\Utils\DOMCompat;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Provides text-only or limited-HTML extracts of page HTML
|
* Provides text-only or limited-HTML extracts of page HTML
|
||||||
|
@ -81,21 +80,9 @@ class ExtractFormatter extends HtmlFormatter {
|
||||||
* @return array Array of removed DOMElements
|
* @return array Array of removed DOMElements
|
||||||
*/
|
*/
|
||||||
public function filterContent(): array {
|
public function filterContent(): array {
|
||||||
$doc = $this->getDoc();
|
|
||||||
|
|
||||||
// Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses,
|
|
||||||
// move it outside the header to rescue it (T363445)
|
|
||||||
// https://www.mediawiki.org/wiki/Heading_HTML_changes
|
|
||||||
$headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' );
|
|
||||||
foreach ( $headings as $heading ) {
|
|
||||||
// @phan-suppress-next-line PhanTypeMismatchArgumentSuperType
|
|
||||||
if ( DOMCompat::getClassList( $heading->parentNode )->contains( 'mw-heading' ) ) {
|
|
||||||
$heading->parentNode->parentNode->insertBefore( $heading, $heading->parentNode );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$removed = parent::filterContent();
|
$removed = parent::filterContent();
|
||||||
|
|
||||||
|
$doc = $this->getDoc();
|
||||||
$spans = $doc->getElementsByTagName( 'span' );
|
$spans = $doc->getElementsByTagName( 'span' );
|
||||||
|
|
||||||
/** @var DOMElement $span */
|
/** @var DOMElement $span */
|
||||||
|
|
|
@ -160,16 +160,6 @@ class ApiQueryExtractsTest extends \MediaWikiIntegrationTestCase {
|
||||||
false,
|
false,
|
||||||
'Example <h11>...',
|
'Example <h11>...',
|
||||||
],
|
],
|
||||||
'__TOC__ before intro (HTML)' => [
|
|
||||||
'<h2 id="mw-toc-heading">Contents</h2>Intro<h2>Actual heading</h2>...',
|
|
||||||
false,
|
|
||||||
'<h2 id="mw-toc-heading">Contents</h2>Intro',
|
|
||||||
],
|
|
||||||
'__TOC__ before intro (plaintext)' => [
|
|
||||||
"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro\1\2_\2\1<h2>Actual heading</h2>...",
|
|
||||||
true,
|
|
||||||
"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro",
|
|
||||||
],
|
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -67,17 +67,6 @@ class ExtractFormatterTest extends MediaWikiIntegrationTestCase {
|
||||||
'<span class="foo"><span style="bar: baz;" lang="qux">quux</span></span>',
|
'<span class="foo"><span style="bar: baz;" lang="qux">quux</span></span>',
|
||||||
false,
|
false,
|
||||||
],
|
],
|
||||||
[
|
|
||||||
// New heading markup from T13555 (T363445)
|
|
||||||
"<p>hello</p><h2 id=\"heading\">heading</h2><p>bye</p>",
|
|
||||||
'<p>hello</p>' .
|
|
||||||
'<div class="mw-heading mw-heading2">' .
|
|
||||||
'<h2 id="heading">heading</h2>' .
|
|
||||||
'<span class="mw-editsection">xxx</span>' .
|
|
||||||
'</div>' .
|
|
||||||
'<p>bye</p>',
|
|
||||||
false,
|
|
||||||
],
|
|
||||||
[
|
[
|
||||||
// Verify that TOC is properly removed (HTML mode)
|
// Verify that TOC is properly removed (HTML mode)
|
||||||
"Lead\n<h1>Section</h1>\n<p>Section text</p>",
|
"Lead\n<h1>Section</h1>\n<p>Section text</p>",
|
||||||
|
|
Loading…
Reference in a new issue