Merge "ApiQueryExtracts: Use raw text from ParserOutput"

2024-11-23 15:56:52 +00:00 · 2024-11-15 15:36:35 +00:00 · 2024-11-15 15:36:35 +00:00 · bc13671078
parent 56b3c275d6 207242f93d
commit bc13671078
5 changed files with 8 additions and 41 deletions
--- a/extension.json
+++ b/extension.json
@ -56,7 +56,9 @@
 				"input",
 				"style",
 				"ul.gallery",
-				".mw-editsection",
+				"mw\\:editsection",
 				"editsection",
 				"meta",
 				"sup.reference",
 				"ol.references",
 				".error",
--- a/includes/ApiQueryExtracts.php
+++ b/includes/ApiQueryExtracts.php
@ -27,7 +27,7 @@ class ApiQueryExtracts extends ApiQueryBase {
 	/**
 	 * Bump when memcache needs clearing
 	 */
-	private const CACHE_VERSION = 2;
+	private const CACHE_VERSION = 3;
 	private const PREFIX = 'ex';
@ -221,10 +221,9 @@ class ApiQueryExtracts extends ApiQueryBase {
 	 */
 	private function getFirstSection( $text, $plainText ) {
 		if ( $plainText ) {
-			$regexp = '/^.*?(?=' . ExtractFormatter::SECTION_MARKER_START .
+			$regexp = '/^(.*?)(?=' . ExtractFormatter::SECTION_MARKER_START . ')/s';
-				'(?!.' . ExtractFormatter::SECTION_MARKER_END . '<h2 id="mw-toc-heading"))/s';
 		} else {
-			$regexp = '/^.*?(?=<h[1-6]\b(?! id="mw-toc-heading"))/s';
+			$regexp = '/^(.*?)(?=<h[1-6]\b)/s';
 		}
 		if ( preg_match( $regexp, $text, $matches ) ) {
 			$text = $matches[0];
@ -246,7 +245,7 @@ class ApiQueryExtracts extends ApiQueryBase {
 		);
 		if ( $status->isOK() ) {
 			$pout = $status->getValue();
-			$text = $pout->getText( [ 'unwrap' => true ] );
+			$text = $pout->getRawText();
 			if ( $this->params['intro'] ) {
 				$text = $this->getFirstSection( $text, false );
 			}
--- a/includes/ExtractFormatter.php
+++ b/includes/ExtractFormatter.php
@ -4,7 +4,6 @@ namespace MediaWiki\Extension\TextExtracts;
 use DOMElement;
 use HtmlFormatter\HtmlFormatter;
-use Wikimedia\Parsoid\Utils\DOMCompat;
 /**
 * Provides text-only or limited-HTML extracts of page HTML
@ -81,21 +80,9 @@ class ExtractFormatter extends HtmlFormatter {
 	 * @return array Array of removed DOMElements
 	 */
 	public function filterContent(): array {
 		$doc = $this->getDoc();
-		// Headings in a DIV wrapper may get removed by $wgExtractsRemoveClasses,
-		// move it outside the header to rescue it (T363445)
-		// https://www.mediawiki.org/wiki/Heading_HTML_changes
-		$headings = DOMCompat::querySelectorAll( $doc->documentElement, 'h1, h2, h3, h4, h5, h6' );
-		foreach ( $headings as $heading ) {
-			// @phan-suppress-next-line PhanTypeMismatchArgumentSuperType
-			if ( DOMCompat::getClassList( $heading->parentNode )->contains( 'mw-heading' ) ) {
-				$heading->parentNode->parentNode->insertBefore( $heading, $heading->parentNode );
-			}
-		}
 		$removed = parent::filterContent();
 		$doc = $this->getDoc();
 		$spans = $doc->getElementsByTagName( 'span' );
 		/** @var DOMElement $span */
--- a/tests/phpunit/ApiQueryExtractsTest.php
+++ b/tests/phpunit/ApiQueryExtractsTest.php
@ -160,16 +160,6 @@ class ApiQueryExtractsTest extends \MediaWikiIntegrationTestCase {
 				false,
 				'Example <h11>...',
 			],
-			'__TOC__ before intro (HTML)' => [
-				'<h2 id="mw-toc-heading">Contents</h2>Intro<h2>Actual heading</h2>...',
-				false,
-				'<h2 id="mw-toc-heading">Contents</h2>Intro',
-			],
-			'__TOC__ before intro (plaintext)' => [
-				"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro\1\2_\2\1<h2>Actual heading</h2>...",
-				true,
-				"\1\2_\2\1<h2 id=\"mw-toc-heading\">Contents</h2>Intro",
-			],
 		];
 	}
--- a/tests/phpunit/ExtractFormatterTest.php
+++ b/tests/phpunit/ExtractFormatterTest.php
@ -67,17 +67,6 @@ class ExtractFormatterTest extends MediaWikiIntegrationTestCase {
 				'<span class="foo"><span style="bar: baz;" lang="qux">quux</span></span>',
 				false,
 			],
-			[
-				// New heading markup from T13555 (T363445)
-				"<p>hello</p><h2 id=\"heading\">heading</h2><p>bye</p>",
-				'<p>hello</p>' .
-					'<div class="mw-heading mw-heading2">' .
-					'<h2 id="heading">heading</h2>' .
-					'<span class="mw-editsection">xxx</span>' .
-					'</div>' .
-					'<p>bye</p>',
-				false,
-			],
 			[
 				// Verify that TOC is properly removed (HTML mode)
 				"Lead\n<h1>Section</h1>\n<p>Section text</p>",