From 8880df4123005e7362ee83f61b0bb83993bda61d Mon Sep 17 00:00:00 2001 From: Thiemo Kreuz Date: Tue, 27 Oct 2020 19:59:26 +0100 Subject: [PATCH] Fix DiscussionParser failing in certain languages It appears that the initial \h in this non-Unicode regular expression matches part of a UTF-8 character, destroying it. This makes the final preg_match() in this method fail when $output is later used as a pattern. Bug: T264922 Change-Id: Iaf240bc2e0808c2f57c1f8bab2589d3207915afe --- includes/DiscussionParser.php | 2 +- tests/phpunit/DiscussionParserTest.php | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/includes/DiscussionParser.php b/includes/DiscussionParser.php index 0bd04f9b8..a6c133592 100644 --- a/includes/DiscussionParser.php +++ b/includes/DiscussionParser.php @@ -1155,7 +1155,7 @@ abstract class EchoDiscussionParser { // Step 2: Generalise it // Trim off the timezone to replace at the end $output = $exemplarTimestamp; - $tzRegex = '/\h*\(\w+\)\h*$/'; + $tzRegex = '/\h*\(\w+\)\h*$/u'; $tzMatches = []; if ( preg_match( $tzRegex, $output, $tzMatches, PREG_OFFSET_CAPTURE ) ) { $output = substr( $output, 0, $tzMatches[0][1] ); diff --git a/tests/phpunit/DiscussionParserTest.php b/tests/phpunit/DiscussionParserTest.php index 50b1f1d76..5e8df643a 100644 --- a/tests/phpunit/DiscussionParserTest.php +++ b/tests/phpunit/DiscussionParserTest.php @@ -1013,6 +1013,11 @@ TEXT $this->assertSame( 1, $match ); } + public function testTimestampRegex_T264922() { + $this->setMwGlobals( 'wgLanguageCode', 'skr' ); + $this->assertIsString( EchoDiscussionParser::getTimestampRegex(), 'does not fail' ); + } + public function testGetTimestampPosition() { $line = 'Hello World. ' . self::getExemplarTimestamp(); $pos = EchoDiscussionParser::getTimestampPosition( $line );