Fix parsing localised digits in PHP discussion parser

The PHP code incorrectly assumed that every localised digit is a single
byte in UTF-8, which is only true for the ASCII digits 0-9.

The JS code worked correctly because it uses UTF-16 strings, so the
bug would only affect non-BMP digits there. This was noted in a TODO
comment, but we overlooked it when reimplementing in PHP.

Instead of a string of 10 characters, use an array of 10
single-character strings.

Bug: T261706
Change-Id: Ic5421382474c88f003424799c53ff473d99cce92
This commit is contained in:
Bartosz Dziewoński 2020-09-01 00:13:00 +02:00
parent 240c766768
commit 2d3fe47ac1
9 changed files with 81 additions and 17 deletions

View file

@ -270,23 +270,23 @@ class CommentParser {
* of matching the regexp returned by getTimestampRegexp()
*
* @param string $format Date format, as used by MediaWiki
* @param string|null $digits Localised digits from 0 to 9, e.g. `0123456789`
* @param string[]|null $digits Localised digits from 0 to 9, e.g. `[ '0', '1', ..., '9' ]`
* @param string $localTimezone Local timezone IANA name, e.g. `America/New_York`
* @param array $tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
* for the local timezone, e.g. [ 'EDT' => 'EDT', 'EST' => 'EST' ]
* @return callable Parser function
*/
private function getTimestampParser(
string $format, ?string $digits, string $localTimezone, array $tzAbbrs
string $format, ?array $digits, string $localTimezone, array $tzAbbrs
) : callable {
$untransformDigits = function ( string $text ) use ( $digits ) {
if ( !$digits ) {
return $text;
}
return preg_replace_callback(
'/[' . $digits . ']/',
'/[' . implode( '', $digits ) . ']/u',
function ( array $m ) use ( $digits ) {
return (string)strpos( $digits, $m[0] );
return (string)array_search( $m[0], $digits );
},
$text
);
@ -453,7 +453,7 @@ class CommentParser {
public function getLocalTimestampRegexp() : string {
return $this->getTimestampRegexp(
$this->dateFormat,
$this->digits ? "[$this->digits]" : '\\d',
$this->digits ? '[' . implode( '', $this->digits ) . ']' : '\\d',
$this->timezones
);
}

View file

@ -43,10 +43,8 @@ class Data {
$data['dateFormat'] = $lang->getDateFormatString( 'both', $lang->dateFormat( false ) );
// TODO: We probably shouldn't assume that each digit can be represented by a single BMP
// codepoint in every language (although it seems to be true right now).
$data['digits'] = $config->get( 'TranslateNumerals' ) ?
$lang->formatNum( '0123456789', true ) :
preg_split( '//u', $lang->formatNum( '0123456789', true ), -1, PREG_SPLIT_NO_EMPTY ) :
null;
// ApiQuerySiteinfo

View file

@ -186,7 +186,7 @@ Parser.prototype.getTimestampRegexp = function ( format, digitsRegexp, tzAbbrs )
*
* @private
* @param {string} format Date format, as used by MediaWiki
* @param {string|null} digits Localised digits from 0 to 9, e.g. `0123456789`
* @param {string[]|null} digits Localised digits from 0 to 9, e.g. `[ '0', '1', ..., '9' ]`
* @param {string} localTimezone Local timezone IANA name, e.g. `America/New_York`
* @param {Object} tzAbbrs Map of localised timezone abbreviations to IANA abbreviations
* for the local timezone, e.g. `{EDT: "EDT", EST: "EST"}`
@ -246,7 +246,7 @@ Parser.prototype.getTimestampParser = function ( format, digits, localTimezone,
return text;
}
return text.replace(
new RegExp( '[' + digits + ']', 'g' ),
new RegExp( '[' + digits.join( '' ) + ']', 'g' ),
function ( m ) {
return digits.indexOf( m );
}
@ -371,7 +371,7 @@ Parser.prototype.getTimestampParser = function ( format, digits, localTimezone,
Parser.prototype.getLocalTimestampRegexp = function () {
return this.getTimestampRegexp(
data.dateFormat,
data.digits ? '[' + data.digits + ']' : '\\d',
data.digits ? '[' + data.digits.join( '' ) + ']' : '\\d',
data.timezones
);
};

View file

@ -1,6 +1,17 @@
{
"dateFormat": "H:i، j xg Y",
"digits": "0123456789",
"digits": [
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9"
],
"localTimezone": "UTC",
"specialContributionsName": "مساهمات",
"timezones": {

View file

@ -1,6 +1,17 @@
{
"dateFormat": "H:i, j F Y",
"digits": "0123456789",
"digits": [
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9"
],
"localTimezone": "UTC",
"specialContributionsName": "Contributions",
"timezones": {

View file

@ -1,6 +1,17 @@
{
"dateFormat": "j F Y à H:i",
"digits": "0123456789",
"digits": [
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9"
],
"localTimezone": "Europe/Paris",
"specialContributionsName": "Contributions",
"timezones": {

View file

@ -1,6 +1,17 @@
{
"dateFormat": "Y. F j., H:i",
"digits": "0123456789",
"digits": [
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9"
],
"localTimezone": "Europe/Berlin",
"specialContributionsName": "Szerkesztő_közreműködései",
"timezones": {

View file

@ -1,6 +1,17 @@
{
"dateFormat": "j M Y H:i",
"digits": "0123456789",
"digits": [
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9"
],
"localTimezone": "Europe/Berlin",
"specialContributionsName": "Bijdragen",
"timezones": {

View file

@ -1,6 +1,17 @@
{
"dateFormat": "H:i, j M Y",
"digits": "0123456789",
"digits": [
"0",
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9"
],
"localTimezone": "Europe/Warsaw",
"specialContributionsName": "Wkład",
"timezones": {