Support ISO 8601 timestamps in the parser

https://wikipesija.org currently uses ISO 8601 as its default date
format. Its format string is xnY-xnm-xnd"T"xnH:xni:xns, so the timestamp
parser needs to support the 'xn', 'm', and 's' format codes.

Change-Id: I235098a578eb92ddd23ea47fa23d60df4b28f590
This commit is contained in:
Theodore Dubois 2023-06-04 11:16:56 -07:00
parent 6295079d77
commit 4ca17b8c33
4 changed files with 54 additions and 2 deletions

View file

@ -184,6 +184,7 @@ class CommentParser {
): string {
$formatLength = strlen( $format );
$s = '';
$raw = false;
// Adapted from Language::sprintfDate()
for ( $p = 0; $p < $formatLength; $p++ ) {
$num = false;
@ -204,6 +205,9 @@ class CommentParser {
$this->getMessages( $contLangVariant, Language::MONTH_GENITIVE_MESSAGES )
);
break;
case 'xn':
$raw = true;
break;
case 'd':
$num = '2';
break;
@ -230,6 +234,9 @@ class CommentParser {
$this->getMessages( $contLangVariant, Language::MONTH_ABBREVIATED_MESSAGES )
);
break;
case 'm':
$num = '2';
break;
case 'n':
$num = '1,2';
break;
@ -248,6 +255,9 @@ class CommentParser {
case 'i':
$num = '2';
break;
case 's':
$num = '2';
break;
case '\\':
// Backslash escaping
if ( $p < $formatLength - 1 ) {
@ -279,7 +289,12 @@ class CommentParser {
$p += strlen( $char ) - 1;
}
if ( $num !== false ) {
$s .= '(' . $digitsRegexp . '{' . $num . '})';
if ( $raw ) {
$s .= '([0-9]{' . $num . '})';
$raw = false;
} else {
$s .= '(' . $digitsRegexp . '{' . $num . '})';
}
}
// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T308448)
$s .= '[\\x{200E}\\x{200F}]?';
@ -334,6 +349,7 @@ class CommentParser {
switch ( $code ) {
case 'xx':
case 'xn':
break;
case 'xg':
case 'd':
@ -342,12 +358,14 @@ class CommentParser {
case 'l':
case 'F':
case 'M':
case 'm':
case 'n':
case 'Y':
case 'xkY':
case 'G':
case 'H':
case 'i':
case 's':
$matchingGroups[] = $code;
break;
case '\\':
@ -414,6 +432,7 @@ class CommentParser {
$this->getMessages( $contLangVariant, Language::MONTH_ABBREVIATED_MESSAGES )
);
break;
case 'm':
case 'n':
$monthIdx = intval( $untransformDigits( $text ) ) - 1;
break;
@ -431,6 +450,9 @@ class CommentParser {
case 'i':
$minute = intval( $untransformDigits( $text ) );
break;
case 's':
// Seconds - unused, because most timestamp formats omit them
break;
default:
throw new LogicException( 'Not implemented' );
}

View file

@ -179,18 +179,21 @@ class LanguageData {
switch ( $code ) {
case 'xx':
case 'xg':
case 'xn':
case 'd':
case 'D':
case 'j':
case 'l':
case 'F':
case 'M':
case 'm':
case 'n':
case 'Y':
case 'xkY':
case 'G':
case 'H':
case 'i':
case 's':
// Special code - pass through unchanged
$s .= $code;
break;

View file

@ -96,6 +96,7 @@ Parser.prototype.getTimestampRegexp = function ( contLangVariant, format, digits
var parser = this;
var s = '';
var raw = false;
// Adapted from Language::sprintfDate()
for ( var p = 0; p < format.length; p++ ) {
var num = false;
@ -118,6 +119,9 @@ Parser.prototype.getTimestampRegexp = function ( contLangVariant, format, digits
'december-gen'
] ) );
break;
case 'xn':
raw = true;
break;
case 'd':
num = '2';
break;
@ -148,6 +152,9 @@ Parser.prototype.getTimestampRegexp = function ( contLangVariant, format, digits
'sep', 'oct', 'nov', 'dec'
] ) );
break;
case 'm':
num = '2';
break;
case 'n':
num = '1,2';
break;
@ -166,6 +173,9 @@ Parser.prototype.getTimestampRegexp = function ( contLangVariant, format, digits
case 'i':
num = '2';
break;
case 's':
num = '2';
break;
case '\\':
// Backslash escaping
if ( p < format.length - 1 ) {
@ -197,7 +207,12 @@ Parser.prototype.getTimestampRegexp = function ( contLangVariant, format, digits
p += char.length - 1;
}
if ( num !== false ) {
s += regexpGroup( digitsRegexp + '{' + num + '}' );
if ( raw ) {
s += regexpGroup( '[0-9]{' + num + '}' );
raw = false;
} else {
s += regexpGroup( digitsRegexp + '{' + num + '}' );
}
}
// Ignore some invisible Unicode characters that often sneak into copy-pasted timestamps (T308448)
s += '[\\u200E\\u200F]?';
@ -237,6 +252,7 @@ Parser.prototype.getTimestampParser = function ( contLangVariant, format, digits
switch ( code ) {
case 'xx':
case 'xn':
break;
case 'xg':
case 'd':
@ -245,12 +261,14 @@ Parser.prototype.getTimestampParser = function ( contLangVariant, format, digits
case 'l':
case 'F':
case 'M':
case 'm':
case 'n':
case 'Y':
case 'xkY':
case 'G':
case 'H':
case 'i':
case 's':
matchingGroups.push( code );
break;
case '\\':
@ -340,6 +358,7 @@ Parser.prototype.getTimestampParser = function ( contLangVariant, format, digits
'sep', 'oct', 'nov', 'dec'
] ).indexOf( text );
break;
case 'm':
case 'n':
monthIdx = Number( untransformDigits( text ) ) - 1;
break;
@ -357,6 +376,9 @@ Parser.prototype.getTimestampParser = function ( contLangVariant, format, digits
case 'i':
minute = Number( untransformDigits( text ) );
break;
case 's':
// Seconds - unused, because most timestamp formats omit them
break;
default:
throw new Error( 'Not implemented' );
}

View file

@ -43,5 +43,10 @@
"format": "H:i💩 j F Y",
"expected": "(\\d{2})[\\u200E\\u200F]?:[\\u200E\\u200F]?(\\d{2})[\\u200E\\u200F]?💩[\\u200E\\u200F]? [\\u200E\\u200F]?(\\d{1,2})[\\u200E\\u200F]? [\\u200E\\u200F]?(January|February|March|April|May|June|July|August|September|October|November|December)[\\u200E\\u200F]? [\\u200E\\u200F]?(\\d{4})[\\u200E\\u200F]? [\\u200E\\u200F]?\\((UTC)\\)",
"message": "(en) UTF-16 multibyte characters survive"
},
{
"format": "xnY-xnm-xnd\"T\"xnH:xni:xns",
"expected": "[\\u200E\\u200F]?([0-9]{4})[\\u200E\\u200F]?\\-[\\u200E\\u200F]?[\\u200E\\u200F]?([0-9]{2})[\\u200E\\u200F]?\\-[\\u200E\\u200F]?[\\u200E\\u200F]?([0-9]{2})[\\u200E\\u200F]?T[\\u200E\\u200F]?[\\u200E\\u200F]?([0-9]{2})[\\u200E\\u200F]?:[\\u200E\\u200F]?[\\u200E\\u200F]?([0-9]{2})[\\u200E\\u200F]?:[\\u200E\\u200F]?[\\u200E\\u200F]?([0-9]{2})[\\u200E\\u200F]? [\\u200E\\u200F]?\\((UTC)\\)",
"message": "ISO 8601"
}
]