mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-12-02 18:06:16 +00:00
Merge "Use grapheme clusters in unicodeJS.TextString"
This commit is contained in:
commit
a751f4ad67
|
@ -169,7 +169,7 @@
|
||||||
{
|
{
|
||||||
"name": "UnicodeJS",
|
"name": "UnicodeJS",
|
||||||
"classes": [
|
"classes": [
|
||||||
"unicodeJS.TextString", "unicodeJS.wordbreak"
|
"unicodeJS", "unicodeJS.TextString", "unicodeJS.wordbreak"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -196,6 +196,9 @@ $wgResourceModules += array(
|
||||||
'debugScripts' => array(
|
'debugScripts' => array(
|
||||||
've/ve.debug.js',
|
've/ve.debug.js',
|
||||||
),
|
),
|
||||||
|
'dependencies' => array(
|
||||||
|
'unicodejs.wordbreak',
|
||||||
|
),
|
||||||
),
|
),
|
||||||
'ext.visualEditor.mediawiki' => $wgVisualEditorResourceTemplate + array(
|
'ext.visualEditor.mediawiki' => $wgVisualEditorResourceTemplate + array(
|
||||||
'scripts' => array(
|
'scripts' => array(
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
Principal Authors (major contributors, alphabetically)
|
Principal Authors (major contributors, alphabetically)
|
||||||
|
|
||||||
Ed Sanders <esanders@wikimedia.org>
|
Ed Sanders <esanders@wikimedia.org>
|
||||||
|
David Chan <david@troi.org>
|
||||||
|
|
||||||
Patch Contributors (minor contributors, alphabetically)
|
Patch Contributors (minor contributors, alphabetically)
|
||||||
|
|
||||||
David Chan <david@troi.org>
|
|
|
@ -5,4 +5,39 @@
|
||||||
* @license The MIT License (MIT); see LICENSE.txt
|
* @license The MIT License (MIT); see LICENSE.txt
|
||||||
*/
|
*/
|
||||||
|
|
||||||
window.unicodeJS = {};
|
( function () {
|
||||||
|
var unicodeJS;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Namespace for all UnicodeJS classes, static methods and static properties.
|
||||||
|
* @class
|
||||||
|
* @singleton
|
||||||
|
*/
|
||||||
|
unicodeJS = {};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Split a string into grapheme clusters.
|
||||||
|
*
|
||||||
|
* @param {string} text Text to split
|
||||||
|
* @returns {string[]} Array of clusters
|
||||||
|
*/
|
||||||
|
unicodeJS.splitClusters = function ( text ) {
|
||||||
|
return text.split( /(?![\uDC00-\uDFFF\u0300-\u036F])/g );
|
||||||
|
// kludge: for now, just don't split UTF-16 surrogate pairs or combining accents
|
||||||
|
// TODO: implement Grapheme boundary rules
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Split a string into Unicode characters, keeping surrogates paired.
|
||||||
|
*
|
||||||
|
* @param {string} text Text to split
|
||||||
|
* @returns {string[]} Array of characters
|
||||||
|
*/
|
||||||
|
unicodeJS.splitCharacters = function ( text ) {
|
||||||
|
return text.split( /(?![\uDC00-\uDFFF])/g );
|
||||||
|
// TODO: think through handling of invalid UTF-16
|
||||||
|
};
|
||||||
|
|
||||||
|
// Expose
|
||||||
|
window.unicodeJS = unicodeJS;
|
||||||
|
}() );
|
||||||
|
|
|
@ -16,21 +16,51 @@
|
||||||
* @param {string} text Text
|
* @param {string} text Text
|
||||||
*/
|
*/
|
||||||
unicodeJS.TextString = function UnicodeJSTextString( text ) {
|
unicodeJS.TextString = function UnicodeJSTextString( text ) {
|
||||||
this.text = text;
|
this.clusters = unicodeJS.splitClusters( text );
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Methods */
|
/* Methods */
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Read character at specified position
|
* Read grapheme cluster at specified position
|
||||||
*
|
*
|
||||||
* @method
|
* @method
|
||||||
* @param {number} position Position to read from
|
* @param {number} position Position to read from
|
||||||
* @returns {string|null} Character, or null if out of bounds
|
* @returns {string|null} Grapheme cluster, or null if out of bounds
|
||||||
*/
|
*/
|
||||||
unicodeJS.TextString.prototype.read = function ( position ) {
|
unicodeJS.TextString.prototype.read = function ( position ) {
|
||||||
if ( position < 0 || position >= this.text.length ) {
|
var clusterAt = this.clusters[position];
|
||||||
return null;
|
return clusterAt !== undefined ? clusterAt : null;
|
||||||
}
|
};
|
||||||
return this.text.charAt( position );
|
|
||||||
|
/**
|
||||||
|
* Return number of grapheme clusters in the text string
|
||||||
|
*
|
||||||
|
* @method
|
||||||
|
* @returns {number} Number of grapheme clusters
|
||||||
|
*/
|
||||||
|
unicodeJS.TextString.prototype.getLength = function () {
|
||||||
|
return this.clusters.length;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a sub-TextString
|
||||||
|
*
|
||||||
|
* @param {number} start Start offset
|
||||||
|
* @param {number} end End offset
|
||||||
|
* @returns {unicodeJS.TextString} New TextString object containing substring
|
||||||
|
*/
|
||||||
|
unicodeJS.TextString.prototype.substring = function ( start, end ) {
|
||||||
|
var textString = new unicodeJS.TextString( '' );
|
||||||
|
textString.clusters = this.clusters.slice( start, end );
|
||||||
|
return textString;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get as a plain string
|
||||||
|
*
|
||||||
|
* @returns {string} Plain javascript string
|
||||||
|
*/
|
||||||
|
unicodeJS.TextString.prototype.getString = function () {
|
||||||
|
return this.clusters.join( '' );
|
||||||
};
|
};
|
||||||
|
|
|
@ -19,19 +19,39 @@
|
||||||
|
|
||||||
// build regexes
|
// build regexes
|
||||||
for ( group in groups ) {
|
for ( group in groups ) {
|
||||||
patterns[group] = new RegExp( '[' + groups[group] + ']' );
|
patterns[group] = new RegExp( '[' + groups[group] + ']' ); // TODO: handle surrogates
|
||||||
}
|
}
|
||||||
|
|
||||||
function getGroup( chr ) {
|
/**
|
||||||
// chr is always converted to a string by RegExp#test
|
* Return the wordbreak property value for the cluster
|
||||||
|
*
|
||||||
|
* This is a slight cheat, because Unicode wordbreak property values are defined
|
||||||
|
* per character, not per cluster, whereas we're already working with a string
|
||||||
|
* split into clusters.
|
||||||
|
*
|
||||||
|
* We are making a working assumption that we can implement the Unicode
|
||||||
|
* word boundary specification by taking the property value of the *first*
|
||||||
|
* character of the cluster. In particular, this implements WB4 for us, because
|
||||||
|
* non-initial Extend or Format characters disappear.
|
||||||
|
*
|
||||||
|
* See http://www.unicode.org/reports/tr29/#Word_Boundaries
|
||||||
|
*
|
||||||
|
* @private
|
||||||
|
* @param {string} cluster The grapheme cluster
|
||||||
|
* @returns {string} The unicode wordbreak property value
|
||||||
|
*/
|
||||||
|
function getGroup( cluster ) {
|
||||||
|
var character;
|
||||||
|
// cluster is always converted to a string by RegExp#test
|
||||||
// e.g. null -> 'null' and would match /[a-z]/
|
// e.g. null -> 'null' and would match /[a-z]/
|
||||||
// so return null for any non-string value
|
// so return null for any non-string value
|
||||||
if ( typeof chr !== 'string' ) {
|
if ( typeof cluster !== 'string' ) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
character = unicodeJS.splitCharacters( cluster )[0];
|
||||||
var group;
|
var group;
|
||||||
for ( group in patterns ) {
|
for ( group in patterns ) {
|
||||||
if ( patterns[group].test( chr ) ) {
|
if ( patterns[group].test( character ) ) {
|
||||||
return group;
|
return group;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,8 +20,15 @@ QUnit.test( 'isBreak', function ( assert ) {
|
||||||
// 30 - 40
|
// 30 - 40
|
||||||
" a_b_3_ナ_ " +
|
" a_b_3_ナ_ " +
|
||||||
// 40 - 50
|
// 40 - 50
|
||||||
"汉字/漢字 c\u0300\u0327k" +
|
"汉字/漢字 c\u0300\u0327k " +
|
||||||
// 50 - 60
|
// 50 - 60
|
||||||
|
"\ud800\udf08" + // U+10308 OLD ITALIC LETTER THE
|
||||||
|
"\ud800\udf08\u0302" + // U+10308 OLD ITALIC LETTER THE + combining circumflex
|
||||||
|
"\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
|
||||||
|
" pad " +
|
||||||
|
"\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
|
||||||
|
"\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
|
||||||
|
// 60 - 65: "a." tests end of para
|
||||||
" c\u0300\u0327 a.",
|
" c\u0300\u0327 a.",
|
||||||
/*jshint quotmark:single */
|
/*jshint quotmark:single */
|
||||||
textString = new unicodeJS.TextString( text ),
|
textString = new unicodeJS.TextString( text ),
|
||||||
|
@ -30,18 +37,22 @@ QUnit.test( 'isBreak', function ( assert ) {
|
||||||
11, 12, 13, 14, 15, 16, 17, 19,
|
11, 12, 13, 14, 15, 16, 17, 19,
|
||||||
21, 25, 30,
|
21, 25, 30,
|
||||||
31, 39, 40,
|
31, 39, 40,
|
||||||
41, 42, 43, 44, 45, 46, 50,
|
41, 42, 43, 44, 45, 46, 48, 49, 50,
|
||||||
51, 54, 55, 56, 57
|
51, 52, // TODO: these should not be in the list
|
||||||
|
53, 54, 57, 58,
|
||||||
|
59, // TODO: this should not be in the list
|
||||||
|
60,
|
||||||
|
61, 62, 63, 64, 65
|
||||||
];
|
];
|
||||||
|
|
||||||
QUnit.expect( text.length + 1 );
|
QUnit.expect( textString.getLength() + 1 );
|
||||||
|
|
||||||
for ( i = 0; i <= text.length; i++ ) {
|
for ( i = 0; i <= textString.getLength(); i++ ) {
|
||||||
result = ( breaks.indexOf( i ) !== -1 );
|
result = ( breaks.indexOf( i ) !== -1 );
|
||||||
context =
|
context =
|
||||||
text.substring( Math.max( i - 4, 0 ), i ) +
|
textString.substring( Math.max( i - 4, 0 ), i ).getString() +
|
||||||
'│' +
|
'│' +
|
||||||
text.substring( i, Math.min( i + 4, text.length ) )
|
textString.substring( i, Math.min( i + 4, text.length ) ).getString()
|
||||||
;
|
;
|
||||||
assert.equal(
|
assert.equal(
|
||||||
unicodeJS.wordbreak.isBreak( textString, i ),
|
unicodeJS.wordbreak.isBreak( textString, i ),
|
||||||
|
|
|
@ -770,15 +770,10 @@
|
||||||
return ve.init.platform.getMessage.apply( ve.init.platform, arguments );
|
return ve.init.platform.getMessage.apply( ve.init.platform, arguments );
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Split a string into individual characters, leaving multibyte characters as one item
|
* @see unicodeJS#splitClusters
|
||||||
*
|
*/
|
||||||
* @param {string} text Text to split
|
ve.splitCharacters = unicodeJS.splitClusters; // TODO: rename to ve.splitClusters
|
||||||
* @returns {string[]} Array of characters
|
|
||||||
*/
|
|
||||||
ve.splitCharacters = function ( text ) {
|
|
||||||
return text.split( /(?![\uDC00-\uDFFF\u0300-\u036F])/g ); // don't split UTF surrogate pairs
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Determine if the text consists of only unattached combining marks
|
* Determine if the text consists of only unattached combining marks
|
||||||
|
|
Loading…
Reference in a new issue