Merge "Use grapheme clusters in unicodeJS.TextString"

This commit is contained in:
jenkins-bot 2013-05-30 16:36:35 +00:00 committed by Gerrit Code Review
commit a751f4ad67
8 changed files with 125 additions and 32 deletions

View file

@ -169,7 +169,7 @@
{ {
"name": "UnicodeJS", "name": "UnicodeJS",
"classes": [ "classes": [
"unicodeJS.TextString", "unicodeJS.wordbreak" "unicodeJS", "unicodeJS.TextString", "unicodeJS.wordbreak"
] ]
} }
] ]

View file

@ -196,6 +196,9 @@ $wgResourceModules += array(
'debugScripts' => array( 'debugScripts' => array(
've/ve.debug.js', 've/ve.debug.js',
), ),
'dependencies' => array(
'unicodejs.wordbreak',
),
), ),
'ext.visualEditor.mediawiki' => $wgVisualEditorResourceTemplate + array( 'ext.visualEditor.mediawiki' => $wgVisualEditorResourceTemplate + array(
'scripts' => array( 'scripts' => array(

View file

@ -1,7 +1,6 @@
Principal Authors (major contributors, alphabetically) Principal Authors (major contributors, alphabetically)
Ed Sanders <esanders@wikimedia.org> Ed Sanders <esanders@wikimedia.org>
David Chan <david@troi.org>
Patch Contributors (minor contributors, alphabetically) Patch Contributors (minor contributors, alphabetically)
David Chan <david@troi.org>

View file

@ -5,4 +5,39 @@
* @license The MIT License (MIT); see LICENSE.txt * @license The MIT License (MIT); see LICENSE.txt
*/ */
window.unicodeJS = {}; ( function () {
var unicodeJS;
/**
* Namespace for all UnicodeJS classes, static methods and static properties.
* @class
* @singleton
*/
unicodeJS = {};
/**
 * Break a string up into (approximate) grapheme clusters.
 *
 * The string is cut before every UTF-16 code unit except low surrogates
 * (U+DC00-U+DFFF) and combining diacritical marks (U+0300-U+036F), so a
 * surrogate pair, or a base letter followed by such accents, stays in one item.
 * This is a stop-gap rather than a full implementation of the grapheme
 * boundary rules.
 *
 * @param {string} text Text to split
 * @returns {string[]} Array of clusters
 */
unicodeJS.splitClusters = function ( text ) {
	// kludge: a boundary is any position not followed by a low surrogate or a combining accent
	// TODO: implement Grapheme boundary rules
	var clusterBoundary = /(?![\uDC00-\uDFFF\u0300-\u036F])/g;
	return text.split( clusterBoundary );
};
/**
 * Break a string up into Unicode characters.
 *
 * The string is cut before every UTF-16 code unit that is not a low surrogate
 * (U+DC00-U+DFFF), so a high surrogate and the low surrogate following it end
 * up in the same item.
 *
 * @param {string} text Text to split
 * @returns {string[]} Array of characters
 */
unicodeJS.splitCharacters = function ( text ) {
	// TODO: think through handling of invalid UTF-16
	var characterBoundary = /(?![\uDC00-\uDFFF])/g;
	return text.split( characterBoundary );
};
// Expose
window.unicodeJS = unicodeJS;
}() );

View file

@ -16,21 +16,51 @@
* @param {string} text Text * @param {string} text Text
*/ */
unicodeJS.TextString = function UnicodeJSTextString( text ) { unicodeJS.TextString = function UnicodeJSTextString( text ) {
this.text = text; this.clusters = unicodeJS.splitClusters( text );
}; };
/* Methods */ /* Methods */
/** /**
* Read character at specified position * Read grapheme cluster at specified position
* *
* @method * @method
* @param {number} position Position to read from * @param {number} position Position to read from
* @returns {string|null} Character, or null if out of bounds * @returns {string|null} Grapheme cluster, or null if out of bounds
*/ */
unicodeJS.TextString.prototype.read = function ( position ) { unicodeJS.TextString.prototype.read = function ( position ) {
if ( position < 0 || position >= this.text.length ) { var clusterAt = this.clusters[position];
return null; return clusterAt !== undefined ? clusterAt : null;
} };
return this.text.charAt( position );
/**
 * Return number of grapheme clusters in the text string
 *
 * The count is the length of the cluster array held by this object, not the
 * number of UTF-16 code units in the text it was built from.
 *
 * @method
 * @returns {number} Number of grapheme clusters
 */
unicodeJS.TextString.prototype.getLength = function () {
return this.clusters.length;
};
/**
 * Build a new TextString from a range of this one's grapheme clusters
 *
 * Both offsets are measured in clusters and are handed unchanged to
 * Array#slice on the cluster array; this object is left untouched.
 *
 * @param {number} start Start offset
 * @param {number} end End offset
 * @returns {unicodeJS.TextString} New TextString object containing substring
 */
unicodeJS.TextString.prototype.substring = function ( start, end ) {
	// Start from an empty TextString, then install the selected clusters directly
	var slicedString = new unicodeJS.TextString( '' );
	slicedString.clusters = this.clusters.slice( start, end );
	return slicedString;
};
/**
* Get as a plain string
*
* @returns {string} Plain javascript string
*/
unicodeJS.TextString.prototype.getString = function () {
return this.clusters.join( '' );
}; };

View file

@ -19,19 +19,39 @@
// build regexes // build regexes
for ( group in groups ) { for ( group in groups ) {
patterns[group] = new RegExp( '[' + groups[group] + ']' ); patterns[group] = new RegExp( '[' + groups[group] + ']' ); // TODO: handle surrogates
} }
function getGroup( chr ) { /**
// chr is always converted to a string by RegExp#test * Return the wordbreak property value for the cluster
*
* This is a slight con, because Unicode wordbreak property values are defined
* per character, not per cluster, whereas we're already working with a string
* split into clusters.
*
* We are making a working assumption that we can implement the Unicode
* word boundary specification by taking the property value of the *first*
* character of the cluster. In particular, this implements WB4 for us, because
* non-initial Extend or Format characters disappear.
*
* See http://www.unicode.org/reports/tr29/#Word_Boundaries
*
* @private
* @param {string} cluster The grapheme cluster
* @returns {string} The unicode wordbreak property value
*/
function getGroup( cluster ) {
var character;
// cluster is always converted to a string by RegExp#test
// e.g. null -> 'null' and would match /[a-z]/ // e.g. null -> 'null' and would match /[a-z]/
// so return null for any non-string value // so return null for any non-string value
if ( typeof chr !== 'string' ) { if ( typeof cluster !== 'string' ) {
return null; return null;
} }
character = unicodeJS.splitCharacters( cluster )[0];
var group; var group;
for ( group in patterns ) { for ( group in patterns ) {
if ( patterns[group].test( chr ) ) { if ( patterns[group].test( character ) ) {
return group; return group;
} }
} }

View file

@ -20,8 +20,15 @@ QUnit.test( 'isBreak', function ( assert ) {
// 30 - 40 // 30 - 40
" a_b_3_ナ_ " + " a_b_3_ナ_ " +
// 40 - 50 // 40 - 50
"汉字/漢字 c\u0300\u0327k" + "汉字/漢字 c\u0300\u0327k " +
// 50 - 60 // 50 - 60
"\ud800\udf08" + // U+10308 OLD ITALIC LETTER THE
"\ud800\udf08\u0302" + // U+10308 OLD ITALIC LETTER THE + combining circumflex
"\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
" pad " +
"\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
"\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
// 60 - 65: "a." tests end of para
" c\u0300\u0327 a.", " c\u0300\u0327 a.",
/*jshint quotmark:single */ /*jshint quotmark:single */
textString = new unicodeJS.TextString( text ), textString = new unicodeJS.TextString( text ),
@ -30,18 +37,22 @@ QUnit.test( 'isBreak', function ( assert ) {
11, 12, 13, 14, 15, 16, 17, 19, 11, 12, 13, 14, 15, 16, 17, 19,
21, 25, 30, 21, 25, 30,
31, 39, 40, 31, 39, 40,
41, 42, 43, 44, 45, 46, 50, 41, 42, 43, 44, 45, 46, 48, 49, 50,
51, 54, 55, 56, 57 51, 52, // TODO: these should not be in the list
53, 54, 57, 58,
59, // TODO: this should not be in the list
60,
61, 62, 63, 64, 65
]; ];
QUnit.expect( text.length + 1 ); QUnit.expect( textString.getLength() + 1 );
for ( i = 0; i <= text.length; i++ ) { for ( i = 0; i <= textString.getLength(); i++ ) {
result = ( breaks.indexOf( i ) !== -1 ); result = ( breaks.indexOf( i ) !== -1 );
context = context =
text.substring( Math.max( i - 4, 0 ), i ) + textString.substring( Math.max( i - 4, 0 ), i ).getString() +
'│' + '│' +
text.substring( i, Math.min( i + 4, text.length ) ) textString.substring( i, Math.min( i + 4, text.length ) ).getString()
; ;
assert.equal( assert.equal(
unicodeJS.wordbreak.isBreak( textString, i ), unicodeJS.wordbreak.isBreak( textString, i ),

View file

@ -770,15 +770,10 @@
return ve.init.platform.getMessage.apply( ve.init.platform, arguments ); return ve.init.platform.getMessage.apply( ve.init.platform, arguments );
}; };
/** /**
* Split a string into individual characters, leaving multibyte characters as one item * @see unicodeJS#splitClusters
* */
* @param {string} text Text to split ve.splitCharacters = unicodeJS.splitClusters; // TODO: rename to ve.splitClusters
* @returns {string[]} Array of characters
*/
ve.splitCharacters = function ( text ) {
return text.split( /(?![\uDC00-\uDFFF\u0300-\u036F])/g ); // don't split UTF surrogate pairs
};
/** /**
* Determine if the text consists of only unattached combining marks * Determine if the text consists of only unattached combining marks