Merge "Use grapheme clusters in unicodeJS.TextString"

2024-12-01 01:16:30 +00:00 · 2013-05-30 16:36:35 +00:00 · 2013-05-30 16:36:35 +00:00 · a751f4ad67
parent a5ec1602a5 1c78d0a38c
commit a751f4ad67
8 changed files with 125 additions and 32 deletions
--- a/.docs/categories.json
+++ b/.docs/categories.json
@ -169,7 +169,7 @@
 			{
 				"name": "UnicodeJS",
 				"classes": [
-					"unicodeJS.TextString", "unicodeJS.wordbreak"
+					"unicodeJS", "unicodeJS.TextString", "unicodeJS.wordbreak"
 				]
 			}
 		]
--- a/VisualEditor.php
+++ b/VisualEditor.php
@ -196,6 +196,9 @@ $wgResourceModules += array(
 		'debugScripts' => array(
 			've/ve.debug.js',
 		),
+		'dependencies' => array(
+			'unicodejs.wordbreak',
+		),
 	),
 	'ext.visualEditor.mediawiki' => $wgVisualEditorResourceTemplate + array(
 		'scripts' => array(
--- a/modules/unicodejs/AUTHORS.txt
+++ b/modules/unicodejs/AUTHORS.txt
@ -1,7 +1,6 @@
 Principal Authors (major contributors, alphabetically)

 Ed Sanders <esanders@wikimedia.org>
+David Chan <david@troi.org>

 Patch Contributors (minor contributors, alphabetically)
-
-David Chan <david@troi.org>
--- a/modules/unicodejs/unicodejs.js
+++ b/modules/unicodejs/unicodejs.js
@ -5,4 +5,39 @@
 * @license The MIT License (MIT); see LICENSE.txt
 */

-window.unicodeJS = {};
+( function () {
+	var unicodeJS;
+
+	/**
+	 * Namespace for all UnicodeJS classes, static methods and static properties.
+	 * @class
+	 * @singleton
+	 */
+	unicodeJS = {};
+
+	/**
+	 * Split a string into grapheme clusters.
+	 *
+	 * @param {string} text Text to split
+	 * @returns {string[]} Array of clusters
+	 */
+	unicodeJS.splitClusters = function ( text ) {
+		return text.split( /(?![\uDC00-\uDFFF\u0300-\u036F])/g );
+		// kludge: for now, just don't split UTF-16 surrogate pairs or combining accents
+		// TODO: implement Grapheme boundary rules
+	};
+
+	/**
+	 * Split a string into Unicode characters, keeping surrogates paired.
+	 *
+	 * @param {string} text Text to split
+	 * @returns {string[]} Array of characters
+	 */
+	unicodeJS.splitCharacters = function ( text ) {
+		return text.split( /(?![\uDC00-\uDFFF])/g );
+		// TODO: think through handling of invalid UTF-16
+	};
+
+	// Expose
+	window.unicodeJS = unicodeJS;
+}() );
--- a/modules/unicodejs/unicodejs.textstring.js
+++ b/modules/unicodejs/unicodejs.textstring.js
@ -16,21 +16,51 @@
 * @param {string} text Text
 */
 unicodeJS.TextString = function UnicodeJSTextString( text ) {
-	this.text = text;
+	this.clusters = unicodeJS.splitClusters( text );
 };

 /* Methods */

 /**
- * Read character at specified position
+ * Read grapheme cluster at specified position
 *
 * @method
 * @param {number} position Position to read from
- * @returns {string|null} Character, or null if out of bounds
+ * @returns {string|null} Grapheme cluster, or null if out of bounds
 */
 unicodeJS.TextString.prototype.read = function ( position ) {
-	if ( position < 0 || position >= this.text.length ) {
-		return null;
-	}
-	return this.text.charAt( position );
+	var clusterAt = this.clusters[position];
+	return clusterAt !== undefined ? clusterAt : null;
+};
+
+/**
+ * Return number of grapheme clusters in the text string
+ *
+ * @method
+ * @returns {number} Number of grapheme clusters
+ */
+unicodeJS.TextString.prototype.getLength = function () {
+	return this.clusters.length;
+};
+
+/**
+ * Return a sub-TextString
+ *
+ * @param {number} start Start offset
+ * @param {number} end End offset
+ * @returns {unicodeJS.TextString} New TextString object containing substring
+ */
+unicodeJS.TextString.prototype.substring = function ( start, end ) {
+	var textString = new unicodeJS.TextString( '' );
+	textString.clusters = this.clusters.slice( start, end );
+	return textString;
+};
+
+/**
+ * Get as a plain string
+ *
+ * @returns {string} Plain javascript string
+ */
+unicodeJS.TextString.prototype.getString = function () {
+	return this.clusters.join( '' );
 };
--- a/modules/unicodejs/unicodejs.wordbreak.js
+++ b/modules/unicodejs/unicodejs.wordbreak.js
@ -19,19 +19,39 @@

 	// build regexes
 	for ( group in groups ) {
-		patterns[group] = new RegExp( '[' + groups[group] + ']' );
+		patterns[group] = new RegExp( '[' + groups[group] + ']' ); // TODO: handle surrogates
 	}

-	function getGroup( chr ) {
-		// chr is always converted to a string by RegExp#test
+	/**
+	 * Return the wordbreak property value for the cluster
+	 *
+	 * This is a slight approximation, because Unicode wordbreak property values are defined
+	 * per character, not per cluster, whereas we're already working with a string
+	 * split into clusters.
+	 *
+	 * We are making a working assumption that we can implement the Unicode
+	 * word boundary specification by taking the property value of the *first*
+	 * character of the cluster. In particular, this implements WB4 for us, because
+	 * non-initial Extend or Format characters disappear.
+	 *
+	 * See http://www.unicode.org/reports/tr29/#Word_Boundaries
+	 *
+	 * @private
+	 * @param {string} cluster The grapheme cluster
+	 * @returns {string} The unicode wordbreak property value
+	 */
+	function getGroup( cluster ) {
+		var character;
+		// cluster is always converted to a string by RegExp#test
 		// e.g. null -> 'null' and would match /[a-z]/
 		// so return null for any non-string value
-		if ( typeof chr !== 'string' ) {
+		if ( typeof cluster !== 'string' ) {
 			return null;
 		}
+		character = unicodeJS.splitCharacters( cluster )[0];
 		var group;
 		for ( group in patterns ) {
-			if ( patterns[group].test( chr ) ) {
+			if ( patterns[group].test( character ) ) {
 				return group;
 			}
 		}
--- a/modules/unicodejs/unicodejs.wordbreak.test.js
+++ b/modules/unicodejs/unicodejs.wordbreak.test.js
@ -20,8 +20,15 @@ QUnit.test( 'isBreak', function ( assert ) {
 			// 30 - 40
 			" a_b_3_ナ_ " +
 			// 40 - 50
-			"汉字/漢字 c\u0300\u0327k" +
+			"汉字/漢字 c\u0300\u0327k  " +
 			// 50 - 60
+			"\ud800\udf08" + // U+10308 OLD ITALIC LETTER THE
+			"\ud800\udf08\u0302" + // U+10308 OLD ITALIC LETTER THE + combining circumflex
+			"\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
+			" pad " +
+			"\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
+			"\ud800\udf0a" + // U+1030A OLD ITALIC LETTER KA
+			// 60 - 65: "a." tests end of para
 			" c\u0300\u0327 a.",
 			/*jshint quotmark:single */
 		textString = new unicodeJS.TextString( text ),
@ -30,18 +37,22 @@ QUnit.test( 'isBreak', function ( assert ) {
 			11, 12, 13, 14, 15, 16, 17, 19,
 			21, 25, 30,
 			31, 39, 40,
-			41, 42, 43, 44, 45, 46, 50,
-			51, 54, 55, 56, 57
+			41, 42, 43, 44, 45, 46, 48, 49, 50,
+			51, 52, // TODO: these should not be in the list
+			53, 54, 57, 58,
+			59, // TODO: this should not be in the list
+			60,
+			61, 62, 63, 64, 65
 		];

-	QUnit.expect( text.length + 1 );
+	QUnit.expect( textString.getLength() + 1 );

-	for ( i = 0; i <= text.length; i++ ) {
+	for ( i = 0; i <= textString.getLength(); i++ ) {
 		result = ( breaks.indexOf( i ) !== -1 );
 		context =
-			text.substring( Math.max( i - 4, 0 ), i ) +
+			textString.substring( Math.max( i - 4, 0 ), i ).getString() +
 			'│' +
-			text.substring( i, Math.min( i + 4, text.length ) )
+			textString.substring( i, Math.min( i + 4, text.length ) ).getString()
 		;
 		assert.equal(
 			unicodeJS.wordbreak.isBreak( textString, i ),
--- a/modules/ve/ve.js
+++ b/modules/ve/ve.js
@ -770,15 +770,10 @@
 		return ve.init.platform.getMessage.apply( ve.init.platform, arguments );
 	};

-	/**
-	 * Split a string into individual characters, leaving multibyte characters as one item
-	 *
-	 * @param {string} text Text to split
-	 * @returns {string[]} Array of characters
-	 */
-	ve.splitCharacters = function ( text ) {
-		return text.split( /(?![\uDC00-\uDFFF\u0300-\u036F])/g ); // don't split UTF surrogate pairs
-	};
+    /**
+     * @see unicodeJS#splitClusters
+     */
+	ve.splitCharacters = unicodeJS.splitClusters; // TODO: rename to ve.splitClusters

 	/**
 	 * Determine if the text consists of only unattached combining marks