Merge "Revert model to use simple UTF-16 code units"

2024-11-12 09:09:25 +00:00 · 2013-11-26 22:53:06 +00:00 · 2013-11-26 22:53:06 +00:00 · 91f00b0eaf
parent af9e6dd263 6f2090aac6
commit 91f00b0eaf
7 changed files with 18 additions and 25 deletions
--- a/demos/ve/pages/multibyte.html
+++ b/demos/ve/pages/multibyte.html
@@ -1,6 +0,0 @@
-<p>12𨋢456789𨋢bc</p>
-<p>「𨋢」字響<tt>香港</tt>衍生出好多新詞，好似：𨋢<tt>香港</tt> abc</p>
-<p>abc</p>
-<p>one c̀ombining accent</p>
-<p>two ç̀ombining accents</p>
-<p>def</p>
--- a/modules/unicodejs/tools/unicodejs-properties.py
+++ b/modules/unicodejs/tools/unicodejs-properties.py
@@ -24,7 +24,7 @@ for breaktype in ['Grapheme', 'Word']:
 		if not m:
 			raise ValueError( "Bad line: %r" % line )
 		start, end, prop = m.groups()
-		if start == 'D800' and end == 'DFFF':
+		if breaktype == 'Grapheme' and start == 'D800' and end == 'DFFF':
 			continue # raw surrogates are not treated

 		if not ranges.has_key( prop ):
--- a/modules/unicodejs/unicodejs.js
+++ b/modules/unicodejs/unicodejs.js
@@ -179,7 +179,6 @@
 				throw new Error( 'range includes surrogates: ' +
 					min.toString( 16 ) + '-' + max.toString( 16 ) );
 			}
-
 			if ( max <= 0xFFFF ) {
 				// interval is entirely BMP
 				characterClass.push( codeUnitRange( min, max ) );
--- a/modules/ve/ce/ve.ce.Document.js
+++ b/modules/ve/ce/ve.ce.Document.js
@@ -148,12 +148,11 @@ ve.ce.Document.prototype.getNodeAndOffset = function ( offset ) {
 		}
 		item = current[0][current[1]];
 		if ( item.nodeType === Node.TEXT_NODE ) {
-			// offset, startOffset and length are all data model lengths (not byte lengths)
-			length = ve.getClusterOffset( item.textContent, item.textContent.length );
+			length = item.textContent.length;
 			if ( offset >= startOffset && offset <= startOffset + length ) {
 				return {
 					node: item,
-					offset: ve.getByteOffset( item.textContent, offset - startOffset )
+					offset: offset - startOffset
 				};
 			} else {
 				startOffset += length;
--- a/modules/ve/ce/ve.ce.js
+++ b/modules/ve/ce/ve.ce.js
@@ -157,11 +157,10 @@ ve.ce.getOffsetFromTextNode = function ( domNode, domOffset ) {
 		item = current[0][current[1]];
 		if ( item.nodeType === Node.TEXT_NODE ) {
 			if ( item === domNode ) {
-				// domOffset is a byte offset, convert it to a grapheme cluster offset
-				offset += ve.getClusterOffset( item.textContent, domOffset );
+				offset += domOffset;
 				break;
 			} else {
-				offset += ve.getClusterOffset( item.textContent, item.textContent.length );
+				offset += item.textContent.length;
 			}
 		} else if ( item.nodeType === Node.ELEMENT_NODE ) {
 			$item = current[0].eq( current[1] );
--- a/modules/ve/test/ve.test.js
+++ b/modules/ve/test/ve.test.js
@@ -260,13 +260,11 @@ QUnit.test( 'createDocumentFromHtml', function ( assert ) {
 	}
 } );

-// ve.splitClusters: Tested upstream (UnicodeJS)
-
 // TODO: ve.isUnattachedCombiningMark

 // TODO: ve.getByteOffset

-// TODO: ve.getCharacterOffset
+// TODO: ve.getClusterOffset

 QUnit.test( 'graphemeSafeSubstring', function ( assert ) {
 	var i, text = '12\ud860\udee245\ud860\udee2789\ud860\udee2bc', cases = [
--- a/modules/ve/ve.js
+++ b/modules/ve/ve.js
@@ -403,12 +403,14 @@
 		return ve.init.platform.getMessage.apply( ve.init.platform, arguments );
 	};

-    /**
-     * @method
-     * @inheritdoc unicodeJS.graphemebreak#splitClusters
-     * @see unicodeJS.graphemebreak#splitClusters
-     */
-	ve.splitClusters = unicodeJS.graphemebreak.splitClusters;
+	/**
+	 * Compatibility method. We no longer split into clusters at this level.
+	 *
+	 * TODO: strip out calls to splitClusters then delete this method.
+	 */
+	ve.splitClusters = function ( text ) {
+		return text.split( '' );
+	};

 	/**
 	 * Determine if the text consists of only unattached combining marks.
@@ -428,7 +430,8 @@
 	 * @returns {number} Byte offset
 	 */
 	ve.getByteOffset = function ( text, clusterOffset ) {
-		return ve.splitClusters( text ).slice( 0, clusterOffset ).join( '' ).length;
+		return unicodeJS.graphemebreak.splitClusters( text ).slice( 0, clusterOffset
+			).join( '' ).length;
 	};

 	/**
@@ -439,7 +442,8 @@
 	 * @returns {number} Grapheme cluster offset
 	 */
 	ve.getClusterOffset = function ( text, byteOffset ) {
-		return ve.splitClusters( text.substring( 0, byteOffset ) ).length;
+		return unicodeJS.graphemebreak.splitClusters( text.substring( 0, byteOffset
+			) ).length;
 	};

 	/**