Merge "Revert model to use simple UTF-16 code units"

This commit is contained in:
jenkins-bot 2013-11-26 22:53:06 +00:00 committed by Gerrit Code Review
commit 91f00b0eaf
7 changed files with 18 additions and 25 deletions

View file

@ -1,6 +0,0 @@
<p>12𨋢456789𨋢bc</p>
<p>「𨋢」字響<tt>香港</tt>衍生出好多新詞,好似:𨋢<tt>香港</tt> abc</p>
<p>abc</p>
<p>one c̀ombining accent</p>
<p>two ç̀ombining accents</p>
<p>def</p>

View file

@ -24,7 +24,7 @@ for breaktype in ['Grapheme', 'Word']:
if not m: if not m:
raise ValueError( "Bad line: %r" % line ) raise ValueError( "Bad line: %r" % line )
start, end, prop = m.groups() start, end, prop = m.groups()
if start == 'D800' and end == 'DFFF': if breaktype == 'Grapheme' and start == 'D800' and end == 'DFFF':
continue # raw surrogates are not treated continue # raw surrogates are not treated
if not ranges.has_key( prop ): if not ranges.has_key( prop ):

View file

@ -179,7 +179,6 @@
throw new Error( 'range includes surrogates: ' + throw new Error( 'range includes surrogates: ' +
min.toString( 16 ) + '-' + max.toString( 16 ) ); min.toString( 16 ) + '-' + max.toString( 16 ) );
} }
if ( max <= 0xFFFF ) { if ( max <= 0xFFFF ) {
// interval is entirely BMP // interval is entirely BMP
characterClass.push( codeUnitRange( min, max ) ); characterClass.push( codeUnitRange( min, max ) );

View file

@ -148,12 +148,11 @@ ve.ce.Document.prototype.getNodeAndOffset = function ( offset ) {
} }
item = current[0][current[1]]; item = current[0][current[1]];
if ( item.nodeType === Node.TEXT_NODE ) { if ( item.nodeType === Node.TEXT_NODE ) {
// offset, startOffset and length are all data model lengths (not byte lengths) length = item.textContent.length;
length = ve.getClusterOffset( item.textContent, item.textContent.length );
if ( offset >= startOffset && offset <= startOffset + length ) { if ( offset >= startOffset && offset <= startOffset + length ) {
return { return {
node: item, node: item,
offset: ve.getByteOffset( item.textContent, offset - startOffset ) offset: offset - startOffset
}; };
} else { } else {
startOffset += length; startOffset += length;

View file

@ -157,11 +157,10 @@ ve.ce.getOffsetFromTextNode = function ( domNode, domOffset ) {
item = current[0][current[1]]; item = current[0][current[1]];
if ( item.nodeType === Node.TEXT_NODE ) { if ( item.nodeType === Node.TEXT_NODE ) {
if ( item === domNode ) { if ( item === domNode ) {
// domOffset is a byte offset, convert it to a grapheme cluster offset offset += domOffset;
offset += ve.getClusterOffset( item.textContent, domOffset );
break; break;
} else { } else {
offset += ve.getClusterOffset( item.textContent, item.textContent.length ); offset += item.textContent.length;
} }
} else if ( item.nodeType === Node.ELEMENT_NODE ) { } else if ( item.nodeType === Node.ELEMENT_NODE ) {
$item = current[0].eq( current[1] ); $item = current[0].eq( current[1] );

View file

@ -260,13 +260,11 @@ QUnit.test( 'createDocumentFromHtml', function ( assert ) {
} }
} ); } );
// ve.splitClusters: Tested upstream (UnicodeJS)
// TODO: ve.isUnattachedCombiningMark // TODO: ve.isUnattachedCombiningMark
// TODO: ve.getByteOffset // TODO: ve.getByteOffset
// TODO: ve.getCharacterOffset // TODO: ve.getClusterOffset
QUnit.test( 'graphemeSafeSubstring', function ( assert ) { QUnit.test( 'graphemeSafeSubstring', function ( assert ) {
var i, text = '12\ud860\udee245\ud860\udee2789\ud860\udee2bc', cases = [ var i, text = '12\ud860\udee245\ud860\udee2789\ud860\udee2bc', cases = [

View file

@ -403,12 +403,14 @@
return ve.init.platform.getMessage.apply( ve.init.platform, arguments ); return ve.init.platform.getMessage.apply( ve.init.platform, arguments );
}; };
/** /**
* @method * Compatibility method. We no longer split into clusters at this level.
* @inheritdoc unicodeJS.graphemebreak#splitClusters *
* @see unicodeJS.graphemebreak#splitClusters * TODO: strip out calls to splitClusters then delete this method.
*/ */
ve.splitClusters = unicodeJS.graphemebreak.splitClusters; ve.splitClusters = function ( text ) {
return text.split( '' );
};
/** /**
* Determine if the text consists of only unattached combining marks. * Determine if the text consists of only unattached combining marks.
@ -428,7 +430,8 @@
* @returns {number} Byte offset * @returns {number} Byte offset
*/ */
ve.getByteOffset = function ( text, clusterOffset ) { ve.getByteOffset = function ( text, clusterOffset ) {
return ve.splitClusters( text ).slice( 0, clusterOffset ).join( '' ).length; return unicodeJS.graphemebreak.splitClusters( text ).slice( 0, clusterOffset
).join( '' ).length;
}; };
/** /**
@ -439,7 +442,8 @@
* @returns {number} Grapheme cluster offset * @returns {number} Grapheme cluster offset
*/ */
ve.getClusterOffset = function ( text, byteOffset ) { ve.getClusterOffset = function ( text, byteOffset ) {
return ve.splitClusters( text.substring( 0, byteOffset ) ).length; return unicodeJS.graphemebreak.splitClusters( text.substring( 0, byteOffset
) ).length;
}; };
/** /**