mediawiki-extensions-Visual.../modules/unicodejs/unicodejs.graphemebreak.js
David Chan a1eb56c14f splitClusters uses Grapheme Cluster Boundary rules
unicodejs.graphemebreak.js
* New file: singleton class with splitClusters method
* On load, builds graphemeBreakRegexp from unicodejs.graphemebreakproperties.js

unicodejs.js
* Remove old splitClusters method (was just a placeholder)
* Change "conjunction" -> "disjunction", for consistency and correctness

unicodejs.textstring.js
* Use new splitClusters method

modules/ve/ve.js
* Use new splitClusters method

unicodejs.wordbreak.test.js
* Add new splitClusters test
* Refactor charRangeArrayRegexp test to use splitClusters

PHP files
* add unicodejs.graphemebreak.js, unicodejs.graphemebreakproperties.js

.docs/categories.json
* add unicodeJS.wordbreak class

Change-Id: I8f512e2fc2c46eb4b5f00994a8dac88f3c8f7dd2
2013-06-16 21:46:02 +01:00

92 lines
2.5 KiB
JavaScript

/*!
* Graphemebreak module
*
* Implementation of grapheme cluster boundary detection, based on
* Unicode UAX #29 Default Grapheme Cluster Boundary Specification; see
* http://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
*
* @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
* @license The MIT License (MIT); see LICENSE.txt
*/
( function () {
var property, disjunction, graphemeBreakRegexp, hangulSyllable, clusterBase, extenders,
	properties = unicodeJS.graphemebreakproperties,
	// Single unicode character (either a UTF-16 code unit or a surrogate pair)
	oneCharacter = '[^\\ud800-\\udfff]|[\\ud800-\\udbff][\\udc00-\\udfff]',
	/**
	 * @class unicodeJS.graphemebreak
	 * @singleton
	 */
	graphemebreak = unicodeJS.graphemebreak = {},
	patterns = {};

// Build one pattern per grapheme break property; the results are concatenated
// into regexp source text below. Inherited enumerable keys are skipped so that
// only the property table's own entries are converted.
for ( property in properties ) {
	if ( Object.prototype.hasOwnProperty.call( properties, property ) ) {
		patterns[property] = unicodeJS.charRangeArrayRegexp( properties[property] );
	}
}

// GB6, GB7, GB8: a Hangul syllable sequence is never split. The alternatives
// are tried in order, so the more specific shapes come before the bare L+ / T+ runs.
hangulSyllable = [
	'(?:' + patterns.L + ')*' +
		'(?:' + patterns.V + ')+' +
		'(?:' + patterns.T + ')*',
	'(?:' + patterns.L + ')*' +
		'(?:' + patterns.LV + ')' +
		'(?:' + patterns.V + ')*' +
		'(?:' + patterns.T + ')*',
	'(?:' + patterns.L + ')*' +
		'(?:' + patterns.LVT + ')' +
		'(?:' + patterns.T + ')*',
	'(?:' + patterns.L + ')+',
	'(?:' + patterns.T + ')+'
].join( '|' );

// Everything that may start a cluster and then be extended by GB9/GB9a
clusterBase = [
	hangulSyllable,
	// GB8a
	/*jshint camelcase:false */
	'(?:' + patterns.Regional_Indicator + ')+',
	/*jshint camelcase:true */
	// GB10 (taking care not to split surrogates)
	oneCharacter
].join( '|' );

// GB9, GB9a: never break before Extend or SpacingMark
// (GB9b: As of Unicode 6.2, no characters are "Prepend")
extenders = '(?:' + patterns.Extend + '|' + patterns.SpacingMark + ')*';

// build disjunction for grapheme cluster split
// See http://www.unicode.org/reports/tr29/ at "Grapheme Cluster Boundary Rules"
// The overall shape follows the extended grapheme cluster expression there:
// CRLF | Control | ( RI-sequence | Hangul-syllable | any ) ( Extend | SpacingMark )*
disjunction = [
	// GB1 and GB2 are trivially satisfied
	// GB3
	'\\r\\n',
	// GB4, GB5: a lone CR or LF is a cluster of its own. It is listed explicitly so
	// that it can never reach the generic branch below and absorb a following mark.
	'[\\r\\n]',
	// GB4, GB5
	patterns.Control,
	// GB6-GB10: the extenders attach to whatever base precedes them, including
	// multi-character Hangul and regional indicator sequences. The control
	// alternatives above are tried first, so the base here is never a control.
	'(?:' + clusterBase + ')' + extenders
];
graphemeBreakRegexp = new RegExp( '(' + disjunction.join( '|' ) + ')' );
/**
 * Break a string up into its grapheme clusters.
 *
 * The text is split on the grapheme break regexp, whose single capturing group
 * makes every matched cluster appear in the split result. Any text between
 * matches is kept as an element of its own, and empty strings are discarded.
 *
 * @param {string} text Text to split
 * @returns {string[]} Array of clusters
 */
graphemebreak.splitClusters = function ( text ) {
	var index,
		pieces = text.split( graphemeBreakRegexp ),
		count = pieces.length,
		result = [];
	for ( index = 0; index < count; index++ ) {
		// Skip the empty strings that split() leaves between adjacent matches
		if ( pieces[index] === '' ) {
			continue;
		}
		result.push( pieces[index] );
	}
	return result;
};
}() );