mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-30 00:55:00 +00:00
92 lines
2.5 KiB
JavaScript
92 lines
2.5 KiB
JavaScript
|
/*!
 * Graphemebreak module
 *
 * Implementation of grapheme cluster boundary detection, based on
 * Unicode UAX #29 Default Grapheme Cluster Boundary Specification; see
 * http://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
 *
 * @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
 * @license The MIT License (MIT); see LICENSE.txt
 */
|
||
|
( function () {
|
||
|
// Grapheme break property table published on the unicodeJS namespace. Each
// entry is later handed to unicodeJS.charRangeArrayRegexp to obtain a regex
// source fragment for that property.
var properties = unicodeJS.graphemebreakproperties;

// Regex source matching one whole character: either a single UTF-16 code
// unit outside the surrogate range, or a high surrogate followed by a low
// surrogate.
var oneCharacter = '[^\\ud800-\\udfff]|[\\ud800-\\udbff][\\udc00-\\udfff]';

/**
 * @class unicodeJS.graphemebreak
 * @singleton
 */
var graphemebreak = unicodeJS.graphemebreak = {};

// Regex source fragments, keyed by grapheme break property name.
var patterns = {};

// Assigned further down, while the cluster-splitting regexp is assembled.
var property, disjunction, graphemeBreakRegexp;
|
||
|
|
||
|
// Build one regex source fragment per grapheme break property.
// Only own properties are visited, so enumerable members that other scripts
// may have added to Object.prototype are never passed to charRangeArrayRegexp
// or stored in patterns.
for ( property in properties ) {
	if ( Object.prototype.hasOwnProperty.call( properties, property ) ) {
		patterns[property] = unicodeJS.charRangeArrayRegexp( properties[property] );
	}
}
|
||
|
|
||
|
// Build the disjunction used to split text into grapheme clusters.
// See http://www.unicode.org/reports/tr29/ at "Grapheme Cluster Boundary Rules".
// Alternatives are tried in the order listed, so earlier entries win.
disjunction = [
	// GB1 and GB2 are trivially satisfied

	// GB3
	'\\r\\n',

	// GB4, GB5
	patterns.Control,

	// Every other cluster is a base (GB6-GB8a, or any single character for
	// GB10) followed by zero or more extending characters (GB9, GB9a).
	// The extender suffix is attached to every kind of base, so a Hangul
	// syllable sequence or a run of regional indicators keeps its trailing
	// Extend / SpacingMark characters instead of losing them to a separate
	// cluster.
	'(?:' + [
		// GB6, GB7, GB8
		'(?:' + patterns.L + ')*' +
		'(?:' + patterns.V + ')+' +
		'(?:' + patterns.T + ')*',

		'(?:' + patterns.L + ')*' +
		'(?:' + patterns.LV + ')' +
		'(?:' + patterns.V + ')*' +
		'(?:' + patterns.T + ')*',

		'(?:' + patterns.L + ')*' +
		'(?:' + patterns.LVT + ')' +
		'(?:' + patterns.T + ')*',

		'(?:' + patterns.L + ')+',

		'(?:' + patterns.T + ')+',

		// GB8a
		/*jshint camelcase:false */
		'(?:' + patterns.Regional_Indicator + ')+',
		/*jshint camelcase:true */

		// GB10 (taking care not to split surrogates)
		oneCharacter
	].join( '|' ) + ')' +
	// GB9, GB9a
	// (GB9b: As of Unicode 6.2, no characters are "Prepend")
	'(?:' + patterns.Extend + '|' +
	patterns.SpacingMark + ')*'
];
|
||
|
// The alternatives are wrapped in a single capturing group: splitClusters
// relies on String.prototype.split, which includes captured text in its
// result when the separator regexp has a capturing group.
graphemeBreakRegexp = new RegExp( [ '(', disjunction.join( '|' ), ')' ].join( '' ) );
|
||
|
|
||
|
/**
 * Split a string into grapheme clusters.
 *
 * @param {string} text Text to split
 * @returns {string[]} Array of clusters
 */
graphemebreak.splitClusters = function ( text ) {
	// split() interleaves the captured matches with the text found between
	// them; empty pieces are discarded and everything else is kept in order.
	return text.split( graphemeBreakRegexp ).filter( function ( piece ) {
		return piece !== '';
	} );
};
|
||
|
}() );
|