mediawiki-extensions-Visual.../modules/unicodejs/unicodejs.graphemebreak.js

92 lines
2.4 KiB
JavaScript
Raw Normal View History

/*!
* UnicodeJS Graphemebreak module
*
* Implementation of grapheme cluster boundary detection, based on
* Unicode UAX #29 Default Grapheme Cluster Boundary Specification; see
* http://www.unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table
*
* @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
* @license The MIT License (MIT); see LICENSE.txt
*/
( function () {
var property, extenders, clusterBases, disjunction, graphemeBreakRegexp,
	properties = unicodeJS.graphemebreakproperties,
	// Single unicode character (either a UTF-16 code unit or a surrogate pair)
	oneCharacter = '[^\\ud800-\\udfff]|[\\ud800-\\udbff][\\udc00-\\udfff]',
	/**
	 * @class unicodeJS.graphemebreak
	 * @singleton
	 */
	graphemebreak = unicodeJS.graphemebreak = {},
	patterns = {};

// Build one regexp source fragment per grapheme break property.
// Only own properties are used, so anything added to Object.prototype
// elsewhere cannot leak into the pattern table.
for ( property in properties ) {
	if ( Object.prototype.hasOwnProperty.call( properties, property ) ) {
		patterns[property] = unicodeJS.charRangeArrayRegexp( properties[property] );
	}
}

// GB9, GB9a: never break before Extend or SpacingMark characters. This suffix is
// appended to every non-control cluster base below, so that multi-character bases
// (Hangul syllable sequences, regional indicator runs) keep their trailing marks too.
// (GB9b: As of Unicode 6.2, no characters are "Prepend")
extenders = '(?:' + patterns.Extend + '|' + patterns.SpacingMark + ')*';

// Everything that can start a cluster and may be followed by extenders.
// Order matters: the longer Hangul forms must be tried before the catch-all.
clusterBases = [
	// GB6, GB7, GB8
	'(?:' + patterns.L + ')*' +
		'(?:' + patterns.V + ')+' +
		'(?:' + patterns.T + ')*',
	'(?:' + patterns.L + ')*' +
		'(?:' + patterns.LV + ')' +
		'(?:' + patterns.V + ')*' +
		'(?:' + patterns.T + ')*',
	'(?:' + patterns.L + ')*' +
		'(?:' + patterns.LVT + ')' +
		'(?:' + patterns.T + ')*',
	'(?:' + patterns.L + ')+',
	'(?:' + patterns.T + ')+',
	// GB8a
	/*jshint camelcase:false */
	'(?:' + patterns.Regional_Indicator + ')+',
	/*jshint camelcase:true */
	// GB10 (taking care not to split surrogates)
	oneCharacter
];

// Build disjunction for grapheme cluster split
// See http://www.unicode.org/reports/tr29/ at "Grapheme Cluster Boundary Rules"
disjunction = [
	// GB1 and GB2 are trivially satisfied
	// GB3
	'\\r\\n',
	// GB4, GB5: always break around a lone CR or LF. UAX #29 gives CR and LF their
	// own property values, separate from Control, so they are matched literally
	// here rather than relying on the Control data to contain them.
	'[\\r\\n]',
	// GB4, GB5: controls are clusters on their own and never take extenders
	patterns.Control,
	// GB6 to GB10: a cluster base followed by any number of extenders
	'(?:' + clusterBases.join( '|' ) + ')' + extenders
];

// The single capturing group makes String.prototype.split return the matched
// clusters themselves, interleaved with whatever text lay between matches.
graphemeBreakRegexp = new RegExp( '(' + disjunction.join( '|' ) + ')' );
/**
 * Break a string up into its grapheme clusters.
 *
 * @param {string} text Text to split
 * @returns {string[]} Array of clusters
 */
graphemebreak.splitClusters = function ( text ) {
	// The regexp wraps its whole pattern in a capturing group, so split() hands
	// back the matched clusters along with the text found between matches.
	// Only the empty pieces are discarded.
	return text.split( graphemeBreakRegexp ).filter( function ( piece ) {
		return piece !== '';
	} );
};
}() );