mediawiki-extensions-Visual.../modules/unicodejs/unicodejs.wordbreak.js
Ed Sanders f36e68c333 Implement next/prevBreakOffset and word skipping
This provides the functionality for keyboard word skipping
(i.e. pressing ctrl/alt + arrow key).

Bug: 46794
Change-Id: Ib0861fa075df805410717a148b8a6e166d947849
2013-04-09 21:55:57 +00:00

201 lines
6.8 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*!
* Wordbreak module
*
* Implementation of Unicode's Default Word Boundaries
* http://www.unicode.org/reports/tr29/#Default_Word_Boundaries
*
* @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
* @license The MIT License (MIT); see LICENSE.txt
*/
( function () {
// Regex sources keyed by group name, taken from unicodeJS.groups; each value is
// spliced into a [...] character class below
var groups = unicodeJS.groups;
/**
 * @class unicodeJS.wordbreak
 * @singleton
 */
var wordbreak = {};
unicodeJS.wordbreak = wordbreak;
// Compiled matcher per group name, consulted by getGroup()
var patterns = {};
var group;
var charClass;

// Compile one character-class regex for every entry in groups
for ( group in groups ) {
	charClass = '[' + groups[group] + ']';
	patterns[group] = new RegExp( charClass );
}
/**
 * Find the name of the first group whose pattern matches a character.
 *
 * @private
 * @param {string|null} chr Character to classify; callers pass the result of
 *  TextString#read directly, which is null outside the text
 * @returns {string|null} Name of the first matching group in `patterns`, or null
 *  if nothing matches or `chr` is not a string
 */
function getGroup( chr ) {
	var group;
	// RegExp#test converts its argument to a string, so null would be tested as
	// the text 'null' (and undefined as 'undefined') and could be classified as
	// a letter. Anything that is not a string has no group.
	if ( typeof chr !== 'string' ) {
		return null;
	}
	for ( group in patterns ) {
		if ( patterns[group].test( chr ) ) {
			return group;
		}
	}
	return null;
}
/**
 * Search forwards from a position for a word break offset.
 *
 * Delegates to #moveBreakOffset with a direction of +1.
 *
 * @param {unicodeJS.TextString} string TextString
 * @param {number} pos Character position to start from
 * @param {boolean} [onlyAlphaNumeric=false] When set, a break is only accepted if the
 *  character passed over to reach it is alphanumeric
 * @returns {number} Offset reported by #moveBreakOffset for the forward search
 */
wordbreak.nextBreakOffset = function ( string, pos, onlyAlphaNumeric ) {
	var forward = 1;
	return wordbreak.moveBreakOffset( forward, string, pos, onlyAlphaNumeric );
};
/**
 * Search backwards from a position for a word break offset.
 *
 * Delegates to #moveBreakOffset with a direction of -1.
 *
 * @param {unicodeJS.TextString} string TextString
 * @param {number} pos Character position to start from
 * @param {boolean} [onlyAlphaNumeric=false] When set, a break is only accepted if the
 *  character passed over to reach it is alphanumeric
 * @returns {number} Offset reported by #moveBreakOffset for the backward search
 */
wordbreak.prevBreakOffset = function ( string, pos, onlyAlphaNumeric ) {
	var backward = -1;
	return wordbreak.moveBreakOffset( backward, string, pos, onlyAlphaNumeric );
};
/**
 * Find the nearest word break offset from a position in a given direction.
 *
 * Starting at `pos`, the offset is advanced one step at a time for as long as
 * the text has a character in the direction of travel (i.e. TextString#read
 * does not return null). The search stops at the first offset that #isBreak
 * accepts; if the text runs out first, the offset reached is returned.
 *
 * @param {number} direction Direction to search in, should be plus or minus one
 * @param {unicodeJS.TextString} string TextString
 * @param {number} pos Character position to start from
 * @param {boolean} [onlyAlphaNumeric=false] When set, a break is skipped unless the
 *  character just passed over is in the ALetter, Numeric or Katakana group
 * @returns {number} Offset of the word break found, or of the text boundary reached
 */
wordbreak.moveBreakOffset = function ( direction, string, pos, onlyAlphaNumeric ) {
	var passedGroup,
		offset = pos,
		// Index adjustment selecting the character about to be passed over: the
		// one at the offset when moving forwards, the one before it when moving
		// backwards
		peek = direction > 0 ? 0 : -1;

	// Keep stepping while there is still a character ahead in the direction of travel
	while ( string.read( offset + peek ) !== null ) {
		offset += direction;
		if ( !unicodeJS.wordbreak.isBreak( string, offset ) ) {
			continue;
		}
		if ( !onlyAlphaNumeric ) {
			return offset;
		}
		// Only accept the break if the character just passed over is alphanumeric
		passedGroup = getGroup( string.read( offset - direction + peek ) );
		if (
			passedGroup === 'ALetter' ||
			passedGroup === 'Numeric' ||
			passedGroup === 'Katakana'
		) {
			return offset;
		}
	}
	return offset;
};
/**
 * Evaluates if the specified position within some text is a word boundary.
 *
 * The rules of Unicode's Default Word Boundaries (WB1 to WB14) are applied in
 * order to the characters on either side of the position; the first rule that
 * matches decides the result.
 *
 * @param {unicodeJS.TextString} string Text string
 * @param {number} pos Character position
 * @returns {boolean} Is the position a word boundary
 */
wordbreak.isBreak = function ( string, pos ) {
	// Groups of the characters around pos, nearest first: lft[0] is the character
	// immediately before pos, rgt[0] the character at pos. l and r count how far
	// each side has been read so far.
	var lft = [], rgt = [], l = 0, r = 0;

	// Break at the start and end of text.
	// WB1: sot ÷
	// WB2: ÷ eot
	if ( string.read( pos - 1 ) === null || string.read( pos ) === null ) {
		return true;
	}

	// get some context
	rgt.push( getGroup( string.read( pos + r ) ) );
	lft.push( getGroup( string.read( pos - l - 1 ) ) );

	switch ( true ) {
		// Do not break within CRLF.
		// WB3: CR × LF
		case lft[0] === 'CR' && rgt[0] === 'LF':
			return false;
		// Otherwise break before and after Newlines (including CR and LF)
		// WB3a: (Newline | CR | LF) ÷
		case lft[0] === 'Newline' || lft[0] === 'CR' || lft[0] === 'LF':
		// WB3b: ÷ (Newline | CR | LF)
		case rgt[0] === 'Newline' || rgt[0] === 'CR' || rgt[0] === 'LF':
			return true;
	}

	// Ignore Format and Extend characters, except when they appear at the beginning of a region of text.
	// WB4: X (Extend | Format)* → X
	if ( rgt[0] === 'Extend' || rgt[0] === 'Format' ) {
		// The Extend|Format character is to the right, so it is attached
		// to a character to the left, don't split here
		return false;
	}
	// We've reached the end of an Extend|Format sequence, collapse it
	while ( lft[0] === 'Extend' || lft[0] === 'Format' ) {
		l++;
		// Index 0 is still a real character (the possible base of the sequence),
		// so only stop once the index to read falls before the text.
		if ( pos - l - 1 < 0 ) {
			// start of document
			return true;
		}
		lft[lft.length - 1] = getGroup( string.read( pos - l - 1 ) );
	}

	// Do not break between most letters.
	// WB5: ALetter × ALetter
	if ( lft[0] === 'ALetter' && rgt[0] === 'ALetter' ) {
		return false;
	}

	// some tests beyond this point require more context
	// NOTE(review): the second character of context on each side is read
	// directly, without skipping Extend|Format characters as WB4 describes.
	l++;
	r++;
	rgt.push( getGroup( string.read( pos + r ) ) );
	lft.push( getGroup( string.read( pos - l - 1 ) ) );

	switch ( true ) {
		// Do not break letters across certain punctuation.
		// WB6: ALetter × (MidLetter | MidNumLet) ALetter
		case lft[0] === 'ALetter' && rgt[1] === 'ALetter' &&
			( rgt[0] === 'MidLetter' || rgt[0] === 'MidNumLet' ):
		// WB7: ALetter (MidLetter | MidNumLet) × ALetter
		case rgt[0] === 'ALetter' && lft[1] === 'ALetter' &&
			( lft[0] === 'MidLetter' || lft[0] === 'MidNumLet' ):
			return false;

		// Do not break within sequences of digits, or digits adjacent to letters (“3a”, or “A3”).
		// WB8: Numeric × Numeric
		case lft[0] === 'Numeric' && rgt[0] === 'Numeric':
		// WB9: ALetter × Numeric
		case lft[0] === 'ALetter' && rgt[0] === 'Numeric':
		// WB10: Numeric × ALetter
		case lft[0] === 'Numeric' && rgt[0] === 'ALetter':
			return false;

		// Do not break within sequences, such as “3.2” or “3,456.789”.
		// WB11: Numeric (MidNum | MidNumLet) × Numeric
		case rgt[0] === 'Numeric' && lft[1] === 'Numeric' &&
			( lft[0] === 'MidNum' || lft[0] === 'MidNumLet' ):
		// WB12: Numeric × (MidNum | MidNumLet) Numeric
		case lft[0] === 'Numeric' && rgt[1] === 'Numeric' &&
			( rgt[0] === 'MidNum' || rgt[0] === 'MidNumLet' ):
			return false;

		// Do not break between Katakana.
		// WB13: Katakana × Katakana
		case lft[0] === 'Katakana' && rgt[0] === 'Katakana':
			return false;

		// Do not break from extenders.
		// WB13a: (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
		case rgt[0] === 'ExtendNumLet' &&
			( lft[0] === 'ALetter' || lft[0] === 'Numeric' || lft[0] === 'Katakana' || lft[0] === 'ExtendNumLet' ):
		// WB13b: ExtendNumLet × (ALetter | Numeric | Katakana)
		case lft[0] === 'ExtendNumLet' &&
			( rgt[0] === 'ALetter' || rgt[0] === 'Numeric' || rgt[0] === 'Katakana' ):
			return false;

		// Do not break between regional indicator symbols.
		// WB13c: Regional_Indicator × Regional_Indicator
		case lft[0] === 'Regional_Indicator' && rgt[0] === 'Regional_Indicator':
			return false;
	}

	// Otherwise, break everywhere (including around ideographs).
	// WB14: Any ÷ Any
	return true;
};
}() );