UnicodeJS library to implement Unicode standards

Initially just with a Wordbreak module to implement the Unicode standard on 'Default Word Boundaries'. Because it can stand alone, this has been written as a separate library. Non-BMP characters are currently not supported. Bug: 44085 Change-Id: Ieafa070076f4c36855684f6bc179667e28af2c25
2024-11-27 15:50:29 +00:00 · 2013-03-18 11:31:14 +00:00 · 2013-03-18 11:31:14 +00:00 · 4988efd35e
parent e1f4196046
commit 4988efd35e
22 changed files with 633 additions and 45 deletions
--- a/.docs/categories.json
+++ b/.docs/categories.json
@ -38,6 +38,7 @@
 					"ve.dm.MetaItemFactory",
 					"ve.dm.ModelRegistry",
 					"ve.dm.Converter",
+					"ve.dm.DataString",
 					"ve.dm.DocumentSlice",
 					"ve.dm.DocumentSynchronizer",
 					"ve.dm.NodeFactory",
@ -149,6 +150,17 @@
 			}
 		]
 	},
+	{
+		"name": "UnicodeJS",
+		"groups": [
+			{
+				"name": "UnicodeJS",
+				"classes": [
+					"unicodeJS.TextString", "unicodeJS.wordbreak"
+				]
+			}
+		]
+	},
 	{
 		"name": "Upstream",
 		"groups": [
--- a/.docs/config.json
+++ b/.docs/config.json
@ -9,6 +9,7 @@
 	"--output": "../docs",
 	"--": [
 		"./external.js",
+		"../modules/unicodejs",
 		"../modules/ve"
 	]
 }
--- a/.jshintignore
+++ b/.jshintignore
@ -3,5 +3,6 @@ docs/
 modules/jquery
 modules/qunit
 modules/rangy
+modules/unicodejs
 modules/parser
 tests/parser
--- a/.jshintrc
+++ b/.jshintrc
@ -1,6 +1,7 @@
 {
 	"predef": [
 		"ve",
+		"unicodeJS",
 		"QUnit"
 	],

--- a/VisualEditor.php
+++ b/VisualEditor.php
@ -67,7 +67,15 @@ $wgResourceModules += array(
 	),
 	'jquery.visibleText' => $wgVisualEditorResourceTemplate + array(
 		'scripts' => array(
-			'jquery/jquery.visibleText.js'
+			'jquery/jquery.visibleText.js',
+		),
+	),
+	'unicodejs.wordbreak' => $wgVisualEditorResourceTemplate + array(
+		'scripts' => array(
+			'unicodejs/unicodejs.js',
+			'unicodejs/unicodejs.textstring.js',
+			'unicodejs/unicodejs.wordbreak.groups.js',
+			'unicodejs/unicodejs.wordbreak.js',
 		),
 	),
 	// Alias for backwards compat, safe to remove after
@ -230,6 +238,7 @@ $wgResourceModules += array(
 			've/dm/ve.dm.Transaction.js',
 			've/dm/ve.dm.Surface.js',
 			've/dm/ve.dm.SurfaceFragment.js',
+			've/dm/ve.dm.DataString.js',
 			've/dm/ve.dm.Document.js',
 			've/dm/ve.dm.DocumentSlice.js',
 			've/dm/ve.dm.DocumentSynchronizer.js',
@ -390,6 +399,7 @@ $wgResourceModules += array(
 		'dependencies' => array(
 			'jquery',
 			'rangy',
+			'unicodejs.wordbreak',
 			'ext.visualEditor.base',
 			'mediawiki.Title',
 			'jquery.autoEllipsis',
--- a/demos/ve/index.php
+++ b/demos/ve/index.php
@ -68,6 +68,10 @@ $html = file_get_contents( $page );
 		<script src="../../modules/jquery/jquery.js"></script>
 		<script src="../../modules/rangy/rangy-core.js"></script>
 		<script src="../../modules/rangy/rangy-position.js"></script>
+		<script src="../../modules/unicodejs/unicodejs.js"></script>
+		<script src="../../modules/unicodejs/unicodejs.textstring.js"></script>
+		<script src="../../modules/unicodejs/unicodejs.wordbreak.groups.js"></script>
+		<script src="../../modules/unicodejs/unicodejs.wordbreak.js"></script>
 		<!-- ext.visualEditor.base -->
 		<script src="../../modules/ve/ve.js"></script>
 		<script src="../../modules/ve/ve.EventEmitter.js"></script>
@ -125,6 +129,7 @@ $html = file_get_contents( $page );
 		<script src="../../modules/ve/dm/ve.dm.Transaction.js"></script>
 		<script src="../../modules/ve/dm/ve.dm.Surface.js"></script>
 		<script src="../../modules/ve/dm/ve.dm.SurfaceFragment.js"></script>
+		<script src="../../modules/ve/dm/ve.dm.DataString.js"></script>
 		<script src="../../modules/ve/dm/ve.dm.Document.js"></script>
 		<script src="../../modules/ve/dm/ve.dm.DocumentSlice.js"></script>
 		<script src="../../modules/ve/dm/ve.dm.DocumentSynchronizer.js"></script>
--- a/maintenance/makeStaticLoader.php
+++ b/maintenance/makeStaticLoader.php
@ -42,6 +42,10 @@ class MakeStaticLoader extends Maintenance {
 				'jquery/jquery.js',
 				'rangy/rangy-core.js',
 				'rangy/rangy-position.js',
+				'unicodejs/unicodejs.js',
+				'unicodejs/unicodejs.textstring.js',
+				'unicodejs/unicodejs.wordbreak.groups.js',
+				'unicodejs/unicodejs.wordbreak.js',
 			),
 		);

--- a/modules/unicodejs/AUTHORS.txt
+++ b/modules/unicodejs/AUTHORS.txt
@ -0,0 +1,7 @@
+Principal Authors (major contributors, alphabetically)
+
+Ed Sanders <esanders@wikimedia.org>
+
+Patch Contributors (minor contributors, alphabetically)
+
+David Chan <david@troi.org>
--- a/modules/unicodejs/LICENSE.txt
+++ b/modules/unicodejs/LICENSE.txt
@ -0,0 +1,25 @@
+Copyright (c) 2013 UnicodeJS team and others under the terms
+of The MIT License (MIT), as follows:
+
+This software consists of voluntary contributions made by many
+individuals (AUTHORS.txt) For exact contribution history, see the
+revision history and logs, available at https://gerrit.wikimedia.org
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/modules/unicodejs/index.php
+++ b/modules/unicodejs/index.php
@ -0,0 +1,32 @@
+<!--
+/**
+ * UnicodeJS tests
+ *
+ * @file
+ * @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
+ * @license The MIT License (MIT); see LICENSE.txt
+ */
+-->
+<!DOCTYPE html>
+<html>
+	<head>
+		<meta charset="UTF-8">
+		<title>UnicodeJS Tests</title>
+
+		<!-- Load test framework -->
+		<link rel="stylesheet" href="../qunit/qunit.css">
+		<script src="../qunit/qunit.js"></script>
+
+		<!-- Dependencies -->
+		<script src="../jquery/jquery.js"></script>
+		<script src="unicodejs.js"></script>
+		<script src="unicodejs.textstring.js"></script>
+		<script src="unicodejs.wordbreak.groups.js"></script>
+		<script src="unicodejs.wordbreak.js"></script>
+
+		<script src="unicodejs.wordbreak.test.js"></script>
+	</head>
+	<body>
+		<div id="qunit"></div>
+	</body>
+</html>
--- a/modules/unicodejs/tools/unicodejs.wordbreak.groups.php
+++ b/modules/unicodejs/tools/unicodejs.wordbreak.groups.php
@ -0,0 +1,45 @@
+<?php
+/**
+ * Wordbreak character groups generator
+ *
+ * @file
+ * @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
+ * @license The MIT License (MIT); see LICENSE.txt
+ */
+
+// Generator script: downloads the Unicode word break property table and writes
+// it out as unicodejs.wordbreak.groups.js, one character class string per group.
+
+echo 'Downloading break point ranges from unicode.org...   ';
+$data = file_get_contents( 'http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt' );
+if ( $data === false || $data === '' ) {
+	// Stop here: carrying on would overwrite the generated groups file with an empty list
+	echo "failed\n";
+	exit( 1 );
+}
+echo "done\n";
+
+echo 'Generating regular expressions...   ';
+$lines = explode( "\n", $data );
+
+// Map of group name => concatenated character class ranges
+$groups = array();
+
+for ( $i = 0, $len = count( $lines ); $i < $len; $i++ ) {
+	$line = $lines[$i];
+	// Skip comment lines and empty lines
+	if ( substr( $line, 0, 1 ) === '#' || $line === '' ) {
+		continue;
+	}
+	// Split the line on ';' and '#': first column is the code point (or range),
+	// second column is the group name
+	$cols = preg_split( '/[;#]/', $line );
+	if ( count( $cols ) < 2 ) {
+		// No group column on this line (e.g. stray whitespace): nothing to record
+		continue;
+	}
+	// Ignoring non-BMP characters for the time being
+	if ( preg_match( '/[a-f0-9]{5}/i', $cols[0] ) ) {
+		continue;
+	}
+	// Turn e.g. "0041..005A" into the character class fragment "\u0041-\u005A"
+	$range = '\u' . str_replace( '..', '-\u', trim( $cols[0] ) );
+	$group = trim( $cols[1] );
+	if ( !isset( $groups[$group] ) ) {
+		$groups[$group] = '';
+	}
+	$groups[$group] .= $range;
+}
+
+if ( count( $groups ) === 0 ) {
+	// The download did not contain any usable lines; keep the existing output file
+	echo "failed (no groups found)\n";
+	exit( 1 );
+}
+
+echo "done\n";
+
+echo 'Writing to unicodejs.wordbreak.groups.js...   ';
+
+// json_encode pretty-prints with four spaces; the code base indents with tabs
+$json = preg_replace( '/    /', "\t", json_encode( $groups, JSON_PRETTY_PRINT ) );
+$written = file_put_contents(
+	dirname( __DIR__ ) . '/unicodejs.wordbreak.groups.js',
+	"/*jshint quotmark:double */\nunicodeJS.groups = " . $json . ";\n"
+);
+if ( $written === false ) {
+	echo "failed\n";
+	exit( 1 );
+}
+
+echo "done\n";
--- a/modules/unicodejs/unicodejs.js
+++ b/modules/unicodejs/unicodejs.js
@ -0,0 +1,8 @@
+/*!
+ * UnicodeJS namespace.
+ *
+ * @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
+ * @license The MIT License (MIT); see LICENSE.txt
+ */
+
+window.unicodeJS = {};
--- a/modules/unicodejs/unicodejs.textstring.js
+++ b/modules/unicodejs/unicodejs.textstring.js
@ -0,0 +1,38 @@
+/*!
+ * UnicodeJS TextString class.
+ *
+ * @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
+ * @license The MIT License (MIT); see LICENSE.txt
+ */
+
+/**
+ * TextString
+ *
+ * This class provides a simple interface to fetching plain text
+ * from a data source. The base class reads data from a string, but
+ * an extended class could provide access to a more complex structure,
+ * e.g. an array or an HTML document tree.
+ *
+ * @class unicodeJS.TextString
+ * @constructor
+ * @param {string} text Text
+ */
+unicodeJS.TextString = function UnicodeJSTextString( text ) {
+	this.text = text;
+};
+
+/* Methods */
+
+/**
+ * Read character at specified position
+ *
+ * @method
+ * @param {number} position Position to read from
+ * @returns {string|null} Character, or null if out of bounds
+ */
+unicodeJS.TextString.prototype.read = function ( position ) {
+	if ( position < 0 || position >= this.text.length ) {
+		return null;
+	}
+	return this.text.charAt( position );
+};
--- a/modules/unicodejs/unicodejs.wordbreak.groups.js
+++ b/modules/unicodejs/unicodejs.wordbreak.groups.js
@ -0,0 +1,15 @@
+/*jshint quotmark:double */
+unicodeJS.groups = {
+	"CR": "\\u000D",
+	"LF": "\\u000A",
+	"Newline": "\\u000B-\\u000C\\u0085\\u2028\\u2029",
+	"Extend": "\\u0300-\\u036F\\u0483-\\u0487\\u0488-\\u0489\\u0591-\\u05BD\\u05BF\\u05C1-\\u05C2\\u05C4-\\u05C5\\u05C7\\u0610-\\u061A\\u064B-\\u065F\\u0670\\u06D6-\\u06DC\\u06DF-\\u06E4\\u06E7-\\u06E8\\u06EA-\\u06ED\\u0711\\u0730-\\u074A\\u07A6-\\u07B0\\u07EB-\\u07F3\\u0816-\\u0819\\u081B-\\u0823\\u0825-\\u0827\\u0829-\\u082D\\u0859-\\u085B\\u08E4-\\u08FE\\u0900-\\u0902\\u0903\\u093A\\u093B\\u093C\\u093E-\\u0940\\u0941-\\u0948\\u0949-\\u094C\\u094D\\u094E-\\u094F\\u0951-\\u0957\\u0962-\\u0963\\u0981\\u0982-\\u0983\\u09BC\\u09BE-\\u09C0\\u09C1-\\u09C4\\u09C7-\\u09C8\\u09CB-\\u09CC\\u09CD\\u09D7\\u09E2-\\u09E3\\u0A01-\\u0A02\\u0A03\\u0A3C\\u0A3E-\\u0A40\\u0A41-\\u0A42\\u0A47-\\u0A48\\u0A4B-\\u0A4D\\u0A51\\u0A70-\\u0A71\\u0A75\\u0A81-\\u0A82\\u0A83\\u0ABC\\u0ABE-\\u0AC0\\u0AC1-\\u0AC5\\u0AC7-\\u0AC8\\u0AC9\\u0ACB-\\u0ACC\\u0ACD\\u0AE2-\\u0AE3\\u0B01\\u0B02-\\u0B03\\u0B3C\\u0B3E\\u0B3F\\u0B40\\u0B41-\\u0B44\\u0B47-\\u0B48\\u0B4B-\\u0B4C\\u0B4D\\u0B56\\u0B57\\u0B62-\\u0B63\\u0B82\\u0BBE-\\u0BBF\\u0BC0\\u0BC1-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCC\\u0BCD\\u0BD7\\u0C01-\\u0C03\\u0C3E-\\u0C40\\u0C41-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55-\\u0C56\\u0C62-\\u0C63\\u0C82-\\u0C83\\u0CBC\\u0CBE\\u0CBF\\u0CC0-\\u0CC4\\u0CC6\\u0CC7-\\u0CC8\\u0CCA-\\u0CCB\\u0CCC-\\u0CCD\\u0CD5-\\u0CD6\\u0CE2-\\u0CE3\\u0D02-\\u0D03\\u0D3E-\\u0D40\\u0D41-\\u0D44\\u0D46-\\u0D48\\u0D4A-\\u0D4C\\u0D4D\\u0D57\\u0D62-\\u0D63\\u0D82-\\u0D83\\u0DCA\\u0DCF-\\u0DD1\\u0DD2-\\u0DD4\\u0DD6\\u0DD8-\\u0DDF\\u0DF2-\\u0DF3\\u0E31\\u0E34-\\u0E3A\\u0E47-\\u0E4E\\u0EB1\\u0EB4-\\u0EB9\\u0EBB-\\u0EBC\\u0EC8-\\u0ECD\\u0F18-\\u0F19\\u0F35\\u0F37\\u0F39\\u0F3E-\\u0F3F\\u0F71-\\u0F7E\\u0F7F\\u0F80-\\u0F84\\u0F86-\\u0F87\\u0F8D-\\u0F97\\u0F99-\\u0FBC\\u0FC6\\u102B-\\u102C\\u102D-\\u1030\\u1031\\u1032-\\u1037\\u1038\\u1039-\\u103A\\u103B-\\u103C\\u103D-\\u103E\\u1056-\\u1057\\u1058-\\u1059\\u105E-\\u1060\\u1062-\\u1064\\u1067-\\u106D\\u1071-\\u1074\\u1082\\u1083-\\u1084\\u1085-\\u1086\\u1087-\\u108C\\u108D\\u108F\\u109A-\\u
109C\\u109D\\u135D-\\u135F\\u1712-\\u1714\\u1732-\\u1734\\u1752-\\u1753\\u1772-\\u1773\\u17B4-\\u17B5\\u17B6\\u17B7-\\u17BD\\u17BE-\\u17C5\\u17C6\\u17C7-\\u17C8\\u17C9-\\u17D3\\u17DD\\u180B-\\u180D\\u18A9\\u1920-\\u1922\\u1923-\\u1926\\u1927-\\u1928\\u1929-\\u192B\\u1930-\\u1931\\u1932\\u1933-\\u1938\\u1939-\\u193B\\u19B0-\\u19C0\\u19C8-\\u19C9\\u1A17-\\u1A18\\u1A19-\\u1A1B\\u1A55\\u1A56\\u1A57\\u1A58-\\u1A5E\\u1A60\\u1A61\\u1A62\\u1A63-\\u1A64\\u1A65-\\u1A6C\\u1A6D-\\u1A72\\u1A73-\\u1A7C\\u1A7F\\u1B00-\\u1B03\\u1B04\\u1B34\\u1B35\\u1B36-\\u1B3A\\u1B3B\\u1B3C\\u1B3D-\\u1B41\\u1B42\\u1B43-\\u1B44\\u1B6B-\\u1B73\\u1B80-\\u1B81\\u1B82\\u1BA1\\u1BA2-\\u1BA5\\u1BA6-\\u1BA7\\u1BA8-\\u1BA9\\u1BAA\\u1BAB\\u1BAC-\\u1BAD\\u1BE6\\u1BE7\\u1BE8-\\u1BE9\\u1BEA-\\u1BEC\\u1BED\\u1BEE\\u1BEF-\\u1BF1\\u1BF2-\\u1BF3\\u1C24-\\u1C2B\\u1C2C-\\u1C33\\u1C34-\\u1C35\\u1C36-\\u1C37\\u1CD0-\\u1CD2\\u1CD4-\\u1CE0\\u1CE1\\u1CE2-\\u1CE8\\u1CED\\u1CF2-\\u1CF3\\u1CF4\\u1DC0-\\u1DE6\\u1DFC-\\u1DFF\\u200C-\\u200D\\u20D0-\\u20DC\\u20DD-\\u20E0\\u20E1\\u20E2-\\u20E4\\u20E5-\\u20F0\\u2CEF-\\u2CF1\\u2D7F\\u2DE0-\\u2DFF\\u302A-\\u302D\\u302E-\\u302F\\u3099-\\u309A\\uA66F\\uA670-\\uA672\\uA674-\\uA67D\\uA69F\\uA6F0-\\uA6F1\\uA802\\uA806\\uA80B\\uA823-\\uA824\\uA825-\\uA826\\uA827\\uA880-\\uA881\\uA8B4-\\uA8C3\\uA8C4\\uA8E0-\\uA8F1\\uA926-\\uA92D\\uA947-\\uA951\\uA952-\\uA953\\uA980-\\uA982\\uA983\\uA9B3\\uA9B4-\\uA9B5\\uA9B6-\\uA9B9\\uA9BA-\\uA9BB\\uA9BC\\uA9BD-\\uA9C0\\uAA29-\\uAA2E\\uAA2F-\\uAA30\\uAA31-\\uAA32\\uAA33-\\uAA34\\uAA35-\\uAA36\\uAA43\\uAA4C\\uAA4D\\uAA7B\\uAAB0\\uAAB2-\\uAAB4\\uAAB7-\\uAAB8\\uAABE-\\uAABF\\uAAC1\\uAAEB\\uAAEC-\\uAAED\\uAAEE-\\uAAEF\\uAAF5\\uAAF6\\uABE3-\\uABE4\\uABE5\\uABE6-\\uABE7\\uABE8\\uABE9-\\uABEA\\uABEC\\uABED\\uFB1E\\uFE00-\\uFE0F\\uFE20-\\uFE26\\uFF9E-\\uFF9F",
+	"Format": "\\u00AD\\u0600-\\u0604\\u06DD\\u070F\\u200E-\\u200F\\u202A-\\u202E\\u2060-\\u2064\\u206A-\\u206F\\uFEFF\\uFFF9-\\uFFFB",
+	"Katakana": "\\u3031-\\u3035\\u309B-\\u309C\\u30A0\\u30A1-\\u30FA\\u30FC-\\u30FE\\u30FF\\u31F0-\\u31FF\\u32D0-\\u32FE\\u3300-\\u3357\\uFF66-\\uFF6F\\uFF70\\uFF71-\\uFF9D",
+	"ALetter": "\\u0041-\\u005A\\u0061-\\u007A\\u00AA\\u00B5\\u00BA\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u01BA\\u01BB\\u01BC-\\u01BF\\u01C0-\\u01C3\\u01C4-\\u0293\\u0294\\u0295-\\u02AF\\u02B0-\\u02C1\\u02C6-\\u02D1\\u02E0-\\u02E4\\u02EC\\u02EE\\u0370-\\u0373\\u0374\\u0376-\\u0377\\u037A\\u037B-\\u037D\\u0386\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03F5\\u03F7-\\u0481\\u048A-\\u0527\\u0531-\\u0556\\u0559\\u0561-\\u0587\\u05D0-\\u05EA\\u05F0-\\u05F2\\u05F3\\u0620-\\u063F\\u0640\\u0641-\\u064A\\u066E-\\u066F\\u0671-\\u06D3\\u06D5\\u06E5-\\u06E6\\u06EE-\\u06EF\\u06FA-\\u06FC\\u06FF\\u0710\\u0712-\\u072F\\u074D-\\u07A5\\u07B1\\u07CA-\\u07EA\\u07F4-\\u07F5\\u07FA\\u0800-\\u0815\\u081A\\u0824\\u0828\\u0840-\\u0858\\u08A0\\u08A2-\\u08AC\\u0904-\\u0939\\u093D\\u0950\\u0958-\\u0961\\u0971\\u0972-\\u0977\\u0979-\\u097F\\u0985-\\u098C\\u098F-\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09BD\\u09CE\\u09DC-\\u09DD\\u09DF-\\u09E1\\u09F0-\\u09F1\\u0A05-\\u0A0A\\u0A0F-\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32-\\u0A33\\u0A35-\\u0A36\\u0A38-\\u0A39\\u0A59-\\u0A5C\\u0A5E\\u0A72-\\u0A74\\u0A85-\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2-\\u0AB3\\u0AB5-\\u0AB9\\u0ABD\\u0AD0\\u0AE0-\\u0AE1\\u0B05-\\u0B0C\\u0B0F-\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32-\\u0B33\\u0B35-\\u0B39\\u0B3D\\u0B5C-\\u0B5D\\u0B5F-\\u0B61\\u0B71\\u0B83\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99-\\u0B9A\\u0B9C\\u0B9E-\\u0B9F\\u0BA3-\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB9\\u0BD0\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C3D\\u0C58-\\u0C59\\u0C60-\\u0C61\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CBD\\u0CDE\\u0CE0-\\u0CE1\\u0CF1-\\u0CF2\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D3A\\u0D3D\\u0D4E\\u0D60-\\u0D61\\u0D7A-\\u0D7F\\u0D85-\\u0D96\\u0D9A-\\u0DB1\\u0DB3-\\u0DBB\\u0DBD\\u0DC0-\\u0DC6\\u0F00\\u0F40-\\u0F47\\u0F49-\\u0F6C\\u0F88-\\u0F8C\\u10A0-\\u10C5\\u10C7\\u10CD\\u10D0-\\u10FA\\u10FC\\u10FD
-\\u1248\\u124A-\\u124D\\u1250-\\u1256\\u1258\\u125A-\\u125D\\u1260-\\u1288\\u128A-\\u128D\\u1290-\\u12B0\\u12B2-\\u12B5\\u12B8-\\u12BE\\u12C0\\u12C2-\\u12C5\\u12C8-\\u12D6\\u12D8-\\u1310\\u1312-\\u1315\\u1318-\\u135A\\u1380-\\u138F\\u13A0-\\u13F4\\u1401-\\u166C\\u166F-\\u167F\\u1681-\\u169A\\u16A0-\\u16EA\\u16EE-\\u16F0\\u1700-\\u170C\\u170E-\\u1711\\u1720-\\u1731\\u1740-\\u1751\\u1760-\\u176C\\u176E-\\u1770\\u1820-\\u1842\\u1843\\u1844-\\u1877\\u1880-\\u18A8\\u18AA\\u18B0-\\u18F5\\u1900-\\u191C\\u1A00-\\u1A16\\u1B05-\\u1B33\\u1B45-\\u1B4B\\u1B83-\\u1BA0\\u1BAE-\\u1BAF\\u1BBA-\\u1BE5\\u1C00-\\u1C23\\u1C4D-\\u1C4F\\u1C5A-\\u1C77\\u1C78-\\u1C7D\\u1CE9-\\u1CEC\\u1CEE-\\u1CF1\\u1CF5-\\u1CF6\\u1D00-\\u1D2B\\u1D2C-\\u1D6A\\u1D6B-\\u1D77\\u1D78\\u1D79-\\u1D9A\\u1D9B-\\u1DBF\\u1E00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC\\u1FBE\\u1FC2-\\u1FC4\\u1FC6-\\u1FCC\\u1FD0-\\u1FD3\\u1FD6-\\u1FDB\\u1FE0-\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FFC\\u2071\\u207F\\u2090-\\u209C\\u2102\\u2107\\u210A-\\u2113\\u2115\\u2119-\\u211D\\u2124\\u2126\\u2128\\u212A-\\u212D\\u212F-\\u2134\\u2135-\\u2138\\u2139\\u213C-\\u213F\\u2145-\\u2149\\u214E\\u2160-\\u2182\\u2183-\\u2184\\u2185-\\u2188\\u24B6-\\u24E9\\u2C00-\\u2C2E\\u2C30-\\u2C5E\\u2C60-\\u2C7B\\u2C7C-\\u2C7D\\u2C7E-\\u2CE4\\u2CEB-\\u2CEE\\u2CF2-\\u2CF3\\u2D00-\\u2D25\\u2D27\\u2D2D\\u2D30-\\u2D67\\u2D6F\\u2D80-\\u2D96\\u2DA0-\\u2DA6\\u2DA8-\\u2DAE\\u2DB0-\\u2DB6\\u2DB8-\\u2DBE\\u2DC0-\\u2DC6\\u2DC8-\\u2DCE\\u2DD0-\\u2DD6\\u2DD8-\\u2DDE\\u2E2F\\u3005\\u303B\\u303C\\u3105-\\u312D\\u3131-\\u318E\\u31A0-\\u31BA\\uA000-\\uA014\\uA015\\uA016-\\uA48C\\uA4D0-\\uA4F7\\uA4F8-\\uA4FD\\uA500-\\uA60B\\uA60C\\uA610-\\uA61F\\uA62A-\\uA62B\\uA640-\\uA66D\\uA66E\\uA67F\\uA680-\\uA697\\uA6A0-\\uA6E5\\uA6E6-\\uA6EF\\uA717-\\uA71F\\uA722-\\uA76F\\uA770\\uA771-\\uA787\\uA788\\uA78B-\\uA78E\\uA790-\\uA793\\uA7A0-\\uA7AA\\uA7F8-\\uA7F9\\uA7FA\\uA7FB-\\uA801\\uA803-\\uA805\\uA80
7-\\uA80A\\uA80C-\\uA822\\uA840-\\uA873\\uA882-\\uA8B3\\uA8F2-\\uA8F7\\uA8FB\\uA90A-\\uA925\\uA930-\\uA946\\uA960-\\uA97C\\uA984-\\uA9B2\\uA9CF\\uAA00-\\uAA28\\uAA40-\\uAA42\\uAA44-\\uAA4B\\uAAE0-\\uAAEA\\uAAF2\\uAAF3-\\uAAF4\\uAB01-\\uAB06\\uAB09-\\uAB0E\\uAB11-\\uAB16\\uAB20-\\uAB26\\uAB28-\\uAB2E\\uABC0-\\uABE2\\uAC00-\\uD7A3\\uD7B0-\\uD7C6\\uD7CB-\\uD7FB\\uFB00-\\uFB06\\uFB13-\\uFB17\\uFB1D\\uFB1F-\\uFB28\\uFB2A-\\uFB36\\uFB38-\\uFB3C\\uFB3E\\uFB40-\\uFB41\\uFB43-\\uFB44\\uFB46-\\uFBB1\\uFBD3-\\uFD3D\\uFD50-\\uFD8F\\uFD92-\\uFDC7\\uFDF0-\\uFDFB\\uFE70-\\uFE74\\uFE76-\\uFEFC\\uFF21-\\uFF3A\\uFF41-\\uFF5A\\uFFA0-\\uFFBE\\uFFC2-\\uFFC7\\uFFCA-\\uFFCF\\uFFD2-\\uFFD7\\uFFDA-\\uFFDC",
+	"MidLetter": "\\u003A\\u00B7\\u0387\\u05F4\\u2027\\uFE13\\uFE55\\uFF1A",
+	"MidNum": "\\u002C\\u003B\\u037E\\u0589\\u060C-\\u060D\\u066C\\u07F8\\u2044\\uFE10\\uFE14\\uFE50\\uFE54\\uFF0C\\uFF1B",
+	"MidNumLet": "\\u0027\\u002E\\u2018\\u2019\\u2024\\uFE52\\uFF07\\uFF0E",
+	"Numeric": "\\u0030-\\u0039\\u0660-\\u0669\\u066B\\u06F0-\\u06F9\\u07C0-\\u07C9\\u0966-\\u096F\\u09E6-\\u09EF\\u0A66-\\u0A6F\\u0AE6-\\u0AEF\\u0B66-\\u0B6F\\u0BE6-\\u0BEF\\u0C66-\\u0C6F\\u0CE6-\\u0CEF\\u0D66-\\u0D6F\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F29\\u1040-\\u1049\\u1090-\\u1099\\u17E0-\\u17E9\\u1810-\\u1819\\u1946-\\u194F\\u19D0-\\u19D9\\u1A80-\\u1A89\\u1A90-\\u1A99\\u1B50-\\u1B59\\u1BB0-\\u1BB9\\u1C40-\\u1C49\\u1C50-\\u1C59\\uA620-\\uA629\\uA8D0-\\uA8D9\\uA900-\\uA909\\uA9D0-\\uA9D9\\uAA50-\\uAA59\\uABF0-\\uABF9",
+	"ExtendNumLet": "\\u005F\\u203F-\\u2040\\u2054\\uFE33-\\uFE34\\uFE4D-\\uFE4F\\uFF3F"
+};
--- a/modules/unicodejs/unicodejs.wordbreak.js
+++ b/modules/unicodejs/unicodejs.wordbreak.js
@ -0,0 +1,160 @@
+/*!
+ * Wordbreak module
+ *
+ * Implementation of Unicode's Default Word Boundaries
+ * http://www.unicode.org/reports/tr29/#Default_Word_Boundaries
+ *
+ * @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
+ * @license The MIT License (MIT); see LICENSE.txt
+ */
+( function () {
+	var group,
+		groups = unicodeJS.groups,
+		/**
+		 * @class unicodeJS.wordbreak
+		 * @singleton
+		 */
+		wordbreak = unicodeJS.wordbreak = {},
+		patterns = {};
+
+	// build regexes
+	for ( group in groups ) {
+		patterns[group] = new RegExp( '[' + groups[group] + ']' );
+	}
+
+	function getGroup( chr ) {
+		var group;
+		for ( group in patterns ) {
+			if ( patterns[group].test( chr ) ) {
+				return group;
+			}
+		}
+		return null;
+	}
+
+
+	/**
+	 * Evaluates if the specified position within some text is a word boundary.
+	 * @param {string} text Text
+	 * @param {number} pos Character position
+	 * @returns {boolean} Is the position a word boundary
+	 */
+	wordbreak.isBreakInText = function ( text, pos ) {
+		return unicodeJS.wordbreak.isBreakInTextString( new unicodeJS.TextString( text ), pos );
+	};
+
+	/**
+	 * Evaluates if the sepcified position within some text is a word boundary.
+	 * @param {unicodeJS.TextString} string Text string
+	 * @param {number} pos Character position
+	 * @returns {boolean} Is the position a word boundary
+	 */
+	wordbreak.isBreakInTextString = function ( string, pos ) {
+		// Break at the start and end of text.
+		// WB1: sot ÷
+		// WB2: ÷ eot
+		if ( string.read( pos - 1 ) === null || string.read( pos ) === null ) {
+			return true;
+		}
+
+		// get some context
+		var lft = [], rgt = [], l = 0, r = 0;
+		rgt.push( getGroup( string.read( pos + r  ) ) );
+		lft.push( getGroup( string.read( pos - l - 1 ) ) );
+
+		switch ( true ) {
+			// Do not break within CRLF.
+			// WB3: CR × LF
+			case lft[0] === 'CR' && rgt[0] === 'LF':
+				return false;
+
+			// Otherwise break before and after Newlines (including CR and LF)
+			// WB3a: (Newline | CR | LF) ÷
+			case lft[0] === 'Newline' || lft[0] === 'CR' || lft[0] === 'LF':
+			// WB3b: ÷ (Newline | CR | LF)
+			case rgt[0] === 'Newline' || rgt[0] === 'CR' || rgt[0] === 'LF':
+				return true;
+		}
+
+		// Ignore Format and Extend characters, except when they appear at the beginning of a region of text.
+		// WB4: X (Extend | Format)* → X
+		if ( rgt[0] === 'Extend' || rgt[0] === 'Format' ) {
+			// The Extend|Format character is to the right, so it is attached
+			// to a character to the left, don't split here
+			return false;
+		}
+		// We've reached the end of an Extend|Format sequence, collapse it
+		while ( lft[0] === 'Extend' || lft[0] === 'Format' ) {
+			l++;
+			if ( pos - l - 1 <= 0) {
+				// start of document
+				return true;
+			}
+			lft[lft.length - 1] = getGroup( string.read( pos - l - 1 ) );
+		}
+
+
+		// Do not break between most letters.
+		// WB5: ALetter × ALetter
+		if ( lft[0] === 'ALetter' && rgt[0] === 'ALetter' ) {
+			return false;
+		}
+
+		// some tests beyond this point require more context
+		l++;
+		r++;
+		rgt.push( getGroup( string.read( pos + r ) ) );
+		lft.push( getGroup( string.read( pos - l - 1 ) ) );
+
+		switch ( true ) {
+			// Do not break letters across certain punctuation.
+			// WB6: ALetter × (MidLetter | MidNumLet) ALetter
+			case lft[0] === 'ALetter' && rgt[1] === 'ALetter' &&
+				( rgt[0] === 'MidLetter' || rgt[0] === 'MidNumLet' ):
+			// WB7: ALetter (MidLetter | MidNumLet) × ALetter
+			case rgt[0] === 'ALetter' && lft[1] === 'ALetter' &&
+				( lft[0] === 'MidLetter' || lft[0] === 'MidNumLet' ):
+				return false;
+
+			// Do not break within sequences of digits, or digits adjacent to letters (“3a”, or “A3”).
+			// WB8: Numeric × Numeric
+			case lft[0] === 'Numeric' && rgt[0] === 'Numeric':
+			// WB9: ALetter × Numeric
+			case lft[0] === 'ALetter' && rgt[0] === 'Numeric':
+			// WB10: Numeric × ALetter
+			case lft[0] === 'Numeric' && rgt[0] === 'ALetter':
+				return false;
+
+			// Do not break within sequences, such as “3.2” or “3,456.789”.
+			// WB11: Numeric (MidNum | MidNumLet) × Numeric
+			case rgt[0] === 'Numeric' && lft[1] === 'Numeric' &&
+				( lft[0] === 'MidNum' || lft[0] === 'MidNumLet' ):
+			// WB12: Numeric × (MidNum | MidNumLet) Numeric
+			case lft[0] === 'Numeric' && rgt[1] === 'Numeric' &&
+				( rgt[0] === 'MidNum' || rgt[0] === 'MidNumLet' ):
+				return false;
+
+			// Do not break between Katakana.
+			// WB13: Katakana × Katakana
+			case lft[0] === 'Katakana' && rgt[0] === 'Katakana':
+				return false;
+
+			// Do not break from extenders.
+			// WB13a: (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
+			case rgt[0] === 'ExtendNumLet' &&
+				( lft[0] === 'ALetter' || lft[0] === 'Numeric' || lft[0] === 'Katakana' || lft[0] === 'ExtendNumLet' ):
+			// WB13b: ExtendNumLet × (ALetter | Numeric | Katakana)
+			case lft[0] === 'ExtendNumLet' &&
+				( rgt[0] === 'ALetter' || rgt[0] === 'Numeric' || rgt[0] === 'Katakana' ):
+				return false;
+
+			// Do not break between regional indicator symbols.
+			// WB13c: Regional_Indicator × Regional_Indicator
+			case lft[0] === 'Regional_Indicator' && rgt[0] === 'Regional_Indicator':
+				return false;
+		}
+		// Otherwise, break everywhere (including around ideographs).
+		// WB14: Any ÷ Any
+		return true;
+	};
+}() );
--- a/modules/unicodejs/unicodejs.wordbreak.test.js
+++ b/modules/unicodejs/unicodejs.wordbreak.test.js
@ -0,0 +1,51 @@
+/*!
+ * Wordbreak module tests
+ *
+ * @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
+ * @license The MIT License (MIT); see LICENSE.txt
+ */
+
+QUnit.module( 'unicodeJS.wordbreak' );
+
+// Checks every position of a sample text (0 up to and including text.length)
+// against a hand-written list of the positions expected to be word breaks.
+QUnit.test( 'isBreakInText', function ( assert ) {
+	var i, result, context,
+		// Sample text, built in chunks; the comment above each chunk gives the
+		// range of positions it occupies (the last chunk is shorter than the rest)
+		text =
+			/*jshint quotmark:double */
+			// 0 - 10
+			"\u0300xyz'd a' " +
+			// 10 - 20
+			"'a a-b 1a\r" +
+			// 20 - 30
+			"\nカタカナ3,1.2" +
+			// 30 - 40
+			" a_b_3_ナ_ " +
+			// 40 - 50
+			"汉字/漢字 c\u0300\u0327k" +
+			// 50 - 60
+			" c\u0300\u0327",
+			/*jshint quotmark:single */
+		// Positions expected to be breaks, one row per chunk of the text above;
+		// every position not listed is expected not to be a break
+		breaks = [
+			0, 1, 6, 7, 8, 9, 10,
+			11, 12, 13, 14, 15, 16, 17, 19,
+			21, 25, 30,
+			31, 39, 40,
+			41, 42, 43, 44, 45, 46, 50,
+			51, 54
+		];
+
+	// One assertion per position, including the position after the last character
+	QUnit.expect( text.length + 1 );
+
+	for ( i = 0; i <= text.length; i++ ) {
+		result = ( breaks.indexOf( i ) !== -1 );
+		// Up to four characters either side of the position, for the assertion message
+		context =
+			text.substring( Math.max( i - 4, 0 ), i ) +
+			'│' +
+			text.substring( i, Math.min ( i + 4, text.length ) )
+		;
+		assert.equal(
+			unicodeJS.wordbreak.isBreakInText( text, i ),
+			result,
+			'Position ' + i + ' is ' + ( result ? '' : 'not ' ) + 'a break: ' + context
+		);
+	}
+});
--- a/modules/ve/dm/ve.dm.DataString.js
+++ b/modules/ve/dm/ve.dm.DataString.js
@ -0,0 +1,36 @@
+/*!
+ * VisualEditor DataString class.
+ *
+ * @copyright 2011-2013 VisualEditor Team and others; see AUTHORS.txt
+ * @license The MIT License (MIT); see LICENSE.txt
+ */
+
+/**
+ * Wrapper class to read document data as a plain text string.
+ * @class
+ * @extends unicodeJS.TextString
+ * @constructor
+ * @param {Array} data Document data
+ */
+ve.dm.DataString = function VeDmDataString( data ) {
+	this.data = data;
+};
+
+/* Inheritance */
+
+ve.inheritClass( ve.dm.DataString, unicodeJS.TextString );
+
+/**
+ * Reads the character from the specified position in the data.
+ * @param {number} position Position in data to read from
+ * @returns {string|null} Character at position, or null if not text
+ */
+ve.dm.DataString.prototype.read = function( position ) {
+	var dataAt = this.data[position];
+	// check data is present at position and is not an element
+	if ( dataAt !== undefined && dataAt.type === undefined ) {
+		return typeof dataAt === 'string' ? dataAt : dataAt[0];
+	} else {
+		return null;
+	}
+};
--- a/modules/ve/dm/ve.dm.Document.js
+++ b/modules/ve/dm/ve.dm.Document.js
@ -1033,43 +1033,57 @@ ve.dm.Document.prototype.getNearestStructuralOffset = function ( offset, directi
 };

 /**
- * Get the nearest word boundary.
+ * Get the nearest word boundaries as a range.
 *
- * The offset will first be moved to the nearest content offset if it's not at one already. If a
- * direction was given, the boundary will be found in that direction, otherwise both directions will
- * be calculated and the one with the lowest distance from offset will be returned. Elements are
- * always word boundaries. For more information about what is considered to be a word character,
- * see {ve.dm.SurfaceFragment.wordPattern}.
+ * The offset will first be moved to the nearest content offset if it's not at one already.
+ * Elements are always word boundaries.
 *
 * @method
 * @param {number} offset Offset to start from
- * @param {number} [direction] Direction to prefer matching offset in, -1 for left and 1 for right
- * @returns {number} Nearest word boundary
+ * @returns {ve.Range} Range around nearest word boundaries
 */
-ve.dm.Document.prototype.getNearestWordBoundary = function ( offset, direction ) {
-	var left, right, i, inc,
-		pattern = ve.dm.SurfaceFragment.static.wordBoundaryPattern,
-		data = this.data;
+ve.dm.Document.prototype.getNearestWordRange = function ( offset ) {
+	var offsetLeft, offsetRight, i,
+		dataString = new ve.dm.DataString( this.data );
+
 	offset = this.getNearestContentOffset( offset );
-	if ( !direction ) {
-		left = this.getNearestWordBoundary( offset, -1 );
-		right = this.getNearestWordBoundary( offset, +1 );
-		return offset - left < right - offset ? left : right;
-	} else {
-		inc = direction > 0 ? 1 : -1;
-		i = offset + ( inc > 0 ? 0 : -1 );
-		do {
-			if ( data[i].type === undefined ) {
-				// Plain text extraction
-				if ( pattern.test( typeof data[i] === 'string' ? data[i] : data[i][0] ) ) {
-					break;
-				}
+
+	// If the cursor offset is a break (i.e. the start/end of word) we should
+	// check one position either side to see if there is a non-break
+	// and if so, move the offset accordingly
+	if( unicodeJS.wordbreak.isBreakInTextString( dataString, offset ) ) {
+		if ( !unicodeJS.wordbreak.isBreakInTextString( dataString, offset + 1 ) ) {
+			offset++;
+		} else if( !unicodeJS.wordbreak.isBreakInTextString( dataString, offset - 1 ) ) {
+			offset--;
+		} else {
+			// just return one character to the right, unless we are at the end
+			// of the text, in which case the character to the left
+			if( dataString.read( offset ) !== null ) {
+				return new ve.Range( offset, offset + 1 );
 			} else {
-				break;
+				return new ve.Range( offset - 1, offset );
 			}
-		} while ( data[i += inc] );
-		return i + ( inc > 0 ? 0 : 1 );
+		}
 	}
+
+	i = offset;
+	// Search left and right for next break points
+	while( dataString.read( i++ ) !== null ) {
+		offsetRight = i;
+		if( unicodeJS.wordbreak.isBreakInTextString( dataString, i ) ) {
+			break;
+		}
+	}
+	i = offset;
+	while( dataString.read( i-- ) !== null ) {
+		offsetLeft = i;
+		if( unicodeJS.wordbreak.isBreakInTextString( dataString, i ) ) {
+			break;
+		}
+	}
+
+	return new ve.Range( offsetLeft, offsetRight );
 };

 /**
--- a/modules/ve/dm/ve.dm.SurfaceFragment.js
+++ b/modules/ve/dm/ve.dm.SurfaceFragment.js
@ -216,7 +216,7 @@ ve.dm.SurfaceFragment.prototype.trimRange = function () {
 *
 * @method
 * @param {string} [scope='parent'] Method of expansion:
- *  - `word`: Expands to cover the nearest word by looking for word boundary characters
+ *  - `word`: Expands to cover the nearest word by looking for word breaks (see UnicodeJS.wordbreak)
 *  - `annotation`: Expands to cover a given annotation (argument) within the current range
 *  - `root`: Expands to cover the entire document
 *  - `siblings`: Expands to cover all sibling nodes
@ -233,10 +233,18 @@ ve.dm.SurfaceFragment.prototype.expandRange = function ( scope, type ) {
 	var range, node, nodes, parent;
 	switch ( scope || 'parent' ) {
 		case 'word':
-			range = new ve.Range(
-				this.document.getNearestWordBoundary( this.range.start, -1 ),
-				this.document.getNearestWordBoundary( this.range.end, 1 )
-			);
+			if( this.range.getLength() > 0 ) {
+				range = ve.Range.newCoveringRange( [
+					this.document.getNearestWordRange( this.range.start ),
+					this.document.getNearestWordRange( this.range.end )
+				] );
+				if( this.range.isBackwards() ) {
+					range = range.flip();
+				}
+			} else {
+				// optimisation for zero-length ranges
+				range = this.document.getNearestWordRange( this.range.start );
+			}
 			break;
 		case 'annotation':
 			range = this.document.getAnnotatedRangeFromSelection( this.range, type );
--- a/modules/ve/test/dm/ve.dm.Document.test.js
+++ b/modules/ve/test/dm/ve.dm.Document.test.js
@ -1332,8 +1332,8 @@ QUnit.test( 'getNearestStructuralOffset', function ( assert ) {
 	}
 } );

-QUnit.test( 'getNearestWordBoundary', function ( assert ) {
-	var i, doc, left, right, word,
+QUnit.test( 'getNearestWordRange', function ( assert ) {
+	var i, doc, range, word,
 		cases = [
 		{
 			'phrase': 'visual editor test',
@ -1341,23 +1341,59 @@ QUnit.test( 'getNearestWordBoundary', function ( assert ) {
 			'offset': 10,
 			'expected': 'editor'
 		},
+		{
+			'phrase': 'visual editor test',
+			'msg': 'cursor at start of word',
+			'offset': 7,
+			'expected': 'editor'
+		},
+		{
+			'phrase': 'visual editor test',
+			'msg': 'cursor at end of word',
+			'offset': 13,
+			'expected': 'editor'
+		},
+		{
+			'phrase': 'visual editor test',
+			'msg': 'cursor at start of text',
+			'offset': 0,
+			'expected': 'visual'
+		},
+		{
+			'phrase': 'visual editor test',
+			'msg': 'cursor at end of text',
+			'offset': 18,
+			'expected': 'test'
+		},
 		{
 			'phrase': 'Computer-aided design',
 			'msg': 'hyphenated Latin word',
-			'offset': 2,
-			'expected': 'Computer-aided'
+			'offset': 12,
+			'expected': 'aided'
 		},
 		{
 			'phrase': 'Water (l\'eau) is',
 			'msg': 'apostrophe and parentheses (Latin)',
 			'offset': 8,
-			'expected': '(l\'eau)'
+			'expected': 'l\'eau'
 		},
 		{
 			'phrase': 'Water (H2O) is',
 			'msg': 'number in word (Latin)',
 			'offset': 9,
-			'expected': '(H2O)'
+			'expected': 'H2O'
+		},
+		{
+			'phrase': 'The \'word\' is',
+			'msg': 'apostrophes as single quotes',
+			'offset': 7,
+			'expected': 'word'
+		},
+		{
+			'phrase': 'Some "double" quotes',
+			'msg': 'double quotes',
+			'offset': 8,
+			'expected': 'double'
 		},
 		{
 			'phrase': 'Wikipédia l\'encyclopédie libre',
@ -1365,6 +1401,12 @@ QUnit.test( 'getNearestWordBoundary', function ( assert ) {
 			'offset': 15,
 			'expected': 'l\'encyclopédie'
 		},
+		{
+			'phrase': 'Wikipédia l\'encyclopédie libre',
+			'msg': 'Extend characters (i.e. letter + accent)',
+			'offset': 15,
+			'expected': 'l\'encyclopédie'
+		},
 		{
 			'phrase': 'Википедия свободная энциклопедия',
 			'msg': 'Cyrillic word',
@ -1388,15 +1430,49 @@ QUnit.test( 'getNearestWordBoundary', function ( assert ) {
 			'msg': 'Eastern Arabic numerals',
 			'offset': 13,
 			'expected': '٠١٢٣٤٥٦٧٨٩'
+		},
+		{
+			'phrase': 'Latinカタカナwrapped',
+			'msg': 'Latin-wrapped Katakana word',
+			'offset': 7,
+			'expected': 'カタカナ'
+		},
+		{
+			'phrase': '维基百科',
+			'msg': 'Hanzi characters (cursor in middle)',
+			'offset': 2,
+			'expected': '百'
+		},
+		{
+			'phrase': '维基百科',
+			'msg': 'Hanzi characters (cursor at end)',
+			'offset': 4,
+			'expected': '科'
+		},
+		{
+			'phrase': 'Costs £1,234.00 each',
+			'msg': 'formatted number sequence',
+			'offset': 11,
+			'expected': '1,234.00'
+		},
+		{
+			'phrase': 'Reset index_of variable',
+			'msg': 'underscore-joined word',
+			'offset': 8,
+			'expected': 'index_of'
 		}
 	];
 	QUnit.expect( cases.length );
 	for ( i = 0; i < cases.length; i++ ) {
 		doc = new ve.dm.Document( cases[i].phrase.split('') );
-		left = doc.getNearestWordBoundary( cases[i].offset, -1 );
-		right = doc.getNearestWordBoundary( cases[i].offset, 1 );
-		word = cases[i].phrase.substring( left, right );
-		assert.strictEqual( word, cases[i].expected, cases[i].msg );
+		range = doc.getNearestWordRange( cases[i].offset );
+		word = cases[i].phrase.substring( range.start, range.end );
+		assert.strictEqual( word, cases[i].expected,
+			cases[i].msg + ': ' +
+			cases[i].phrase.substring( 0, cases[i].offset ) + '│' +
+			cases[i].phrase.substring( cases[i].offset, cases[i].phrase.length ) +
+			' → ' + cases[i].expected
+		);
 	}
 } );

--- a/modules/ve/test/dm/ve.dm.SurfaceFragment.test.js
+++ b/modules/ve/test/dm/ve.dm.SurfaceFragment.test.js
@ -59,7 +59,7 @@ QUnit.test( 'collapseRange', 3, function ( assert ) {
 	assert.deepEqual( collapsedFragment.getRange(), new ve.Range( 20, 20 ), 'new range is used' );
 } );

-QUnit.test( 'expandRange', 1, function ( assert ) {
+QUnit.test( 'expandRange (closest)', 1, function ( assert ) {
 	var doc = new ve.dm.Document( ve.copyArray( ve.dm.example.data ) ),
 		surface = new ve.dm.Surface( doc ),
 		fragment = new ve.dm.SurfaceFragment( surface, new ve.Range( 20, 21 ) );
@ -70,6 +70,40 @@ QUnit.test( 'expandRange', 1, function ( assert ) {
 	);
 } );

+// expandRange( 'word' ) must grow the selection to whole words and keep its direction.
+QUnit.test( 'expandRange (word)', function ( assert ) {
+	var i, doc, surface, fragment, range, word, cases = [
+		{
+			phrase: 'the quick brown fox',
+			range: new ve.Range( 6, 13 ), // starts inside 'quick', ends inside 'brown'
+			expected: 'quick brown',
+			msg: 'range starting and ending in latin words'
+		},
+		{
+			phrase: 'the quick brown fox',
+			range: new ve.Range( 18, 12 ), // first offset is greater than the second: backwards
+			expected: 'brown fox',
+			msg: 'backwards range starting and ending in latin words'
+		},
+		{
+			phrase: 'the quick brown fox',
+			range: new ve.Range( 7, 7 ), // collapsed selection inside 'quick'
+			expected: 'quick',
+			msg: 'zero-length range'
+		}
+	];
+	QUnit.expect( cases.length * 2 ); // two assertions per case: covered text and direction
+	for ( i = 0; i < cases.length; i++ ) {
+		doc = new ve.dm.Document( cases[i].phrase.split('') ); // one data item per character
+		surface = new ve.dm.Surface( doc );
+		fragment = new ve.dm.SurfaceFragment( surface, cases[i].range );
+		range = fragment.expandRange( 'word' ).getRange();
+		word = cases[i].phrase.substring( range.start, range.end );
+		assert.strictEqual( word, cases[i].expected, cases[i].msg + ': text' );
+		assert.strictEqual( range.isBackwards(), cases[i].range.isBackwards(), cases[i].msg + ': range direction' ); // actual first, expected second
+	}
+} );
+
 QUnit.test( 'removeContent', 2, function ( assert ) {
 	var doc = new ve.dm.Document( ve.copyArray( ve.dm.example.data ) ),
 		surface = new ve.dm.Surface( doc ),
--- a/modules/ve/test/index.php
+++ b/modules/ve/test/index.php
@ -13,6 +13,10 @@
 		<script src="../../jquery/jquery.js"></script>
 		<script src="../../rangy/rangy-core.js"></script>
 		<script src="../../rangy/rangy-position.js"></script>
+		<script src="../../unicodejs/unicodejs.js"></script>
+		<script src="../../unicodejs/unicodejs.textstring.js"></script>
+		<script src="../../unicodejs/unicodejs.wordbreak.groups.js"></script>
+		<script src="../../unicodejs/unicodejs.wordbreak.js"></script>
 		<!-- ext.visualEditor.base -->
 		<script src="../../ve/ve.js"></script>
 		<script src="../../ve/ve.EventEmitter.js"></script>
@ -70,6 +74,7 @@
 		<script src="../../ve/dm/ve.dm.Transaction.js"></script>
 		<script src="../../ve/dm/ve.dm.Surface.js"></script>
 		<script src="../../ve/dm/ve.dm.SurfaceFragment.js"></script>
+		<script src="../../ve/dm/ve.dm.DataString.js"></script>
 		<script src="../../ve/dm/ve.dm.Document.js"></script>
 		<script src="../../ve/dm/ve.dm.DocumentSlice.js"></script>
 		<script src="../../ve/dm/ve.dm.DocumentSynchronizer.js"></script>