mediawiki-extensions-Visual.../modules/unicodejs/tools/unicodejs.wordbreak.groups.php
Ed Sanders 4988efd35e UnicodeJS library to implement Unicode standards
Initially just with a Wordbreak module to implement Unicode standard
on 'Default Word Boundaries'. Due to its standalone nature this has
been written as a separate library. Non-BMP characters are currently
not supported.

Bug: 44085
Change-Id: Ieafa070076f4c36855684f6bc179667e28af2c25
2013-03-27 17:44:22 +00:00

46 lines
1.2 KiB
PHP

<?php
/**
 * Wordbreak character groups generator
 *
 * Downloads WordBreakProperty.txt from unicode.org, collects the BMP code
 * point ranges listed for each word-break property value, and writes them to
 * unicodejs.wordbreak.groups.js in the parent directory as the
 * `unicodeJS.groups` object. Each key is a property name; each value is the
 * concatenation of that property's "\uXXXX" / "\uXXXX-\uYYYY" ranges.
 *
 * Exits with status 1, leaving the existing output file untouched, when the
 * download fails or yields no groups, and also when the file cannot be
 * written.
 *
 * @file
 * @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
 * @license The MIT License (MIT); see LICENSE.txt
 */

echo 'Downloading break point ranges from unicode.org... ';
$data = file_get_contents( 'http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt' );
if ( $data === false ) {
	// Abort instead of overwriting the generated file with an empty result
	echo "failed\n";
	exit( 1 );
}
echo "done\n";

echo 'Generating regular expressions... ';
$groups = array();
foreach ( explode( "\n", $data ) as $line ) {
	// Trim first so CRLF line endings and whitespace-only lines count as blank
	$line = trim( $line );
	if ( $line === '' || $line[0] === '#' ) {
		continue;
	}
	// Column 0 is a code point or "start..end" range, column 1 the property
	// name; anything after a '#' is a trailing comment.
	$cols = preg_split( '/[;#]/', $line );
	if ( count( $cols ) < 2 ) {
		// Not a "range ; property" line, nothing to record
		continue;
	}
	// Ignoring non-BMP characters for the time being (5+ hex digits)
	if ( preg_match( '/[a-f0-9]{5}/i', $cols[0] ) ) {
		continue;
	}
	// "0041..005A" becomes "\u0041-\u005A"; a single code point becomes "\u0041"
	$range = '\u' . str_replace( '..', '-\u', trim( $cols[0] ) );
	$group = trim( $cols[1] );
	if ( !isset( $groups[$group] ) ) {
		$groups[$group] = '';
	}
	$groups[$group] .= $range;
}
if ( !$groups ) {
	// The download succeeded but contained no usable data; keep the old file
	echo "failed (no groups found)\n";
	exit( 1 );
}
echo "done\n";

echo 'Writing to unicodejs.wordbreak.groups.js... ';
$json = json_encode( $groups, JSON_PRETTY_PRINT );
// JSON_PRETTY_PRINT indents with four spaces per level. Convert only that
// leading indentation to tabs so the space after each ":" is left alone.
$json = preg_replace_callback(
	'/^(?: {4})+/m',
	function ( $match ) {
		return str_repeat( "\t", strlen( $match[0] ) / 4 );
	},
	$json
);
$written = file_put_contents(
	dirname( __DIR__ ) . '/unicodejs.wordbreak.groups.js',
	"/*jshint quotmark:double */\nunicodeJS.groups = " . $json . ";\n"
);
if ( $written === false ) {
	echo "failed\n";
	exit( 1 );
}
echo "done\n";