mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-25 14:56:20 +00:00
4988efd35e
Initially just with a Wordbreak module to implement the Unicode standard on 'Default Word Boundaries'. Due to its standalone nature this has been written as a separate library. Non-BMP characters are currently not supported. Bug: 44085 Change-Id: Ieafa070076f4c36855684f6bc179667e28af2c25
46 lines
1.2 KiB
PHP
46 lines
1.2 KiB
PHP
<?php
|
|
/**
|
|
* Wordbreak character groups generator
|
|
*
|
|
* @file
|
|
* @copyright 2013 UnicodeJS team and others; see AUTHORS.txt
|
|
* @license The MIT License (MIT); see LICENSE.txt
|
|
*/
|
|
|
|
// Step 1: fetch the Word_Break property table published by the Unicode Consortium.
// HTTPS is used because the downloaded data ends up in generated source code.
echo 'Downloading break point ranges from unicode.org... ';
$data = file_get_contents( 'https://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt' );
if ( $data === false ) {
	// Abort here so that a failed download never overwrites the generated file.
	echo "failed\n";
	exit( 1 );
}
echo "done\n";

// Step 2: build one character-class body per Word_Break group. Each data line has the
// form "<code point or range> ; <group> # <comment>"; the ranges of a group are
// concatenated as \uXXXX or \uXXXX-\uYYYY escapes.
echo 'Generating regular expressions... ';
$groups = array();
foreach ( explode( "\n", $data ) as $line ) {
	// Trimming also drops a trailing "\r" if the file arrives with CRLF line endings
	$line = trim( $line );
	if ( $line === '' || substr( $line, 0, 1 ) === '#' ) {
		continue;
	}
	$cols = preg_split( '/[;#]/', $line );
	if ( count( $cols ) < 2 ) {
		// Not a "range ; group" data line
		continue;
	}
	// Ignoring non-BMP characters for the time being: a code point with five or
	// more hex digits cannot be written as a single \uXXXX escape
	if ( preg_match( '/[a-f0-9]{5}/i', $cols[0] ) ) {
		continue;
	}
	$range = '\u' . str_replace( '..', '-\u', trim( $cols[0] ) );
	$group = trim( $cols[1] );
	if ( !isset( $groups[$group] ) ) {
		$groups[$group] = '';
	}
	$groups[$group] .= $range;
}
if ( count( $groups ) === 0 ) {
	// The download did not contain any usable data (e.g. an error page was returned)
	echo "failed: no break point ranges found\n";
	exit( 1 );
}
echo "done\n";

// Step 3: write the groups out as a JavaScript assignment.
echo 'Writing to unicodejs.wordbreak.groups.js... ';
// JSON_PRETTY_PRINT indents with four spaces per level; convert only that leading
// indentation to tabs (one tab per level) and leave all other spaces untouched.
$json = preg_replace_callback(
	'/^(?: {4})+/m',
	function ( $match ) {
		return str_replace( '    ', "\t", $match[0] );
	},
	json_encode( $groups, JSON_PRETTY_PRINT )
);
$written = file_put_contents(
	dirname( __DIR__ ) . '/unicodejs.wordbreak.groups.js',
	"/*jshint quotmark:double */\nunicodeJS.groups = " . $json . ";\n"
);
if ( $written === false ) {
	echo "failed\n";
	exit( 1 );
}
echo "done\n";
|