mediawiki-extensions-Visual.../modules/unicodejs/tools/unicodejs-properties.py
David Chan 6dacf615c0 Match non-BMP characters in wordbreak regexes
unicodejs.js:
* charRangeArrayRegexp to write surrogate-aware regexps
* private helper functions

unicodejs.wordbreak.test.js:
* test charRangeArrayRegexp
* corrected tests for non-BMP wordbreaks

unicodejs.wordbreak.js:
* use new surrogate-aware regexps

unicodejs.wordbreakproperties.js:
* generated from Unicode data

unicodejs.graphemebreakproperties.js:
* generated from Unicode data

unicodejs.wordbreak.groups.js:
* delete as no longer used

unicodejs-properties.py:
* generate unicodejs.wordbreakproperties.js from Unicode data
* generate unicodejs.graphemebreakproperties.js from Unicode data

index.php:
* update script tag links

/VisualEditor.php:
* update script tag links

/demos/ve/index.php:
* update script tag links

/maintenance/makeStaticLoader.php:
* update script tag links

Change-Id: I39c0386a85b0cf21d68d3385b84018a5d7648de5
2013-06-10 23:16:23 +01:00

52 lines
1.6 KiB
Python

#!/usr/bin/env python
"""Generates unicodejs.*properties.js from Unicode data"""
import re, urllib2
# Generate one unicodejs.<type>breakproperties.js file per break type by
# downloading the matching Unicode auxiliary data file and converting each
# property's code point ranges into a JavaScript object literal.
# NOTE: this script relies on the urllib2 module and so targets Python 2.
# NOTE: the output path is relative ("../..."), so it is resolved against the
# current working directory, not against this script's location.
for breaktype in ['Grapheme', 'Word']:
    # Property name strings like "Extend", "Format" etc, kept in order of
    # first appearance so the generated js lists them in data-file order
    properties = []
    # ranges[property] -> list of (start, end) pairs of hex digit strings;
    # end is None for a single code point,
    # e.g. [('0040', None), ('0060', '0070'), ('00A3', None), ...]
    ranges = {}

    # Analyse unicode data file
    url = "http://www.unicode.org/Public/UNIDATA/auxiliary/" + breaktype + "BreakProperty.txt"
    response = urllib2.urlopen( url )
    try:
        for line in response:
            line = line.strip()
            # Ignore comment or blank lines
            if re.search( r"^\s*(#|$)", line ):
                continue
            # Find things like one of the following:
            # XXXX          ; propertyname
            # XXXX..YYYY    ; propertyname
            # Code points are written with 4 to 6 hex digits (up to 10FFFF)
            m = re.search( r"^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s*;\s*(\w+)\s*#", line )
            if not m:
                raise ValueError( "Bad line: %r" % line )
            start, end, prop = m.groups()
            if start == 'D800' and end == 'DFFF':
                continue # raw surrogates are not treated
            if prop not in ranges:
                properties.append( prop )
            ranges.setdefault( prop, [] ).append( (start, end) )
    finally:
        # Release the connection even when a malformed line aborts parsing
        response.close()

    # Translate ranges into js fragments
    fragments = []
    for prop in properties:
        rangeStrings = []
        for start, end in ranges[prop]:
            if not end:
                # Single code point: emitted as a bare number
                rangeStrings.append( "0x" + start )
            else:
                # Range: emitted as a two-element [start, end] array
                rangeStrings.append( "[0x" + start + ", 0x" + end + "]" )
        fragments.append( "'" + prop + "': [" + ", ".join( rangeStrings ) + "]" )

    # Write js file
    js = "unicodeJS." + breaktype.lower() + "breakproperties = {\n\t"
    js += ",\n\t".join( fragments )
    js += "\n};\n"
    jsFilename = "../unicodejs." + breaktype.lower() + "breakproperties.js"
    # Use a context manager so the file is flushed and closed before we
    # report success
    with open( jsFilename, "w" ) as jsFile:
        jsFile.write( js )
    print( "wrote " + jsFilename )