mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/VisualEditor
synced 2024-11-06 06:29:11 +00:00
6dacf615c0
unicodejs.js: * charRangeArrayRegexp to write surrogate-aware regexps * private helper functions unicodejs.wordbreak.test.js: * test charRangeArrayRegexp * corrected tests for non-BMP wordbreaks unicodejs.wordbreak.js: * use new surrogate-aware regexps unicodejs.wordbreakproperties.js: * generated from Unicode data unicodejs.graphemebreakproperties.js: * generated from Unicode data unicodejs.wordbreak.groups.js: * delete as no longer used unicodejs-properties.py: * generate unicodejs.wordbreakproperties.js from Unicode data * generate unicodejs.graphemebreakproperties.js from Unicode data index.php: * update script tag links /VisualEditor.php: * update script tag links /demos/ve/index.php: * update script tag links /maintenance/makeStaticLoader.php: * update script tag links Change-Id: I39c0386a85b0cf21d68d3385b84018a5d7648de5
52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
#!/usr/bin/env python
|
|
|
|
"""Generates unicodejs.*properties.js from Unicode data"""
|
|
|
|
import re, urllib2
|
|
|
|
for breaktype in ['Grapheme', 'Word']:
|
|
# a list of property name strings like "Extend", "Format" etc
|
|
properties = []
|
|
|
|
# range[property] -> character range list e.g. [0x0040, [0x0060-0x0070], 0x00A3, ...]
|
|
ranges = {}
|
|
|
|
# Analyse unicode data file
|
|
url = "http://www.unicode.org/Public/UNIDATA/auxiliary/" + breaktype + "BreakProperty.txt"
|
|
for line in urllib2.urlopen( url ):
|
|
line = line.strip()
|
|
# Ignore comment or blank lines
|
|
if re.search( r"^\s*(#|$)", line ): continue
|
|
# Find things like one of the following:
|
|
# XXXX ; propertyname
|
|
# XXXX..YYYY ; propertyname
|
|
m = re.search( r"^([0-9A-F]{4,5})(?:\.\.([0-9A-F]{4,5}))?\s*;\s*(\w+)\s*#", line )
|
|
if not m:
|
|
raise ValueError( "Bad line: %r" % line )
|
|
start, end, prop = m.groups()
|
|
if start == 'D800' and end == 'DFFF':
|
|
continue # raw surrogates are not treated
|
|
|
|
if not ranges.has_key( prop ):
|
|
properties.append( prop )
|
|
ranges.setdefault( prop, [] ).append( (start, end) )
|
|
|
|
# Translate ranges into js fragments
|
|
fragments = []
|
|
for prop in properties:
|
|
rangeStrings = []
|
|
for start, end in ranges[prop]:
|
|
if not end:
|
|
rangeStrings.append( "0x" + start )
|
|
else:
|
|
rangeStrings.append( "[0x" + start + ", 0x" + end + "]" )
|
|
fragments.append( "'" + prop + "': [" + ", ".join( rangeStrings ) + "]" )
|
|
|
|
# Write js file
|
|
js = "unicodeJS." + breaktype.lower() + "breakproperties = {\n\t"
|
|
js += ",\n\t".join( fragments )
|
|
js += "\n};\n"
|
|
jsFilename = "../unicodejs." + breaktype.lower() + "breakproperties.js"
|
|
open( jsFilename, "w" ).write( js )
|
|
print "wrote " + jsFilename
|