mirror of
https://gerrit.wikimedia.org/r/mediawiki/extensions/Scribunto
synced 2024-11-30 11:04:17 +00:00
163 lines
4.1 KiB
Lua
163 lines
4.1 KiB
Lua
|
--[[---------------
|
||
|
Utf8 v0.4
|
||
|
-------------------
|
||
|
UTF-8 -> Unicode UCS-2 converter (each character is emitted as two bytes, low byte first)
|
||
|
|
||
|
How to use:
|
||
|
to convert:
|
||
|
ucs2_string = utf8.utf_to_uni(utf8_string)
|
||
|
|
||
|
to view a string in hex:
|
||
|
utf8.print_hex(str)
|
||
|
|
||
|
Under the MIT license.
|
||
|
|
||
|
Utf8 is part of the LuaBit Project (http://luaforge.net/projects/bit/).
|
||
|
|
||
|
copyright(c) 2007 hanzhao (abrash_han@hotmail.com)
|
||
|
--]]---------------
|
||
|
|
||
|
require 'hex'
|
||
|
require 'bit'
|
||
|
|
||
|
do
|
||
|
-- Lead-byte signatures: the value the high bits of a lead byte must have,
-- after the matching mask below is applied, for a 1-, 2- or 3-byte sequence.
local BYTE_1_HEAD, BYTE_2_HEAD, BYTE_3_HEAD =
    hex.to_dec('0x00'),  -- 0### ####
    hex.to_dec('0xC0'),  -- 110# ####
    hex.to_dec('0xE0')   -- 1110 ####

-- Masks that isolate the signature bits of a lead byte.
local BYTE_1_MASK, BYTE_2_MASK, BYTE_3_MASK =
    hex.to_dec('0x80'),  -- 1### ####
    hex.to_dec('0xE0'),  -- 111# ####
    hex.to_dec('0xF0')   -- 1111 ####

-- Mask applied to every continuation byte (10## ####) to keep its payload.
local TAIL_MASK = hex.to_dec('0x3F')
|
||
|
|
||
|
-- Parallel lookup tables: mask_tbl[k] and head_tbl[k] describe the same kind
-- of sequence.  Entries run from the 3-byte form down to the 1-byte form,
-- which is the order utf_read_char tries them in.
local mask_tbl = { BYTE_3_MASK, BYTE_2_MASK, BYTE_1_MASK }
local head_tbl = { BYTE_3_HEAD, BYTE_2_HEAD, BYTE_1_HEAD }

-- Sequence length in bytes, keyed by lead-byte signature.
local len_tbl = {}
len_tbl[BYTE_1_HEAD] = 1
len_tbl[BYTE_2_HEAD] = 2
len_tbl[BYTE_3_HEAD] = 3
|
||
|
|
||
|
-- Decode the UTF-8 sequence whose lead byte sits at index `start` of `utf`.
-- Only the 1-, 2- and 3-byte forms listed in mask_tbl/head_tbl are recognised.
-- @param utf    string holding UTF-8 data
-- @param start  1-based index of the sequence's lead byte
-- @return the decoded code point, and the number of bytes the sequence used
-- Raises an error when `start` is outside the string, when the string ends
-- before the sequence is complete, or when the lead byte matches no entry.
local function utf_read_char(utf, start)
    local head_byte = string.byte(utf, start)
    -- string.byte returns nothing past the end of the string; report that
    -- here instead of handing nil to bit.band.
    if head_byte == nil then
        error('utf8: start index ' .. tostring(start) .. ' is outside the string')
    end

    for m = 1, #mask_tbl do
        local mask = mask_tbl[m]
        -- the lead byte matches when its masked bits equal the signature
        if bit.band(head_byte, mask) == head_tbl[m] then
            local len = len_tbl[head_tbl[m]]
            local tail_idx = start + len - 1
            local char = 0

            -- Fold in the continuation bytes, last one first: the byte at
            -- tail_idx supplies the lowest six bits, each earlier byte the
            -- next six.
            for i = tail_idx, start + 1, -1 do
                local tail_byte = string.byte(utf, i)
                if tail_byte == nil then
                    error('utf8: truncated sequence starting at byte ' .. tostring(start))
                end
                local payload = bit.band(tail_byte, TAIL_MASK)
                if tail_idx - i > 0 then
                    char = bit.bor(char, bit.blshift(payload, (tail_idx - i) * 6))
                else
                    char = payload
                end
            end

            -- The lead byte's payload is whatever the signature mask leaves
            -- over; it forms the most significant bits of the result.
            local head_val = bit.band(head_byte, bit.bnot(mask))
            head_val = bit.blshift(head_val, (len - 1) * 6)
            char = bit.bor(head_val, char)

            return char, len
        end
    end

    error('not find proper head mask')
end
|
||
|
|
||
|
-- Print all bytes of `str` on a single line.  Each byte is rendered with
-- hex.to_hex and preceded by one space.
-- @param str  string to dump
local function print_hex(str)
    local pieces = {}
    for pos = 1, string.len(str) do
        pieces[pos] = ' ' .. hex.to_hex(string.byte(str, pos))
    end
    print(table.concat(pieces))
end
|
||
|
|
||
|
-- Byte masks built from '0xF0' and '0xFF'.  HI_MASK has no reader in the
-- code visible in this file; LO_MASK is the low-byte mask.
local HI_MASK, LO_MASK = hex.to_dec('0xF0'), hex.to_dec('0xFF')
|
||
|
|
||
|
-- Encode one code point as a two-byte UCS-2 unit, low byte first.
-- @param char  non-negative integer code point, at most 0xFFFF
-- @return two-character string: the low byte followed by the high byte
local function char_to_str(char)
    -- Plain arithmetic is enough to split a non-negative integer into bytes.
    local lo = char % 256
    local hi = math.floor(char / 256)
    -- string.char writes zero bytes verbatim, so no value needs a special
    -- case.  string.format('%c', 0) produces an empty string on Lua 5.1,
    -- which made code point 0 come out as one byte instead of two.
    return string.char(lo, hi)
end
|
||
|
|
||
|
-- Convert a UTF-8 string to UCS-2 by decoding one character at a time with
-- utf_read_char and encoding each result with char_to_str.
-- @param utf  UTF-8 encoded string
-- @return the concatenation of the encoded units (empty string for empty input)
-- Errors raised by utf_read_char propagate to the caller.
local function utf_to_uni(utf)
    local n = string.len(utf)
    local units = {}
    local count = 0
    local i = 1

    while i <= n do
        -- declared local so the loop does not write globals `char` and `len`
        local char, len = utf_read_char(utf, i)
        i = i + len
        count = count + 1
        units[count] = char_to_str(char)
    end

    -- join once at the end rather than re-copying the result every iteration
    return table.concat(units)
end
|
||
|
|
||
|
-- interface
|
||
|
-- Public interface: the converter and the hex dump helper are published
-- through the global table `utf8`.
utf8 = {}
utf8.utf_to_uni = utf_to_uni
utf8.print_hex = print_hex
|
||
|
|
||
|
end
|
||
|
|
||
|
--[[
|
||
|
-- test
|
||
|
byte_3 = string.format('%c%c%c', hex.to_dec('0xE7'), hex.to_dec('0x83'), hex.to_dec('0xad'))
|
||
|
print(string.len(byte_3))
|
||
|
utf8.utf_to_uni(byte_3)
|
||
|
--]]
|
||
|
--[[
|
||
|
byte_2 = string.format('%c%c', hex.to_dec('0xC2'), hex.to_dec('0x9D'))
|
||
|
utf8.utf_to_uni(byte_2)
|
||
|
|
||
|
byte_1 = string.format('%c', hex.to_dec('0xB'))
|
||
|
utf8.utf_to_uni(byte_1)
|
||
|
--]]
|
||
|
--[[
|
||
|
test_mul = string.format(
|
||
|
'%c%c%c%c%c%c%c%c%c',
|
||
|
hex.to_dec('0xE8'),hex.to_dec('0xAF'), hex.to_dec('0xBA'),
|
||
|
hex.to_dec('0xE5'),hex.to_dec('0x9F'), hex.to_dec('0xBA'),
|
||
|
hex.to_dec('0xE4'),hex.to_dec('0xBA'), hex.to_dec('0x9A'))
|
||
|
|
||
|
utf8.print_hex(utf8.utf_to_uni(test_mul))
|
||
|
--]]
|