mediawiki-extensions-Scribunto/engines/LuaCommon/lualib/luabit/utf8.lua
tstarling cebe775ee8 Added more Lua environment features
Package library:

* Added a simulation of the Lua 5.1 package library.
* Removed mw.import(), replaced it with a package loader. Packages can be
  retrieved from the wiki, using require('Module:Foo'), or from files
  distributed with Scribunto, using require('foo'). The "Module:" prefix allows
  for source compatibility with existing Lua code.
* Added a couple of libraries from LuaForge: luabit and stringtools.
* Made fetchModuleFromParser() return null on error instead of throwing an
  exception, to more easily support the desired behaviour of the package loader,
  which needs to return null on error.
* Renamed mw.setupEnvironment() to mw.setup() since it is setting up things
  other than the environment now.
* In MWServer:handleRegisterLibrary(), remove the feature which interprets dots
  in library names, since LuaSandbox doesn't support this.

Improved module isolation and related refactoring:

* Expose restricted versions of getfenv() and setfenv() to user Lua code.
  Requires luasandbox r114952.
* Don't cache the export list returned by module execution for later function
  calls. This breaks isolation of #invoke calls, since the local variables are
  persistent.
* Removed ScribuntoFunctionBase and its children, since it doesn't really have
  a purpose if it can't cache anything. Instead, invoke functions using a module
  method called invoke().
* Removed Module::initialize(), replaced it with a validate() function. This is
  a more elegant interface and works better with the new module caching scheme.
* Use a Status object for the return value of Engine::validate() instead of an
  array. Use the formatting facilities of the Status class.

Other:

* Removed "too many returns" error, doesn't fit in with Lua conventions.
* Use the standalone engine by default, so that the extension will work without
  configuration for more people.
* Added an accessor for $engine->interpreter
* Fix mw.clone() to correctly clone metatables
* If the standalone interpreter exits due to an error, there are some contexts
  where the initial error will be caught and ignored, and the user will see the
  error from checkValid() instead. In this case, rethrow the original error for
  a more informative message.
* Load mw.lua into the initial standalone environment, to reduce code
  duplication between mw.lua and MWServer.lua.
* Fixed a bug in Scribunto_LuaStandaloneInterpreter::handleCall() for functions
  that return no results.
* Fixed a bug in encodeLuaVar() for strings with "\r". Added test case.
* In MWServer.lua, don't call error() for internal errors, instead just print
  the error and exit. This avoids a protocol violation when an error is
  encountered from within handleCall().
* Added lots of documentation. Lua doc comments are in LuaDoc format.

Change-Id: Ie2fd572c362bedf02f45d3fa5352a5280e034740
2012-04-18 13:46:18 +10:00

163 lines
4.1 KiB
Lua

--[[---------------
Utf8 v0.4
-------------------
UTF-8 -> Unicode UCS-2 converter (each 16-bit unit is written low byte first)
How to use:
to convert:
ucs2_string = utf8.utf_to_uni(utf8_string)
to view a string in hex:
utf8.print_hex(str)
Under the MIT license.
Utf8 is a part of LuaBit Project(http://luaforge.net/projects/bit/).
copyright(c) 2007 hanzhao (abrash_han@hotmail.com)
--]]---------------
require 'hex'
require 'bit'
do
-- Lead-byte classification data for UTF-8 sequences of one to three bytes.
-- A lead byte is classified by AND-ing it with a mask and comparing the
-- result with the matching head signature.
local to_dec = hex.to_dec

-- Head signatures: the fixed high bits of a lead byte.
local BYTE_1_HEAD = to_dec('0x00') -- 0xxx xxxx (one-byte sequence)
local BYTE_2_HEAD = to_dec('0xC0') -- 110x xxxx (two-byte sequence)
local BYTE_3_HEAD = to_dec('0xE0') -- 1110 xxxx (three-byte sequence)

-- Masks selecting the signature bits of a lead byte.
local BYTE_1_MASK = to_dec('0x80') -- 1000 0000
local BYTE_2_MASK = to_dec('0xE0') -- 1110 0000
local BYTE_3_MASK = to_dec('0xF0') -- 1111 0000

-- Mask keeping the six payload bits (0011 1111) of a 10xx xxxx tail byte.
local TAIL_MASK = to_dec('0x3F')

-- Parallel arrays: mask_tbl[m] goes with head_tbl[m]; the three-byte form
-- is listed first, the one-byte form last.
local mask_tbl = { BYTE_3_MASK, BYTE_2_MASK, BYTE_1_MASK }
local head_tbl = { BYTE_3_HEAD, BYTE_2_HEAD, BYTE_1_HEAD }

-- Sequence length in bytes, keyed by head signature.
local len_tbl = {}
len_tbl[BYTE_1_HEAD] = 1
len_tbl[BYTE_2_HEAD] = 2
len_tbl[BYTE_3_HEAD] = 3
--- Decode a single UTF-8 sequence of one to three bytes.
-- The lead byte is matched against each mask/head pair; the payload bits of
-- the lead byte and of every following byte are then combined into one
-- number. Following bytes are only masked with TAIL_MASK, they are not
-- checked for the 10xx xxxx form.
-- @param utf string containing the UTF-8 bytes
-- @param start 1-based index of the lead byte of the sequence
-- @return the decoded code point
-- @return the number of bytes the sequence occupies
-- Raises an error when there is no byte at `start`, when the sequence runs
-- past the end of the string, or when the lead byte matches no known head.
local function utf_read_char(utf, start)
  local head_byte = string.byte(utf, start)
  if head_byte == nil then
    -- Fail with a clear message instead of handing nil to the bit library.
    error('no byte at position ' .. tostring(start))
  end
  -- '#' is used instead of table.getn, which is deprecated in Lua 5.1.
  for m = 1, #mask_tbl do
    local mask = mask_tbl[m]
    local head = head_tbl[m]
    if bit.band(head_byte, mask) == head then
      local len = len_tbl[head]
      local tail_idx = start + len - 1
      local char = 0
      -- Walk the tail bytes from last to first; the last one supplies the
      -- lowest six bits, each earlier one is shifted six bits further left.
      for i = tail_idx, start + 1, -1 do
        local tail_byte = string.byte(utf, i)
        if tail_byte == nil then
          error('truncated utf8 sequence at position ' .. tostring(start))
        end
        local payload = bit.band(tail_byte, TAIL_MASK)
        local distance = tail_idx - i
        if distance > 0 then
          char = bit.bor(char, bit.blshift(payload, distance * 6))
        else
          char = payload
        end
      end
      -- The lead byte's payload bits go above all the tail bits.
      local head_val = bit.band(head_byte, bit.bnot(mask))
      head_val = bit.blshift(head_val, (len - 1) * 6)
      return bit.bor(head_val, char), len
    end
  end
  error('not find proper head mask')
end
--- Print every byte of a string in hex on a single line.
-- Each byte is rendered with hex.to_hex and preceded by one space, so the
-- printed line starts with a space (and is empty for an empty string).
-- @param str string whose bytes are dumped
local function print_hex(str)
  local pieces = {}
  for pos = 1, string.len(str) do
    pieces[pos] = ' ' .. hex.to_hex(string.byte(str, pos))
  end
  print(table.concat(pieces))
end
--- Encode one 16-bit code unit as two bytes, low byte first.
-- string.char is used instead of string.format('%c', ...): under Lua 5.1
-- '%c' with the value 0 yields an empty string, which is why the previous
-- code special-cased zero bytes and still produced a single byte for code
-- point 0. string.char emits NUL bytes correctly, so one expression covers
-- every case and the result is always exactly two bytes long.
-- @param char non-negative integer code unit (0..0xFFFF as produced by the
--   decoder in this file)
-- @return two-byte string: low byte followed by high byte
local function char_to_str(char)
  -- For non-negative integers modulo / floor division are exact equivalents
  -- of masking with 0xFF and shifting right by eight bits.
  local lo = char % 256
  -- The extra modulo keeps only eight bits of the high part, as '%c' did.
  local hi = math.floor(char / 256) % 256
  return string.char(lo, hi)
end
--- Convert a UTF-8 string into a string of two-byte code units.
-- Sequences are decoded one after another with utf_read_char and each
-- decoded value is encoded with char_to_str.
-- @param utf UTF-8 encoded string
-- @return the concatenation of the encoded code units ('' for empty input)
-- Any error raised by utf_read_char propagates to the caller.
local function utf_to_uni(utf)
  local n = string.len(utf)
  -- Collect the pieces and join once; repeated '..' is quadratic.
  local units = {}
  local count = 0
  local i = 1
  while i <= n do
    -- Declared local: these used to leak into the global environment.
    local char, len = utf_read_char(utf, i)
    count = count + 1
    units[count] = char_to_str(char)
    i = i + len
  end
  return table.concat(units)
end
-- Public API, published as the global table `utf8`.
local exports = {}
exports.utf_to_uni = utf_to_uni -- UTF-8 string -> two-byte code unit string
exports.print_hex = print_hex -- print the bytes of a string in hex
utf8 = exports
end
--[[
-- test
byte_3 = string.format('%c%c%c', hex.to_dec('0xE7'), hex.to_dec('0x83'), hex.to_dec('0xad'))
print(string.len(byte_3))
utf8.utf_to_uni(byte_3)
--]]
--[[
byte_2 = string.format('%c%c', hex.to_dec('0xC2'), hex.to_dec('0x9D'))
utf8.utf_to_uni(byte_2)
byte_1 = string.format('%c', hex.to_dec('0xB'))
utf8.utf_to_uni(byte_1)
--]]
--[[
test_mul = string.format(
'%c%c%c%c%c%c%c%c%c',
hex.to_dec('0xE8'),hex.to_dec('0xAF'), hex.to_dec('0xBA'),
hex.to_dec('0xE5'),hex.to_dec('0x9F'), hex.to_dec('0xBA'),
hex.to_dec('0xE4'),hex.to_dec('0xBA'), hex.to_dec('0x9A'))
utf8.print_hex(utf8.utf_to_uni(test_mul))
--]]