coolbins/system/usr/share/nmap/nselib/unicode.lua

---
-- Library methods for handling unicode strings.
--
-- @author Daniel Miller
-- @copyright Same as Nmap--See https://nmap.org/book/man-legal.html


local string = require "string"
local table = require "table"
local stdnse = require "stdnse"
local unittest = require "unittest"
local tableaux = require "tableaux"
_ENV = stdnse.module("unicode", stdnse.seeall)

-- Localize a few functions for a tiny speed boost, since these will be looped
-- over every char of a string
local byte = string.byte
local char = string.char
local pack = string.pack
local unpack = string.unpack
local concat = table.concat


---Decode a buffer containing Unicode data.
--@param buf The string/buffer to be decoded
--@param decoder A Unicode decoder function (such as utf8_dec)
--@param bigendian For encodings that care about byte-order (such as UTF-16),
--                 set this to true to force big-endian byte order. Default:
--                 false (little-endian)
--@return A list-table containing the code points as numbers
function decode(buf, decoder, bigendian)
  local cp = {}
  local pos = 1
  while pos <= #buf do
    pos, cp[#cp+1] = decoder(buf, pos, bigendian)
  end
  return cp
end

---Encode a list of Unicode code points
--@param list A list-table of code points as numbers
--@param encoder A Unicode encoder function (such as utf8_enc)
--@param bigendian For encodings that care about byte-order (such as UTF-16),
--                 set this to true to force big-endian byte order. Default:
--                 false (little-endian)
--@return An encoded string
function encode(list, encoder, bigendian)
  local buf = {}
  for i, cp in ipairs(list) do
    buf[i] = encoder(cp, bigendian)
  end
  return table.concat(buf, "")
end

---Transcode a string from one format to another
--
--The string will be decoded and re-encoded in one pass. This saves some
--overhead vs simply passing the output of <code>unicode.encode</code> to
--<code>unicode.decode</code>.
--@param buf The string/buffer to be transcoded
--@param decoder A Unicode decoder function (such as utf16_dec)
--@param encoder A Unicode encoder function (such as utf8_enc)
--@param bigendian_dec Set this to true to force big-endian decoding.
--@param bigendian_enc Set this to true to force big-endian encoding.
--@return An encoded string
function transcode(buf, decoder, encoder, bigendian_dec, bigendian_enc)
  local out = {}
  local cp
  local pos = 1
  while pos <= #buf do
    pos, cp = decoder(buf, pos, bigendian_dec)
    out[#out+1] = encoder(cp, bigendian_enc)
  end
  return table.concat(out)
end

--- Determine (poorly) the character encoding of a string
--
-- First, the string is checked for a Byte-order Mark (BOM). This can be
-- examined to determine UTF-16 with endianness or UTF-8. If no BOM is found,
-- the string is examined.
--
-- If null bytes are encountered, UTF-16 is assumed. Endianness is determined
-- by byte position, assuming the null is the high-order byte. Otherwise, if
-- byte values over 127 are found, UTF-8 decoding is attempted. If this fails,
-- the result is 'other', otherwise it is 'utf-8'. If no high bytes are found,
-- the result is 'ascii'.
--
--@param buf The string/buffer to be identified
--@param len The number of bytes to inspect in order to identify the string.
--           Default: 100
--@return A string describing the encoding: 'ascii', 'utf-8', 'utf-16be',
--        'utf-16le', or 'other' meaning some unidentified 8-bit encoding
function chardet(buf, len)
  local limit = len or 100
  if limit > #buf then
    limit = #buf
  end
  -- Check BOM
  if limit >= 2 then
    local bom1, bom2 = byte(buf, 1, 2)
    if bom1 == 0xff and bom2 == 0xfe then
      return 'utf-16le'
    elseif bom1 == 0xfe and bom2 == 0xff then
      return 'utf-16be'
    elseif limit >= 3 then
      local bom3 = byte(buf, 3)
      if bom1 == 0xef and bom2 == 0xbb and bom3 == 0xbf then
        return 'utf-8'
      end
    end
  end
  -- Try bytes
  local pos = 1
  local high = false
  local utf8 = true
  while pos < limit do
    local c = byte(buf, pos)
    if c == 0 then
      if pos % 2 == 0 then
        return 'utf-16le'
      else
        return 'utf-16be'
      end
      utf8 = false
      pos = pos + 1
    elseif c > 127 then
      if not high then
        high = true
      end
      if utf8 then
        local p, cp = utf8_dec(buf, pos)
        if not p then
          utf8 = false
        else
          pos = p
        end
      end
      if not utf8 then
        pos = pos + 1
      end
    else
      pos = pos + 1
    end
  end
  if high then
    if utf8 then
      return 'utf-8'
    else
      return 'other'
    end
  else
    return 'ascii'
  end
end

---Encode a Unicode code point to UTF-16. See RFC 2781.
--
-- Windows OS prior to Windows 2000 only supports UCS-2, so beware using this
-- function to encode code points above 0xFFFF.
--@param cp The Unicode code point as a number
--@param bigendian Set this to true to encode big-endian UTF-16. Default is
--                 false (little-endian)
--@return A string containing the code point in UTF-16 encoding.
function utf16_enc(cp, bigendian)
  local fmt = "<I2"
  if bigendian then
    fmt = ">I2"
  end

  if cp % 1.0 ~= 0.0 or cp < 0 then
    -- Only defined for nonnegative integers.
    return nil
  elseif cp <= 0xFFFF then
    return pack(fmt, cp)
  elseif cp <= 0x10FFFF then
    cp = cp - 0x10000
    return pack(fmt .. fmt, 0xD800 + (cp >> 10), 0xDC00 + (cp & 0x3FF))
  else
    return nil
  end
end

---Decodes a UTF-16 character.
--
-- Does not check that the returned code point is a real character.
-- Specifically, it can be fooled by out-of-order lead- and trail-surrogate
-- characters.
--@param buf A string containing the character
--@param pos The index in the string where the character begins
--@param bigendian Set this to true to encode big-endian UTF-16. Default is
--                 false (little-endian)
--@return pos The index in the string where the character ended
--@return cp The code point of the character as a number
function utf16_dec(buf, pos, bigendian)
  local fmt = "<I2"
  if bigendian then
    fmt = ">I2"
  end

  local cp
  cp, pos = unpack(fmt, buf, pos)
  if cp >= 0xD800 and cp <= 0xDFFF then
    local high = (cp - 0xD800) << 10
    cp, pos = unpack(fmt, buf, pos)
    cp = 0x10000 + high + cp - 0xDC00
  end
  return pos, cp
end

---Encode a Unicode code point to UTF-8. See RFC 3629.
--
-- Does not check that cp is a real character; that is, doesn't exclude the
-- surrogate range U+D800 - U+DFFF and a handful of others.
--@param cp The Unicode code point as a number
--@return A string containing the code point in UTF-8 encoding.
function utf8_enc(cp)
  local bytes = {}
  local n, mask

  if cp % 1.0 ~= 0.0 or cp < 0 then
    -- Only defined for nonnegative integers.
    return nil
  elseif cp <= 0x7F then
    -- Special case of one-byte encoding.
    return char(cp)
  elseif cp <= 0x7FF then
    n = 2
    mask = 0xC0
  elseif cp <= 0xFFFF then
    n = 3
    mask = 0xE0
  elseif cp <= 0x10FFFF then
    n = 4
    mask = 0xF0
  else
    return nil
  end

  while n > 1 do
    bytes[n] = char(0x80 + (cp & 0x3F))
    cp = cp >> 6
    n = n - 1
  end
  bytes[1] = char(mask + cp)

  return table.concat(bytes)
end

---Decodes a UTF-8 character.
--
-- Does not check that the returned code point is a real character.
--@param buf A string containing the character
--@param pos The index in the string where the character begins
--@return pos The index in the string where the character ended or nil on error
--@return cp The code point of the character as a number, or an error string
function utf8_dec(buf, pos)
  pos = pos or 1
  local n, mask
  local bv = byte(buf, pos)
  if bv <= 0x7F then
    return pos+1, bv
  elseif bv <= 0xDF then
    --110xxxxx 10xxxxxx
    n = 1
    mask = 0xC0
  elseif bv <= 0xEF then
    --1110xxxx 10xxxxxx 10xxxxxx
    n = 2
    mask = 0xE0
  elseif bv <= 0xF7 then
    --11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    n = 3
    mask = 0xF0
  else
    return nil, string.format("Invalid UTF-8 byte at %d", pos)
  end

  local cp = bv - mask

  if pos + n > #buf then
    return nil, string.format("Incomplete UTF-8 sequence at %d", pos)
  end
  for i = 1, n do
    bv = byte(buf, pos + i)
    if bv < 0x80 or bv > 0xBF then
      return nil, string.format("Invalid UTF-8 sequence at %d", pos + i)
    end
    cp = (cp << 6) + (bv & 0x3F)
  end

  return pos + 1 + n, cp
end

-- Code Page 437, native US-English Windows OEM code page
local cp437_decode = {
  [0x80] = 0x00c7,
  [0x81] = 0x00fc,
  [0x82] = 0x00e9,
  [0x83] = 0x00e2,
  [0x84] = 0x00e4,
  [0x85] = 0x00e0,
  [0x86] = 0x00e5,
  [0x87] = 0x00e7,
  [0x88] = 0x00ea,
  [0x89] = 0x00eb,
  [0x8a] = 0x00e8,
  [0x8b] = 0x00ef,
  [0x8c] = 0x00ee,
  [0x8d] = 0x00ec,
  [0x8e] = 0x00c4,
  [0x8f] = 0x00c5,
  [0x90] = 0x00c9,
  [0x91] = 0x00e6,
  [0x92] = 0x00c6,
  [0x93] = 0x00f4,
  [0x94] = 0x00f6,
  [0x95] = 0x00f2,
  [0x96] = 0x00fb,
  [0x97] = 0x00f9,
  [0x98] = 0x00ff,
  [0x99] = 0x00d6,
  [0x9a] = 0x00dc,
  [0x9b] = 0x00a2,
  [0x9c] = 0x00a3,
  [0x9d] = 0x00a5,
  [0x9e] = 0x20a7,
  [0x9f] = 0x0192,
  [0xa0] = 0x00e1,
  [0xa1] = 0x00ed,
  [0xa2] = 0x00f3,
  [0xa3] = 0x00fa,
  [0xa4] = 0x00f1,
  [0xa5] = 0x00d1,
  [0xa6] = 0x00aa,
  [0xa7] = 0x00ba,
  [0xa8] = 0x00bf,
  [0xa9] = 0x2310,
  [0xaa] = 0x00ac,
  [0xab] = 0x00bd,
  [0xac] = 0x00bc,
  [0xad] = 0x00a1,
  [0xae] = 0x00ab,
  [0xaf] = 0x00bb,
  [0xb0] = 0x2591,
  [0xb1] = 0x2592,
  [0xb2] = 0x2593,
  [0xb3] = 0x2502,
  [0xb4] = 0x2524,
  [0xb5] = 0x2561,
  [0xb6] = 0x2562,
  [0xb7] = 0x2556,
  [0xb8] = 0x2555,
  [0xb9] = 0x2563,
  [0xba] = 0x2551,
  [0xbb] = 0x2557,
  [0xbc] = 0x255d,
  [0xbd] = 0x255c,
  [0xbe] = 0x255b,
  [0xbf] = 0x2510,
  [0xc0] = 0x2514,
  [0xc1] = 0x2534,
  [0xc2] = 0x252c,
  [0xc3] = 0x251c,
  [0xc4] = 0x2500,
  [0xc5] = 0x253c,
  [0xc6] = 0x255e,
  [0xc7] = 0x255f,
  [0xc8] = 0x255a,
  [0xc9] = 0x2554,
  [0xca] = 0x2569,
  [0xcb] = 0x2566,
  [0xcc] = 0x2560,
  [0xcd] = 0x2550,
  [0xce] = 0x256c,
  [0xcf] = 0x2567,
  [0xd0] = 0x2568,
  [0xd1] = 0x2564,
  [0xd2] = 0x2565,
  [0xd3] = 0x2559,
  [0xd4] = 0x2558,
  [0xd5] = 0x2552,
  [0xd6] = 0x2553,
  [0xd7] = 0x256b,
  [0xd8] = 0x256a,
  [0xd9] = 0x2518,
  [0xda] = 0x250c,
  [0xdb] = 0x2588,
  [0xdc] = 0x2584,
  [0xdd] = 0x258c,
  [0xde] = 0x2590,
  [0xdf] = 0x2580,
  [0xe0] = 0x03b1,
  [0xe1] = 0x00df,
  [0xe2] = 0x0393,
  [0xe3] = 0x03c0,
  [0xe4] = 0x03a3,
  [0xe5] = 0x03c3,
  [0xe6] = 0x00b5,
  [0xe7] = 0x03c4,
  [0xe8] = 0x03a6,
  [0xe9] = 0x0398,
  [0xea] = 0x03a9,
  [0xeb] = 0x03b4,
  [0xec] = 0x221e,
  [0xed] = 0x03c6,
  [0xee] = 0x03b5,
  [0xef] = 0x2229,
  [0xf0] = 0x2261,
  [0xf1] = 0x00b1,
  [0xf2] = 0x2265,
  [0xf3] = 0x2264,
  [0xf4] = 0x2320,
  [0xf5] = 0x2321,
  [0xf6] = 0x00f7,
  [0xf7] = 0x2248,
  [0xf8] = 0x00b0,
  [0xf9] = 0x2219,
  [0xfa] = 0x00b7,
  [0xfb] = 0x221a,
  [0xfc] = 0x207f,
  [0xfd] = 0x00b2,
  [0xfe] = 0x25a0,
  [0xff] = 0x00a0,
}
local cp437_encode = tableaux.invert(cp437_decode)

---Encode a Unicode code point to CP437
--
-- Returns nil if the code point cannot be found in CP437
--@param cp The Unicode code point as a number
--@return A string containing the related CP437 character
function cp437_enc(cp)
  if cp < 0x80 then
    return char(cp)
  else
    local bv = cp437_encode[cp]
    if bv == nil then
      return nil
    else
      return char(bv)
    end
  end
end

---Decodes a CP437 character
--@param buf A string containing the character
--@param pos The index in the string where the character begins
--@return pos The index in the string where the character ended
--@return cp The code point of the character as a number
function cp437_dec(buf, pos)
  pos = pos or 1
  local bv = byte(buf, pos)
  if bv < 0x80 then
    return pos + 1, bv
  else
    return pos + 1, cp437_decode[bv]
  end
end

---Helper function for the common case of UTF-16 to UTF-8 transcoding, such as
--from a Windows/SMB unicode string to a printable ASCII (subset of UTF-8)
--string.
--@param from A string in UTF-16, little-endian
--@return The string in UTF-8
function utf16to8(from)
  return transcode(from, utf16_dec, utf8_enc, false, nil)
end

---Helper function for the common case of UTF-8 to UTF-16 transcoding, such as
--from a printable ASCII (subset of UTF-8) string to a Windows/SMB unicode
--string.
--@param from A string in UTF-8
--@return The string in UTF-16, little-endian
function utf8to16(from)
  return transcode(from, utf8_dec, utf16_enc, nil, false)
end

if not unittest.testing() then
  return _ENV
end

test_suite = unittest.TestSuite:new()
test_suite:add_test(function()
    local pos, cp = utf8_dec("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E")
    return pos == 4 and cp == 0x65E5, string.format("Expected 4, 0x65E5; got %d, 0x%x", pos, cp)
  end, "utf8_dec")

test_suite:add_test(unittest.equal(encode({0x65E5,0x672C,0x8A9E}, utf8_enc), "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),"encode utf-8")
test_suite:add_test(unittest.equal(encode({0x12345,61,82,97}, utf16_enc), "\x08\xD8\x45\xDF=\0R\0a\0"),"encode utf-16")
test_suite:add_test(unittest.equal(encode({0x12345,61,82,97}, utf16_enc, true), "\xD8\x08\xDF\x45\0=\0R\0a"),"encode utf-16, big-endian")
test_suite:add_test(unittest.table_equal(decode("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E", utf8_dec), {0x65E5,0x672C,0x8A9E}),"decode utf-8")
test_suite:add_test(unittest.table_equal(decode("\x08\xD8\x45\xDF=\0R\0a\0", utf16_dec), {0x12345,61,82,97}),"decode utf-16")
test_suite:add_test(unittest.table_equal(decode("\xD8\x08\xDF\x45\0=\0R\0a", utf16_dec, true), {0x12345,61,82,97}),"decode utf-16, big-endian")
test_suite:add_test(unittest.equal(utf16to8("\x08\xD8\x45\xDF=\0R\0a\0"), "\xF0\x92\x8D\x85=Ra"),"utf16to8")
test_suite:add_test(unittest.equal(utf8to16("\xF0\x92\x8D\x85=Ra"), "\x08\xD8\x45\xDF=\0R\0a\0"),"utf8to16")
test_suite:add_test(unittest.equal(encode({0x221e, 0x2248, 0x30}, cp437_enc), "\xec\xf70"), "encode cp437")
test_suite:add_test(unittest.table_equal(decode("\x81ber", cp437_dec), {0xfc, 0x62, 0x65, 0x72}), "decode cp437")
test_suite:add_test(unittest.equal(chardet("\x08\xD8\x45\xDF=\0R\0a\0"), 'utf-16le'), "detect utf-16le")
test_suite:add_test(unittest.equal(chardet("\xD8\x08\xDF\x45\0=\0R\0a"), 'utf-16be'), "detect utf-16be")
test_suite:add_test(unittest.equal(chardet("...\xF0\x92\x8D\x85=Ra"), 'utf-8'), "detect utf-8")
test_suite:add_test(unittest.equal(chardet("This sentence is completely normal."), 'ascii'), "detect ascii")
test_suite:add_test(unittest.equal(chardet('Comme ci, comme \xe7a'), 'other'), "detect other")

return _ENV
Migrated stuff in from previous project 2025-01-17 11:41:33 +02:00			`---`
			`-- Library methods for handling unicode strings.`
			`--`
			`-- @author Daniel Miller`
			`-- @copyright Same as Nmap--See https://nmap.org/book/man-legal.html`


			`local string = require "string"`
			`local table = require "table"`
			`local stdnse = require "stdnse"`
			`local unittest = require "unittest"`
			`local tableaux = require "tableaux"`
			`_ENV = stdnse.module("unicode", stdnse.seeall)`

			`-- Localize a few functions for a tiny speed boost, since these will be looped`
			`-- over every char of a string`
			`local byte = string.byte`
			`local char = string.char`
			`local pack = string.pack`
			`local unpack = string.unpack`
			`local concat = table.concat`


			`---Decode a buffer containing Unicode data.`
			`--@param buf The string/buffer to be decoded`
			`--@param decoder A Unicode decoder function (such as utf8_dec)`
			`--@param bigendian For encodings that care about byte-order (such as UTF-16),`
			`-- set this to true to force big-endian byte order. Default:`
			`-- false (little-endian)`
			`--@return A list-table containing the code points as numbers`
			`function decode(buf, decoder, bigendian)`
			`local cp = {}`
			`local pos = 1`
			`while pos <= #buf do`
			`pos, cp[#cp+1] = decoder(buf, pos, bigendian)`
			`end`
			`return cp`
			`end`

			`---Encode a list of Unicode code points`
			`--@param list A list-table of code points as numbers`
			`--@param encoder A Unicode encoder function (such as utf8_enc)`
			`--@param bigendian For encodings that care about byte-order (such as UTF-16),`
			`-- set this to true to force big-endian byte order. Default:`
			`-- false (little-endian)`
			`--@return An encoded string`
			`function encode(list, encoder, bigendian)`
			`local buf = {}`
			`for i, cp in ipairs(list) do`
			`buf[i] = encoder(cp, bigendian)`
			`end`
			`return table.concat(buf, "")`
			`end`

			`---Transcode a string from one format to another`
			`--`
			`--The string will be decoded and re-encoded in one pass. This saves some`
			`--overhead vs simply passing the output of <code>unicode.encode</code> to`
			`--<code>unicode.decode</code>.`
			`--@param buf The string/buffer to be transcoded`
			`--@param decoder A Unicode decoder function (such as utf16_dec)`
			`--@param encoder A Unicode encoder function (such as utf8_enc)`
			`--@param bigendian_dec Set this to true to force big-endian decoding.`
			`--@param bigendian_enc Set this to true to force big-endian encoding.`
			`--@return An encoded string`
			`function transcode(buf, decoder, encoder, bigendian_dec, bigendian_enc)`
			`local out = {}`
			`local cp`
			`local pos = 1`
			`while pos <= #buf do`
			`pos, cp = decoder(buf, pos, bigendian_dec)`
			`out[#out+1] = encoder(cp, bigendian_enc)`
			`end`
			`return table.concat(out)`
			`end`

			`--- Determine (poorly) the character encoding of a string`
			`--`
			`-- First, the string is checked for a Byte-order Mark (BOM). This can be`
			`-- examined to determine UTF-16 with endianness or UTF-8. If no BOM is found,`
			`-- the string is examined.`
			`--`
			`-- If null bytes are encountered, UTF-16 is assumed. Endianness is determined`
			`-- by byte position, assuming the null is the high-order byte. Otherwise, if`
			`-- byte values over 127 are found, UTF-8 decoding is attempted. If this fails,`
			`-- the result is 'other', otherwise it is 'utf-8'. If no high bytes are found,`
			`-- the result is 'ascii'.`
			`--`
			`--@param buf The string/buffer to be identified`
			`--@param len The number of bytes to inspect in order to identify the string.`
			`-- Default: 100`
			`--@return A string describing the encoding: 'ascii', 'utf-8', 'utf-16be',`
			`-- 'utf-16le', or 'other' meaning some unidentified 8-bit encoding`
			`function chardet(buf, len)`
			`local limit = len or 100`
			`if limit > #buf then`
			`limit = #buf`
			`end`
			`-- Check BOM`
			`if limit >= 2 then`
			`local bom1, bom2 = byte(buf, 1, 2)`
			`if bom1 == 0xff and bom2 == 0xfe then`
			`return 'utf-16le'`
			`elseif bom1 == 0xfe and bom2 == 0xff then`
			`return 'utf-16be'`
			`elseif limit >= 3 then`
			`local bom3 = byte(buf, 3)`
			`if bom1 == 0xef and bom2 == 0xbb and bom3 == 0xbf then`
			`return 'utf-8'`
			`end`
			`end`
			`end`
			`-- Try bytes`
			`local pos = 1`
			`local high = false`
			`local utf8 = true`
			`while pos < limit do`
			`local c = byte(buf, pos)`
			`if c == 0 then`
			`if pos % 2 == 0 then`
			`return 'utf-16le'`
			`else`
			`return 'utf-16be'`
			`end`
			`utf8 = false`
			`pos = pos + 1`
			`elseif c > 127 then`
			`if not high then`
			`high = true`
			`end`
			`if utf8 then`
			`local p, cp = utf8_dec(buf, pos)`
			`if not p then`
			`utf8 = false`
			`else`
			`pos = p`
			`end`
			`end`
			`if not utf8 then`
			`pos = pos + 1`
			`end`
			`else`
			`pos = pos + 1`
			`end`
			`end`
			`if high then`
			`if utf8 then`
			`return 'utf-8'`
			`else`
			`return 'other'`
			`end`
			`else`
			`return 'ascii'`
			`end`
			`end`

			`---Encode a Unicode code point to UTF-16. See RFC 2781.`
			`--`
			`-- Windows OS prior to Windows 2000 only supports UCS-2, so beware using this`
			`-- function to encode code points above 0xFFFF.`
			`--@param cp The Unicode code point as a number`
			`--@param bigendian Set this to true to encode big-endian UTF-16. Default is`
			`-- false (little-endian)`
			`--@return A string containing the code point in UTF-16 encoding.`
			`function utf16_enc(cp, bigendian)`
			`local fmt = "<I2"`
			`if bigendian then`
			`fmt = ">I2"`
			`end`

			`if cp % 1.0 ~= 0.0 or cp < 0 then`
			`-- Only defined for nonnegative integers.`
			`return nil`
			`elseif cp <= 0xFFFF then`
			`return pack(fmt, cp)`
			`elseif cp <= 0x10FFFF then`
			`cp = cp - 0x10000`
			`return pack(fmt .. fmt, 0xD800 + (cp >> 10), 0xDC00 + (cp & 0x3FF))`
			`else`
			`return nil`
			`end`
			`end`

			`---Decodes a UTF-16 character.`
			`--`
			`-- Does not check that the returned code point is a real character.`
			`-- Specifically, it can be fooled by out-of-order lead- and trail-surrogate`
			`-- characters.`
			`--@param buf A string containing the character`
			`--@param pos The index in the string where the character begins`
			`--@param bigendian Set this to true to encode big-endian UTF-16. Default is`
			`-- false (little-endian)`
			`--@return pos The index in the string where the character ended`
			`--@return cp The code point of the character as a number`
			`function utf16_dec(buf, pos, bigendian)`
			`local fmt = "<I2"`
			`if bigendian then`
			`fmt = ">I2"`
			`end`

			`local cp`
			`cp, pos = unpack(fmt, buf, pos)`
			`if cp >= 0xD800 and cp <= 0xDFFF then`
			`local high = (cp - 0xD800) << 10`
			`cp, pos = unpack(fmt, buf, pos)`
			`cp = 0x10000 + high + cp - 0xDC00`
			`end`
			`return pos, cp`
			`end`

			`---Encode a Unicode code point to UTF-8. See RFC 3629.`
			`--`
			`-- Does not check that cp is a real character; that is, doesn't exclude the`
			`-- surrogate range U+D800 - U+DFFF and a handful of others.`
			`--@param cp The Unicode code point as a number`
			`--@return A string containing the code point in UTF-8 encoding.`
			`function utf8_enc(cp)`
			`local bytes = {}`
			`local n, mask`

			`if cp % 1.0 ~= 0.0 or cp < 0 then`
			`-- Only defined for nonnegative integers.`
			`return nil`
			`elseif cp <= 0x7F then`
			`-- Special case of one-byte encoding.`
			`return char(cp)`
			`elseif cp <= 0x7FF then`
			`n = 2`
			`mask = 0xC0`
			`elseif cp <= 0xFFFF then`
			`n = 3`
			`mask = 0xE0`
			`elseif cp <= 0x10FFFF then`
			`n = 4`
			`mask = 0xF0`
			`else`
			`return nil`
			`end`

			`while n > 1 do`
			`bytes[n] = char(0x80 + (cp & 0x3F))`
			`cp = cp >> 6`
			`n = n - 1`
			`end`
			`bytes[1] = char(mask + cp)`

			`return table.concat(bytes)`
			`end`

			`---Decodes a UTF-8 character.`
			`--`
			`-- Does not check that the returned code point is a real character.`
			`--@param buf A string containing the character`
			`--@param pos The index in the string where the character begins`
			`--@return pos The index in the string where the character ended or nil on error`
			`--@return cp The code point of the character as a number, or an error string`
			`function utf8_dec(buf, pos)`
			`pos = pos or 1`
			`local n, mask`
			`local bv = byte(buf, pos)`
			`if bv <= 0x7F then`
			`return pos+1, bv`
			`elseif bv <= 0xDF then`
			`--110xxxxx 10xxxxxx`
			`n = 1`
			`mask = 0xC0`
			`elseif bv <= 0xEF then`
			`--1110xxxx 10xxxxxx 10xxxxxx`
			`n = 2`
			`mask = 0xE0`
			`elseif bv <= 0xF7 then`
			`--11110xxx 10xxxxxx 10xxxxxx 10xxxxxx`
			`n = 3`
			`mask = 0xF0`
			`else`
			`return nil, string.format("Invalid UTF-8 byte at %d", pos)`
			`end`

			`local cp = bv - mask`

			`if pos + n > #buf then`
			`return nil, string.format("Incomplete UTF-8 sequence at %d", pos)`
			`end`
			`for i = 1, n do`
			`bv = byte(buf, pos + i)`
			`if bv < 0x80 or bv > 0xBF then`
			`return nil, string.format("Invalid UTF-8 sequence at %d", pos + i)`
			`end`
			`cp = (cp << 6) + (bv & 0x3F)`
			`end`

			`return pos + 1 + n, cp`
			`end`

			`-- Code Page 437, native US-English Windows OEM code page`
			`local cp437_decode = {`
			`[0x80] = 0x00c7,`
			`[0x81] = 0x00fc,`
			`[0x82] = 0x00e9,`
			`[0x83] = 0x00e2,`
			`[0x84] = 0x00e4,`
			`[0x85] = 0x00e0,`
			`[0x86] = 0x00e5,`
			`[0x87] = 0x00e7,`
			`[0x88] = 0x00ea,`
			`[0x89] = 0x00eb,`
			`[0x8a] = 0x00e8,`
			`[0x8b] = 0x00ef,`
			`[0x8c] = 0x00ee,`
			`[0x8d] = 0x00ec,`
			`[0x8e] = 0x00c4,`
			`[0x8f] = 0x00c5,`
			`[0x90] = 0x00c9,`
			`[0x91] = 0x00e6,`
			`[0x92] = 0x00c6,`
			`[0x93] = 0x00f4,`
			`[0x94] = 0x00f6,`
			`[0x95] = 0x00f2,`
			`[0x96] = 0x00fb,`
			`[0x97] = 0x00f9,`
			`[0x98] = 0x00ff,`
			`[0x99] = 0x00d6,`
			`[0x9a] = 0x00dc,`
			`[0x9b] = 0x00a2,`
			`[0x9c] = 0x00a3,`
			`[0x9d] = 0x00a5,`
			`[0x9e] = 0x20a7,`
			`[0x9f] = 0x0192,`
			`[0xa0] = 0x00e1,`
			`[0xa1] = 0x00ed,`
			`[0xa2] = 0x00f3,`
			`[0xa3] = 0x00fa,`
			`[0xa4] = 0x00f1,`
			`[0xa5] = 0x00d1,`
			`[0xa6] = 0x00aa,`
			`[0xa7] = 0x00ba,`
			`[0xa8] = 0x00bf,`
			`[0xa9] = 0x2310,`
			`[0xaa] = 0x00ac,`
			`[0xab] = 0x00bd,`
			`[0xac] = 0x00bc,`
			`[0xad] = 0x00a1,`
			`[0xae] = 0x00ab,`
			`[0xaf] = 0x00bb,`
			`[0xb0] = 0x2591,`
			`[0xb1] = 0x2592,`
			`[0xb2] = 0x2593,`
			`[0xb3] = 0x2502,`
			`[0xb4] = 0x2524,`
			`[0xb5] = 0x2561,`
			`[0xb6] = 0x2562,`
			`[0xb7] = 0x2556,`
			`[0xb8] = 0x2555,`
			`[0xb9] = 0x2563,`
			`[0xba] = 0x2551,`
			`[0xbb] = 0x2557,`
			`[0xbc] = 0x255d,`
			`[0xbd] = 0x255c,`
			`[0xbe] = 0x255b,`
			`[0xbf] = 0x2510,`
			`[0xc0] = 0x2514,`
			`[0xc1] = 0x2534,`
			`[0xc2] = 0x252c,`
			`[0xc3] = 0x251c,`
			`[0xc4] = 0x2500,`
			`[0xc5] = 0x253c,`
			`[0xc6] = 0x255e,`
			`[0xc7] = 0x255f,`
			`[0xc8] = 0x255a,`
			`[0xc9] = 0x2554,`
			`[0xca] = 0x2569,`
			`[0xcb] = 0x2566,`
			`[0xcc] = 0x2560,`
			`[0xcd] = 0x2550,`
			`[0xce] = 0x256c,`
			`[0xcf] = 0x2567,`
			`[0xd0] = 0x2568,`
			`[0xd1] = 0x2564,`
			`[0xd2] = 0x2565,`
			`[0xd3] = 0x2559,`
			`[0xd4] = 0x2558,`
			`[0xd5] = 0x2552,`
			`[0xd6] = 0x2553,`
			`[0xd7] = 0x256b,`
			`[0xd8] = 0x256a,`
			`[0xd9] = 0x2518,`
			`[0xda] = 0x250c,`
			`[0xdb] = 0x2588,`
			`[0xdc] = 0x2584,`
			`[0xdd] = 0x258c,`
			`[0xde] = 0x2590,`
			`[0xdf] = 0x2580,`
			`[0xe0] = 0x03b1,`
			`[0xe1] = 0x00df,`
			`[0xe2] = 0x0393,`
			`[0xe3] = 0x03c0,`
			`[0xe4] = 0x03a3,`
			`[0xe5] = 0x03c3,`
			`[0xe6] = 0x00b5,`
			`[0xe7] = 0x03c4,`
			`[0xe8] = 0x03a6,`
			`[0xe9] = 0x0398,`
			`[0xea] = 0x03a9,`
			`[0xeb] = 0x03b4,`
			`[0xec] = 0x221e,`
			`[0xed] = 0x03c6,`
			`[0xee] = 0x03b5,`
			`[0xef] = 0x2229,`
			`[0xf0] = 0x2261,`
			`[0xf1] = 0x00b1,`
			`[0xf2] = 0x2265,`
			`[0xf3] = 0x2264,`
			`[0xf4] = 0x2320,`
			`[0xf5] = 0x2321,`
			`[0xf6] = 0x00f7,`
			`[0xf7] = 0x2248,`
			`[0xf8] = 0x00b0,`
			`[0xf9] = 0x2219,`
			`[0xfa] = 0x00b7,`
			`[0xfb] = 0x221a,`
			`[0xfc] = 0x207f,`
			`[0xfd] = 0x00b2,`
			`[0xfe] = 0x25a0,`
			`[0xff] = 0x00a0,`
			`}`
			`local cp437_encode = tableaux.invert(cp437_decode)`

			`---Encode a Unicode code point to CP437`
			`--`
			`-- Returns nil if the code point cannot be found in CP437`
			`--@param cp The Unicode code point as a number`
			`--@return A string containing the related CP437 character`
			`function cp437_enc(cp)`
			`if cp < 0x80 then`
			`return char(cp)`
			`else`
			`local bv = cp437_encode[cp]`
			`if bv == nil then`
			`return nil`
			`else`
			`return char(bv)`
			`end`
			`end`
			`end`

			`---Decodes a CP437 character`
			`--@param buf A string containing the character`
			`--@param pos The index in the string where the character begins`
			`--@return pos The index in the string where the character ended`
			`--@return cp The code point of the character as a number`
			`function cp437_dec(buf, pos)`
			`pos = pos or 1`
			`local bv = byte(buf, pos)`
			`if bv < 0x80 then`
			`return pos + 1, bv`
			`else`
			`return pos + 1, cp437_decode[bv]`
			`end`
			`end`

			`---Helper function for the common case of UTF-16 to UTF-8 transcoding, such as`
			`--from a Windows/SMB unicode string to a printable ASCII (subset of UTF-8)`
			`--string.`
			`--@param from A string in UTF-16, little-endian`
			`--@return The string in UTF-8`
			`function utf16to8(from)`
			`return transcode(from, utf16_dec, utf8_enc, false, nil)`
			`end`

			`---Helper function for the common case of UTF-8 to UTF-16 transcoding, such as`
			`--from a printable ASCII (subset of UTF-8) string to a Windows/SMB unicode`
			`--string.`
			`--@param from A string in UTF-8`
			`--@return The string in UTF-16, little-endian`
			`function utf8to16(from)`
			`return transcode(from, utf8_dec, utf16_enc, nil, false)`
			`end`

			`if not unittest.testing() then`
			`return _ENV`
			`end`

			`test_suite = unittest.TestSuite:new()`
			`test_suite:add_test(function()`
			`local pos, cp = utf8_dec("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E")`
			`return pos == 4 and cp == 0x65E5, string.format("Expected 4, 0x65E5; got %d, 0x%x", pos, cp)`
			`end, "utf8_dec")`

			`test_suite:add_test(unittest.equal(encode({0x65E5,0x672C,0x8A9E}, utf8_enc), "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"),"encode utf-8")`
			`test_suite:add_test(unittest.equal(encode({0x12345,61,82,97}, utf16_enc), "\x08\xD8\x45\xDF=\0R\0a\0"),"encode utf-16")`
			`test_suite:add_test(unittest.equal(encode({0x12345,61,82,97}, utf16_enc, true), "\xD8\x08\xDF\x45\0=\0R\0a"),"encode utf-16, big-endian")`
			`test_suite:add_test(unittest.table_equal(decode("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E", utf8_dec), {0x65E5,0x672C,0x8A9E}),"decode utf-8")`
			`test_suite:add_test(unittest.table_equal(decode("\x08\xD8\x45\xDF=\0R\0a\0", utf16_dec), {0x12345,61,82,97}),"decode utf-16")`
			`test_suite:add_test(unittest.table_equal(decode("\xD8\x08\xDF\x45\0=\0R\0a", utf16_dec, true), {0x12345,61,82,97}),"decode utf-16, big-endian")`
			`test_suite:add_test(unittest.equal(utf16to8("\x08\xD8\x45\xDF=\0R\0a\0"), "\xF0\x92\x8D\x85=Ra"),"utf16to8")`
			`test_suite:add_test(unittest.equal(utf8to16("\xF0\x92\x8D\x85=Ra"), "\x08\xD8\x45\xDF=\0R\0a\0"),"utf8to16")`
			`test_suite:add_test(unittest.equal(encode({0x221e, 0x2248, 0x30}, cp437_enc), "\xec\xf70"), "encode cp437")`
			`test_suite:add_test(unittest.table_equal(decode("\x81ber", cp437_dec), {0xfc, 0x62, 0x65, 0x72}), "decode cp437")`
			`test_suite:add_test(unittest.equal(chardet("\x08\xD8\x45\xDF=\0R\0a\0"), 'utf-16le'), "detect utf-16le")`
			`test_suite:add_test(unittest.equal(chardet("\xD8\x08\xDF\x45\0=\0R\0a"), 'utf-16be'), "detect utf-16be")`
			`test_suite:add_test(unittest.equal(chardet("...\xF0\x92\x8D\x85=Ra"), 'utf-8'), "detect utf-8")`
			`test_suite:add_test(unittest.equal(chardet("This sentence is completely normal."), 'ascii'), "detect ascii")`
			`test_suite:add_test(unittest.equal(chardet('Comme ci, comme \xe7a'), 'other'), "detect other")`

			`return _ENV`