480 lines
12 KiB
Lua
480 lines
12 KiB
Lua
---
|
|
-- Library methods for handling punycode strings.
|
|
--
|
|
-- Punycode is a simple and efficient transfer encoding syntax designed
|
|
-- for use with Internationalized Domain Names in Applications (IDNA).
|
|
-- It uniquely and reversibly transforms a Unicode string into an ASCII
|
|
-- string. ASCII characters in the Unicode string are represented
|
|
-- literally, and non-ASCII characters are represented by ASCII
|
|
-- characters that are allowed in host name labels (letters, digits, and
|
|
-- hyphens). This document defines a general algorithm called
|
|
-- Bootstring that allows a string of basic code points to uniquely
|
|
-- represent any string of code points drawn from a larger set.
|
|
-- Punycode is an instance of Bootstring that uses particular parameter
|
|
-- values specified by this document, appropriate for IDNA.
|
|
--
|
|
-- Advantages of Bootstring algorithm are Completeness, Uniqueness,
|
|
-- Reversibility, Efficient encoding, Simplicity and Readability.
|
|
--
|
|
-- Portions of this library were adapted from punycode.js by Mathias Bynens
|
|
-- under the MIT License.
|
|
--
|
|
-- References:
|
|
-- * http://ietf.org/rfc/rfc3492.txt
|
|
-- * punycode.js: https://mths.be/punycode
|
|
--
|
|
-- @author Rewanth Cool
|
|
-- @copyright Same as Nmap--See https://nmap.org/book/man-legal.html
|
|
|
|
local stdnse = require "stdnse"
|
|
local string = require "string"
|
|
local math = require "math"
|
|
local table = require "table"
|
|
local unicode = require "unicode"
|
|
local unittest = require "unittest"
|
|
|
|
_ENV = stdnse.module("punycode", stdnse.seeall)
|
|
|
|
-- Localize few functions for a tiny speed boost, since these will be
|
|
-- used frequently.
|
|
local floor = math.floor
|
|
local byte = string.byte
|
|
local char = string.char
|
|
local find = string.find
|
|
local match = string.match
|
|
local reverse = string.reverse
|
|
local sub = string.sub
|
|
|
|
-- Highest positive signed 32-bit float value
|
|
local maxInt = 0x7FFFFFFF
|
|
|
|
-- Bootstring parameters
|
|
local base = 0x24
|
|
local tMin = 0x1
|
|
local tMax = 0x1A
|
|
local skew = 0x26
|
|
local damp = 0x2BC
|
|
local initialBias = 0x48
|
|
local initialN = 0x80
|
|
local delimiter = char("0x2D")
|
|
|
|
-- Convenience shortcuts
|
|
local baseMinusTMin = base - tMin
|
|
|
|
-- Bias adaptation function as per section 3.4 of RFC 3492.
|
|
-- https://tools.ietf.org/html/rfc3492#section-3.4
|
|
-- The following function is adapted from punycode.js by Mathias Bynens
|
|
-- under the MIT License.
|
|
local function adapt(delta, numPoints, firstTime)
|
|
|
|
local k = 0;
|
|
|
|
if firstTime then
|
|
delta = floor(delta / damp)
|
|
else
|
|
delta = (delta >> 1)
|
|
end
|
|
|
|
delta = delta + floor(delta / numPoints)
|
|
|
|
while delta > (baseMinusTMin * tMax >> 1) do
|
|
delta = floor(delta / baseMinusTMin)
|
|
k = k + base
|
|
end
|
|
|
|
return floor(k + (baseMinusTMin + 1) * delta / (delta + skew))
|
|
|
|
end
|
|
|
|
-- The following function converts boolean value to integer.
|
|
--
|
|
-- @param status boolean value is given as input.
|
|
-- @return Returns 0/1 based on the given boolean input.
|
|
local function boolToNum(status)
|
|
|
|
if status == true then
|
|
return 1
|
|
else
|
|
return 0
|
|
end
|
|
|
|
end
|
|
|
|
-- This function converts a basic code point into a digit/integer.
|
|
--
|
|
-- @param codePoint The basic numeric code point value.
|
|
-- @return The numeric value of a basic code point (for use in
|
|
-- representing integers) in the range `0` to `base - 1`, or `base` if
|
|
-- the code point does not represent a value.
|
|
-- The following function is adapted from punycode.js by Mathias Bynens
|
|
-- under the MIT License.
|
|
local function basicToDigit(codePoint)
|
|
|
|
if (codePoint - 0x30 < 0x0A) then
|
|
return codePoint - 0x16
|
|
end
|
|
if (codePoint - 0x41 < 0x1A) then
|
|
return codePoint - 0x41
|
|
end
|
|
if (codePoint - 0x61 < 0x1A) then
|
|
return codePoint - 0x61
|
|
end
|
|
|
|
return base
|
|
|
|
end
|
|
|
|
|
|
-- This function converts a digit/integer into a basic code point.
|
|
--
|
|
-- @param digit The numeric value of a basic code point.
|
|
-- @return The basic code point whose value (when used for
|
|
-- representing integers) is `digit`, which needs to be in the range
|
|
-- `0` to `base - 1`. If `flag` is non-zero, the uppercase form is
|
|
-- used; else, the lowercase form is used. The behavior is undefined
|
|
-- if `flag` is non-zero and `digit` has no uppercase form.
|
|
-- The following function is adapted from punycode.js by Mathias Bynens
|
|
-- under the MIT License.
|
|
local function digitToBasic(digit, flag)
|
|
-- 0..25 map to ASCII a..z or A..Z
|
|
-- 26..35 map to ASCII 0..9
|
|
return digit + 22 + 75 * boolToNum(digit < 26) - (boolToNum((flag ~= 0)) << 5)
|
|
end
|
|
|
|
-- Creates a string based on an array of numeric code points.
|
|
--
|
|
-- @param input list-table of Unicode code points
|
|
-- @param decoder Sets the decoding format to be used.
|
|
-- @return The new encoded string
|
|
-- The following function is adapted from punycode.js by Mathias Bynens
|
|
-- under the MIT License.
|
|
function encode_input(input)
|
|
|
|
local output = {}
|
|
|
|
-- Cache the length.
|
|
local inputLength = #input
|
|
|
|
-- Initialize the state.
|
|
local n = initialN
|
|
local delta = 0
|
|
local bias = initialBias
|
|
|
|
-- Handle the basic code points.
|
|
for _, v in ipairs(input) do
|
|
if v < 0x80 then
|
|
table.insert(output, char(v))
|
|
end
|
|
end
|
|
|
|
local basicLength = #output
|
|
local handledCPCount = basicLength
|
|
|
|
-- `handledCPCount` is the number of code points that have been handled
|
|
-- `basicLength` is the number of basic code points.
|
|
-- Finish the basic string with a delimiter unless it's empty.
|
|
if (basicLength > 0) then
|
|
table.insert(output, delimiter)
|
|
end
|
|
|
|
-- Main encoding loop:
|
|
while (handledCPCount < inputLength) do
|
|
-- All non-basic code points < n have been handled already. Find
|
|
-- the next larger one:
|
|
local m = maxInt
|
|
for _, v in ipairs(input) do
|
|
if v >= n and v < m then
|
|
m = v
|
|
end
|
|
end
|
|
|
|
-- Increase `delta` enough to advance the decoder's <n,i> state to
|
|
-- <m,0>, but guard against overflow.
|
|
local handledCPCountPlusOne = handledCPCount + 1
|
|
if (m - n > floor((maxInt - delta) / handledCPCountPlusOne)) then
|
|
--error('overflow')
|
|
return nil, "Overflow exception occurred."
|
|
end
|
|
|
|
delta = delta + (m - n) * handledCPCountPlusOne
|
|
n = m
|
|
|
|
for _, currentValue in ipairs(input) do
|
|
|
|
if currentValue < n then
|
|
delta = delta + 1 --Move this down incase of wrong answer
|
|
if delta > maxInt then
|
|
--error("overflow")
|
|
return nil, "Overflow exception occurred."
|
|
end
|
|
end
|
|
|
|
if (currentValue == n) then
|
|
-- Represent delta as a generalized variable-length integer.
|
|
local q = delta
|
|
local k = base
|
|
|
|
repeat
|
|
local t
|
|
|
|
if k <= bias then
|
|
t = tMin
|
|
else
|
|
if k >= bias + tMax then
|
|
t = tMax
|
|
else
|
|
t = k - bias
|
|
end
|
|
end
|
|
|
|
if q < t then
|
|
break
|
|
end
|
|
|
|
local qMinusT = q - t
|
|
local baseMinusT = base - t
|
|
local ans = digitToBasic(t + qMinusT % baseMinusT, 0)
|
|
|
|
table.insert(output, char(ans))
|
|
|
|
q = floor(qMinusT / baseMinusT)
|
|
|
|
k = k + base
|
|
until false
|
|
|
|
local ans = digitToBasic(q, 0)
|
|
table.insert(output, char(ans))
|
|
bias = adapt(delta, handledCPCountPlusOne, handledCPCount == basicLength)
|
|
|
|
delta = 0
|
|
handledCPCount = handledCPCount + 1
|
|
end
|
|
end
|
|
|
|
delta = delta + 1
|
|
n = n + 1
|
|
|
|
end
|
|
|
|
return table.concat(output, '')
|
|
|
|
end
|
|
|
|
-- Converts a Punycode string of ASCII-only symbols to a
|
|
-- list-table of Unicode code points.
|
|
--
|
|
-- @param input The Punycode string of ASCII-only symbols.
|
|
-- @return The resulting list-table of Unicode code points.
|
|
-- The following function is adapted from punycode.js by Mathias Bynens
|
|
-- under the MIT License.
|
|
function decode_input(input)
|
|
|
|
local output = {}
|
|
local inputLength = #input
|
|
local i = 0
|
|
local n = initialN
|
|
local bias = initialBias
|
|
|
|
local basic
|
|
if find(reverse(input), delimiter) then
|
|
basic = #input - find(reverse(input), delimiter)
|
|
else
|
|
basic = -1
|
|
end
|
|
|
|
if basic < 0 then
|
|
basic = 0
|
|
end
|
|
|
|
for j = 1, basic do
|
|
local c = sub(input, j, j)
|
|
local value = byte(c)
|
|
|
|
if value >= 0x80 then
|
|
--error("Not basic")
|
|
return nil, "Not basic exception occurred."
|
|
end
|
|
table.insert(output, value)
|
|
end
|
|
|
|
local index
|
|
if basic > 0 then
|
|
index = basic + 1
|
|
else
|
|
index = 0
|
|
end
|
|
|
|
while index < inputLength do
|
|
local oldi = i
|
|
local w = 1
|
|
local k = base
|
|
|
|
repeat
|
|
|
|
if index >= inputLength then
|
|
--error("Invalid input")
|
|
return nil, "Invalid input exception occurred."
|
|
end
|
|
|
|
local c = sub(input, index+1, index+1)
|
|
local value = byte(c)
|
|
local digit = basicToDigit(value)
|
|
|
|
index = index + 1
|
|
|
|
if (digit >= base or digit > floor((maxInt - i) / w)) then
|
|
--error('overflow');
|
|
return nil, "Overflow exception occurred."
|
|
end
|
|
i = i + digit * w;
|
|
|
|
local t
|
|
if k <= bias then
|
|
t = tMin
|
|
else
|
|
if k >= bias + tMax then
|
|
t = tMax
|
|
else
|
|
t = k - bias
|
|
end
|
|
end
|
|
|
|
if digit < t then
|
|
break
|
|
end
|
|
|
|
local baseMinusT = base - t;
|
|
if (w > floor(maxInt / baseMinusT)) then
|
|
--error('overflow');
|
|
return nil, "Overflow exception occurred."
|
|
end
|
|
|
|
w = w * baseMinusT;
|
|
k = k + base
|
|
|
|
until false
|
|
|
|
local out = #output + 1;
|
|
|
|
bias = adapt(i - oldi, out, oldi == 0)
|
|
|
|
-- `i` was supposed to wrap around from `out` to `0`,
|
|
-- incrementing `n` each time, so we'll fix that now:
|
|
if (floor(i / out) > maxInt - n) then
|
|
--error('overflow');
|
|
return nil, "Overflow exception occurred."
|
|
end
|
|
|
|
n = n + floor(i / out);
|
|
i = i % out;
|
|
for temp = #output, i, -1 do
|
|
output[temp+1] = output[temp]
|
|
end
|
|
output[i+1] = n
|
|
i = i + 1
|
|
end
|
|
|
|
return output
|
|
|
|
end
|
|
|
|
-- Performs punycode encoding on a label
|
|
--
|
|
-- If the label is already ASCII, it is returned as a string. If any encoding
|
|
-- was required, the "xn--" prefix is added.
|
|
--
|
|
-- @param u A list-table of Unicode code points representing a domain label
|
|
-- @return A punycode-encoded ASCII string
|
|
function encode_label(u)
|
|
|
|
local flag = false
|
|
|
|
-- Looks for non-ASCII character
|
|
for _, val in pairs(u) do
|
|
|
|
if not (val >=0 and val <= 127) then
|
|
flag = true
|
|
break
|
|
end
|
|
|
|
end
|
|
|
|
if flag then
|
|
|
|
local res, err = encode_input(u)
|
|
if err then
|
|
return nil, err
|
|
end
|
|
|
|
return 'xn--' .. res
|
|
|
|
else
|
|
return unicode.encode(u, unicode.utf8_enc)
|
|
end
|
|
|
|
end
|
|
|
|
--- Decodes a punycode-encoded label to Unicode.
|
|
--
|
|
-- If the label starts with "xn--", it will be punycode-decoded. Otherwise, it
|
|
-- will be decoded as UTF-8 (ASCII). The return value is always a table of
|
|
-- Unicode code points.
|
|
--
|
|
-- @param s String of input.
|
|
-- @return A table of Unicode code points.
|
|
function decode_label(s)
|
|
|
|
if match(s, "^xn%-%-") then
|
|
|
|
local res, err = decode_input(sub(s, 5))
|
|
if err then
|
|
return nil, err
|
|
end
|
|
|
|
return res
|
|
|
|
else
|
|
return unicode.decode(s, unicode.utf8_dec)
|
|
end
|
|
|
|
end
|
|
|
|
--Ignore the rest if we are not testing.
|
|
if not unittest.testing() then
|
|
return _ENV
|
|
end
|
|
|
|
-- Table of punycode test cases.
|
|
local testCases = {
|
|
{
|
|
"xn--0zwm56d",
|
|
"\xe6\xb5\x8b\xe8\xaf\x95"
|
|
},
|
|
{
|
|
"xn--knigsgsschen-lcb0w",
|
|
"k\xc3\xb6nigsg\xc3\xa4sschen"
|
|
},
|
|
{
|
|
"xn--ab-fsf",
|
|
"a\xe0\xa5\x8db"
|
|
},
|
|
{
|
|
"xn--maana-pta",
|
|
"ma\xc3\xb1ana"
|
|
},
|
|
{
|
|
"xn----dqo34k",
|
|
"\xe2\x98\x83-\xe2\x8c\x98"
|
|
}
|
|
}
|
|
|
|
test_suite = unittest.TestSuite:new()
|
|
|
|
-- Running test cases against Encoding function.
|
|
for i, v in ipairs(testCases) do
|
|
test_suite:add_test(unittest.equal(unicode.encode(decode_label(v[1]), unicode.utf8_enc), v[2]))
|
|
test_suite:add_test(unittest.equal(encode_label(unicode.decode(v[2], unicode.utf8_dec)), v[1]))
|
|
end
|
|
|
|
return _ENV
|