472 lines
12 KiB
VimL
472 lines
12 KiB
VimL
" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
|
|
" The format of the UnicodeData.txt file is explained here:
|
|
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
|
|
" For the other files see the header.
|
|
"
|
|
" Might need to update the URL to the emoji-data.txt
|
|
" Usage: Vim -S <this-file>
|
|
"
|
|
" Author: Bram Moolenaar
|
|
" Last Update: 2020 Aug 24
|
|
|
|
" Parse lines of UnicodeData.txt. Creates a list of lists in s:dataprops.
|
|
func! ParseDataToProps()
  " Split every line of the current buffer on ';' (surrounding white space is
  " dropped, empty fields are kept) and store the resulting lists in
  " s:dataprops.  Each line must yield exactly 15 fields; an error is reported
  " and parsing stops at the first line that does not.
  let s:dataprops = []
  for lnum in range(1, line('$'))
    let fields = split(getline(lnum), '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
|
|
|
|
" Parse lines of CaseFolding.txt. Creates a list of lists in s:foldprops.
|
|
func! ParseFoldProps()
  " Store the ';'-separated fields of every data line of the current buffer in
  " s:foldprops.  Lines starting with '#' and blank lines are skipped.  Each
  " data line must have exactly 4 fields; an error is reported and parsing
  " stops at the first one that does not.
  let s:foldprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or empty line.
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
|
|
|
|
" Parse lines of EastAsianWidth.txt. Creates a list of lists in s:widthprops.
|
|
func! ParseWidthProps()
  " Store the ';'-separated fields of every data line of the current buffer in
  " s:widthprops.  Lines starting with '#' and blank lines are skipped.  Each
  " data line must have exactly 2 fields; an error is reported and parsing
  " stops at the first one that does not.
  let s:widthprops = []
  for lnum in range(1, line('$'))
    let text = getline(lnum)
    if text =~ '^#' || text =~ '^\s*$'
      " Comment or empty line.
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
|
|
|
|
" Build the toLower or toUpper table in a new buffer.
|
|
" Uses s:dataprops.
|
|
func! BuildCaseTable(name, index)
  " Walk over s:dataprops and group characters that have a mapping in field
  " {index} into ranges with the same offset to the mapped character and the
  " same distance between consecutive characters.  The ranges are formatted
  " by Range() and written as the "to{name}" table into a new buffer.
  let first = -1
  let last = -1
  let stride = 0
  let delta = -1
  let rows = []
  for p in s:dataprops
    if p[a:index] == ''
      " This character has no mapping.
      continue
    endif
    let n = ('0x' . p[0]) + 0
    let mapped = ('0x' . p[a:index]) + 0
    if first >= 0 && delta == mapped - n && (stride == 0 || n - last == stride)
      " Same offset and same spacing: extend the current range.
      let stride = n - last
      let last = n
      continue
    endif
    if first >= 0
      " Produce the range collected so far.
      call Range(rows, first, last, stride, delta)
    endif
    " Start a new range at this character.
    let [first, last, stride, delta] = [n, n, 0, mapped - n]
  endfor
  if first >= 0
    call Range(rows, first, last, stride, delta)
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, ["static convertStruct to" . a:name . "[] =", "{"] + rows)
  " Remove the comma after the last entry.
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
|
|
|
|
" Build the foldCase table in a new buffer.
|
|
" Uses s:foldprops.
|
|
func! BuildFoldTable()
  " Walk over s:foldprops and group the entries with status 'C' or 'S' into
  " ranges with the same offset to the folded character and the same distance
  " between consecutive characters.  The ranges are formatted by Range() and
  " written as the "foldCase" table into a new buffer.
  let range_start = -1
  let range_end = -1
  let range_step = 0
  let offset = -1
  let rows = []
  for p in s:foldprops
    if p[1] != 'C' && p[1] != 'S'
      " Only the 'C' and 'S' entries are used.
      continue
    endif
    let n = ('0x' . p[0]) + 0
    let folded = ('0x' . p[2]) + 0
    let extends = range_start >= 0 && offset == folded - n
	  \ && (range_step == 0 || n - range_end == range_step)
    if extends
      " Continue with the same range.
      let range_step = n - range_end
    else
      if range_start >= 0
	" Produce the previous range.
	call Range(rows, range_start, range_end, range_step, offset)
      endif
      let range_start = n
      let range_step = 0
      let offset = folded - n
    endif
    let range_end = n
  endfor
  if range_start >= 0
    call Range(rows, range_start, range_end, range_step, offset)
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, ["static convertStruct foldCase[] =", "{"])
  call append('$', rows)
  " Remove the comma after the last entry.
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
|
|
|
|
func! Range(ranges, start, end, step, add)
  " Append one formatted table entry "{start,end,step,add}," to the list
  " {ranges}.  {start} and {end} are written in hex.  A {step} of zero is
  " written as -1.
  let step = a:step == 0 ? -1 : a:step
  call add(a:ranges, printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, step, a:add))
endfunc
|
|
|
|
" Build the combining table.
|
|
" Uses s:dataprops.
|
|
func! BuildCombiningTable()
  " Collect the characters in s:dataprops whose third field is 'Mn' or 'Me'
  " into ranges of consecutive code points and write them as the "combining"
  " table into a new buffer.
  let first = -1
  let last = -1
  let rows = []
  for p in s:dataprops
    " The 'Mc' property was removed, it does take up space.
    if p[2] != 'Mn' && p[2] != 'Me'
      continue
    endif
    let n = ('0x' . p[0]) + 0
    if first < 0 || last + 1 != n
      " Not adjacent to the current range: start a new one.
      if first >= 0
	" Produce the previous range.
	call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
      endif
      let first = n
    endif
    let last = n
  endfor
  if first >= 0
    call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, [" static struct interval combining[] =", " {"] + rows)
  " Remove the comma after the last entry.
  call setline('$', getline('$')[:-2])
  call append('$', " };")
  wincmd p
endfunc
|
|
|
|
" Build the double width or ambiguous width table in a new buffer.
|
|
" Uses s:widthprops and s:dataprops.
|
|
func! BuildWidthTable(pattern, tableName)
  " Build a table with the ranges of characters from s:widthprops whose width
  " class (first character of the second field) matches {pattern} and put it
  " in a new buffer named {tableName}.  Every range is also added to
  " s:ambitable when {pattern} is 'A', to s:doubletable otherwise.
  "
  " A single character is only used when it is not a composing character
  " according to s:dataprops.  All characters from a range are used.
  "
  " The function gives up with an error message when a range cannot be parsed
  " or when a character lies beyond the last entry of s:dataprops.
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  let spc = a:pattern == 'A' ? ' ' : "\t"

  for p in s:widthprops
    if p[1][0] !~ a:pattern
      continue
    endif

    if p[0] =~ '\.\.'
      " It is a range.  We don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
	echoerr "Cannot parse range: '" . p[0] . "' in width table"
	" Without both ends of the range there is nothing sensible to do.
	return
      endif
      let n = ('0x' . rng[0]) + 0
      let n_last = ('0x' . rng[1]) + 0
    else
      let n = ('0x' . p[0]) + 0
      let n_last = n
    endif

    " Find this char in the data table.  The index only moves forward, both
    " tables are sorted.  Stop at the end of the data table, otherwise every
    " further iteration would fail with an index error and never terminate.
    while dataidx < datalen
      let dn = ('0x' . s:dataprops[dataidx][0]) + 0
      if dn >= n
	break
      endif
      let dataidx += 1
    endwhile
    if dataidx >= datalen
      echoerr "Cannot find character " . n . " in data table"
      return
    endif
    if dn != n && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    let dp = s:dataprops[dataidx]
    if n_last == n && (dp[2] == 'Mn' || dp[2] == 'Mc' || dp[2] == 'Me')
      continue
    endif

    if start < 0 || end + 1 != n
      " Not adjacent to the current range: start a new one.
      if start >= 0
	" Produce the previous range.
	call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
	if a:pattern == 'A'
	  call add(s:ambitable, [start, end])
	else
	  call add(s:doubletable, [start, end])
	endif
      endif
      let start = n
    endif
    let end = n_last
  endfor

  if start >= 0
    call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
    if a:pattern == 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  if a:pattern == 'A'
    call setline(1, "static struct interval " . a:tableName . "[] =")
    call setline(2, "{")
  else
    call setline(1, " static struct interval " . a:tableName . "[] =")
    call setline(2, " {")
  endif
  call append('$', ranges)
  call setline('$', getline('$')[:-2]) " remove last comma
  if a:pattern == 'A'
    call setline(line('$') + 1, "};")
  else
    call setline(line('$') + 1, " };")
  endif
  wincmd p
endfunc
|
|
|
|
|
|
" Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
|
|
" and put them in dictionary "chardict"
|
|
func! AddLinesToCharDict(lines, chardict)
  " For every line in {lines} that starts with a hex code point "12ab" or a
  " range "12ab..56cd", add an entry with value 1 to {chardict} for each
  " character number.  Whatever follows the code point or range is ignored.
  " Lines that do not start with a hex number are skipped.
  for line in a:lines
    " Only look at the leading token, so that a ".." further on in the line
    " is not mistaken for the range separator.
    let spec = matchstr(line, '^\x\+\%(\.\.\x\+\)\=')
    if spec == ''
      continue
    endif
    let tokens = split(spec, '\.\.')
    let first = str2nr(tokens[0], 16)
    let last = len(tokens) == 1 ? first : str2nr(tokens[1], 16)
    for nr in range(first, last)
      let a:chardict[nr] = 1
    endfor
  endfor
endfunc
|
|
|
|
func Test_AddLinesToCharDict()
  " Check AddLinesToCharDict() with single code points and a range that
  " overlaps a single code point.  Returns 1 and reports v:errors when the
  " check failed, 0 otherwise.
  let input = [
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ]
  let expected = {0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }
  let dict = {}
  call AddLinesToCharDict(input, dict)
  call assert_equal(expected, dict)
  if v:errors == []
    return 0
  endif
  echoerr 'AddLinesToCharDict' v:errors
  return 1
endfunc
|
|
|
|
|
|
func! CharDictToPairList(chardict)
  " Turn the character numbers that are the keys of {chardict} into a sorted
  " list of [low, high] pairs, one pair for each run of consecutive numbers.
  " Returns an empty list when {chardict} is empty.
  let result = []
  let codes = sort(map(keys(a:chardict), 'str2nr(v:val)'), 'N')
  if empty(codes)
    " Nothing to do; avoids indexing an empty list below.
    return result
  endif
  let low = codes[0]
  let high = codes[0]
  for code in codes[1:]
    if code > high + 1
      " There is a gap: finish the current run and start a new one.
      call add(result, [low, high])
      let low = code
    endif
    let high = code
  endfor
  call add(result, [low, high])
  return result
endfunc
|
|
|
|
func Test_CharDictToPairList()
  " Check CharDictToPairList() with a dictionary that holds runs of three,
  " one, one and two consecutive characters.  Returns 1 and reports v:errors
  " when the check failed, 0 otherwise.
  let expected = [
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ]
  let dict = {}
  for nr in [0x1020, 0x1021, 0x1022, 0x1024, 0x2022, 0x2024, 0x2025]
    let dict[nr] = 1
  endfor
  call assert_equal(expected, CharDictToPairList(dict))
  if v:errors == []
    return 0
  endif
  echoerr 'CharDictToPairList' v:errors
  return 1
endfunc
|
|
|
|
|
|
" Build the emoji width table in a new buffer.
|
|
func BuildEmojiTable()
  " Build two tables from the emoji data in the current buffer, each in a new
  " buffer:
  " - "emoji_all" with all characters that have the Emoji property
  " - "emoji_wide" with the characters that have the Emoji_Presentation or
  "   Emoji_Modifier_Base property and that are not in s:ambitable or
  "   s:doubletable
  " Only lines that start with a digit 1 to 9 are used.

  " First make the table for all emojis.
  " Note: "pattern" is referenced by name in the filter expression.
  let pattern = '; Emoji\s\+#\s'
  let lines = getline(1, '$')
  call filter(lines, 'v:val=~"^[1-9]"')
  call filter(lines, 'v:val=~pattern')
  call map(lines, 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)
  let allranges = map(CharDictToPairList(chardict),
	\ 'printf(" {0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_all'
  call setline(1, ["static struct interval emoji_all[] =", "{"] + allranges)
  " Remove the comma after the last entry.
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let lines = getline(1, '$')
  call filter(lines, 'v:val=~"^[1-9]"')
  call filter(lines, 'v:val=~pattern')
  call map(lines, 'matchstr(v:val,"^\\S\\+")')

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(lines, chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for [low, high] in s:ambitable
    for nr in range(low, high)
      if has_key(chardict, nr)
	call remove(chardict, nr)
      endif
    endfor
  endfor

  for [low, high] in s:doubletable
    for nr in range(low, high)
      if has_key(chardict, nr)
	call remove(chardict, nr)
      endif
    endfor
  endfor

  let wide_ranges = map(CharDictToPairList(chardict),
	\ 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe 'file emoji_wide'
  call setline(1, [" static struct interval emoji_wide[] =", " {"] + wide_ranges)
  " Remove the comma after the last entry.
  call setline('$', getline('$')[:-2])
  call append('$', " };")
  wincmd p
endfunc
|
|
|
|
" First test a few things
|
|
let v:errors = []
if Test_AddLinesToCharDict() || Test_CharDictToPairList()
  " A self-test failed, the generated tables could not be trusted.
  finish
endif


" Try to avoid hitting E36
set equalalways

" All Unicode data files are fetched from this one location, over HTTPS.
" Change it here to use a mirror or a specific Unicode version.
let s:ucd_url = 'https://www.unicode.org/Public/UNIDATA/'

" Edit the Unicode text file.  Requires the netrw plugin.
exe 'edit ' . s:ucd_url . 'UnicodeData.txt'

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
exe 'edit ' . s:ucd_url . 'CaseFolding.txt'

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
exe 'edit ' . s:ucd_url . 'EastAsianWidth.txt'

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
" The list must exist before the call, BuildWidthTable() adds to it.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
" The list must exist before the call, BuildWidthTable() adds to it.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')
|
|
|
|
" Edit the emoji text file. Requires the netrw plugin.
|
|
" commented out, because it drops too many characters
|
|
"edit https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
|
|
"
|
|
"" Build the emoji table. Ver. 1.0 - 6.0
|
|
"" Must come after the "ambiguous" and "doublewidth" tables
|
|
"call BuildEmojiTable()
|