mirror of
				https://github.com/vim/vim.git
				synced 2025-10-26 09:14:23 -04:00 
			
		
		
		
	Problem: Unicode tables are outdated Solution: Update Unicode tables to v16 closes: #15693 Signed-off-by: Christian Brabandt <cb@256bit.org>
		
			
				
	
	
		
			476 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			VimL
		
	
	
	
	
	
			
		
		
	
	
			476 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			VimL
		
	
	
	
	
	
| " Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
 | |
| " The format of the UnicodeData.txt file is explained here:
 | |
| " http://www.unicode.org/Public/5.1.0/ucd/UCD.html
 | |
| " For the other files see the header.
 | |
| "
 | |
| " Might need to update the URL to the emoji-data.txt
 | |
| " Usage: Vim -S <this-file>
 | |
| "
 | |
| " Author: Bram Moolenaar
 | |
| " Last Update: 2020 Aug 24
 | |
| 
 | |
| " Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
 | |
func! ParseDataToProps()
  " Split every line of the current buffer on ';' (white space around the
  " separator is dropped, empty fields are kept) and store the resulting
  " lists in s:dataprops, one list per line.
  " Each line must produce exactly 15 fields.  At the first line that does
  " not, an error is reported and parsing stops; s:dataprops then holds only
  " the lines before it.
  let s:dataprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 15
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, fields)
  endfor
endfunc
 | |
| 
 | |
| " Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
 | |
func! ParseFoldProps()
  " Collect the data lines of the current buffer in s:foldprops.
  " Lines starting with '#' and lines holding only white space are skipped.
  " Every other line is split on ';' (surrounding white space dropped, empty
  " fields kept) and must give exactly 4 fields; otherwise an error is
  " reported and parsing stops with the entries gathered so far.
  let s:foldprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    if text =~ '^#' || text =~ '^\s*$'
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 4
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 4'
      return
    endif
    call add(s:foldprops, fields)
  endfor
endfunc
 | |
| 
 | |
| " Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
 | |
func! ParseWidthProps()
  " Collect the data lines of the current buffer in s:widthprops.
  " Lines starting with '#' and lines holding only white space are skipped.
  " Every other line is split on ';' (surrounding white space dropped, empty
  " fields kept) and must give exactly 2 fields; otherwise an error is
  " reported and parsing stops with the entries gathered so far.
  let s:widthprops = []
  let lnum = 0
  for text in getline(1, '$')
    let lnum += 1
    if text =~ '^#' || text =~ '^\s*$'
      continue
    endif
    let fields = split(text, '\s*;\s*', 1)
    if len(fields) != 2
      echoerr 'Found ' . len(fields) . ' items in line ' . lnum . ', expected 2'
      return
    endif
    call add(s:widthprops, fields)
  endfor
endfunc
 | |
| 
 | |
| " Build the toLower or toUpper table in a new buffer.
 | |
| " Uses s:dataprops.
 | |
func! BuildCaseTable(name, index)
  " a:name   suffix of the table; the buffer and the C array are called
  "          "to" . a:name
  " a:index  index of the field in each s:dataprops entry that holds the
  "          hex code of the converted character (the script passes 13 for
  "          "Lower" and 12 for "Upper")
  "
  " Entries with an empty a:index field are ignored.  Consecutive entries
  " are merged into one range while the offset to the converted character
  " stays the same and the distance between code points stays constant.
  let first = -1
  let last = -1
  let stride = 0
  let delta = -1
  let rows = []
  for prop in s:dataprops
    if prop[a:index] == ''
      continue
    endif
    let code = ('0x' . prop[0]) + 0
    let mapped = ('0x' . prop[a:index]) + 0
    let extends = first >= 0 && delta == mapped - code
	  \ && (stride == 0 || code - last == stride)
    if extends
      " Same offset and same spacing: grow the current range.
      let stride = code - last
      let last = code
      continue
    endif
    if first >= 0
      " Flush the range that just ended.
      call Range(rows, first, last, stride, delta)
    endif
    let first = code
    let last = code
    let stride = 0
    let delta = mapped - code
  endfor
  if first >= 0
    call Range(rows, first, last, stride, delta)
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  execute "file to" . a:name
  call setline(1, ["static convertStruct to" . a:name . "[] =", "{"])
  call append('$', rows)
  " Drop the last character of the last line: the comma after the final row.
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
 | |
| 
 | |
| " Build the foldCase table in a new buffer.
 | |
| " Uses s:foldprops.
 | |
func! BuildFoldTable()
  " Only s:foldprops entries whose second field is 'C' or 'S' are used.
  " The first field is the hex code point, the third the hex code point it
  " folds to.  Consecutive entries are merged into one range while the
  " offset to the folded character stays the same and the distance between
  " code points stays constant.
  let first = -1
  let last = -1
  let stride = 0
  let delta = -1
  let rows = []
  for prop in s:foldprops
    if prop[1] != 'C' && prop[1] != 'S'
      continue
    endif
    let code = ('0x' . prop[0]) + 0
    let folded = ('0x' . prop[2]) + 0
    let extends = first >= 0 && delta == folded - code
	  \ && (stride == 0 || code - last == stride)
    if extends
      " Same offset and same spacing: grow the current range.
      let stride = code - last
      let last = code
      continue
    endif
    if first >= 0
      " Flush the range that just ended.
      call Range(rows, first, last, stride, delta)
    endif
    let first = code
    let last = code
    let stride = 0
    let delta = folded - code
  endfor
  if first >= 0
    call Range(rows, first, last, stride, delta)
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  file foldCase
  call setline(1, ["static convertStruct foldCase[] =", "{"])
  call append('$', rows)
  " Drop the last character of the last line: the comma after the final row.
  call setline('$', getline('$')[:-2])
  call append('$', "};")
  wincmd p
endfunc
 | |
| 
 | |
func! Range(ranges, start, end, step, add)
  " Append one table row of the form "\t{0xSTART,0xEND,STEP,ADD}," to the
  " list a:ranges.  a:start and a:end are written in hex, a:step and a:add
  " in decimal.  A step of 0 is written as -1.
  let step = a:step == 0 ? -1 : a:step
  let row = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, step, a:add)
  call add(a:ranges, row)
endfunc
 | |
| 
 | |
| " Build the combining table.
 | |
| " Uses s:dataprops.
 | |
func! BuildCombiningTable()
  " Collect the code points of all s:dataprops entries whose third field is
  " 'Mn' or 'Me', merge adjacent code points into {first, last} intervals and
  " write them as the C array "combining" in a new buffer of that name.
  let first = -1
  let last = -1
  let rows = []
  for prop in s:dataprops
    " The 'Mc' property was removed, it does take up space.
    if prop[2] != 'Mn' && prop[2] != 'Me'
      continue
    endif
    let code = ('0x' . prop[0]) + 0
    if first >= 0 && last + 1 == code
      " Directly follows the current interval: extend it.
      let last = code
      continue
    endif
    if first >= 0
      " Flush the interval that just ended.
      call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
    endif
    let first = code
    let last = code
  endfor
  if first >= 0
    call add(rows, printf("\t{0x%04x, 0x%04x},", first, last))
  endif

  " Put the result in a new buffer, then go back to the previous window.
  new
  file combining
  call setline(1, ["    static struct interval combining[] =", "    {"])
  call append('$', rows)
  " Drop the last character of the last line: the comma after the final row.
  call setline('$', getline('$')[:-2])
  call append('$', "    };")
  wincmd p
endfunc
 | |
| 
 | |
| " Build the double width or ambiguous width table in a new buffer.
 | |
| " Uses s:widthprops and s:dataprops.
 | |
func! BuildWidthTable(pattern, tableName)
  " a:pattern    regexp matched against the first character of the second
  "              field of each s:widthprops entry; only matching entries
  "              are used
  " a:tableName  name of the C array and of the buffer it is written to
  "
  " The first field of an entry is either one hex code point or a range
  " "first..last".  Adjacent code points are merged into {first, last}
  " intervals.  Every interval is also added as [first, last] to s:ambitable
  " when a:pattern is 'A', to s:doubletable otherwise; the caller must have
  " created that list.
  "
  " A single code point is left out when the s:dataprops entry found for it
  " has category 'Mn', 'Mc' or 'Me'.  Ranges are always used.
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  let datalen = len(s:dataprops)
  " Account for indentation differences between ambiguous and doublewidth
  " table in mbyte.c
  if a:pattern == 'A'
    let spc = '    '
    let hdr = ''
    let table = s:ambitable
  else
    let spc = "\t"
    let hdr = '    '
    let table = s:doubletable
  endif
  for p in s:widthprops
    if p[1][0] !~ a:pattern
      continue
    endif
    if p[0] =~ '\.\.'
      " It is a range.  We don't check for composing char then.
      let rng = split(p[0], '\.\.')
      if len(rng) != 2
        echoerr "Cannot parse range: '" . p[0] . "' in width table"
        " Without both end points nothing sensible can be added; going on
        " would reuse values left over from a previous entry.
        continue
      endif
      let n = ('0x' . rng[0]) + 0
      let n_last = ('0x' . rng[1]) + 0
    else
      let n = ('0x' . p[0]) + 0
      let n_last = n
    endif

    " Find this char in the data table.  The scan stops at the end of the
    " table, so that a code point beyond the last data entry cannot make
    " this loop run forever.
    let dn = -1
    while dataidx < datalen
      let dn = ('0x' . s:dataprops[dataidx][0]) + 0
      if dn >= n
        break
      endif
      let dataidx += 1
    endwhile
    if dn != n && n_last == n
      echoerr "Cannot find character " . n . " in data table"
    endif

    " Only use the char when it's not a composing char.
    " But use all chars from a range.
    " When the scan ran off the end of the data table there is no category
    " to look at; the character is then treated as not composing.
    let category = dataidx < datalen ? s:dataprops[dataidx][2] : ''
    if n_last > n || (category != 'Mn' && category != 'Mc' && category != 'Me')
      if start < 0 || end + 1 != n
        if start >= 0
          " produce previous range
          call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
          call add(table, [start, end])
        endif
        let start = n
      endif
      " Otherwise: continue with same range.
      let end = n_last
    endif
  endfor
  if start >= 0
    call add(ranges, printf("%s{0x%04x, 0x%04x},", spc, start, end))
    call add(table, [start, end])
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, hdr . "static struct interval " . a:tableName . "[] =")
  call setline(2, hdr . "{")
  call append('$', ranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call setline(line('$') + 1, hdr . "};")
  wincmd p
endfunc
 | |
| 
 | |
| 
 | |
| " Get characters from a list of lines in form "12ab .." or "12ab..56cd ..."
 | |
| " and put them in dictionary "chardict"
 | |
func AddLinesToCharDict(lines, chardict)
  " a:lines     list of strings, each starting with a hex code point or with
  "             a hex range "first..last"; anything after the number(s) is
  "             ignored
  " a:chardict  dictionary that gets an entry with value 1 for every code
  "             point named by a:lines (both ends of a range included)
  for text in a:lines
    let parts = split(text, '\.\.')
    let lo = str2nr(parts[0], 16)
    let hi = len(parts) == 1 ? lo : str2nr(parts[1], 16)
    for code in range(lo, hi)
      let a:chardict[code] = 1
    endfor
  endfor
endfunc
 | |
| 
 | |
func Test_AddLinesToCharDict()
  " Self-test for AddLinesToCharDict(): single code points, a range, and a
  " code point that is also inside the range.
  " Returns 0 when v:errors is empty after the check; otherwise reports
  " v:errors and returns 1.
  let input = [
	\ '1234 blah blah',
	\ '1235 blah blah',
	\ '12a0..12a2 blah blah',
	\ '12a1 blah blah',
	\ ]
  let actual = {}
  call AddLinesToCharDict(input, actual)
  call assert_equal({0x1234: 1, 0x1235: 1,
	\ 0x12a0: 1, 0x12a1: 1, 0x12a2: 1,
	\ }, actual)
  if empty(v:errors)
    return 0
  endif
  echoerr 'AddLinesToCharDict' v:errors
  return 1
endfunc
 | |
| 
 | |
| 
 | |
func CharDictToPairList(chardict)
  " Turn the keys of a:chardict (code points, stored as decimal numbers)
  " into a sorted list of [low, high] pairs, where each pair covers a run of
  " consecutive code points.
  " Returns an empty list for an empty dictionary.
  let result = []
  if empty(a:chardict)
    " Nothing to group; indexing the empty key list below would fail.
    return result
  endif
  " Dictionary keys are strings: convert to numbers and sort numerically.
  let codes = sort(map(keys(a:chardict), 'str2nr(v:val)'), 'N')
  let low = codes[0]
  let high = codes[0]
  for code in codes[1:]
    if code > high + 1
      " Gap found: close the current run and start a new one.
      call add(result, [low, high])
      let low = code
    endif
    let high = code
  endfor
  call add(result, [low, high])
  return result
endfunc
 | |
| 
 | |
func Test_CharDictToPairList()
  " Self-test for CharDictToPairList(): a run of three, two isolated code
  " points and a run of two.
  " Returns 0 when v:errors is empty after the check; otherwise reports
  " v:errors and returns 1.
  let input = {0x1020: 1, 0x1021: 1, 0x1022: 1,
	\ 0x1024: 1,
	\ 0x2022: 1,
	\ 0x2024: 1, 0x2025: 1}
  let expected = [
	\ [0x1020, 0x1022],
	\ [0x1024, 0x1024],
	\ [0x2022, 0x2022],
	\ [0x2024, 0x2025],
	\ ]
  call assert_equal(expected, CharDictToPairList(input))
  if empty(v:errors)
    return 0
  endif
  echoerr 'CharDictToPairList' v:errors
  return 1
endfunc
 | |
| 
 | |
| 
 | |
| " Build the emoji tables (emoji_all and emoji_wide), each in a new buffer.
 | |
func BuildEmojiTable()
  " Reads the current buffer and writes two C arrays, each to a new buffer
  " named after the array:
  "   emoji_all   code points of lines matching "; Emoji #"
  "   emoji_wide  code points of lines matching "; Emoji_Presentation #" or
  "               "; Emoji_Modifier_Base #", minus everything listed in
  "               s:ambitable and s:doubletable
  " Only lines that start with a digit 1-9 are looked at, so code points
  " written with a leading 0 or a leading letter are never included.

  " First make the table for all emojis.
  let pattern = '; Emoji\s\+#\s'
  let heads = []
  for text in getline(1, '$')
    if text =~ "^[1-9]" && text =~ pattern
      " Keep only the leading code point or range.
      call add(heads, matchstr(text, "^\\S\\+"))
    endif
  endfor

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(heads, chardict)
  let allranges = []
  for pair in CharDictToPairList(chardict)
    call add(allranges, printf("    {0x%04x, 0x%04x},", pair[0], pair[1]))
  endfor

  " New buffer to put the result in.
  new
  file emoji_all
  call setline(1, ["static struct interval emoji_all[] =", "{"])
  call append('$', allranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call append('$', "};")
  wincmd p

  " Make the table for wide emojis.
  let pattern = '; Emoji_\(Presentation\|Modifier_Base\)\s\+#\s'
  let heads = []
  for text in getline(1, '$')
    if text =~ "^[1-9]" && text =~ pattern
      call add(heads, matchstr(text, "^\\S\\+"))
    endif
  endfor

  " Make a dictionary with an entry for each character.
  let chardict = {}
  call AddLinesToCharDict(heads, chardict)

  " exclude characters that are in the "ambiguous" or "doublewidth" table
  for interval in s:ambitable
    for code in range(interval[0], interval[1])
      if !has_key(chardict, code)
        continue
      endif
      call remove(chardict, code)
    endfor
  endfor

  for interval in s:doubletable
    for code in range(interval[0], interval[1])
      if !has_key(chardict, code)
        continue
      endif
      call remove(chardict, code)
    endfor
  endfor

  let wide_ranges = []
  for pair in CharDictToPairList(chardict)
    call add(wide_ranges, printf("\t{0x%04x, 0x%04x},", pair[0], pair[1]))
  endfor

  " New buffer to put the result in.
  new
  file emoji_wide
  call setline(1, ["    static struct interval emoji_wide[] =", "    {"])
  call append('$', wide_ranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call append('$', "    };")
  wincmd p
endfunc
 | |
| 
 | |
" Run the self-tests first; stop when one of them fails.
let v:errors = []
if Test_AddLinesToCharDict()
  finish
endif
if Test_CharDictToPairList()
  finish
endif

" The Unicode files are fetched with the netrw plugin.
if !exists("g:loaded_netrw")
  echomsg "Netrw not available, cannot download"
  finish
endif

" Try to avoid hitting E36
set equalalways

" UnicodeData.txt: toLower, toUpper and combining tables.
" Field 13 is used for the toLower table, field 12 for the toUpper table.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt
call ParseDataToProps()
call BuildCaseTable("Lower", 13)
call BuildCaseTable("Upper", 12)
call BuildCombiningTable()

" CaseFolding.txt: foldCase table.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
call ParseFoldProps()
call BuildFoldTable()

" EastAsianWidth.txt: doublewidth and ambiguous tables.
" s:doubletable and s:ambitable collect the intervals of these tables; they
" are read by BuildEmojiTable().
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
call ParseWidthProps()

let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
" commented out, because it drops too many characters
"edit https://unicode.org/Public/15.0.0/ucd/emoji/emoji-data.txt
"
"" Build the emoji table. Ver. 1.0 - 6.0
"" Must come after the "ambiguous" and "doublewidth" tables
"call BuildEmojiTable()
 |