User:Kephir/Unicode
Jump to navigation
Jump to search
Below are scripts I use for maintaining the Unicode database at Module:Unicode data.
How to use scripts on this page
[edit]Ingredients: GNU Make, GNU awk, GNU wget.
- Put the scripts below in a dedicated directory.
- Run `make clean`, then `make`.
- The Unicode database will be downloaded, and several `.lua` files will be generated from it. Save them in appropriate locations as specified in the table below.
- Update the Unicode version number wherever you find it (e.g. Module:character list).
- Done. Throw a drinking party.
| File name | Wiktionary page |
|---|---|
| `aliases.lua` | Module:Unicode data/aliases |
| `blocks.lua` | the `blocks` table in Module:Unicode data |
| `combining.lua` | Module:Unicode data/combining |
| `control.lua` | Module:Unicode data/control (includes Cx and Zx) |
| `names_XXX.lua` | Module:Unicode data/names/XXX |
Scripts
[edit]Makefile
# Build the Lua data tables from the Unicode Character Database.
#
# Requires GNU Make, GNU awk (gawk) and GNU wget.  Running `make` downloads
# the needed UCD text files and runs one awk script per output file.

.PHONY: all names clean update

# If a recipe fails (wget cannot download, gawk aborts), delete its target.
# Without this, `wget -O` and the `>` redirection leave behind an empty or
# truncated file that later runs would consider up to date.
.DELETE_ON_ERROR:

all: blocks.lua control.lua aliases.lua combining.lua names

clean:
	rm -f *.lua *.txt

# names.awk writes the names_XXX.lua files itself, so nothing is redirected.
names: names.awk UnicodeData.txt
	gawk -F';' -f $^

update: all
	./upload

# Generic rule: FOO.lua is the standard output of FOO.awk, run over the data
# file(s) listed as extra prerequisites further down ($^ expands to the
# script followed by those data files).
%.lua: %.awk
	gawk -F';' -f $^ >'$@'

# Derived*.txt files live in the "extracted" subdirectory of the UCD.
Derived%.txt:
	wget 'http://unicode.org/Public/UNIDATA/extracted/$@' -O '$@'

%.txt:
	wget 'http://unicode.org/Public/UNIDATA/$@' -O '$@'

blocks.lua: Blocks.txt
control.lua: DerivedGeneralCategory.txt
aliases.lua: NameAliases.txt
combining.lua: DerivedCombiningClass.txt

# NOTE(review): recipe-less pattern rule.  GNU make treats this form as
# cancelling a matching implicit rule, so it does not by itself make
# names_XXX.lua buildable on request -- confirm the intent.
names_%.lua: names
aliases.awk
# aliases.awk -- turn NameAliases.txt into a Lua table of character aliases.
#
# Expected invocation: gawk -F';' -f aliases.awk NameAliases.txt
# Each data record is "CODEPOINT;ALIAS;TYPE".  The output maps every code
# point that has aliases to a list of { TYPE, "ALIAS" } pairs, where TYPE is
# emitted as a bare identifier that refers to one of the Lua locals declared
# at the top of the generated file.

BEGIN {
    # Last valid Unicode code point; the output loop must include it.
    MAX_CODEPOINT = 0x10FFFF
}

/^[0-9A-F]/ {
    cp = strtonum("0x" $1)
    # Keep an explicit per-code-point counter so aliases can be replayed in
    # the order they appear in the input.  Iterating with "for (k in array)"
    # visits elements in an unspecified order, which would make the generated
    # file differ from run to run.
    n = alias_count[cp]++
    alias_name[cp, n] = $2
    alias_type[cp, n] = $3
}

END {
    print "local correction, control, alternate, figment, abbreviation = "
    print "\t\"correction\", \"control\", \"alternate\", \"figment\", \"abbreviation\""
    print ""
    print "return {"
    # Walk the whole code space so entries come out sorted by code point.
    for (cp = 0; cp <= MAX_CODEPOINT; ++cp) {
        if (!(cp in alias_count))
            continue
        printf "\t[0x%06x] = {\n", cp
        for (k = 0; k < alias_count[cp]; ++k)
            printf "\t\t{ %12s, \"%s\" };\n", alias_type[cp, k], alias_name[cp, k]
        printf "\t};\n"
    }
    print "}"
}
names.awk
# names.awk -- split UnicodeData.txt into per-page Lua name tables.
#
# Expected invocation: gawk -F';' -f names.awk UnicodeData.txt
# Code points are grouped into pages of 0x1000; page PPP (three upper-case
# hex digits) is written to names_PPP.lua as "return { [ 0xXXXX ] = "NAME", }".
# Records whose name field starts with "<" (e.g. "<control>" or range
# markers) are skipped.
#
# Assumes the input is sorted by code point, so each page is visited once.

BEGIN {
    # Page currently being written; empty until the first record is seen.
    current_page = ""
}

/^[0-9A-Fa-f]/ && $2 !~ /^</ {
    cp = strtonum("0x" $1)
    page = sprintf("%03X", int(cp / 0x1000))
    # Build the file name once: an unparenthesised concatenation after ">"
    # or ">>" is parsed differently by different awk implementations.
    out = "names_" page ".lua"

    if (page != current_page) {
        if (current_page != "") {
            # Finish the previous page and release its file handle.
            done = "names_" current_page ".lua"
            print "}" >> done
            close(done)
        }
        print "return {" > out
        current_page = page
    }

    printf("\t[ 0x%04X ] = \"%s\",\n", cp, $2) >> out
}

END {
    # Only close the table if at least one page was opened; otherwise a
    # stray file named after the sentinel would be created.
    if (current_page != "") {
        out = "names_" current_page ".lua"
        print "}" >> out
        close(out)
    }
}
blocks.awk
# blocks.awk -- convert Blocks.txt into the Lua "blocks" table.
#
# Each data record has the form "START..END; Block Name".  The output is a
# Lua table of { START, END, "Block Name" } rows, with the quoted names
# padded to a common width so the closing braces line up.

BEGIN {
    # Split on ".." or ";" together with any surrounding spaces.  This
    # replaces whatever separator was given with -F on the command line.
    FS = " *(\\.\\.|;) *"
    block_count = 0
    widest = 0
}

/^[0-9A-Fa-f]/ {
    range_lo[block_count] = strtonum("0x" $1)
    range_hi[block_count] = strtonum("0x" $2)
    title[block_count] = $3
    if (length($3) > widest)
        widest = length($3)
    block_count++
}

END {
    print "local blocks = {"
    # Two extra columns account for the surrounding double quotes.
    column = widest + 2
    for (k = 0; k < block_count; k++) {
        quoted = "\"" title[k] "\""
        printf("\t{ 0x%06X, 0x%06X, %-*s },\n", range_lo[k], range_hi[k], column, quoted)
    }
    print "}"
}
combining.awk
# combining.awk -- convert DerivedCombiningClass.txt into a Lua table.
#
# Data records look like
#     XXXX          ; CLASS # comment
#     XXXX..YYYY    ; CLASS # comment
# The output has two parts: "single" maps individual code points to their
# combining class, and "ranges" lists { first, last, class } triples.
# Entries with class 0 are omitted.

BEGIN {
    # Split on "..", ";" or "#" (the latter two with surrounding spaces).
    # This replaces whatever separator was given with -F on the command line.
    FS = "\\.\\.| *[;#] *"
    # Last valid Unicode code point; the output loops must include it.
    MAX_CODEPOINT = 0x10FFFF
}

# Ignore comments and blank lines.
$1 !~ /^[0-9A-F]/ {
    next
}

# Range record: recognised by the ".." directly after the first code point,
# rather than by guessing from the contents of $3 (an empty $3 would
# otherwise be mistaken for a numeric class).
# Fields are: first, last, class.
/^[0-9A-F]+\.\./ {
    if ($3 != "0") {
        first = strtonum("0x" $1)
        ranges[first] = strtonum("0x" $2)
        kinds[first] = $3
    }
    next
}

# Single code point record.  Fields are: code point, class.
{
    if ($2 != "0")
        singles[strtonum("0x" $1)] = $2
}

END {
    print "return {"
    print "\tsingle = {"
    # Walk the code space so entries come out sorted by code point.
    for (cp = 0; cp <= MAX_CODEPOINT; ++cp) {
        if (cp in singles)
            printf("\t\t[0x%06X] = %4s\n", cp, singles[cp] ",")
    }
    print "\t};"
    print "\tranges = {"
    for (cp = 0; cp <= MAX_CODEPOINT; ++cp) {
        if (cp in ranges)
            printf("\t\t{ 0x%06X, 0x%06X, %3s },\n", cp, ranges[cp], kinds[cp])
    }
    print "\t};"
    print "}"
}
control.awk
# control.awk -- extract the C* and Z* general categories from
# DerivedGeneralCategory.txt into a Lua table.
#
# Data records look like
#     XXXX          ; Gc # comment
#     XXXX..YYYY    ; Gc # comment
# Only categories matching /^[CZ][a-z]/ are kept.  The category is emitted
# as a bare identifier that refers to one of the Lua locals declared at the
# top of the generated file.

BEGIN {
    # Split on "..", ";" or "#" (the latter two with surrounding spaces).
    # This replaces whatever separator was given with -F on the command line.
    FS = "\\.\\.| *[;#] *"
    # Last valid Unicode code point; the output loops must include it.
    MAX_CODEPOINT = 0x10FFFF
}

# Ignore comments and blank lines.
$1 !~ /^[0-9A-F]/ {
    next
}

# Range record (".." directly after the first code point): the category is
# in $3.  Classifying by the ".." means a record is never filed as both a
# range and a single, whatever its comment text contains.
/^[0-9A-F]+\.\./ {
    if ($3 ~ /^[CZ][a-z]/) {
        first = strtonum("0x" $1)
        ranges[first] = strtonum("0x" $2)
        kinds[first] = $3
    }
    next
}

# Single code point record: the category is in $2.
$2 ~ /^[CZ][a-z]/ {
    singles[strtonum("0x" $1)] = $2
}

END {
    print "local Cc, Cf, Cs, Co, Cn ="
    print "\t\"control\", \"format\", \"surrogate\", \"private-use\", \"unassigned\""
    print "local Zs, Zl, Zp ="
    print "\t\"space-separator\", \"line-separator\", \"paragraph-separator\""
    print ""
    print "return {"
    print "\tsingle = {"
    # Walk the code space so entries come out sorted by code point.
    for (cp = 0; cp <= MAX_CODEPOINT; ++cp) {
        if (cp in singles)
            printf("\t\t[0x%06X] = %s,\n", cp, singles[cp])
    }
    print "\t};"
    print "\tranges = {"
    for (cp = 0; cp <= MAX_CODEPOINT; ++cp) {
        if (cp in ranges)
            printf("\t\t{ 0x%06X, 0x%06X, %s },\n", cp, ranges[cp], kinds[cp])
    }
    print "\t};"
    print "}"
}