diff options
Diffstat (limited to 'admin/charsets')
-rw-r--r-- | admin/charsets/.arch-inventory | 4 | ||||
-rw-r--r-- | admin/charsets/.gitignore | 2 | ||||
-rw-r--r-- | admin/charsets/Makefile | 349 | ||||
-rw-r--r-- | admin/charsets/big5.awk | 54 | ||||
-rw-r--r-- | admin/charsets/compact.awk | 125 | ||||
-rw-r--r-- | admin/charsets/cp51932.awk | 59 | ||||
-rw-r--r-- | admin/charsets/cp932.awk | 118 | ||||
-rw-r--r-- | admin/charsets/eucjp-ms.awk | 85 | ||||
-rw-r--r-- | admin/charsets/gb180302.awk | 82 | ||||
-rw-r--r-- | admin/charsets/gb180304.awk | 104 | ||||
-rw-r--r-- | admin/charsets/kuten.awk | 7 | ||||
-rwxr-xr-x | admin/charsets/mapconv | 143 | ||||
-rw-r--r-- | admin/charsets/mule-charsets.el | 60 |
13 files changed, 1192 insertions, 0 deletions
diff --git a/admin/charsets/.arch-inventory b/admin/charsets/.arch-inventory new file mode 100644 index 0000000000..0924093e90 --- /dev/null +++ b/admin/charsets/.arch-inventory @@ -0,0 +1,4 @@ +# Unlike most emacs dirs, admin/charsets has a simple non-autoconf-generated makefile +source ^(Makefile)$ + +# arch-tag: ee36cfe3-96f8-4e91-aec4-008c80a85e6b diff --git a/admin/charsets/.gitignore b/admin/charsets/.gitignore new file mode 100644 index 0000000000..ea375dc591 --- /dev/null +++ b/admin/charsets/.gitignore @@ -0,0 +1,2 @@ +*.map +*.el diff --git a/admin/charsets/Makefile b/admin/charsets/Makefile new file mode 100644 index 0000000000..ceecbce821 --- /dev/null +++ b/admin/charsets/Makefile @@ -0,0 +1,349 @@ +# Makefile -- Makefile to generate charset maps in etc/charsets. +# Copyright (C) 2003 +# National Institute of Advanced Industrial Science and Technology (AIST) +# Registration Number H13PRO009 +# +# This file is part of GNU Emacs. + +# GNU Emacs is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# GNU Emacs is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with GNU Emacs; see the file COPYING. If not, write to the +# Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# Commentary + +# At first, set these environment variables: +# GLIBC_CHARMAPS +# Directory of glibc-VERSION/localedata/charmaps. +# VERSION must be 2.3 or later. 
+# MISC_CHARMAPS +# Directory containing these charmap files: +# o bulgarian-mik.txt.gz +# provided at <http://czyborra.com/charsets/> +# o PTCP154 +# provided at <http://www.iana.org/assignments/charset-reg/> +# o stdenc.txt and symbol.txt +# provided at <http://www.unicode.org/Public/MAPPINGS/> +# o cp932.txt +# provided at <http://www.unicode.org/Public/MAPPINGS/VENDORS> +# o Uni2JIS +# provided at <http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/CJK.html> +# o 720.htm and 858.htm +# provided at <http://www.microsoft.com/globaldev/reference/oem/> +# o eucJP-13th.txt, eucJP-udc.txt, eucJP-ibmext.txt +# provided at <http://www.opengroup.or.jp/jvc/cde/> +# o cns2ucsdkw.txt +# available by: +# % cvs -d :pserver:anonymous@cvs.sourceforge.net:\ +# /cvsroot/kanji-database login +# % cvs -d :pserver:anonymous@cvs.sourceforge.net:\ +# /cvsroot/kanji-database co kanji-database +# OLDEMACS +# emacs of version 21.3.50 or later +# +# Then, do this: +# % make XXX.map (or make all) +# % make install + +CHARSETS = ${ISO8859} ${IBM} ${CODEPAGE} ${CJK} ${MISC} ${MULE} + +# Note: We can not prepend "ISO-" to these map files because of file +# name limits on DOS. 
+ISO8859 = \ + 8859-2.map 8859-3.map 8859-4.map 8859-5.map 8859-6.map 8859-7.map \ + 8859-8.map 8859-9.map 8859-10.map 8859-11.map 8859-13.map 8859-14.map \ + 8859-15.map 8859-16.map + +IBM = \ + IBM037.map IBM038.map \ + IBM256.map IBM273.map IBM274.map IBM275.map IBM277.map IBM278.map \ + IBM280.map IBM281.map IBM284.map IBM285.map IBM290.map IBM297.map \ + IBM420.map IBM423.map IBM424.map IBM437.map IBM500.map IBM850.map \ + IBM851.map IBM852.map IBM855.map IBM856.map IBM857.map IBM860.map \ + IBM861.map IBM862.map IBM863.map IBM864.map IBM865.map IBM866.map \ + IBM868.map IBM869.map IBM870.map IBM871.map IBM874.map IBM875.map \ + IBM880.map IBM891.map IBM903.map IBM904.map IBM905.map IBM918.map \ + IBM1004.map IBM1026.map IBM1047.map + +CODEPAGE = \ + CP737.map CP775.map CP1125.map\ + CP1250.map CP1251.map CP1252.map CP1253.map CP1254.map \ + CP1255.map CP1256.map CP1257.map CP1258.map \ + CP10007.map \ + CP720.map CP858.map + +CJK = GB2312.map GBK.map GB180302.map GB180304.map \ + BIG5.map BIG5-HKSCS.map\ + CNS-1.map CNS-2.map CNS-3.map CNS-4.map CNS-5.map CNS-6.map CNS-7.map \ + CNS-F.map \ + JISX0201.map JISX0208.map JISX0212.map JISX2131.map JISX2132.map \ + JISC6226.map CP932-2BYTE.map JISX213A.map\ + KSC5601.map KSC5636.map JOHAB.map + +MISC = KOI-8.map KOI8-R.map KOI8-U.map KOI8-T.map ALTERNATIVNYJ.map \ + MIK.map PTCP154.map \ + TIS-620.map VISCII.map VSCII.map VSCII-2.map\ + KA-PS.map KA-ACADEMY.map \ + HP-ROMAN8.map NEXTSTEP.map MACINTOSH.map EBCDICUK.map EBCDICUS.map \ + stdenc.map symbol.map \ + CP949-2BYTE.map \ + BIG5-1.map BIG5-2.map + +# Emacs-mule charsets. +MULE = MULE-ethiopic.map MULE-ipa.map MULE-is13194.map \ + MULE-sisheng.map MULE-tibetan.map \ + MULE-lviscii.map MULE-uviscii.map + +TRANS_TABLE = cp51932.el eucjp-ms.el + +all: ${CHARSETS} ${TRANS_TABLE} + +AWK = gawk + +# Rules for each charset + +VSCII.map: ${GLIBC_CHARMAPS}/TCVN5712-1 mapconv compact.awk + # Generating $@... 
+ @mapconv $< '/^<.*[ ]\/x[0-9a-f].[ ]/' GLIBC-1 compact.awk > $@ + +VSCII-2.map: ${GLIBC_CHARMAPS}/TCVN5712-1 mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[2-7a-f].[ ]/' GLIBC-1 compact.awk \ + | sed 's/0x20-0x7F.*/0x00-0x7F 0x0000/' > $@ + +ALTERNATIVNYJ.map: IBM866.map + # Generating $@... + @echo "# Modified from ibm866 according to the chart at" > $@ + @echo "# http://www.cyrillic.com/ref/cyrillic/koi-8alt.html," >> $@ + @echo "# with guesses for the Unicodes of the glyphs." >> $@ + @sed -e '/0xF2/ s/ .*/ 0x2019/' \ + -e '/0xF3/ s/ .*/ 0x2018/' \ + -e '/0xF4/ s/ .*/ 0x0301/' \ + -e '/0xF5/ s/ .*/ 0x0300/' \ + -e '/0xF6/ s/ .*/ 0x203A/' \ + -e '/0xF7/ s/ .*/ 0x2039/' \ + -e '/0xF8/ s/ .*/ 0x2191/' \ + -e '/0xF9/ s/ .*/ 0x2193/' \ + -e '/0xFA/ s/ .*/ 0x00B1/' \ + -e '/0xFB/ s/ .*/ 0x00F7/' < $< >> $@ + +MIK.map: ${MISC_CHARMAPS}/bulgarian-mik.txt.gz mapconv compact.awk + # Generating $@... + @mapconv $< '1,$$' CZYBORRA compact.awk > $@ + +PTCP154.map: ${MISC_CHARMAPS}/PTCP154 mapconv compact.awk + # Generating $@... + @mapconv $< '/^0x/' IANA compact.awk > $@ + +stdenc.map: ${MISC_CHARMAPS}/stdenc.txt mapconv compact.awk + # Generating $@... + @mapconv $< '/^[0-9A-Fa-f]/' UNICODE compact.awk > $@ + +symbol.map: ${MISC_CHARMAPS}/symbol.txt mapconv compact.awk + # Generating $@... + @mapconv $< '/^[0-9A-Fa-f]/' UNICODE compact.awk > $@ + +CP720.map: ${MISC_CHARMAPS}/720.htm mapconv compact.awk + # Generating $@... + @mapconv $< '/^[0-9A-F]/' MICROSOFT compact.awk > $@ + +CP858.map: ${MISC_CHARMAPS}/858.htm mapconv compact.awk + # Generating $@... + @mapconv $< '/^[0-9A-F]/' MICROSOFT compact.awk > $@ + +CP949-2BYTE.map: ${GLIBC_CHARMAPS}/CP949 mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[89a-f]/' GLIBC-2 compact.awk > $@ + +GB2312.map: ${GLIBC_CHARMAPS}/GB2312 mapconv compact.awk + # Generating $@... 
+ @mapconv $< '/^<.*[ ]\/x[a-f]/' GLIBC-2-7 compact.awk > $@ + +GBK.map: ${GLIBC_CHARMAPS}/GBK mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[89a-f]/' GLIBC-2 compact.awk > $@ + +GB180302.map: ${GLIBC_CHARMAPS}/GB18030 mapconv gb180302.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x..\/x..[ ]/' GLIBC-2 gb180302.awk > $@ + +GB180304.map: GB180302.map gb180304.awk + # Generating $@... + @$(AWK) -f gb180304.awk < $< > $@ + +JISX0201.map: ${GLIBC_CHARMAPS}/JIS_X0201 mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[0-9]/' GLIBC-1 compact.awk > $@ + @echo "# Generated by hand" >> $@ + @echo "0xA1-0xDF 0xFF61" >> $@ + +JISX0208.map: ${GLIBC_CHARMAPS}/EUC-JP mapconv + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[a-f]/' GLIBC-2-7 \ + | sed 's/0x2015/0x2014/' > $@ + +JISX0212.map: ${GLIBC_CHARMAPS}/EUC-JP mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x8f/ s,/x8f,,' GLIBC-2-7 compact.awk > $@ + +JISX2131.map: ${GLIBC_CHARMAPS}/EUC-JISX0213 mapconv + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[a-f]/' GLIBC-2-7 \ + | sed -e 's/0x2015/0x2014/' -e 's/0x2299/0x29BF/' > $@ + +JISX2132.map: ${GLIBC_CHARMAPS}/EUC-JISX0213 mapconv + # Generating $@... + @mapconv $< '/^<.*[ ]\/x8f/ s,/x8f,,' GLIBC-2-7 > $@ + +JISX213A.map: + # Generating $@ + @(echo "0x2E21 0x4FF1"; \ + echo "0x2F7E 0x525D"; \ + echo "0x4F54 0x20B9F"; \ + echo "0x4F7E 0x541E"; \ + echo "0x7427 0x5653"; \ + echo "0x7E7A 0x59F8"; \ + echo "0x7E7B 0x5C5B"; \ + echo "0x7E7C 0x5E77"; \ + echo "0x7E7D 0x7626"; \ + echo "0x7E7E 0x7E6B") > $@ + +CP932-2BYTE.map: ${MISC_CHARMAPS}/cp932.txt mapconv cp932.awk + # Generating $@... 
+ @mapconv $< '/^0x[89A-F][0-9A-F][0-9A-F]/' UNICODE2 cp932.awk > $@ + +cp51932.el: CP932-2BYTE.map cp51932.awk + @$(AWK) -f cp51932.awk < CP932-2BYTE.map > $@ + +eucjp-ms.el: ${MISC_CHARMAPS}/eucJP-13th.txt ${MISC_CHARMAPS}/eucJP-udc.txt \ + ${MISC_CHARMAPS}/eucJP-ibmext.txt eucjp-ms.awk + @(cd ${MISC_CHARMAPS}; \ + cat eucJP-13th.txt eucJP-udc.txt eucJP-ibmext.txt) \ + | $(AWK) -f eucjp-ms.awk > $@ + +JISC6226.map : ${MISC_CHARMAPS}/Uni2JIS mapconv kuten.awk + # Generating $@... + @mapconv $< '/^[^#].*0-/' YASUOKA kuten.awk > $@ + +KSC5601.map: ${GLIBC_CHARMAPS}/EUC-KR mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[a-f]/' GLIBC-2-7 compact.awk > $@ + +BIG5.map: ${GLIBC_CHARMAPS}/BIG5 mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[a-f]/' GLIBC-2 > $@ +# BIG5-1.map and BIG5-2.map re-index the 0xa140-0xc8fe and 0xc940-end parts of BIG5.map with big5.awk. +BIG5-1.map: BIG5.map mapconv big5.awk + # Generating $@... + @echo "# Generated from $<" > $@ + @sed -n -e '/0xa140/,/0xc8fe/p' < $< | $(AWK) -f big5.awk >> $@ + +BIG5-2.map: BIG5.map mapconv big5.awk + # Generating $@... + @echo "# Generated from $<" > $@ + @sed -n -e '/0xc940/,$$ p' < $< | $(AWK) -f big5.awk >> $@ + +BIG5-HKSCS.map: ${GLIBC_CHARMAPS}/BIG5-HKSCS mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[89a-f].\//' GLIBC-2 compact.awk > $@ + +JOHAB.map: ${GLIBC_CHARMAPS}/JOHAB mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[89a-f]/' GLIBC-2 compact.awk > $@ + +CNS-1.map: ${GLIBC_CHARMAPS}/EUC-TW mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x[a-f]/' GLIBC-2-7 compact.awk > $@ + +# CNS-1.map: ${MISC_CHARMAPS}/cns2ucsdkw.txt mapconv compact.awk +# # Generating $@... +# @mapconv $< '/^C1/' KANJI-DATABASE compact.awk > $@ + +CNS-2.map: ${MISC_CHARMAPS}/cns2ucsdkw.txt mapconv compact.awk + # Generating $@... + @mapconv $< '/^C2/' KANJI-DATABASE compact.awk > $@ + +CNS-3.map: ${MISC_CHARMAPS}/cns2ucsdkw.txt mapconv compact.awk + # Generating $@... 
+ @mapconv $< '/^C3/' KANJI-DATABASE compact.awk > $@ + +CNS-4.map: ${MISC_CHARMAPS}/cns2ucsdkw.txt mapconv compact.awk + # Generating $@... + @mapconv $< '/^C4/' KANJI-DATABASE compact.awk > $@ + +CNS-5.map: ${MISC_CHARMAPS}/cns2ucsdkw.txt mapconv compact.awk + # Generating $@... + @mapconv $< '/^C5/' KANJI-DATABASE compact.awk > $@ + +CNS-6.map: ${MISC_CHARMAPS}/cns2ucsdkw.txt mapconv compact.awk + # Generating $@... + @mapconv $< '/^C6/' KANJI-DATABASE compact.awk > $@ + +CNS-7.map: ${MISC_CHARMAPS}/cns2ucsdkw.txt mapconv compact.awk + # Generating $@... + @mapconv $< '/^C7/' KANJI-DATABASE compact.awk > $@ + +CNS-F.map: ${GLIBC_CHARMAPS}/EUC-TW mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*\/x8e\/xaf/ s,/x8e/xaf,,' GLIBC-2-7 compact.awk > $@ + +# General target to produce map files for mule charsets. +MULE-%.map: mule-charsets.el + # Generating $@... + @${OLDEMACS} -batch -l ./mule-charsets.el $@ + +# General target to produce map files for ISO-8859, GEORGIAN, and +# EBCDIC charsets. We can not use the original file name because of +# file name limit on DOS. "KA" is ISO 639 language code for Georgian. + +8859-%.map: ${GLIBC_CHARMAPS}/ISO-8859-% mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x/' GLIBC-1 compact.awk > $@ + +KA-%.map: ${GLIBC_CHARMAPS}/GEORGIAN-% mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x/' GLIBC-1 compact.awk > $@ + +EBCDIC%.map: ${GLIBC_CHARMAPS}/EBCDIC-% mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x/' GLIBC-1 compact.awk > $@ + +# General target to produce map files for single-byte charsets. + +%.map: ${GLIBC_CHARMAPS}/% mapconv compact.awk + # Generating $@... + @mapconv $< '/^<.*[ ]\/x/' GLIBC-1 compact.awk > $@ + +install: + @for f in ${CHARSETS}; do \ + if test -r $$f; then \ + if ! 
cmp -s $$f ../../etc/charsets/$$f; then \ + echo updating $$f; \ + cp $$f ../../etc/charsets; \ + fi; \ + fi; \ + done + @for f in ${TRANS_TABLE}; do \ + if test -r $$f; then \ + if ! cmp -s $$f ../../lisp/international/$$f; then \ + echo updating $$f; \ + cp $$f ../../lisp/international; \ + fi; \ + fi; \ + done + +# Clear files that are automatically generated. +clean: + rm -f ${CHARSETS} ${TRANS_TABLE} + +# arch-tag: 90b3bf30-1fef-45bf-b30c-665c30c22310 diff --git a/admin/charsets/big5.awk b/admin/charsets/big5.awk new file mode 100644 index 0000000000..e238f7541c --- /dev/null +++ b/admin/charsets/big5.awk @@ -0,0 +1,54 @@ +BEGIN { + tohex["A"] = 10; + tohex["B"] = 11; + tohex["C"] = 12; + tohex["D"] = 13; + tohex["E"] = 14; + tohex["F"] = 15; + tohex["a"] = 10; + tohex["b"] = 11; + tohex["c"] = 12; + tohex["d"] = 13; + tohex["e"] = 14; + tohex["f"] = 15; +} +# Convert the hex string STR to a number; a leading "0x" is harmless because "x" is not in tohex and so adds 0. +function decode_hex(str) { + n = 0; + len = length(str); + for (i = 1; i <= len; i++) + { + c = substr (str, i, 1); + if (c >= "0" && c <= "9") + n = n * 16 + (c - "0"); + else + n = n * 16 + tohex[c]; + } + return n; +} +# Re-index a Big5 code into a 94x94 code whose two bytes both start at 33 (0x21); codes with lead byte >= 201 (0xC9) are re-based to index 0. +function decode_big5(big5) { + b0 = int(big5 / 256); + b1 = big5 % 256; +# (0xFF - 0xA1 + 0x7F - 0x40) = 157 +# (0xA1 - (0x7F - 0x40)) = 98 +# (0xC9 - 0xA1) * (0xFF - 0xA1 + 0x7F - 0x40) = 6280 + if (b1 < 127) + idx = (b0 - 161) * 157 + (b1 - 64); + else + idx = (b0 - 161) * 157 + (b1 - 98); + if (b0 >= 201) + idx -= 6280; + b0 = int(idx / 94) + 33; + b1 = (idx % 94) + 33; + return (b0 * 256 + b1) +} + +{ + big5 = decode_hex($1); + code = decode_big5(big5); + printf "0x%04X %s\n", code, $2; +} + + +# arch-tag: 36f08d21-0d24-4b67-852d-a9a51299586d diff --git a/admin/charsets/compact.awk b/admin/charsets/compact.awk new file mode 100644 index 0000000000..ba756b1ae5 --- /dev/null +++ b/admin/charsets/compact.awk @@ -0,0 +1,125 @@ +# compact.awk -- Make charset map compact. 
+# Copyright (C) 2003 +# National Institute of Advanced Industrial Science and Technology (AIST) +# Registration Number H13PRO009 +# +# This file is part of GNU Emacs. +# +# GNU Emacs is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# GNU Emacs is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Emacs; see the file COPYING. If not, write to the +# Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# Comment: +# Make a charset map compact by changing this kind of line sequence: +# 0x00 0x0000 +# 0x01 0x0001 +# ... 
+# 0x7F 0x007F +# to one line of this format: +# 0x00-0x7F 0x0000 + +BEGIN { + tohex["0"] = 1; + tohex["1"] = 2; + tohex["2"] = 3; + tohex["3"] = 4; + tohex["4"] = 5; + tohex["5"] = 6; + tohex["6"] = 7; + tohex["7"] = 8; + tohex["8"] = 9; + tohex["9"] = 10; + tohex["A"] = 11; + tohex["B"] = 12; + tohex["C"] = 13; + tohex["D"] = 14; + tohex["E"] = 15; + tohex["F"] = 16; + tohex["a"] = 11; + tohex["b"] = 12; + tohex["c"] = 13; + tohex["d"] = 14; + tohex["e"] = 15; + tohex["f"] = 16; + from_code = 0; + to_code = -1; + to_unicode = 0; + from_unicode = 0; +} +# Parse the hex digits of STR starting at position IDX, stopping at the first non-hex character; tohex[] holds digit value + 1 so that 0 means "not a hex digit". +function decode_hex(str, idx) { + n = 0; + len = length(str); + for (i = idx; i <= len; i++) + { + c = tohex[substr (str, i, 1)]; + if (c == 0) + break; + n = n * 16 + c - 1; + } + return n; +} + +/^\#/ { + print; + next; +} + +{ + code = decode_hex($1, 3); + unicode = decode_hex($2, 3); + if ((code == to_code + 1) && (unicode == to_unicode + 1)) + { + to_code++; + to_unicode++; + } + else + { + if (to_code < 256) + { + if (from_code == to_code) + printf "0x%02X 0x%04X\n", from_code, from_unicode; + else if (from_code < to_code) + printf "0x%02X-0x%02X 0x%04X\n", from_code, to_code, from_unicode; + } + else + { + if (from_code == to_code) + printf "0x%04X 0x%04X\n", from_code, from_unicode; + else if (from_code < to_code) + printf "0x%04X-0x%04X 0x%04X\n", from_code, to_code, from_unicode; + } + from_code = to_code = code; + from_unicode = to_unicode = unicode; + } +} +# Flush the last pending range; from_code > to_code means no data line was read, so nothing is printed. +END { + if (to_code < 256) + { + if (from_code == to_code) + printf "0x%02X 0x%04X\n", from_code, from_unicode; + else if (from_code < to_code) + printf "0x%02X-0x%02X 0x%04X\n", from_code, to_code, from_unicode; + } + else + { + if (from_code == to_code) + printf "0x%04X 0x%04X\n", from_code, from_unicode; + else if (from_code < to_code) + printf "0x%04X-0x%04X 0x%04X\n", from_code, to_code, from_unicode; + } +} + +# arch-tag: 7e6f57c3-8e62-4af3-8916-ca67bca3a0ce diff --git a/admin/charsets/cp51932.awk b/admin/charsets/cp51932.awk new file mode 100644 index 0000000000..e30f4e29f1 --- /dev/null 
+++ b/admin/charsets/cp51932.awk @@ -0,0 +1,59 @@ +# cp51932.awk -- Generate a translation table for CP51932. +# Copyright (C) 2004 +# National Institute of Advanced Industrial Science and Technology (AIST) +# Registration Number H13PRO009 +# +# This file is part of GNU Emacs. +# +# GNU Emacs is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# GNU Emacs is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Emacs; see the file COPYING. If not, write to the +# Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# Comment: + +# Generate a translation table for CP51932 (the Microsoft version of EUC-JP). +# It maps invalid JISX0208 code points used by CP51932 to Unicode. +# 4th field of the input has these meanings: +# 0: JISX0208 characters. +# 1: NEC special characters. +# 2: IBM extension characters. +# 3: NEC selection of IBM extension characters. +# Among them, 1 and 3 are the target characters. 2 should have +# already been mapped to 1 or 3. + +BEGIN { + print ";;; cp51932.el -- translation table for CP51932. -*- no-byte-compile: t -*-"; + print ";;; Automatically genrated from CP932-2BYTE.map"; + print "(let ((map"; + printf " '(;JISEXT<->UNICODE"; +} + +/# [13]/ { + printf "\n (#x%s . 
#x%s)", $5 ,substr($2, 3, 4); +} + +END { + print ")))"; + print " (mapc #'(lambda (x)"; + print " (setcar x (decode-char 'japanese-jisx0208 (car x))))"; + print " map)"; + print " (define-translation-table 'cp51932-decode map)"; + print " (mapc #'(lambda (x)"; + print " (let ((tmp (car x)))"; + print " (setcar x (cdr x)) (setcdr x tmp)))"; + print " map)"; + print " (define-translation-table 'cp51932-encode map))"; +} + +# arch-tag: bbae996b-2d1c-4e85-bb55-ac30146d7504 diff --git a/admin/charsets/cp932.awk b/admin/charsets/cp932.awk new file mode 100644 index 0000000000..3c1da2d51b --- /dev/null +++ b/admin/charsets/cp932.awk @@ -0,0 +1,118 @@ +# cp932.awk -- Add sort keys and append user defined area to CP932-2BYTE.map. +# Copyright (C) 2004 +# National Institute of Advanced Industrial Science and Technology (AIST) +# Registration Number H13PRO009 +# +# This file is part of GNU Emacs. +# +# GNU Emacs is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# GNU Emacs is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with GNU Emacs; see the file COPYING. If not, write to the +# Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# Comment: +# Add a sort key 0, 1, 2, or 3 at the tail of each line as a comment +# to realize the round trip mapping to Unicode works as described in +# this page: +# http://support.microsoft.com/default.aspx?scid=kb;EN-US;170559 +# Each sort key means as below: +# 0: JISX0208 characters. +# 1: NEC special characters. +# 2: IBM extension characters. 
+# 3: NEC selection of IBM extension characters. +# 4: user defined area + +BEGIN { + tohex["A"] = 10; + tohex["B"] = 11; + tohex["C"] = 12; + tohex["D"] = 13; + tohex["E"] = 14; + tohex["F"] = 15; +} + +function decode_hex(str) { + n = 0; + len = length(str); + for (i = 1; i <= len; i++) + { + c = substr(str, i, 1); + if (c >= "0" && c <= "9") + n = n * 16 + (c - "0"); + else + n = n * 16 + tohex[c]; + } + return n; +} +# Return j1 - 32 for the Shift_JIS CODE; the converted bytes are left in the globals j1 and j2, which the rule below prints. +function sjis_to_jis_ku(code) +{ + s1 = int(code / 256); + s2 = code % 256; + if (s2 >= 159) # s2 >= 0x9F + { + if (s1 >= 224) # s1 >= 0xE0 + j1 = s1 * 2 - 352; # j1 = s1 * 2 - 0x160 + else + j1 = s1 * 2 - 224; # j1 = s1 * 2 - 0xE0 + j2 = s2 - 126 # j2 = s2 - 0x7E + } + else + { + if (s1 >= 224) + j1 = s1 * 2 - 353; # j1 = s1 * 2 - 0x161 + else + j1 = s1 * 2 - 225; # j1 = s1 * 2 - 0xE1 + if (s2 >= 127) # s2 >= 0x7F + j2 = s2 - 32; + else + j2 = s2 - 31; + } + return j1 - 32; +} + +/^0x[89E]/ { + sjis=decode_hex(substr($1, 3, 4)) + ku=sjis_to_jis_ku(sjis); + if (ku == 13) + printf "%s # 1 %02X%02X\n", $0, j1, j2; + else if (ku >= 89 && ku <= 92) + printf "%s # 3 %02X%02X\n", $0, j1, j2; + else + printf "%s # 0 %02X%02X\n", $0, j1, j2; + next; +} + +/^0xF/ { + printf "%s # 2\n", $0; + next; +} + +{ + print; +} + +END { + code = 57344; # 0xE000 + for (i = 240; i < 250; i++) + { + for (j = 64; j <= 126; j++) + printf "0x%02X%02X 0x%04X # 4\n", i, j, code++; + for (j = 128; j <= 158; j++) + printf "0x%02X%02X 0x%04X # 4\n", i, j, code++; + for (; j <= 252; j++) + printf "0x%02X%02X 0x%04X # 4\n", i, j, code++; + } +} + +# arch-tag: 998dc444-759d-43ef-87e3-2ab205011394 diff --git a/admin/charsets/eucjp-ms.awk b/admin/charsets/eucjp-ms.awk new file mode 100644 index 0000000000..051e388e7f --- /dev/null +++ b/admin/charsets/eucjp-ms.awk @@ -0,0 +1,85 @@ +# eucjp-ms.awk -- Generate a translation table for eucJP-ms. 
+# Copyright (C) 2004
+# National Institute of Advanced Industrial Science and Technology (AIST)
+# Registration Number H13PRO009
+#
+# This file is part of GNU Emacs.
+#
+# GNU Emacs is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# GNU Emacs is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Emacs; see the file COPYING. If not, write to the
+# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# Comment:
+
+# eucJP-ms is one of the eucJP-open encodings defined at this page:
+# http://www.opengroup.or.jp/jvc/cde/appendix.html
+
+# Emit the opening of the generated Lisp file and set up the hex digit
+# table used by decode_hex.
+BEGIN {
+  print ";;; eucjp-ms.el -- translation table for eucJP-ms. -*- no-byte-compile: t -*-";
+  print ";;; Automatically generated from eucJP-13th.txt, eucJP-udc.txt, eucJP-ibmext.txt";
+  print "(let ((map";
+  printf " '(;JISEXT<->UNICODE";
+
+  tohex["A"] = 10;
+  tohex["B"] = 11;
+  tohex["C"] = 12;
+  tohex["D"] = 13;
+  tohex["E"] = 14;
+  tohex["F"] = 15;
+}
+
+# Return the numeric value of the hexadecimal string STR (digits 0-9
+# and upper-case A-F; any other character contributes 0).
+# n, len, i and c are locals.
+function decode_hex(str,    n, len, i, c)
+{
+  n = 0;
+  len = length(str);
+  for (i = 1; i <= len; i++)
+    {
+      c = substr(str, i, 1);
+      if (c >= "0" && c <= "9")
+        n = n * 16 + (c - "0");
+      else
+        n = n * 16 + tohex[c];
+    }
+  return n;
+}
+
+# Codes written as 0x8FXXXX: decode the XXXX part and emit a two-element
+# list (CODE UNICODE).  The Lisp emitted in END decodes list entries with
+# japanese-jisx0212.  The pattern is anchored on $1 because the action
+# reads $1; an unanchored match could fire on the Unicode column or on
+# a comment.
+$1 ~ /^0x8F/ {
+  code = decode_hex(substr($1, 5, 4));
+  code -= 32896;                     # code -= 0x8080
+  printf "\n (#x%04x #x%s)", code, substr($2, 3, 4);
+  next;
+}
+
+# Codes written as 0xXXXX with a first digit A-F: emit a dotted pair
+# (CODE . UNICODE).  The Lisp emitted in END decodes pair entries with
+# japanese-jisx0208.
+$1 ~ /^0x[A-F]/ {
+  code = decode_hex(substr($1, 3, 4));
+  code -= 32896;                     # code -= 0x8080
+  printf "\n (#x%04x . #x%s)", code, substr($2, 3, 4);
+}
+
+# Close the list and emit the Lisp that turns it into the decode table,
+# then swaps car and cdr of every entry to build the encode table.
+END {
+  print ")))";
+  print " (mapc #'(lambda (x)";
+  print " (if (integerp (cdr x))";
+  print " (setcar x (decode-char 'japanese-jisx0208 (car x)))";
+  print " (setcar x (decode-char 'japanese-jisx0212 (car x)))";
+  print " (setcdr x (cadr x))))";
+  print " map)";
+  print " (define-translation-table 'eucjp-ms-decode map)";
+  print " (mapc #'(lambda (x)";
+  print " (let ((tmp (car x)))";
+  print " (setcar x (cdr x)) (setcdr x tmp)))";
+  print " map)";
+  print " (define-translation-table 'eucjp-ms-encode map))";
+}
+
+# arch-tag: d9cc7af7-2d6e-48cd-8eed-a6d25226de7c
diff --git a/admin/charsets/gb180302.awk b/admin/charsets/gb180302.awk
new file mode 100644
index 0000000000..5eaf587763
--- /dev/null
+++ b/admin/charsets/gb180302.awk
@@ -0,0 +1,82 @@
+# Compress a sorted list of "0xGGGG 0xUUUU" lines into runs: consecutive
+# lines whose code index and Unicode value both advance by exactly one
+# are merged into a single "0xFROM-0xTO 0xUUUU" line.
+BEGIN {
+  tohex["A"] = 10;
+  tohex["B"] = 11;
+  tohex["C"] = 12;
+  tohex["D"] = 13;
+  tohex["E"] = 14;
+  tohex["F"] = 15;
+  tohex["a"] = 10;
+  tohex["b"] = 11;
+  tohex["c"] = 12;
+  tohex["d"] = 13;
+  tohex["e"] = 14;
+  tohex["f"] = 15;
+  # Current run, as code indices.  from_gb > to_gb means "no run yet".
+  from_gb = 0;
+  to_gb = -1;
+  to_unicode = 0;
+  from_unicode = 0;
+}
+
+# Return the numeric value of the hexadecimal string STR (0-9, A-F,
+# a-f; any other character contributes 0).  n, len, i and c are locals.
+function decode_hex(str,    n, len, i, c)
+{
+  n = 0;
+  len = length(str);
+  for (i = 1; i <= len; i++)
+    {
+      c = substr(str, i, 1);
+      if (c >= "0" && c <= "9")
+        n = n * 16 + (c - "0");
+      else
+        n = n * 16 + tohex[c];
+    }
+  return n;
+}
+
+# Map the two-byte code GB to a linear index: 191 slots per lead byte,
+# counting from lead byte 129 (0x81) and trail byte 64 (0x40).
+function gb_to_index(gb,    b0, b1, idx)
+{
+  b0 = int(gb / 256);
+  b1 = gb % 256;
+  idx = (b0 - 129) * 191 + b1 - 64;
+# if (b1 >= 128)
+# idx--;
+  return idx;
+}
+
+# Inverse of gb_to_index: rebuild the two-byte code from IDX.
+function index_to_gb(idx,    b0, b1)
+{
+  b0 = int(idx / 191) + 129;
+  b1 = (idx % 191) + 64;
+# if (b1 >= 127)
+# b1++;
+  return (b0 * 256 + b1);
+}
+
+# Comment lines are copied through unchanged.
+/^#/ {
+  print;
+  next;
+}
+
+# Data line: either extend the current run, or print the finished run
+# and start a new one at this line.
+{
+  gb = gb_to_index(decode_hex(substr($1, 3, 4)));
+  unicode = decode_hex(substr($2, 3, 4));
+  if ((gb == to_gb + 1) && (unicode == to_unicode + 1))
+    {
+      to_gb++;
+      to_unicode++;
+    }
+  else
+    {
+      # Nothing is printed while no run exists (from_gb > to_gb).
+      if (from_gb == to_gb)
+        printf "0x%04X 0x%04X\n", index_to_gb(from_gb), from_unicode;
+      else if (from_gb < to_gb)
+        printf "0x%04X-0x%04X 0x%04X\n",
+          index_to_gb(from_gb), index_to_gb(to_gb), from_unicode;
+      from_gb = to_gb = gb;
+      from_unicode = to_unicode = unicode;
+    }
+}
+
+# Print the last run.  Note that a final run of length one is also
+# written in range form here, unlike in the rule above.
+END {
+  if (from_gb <= to_gb)
+    printf "0x%04X-0x%04X 0x%04X\n",
+      index_to_gb(from_gb), index_to_gb(to_gb), from_unicode;
+}
+
+# arch-tag: d7dbad89-a512-41a4-8ee0-ba1a4505b8c1
diff --git a/admin/charsets/gb180304.awk b/admin/charsets/gb180304.awk
new file mode 100644
index 0000000000..f3f50db9a8
--- /dev/null
+++ b/admin/charsets/gb180304.awk
@@ -0,0 +1,104 @@
+# Read a two-byte map made of "0xGGGG 0xUUUU" and "0xGGGG-0xGGGG 0xUUUU"
+# lines, remember which Unicode values it covers, and in END assign
+# consecutive four-byte codes to the values from 128 to 65535 that are
+# not covered.
+BEGIN {
+  tohex["A"] = 10;
+  tohex["B"] = 11;
+  tohex["C"] = 12;
+  tohex["D"] = 13;
+  tohex["E"] = 14;
+  tohex["F"] = 15;
+  tohex["a"] = 10;
+  tohex["b"] = 11;
+  tohex["c"] = 12;
+  tohex["d"] = 13;
+  tohex["e"] = 14;
+  tohex["f"] = 15;
+}
+
+# Return the numeric value of the hexadecimal string STR (0-9, A-F,
+# a-f; any other character contributes 0).  n, len, i and c are locals.
+function decode_hex(str,    n, len, i, c)
+{
+  n = 0;
+  len = length(str);
+  for (i = 1; i <= len; i++)
+    {
+      c = substr(str, i, 1);
+      if (c >= "0" && c <= "9")
+        n = n * 16 + (c - "0");
+      else
+        n = n * 16 + tohex[c];
+    }
+  return n;
+}
+
+# Map the two-byte code GB to a linear index: 191 slots per lead byte,
+# counting from lead byte 129 (0x81) and trail byte 64 (0x40).  Used
+# below only to count how many codes a range line covers.
+function gb_to_index(gb,    b0, b1, idx)
+{
+  b0 = int(gb / 256);
+  b1 = gb % 256;
+  idx = (b0 - 129) * 191 + b1 - 64;
+# if (b1 >= 127)
+# idx--;
+  return idx;
+}
+
+# Return the four-byte code for the linear index IDX as eight hex
+# digits.  From last byte to first the bytes take 10, 126, 10 values,
+# starting at 48 (0x30), 129 (0x81), 48 (0x30), and the first byte
+# starts at 129 (0x81).
+function index_to_gb(idx,    b0, b1, b2, b3)
+{
+  b3 = (idx % 10) + 48;
+  idx = int(idx / 10);
+  b2 = (idx % 126) + 129;
+  idx = int(idx / 126);
+  b1 = (idx % 10) + 48;
+  b0 = int(idx / 10) + 129;
+  return sprintf("%02X%02X%02X%02X", b0, b1, b2, b3);
+}
+
+# Comment lines are copied through unchanged.
+/^#/ {
+  print;
+  next;
+}
+
+# Range line: mark every Unicode value covered by the range.
+/0x....-0x..../ {
+  gb_from = gb_to_index(decode_hex(substr($1, 3, 4)));
+  gb_to = gb_to_index(decode_hex(substr($1, 10, 4)));
+  unicode = decode_hex(substr($2, 3, 4));
+  while (gb_from <= gb_to)
+    {
+      table[unicode++] = 1;
+      gb_from++;
+    }
+  next;
+}
+
+# Single-code line: mark its Unicode value.  (The code itself is not
+# needed here.)
+{
+  unicode = decode_hex(substr($2, 3, 4));
+  table[unicode] = 1;
+}
+
+# Walk the code points 128..65535 and print one line per run of
+# unmarked values.  to_gb counts the four-byte indices handed out so
+# far; from_gb is the index where the current run started, or -1 when
+# no run is open.
+END {
+  from_gb = -1;
+  to_gb = 0;
+  from_i = 0;
+  table[65536] = 1;                  # sentinel: closes a run open at 65535
+  for (i = 128; i <= 65536; i++)
+    {
+      if (table[i] == 0)
+        {
+          # Values 55296-57343 (0xD800-0xDFFF) get no code, and they
+          # do not close an open run either.
+          if (i < 55296 || i >= 57344)
+            {
+              if (from_gb < 0)
+                {
+                  from_gb = to_gb;
+                  from_i = i;
+                }
+              to_gb++;
+            }
+        }
+      else if (from_gb >= 0)
+        {
+          if (from_gb + 1 == to_gb)
+            printf "0x%s\t\t0x%04X\n",
+              index_to_gb(from_gb), from_i;
+          else
+            printf "0x%s-0x%s\t0x%04X\n",
+              index_to_gb(from_gb), index_to_gb(to_gb - 1), from_i;
+          from_gb = -1;
+        }
+    }
+}
+
+# arch-tag: 8e5a22ae-610e-411f-ae17-d6e528b30d71
diff --git a/admin/charsets/kuten.awk b/admin/charsets/kuten.awk
new file mode 100644
index 0000000000..9d43f2e0e8
--- /dev/null
+++ b/admin/charsets/kuten.awk
@@ -0,0 +1,7 @@
+# For lines starting with a digit: take characters 3-4 and 5-6 of $1
+# as two decimal numbers, add 32 to each, and print them as two hex
+# bytes followed by $2.  Presumably $1 is "0x" plus a four-digit kuten
+# code as produced by mapconv's YASUOKA conversion -- confirm.
+/^[0-9]/ {
+  ku = substr($1, 3, 2) + 32;
+  ten = substr($1, 5, 2) + 32;
+  printf "0x%02X%02X %s\n", ku, ten, $2;
+}
+
+# arch-tag: dade6b45-b4c5-42ab-9d49-d6bf23a710b6
diff --git a/admin/charsets/mapconv b/admin/charsets/mapconv
new file mode 100755
index 0000000000..641afc037b
--- /dev/null
+++ b/admin/charsets/mapconv
@@ -0,0 +1,143 @@
+#!/bin/sh
+#
+# Copyright (C) 2003
+# National Institute of Advanced Industrial Science and Technology (AIST)
+# Registration Number H13PRO009
+#
+# This file is part of GNU Emacs.
+#
+# GNU Emacs is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# GNU Emacs is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GNU Emacs; see the file COPYING. If not, write to the
+# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+
+# Comment:
+# Convert a charset map in one of various formats into this form:
+# 0xXX 0xYYYY
+# where
+# XX is a code point of the charset in hexadecimal,
+# YYYY is the corresponding Unicode character code in hexadecimal.
+# Arguments are:
+# $1: source map file
+# $2: address pattern for sed (optionally with substitution command)
+# $3: format of source map file
+# GLIBC-1 GLIBC-2 GLIBC-2-7 CZYBORRA IANA UNICODE UNICODE2 YASUOKA
+# MICROSOFT KANJI-DATABASE
+# $4: awk script (optional; when omitted the converted lines are
+# passed through unchanged)
+#
+# The result is written to standard output; diagnostics go to standard
+# error so that they do not end up inside a generated map file.
+
+BASE=`basename "$1"`
+
+# Describe where the source map comes from, for the header line.
+case "$3" in
+    GLIBC*)
+        SOURCE="glibc-2.3.2/localedata/charmaps/${BASE}";;
+    CZYBORRA)
+        SOURCE="http://czyborra.com/charsets/${BASE}";;
+    IANA)
+        SOURCE="http://www.iana.org/assignments/charset-reg/${BASE}";;
+    UNICODE)
+        SOURCE="http://www.unicode.org/Public/MAPPINGS/.../${BASE}";;
+    UNICODE2)
+        SOURCE="http://www.unicode.org/Public/MAPPINGS/.../${BASE}";;
+    YASUOKA)
+        SOURCE="http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/.../${BASE}";;
+    MICROSOFT)
+        SOURCE="http://www.microsoft.com/globaldev/reference/oem/${BASE}";;
+    KANJI-DATABASE)
+        SOURCE="data at http://sourceforge.net/cvs/?group_id=26261";;
+    *)
+        echo "Unknown file type: $3" >&2
+        exit 1;;
+esac
+
+echo "# Generated from $SOURCE"
+
+# AWKPROG is the last filter of each pipeline below.  It is expanded
+# unquoted on purpose so that "gawk -f FILE" splits into words; as a
+# consequence $4 must not contain whitespace.
+if [ -n "$4" ] ; then
+    if [ -f "$4" ] ; then
+        AWKPROG="gawk -f $4"
+    else
+        echo "Awk program does not exist: $4" >&2
+        exit 1
+    fi
+else
+    AWKPROG=cat
+fi
+
+# Select lines with the sed address $2, rewrite them to "0xXX 0xYYYY",
+# sort, and run the optional awk script.  A `case' is used instead of
+# `[ ... == ... ]' because `==' is not a POSIX test operator.
+case "$3" in
+    GLIBC-1)
+        # Source format is:
+        # <UYYYY> /xXX
+        sed -n -e "$2 p" < "$1" \
+            | sed -e 's,<U\([^>]*\)>[ ]*/x\(..\).*,0x\2 0x\1,' \
+            | sort | ${AWKPROG}
+        ;;
+    GLIBC-2)
+        # Source format is:
+        # <UYYYY> /xXX/xZZ
+        sed -n -e "$2 p" < "$1" \
+            | sed -e 's,<U\([^>]*\)>[ ]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
+            | sort | ${AWKPROG}
+        ;;
+    GLIBC-2-7)
+        # Source format is:
+        # <UYYYY> /xXX/xZZ
+        # We must drop MSBs of XX and ZZ
+        sed -n -e "$2 p" < "$1" \
+            | sed -e 's/xa/x2/g' -e 's/xb/x3/g' -e 's/xc/x4/g' \
+                  -e 's/xd/x5/g' -e 's/xe/x6/g' -e 's/xf/x7/g' \
+                  -e 's,<U\([^>]*\)>[ ]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
+            | sort | ${AWKPROG}
+        ;;
+    CZYBORRA)
+        # Source format is:
+        # =XX U+YYYY
+        # (the source file is read through zcat)
+        zcat "$1" | sed -n -e "$2 p" \
+            | sed -e 's/=\(..\)[^U]*U+\([0-9A-F]*\).*/0x\1 0x\2/' \
+            | sort | ${AWKPROG}
+        ;;
+    IANA)
+        # Source format is:
+        # 0xXX 0xYYYY
+        sed -n -e "$2 p" < "$1" \
+            | sed -e 's/\(0x[0-9A-Fa-f]*\)[^0]*\(0x[0-9A-Fa-f]*\).*/\1 \2/' \
+            | sort | ${AWKPROG}
+        ;;
+    UNICODE)
+        # Source format is:
+        # YYYY XX
+        sed -n -e "$2 p" < "$1" \
+            | sed -e 's/\([0-9A-F]*\)[^0-9A-F]*\([0-9A-F]*\).*/0x\2 0x\1/' \
+            | sort | ${AWKPROG}
+        ;;
+    UNICODE2)
+        # Source format is:
+        # 0xXXXX 0xYYYY # ...
+        # Here the awk script runs first and its output is sorted
+        # numerically on the fourth field.
+        sed -n -e "$2 p" < "$1" \
+            | sed -e 's/\([0-9A-Fx]*\)[^0]*\([0-9A-Fx]*\).*/\1 \2/' \
+            | ${AWKPROG} | sort -n -k 4,4
+        ;;
+    YASUOKA)
+        # Source format is:
+        # YYYY 0-XXXX (XXXX is a Kuten code)
+        sed -n -e "$2 p" < "$1" \
+            | sed -e 's/\([0-9A-F]*\)[^0]*0-\([0-9]*\).*/0x\2 0x\1/' \
+            | sort | ${AWKPROG}
+        ;;
+    MICROSOFT)
+        # Source format is:
+        # XX = U+YYYY
+        sed -n -e "$2 p" < "$1" \
+            | sed -e 's/\([0-9A-F]*\).*U+\([0-9A-F]*\).*/0x\1 0x\2/' \
+            | sort | ${AWKPROG}
+        ;;
+    KANJI-DATABASE)
+        # Source format is:
+        # C?-XXXX U+YYYYY .....
+        sed -n -e "$2 p" < "$1" \
+            | sed -e 's/...\(....\) U+\([0-9A-F]*\).*/0x\1 0x\2/' \
+            | sort | ${AWKPROG}
+        ;;
+    *)
+        # Reached for a GLIBC* format other than the three handled above.
+        echo "Invalid arguments" >&2
+        exit 1
+        ;;
+esac
+
+# arch-tag: c33acb47-7eb6-4872-b871-15e1447e8f0e
diff --git a/admin/charsets/mule-charsets.el b/admin/charsets/mule-charsets.el
new file mode 100644
index 0000000000..158121a2cf
--- /dev/null
+++ b/admin/charsets/mule-charsets.el
@@ -0,0 +1,60 @@
+;; mule-charsets.el -- Generate Mule-original charset maps.
+;; Copyright (C) 2003
+;; National Institute of Advanced Industrial Science and Technology (AIST)
+;; Registration Number H13PRO009
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs; see the file COPYING. If not, write to the
+;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+;; Boston, MA 02111-1307, USA.
+
+;; Refuse to run on anything older than 21.3.50.  The minor-version and
+;; version-string tests apply only when the major version is exactly
+;; 21; otherwise a later release such as 22.1 (minor version 1) would
+;; be rejected.
+(if (or (< emacs-major-version 21)
+        (and (= emacs-major-version 21)
+             (or (< emacs-minor-version 3)
+                 (and (= emacs-minor-version 3)
+                      (string< emacs-version "21.3.50")))))
+    (error "Use Emacs of version 21.3.50 or later"))
+
+(defun func (start end)
+  "Insert one map line for each character from START through END.
+A line has the form \"0xCODE 0xUNICODE\", where CODE is built from the
+one or two code-point elements returned by `split-char'.  Characters
+for which `encode-char' with `ucs' returns nil are skipped.
+Passed to `map-charset-chars' below."
+  (while (<= start end)
+    (let ((split (split-char start))
+          (unicode (encode-char start 'ucs)))
+      (cond ((null unicode))            ; no Unicode equivalent: skip
+            ((nth 2 split)
+             (insert (format "0x%02X%02X 0x%04X\n"
+                             (nth 1 split) (nth 2 split) unicode)))
+            (t
+             (insert (format "0x%02X 0x%04X\n" (nth 1 split) unicode)))))
+    (setq start (1+ start))))
+
+(defconst charset-alist
+  '(("MULE-ethiopic.map" . ethiopic)
+    ("MULE-ipa.map" . ipa)
+    ("MULE-is13194.map" . indian-is13194)
+    ("MULE-sisheng.map" . chinese-sisheng)
+    ("MULE-tibetan.map" . tibetan)
+    ("MULE-lviscii.map" . vietnamese-viscii-lower)
+    ("MULE-uviscii.map" . vietnamese-viscii-upper))
+  "Alist mapping an output map file name to the charset it is made from.")
+
+;; The output file name is the first remaining command-line argument;
+;; it also selects the charset through `charset-alist'.
+(setq file (car command-line-args-left))
+(or (stringp file)
+    (error "Invalid file name: %s" file))
+(setq charset (cdr (assoc file charset-alist)))
+(or charset
+    (error "Invalid charset: %s" (car command-line-args-left)))
+
+;; Collect the map lines in a temporary buffer and write it to FILE.
+(with-temp-buffer
+  (map-charset-chars 'func charset)
+  (write-file file))
+
+;;; arch-tag: 515989d7-2e2d-41cc-9163-05ad472fede4
|