(utf-8-subst-table)

(utf-8-subst-rev-table, utf-8-translation-table-for-decode) (utf-8-fragment-on-decoding, ccl-untranslated-to-ucs) (utf-8-ccl-regs, utf-8-translate-cjk): New. (ccl-encode-mule-utf-8): Use utf-8-subst-rev-table. (ccl-decode-mule-utf-8, ccl-untranslated-to-ucs) (utf-8-untranslated-to-ucs, utf-8-compose): Rewritten. (mule-utf-8): Remove pre-write-conversion. (utf-8-post-read-conversion): Comment out.
author: Dave Love <[email protected]> 2002-07-17 15:04:25 +0000
committer: Dave Love <[email protected]> 2002-07-17 15:04:25 +0000
commit: 9ca2ac2dbd525c58754d1cb4d8db98cc6e65f505 (patch)
tree: d58351e721a9e0160ba884fc0c5a3be858708c84 /lisp/international/utf-8.el
parent: f9bd23fdb89db3c6abbae24e332a300cbf1cf5ac (diff)
1 files changed, 402 insertions, 189 deletions
diff --git a/lisp/international/utf-8.el b/lisp/international/utf-8.el
index 068a7bbeaa..b3f6390322 100644
--- a/lisp/international/utf-8.el
+++ b/lisp/international/utf-8.el
@@ -1,10 +1,11 @@
-;;; utf-8.el --- limited UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
+;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
 
 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
 ;; Licensed to the Free Software Foundation.
-;; Copyright (C) 2001 Free Software Foundation, Inc.
+;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
 
 ;; Author: TAKAHASHI Naoto  <[email protected]>
+;; Maintainer: FSF
 ;; Keywords: multilingual, Unicode, UTF-8, i18n
 
 ;; This file is part of GNU Emacs.
@@ -39,11 +40,18 @@
 ;; On decoding, Unicode characters that do not fit into the above
 ;; character sets are handled as `eight-bit-control' or
 ;; `eight-bit-graphic' characters to retain the information about the
-;; original byte sequence.
+;; original byte sequence and text properties record the corresponding
+;; unicode.
+;;
+;; Fixme: note that reading and writing invalid utf-8 may not be
+;; idempotent -- fixing that needs a new charset to represent the bytes.
 ;;
 ;; Characters from other character sets can be encoded with
 ;; mule-utf-8 by populating the table `ucs-mule-to-mule-unicode' and
-;; registering the translation with `register-char-codings'.
+;; registering the translation with `register-char-codings'.  Hash
+;; tables `utf-8-subst-table' and `utf-8-subst-rev-table' are used to
+;; support encoding and decoding of about a quarter of the CJK space
+;; between U+3400 and U+DFFF.
 
 ;; UTF-8 is defined in RFC 2279.  A sketch of the encoding is:
 
@@ -60,7 +68,111 @@
   "Translation table for encoding to `mule-utf-8'.")
 ;; Could have been done by ucs-tables loaded before.
 (unless (get 'ucs-mule-to-mule-unicode 'translation-table)
-  (define-translation-table 'ucs-mule-to-mule-unicode ucs-mule-to-mule-unicode))
+  (define-translation-table 'ucs-mule-to-mule-unicode
+    ucs-mule-to-mule-unicode))
+
+(defvar utf-8-subst-table (make-hash-table :test 'eq))
+(defvar utf-8-subst-rev-table (make-hash-table :test 'eq))
+(define-translation-hash-table 'utf-8-subst-table utf-8-subst-table)
+(define-translation-hash-table 'utf-8-subst-rev-table utf-8-subst-rev-table)
+
+(defvar utf-8-translation-table-for-decode (make-translation-table)
+  "Translation table applied after decoding utf-8 to mule-unicode.
+This is only actually applied to characters which would normally be
+decoded into mule-unicode-0100-24ff.")
+(define-translation-table 'utf-8-translation-table-for-decode
+  utf-8-translation-table-for-decode)
+
+;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
+;; space of mule-unicode.  For Latin scripts this isn't very
+;; important.  Hebrew and Arabic might go here too when there's proper
+;; support for them.
+(mapc
+ (lambda (pair)
+   (aset utf-8-translation-table-for-decode (car pair) (cdr pair)))
+ '((?$,1&d(B . ?,F4(B) (?$,1&e(B . ?,F5(B) (?$,1&f(B . ?,F6(B) (?$,1&h(B . ?,F8(B) (?$,1&i(B . ?,F9(B)
+   (?$,1&j(B . ?,F:(B) (?$,1&l(B . ?,F<(B) (?$,1&n(B . ?,F>(B) (?$,1&o(B . ?,F?(B) (?$,1&p(B . ?,F@(B)
+   (?$,1&q(B . ?,FA(B) (?$,1&r(B . ?,FB(B) (?$,1&s(B . ?,FC(B) (?$,1&t(B . ?,FD(B) (?$,1&u(B . ?,FE(B)
+   (?$,1&v(B . ?,FF(B) (?$,1&w(B . ?,FG(B) (?$,1&x(B . ?,FH(B) (?$,1&y(B . ?,FI(B) (?$,1&z(B . ?,FJ(B)
+   (?$,1&{(B . ?,FK(B) (?$,1&|(B . ?,FL(B) (?$,1&}(B . ?,FM(B) (?$,1&~(B . ?,FN(B) (?$,1&(B . ?,FO(B)
+   (?$,1' (B . ?,FP(B) (?$,1'!(B . ?,FQ(B) (?$,1'#(B . ?,FS(B) (?$,1'$(B . ?,FT(B) (?$,1'%(B . ?,FU(B)
+   (?$,1'&(B . ?,FV(B) (?$,1''(B . ?,FW(B) (?$,1'((B . ?,FX(B) (?$,1')(B . ?,FY(B) (?$,1'*(B . ?,FZ(B)
+   (?$,1'+(B . ?,F[(B) (?$,1',(B . ?,F\(B) (?$,1'-(B . ?,F](B) (?$,1'.(B . ?,F^(B) (?$,1'/(B . ?,F_(B)
+   (?$,1'0(B . ?,F`(B) (?$,1'1(B . ?,Fa(B) (?$,1'2(B . ?,Fb(B) (?$,1'3(B . ?,Fc(B) (?$,1'4(B . ?,Fd(B)
+   (?$,1'5(B . ?,Fe(B) (?$,1'6(B . ?,Ff(B) (?$,1'7(B . ?,Fg(B) (?$,1'8(B . ?,Fh(B) (?$,1'9(B . ?,Fi(B)
+   (?$,1':(B . ?,Fj(B) (?$,1';(B . ?,Fk(B) (?$,1'<(B . ?,Fl(B) (?$,1'=(B . ?,Fm(B) (?$,1'>(B . ?,Fn(B)
+   (?$,1'?(B . ?,Fo(B) (?$,1'@(B . ?,Fp(B) (?$,1'A(B . ?,Fq(B) (?$,1'B(B . ?,Fr(B) (?$,1'C(B . ?,Fs(B)
+   (?$,1'D(B . ?,Ft(B) (?$,1'E(B . ?,Fu(B) (?$,1'F(B . ?,Fv(B) (?$,1'G(B . ?,Fw(B) (?$,1'H(B . ?,Fx(B)
+   (?$,1'I(B . ?,Fy(B) (?$,1'J(B . ?,Fz(B) (?$,1'K(B . ?,F{(B) (?$,1'L(B . ?,F|(B) (?$,1'M(B . ?,F}(B)
+   (?$,1'N(B . ?,F~(B)
+
+   (?$,1(!(B . ?,L!(B) (?$,1("(B . ?,L"(B) (?$,1(#(B . ?,L#(B) (?$,1($(B . ?,L$(B)
+   (?$,1(%(B . ?,L%(B) (?$,1(&(B . ?,L&(B) (?$,1('(B . ?,L'(B) (?$,1(((B . ?,L((B) (?$,1()(B . ?,L)(B)
+   (?$,1(*(B . ?,L*(B) (?$,1(+(B . ?,L+(B) (?$,1(,(B . ?,L,(B) (?$,1(.(B . ?,L.(B) (?$,1(/(B . ?,L/(B)
+   (?$,1(0(B . ?,L0(B) (?$,1(1(B . ?,L1(B) (?$,1(2(B . ?,L2(B) (?$,1(3(B . ?,L3(B) (?$,1(4(B . ?,L4(B)
+   (?$,1(5(B . ?,L5(B) (?$,1(6(B . ?,L6(B) (?$,1(7(B . ?,L7(B) (?$,1(8(B . ?,L8(B) (?$,1(9(B . ?,L9(B)
+   (?$,1(:(B . ?,L:(B) (?$,1(;(B . ?,L;(B) (?$,1(<(B . ?,L<(B) (?$,1(=(B . ?,L=(B) (?$,1(>(B . ?,L>(B)
+   (?$,1(?(B . ?,L?(B) (?$,1(@(B . ?,L@(B) (?$,1(A(B . ?,LA(B) (?$,1(B(B . ?,LB(B) (?$,1(C(B . ?,LC(B)
+   (?$,1(D(B . ?,LD(B) (?$,1(E(B . ?,LE(B) (?$,1(F(B . ?,LF(B) (?$,1(G(B . ?,LG(B) (?$,1(H(B . ?,LH(B)
+   (?$,1(I(B . ?,LI(B) (?$,1(J(B . ?,LJ(B) (?$,1(K(B . ?,LK(B) (?$,1(L(B . ?,LL(B) (?$,1(M(B . ?,LM(B)
+   (?$,1(N(B . ?,LN(B) (?$,1(O(B . ?,LO(B) (?$,1(P(B . ?,LP(B) (?$,1(Q(B . ?,LQ(B) (?$,1(R(B . ?,LR(B)
+   (?$,1(S(B . ?,LS(B) (?$,1(T(B . ?,LT(B) (?$,1(U(B . ?,LU(B) (?$,1(V(B . ?,LV(B) (?$,1(W(B . ?,LW(B)
+   (?$,1(X(B . ?,LX(B) (?$,1(Y(B . ?,LY(B) (?$,1(Z(B . ?,LZ(B) (?$,1([(B . ?,L[(B) (?$,1(\(B . ?,L\(B)
+   (?$,1(](B . ?,L](B) (?$,1(^(B . ?,L^(B) (?$,1(_(B . ?,L_(B) (?$,1(`(B . ?,L`(B) (?$,1(a(B . ?,La(B)
+   (?$,1(b(B . ?,Lb(B) (?$,1(c(B . ?,Lc(B) (?$,1(d(B . ?,Ld(B) (?$,1(e(B . ?,Le(B) (?$,1(f(B . ?,Lf(B)
+   (?$,1(g(B . ?,Lg(B) (?$,1(h(B . ?,Lh(B) (?$,1(i(B . ?,Li(B) (?$,1(j(B . ?,Lj(B) (?$,1(k(B . ?,Lk(B)
+   (?$,1(l(B . ?,Ll(B) (?$,1(m(B . ?,Lm(B) (?$,1(n(B . ?,Ln(B) (?$,1(o(B . ?,Lo(B) (?$,1(q(B . ?,Lq(B)
+   (?$,1(r(B . ?,Lr(B) (?$,1(s(B . ?,Ls(B) (?$,1(t(B . ?,Lt(B) (?$,1(u(B . ?,Lu(B) (?$,1(v(B . ?,Lv(B)
+   (?$,1(w(B . ?,Lw(B) (?$,1(x(B . ?,Lx(B) (?$,1(y(B . ?,Ly(B) (?$,1(z(B . ?,Lz(B) (?$,1({(B . ?,L{(B)
+   (?$,1(|(B . ?,L|(B) (?$,1(~(B . ?,L~(B) (?$,1((B . ?,L(B)))
+
+(defcustom utf-8-fragment-on-decoding nil
+  "Whether or not to decode some scripts in UTF-8 text into 8-bit characters.
+Setting this means that the relevant Cyrillic and Greek characters are
+decoded into the iso8859 charsets rather than into
+mule-unicode-0100-24ff.  The 8-bit characters take half as much space
+in the buffer, but using them may affect how the buffer can be re-encoded
+and may require a different input method to search for them, for instance.
+See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
+for mechanisms to make this largely transparent."
+  :set (lambda (s v)
+	 (if v
+	     (define-translation-table 'utf-8-translation-table-for-decode
+	       utf-8-translation-table-for-decode)
+	   (define-translation-table 'utf-8-translation-table-for-decode))
+	 (set-default s v))
+  :version "21.4"
+  :type 'boolean
+  :group 'mule)
+
+(defcustom utf-8-translate-cjk nil
+  "Whether the `mule-utf-8' coding system should encode many CJK characters.
+
+Enabling this loads tables which enable the coding system to encode
+characters in the charsets `korean-ksc5601', `chinese-gb2312' and
+`japanese-jisx0208', and to decode the corresponding unicodes into
+such characters.  This works by loading the library `utf-8-subst'; see
+its commentary.  The tables are fairly large (about 33000 entries), so this
+option is not the default."
+  :link '(emacs-commentary-link "utf-8-subst")
+  :set (lambda (s v)
+	 (when v
+	   (require 'utf-8-subst)
+	   (let ((table (make-char-table 'translation-table)))
+	     (coding-system-put 'mule-utf-8 'safe-charsets
+				(append (coding-system-get 'mule-utf-8
+							   'safe-charsets)
+					'(korean-ksc5601 chinese-gb2312
+							 japanese-jisx0208)))
+	     (maphash (lambda (k v)
+			(aset table k v))
+		      utf-8-subst-rev-table)
+	     (register-char-codings 'mule-utf-8 table)))
+	 (set-default s v))
+  :version "21.4"
+  :type 'boolean
+  :group 'mule)
+
 (define-ccl-program ccl-decode-mule-utf-8
   ;;
   ;;        charset         | bytes in utf-8 | bytes in emacs
@@ -90,66 +202,16 @@
       ;; 1byte encoding, i.e., ascii
       (if (r0 < #x80)
 	  (write r0)
-
-	;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
-	(if (r0 < #xe0)
-	    ((read r1)
-
-	     (if ((r1 & #b11000000) != #b10000000)
-		 ;; Invalid 2-byte sequence
-		 ((if (r0 < #xa0)
-		      (write-multibyte-character r5 r0)
-		    (write-multibyte-character r6 r0))
-		  (if (r1 < #x80)
-		      (write r1)
-		    (if (r1 < #xa0)
-			(write-multibyte-character r5 r1)
-		      (write-multibyte-character r6 r1))))
-
-	       ((r0 &= #x1f)
-		(r0 <<= 6)
-		(r1 &= #x3f)
-		(r1 += r0)
-		;; Now r1 holds scalar value
-
-		;; eight-bit-control
-		(if (r1 < 160)
-		    ((write-multibyte-character r5 r1))
-
-		  ;; latin-iso8859-1
-		  (if (r1 < 256)
-		      ((r0 = ,(charset-id 'latin-iso8859-1))
-		       (r1 -= 128)
-		       (write-multibyte-character r0 r1))
-
-		    ;; mule-unicode-0100-24ff (< 0800)
-		    ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-		     (r1 -= #x0100)
-		     (r2 = (((r1 / 96) + 32) << 7))
-		     (r1 %= 96)
-		     (r1 += (r2 + 32))
-		     (write-multibyte-character r0 r1)))))))
-
-	  ;; 3byte encoding
-	  ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
-	  (if (r0 < #xf0)
-	      ((read r1 r2)
-
-	       ;; This is set to 1 if the encoding is invalid.
-	       (r4 = 0)
-
-	       (r3 = (r1 & #b11000000))
-	       (r3 |= ((r2 >> 2) & #b00110000))
-	       (if (r3 != #b10100000)
-		   (r4 = 1)
-		 ((r3 = ((r0 & #x0f) << 12))
-		  (r3 += ((r1 & #x3f) << 6))
-		  (r3 += (r2 & #x3f))
-		  (if (r3 < #x0800)
-		      (r4 = 1))))
-
-	       (if (r4 != 0)
-		   ;; Invalid 3-byte sequence
+	(if (r0 < #xc0)		    ; continuation byte (invalid here)
+	    (if (r0 < #xa0)
+		(write-multibyte-character r5 r0)
+	      (write-multibyte-character r6 r0))
+	  ;; 2 byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
+	  (if (r0 < #xe0)
+	      ((read r1)
+
+	       (if ((r1 & #b11000000) != #b10000000)
+		   ;; Invalid 2-byte sequence
 		   ((if (r0 < #xa0)
 			(write-multibyte-character r5 r0)
 		      (write-multibyte-character r6 r0))
@@ -157,75 +219,195 @@
 			(write r1)
 		      (if (r1 < #xa0)
 			  (write-multibyte-character r5 r1)
-			(write-multibyte-character r6 r1)))
-		    (if (r2 < #x80)
-			(write r2)
-		      (if (r2 < #xa0)
-			  (write-multibyte-character r5 r2)
-			(write-multibyte-character r6 r2))))
+			(write-multibyte-character r6 r1))))
+
+		 ((r3 = r0)	   ; save in case of overlong sequence
+		  (r2 = r1)
+		  (r0 &= #x1f)
+		  (r0 <<= 6)
+		  (r2 = r1)	   ; save in case of overlong sequence
+		  (r1 &= #x3f)
+		  (r1 += r0)
+		  ;; Now r1 holds scalar value
+
+		  (if (r1 < 128)	; `overlong sequence'
+		      ((if (r3 < #xa0)
+			   (write-multibyte-character r5 r3)
+			 (write-multibyte-character r6 r3))
+		       (if (r2 < #x80)
+			   (write r2)
+			 (if (r2 < #xa0)
+			     (write-multibyte-character r5 r2)
+			   (write-multibyte-character r6 r2))))
+
+		    ;; eight-bit-control
+		    (if (r1 < 160)
+			((write-multibyte-character r5 r1))
+
+		      ;; latin-iso8859-1
+		      (if (r1 < 256)
+			  ((r0 = ,(charset-id 'latin-iso8859-1))
+			   (r1 -= 128)
+			   (write-multibyte-character r0 r1))
+
+			;; mule-unicode-0100-24ff (< 0800)
+			((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+			 (r1 -= #x0100)
+			 (r2 = (((r1 / 96) + 32) << 7))
+			 (r1 %= 96)
+			 (r1 += (r2 + 32))
+			 (translate-character
+			  utf-8-translation-table-for-decode r0 r1)
+			 (write-multibyte-character r0 r1))))))))
+
+	    ;; 3byte encoding
+	    ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
+	    (if (r0 < #xf0)
+		((read r1 r2)
+
+		 ;; This is set to 1 if the encoding is invalid.
+		 (r4 = 0)
+
+		 (r3 = (r1 & #b11000000))
+		 (r3 |= ((r2 >> 2) & #b00110000))
+		 (if (r3 != #b10100000)
+		     (r4 = 1)
+		   ((r3 = ((r0 & #x0f) << 12))
+		    (r3 += ((r1 & #x3f) << 6))
+		    (r3 += (r2 & #x3f))
+		    (if (r3 < #x0800)
+			(r4 = 1))))
+
+		 (if (r4 != 0)
+		     ;; Invalid 3-byte sequence
+		     ((if (r0 < #xa0)
+			  (write-multibyte-character r5 r0)
+			(write-multibyte-character r6 r0))
+		      (if (r1 < #x80)
+			  (write r1)
+			(if (r1 < #xa0)
+			    (write-multibyte-character r5 r1)
+			  (write-multibyte-character r6 r1)))
+		      (if (r2 < #x80)
+			  (write r2)
+			(if (r2 < #xa0)
+			    (write-multibyte-character r5 r2)
+			  (write-multibyte-character r6 r2))))
 		 
-		 ;; mule-unicode-0100-24ff (>= 0800)
-		 ((if (r3 < #x2500)
-		      ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
-		       (r3 -= #x0100)
-		       (r3 //= 96)
-		       (r1 = (r7 + 32))
-		       (r1 += ((r3 + 32) << 7))
-		       (write-multibyte-character r0 r1))
-		    
-		    ;; mule-unicode-2500-33ff
-		    (if (r3 < #x3400)
-			((r0 = ,(charset-id 'mule-unicode-2500-33ff))
-			 (r3 -= #x2500)
+		   ;; mule-unicode-0100-24ff (>= 0800)
+		   ((if (r3 < #x2500)
+			((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+			 (r3 -= #x0100)
 			 (r3 //= 96)
 			 (r1 = (r7 + 32))
 			 (r1 += ((r3 + 32) << 7))
+			 (translate-character
+			  utf-8-translation-table-for-decode r0 r1)
 			 (write-multibyte-character r0 r1))
-		      
-		      ;; U+3400 .. U+DFFF
-		    ;; keep those bytes as eight-bit-{control|graphic}
-		      (if (r3 < #xe000)
-			  ( ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
-			   (r3 = r6)
-			   (write-multibyte-character r3 r0)
-			   (if (r1 < #xa0)
-			       (r3 = r5))
-			   (write-multibyte-character r3 r1)
-			   (if (r2 < #xa0)
-			       (r3 = r5)
-			     (r3 = r6))
-			   (write-multibyte-character r3 r2))
+		    
+		      ;; mule-unicode-2500-33ff
+		      ;; Fixme: Perhaps allow translation via
+		      ;; utf-8-subst-table for #x2e80 up, so that we use
+		      ;; consistent charsets for all of CJK.  Would need
+		      ;; corresponding change to encoding tables.
+		      (if (r3 < #x3400)
+			  ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
+			   (r3 -= #x2500)
+			   (r3 //= 96)
+			   (r1 = (r7 + 32))
+			   (r1 += ((r3 + 32) << 7))
+			   (write-multibyte-character r0 r1))
+
+			;; U+3400 .. U+D7FF
+			;; Try to convert to CJK chars, else keep
+			;; them as eight-bit-{control|graphic}.
+			(if (r3 < #xd800)
+			    ((r4 = r3)	; don't zap r3
+			     (lookup-integer utf-8-subst-table r4 r5)
+			     (if r7
+				 ;; got a translation
+				 ((write-multibyte-character r4 r5)
+				  ;; Zapped through register starvation.
+				  (r5 = ,(charset-id 'eight-bit-control)))
+			       ;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
+			       ((r3 = r6)
+				(write-multibyte-character r3 r0)
+				(if (r1 < #xa0)
+				    (r3 = r5))
+				(write-multibyte-character r3 r1)
+				(if (r2 < #xa0)
+				    (r3 = r5)
+				  (r3 = r6))
+				(write-multibyte-character r3 r2))))
+
+			  ;; Surrogates, U+D800 .. U+DFFF
+			  ;; Fixme: process them properly.
+			  (if (r3 < #xe000)
+			      ((r3 = r6)
+			       (write-multibyte-character r3 r0) ; eight-bit-graphic
+			       (if (r1 < #xa0)
+				   (r3 = r5))
+			       (write-multibyte-character r3 r1)
+			       (if (r2 < #xa0)
+				   (r3 = r5)
+				 (r3 = r6))
+			       (write-multibyte-character r3 r2))
 			
-			;; mule-unicode-e000-ffff
-			((r0 = ,(charset-id 'mule-unicode-e000-ffff))
-			 (r3 -= #xe000)
-			 (r3 //= 96)
-			 (r1 = (r7 + 32))
-			 (r1 += ((r3 + 32) << 7))
-			 (write-multibyte-character r0 r1))))))))
-
-	    ;; 4byte encoding
-	    ;; keep those bytes as eight-bit-{control|graphic}
-	    ((read r1 r2 r3)
-	     ;; r0 > #xf0, thus eight-bit-graphic
-	     (write-multibyte-character r6 r0)
-	     (if (r1 < #xa0)
-		 (write-multibyte-character r5 r1)
-	       (write-multibyte-character r6 r1))
-	     (if (r2 < #xa0)
-		 (write-multibyte-character r5 r2)
-	       (write-multibyte-character r6 r2))
-	     (if (r3 < #xa0)
-		 (write-multibyte-character r5 r3)
-	       (write-multibyte-character r6 r3))))))
-
+			    ;; mule-unicode-e000-ffff
+			    ;; Fixme: fffe and ffff are invalid.
+			    ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
+			     (r3 -= #xe000)
+			     (r3 //= 96)
+			     (r1 = (r7 + 32))
+			     (r1 += ((r3 + 32) << 7))
+			     (write-multibyte-character r0 r1)))))))))
+
+	      (if (r0 < #xfe)
+		  ;; 4byte encoding
+		  ;; keep those bytes as eight-bit-{control|graphic}
+		  ;; Fixme: allow lookup in utf-8-subst-table.
+		  ((read r1 r2 r3)
+		   ;; #xf0 <= r0 < #xfe, thus eight-bit-graphic
+		   (write-multibyte-character r6 r0)
+		   (if (r1 < #xa0)
+		       (if (r1 < #x80)	; invalid byte
+			   (write r1)
+			 (write-multibyte-character r5 r1))
+		     (write-multibyte-character r6 r1))
+		   (if (r2 < #xa0)
+		       (if (r2 < #x80)	; invalid byte
+			   (write r2)
+			 (write-multibyte-character r5 r2))
+		     (write-multibyte-character r6 r2))
+		   (if (r3 < #xa0)
+		       (if (r3 < #x80)	; invalid byte
+			   (write r3)
+			 (write-multibyte-character r5 r3))
+		     (write-multibyte-character r6 r3))
+		   (if (r0 >= #xf8)	; 5- or 6-byte encoding
+		       ((read r1)
+			(if (r1 < #xa0)
+			    (if (r1 < #x80) ; invalid byte
+				(write r1)
+			      (write-multibyte-character r5 r1))
+			  (write-multibyte-character r6 r1))
+			(if (r0 >= #xfc) ; 6-byte
+			    ((read r1)
+			     (if (r1 < #xa0)
+				 (if (r1 < #x80) ; invalid byte
+				     (write r1)
+				   (write-multibyte-character r5 r1))
+			       (write-multibyte-character r6 r1)))))))
+		;; else invalid byte >= #xfe
+		(write-multibyte-character r6 r0))))))
       (repeat))))
 
   "CCL program to decode UTF-8.
 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
-mule-unicode-*.  Encodings of un-representable Unicode characters are
-decoded asis into eight-bit-control and eight-bit-graphic
-characters.")
+mule-unicode-*, but see also `utf-8-translation-table-for-decode' and
+`utf-8-subst-table'.
+Encodings of un-representable Unicode characters are decoded asis into
+eight-bit-control and eight-bit-graphic characters.")
 
 (define-ccl-program ccl-encode-mule-utf-8
   `(1
@@ -288,7 +470,7 @@ characters.")
 	      (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
 		  ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 		   (r1 &= #x7f)
-		   (r1 += (r0 + 57312))	; 57312 == -160 + #xe000
+		   (r1 += (r0 + 57312))	; 57312 == -32 + #xe000
 		   (r0 = (((r1 & #xf000) >> 12) | #xe0))
 		   (r2 = ((r1 & #x3f) | #x80))
 		   (r1 &= #x0fc0)
@@ -329,11 +511,19 @@ characters.")
 				((write #xc2)
 				 (write r1)))))))
 
-		    ;; Unsupported character.
-		    ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
-		    ((write #xef)
-		     (write #xbf)
-		     (write #xbd)))))))))
+		    ((lookup-character utf-8-subst-rev-table r0 r1)
+		     (if r7		; lookup succeeded
+			 ((r1 = (((r0 & #xf000) >> 12) | #xe0))
+			  (r2 = ((r0 & #x3f) | #x80))
+			  (r0 &= #x0fc0)
+			  (r0 >>= 6)
+			  (r0 |= #x80)
+			  (write r1 r0 r2))
+		       ;; Unsupported character.
+		       ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
+		       ((write #xef)
+			(write #xbf)
+			(write #xbd)))))))))))
       (repeat)))
     (if (r1 >= #xa0)
 	(write r1)
@@ -341,69 +531,89 @@ characters.")
 	  ((write #xc2)
 	   (write r1)))))
 
-  "CCL program to encode into UTF-8.
-Only characters from the charsets ascii, eight-bit-control,
-eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
-Others are encoded as U+FFFD.")
+  "CCL program to encode into UTF-8.")
 
 ;; Dummy definition so that the CCL can be checked correctly; the
 ;; actual data are loaded on demand.
 (unless (boundp 'ucs-mule-8859-to-mule-unicode)	; don't zap it
   (define-translation-table 'ucs-mule-8859-to-mule-unicode))
 
+(define-ccl-program ccl-untranslated-to-ucs
+  `(0
+    (if (r0 < #xf0)			; 3-byte encoding, as above
+	((r4 = 0)
+	 (r3 = (r1 & #b11000000))
+	 (r3 |= ((r2 >> 2) & #b00110000))
+	 (if (r3 != #b10100000)
+	     (r4 = 1)
+	   ((r3 = ((r0 & #x0f) << 12))
+	    (r3 += ((r1 & #x3f) << 6))
+	    (r3 += (r2 & #x3f))
+	    (if (r3 < #x0800)
+		(r4 = 1))))
+	 (if (r4 != 0)
+	     (r0 = 0)
+	   (r0 = r3)))
+      (if (r0 < #xf8)			; 4-byte (Mule-UCS recipe)
+	  ((r4 = (r1 >> 6))
+	   (if (r4 != #b10)
+	       (r0 = 0)
+	     ((r4 = (r2 >> 6))
+	      (if (r4 != #b10)
+		  (r0 = 0)
+		((r4 = (r3 >> 6))
+		 (if (r4 != #b10)
+		     (r0 = 0)
+		   ((r1 = ((r1  & #x3F) << 12))
+		    (r2 = ((r2  & #x3F) << 6))
+		    (r3 &= #x3F)
+		    (r0 = (((((r0 & #x07) << 18) | r1) | r2) | r3)))))))))
+	(r0 = 0))))
+  "Decode 3- or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
+r0 == 0 for invalid sequence.")
+
+(defvar utf-8-ccl-regs (make-vector 8 0))
+
 (defsubst utf-8-untranslated-to-ucs ()
-  (let ((b1 (char-after))
-	(b2 (char-after (1+ (point))))
-	(b3 (char-after (+ 2 (point))))
-	(b4 (char-after (+ 4 (point)))))
-    (if (and b1 b2 b3)
-	(cond ((< b1 ?\xf0)
-	       (setq b2 (lsh (logand b2 ?\x3f) 6))
-	       (setq b3 (logand b3 ?\x3f))
-	       (logior b3 (logior b2 (lsh (logand b1 ?\x0f) 12))))
-	      (b4
-	       (setq b2 (lsh (logand b2 ?\x3f) 12))
-	       (setq b3 (lsh (logand b3 ?\x3f) 6))
-	       (setq b4 (logand b4 ?\x3f))
-	       (logior b4 (logior b3 (logior b2 (lsh (logand b1 ?\x07)
-						     18)))))))))
+  "Return the UCS code for an untranslated sequence of raw bytes at point.
+Only for 3- or 4-byte sequences."
+  (aset utf-8-ccl-regs 0 (or (char-after) 0))
+  (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
+  (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
+  (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
+  (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs)
+  (aref utf-8-ccl-regs 0))
 
 (defun utf-8-help-echo (window object position)
   (format "Untranslated Unicode U+%04X"
 	  (get-char-property position 'untranslated-utf-8 object)))
 
-(defvar utf-8-subst-table nil
-  "If non-nil, a hash table mapping `untranslatable utf-8' to Emacs characters.")
-
 ;; We compose the untranslatable sequences into a single character.
 ;; This is infelicitous for editing, because there's currently no
 ;; mechanism for treating compositions as atomic, but is OK for
-;; display.  We try to compose an appropriate character from a hash
-;; table of CJK characters to display correctly.  Otherwise we use
-;; U+FFFD.  What we really should have is hash table lookup from CCL
-;; so that we could do this properly.  This function GCs too much.
+;; display.  They are composed to U+FFFD with help-echo which
+;; indicates the unicodes they represent.  This function GCs too much.
 (defsubst utf-8-compose ()
   "Put a suitable composition on an untranslatable sequence.
 Return the sequence's length."
   (let* ((u (utf-8-untranslated-to-ucs))
-	 (l (and u (if (>= u ?\x10000)
+	 (l (unless (zerop u)
+	      (if (>= u #x10000)
 		       4
-		     3)))
-	 (subst (and utf-8-subst-table (gethash u utf-8-subst-table))))
-    (when u
+		     3))))
+    (when l
       (put-text-property (point) (min (point-max) (+ l (point)))
 			 'untranslated-utf-8 u)
-      (unless subst
-	  (put-text-property (point) (min (point-max) (+ l (point)))
-			     'help-echo 'utf-8-help-echo)
-	  (setq subst ?$,3u=(B))
-      (compose-region (point) (+ l (point)) subst)
+      (put-text-property (point) (min (point-max) (+ l (point)))
+			 'help-echo 'utf-8-help-echo)
+      (compose-region (point) (+ l (point)) ?$,3u=(B)
       l)))
 
 (defcustom utf-8-compose-scripts nil
-  "*Non-nil means compose various scipts on decoding utf-8 text."
+  "*Non-nil means compose various scripts on decoding utf-8 text."
   :group 'mule
-  :type 'boolean)	; omitted in Emacs 21.1
+  :version "21.4"
+  :type 'boolean)
 
 (defun utf-8-post-read-conversion (length)
   "Compose untranslated utf-8 sequences into single characters.
@@ -412,38 +622,39 @@ Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
     ;; Can't do eval-when-compile to insert a multibyte constant
     ;; version of the string in the loop, since it's always loaded as
     ;; unibyte from a byte-compiled file.
-    (let ((range (string-as-multibyte "^\341-\377"))) 
-      (while (and (skip-chars-forward
-		   range)
+    (let ((range (string-as-multibyte "^\xe1-\xf7")))
+      (while (and (skip-chars-forward range)
 		  (not (eobp)))
 	(forward-char (utf-8-compose)))))
-  ;; Fixme: Takahashi-san implies it may not work this easily -- needs
-  ;; checking with him.
+  ;; Fixme: Takahashi-san implies it may not work this easily.  I
+  ;; asked why but didn't get a reply. -- fx
   (when (and utf-8-compose-scripts (> length 1))
     ;; These currently have definitions which cover the relevant
-    ;; Unicodes.  We could avoid loading thai-util &c by checking
+    ;; unicodes.  We could avoid loading thai-util &c by checking
     ;; whether the region contains any characters with the appropriate
     ;; categories.  There aren't yet Unicode-based rules for Tibetan.
     (save-excursion (setq length (diacritic-post-read-conversion length)))
     (save-excursion (setq length (thai-post-read-conversion length)))
     (save-excursion (setq length (lao-post-read-conversion length)))
-    (save-excursion (setq length (devanagari-post-read-conversion length))))
+    (save-excursion
+      (setq length (in-is13194-devanagari-post-read-conversion length))))
   length)
 
-(defun utf-8-pre-write-conversion (beg end)
-  "Semi-dummy pre-write function effectively to autoload ucs-tables."
-  ;; Ensure translation table is loaded.
-  (require 'ucs-tables)
-  ;; Don't do this again.
-  (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
-  nil)
+;; ucs-tables is preloaded
+;; (defun utf-8-pre-write-conversion (beg end)
+;;   "Semi-dummy pre-write function effectively to autoload ucs-tables."
+;;   ;; Ensure translation table is loaded.
+;;   (require 'ucs-tables)
+;;   ;; Don't do this again.
+;;   (coding-system-put 'mule-utf-8 'pre-write-conversion nil)
+;;   nil)
 
 (make-coding-system
  'mule-utf-8 4 ?u
  "UTF-8 encoding for Emacs-supported Unicode characters.
-The supported Emacs character sets are the following, plus others
-which may be included in the translation table
-`ucs-mule-to-mule-unicode':
+The supported Emacs character sets are the following, plus any other
+characters included in the tables `ucs-mule-to-mule-unicode' and
+`utf-8-subst-rev-table':
  ascii
  eight-bit-control
  eight-bit-graphic
@@ -462,10 +673,12 @@ which may be included in the translation table
  mule-unicode-e000-ffff
 
 Unicode characters out of the ranges U+0000-U+33FF and U+E200-U+FFFF
-are decoded into sequences of eight-bit-control and eight-bit-graphic
-characters to preserve their byte sequences and composed to display as
-a single character.  Emacs characters that can't be encoded to these
-ranges are encoded as U+FFFD."
+may be decoded into korean-ksc5601, chinese-gb2312, japanese-jisx0208
+\(see user option `utf-8-translate-cjk'); otherwise, sequences of
+eight-bit-control and eight-bit-graphic characters are used to
+preserve their byte sequences, and these are composed to display as a
+single character.  Emacs characters that otherwise can't be encoded
+are encoded as U+FFFD."
 
  '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
  '((safe-charsets
@@ -497,7 +710,7 @@ ranges are encoded as U+FFFD."
    (mime-charset . utf-8)
    (coding-category . coding-category-utf-8)
    (valid-codes (0 . 255))
-   (pre-write-conversion . utf-8-pre-write-conversion)
+;;    (pre-write-conversion . utf-8-pre-write-conversion)
    (post-read-conversion . utf-8-post-read-conversion)))
 
 (define-coding-system-alias 'utf-8 'mule-utf-8)
author	Dave Love <[email protected]>	2002-07-17 15:04:25 +0000
committer	Dave Love <[email protected]>	2002-07-17 15:04:25 +0000
commit	9ca2ac2dbd525c58754d1cb4d8db98cc6e65f505 (patch)
tree	d58351e721a9e0160ba884fc0c5a3be858708c84 /lisp/international/utf-8.el
parent	f9bd23fdb89db3c6abbae24e332a300cbf1cf5ac (diff)