Warning comments about subtleties of fetching characters from buffers/strings.

src/buffer.h (FETCH_CHAR, FETCH_MULTIBYTE_CHAR): src/character.h (STRING_CHAR, STRING_CHAR_AND_LENGTH): Add comments about subtle differences between FETCH_CHAR* and STRING_CHAR* macros related to unification of CJK characters. For the details, see the discussion following the message here: http://debbugs.gnu.org/cgi/bugreport.cgi?bug=11073#14.
2012-04-06 16:10:30 +03:00 · 2012-04-06 16:10:30 +03:00 · 2f8e16b2a3
commit 2f8e16b2a3
parent ea0ff31442
3 changed files with 34 additions and 4 deletions
--- a/src/ChangeLog
+++ b/src/ChangeLog
@ -1,3 +1,12 @@
+2012-04-06  Eli Zaretskii  <eliz@gnu.org>
+
+	* buffer.h (FETCH_CHAR, FETCH_MULTIBYTE_CHAR):
+	* character.h (STRING_CHAR, STRING_CHAR_AND_LENGTH): Add comments
+	about subtle differences between FETCH_CHAR* and STRING_CHAR*
+	macros related to unification of CJK characters.  For the details,
+	see the discussion following the message here:
+	http://debbugs.gnu.org/cgi/bugreport.cgi?bug=11073#14.
+
 2012-04-04  Chong Yidong  <cyd@gnu.org>

 	* keyboard.c (Vdelayed_warnings_list): Doc fix.
--- a/src/buffer.h
+++ b/src/buffer.h
@ -343,7 +343,8 @@ while (0)
 - (ptr - (current_buffer)->text->beg <= GPT_BYTE - BEG_BYTE ? 0 : GAP_SIZE) \
 + BEG_BYTE)

-/* Return character at byte position POS.  */
+/* Return character at byte position POS.  See the caveat WARNING for
+   FETCH_MULTIBYTE_CHAR below.  */

 #define FETCH_CHAR(pos)				      	\
  (!NILP (BVAR (current_buffer, enable_multibyte_characters))	\
@ -359,7 +360,17 @@ extern unsigned char *_fetch_multibyte_char_p;

 /* Return character code of multi-byte form at byte position POS.  If POS
   doesn't point the head of valid multi-byte form, only the byte at
-   POS is returned.  No range checking.  */
+   POS is returned.  No range checking.
+
+   WARNING: The character returned by this macro could be "unified"
+   inside STRING_CHAR, if the original character in the buffer belongs
+   to one of the Private Use Areas (PUAs) of codepoints that Emacs
+   uses to support non-unified CJK characters.  If that happens,
+   CHAR_BYTES will return a value that is different from the length of
+   the original multibyte sequence stored in the buffer.  Therefore,
+   do _not_ use FETCH_MULTIBYTE_CHAR if you need to advance through
+   the buffer to the next character after fetching this one.  Instead,
+   use either FETCH_CHAR_ADVANCE or STRING_CHAR_AND_LENGTH.  */

 #define FETCH_MULTIBYTE_CHAR(pos)				 	\
  (_fetch_multibyte_char_p = (((pos) >= GPT_BYTE ? GAP_SIZE : 0) 	\
--- a/src/character.h
+++ b/src/character.h
@ -292,7 +292,9 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  } while (0)

 /* Return the character code of character whose multibyte form is at
-   P.  */
+   P.  Note that this macro unifies CJK characters whose codepoints
+   are in the Private Use Areas (PUAs), so it might return a different
+   codepoint from the one actually stored at P.  */

 #define STRING_CHAR(p)						\
  (!((p)[0] & 0x80)						\
@ -309,7 +311,15 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */


 /* Like STRING_CHAR, but set ACTUAL_LEN to the length of multibyte
-   form.  */
+   form.
+
+   Note: This macro returns the actual length of the character's
+   multibyte sequence as it is stored in a buffer or string.  The
+   character it returns might have a different codepoint that has a
+   different multibyte sequence of a different length, due to possible
+   unification of CJK characters inside string_char.  Therefore do NOT
+   assume that the length returned by this macro is identical to the
+   length of the multibyte sequence of the character it returns.  */

 #define STRING_CHAR_AND_LENGTH(p, actual_len)			\
  (!((p)[0] & 0x80)						\