(read_from_string_index_byte): New variable.

(read_from_string_index): Now counts characters. (readchar, unreadchar, Fread_from_string): Changed accordingly. (readchar): Read a multibyte char all at once from a buffer, marker or string. (unreadchar): Unread a multibyte char all at once. (read1): Properly handle non-escaped multibyte chars. They force a string to be multibyte. When reading direct from a file, any multibyte sequence means a multibyte string. Insist on MAX_LENGTH_OF_MULTI_BYTE_FORM bytes when checking for read_buffer full; this way need not check specially for multibyte.
1998-04-23 21:22:51 +00:00 · 1998-04-23 21:22:51 +00:00 · bed23cb24b
commit bed23cb24b
parent d67e2df9c3
1 changed files with 116 additions and 113 deletions
--- a/src/lread.c
+++ b/src/lread.c
@ -131,6 +131,7 @@ static int read_pure;

 /* For use within read-from-string (this reader is non-reentrant!!)  */
 static int read_from_string_index;
+static int read_from_string_index_byte;
 static int read_from_string_limit;

 /* Number of bytes left to read in the buffer character
@ -169,64 +170,59 @@ readchar (readcharfun)
     Lisp_Object readcharfun;
 {
  Lisp_Object tem;
-  register struct buffer *inbuffer;
  register int c, mpos;

  if (BUFFERP (readcharfun))
    {
-      inbuffer = XBUFFER (readcharfun);
+      register struct buffer *inbuffer = XBUFFER (readcharfun);

-      if (readchar_backlog == 0)
+      int pt_byte = BUF_PT_BYTE (inbuffer);
+      int orig_pt_byte = pt_byte;
+
+      if (pt_byte >= BUF_ZV_BYTE (inbuffer))
+	return -1;
+
+      if (! NILP (inbuffer->enable_multibyte_characters))
 	{
-	  int pt_byte = BUF_PT_BYTE (inbuffer);
-	  int orig_pt_byte = pt_byte;
-
-	  if (pt_byte >= BUF_ZV_BYTE (inbuffer))
-	    return -1;
-
-	  if (! NILP (inbuffer->enable_multibyte_characters))
-	    BUF_INC_POS (inbuffer, pt_byte);
-	  else
-	    pt_byte++;
-	  SET_BUF_PT_BOTH (inbuffer, BUF_PT (inbuffer) + 1, pt_byte);
-	  readchar_backlog = pt_byte - orig_pt_byte;
+	  unsigned char *p = BUF_BYTE_ADDRESS (inbuffer, pt_byte);
+	  BUF_INC_POS (inbuffer, pt_byte);
+	  c = STRING_CHAR (p, pt_byte - orig_pt_byte);
 	}
+      else
+	{
+	  c = BUF_FETCH_BYTE (inbuffer, pt_byte);
+	  pt_byte++;
+	}
+      SET_BUF_PT_BOTH (inbuffer, BUF_PT (inbuffer) + 1, pt_byte);

-      /* We get the address of the byte just passed,
-	 which is the last byte of the character.
-	 The other bytes in this character are consecutive with it,
-	 because the gap can't be in the middle of a character.  */
-      return *(BUF_BYTE_ADDRESS (inbuffer, BUF_PT_BYTE (inbuffer) - 1)
-	       - --readchar_backlog);
+      return c;
    }
  if (MARKERP (readcharfun))
    {
-      inbuffer = XMARKER (readcharfun)->buffer;
+      register struct buffer *inbuffer = XMARKER (readcharfun)->buffer;

-      if (readchar_backlog == 0)
+      int bytepos = marker_byte_position (readcharfun);
+      int orig_bytepos = bytepos;
+
+      if (bytepos >= BUF_ZV_BYTE (inbuffer))
+	return -1;
+
+      if (! NILP (inbuffer->enable_multibyte_characters))
 	{
-	  int bytepos = marker_byte_position (readcharfun);
-	  int orig_bytepos = bytepos;
-
-	  if (bytepos >= BUF_ZV_BYTE (inbuffer))
-	    return -1;
-
-	  if (! NILP (inbuffer->enable_multibyte_characters))
-	    INC_POS (bytepos);
-	  else
-	    bytepos++;
-	  XMARKER (readcharfun)->bytepos = bytepos;
-	  XMARKER (readcharfun)->charpos++;
-
-	  readchar_backlog = bytepos - orig_bytepos;
+	  unsigned char *p = BUF_BYTE_ADDRESS (inbuffer, bytepos);
+	  BUF_INC_POS (inbuffer, bytepos);
+	  c = STRING_CHAR (p, bytepos - orig_bytepos);
+	}
+      else
+	{
+	  c = BUF_FETCH_BYTE (inbuffer, bytepos);
+	  bytepos++;
 	}

-      /* We get the address of the byte just passed,
-	 which is the last byte of the character.
-	 The other bytes in this character are consecutive with it,
-	 because the gap can't be in the middle of a character.  */
-      return *(BUF_BYTE_ADDRESS (inbuffer, XMARKER (readcharfun)->bytepos - 1)
-	       - --readchar_backlog);
+      XMARKER (readcharfun)->bytepos = bytepos;
+      XMARKER (readcharfun)->charpos++;
+
+      return c;
    }
  if (EQ (readcharfun, Qget_file_char))
    {
@ -244,13 +240,14 @@ readchar (readcharfun)

  if (STRINGP (readcharfun))
    {
-      register int c;
-      /* This used to be return of a conditional expression,
-	 but that truncated -1 to a char on VMS.  */
-      if (read_from_string_index < read_from_string_limit)
-	c = XSTRING (readcharfun)->data[read_from_string_index++];
-      else
+      if (read_from_string_index >= read_from_string_limit)
 	c = -1;
+      else if (STRING_MULTIBYTE (readcharfun))
+	FETCH_STRING_CHAR_ADVANCE (c, readcharfun,
+				   read_from_string_index,
+				   read_from_string_index_byte);
+      else
+	c = XSTRING (readcharfun)->data[read_from_string_index++];

      return c;
    }
@ -276,42 +273,36 @@ unreadchar (readcharfun, c)
    ;
  else if (BUFFERP (readcharfun))
    {
-      if (!SINGLE_BYTE_CHAR_P (c))
-	readchar_backlog++;
+      struct buffer *b = XBUFFER (readcharfun);
+      int bytepos = BUF_PT_BYTE (b);
+
+      BUF_PT (b)--;
+      if (! NILP (b->enable_multibyte_characters))
+	BUF_DEC_POS (b, bytepos);
      else
-	{
-	  struct buffer *b = XBUFFER (readcharfun);
-	  int bytepos = BUF_PT_BYTE (b);
+	bytepos--;

-	  BUF_PT (b)--;
-	  if (! NILP (b->enable_multibyte_characters))
-	    BUF_DEC_POS (b, bytepos);
-	  else
-	    bytepos--;
-
-	  BUF_PT_BYTE (b) = bytepos;
-	}
+      BUF_PT_BYTE (b) = bytepos;
    }
  else if (MARKERP (readcharfun))
    {
-      if (!SINGLE_BYTE_CHAR_P (c))
-	readchar_backlog++;
+      struct buffer *b = XMARKER (readcharfun)->buffer;
+      int bytepos = XMARKER (readcharfun)->bytepos;
+
+      XMARKER (readcharfun)->charpos--;
+      if (! NILP (b->enable_multibyte_characters))
+	BUF_DEC_POS (b, bytepos);
      else
-	{
-	  struct buffer *b = XMARKER (readcharfun)->buffer;
-	  int bytepos = XMARKER (readcharfun)->bytepos;
+	bytepos--;

-	  XMARKER (readcharfun)->charpos--;
-	  if (! NILP (b->enable_multibyte_characters))
-	    BUF_DEC_POS (b, bytepos);
-	  else
-	    bytepos--;
-
-	  XMARKER (readcharfun)->bytepos = bytepos;
-	}
+      XMARKER (readcharfun)->bytepos = bytepos;
    }
  else if (STRINGP (readcharfun))
-    read_from_string_index--;
+    {
+      read_from_string_index--;
+      read_from_string_index_byte
+	= string_char_to_byte (readcharfun, read_from_string_index);
+    }
  else if (EQ (readcharfun, Qget_file_char))
    ungetc (c, instream);
  else
@ -321,7 +312,7 @@ unreadchar (readcharfun, c)
 static Lisp_Object read0 (), read1 (), read_list (), read_vector ();
 static int read_multibyte ();

-/* get a character from the tty */
+/* Get a character from the tty.  */

 extern Lisp_Object read_char ();

@ -1180,12 +1171,11 @@ START and END optionally delimit a substring of STRING from which to read;\n\
  CHECK_STRING (string,0);

  if (NILP (end))
-    endval = STRING_BYTES (XSTRING (string));
+    endval = XSTRING (string)->size;
  else
    {
      CHECK_NUMBER (end, 2);
-      endval = string_char_to_byte (string, XINT (end));
-      if (endval < 0 || endval > STRING_BYTES (XSTRING (string)))
+      if (endval < 0 || endval > XSTRING (string)->size)
 	args_out_of_range (string, end);
    }

@ -1194,21 +1184,19 @@ START and END optionally delimit a substring of STRING from which to read;\n\
  else
    {
      CHECK_NUMBER (start, 1);
-      startval = string_char_to_byte (string, XINT (start));
      if (startval < 0 || startval > endval)
 	args_out_of_range (string, start);
    }

  read_from_string_index = startval;
+  read_from_string_index_byte = string_char_to_byte (string, startval);
  read_from_string_limit = endval;

  new_backquote_flag = 0;
  read_objects = Qnil;

  tem = read0 (string);
-  endval = string_byte_to_char (string,
-				read_from_string_index);
-  return Fcons (tem, make_number (endval));
+  return Fcons (tem, make_number (read_from_string_index));
 }

 /* Use this for recursive reads, in contexts where internal tokens
@ -1744,49 +1732,45 @@ read1 (readcharfun, pch, first_in_list)
 	while ((c = READCHAR) >= 0
 	       && c != '\"')
 	  {
-	    if (p == end)
+	    if (end - p < MAX_LENGTH_OF_MULTI_BYTE_FORM)
 	      {
 		char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2);
 		p += new - read_buffer;
 		read_buffer += new - read_buffer;
 		end = read_buffer + read_buffer_size;
 	      }
+
 	    if (c == '\\')
 	      {
 		c = read_escape (readcharfun, 1);
-		if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_META)))
+
+		/* C is -1 if \ newline has just been seen */
+		if (c == -1)
 		  {
-		    unsigned char workbuf[4];
-		    unsigned char *str = workbuf;
-		    int length;
-
-		    length = non_ascii_char_to_string (c, workbuf, &str);
-		    if (length > 1)
-		      force_multibyte = 1;
-
-		    if (p + length > end)
-		      {
-			char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2);
-			p += new - read_buffer;
-			read_buffer += new - read_buffer;
-			end = read_buffer + read_buffer_size;
-		      }
- 
-		    bcopy (str, p, length);
-		    p += length;
+		    if (p == read_buffer)
+		      cancel = 1;
 		    continue;
 		  }
+
 		/* If an escape specifies a non-ASCII single-byte character,
 		   this must be a unibyte string.  */
-		else if (! ASCII_BYTE_P (c))
+		if (SINGLE_BYTE_CHAR_P ((c & ~CHAR_META))
+		    && ! ASCII_BYTE_P (c))
 		  force_singlebyte = 1;
 	      }

-	    /* c is -1 if \ newline has just been seen */
-	    if (c == -1)
+	    if (! SINGLE_BYTE_CHAR_P ((c & ~CHAR_META)))
 	      {
-		if (p == read_buffer)
-		  cancel = 1;
+		unsigned char workbuf[4];
+		unsigned char *str = workbuf;
+		int length;
+
+		length = non_ascii_char_to_string (c, workbuf, &str);
+		if (length > 1)
+		  force_multibyte = 1;
+
+		bcopy (str, p, length);
+		p += length;
 	      }
 	    else
 	      {
@ -1814,7 +1798,7 @@ read1 (readcharfun, pch, first_in_list)
 	  return make_number (0);

 	if (force_singlebyte && force_multibyte)
-	  error ("Multibyte and single-byte escapes in one string constant");
+	  error ("Multibyte and unibyte characters in one string constant");

 	if (force_singlebyte)
 	  nchars = p - read_buffer;
@ -1831,7 +1815,14 @@ read1 (readcharfun, pch, first_in_list)
 		return Fstring_make_unibyte (string);
 	      }
 	  }
+	else if (EQ (readcharfun, Qget_file_char))
+	  /* Nowadays, reading directly from a file
+	     is used only for compiled Emacs Lisp files,
+	     and those always use the Emacs internal encoding.  */
+	  nchars = multibyte_chars_in_text (read_buffer, p - read_buffer);
 	else
+	  /* In all other cases, if we read these bytes as
+	     separate characters, treat them as separate characters now.  */
 	  nchars = p - read_buffer;

 	if (read_pure)
@ -1884,7 +1875,7 @@ read1 (readcharfun, pch, first_in_list)
 		      || c == '[' || c == ']' || c == '#'
 		      ))
 	    {
-	      if (p == end)
+	      if (end - p < MAX_LENGTH_OF_MULTI_BYTE_FORM)
 		{
 		  register char *new = (char *) xrealloc (read_buffer, read_buffer_size *= 2);
 		  p += new - read_buffer;
@ -1897,7 +1888,19 @@ read1 (readcharfun, pch, first_in_list)
 		  quoted = 1;
 		}

-	      *p++ = c;
+	      if (! SINGLE_BYTE_CHAR_P (c))
+		{
+		  unsigned char workbuf[4];
+		  unsigned char *str = workbuf;
+		  int length;
+
+		  length = non_ascii_char_to_string (c, workbuf, &str);
+
+		  bcopy (str, p, length);
+		  p += length;
+		}
+	      else
+		*p++ = c;

 	      c = READCHAR;
 	    }
@ -2553,7 +2556,7 @@ init_obarray ()
  Qvariable_documentation = intern ("variable-documentation");
  staticpro (&Qvariable_documentation);

-  read_buffer_size = 100;
+  read_buffer_size = 100 + MAX_LENGTH_OF_MULTI_BYTE_FORM;
  read_buffer = (char *) malloc (read_buffer_size);
 }