Merge remote-tracking branch 'origin/master' into feature/android

2023-03-11 19:08:24 +08:00 · 2023-03-11 19:08:24 +08:00 · 26640b7a4f
commit 26640b7a4f
parent a697ca5562 b8e7061232
2 changed files with 174 additions and 147 deletions
--- a/src/lread.c
+++ b/src/lread.c
@ -2890,154 +2890,137 @@ character_name_to_code (char const *name, ptrdiff_t name_len,
   Unicode 9.0.0 the maximum is 83, so this should be safe.  */
 enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };

-/* Read a \-escape sequence, assuming we already read the `\'.
-   If the escape sequence forces unibyte, return eight-bit char.  */
-
-static int
-read_escape (Lisp_Object readcharfun)
+static AVOID
+invalid_escape_syntax_error (void)
 {
-  int c = READCHAR;
-  /* \u allows up to four hex digits, \U up to eight.  Default to the
-     behavior for \u, and change this value in the case that \U is seen.  */
-  int unicode_hex_count = 4;
+  error ("Invalid escape character syntax");
+}
+
+/* Read a character escape sequence, assuming we just read a backslash
+   and one more character (next_char).  */
+static int
+read_char_escape (Lisp_Object readcharfun, int next_char)
+{
+  int modifiers = 0;
+  ptrdiff_t ncontrol = 0;
+  int chr;
+
+ again: ;
+  int c = next_char;
+  int unicode_hex_count;
+  int mod;

  switch (c)
    {
    case -1:
      end_of_file_error ();

-    case 'a':
-      return '\007';
-    case 'b':
-      return '\b';
-    case 'd':
-      return 0177;
-    case 'e':
-      return 033;
-    case 'f':
-      return '\f';
-    case 'n':
-      return '\n';
-    case 'r':
-      return '\r';
-    case 't':
-      return '\t';
-    case 'v':
-      return '\v';
+    case 'a': chr = '\a'; break;
+    case 'b': chr = '\b'; break;
+    case 'd': chr =  127; break;
+    case 'e': chr =   27; break;
+    case 'f': chr = '\f'; break;
+    case 'n': chr = '\n'; break;
+    case 'r': chr = '\r'; break;
+    case 't': chr = '\t'; break;
+    case 'v': chr = '\v'; break;

    case '\n':
      /* ?\LF is an error; it's probably a user mistake.  */
      error ("Invalid escape character syntax");

-    case 'M':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | meta_modifier;
+    /* \M-x etc: set modifier bit and parse the char to which it applies,
+       allowing for chains such as \M-\S-\A-\H-\s-\C-q.  */
+    case 'M': mod = meta_modifier;  goto mod_key;
+    case 'S': mod = shift_modifier; goto mod_key;
+    case 'H': mod = hyper_modifier; goto mod_key;
+    case 'A': mod = alt_modifier;   goto mod_key;
+    case 's': mod = super_modifier; goto mod_key;

-    case 'S':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | shift_modifier;
-
-    case 'H':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | hyper_modifier;
-
-    case 'A':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | alt_modifier;
-
-    case 's':
-      c = READCHAR;
-      if (c != '-')
-	{
-	  UNREAD (c);
-	  return ' ';
-	}
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      return c | super_modifier;
-
-    case 'C':
-      c = READCHAR;
-      if (c != '-')
-	error ("Invalid escape character syntax");
-      FALLTHROUGH;
-    case '^':
-      c = READCHAR;
-      if (c == '\\')
-	c = read_escape (readcharfun);
-      if ((c & ~CHAR_MODIFIER_MASK) == '?')
-	return 0177 | (c & CHAR_MODIFIER_MASK);
-      else if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
-	return c | ctrl_modifier;
-      /* ASCII control chars are made from letters (both cases),
-	 as well as the non-letters within 0100...0137.  */
-      else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
-	return (c & (037 | ~0177));
-      else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
-	return (c & (037 | ~0177));
-      else
-	return c | ctrl_modifier;
-
-    case '0':
-    case '1':
-    case '2':
-    case '3':
-    case '4':
-    case '5':
-    case '6':
-    case '7':
-      /* An octal escape, as in ANSI C.  */
+    mod_key:
      {
-	register int i = c - '0';
-	register int count = 0;
-	while (++count < 3)
+	int c1 = READCHAR;
+	if (c1 != '-')
 	  {
-	    if ((c = READCHAR) >= '0' && c <= '7')
+	    if (c == 's')
 	      {
-		i *= 8;
-		i += c - '0';
+		/* \s not followed by a hyphen is SPC.  */
+		UNREAD (c1);
+		chr = ' ';
+		break;
 	      }
 	    else
+	      /* \M, \S, \H, \A not followed by a hyphen is an error.  */
+	      invalid_escape_syntax_error ();
+	  }
+	modifiers |= mod;
+	c1 = READCHAR;
+	if (c1 == '\\')
+	  {
+	    next_char = READCHAR;
+	    goto again;
+	  }
+	chr = c1;
+	break;
+      }
+
+    /* Control modifiers (\C-x or \^x) are messy and not actually idempotent.
+       For example, ?\C-\C-a = ?\C-\001 = 0x4000001.
+       Keep a count of them and apply them separately.  */
+    case 'C':
+      {
+	int c1 = READCHAR;
+	if (c1 != '-')
+	  invalid_escape_syntax_error ();
+      }
+      FALLTHROUGH;
+    /* The prefixes \C- and \^ are equivalent.  */
+    case '^':
+      {
+	ncontrol++;
+	int c1 = READCHAR;
+	if (c1 == '\\')
+	  {
+	    next_char = READCHAR;
+	    goto again;
+	  }
+	chr = c1;
+	break;
+      }
+
+    /* 1-3 octal digits.  Values in 0x80..0xff are encoded as raw bytes.  */
+    case '0': case '1': case '2': case '3':
+    case '4': case '5': case '6': case '7':
+      {
+	int i = c - '0';
+	int count = 0;
+	while (count < 2)
+	  {
+	    int c = READCHAR;
+	    if (c < '0' || c > '7')
 	      {
 		UNREAD (c);
 		break;
 	      }
+	    i = (i << 3) + (c - '0');
+	    count++;
 	  }

 	if (i >= 0x80 && i < 0x100)
 	  i = BYTE8_TO_CHAR (i);
-	return i;
+	chr = i;
+	break;
      }

+    /* 1 or more hex digits.  Values may encode modifiers.
+       Values in 0x80..0xff using 2 hex digits are encoded as raw bytes.  */
    case 'x':
-      /* A hex escape, as in ANSI C.  */
      {
 	unsigned int i = 0;
 	int count = 0;
 	while (1)
 	  {
-	    c = READCHAR;
+	    int c = READCHAR;
 	    int digit = char_hexdigit (c);
 	    if (digit < 0)
 	      {
@ -3047,40 +3030,37 @@ read_escape (Lisp_Object readcharfun)
 	    i = (i << 4) + digit;
 	    /* Allow hex escapes as large as ?\xfffffff, because some
 	       packages use them to denote characters with modifiers.  */
-	    if ((CHAR_META | (CHAR_META - 1)) < i)
+	    if (i > (CHAR_META | (CHAR_META - 1)))
 	      error ("Hex character out of range: \\x%x...", i);
 	    count += count < 3;
 	  }

+	if (count == 0)
+	  invalid_escape_syntax_error ();
 	if (count < 3 && i >= 0x80)
-	  return BYTE8_TO_CHAR (i);
-	return i;
+	  i = BYTE8_TO_CHAR (i);
+	modifiers |= i & CHAR_MODIFIER_MASK;
+	chr = i & ~CHAR_MODIFIER_MASK;
+	break;
      }

+    /* 8-digit Unicode hex escape: \UHHHHHHHH */
    case 'U':
-      /* Post-Unicode-2.0: Up to eight hex chars.  */
      unicode_hex_count = 8;
-      FALLTHROUGH;
-    case 'u':
+      goto unicode_hex;

-      /* A Unicode escape.  We only permit them in strings and characters,
-	 not arbitrarily in the source code, as in some other languages.  */
+    /* 4-digit Unicode hex escape: \uHHHH */
+    case 'u':
+      unicode_hex_count = 4;
+    unicode_hex:
      {
 	unsigned int i = 0;
-	int count = 0;
-
-	while (++count <= unicode_hex_count)
+	for (int count = 0; count < unicode_hex_count; count++)
 	  {
-	    c = READCHAR;
+	    int c = READCHAR;
 	    if (c < 0)
-	      {
-		if (unicode_hex_count > 4)
-		  error ("Malformed Unicode escape: \\U%x", i);
-		else
-		  error ("Malformed Unicode escape: \\u%x", i);
-	      }
-	    /* `isdigit' and `isalpha' may be locale-specific, which we don't
-	       want.  */
+	      error ("Malformed Unicode escape: \\%c%x",
+		     unicode_hex_count == 4 ? 'u' : 'U', i);
 	    int digit = char_hexdigit (c);
 	    if (digit < 0)
 	      error ("Non-hex character used for Unicode escape: %c (%d)",
@ -3089,13 +3069,14 @@ read_escape (Lisp_Object readcharfun)
 	  }
 	if (i > 0x10FFFF)
 	  error ("Non-Unicode character: 0x%x", i);
-	return i;
+	chr = i;
+	break;
      }

+    /* Named character: \N{name} */
    case 'N':
-      /* Named character.  */
      {
-        c = READCHAR;
+        int c = READCHAR;
        if (c != '{')
          invalid_syntax ("Expected opening brace after \\N", readcharfun);
        char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1];
@ -3103,12 +3084,12 @@ read_escape (Lisp_Object readcharfun)
        ptrdiff_t length = 0;
        while (true)
          {
-            c = READCHAR;
+            int c = READCHAR;
            if (c < 0)
              end_of_file_error ();
            if (c == '}')
              break;
-            if (! (0 < c && c < 0x80))
+            if (c >= 0x80)
              {
                AUTO_STRING (format,
                             "Invalid character U+%04X in character name");
@ -3137,13 +3118,41 @@ read_escape (Lisp_Object readcharfun)
 	name[length] = '\0';

 	/* character_name_to_code can invoke read0, recursively.
-	   This is why read0's buffer is not static.  */
-	return character_name_to_code (name, length, readcharfun);
+	   This is why read0 needs to be re-entrant.  */
+	chr = character_name_to_code (name, length, readcharfun);
+	break;
      }

    default:
-      return c;
+      chr = c;
+      break;
    }
+  eassert (chr >= 0 && chr < (1 << CHARACTERBITS));
+
+  /* Apply Control modifiers, using the rules:
+     \C-X = ascii_ctrl(nomod(X)) | mods(X)  if nomod(X) is one of:
+                                                A-Z a-z ? @ [ \ ] ^ _
+
+            X | ctrl_modifier               otherwise
+
+     where
+         nomod(c) = c without modifiers
+	 mods(c)  = the modifiers of c
+         ascii_ctrl(c) = 127       if c = '?'
+                         c & 0x1f  otherwise
+  */
+  while (ncontrol > 0)
+    {
+      if ((chr >= '@' && chr <= '_') || (chr >= 'a' && chr <= 'z'))
+	chr &= 0x1f;
+      else if (chr == '?')
+	chr = 127;
+      else
+	modifiers |= ctrl_modifier;
+      ncontrol--;
+    }
+
+  return chr | modifiers;
 }

 /* Return the digit that CHARACTER stands for in the given BASE.
@ -3265,7 +3274,7 @@ read_char_literal (Lisp_Object readcharfun)
    }

  if (ch == '\\')
-    ch = read_escape (readcharfun);
+    ch = read_char_escape (readcharfun, READCHAR);

  int modifiers = ch & CHAR_MODIFIER_MASK;
  ch &= ~CHAR_MODIFIER_MASK;
@ -3331,8 +3340,7 @@ read_string_literal (Lisp_Object readcharfun)
 	      /* `\SPC' and `\LF' generate no characters at all.  */
 	      continue;
 	    default:
-	      UNREAD (ch);
-	      ch = read_escape (readcharfun);
+	      ch = read_char_escape (readcharfun, ch);
 	      break;
 	    }

--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@ -116,8 +116,27 @@
  (should-error (read "#") :type 'invalid-read-syntax))

 (ert-deftest lread-char-modifiers ()
-  (should (eq ?\C-\M-é (+ (- ?\M-a ?a) ?\C-é)))
-  (should (eq (- ?\C-ŗ ?ŗ) (- ?\C-é ?é))))
+  (should (equal ?\C-\M-é (+ (- ?\M-a ?a) ?\C-é)))
+  (should (equal (- ?\C-ŗ ?ŗ) (- ?\C-é ?é)))
+  (should (equal ?\C-\C-c #x4000003))
+  (should (equal ?\C-\M-\C-c #xc000003))
+  (should (equal ?\M-\C-\C-c #xc000003))
+  (should (equal ?\C-\C-\M-c #xc000003))
+  (should (equal ?\M-\S-\H-\A-\C-\s-x #xbc00018))
+
+  (should (equal "\s-x" " -x"))
+  (should (equal "\C-x" "\x18"))
+  (should (equal "\^x" "\x18"))
+  (should (equal "\M-x" "\xf8")))
+
+(ert-deftest lread-many-modifiers ()
+  ;; The string literal "\M-\M-...\M-a" should be equivalent to "\M-a",
+  ;; and we should not run out of stack space parsing it.
+  (let* ((n 500000)
+         (s (concat "\""
+                    (apply #'concat (make-list n "\\M-"))
+                    "a\"")))
+    (should (equal (read-from-string s) (cons "\M-a" (+ (* n 3) 3))))))

 (ert-deftest lread-record-1 ()
  (should (equal '(#s(foo) #s(foo))