Fix Hexl handling of coding-systems with BOM

* lisp/international/mule-cmds.el (encode-coding-char): If
CODING-SYSTEM produces BOM, remove the BOM bytes from the produced
byte sequence.  (Bug#48324)

* lisp/hexl.el (hexl-mode): Use bufferpos-to-filepos to convert
point to offset into the original file.
(hexl-mode-exit, hexl-maybe-dehexlify-buffer): Use
filepos-to-bufferpos to restore point in the original buffer.
(hexl-mode, hexl-insert-multibyte-char)
(hexl-self-insert-command, hexl-insert-hex-char)
(hexl-insert-decimal-char, hexl-insert-octal-char)
(hexl-find-file): Enhance the doc strings, mainly explaining the
complications of inserting multibyte characters.
(hexl-insert-multibyte-char): Don't treat CH as unibyte if the
coding-system isn't ASCII-compatible.  Don't treat null bytes as
multibyte.
This commit is contained in:
Eli Zaretskii 2021-05-11 14:55:29 +03:00
parent 6d24a8e626
commit f3f1947e5b
2 changed files with 120 additions and 44 deletions

View file

@ -303,22 +303,30 @@ also supported.
There are several ways to change text in hexl mode:
ASCII characters (character between space (0x20) and tilde (0x7E)) are
bound to self-insert so you can simply type the character and it will
insert itself (actually overstrike) into the buffer.
Self-inserting characters are bound to `hexl-self-insert-command' so you
can simply type the character and it will insert itself (actually
overstrike) into the buffer. However, inserting non-ASCII characters
requires caution: the buffer's coding-system should correspond to
the encoding on disk, and multibyte characters should be inserted
with cursor on the first byte of a multibyte sequence whose length
is identical to the length of the multibyte sequence to be inserted,
otherwise this could produce invalid multibyte sequences. Non-ASCII
characters in ISO-2022 encodings should preferably be inserted byte by
byte, to avoid problems caused by the designation sequences before
the actual characters.
\\[hexl-quoted-insert] followed by another keystroke allows you to insert the key even if
it isn't bound to self-insert. An octal number can be supplied in place
of another key to insert the octal number's ASCII representation.
\\[hexl-insert-hex-char] will insert a given hexadecimal value (if it is between 0 and 0xFF)
into the buffer at the current point.
\\[hexl-insert-hex-char] will insert a given hexadecimal value
into the buffer at the current address.
\\[hexl-insert-octal-char] will insert a given octal value (if it is between 0 and 0377)
into the buffer at the current point.
\\[hexl-insert-octal-char] will insert a given octal value
into the buffer at the current address.
\\[hexl-insert-decimal-char] will insert a given decimal value (if it is between 0 and 255)
into the buffer at the current point.
\\[hexl-insert-decimal-char] will insert a given decimal value
into the buffer at the current address.
\\[hexl-mode-exit] will exit `hexl-mode'.
@ -332,26 +340,16 @@ You can use \\[hexl-find-file] to visit a file in Hexl mode.
(unless (eq major-mode 'hexl-mode)
(let ((modified (buffer-modified-p))
(inhibit-read-only t)
(original-point (- (point) (point-min))))
(and (eobp) (not (bobp))
(setq original-point (1- original-point)))
(point-offset (bufferpos-to-filepos (point) 'exact)))
;; If `hexl-mode' is invoked with an argument the buffer is assumed to
;; be in hexl format.
(when (memq arg '(1 nil))
;; If the buffer's EOL type is -dos, we need to account for
;; extra CR characters added when hexlify-buffer writes the
;; buffer to a file.
;; FIXME: This doesn't take into account multibyte coding systems.
(when (eq (coding-system-eol-type buffer-file-coding-system) 1)
(setq original-point (+ (count-lines (point-min) (point))
original-point))
(or (bolp) (setq original-point (1- original-point))))
(hexlify-buffer)
(restore-buffer-modified-p modified))
(setq hexl-max-address
(+ (* (/ (1- (buffer-size)) (hexl-line-displen)) 16) 15))
(condition-case nil
(hexl-goto-address original-point)
(hexl-goto-address point-offset)
(error nil)))
(let ((max-address hexl-max-address))
@ -440,7 +438,8 @@ You can use \\[hexl-find-file] to visit a file in Hexl mode.
(defun hexl-find-file (filename)
"Edit file FILENAME as a binary file in hex dump format.
Switch to a buffer visiting file FILENAME, creating one if none exists,
and edit the file in `hexl-mode'."
and edit the file in `hexl-mode'. The buffer's coding-system will be
no-conversion, unlike if you visit it normally and then invoke `hexl-mode'."
(interactive
(list
(let ((completion-ignored-extensions nil))
@ -478,17 +477,11 @@ With arg, don't unhexlify buffer."
(if (or (eq arg 1) (not arg))
(let ((modified (buffer-modified-p))
(inhibit-read-only t)
(original-point (1+ (hexl-current-address))))
(point-offset (hexl-current-address)))
(dehexlify-buffer)
(remove-hook 'write-contents-functions #'hexl-save-buffer t)
(restore-buffer-modified-p modified)
(goto-char original-point)
;; Maybe adjust point for the removed CR characters.
(when (eq (coding-system-eol-type buffer-file-coding-system) 1)
(setq original-point (- original-point
(count-lines (point-min) (point))))
(or (bobp) (setq original-point (1+ original-point))))
(goto-char original-point)))
(goto-char (filepos-to-bufferpos point-offset 'exact))))
(remove-hook 'change-major-mode-hook #'hexl-maybe-dehexlify-buffer t)
(major-mode-restore))
@ -499,11 +492,11 @@ Ask the user for confirmation."
(if (y-or-n-p "Convert contents back to binary format? ")
(let ((modified (buffer-modified-p))
(inhibit-read-only t)
(original-point (1+ (hexl-current-address))))
(point-offset (hexl-current-address)))
(dehexlify-buffer)
(remove-hook 'write-contents-functions #'hexl-save-buffer t)
(restore-buffer-modified-p modified)
(goto-char original-point))))
(goto-char (filepos-to-bufferpos point-offset 'exact)))))
(defun hexl-current-address (&optional validate)
"Return current hexl-address."
@ -879,14 +872,27 @@ This discards the buffer's undo information."
"Insert a possibly multibyte character CH NUM times.
Non-ASCII characters are first encoded with `buffer-file-coding-system',
and their encoded form is inserted byte by byte."
and their encoded form is inserted byte by byte. Note that if the
hexl buffer was produced by `hexl-find-file', its coding-system
is no-conversion.
Inserting non-ASCII characters requires caution: the buffer's
coding-system should correspond to the encoding on disk, and
multibyte characters should be inserted with cursor on the first
byte of a multibyte sequence whose length is identical to the
length of the multibyte sequence to be inserted, otherwise this
could produce invalid multibyte sequences. Non-ASCII characters
in ISO-2022 encodings should preferably be inserted byte by byte, to
avoid problems caused by the designation sequences before the
actual characters."
(let ((charset (char-charset ch))
(coding (if (or (null buffer-file-coding-system)
;; coding-system-type equals t means undecided.
(eq (coding-system-type buffer-file-coding-system) t))
(default-value 'buffer-file-coding-system)
buffer-file-coding-system)))
(cond ((and (> ch 0) (< ch 256))
(cond ((and (>= ch 0) (< ch 256)
(coding-system-get coding :ascii-compatible-p))
(hexl-insert-char ch num))
((eq charset 'unknown)
(error
@ -924,7 +930,19 @@ and their encoded form is inserted byte by byte."
Interactively, with a numeric argument, insert this character that many times.
Non-ASCII characters are first encoded with `buffer-file-coding-system',
and their encoded form is inserted byte by byte."
and their encoded form is inserted byte by byte. Note that if the
hexl buffer was produced by `hexl-find-file', its coding-system
is no-conversion.
Inserting non-ASCII characters requires caution: the buffer's
coding-system should correspond to the encoding on disk, and
multibyte characters should be inserted with cursor on the first
byte of a multibyte sequence whose length is identical to the
length of the multibyte sequence to be inserted, otherwise this
could produce invalid multibyte sequences. Non-ASCII characters
in ISO-2022 encodings should preferably be inserted byte by byte, to
avoid problems caused by the designation sequences before the
actual characters."
(interactive "p")
(hexl-insert-multibyte-char last-command-event arg))
@ -964,7 +982,21 @@ CH must be a unibyte character whose value is between 0 and 255."
;; hex conversion
(defun hexl-insert-hex-char (arg)
"Insert a character given by its hexadecimal code ARG times at point."
"Insert a character given by its hexadecimal code ARG times at point.
Values above 0xFF are treated as multibyte characters, and first encoded
using `buffer-file-coding-system'. Note that if the hexl buffer was
produced by `hexl-find-file', its coding-system is no-conversion.
Inserting non-ASCII characters requires caution: the buffer's
coding-system should correspond to the encoding on disk, and
multibyte characters should be inserted with cursor on the first
byte of a multibyte sequence whose length is identical to the
length of the multibyte sequence to be inserted, otherwise this
could produce invalid multibyte sequences. Non-ASCII characters
in ISO-2022 encodings should preferably be inserted byte by byte, to
avoid problems caused by the designation sequences before the
actual characters."
(interactive "p")
(let ((num (hexl-hex-string-to-integer (read-string "Hex number: "))))
(if (< num 0)
@ -997,7 +1029,21 @@ Embedded whitespace, dashes, and periods in the string are ignored."
(setq arg (- arg 1)))))
(defun hexl-insert-decimal-char (arg)
"Insert a character given by its decimal code ARG times at point."
"Insert a character given by its decimal code ARG times at point.
Values above 255 are treated as multibyte characters, and first encoded
using `buffer-file-coding-system'. Note that if the hexl buffer was
produced by `hexl-find-file', its coding-system is no-conversion.
Inserting non-ASCII characters requires caution: the buffer's
coding-system should correspond to the encoding on disk, and
multibyte characters should be inserted with cursor on the first
byte of a multibyte sequence whose length is identical to the
length of the multibyte sequence to be inserted, otherwise this
could produce invalid multibyte sequences. Non-ASCII characters
in ISO-2022 encodings should preferably be inserted byte by byte, to
avoid problems caused by the designation sequences before the
actual characters."
(interactive "p")
(let ((num (string-to-number (read-string "Decimal Number: "))))
(if (< num 0)
@ -1005,7 +1051,21 @@ Embedded whitespace, dashes, and periods in the string are ignored."
(hexl-insert-multibyte-char num arg))))
(defun hexl-insert-octal-char (arg)
"Insert a character given by its octal code ARG times at point."
"Insert a character given by its octal code ARG times at point.
Values above 0377 are treated as multibyte characters, and first encoded
using `buffer-file-coding-system'. Note that if the hexl buffer was
produced by `hexl-find-file', its coding-system is no-conversion.
Inserting non-ASCII characters requires caution: the buffer's
coding-system should correspond to the encoding on disk, and
multibyte characters should be inserted with cursor on the first
byte of a multibyte sequence whose length is identical to the
length of the multibyte sequence to be inserted, otherwise this
could produce invalid multibyte sequences. Non-ASCII characters
in ISO-2022 encodings should preferably be inserted byte by byte, to
avoid problems caused by the designation sequences before the
actual characters."
(interactive "p")
(let ((num (hexl-octal-string-to-integer (read-string "Octal Number: "))))
(if (< num 0)

View file

@ -2963,18 +2963,22 @@ STR should be a unibyte string."
str " "))
(defun encode-coding-char (char coding-system &optional charset)
"Encode CHAR by CODING-SYSTEM and return the resulting string.
"Encode CHAR by CODING-SYSTEM and return the resulting string of bytes.
If CODING-SYSTEM can't safely encode CHAR, return nil.
The 3rd optional argument CHARSET, if non-nil, is a charset preferred
on encoding."
(let* ((str1 (string char))
(str2 (string char char))
(found (find-coding-systems-string str1))
enc1 enc2 i1 i2)
(if (eq (car-safe found) 'undecided) ;Aka (not (multibyte-string-p str1))
;; `char' is ASCII.
(bom-p (coding-system-get coding-system :bom))
enc1 enc2 i0 i1 i2)
;; If CHAR is ASCII and CODING-SYSTEM doesn't prepend a BOM, just
;; encode CHAR.
(if (and (eq (car-safe found) 'undecided)
(null bom-p))
(encode-coding-string str1 coding-system)
(when (memq (coding-system-base coding-system) found)
(when (or (eq (car-safe found) 'undecided)
(memq (coding-system-base coding-system) found))
;; We must find the encoded string of CHAR. But, just encoding
;; CHAR will put extra control sequences (usually to designate
;; ASCII charset) at the tail if type of CODING is ISO 2022.
@ -2995,7 +2999,19 @@ on encoding."
;; Now (substring enc1 i1) and (substring enc2 i2) are the same,
;; and they are the extra control sequences at the tail to
;; exclude.
(substring enc2 0 i2)))))
;; We also need to exclude the leading 2 or 3 bytes if they
;; come from a BOM.
(setq i0
(if bom-p
(cond
((eq (coding-system-type coding-system) 'utf-8)
3)
((eq (coding-system-type coding-system) 'utf-16)
2)
(t 0))
0))
(substring enc2 i0 i2)))))
;; Backwards compatibility. These might be better with :init-value t,
;; but that breaks loadup.