Fix Hexl handling of coding-systems with BOM
* lisp/international/mule-cmds.el (encode-coding-char): If CODING-SYSTEM produces BOM, remove the BOM bytes from the produced byte sequence. (Bug#48324) * lisp/hexl.el (hexl-mode): Use bufferpos-to-filepos to convert point to offset into the original file. (hexl-mode-exit, hexl-maybe-dehexlify-buffer): Use filepos-to-bufferpos to restore point in the original buffer. (hexl-mode, hexl-insert-multibyte-char) (hexl-self-insert-command, hexl-insert-hex-char) (hexl-insert-decimal-char, hexl-insert-octal-char) (hexl-find-file): Enhance the doc strings, mainly explaining the complications of inserting multibyte characters. (hexl-insert-multibyte-char): Don't treat CH as unibyte if the coding-system isn't ASCII-compatible. Don't treat null bytes as multibyte.
This commit is contained in:
parent
6d24a8e626
commit
f3f1947e5b
2 changed files with 120 additions and 44 deletions
136
lisp/hexl.el
136
lisp/hexl.el
|
@ -303,22 +303,30 @@ also supported.
|
|||
|
||||
There are several ways to change text in hexl mode:
|
||||
|
||||
ASCII characters (character between space (0x20) and tilde (0x7E)) are
|
||||
bound to self-insert so you can simply type the character and it will
|
||||
insert itself (actually overstrike) into the buffer.
|
||||
Self-inserting characters are bound to `hexl-self-insert-command' so you
|
||||
can simply type the character and it will insert itself (actually
|
||||
overstrike) into the buffer. However, inserting non-ASCII characters
|
||||
requires caution: the buffer's coding-system should correspond to
|
||||
the encoding on disk, and multibyte characters should be inserted
|
||||
with cursor on the first byte of a multibyte sequence whose length
|
||||
is identical to the length of the multibyte sequence to be inserted,
|
||||
otherwise this could produce invalid multibyte sequences. Non-ASCII
|
||||
characters in ISO-2022 encodings should preferably be inserted byte by
|
||||
byte, to avoid problems caused by the designation sequences before
|
||||
the actual characters.
|
||||
|
||||
\\[hexl-quoted-insert] followed by another keystroke allows you to insert the key even if
|
||||
it isn't bound to self-insert. An octal number can be supplied in place
|
||||
of another key to insert the octal number's ASCII representation.
|
||||
|
||||
\\[hexl-insert-hex-char] will insert a given hexadecimal value (if it is between 0 and 0xFF)
|
||||
into the buffer at the current point.
|
||||
\\[hexl-insert-hex-char] will insert a given hexadecimal value
|
||||
into the buffer at the current address.
|
||||
|
||||
\\[hexl-insert-octal-char] will insert a given octal value (if it is between 0 and 0377)
|
||||
into the buffer at the current point.
|
||||
\\[hexl-insert-octal-char] will insert a given octal value
|
||||
into the buffer at the current address.
|
||||
|
||||
\\[hexl-insert-decimal-char] will insert a given decimal value (if it is between 0 and 255)
|
||||
into the buffer at the current point.
|
||||
\\[hexl-insert-decimal-char] will insert a given decimal value
|
||||
into the buffer at the current address.
|
||||
|
||||
\\[hexl-mode-exit] will exit `hexl-mode'.
|
||||
|
||||
|
@ -332,26 +340,16 @@ You can use \\[hexl-find-file] to visit a file in Hexl mode.
|
|||
(unless (eq major-mode 'hexl-mode)
|
||||
(let ((modified (buffer-modified-p))
|
||||
(inhibit-read-only t)
|
||||
(original-point (- (point) (point-min))))
|
||||
(and (eobp) (not (bobp))
|
||||
(setq original-point (1- original-point)))
|
||||
(point-offset (bufferpos-to-filepos (point) 'exact)))
|
||||
;; If `hexl-mode' is invoked with an argument the buffer is assumed to
|
||||
;; be in hexl format.
|
||||
(when (memq arg '(1 nil))
|
||||
;; If the buffer's EOL type is -dos, we need to account for
|
||||
;; extra CR characters added when hexlify-buffer writes the
|
||||
;; buffer to a file.
|
||||
;; FIXME: This doesn't take into account multibyte coding systems.
|
||||
(when (eq (coding-system-eol-type buffer-file-coding-system) 1)
|
||||
(setq original-point (+ (count-lines (point-min) (point))
|
||||
original-point))
|
||||
(or (bolp) (setq original-point (1- original-point))))
|
||||
(hexlify-buffer)
|
||||
(restore-buffer-modified-p modified))
|
||||
(setq hexl-max-address
|
||||
(+ (* (/ (1- (buffer-size)) (hexl-line-displen)) 16) 15))
|
||||
(condition-case nil
|
||||
(hexl-goto-address original-point)
|
||||
(hexl-goto-address point-offset)
|
||||
(error nil)))
|
||||
|
||||
(let ((max-address hexl-max-address))
|
||||
|
@ -440,7 +438,8 @@ You can use \\[hexl-find-file] to visit a file in Hexl mode.
|
|||
(defun hexl-find-file (filename)
|
||||
"Edit file FILENAME as a binary file in hex dump format.
|
||||
Switch to a buffer visiting file FILENAME, creating one if none exists,
|
||||
and edit the file in `hexl-mode'."
|
||||
and edit the file in `hexl-mode'. The buffer's coding-system will be
|
||||
no-conversion, unlike if you visit it normally and then invoke `hexl-mode'."
|
||||
(interactive
|
||||
(list
|
||||
(let ((completion-ignored-extensions nil))
|
||||
|
@ -478,17 +477,11 @@ With arg, don't unhexlify buffer."
|
|||
(if (or (eq arg 1) (not arg))
|
||||
(let ((modified (buffer-modified-p))
|
||||
(inhibit-read-only t)
|
||||
(original-point (1+ (hexl-current-address))))
|
||||
(point-offset (hexl-current-address)))
|
||||
(dehexlify-buffer)
|
||||
(remove-hook 'write-contents-functions #'hexl-save-buffer t)
|
||||
(restore-buffer-modified-p modified)
|
||||
(goto-char original-point)
|
||||
;; Maybe adjust point for the removed CR characters.
|
||||
(when (eq (coding-system-eol-type buffer-file-coding-system) 1)
|
||||
(setq original-point (- original-point
|
||||
(count-lines (point-min) (point))))
|
||||
(or (bobp) (setq original-point (1+ original-point))))
|
||||
(goto-char original-point)))
|
||||
(goto-char (filepos-to-bufferpos point-offset 'exact))))
|
||||
|
||||
(remove-hook 'change-major-mode-hook #'hexl-maybe-dehexlify-buffer t)
|
||||
(major-mode-restore))
|
||||
|
@ -499,11 +492,11 @@ Ask the user for confirmation."
|
|||
(if (y-or-n-p "Convert contents back to binary format? ")
|
||||
(let ((modified (buffer-modified-p))
|
||||
(inhibit-read-only t)
|
||||
(original-point (1+ (hexl-current-address))))
|
||||
(point-offset (hexl-current-address)))
|
||||
(dehexlify-buffer)
|
||||
(remove-hook 'write-contents-functions #'hexl-save-buffer t)
|
||||
(restore-buffer-modified-p modified)
|
||||
(goto-char original-point))))
|
||||
(goto-char (filepos-to-bufferpos point-offset 'exact)))))
|
||||
|
||||
(defun hexl-current-address (&optional validate)
|
||||
"Return current hexl-address."
|
||||
|
@ -879,14 +872,27 @@ This discards the buffer's undo information."
|
|||
"Insert a possibly multibyte character CH NUM times.
|
||||
|
||||
Non-ASCII characters are first encoded with `buffer-file-coding-system',
|
||||
and their encoded form is inserted byte by byte."
|
||||
and their encoded form is inserted byte by byte. Note that if the
|
||||
hexl buffer was produced by `hexl-find-file', its coding-system
|
||||
is no-conversion.
|
||||
|
||||
Inserting non-ASCII characters requires caution: the buffer's
|
||||
coding-system should correspond to the encoding on disk, and
|
||||
multibyte characters should be inserted with cursor on the first
|
||||
byte of a multibyte sequence whose length is identical to the
|
||||
length of the multibyte sequence to be inserted, otherwise this
|
||||
could produce invalid multibyte sequences. Non-ASCII characters
|
||||
in ISO-2022 encodings should preferably be inserted byte by byte, to
|
||||
avoid problems caused by the designation sequences before the
|
||||
actual characters."
|
||||
(let ((charset (char-charset ch))
|
||||
(coding (if (or (null buffer-file-coding-system)
|
||||
;; coding-system-type equals t means undecided.
|
||||
(eq (coding-system-type buffer-file-coding-system) t))
|
||||
(default-value 'buffer-file-coding-system)
|
||||
buffer-file-coding-system)))
|
||||
(cond ((and (> ch 0) (< ch 256))
|
||||
(cond ((and (>= ch 0) (< ch 256)
|
||||
(coding-system-get coding :ascii-compatible-p))
|
||||
(hexl-insert-char ch num))
|
||||
((eq charset 'unknown)
|
||||
(error
|
||||
|
@ -924,7 +930,19 @@ and their encoded form is inserted byte by byte."
|
|||
Interactively, with a numeric argument, insert this character that many times.
|
||||
|
||||
Non-ASCII characters are first encoded with `buffer-file-coding-system',
|
||||
and their encoded form is inserted byte by byte."
|
||||
and their encoded form is inserted byte by byte. Note that if the
|
||||
hexl buffer was produced by `hexl-find-file', its coding-system
|
||||
is no-conversion.
|
||||
|
||||
Inserting non-ASCII characters requires caution: the buffer's
|
||||
coding-system should correspond to the encoding on disk, and
|
||||
multibyte characters should be inserted with cursor on the first
|
||||
byte of a multibyte sequence whose length is identical to the
|
||||
length of the multibyte sequence to be inserted, otherwise this
|
||||
could produce invalid multibyte sequences. Non-ASCII characters
|
||||
in ISO-2022 encodings should preferably be inserted byte by byte, to
|
||||
avoid problems caused by the designation sequences before the
|
||||
actual characters."
|
||||
(interactive "p")
|
||||
(hexl-insert-multibyte-char last-command-event arg))
|
||||
|
||||
|
@ -964,7 +982,21 @@ CH must be a unibyte character whose value is between 0 and 255."
|
|||
;; hex conversion
|
||||
|
||||
(defun hexl-insert-hex-char (arg)
|
||||
"Insert a character given by its hexadecimal code ARG times at point."
|
||||
"Insert a character given by its hexadecimal code ARG times at point.
|
||||
|
||||
Values above 0xFF are treated as multibyte characters, and first encoded
|
||||
using `buffer-file-coding-system'. Note that if the hexl buffer was
|
||||
produced by `hexl-find-file', its coding-system is no-conversion.
|
||||
|
||||
Inserting non-ASCII characters requires caution: the buffer's
|
||||
coding-system should correspond to the encoding on disk, and
|
||||
multibyte characters should be inserted with cursor on the first
|
||||
byte of a multibyte sequence whose length is identical to the
|
||||
length of the multibyte sequence to be inserted, otherwise this
|
||||
could produce invalid multibyte sequences. Non-ASCII characters
|
||||
in ISO-2022 encodings should preferably be inserted byte by byte, to
|
||||
avoid problems caused by the designation sequences before the
|
||||
actual characters."
|
||||
(interactive "p")
|
||||
(let ((num (hexl-hex-string-to-integer (read-string "Hex number: "))))
|
||||
(if (< num 0)
|
||||
|
@ -997,7 +1029,21 @@ Embedded whitespace, dashes, and periods in the string are ignored."
|
|||
(setq arg (- arg 1)))))
|
||||
|
||||
(defun hexl-insert-decimal-char (arg)
|
||||
"Insert a character given by its decimal code ARG times at point."
|
||||
"Insert a character given by its decimal code ARG times at point.
|
||||
|
||||
Values above 255 are treated as multibyte characters, and first encoded
|
||||
using `buffer-file-coding-system'. Note that if the hexl buffer was
|
||||
produced by `hexl-find-file', its coding-system is no-conversion.
|
||||
|
||||
Inserting non-ASCII characters requires caution: the buffer's
|
||||
coding-system should correspond to the encoding on disk, and
|
||||
multibyte characters should be inserted with cursor on the first
|
||||
byte of a multibyte sequence whose length is identical to the
|
||||
length of the multibyte sequence to be inserted, otherwise this
|
||||
could produce invalid multibyte sequences. Non-ASCII characters
|
||||
in ISO-2022 encodings should preferably be inserted byte by byte, to
|
||||
avoid problems caused by the designation sequences before the
|
||||
actual characters."
|
||||
(interactive "p")
|
||||
(let ((num (string-to-number (read-string "Decimal Number: "))))
|
||||
(if (< num 0)
|
||||
|
@ -1005,7 +1051,21 @@ Embedded whitespace, dashes, and periods in the string are ignored."
|
|||
(hexl-insert-multibyte-char num arg))))
|
||||
|
||||
(defun hexl-insert-octal-char (arg)
|
||||
"Insert a character given by its octal code ARG times at point."
|
||||
"Insert a character given by its octal code ARG times at point.
|
||||
|
||||
Values above \\377 are treated as multibyte characters, and first encoded
|
||||
using `buffer-file-coding-system'. Note that if the hexl buffer was
|
||||
produced by `hexl-find-file', its coding-system is no-conversion.
|
||||
|
||||
Inserting non-ASCII characters requires caution: the buffer's
|
||||
coding-system should correspond to the encoding on disk, and
|
||||
multibyte characters should be inserted with cursor on the first
|
||||
byte of a multibyte sequence whose length is identical to the
|
||||
length of the multibyte sequence to be inserted, otherwise this
|
||||
could produce invalid multibyte sequences. Non-ASCII characters
|
||||
in ISO-2022 encodings should preferably be inserted byte by byte, to
|
||||
avoid problems caused by the designation sequences before the
|
||||
actual characters."
|
||||
(interactive "p")
|
||||
(let ((num (hexl-octal-string-to-integer (read-string "Octal Number: "))))
|
||||
(if (< num 0)
|
||||
|
|
|
@ -2963,18 +2963,22 @@ STR should be a unibyte string."
|
|||
str " "))
|
||||
|
||||
(defun encode-coding-char (char coding-system &optional charset)
|
||||
"Encode CHAR by CODING-SYSTEM and return the resulting string.
|
||||
"Encode CHAR by CODING-SYSTEM and return the resulting string of bytes.
|
||||
If CODING-SYSTEM can't safely encode CHAR, return nil.
|
||||
The 3rd optional argument CHARSET, if non-nil, is a charset preferred
|
||||
on encoding."
|
||||
(let* ((str1 (string char))
|
||||
(str2 (string char char))
|
||||
(found (find-coding-systems-string str1))
|
||||
enc1 enc2 i1 i2)
|
||||
(if (eq (car-safe found) 'undecided) ;Aka (not (multibyte-string-p str1))
|
||||
;; `char' is ASCII.
|
||||
(bom-p (coding-system-get coding-system :bom))
|
||||
enc1 enc2 i0 i1 i2)
|
||||
;; If CHAR is ASCII and CODING-SYSTEM doesn't prepend a BOM, just
|
||||
;; encode CHAR.
|
||||
(if (and (eq (car-safe found) 'undecided)
|
||||
(null bom-p))
|
||||
(encode-coding-string str1 coding-system)
|
||||
(when (memq (coding-system-base coding-system) found)
|
||||
(when (or (eq (car-safe found) 'undecided)
|
||||
(memq (coding-system-base coding-system) found))
|
||||
;; We must find the encoded string of CHAR. But, just encoding
|
||||
;; CHAR will put extra control sequences (usually to designate
|
||||
;; ASCII charset) at the tail if type of CODING is ISO 2022.
|
||||
|
@ -2995,7 +2999,19 @@ on encoding."
|
|||
;; Now (substring enc1 i1) and (substring enc2 i2) are the same,
|
||||
;; and they are the extra control sequences at the tail to
|
||||
;; exclude.
|
||||
(substring enc2 0 i2)))))
|
||||
|
||||
;; We also need to exclude the leading 2 or 3 bytes if they
|
||||
;; come from a BOM.
|
||||
(setq i0
|
||||
(if bom-p
|
||||
(cond
|
||||
((eq (coding-system-type coding-system) 'utf-8)
|
||||
3)
|
||||
((eq (coding-system-type coding-system) 'utf-16)
|
||||
2)
|
||||
(t 0))
|
||||
0))
|
||||
(substring enc2 i0 i2)))))
|
||||
|
||||
;; Backwards compatibility. These might be better with :init-value t,
|
||||
;; but that breaks loadup.
|
||||
|
|
Loading…
Add table
Reference in a new issue