Make eww more liberal when interpreting some invalid HTML
* lisp/net/eww.el (eww--preprocess-html): New function (bug#37009) to translate common invalid HTML like "a <= b" into "a &lt;= b", so that we are more liberal in what we accept before parsing. (eww-display-html): Use it. (eww-readable): Ditto.
This commit is contained in:
parent
49a4b86925
commit
568f1488a6
1 changed files with 14 additions and 0 deletions
|
@ -326,6 +326,18 @@ the default EWW buffer."
|
|||
#'url-hexify-string (split-string url) "+"))))))
|
||||
url)
|
||||
|
||||
(defun eww--preprocess-html (start end)
  "Translate all < characters that do not look like start of tags into &lt;.
Operate on the text between START and END in the current buffer.
A \"<\" is kept as-is when the character following it is an ASCII
letter (either case), a digit, \"!\" or \"/\"; any other \"<\" that
is followed by some character is replaced with the entity
\"&lt;\".  This lets text such as \"a <= b\" survive as text
instead of being taken for markup.
Point and the buffer restriction are preserved."
  (save-excursion
    (save-restriction
      ;; Narrowing keeps the search inside the requested region; the
      ;; end of the restriction moves along as the replacements make
      ;; the text longer.
      (narrow-to-region start end)
      (goto-char start)
      ;; Fold case so that the "a-z" range also covers upper-case tag
      ;; names such as <P> or <DIV>.
      (let ((case-fold-search t))
        (while (re-search-forward "<[^0-9a-z!/]" nil t)
          ;; Replace only the "<" itself; the character after it stays
          ;; in place and is scanned again by the next search.
          (goto-char (match-beginning 0))
          (delete-region (point) (1+ (point)))
          (insert "&lt;"))))))
|
||||
|
||||
;;;###autoload (defalias 'browse-web 'eww)
|
||||
|
||||
;;;###autoload
|
||||
|
@ -479,6 +491,7 @@ Currently this means either text/html or application/xhtml+xml."
|
|||
;; Remove CRLF and replace NUL with � before parsing.
|
||||
(while (re-search-forward "\\(\r$\\)\\|\0" nil t)
|
||||
(replace-match (if (match-beginning 1) "" "�") t t)))
|
||||
(eww--preprocess-html (point) (point-max))
|
||||
(libxml-parse-html-region (point) (point-max))))))
|
||||
(source (and (null document)
|
||||
(buffer-substring (point) (point-max)))))
|
||||
|
@ -716,6 +729,7 @@ the like."
|
|||
(condition-case nil
|
||||
(decode-coding-region (point-min) (point-max) 'utf-8)
|
||||
(coding-system-error nil))
|
||||
(eww--preprocess-html (point-min) (point-max))
|
||||
(libxml-parse-html-region (point-min) (point-max))))
|
||||
(base (plist-get eww-data :url)))
|
||||
(eww-score-readability dom)
|
||||
|
|
Loading…
Add table
Reference in a new issue