Add a new, somewhat experimental "readability" command to eww

* net/eww.el (eww-readable): New command and keystroke.

* net/shr.el (shr-retransform-dom): New function.
This commit is contained in:
Lars Magne Ingebrigtsen 2014-11-03 01:01:20 +01:00
parent 816cad6e24
commit 2e8259b044
4 changed files with 86 additions and 0 deletions

View file

@ -133,6 +133,12 @@ result of the calculation into the current buffer.
*** New minor mode global-eldoc-mode
*** eldoc-documentation-function now defaults to nil
** eww
*** A new command `R' (`eww-readable') will try do identify the main
textual parts of a web page and display only that, leaving menus and
the like off the page.
** Message mode
*** text/html messages that contain inline image parts will be

View file

@ -1,5 +1,9 @@
2014-11-02 Lars Magne Ingebrigtsen <larsi@gnus.org>
* net/eww.el (eww-readable): New command and keystroke.
* net/shr.el (shr-retransform-dom): New function.
* net/eww.el (eww-display-html): Set `eww-current-source' in the
correct buffer.
(eww-view-source): Use it.

View file

@ -402,6 +402,7 @@ word(s) will be searched for via `eww-search-prefix'."
(setq-local eww-contents-url nil))
(defun eww-view-source ()
"View the HTML source code of the current page."
(interactive)
(let ((buf (get-buffer-create "*eww-source*"))
(source eww-current-source))
@ -413,6 +414,60 @@ word(s) will be searched for via `eww-search-prefix'."
(html-mode)))
(view-buffer buf)))
(defun eww-readable ()
"View the main \"readable\" parts of the current web page.
This command uses heuristics to find the parts of the web page that
contains the main textual portion, leaving out navigation menus and
the like."
(interactive)
(let* ((source eww-current-source)
(dom (shr-transform-dom
(with-temp-buffer
(insert source)
(libxml-parse-html-region (point-min) (point-max))))))
(eww-score-readability dom)
(eww-display-html 'utf-8 nil (shr-retransform-dom
(eww-highest-readability dom)))
(setq eww-current-source source)))
(defun eww-score-readability (node)
(let ((score -1))
(cond
((memq (car node) '(script head))
(setq score -2))
((eq (car node) 'meta)
(setq score -1))
((eq (car node) 'a)
(setq score (- (length (split-string
(or (cdr (assoc 'text (cdr node))) ""))))))
(t
(dolist (elem (cdr node))
(cond
((eq (car elem) 'text)
(setq score (+ score (length (split-string (cdr elem))))))
((consp (cdr elem))
(setq score (+ score
(or (cdr (assoc :eww-readability-score (cdr elem)))
(eww-score-readability elem)))))))))
;; Cache the score of the node to avoid recomputing all the time.
(setcdr node (cons (cons :eww-readability-score score) (cdr node)))
score))
(defun eww-highest-readability (node)
(let ((result node)
highest)
(dolist (elem (cdr node))
(when (and (consp (cdr elem))
(> (or (cdr (assoc
:eww-readability-score
(setq highest
(eww-highest-readability elem))))
most-negative-fixnum)
(or (cdr (assoc :eww-readability-score (cdr result)))
most-negative-fixnum)))
(setq result highest)))
result))
(defvar eww-mode-map
(let ((map (make-sparse-keymap)))
(suppress-keymap map)
@ -435,6 +490,7 @@ word(s) will be searched for via `eww-search-prefix'."
(define-key map "w" 'eww-copy-page-url)
(define-key map "C" 'url-cookie-list)
(define-key map "v" 'eww-view-source)
(define-key map "R" 'eww-readable)
(define-key map "H" 'eww-list-histories)
(define-key map "b" 'eww-add-bookmark)

View file

@ -370,6 +370,26 @@ size, and full-buffer size."
(push (shr-transform-dom sub) result)))
(nreverse result)))
(defun shr-retransform-dom (dom)
"Transform the shr DOM back into the libxml DOM."
(let ((tag (car dom))
(attributes nil)
(text nil)
(sub-nodes nil))
(dolist (elem (cdr dom))
(cond
((eq (car elem) 'text)
(setq text (cdr elem)))
((not (consp (cdr elem)))
(push (cons (intern (substring (symbol-name (car elem)) 1) obarray)
(cdr elem))
attributes))
(t
(push (shr-retransform-dom elem) sub-nodes))))
(append (list tag (nreverse attributes))
(nreverse sub-nodes)
(and text (list text)))))
(defsubst shr-generic (cont)
(dolist (sub cont)
(cond