* lisp/xml.el: Handle entity and character reference expansion correctly.

(xml-default-ns): New variable.
(xml-entity-alist): Use XML spec definitions for lt and amp.
(xml-parse-region): Make first two arguments optional.  Discard
text properties.
(xml-parse-tag-1): New function, spun off from xml-parse-tag.  All
callers changed.
(xml-parse-tag): Call xml-parse-tag-1.  For backward
compatibility, this function should not modify buffer contents.
(xml-parse-tag-1): Fix opening-tag regexp.
(xml-parse-string): Rewrite, handling entity and character
references properly.
(xml--entity-replacement-text): Signal an error if a parameter
entity is undefined.

* test/automated/xml-parse-tests.el (xml-parse-tests--data): More
testcases.
This commit is contained in:
Chong Yidong 2012-07-03 00:21:54 +08:00
parent 2b5208f181
commit a7aef6f5c6
4 changed files with 220 additions and 190 deletions

View file

@ -1,3 +1,21 @@
2012-07-02 Chong Yidong <cyd@gnu.org>
* xml.el: Fix entity and character reference expansion, allowing
them to expand into markup as per XML spec.
(xml-default-ns): New variable.
(xml-entity-alist): Use XML spec definitions for lt and amp.
(xml-parse-region): Make first two arguments optional. Discard
text properties.
(xml-parse-tag-1): New function, spun off from xml-parse-tag. All
callers changed.
(xml-parse-tag): Call xml-parse-tag-1. For backward
compatibility, this function should not modify buffer contents.
(xml-parse-tag-1): Fix opening-tag regexp.
(xml-parse-string): Rewrite, handling entity and character
references properly.
(xml--entity-replacement-text): Signal an error if a parameter
entity is undefined.
2012-07-02 Stefan Monnier <monnier@iro.umontreal.ca>
* comint.el (comint-output-filter): Filter out repeated prompts.

View file

@ -80,22 +80,23 @@
;; a worthwhile tradeoff especially since we're usually parsing files
;; instead of hand-crafted XML.
;;*******************************************************************
;;**
;;** Macros to parse the list
;;**
;;*******************************************************************
;;; Macros to parse the list
(defconst xml-undefined-entity "?"
"What to substitute for undefined entities")
(defconst xml-default-ns '(("" . "")
("xml" . "http://www.w3.org/XML/1998/namespace")
("xmlns" . "http://www.w3.org/2000/xmlns/"))
"Alist mapping default XML namespaces to their URIs.")
(defvar xml-entity-alist
'(("lt" . "<")
'(("lt" . "&#60;")
("gt" . ">")
("apos" . "'")
("quot" . "\"")
("amp" . "&"))
"Alist of defined XML entities.")
("amp" . "&#38;"))
"Alist mapping XML entities to their replacement text.")
(defvar xml-parameter-entity-alist nil
"Alist of defined XML parametric entities.")
@ -156,11 +157,7 @@ An empty string is returned if the attribute was not found.
See also `xml-get-attribute-or-nil'."
(or (xml-get-attribute-or-nil node attribute) ""))
;;*******************************************************************
;;**
;;** Creating the list
;;**
;;*******************************************************************
;;; Creating the list
;;;###autoload
(defun xml-parse-file (file &optional parse-dtd parse-ns)
@ -299,8 +296,10 @@ If PARSE-NS is non-nil, then QNAMES are expanded."
;; prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
;;;###autoload
(defun xml-parse-region (beg end &optional buffer parse-dtd parse-ns)
(defun xml-parse-region (&optional beg end buffer parse-dtd parse-ns)
"Parse the region from BEG to END in BUFFER.
If BEG is nil, it defaults to `point-min'.
If END is nil, it defaults to `point-max'.
If BUFFER is nil, it defaults to the current buffer.
Returns the XML list for the region, or raises an error if the region
is not well-formed XML.
@ -312,7 +311,7 @@ If PARSE-NS is non-nil, then QNAMES are expanded."
(unless buffer
(setq buffer (current-buffer)))
(with-temp-buffer
(insert-buffer-substring buffer beg end)
(insert-buffer-substring-no-properties buffer beg end)
(xml--parse-buffer parse-dtd parse-ns)))
(defun xml--parse-buffer (parse-dtd parse-ns)
@ -327,7 +326,7 @@ If PARSE-NS is non-nil, then QNAMES are expanded."
(if (search-forward "<" nil t)
(progn
(forward-char -1)
(setq result (xml-parse-tag parse-dtd parse-ns))
(setq result (xml-parse-tag-1 parse-dtd parse-ns))
(cond
((null result)
;; Not looking at an xml start tag.
@ -379,8 +378,7 @@ specify that the name shouldn't be given a namespace."
(xml-parameter-entity-alist xml-parameter-entity-alist)
children)
(while (not (eobp))
(let ((bit (xml-parse-tag
parse-dtd parse-ns)))
(let ((bit (xml-parse-tag-1 parse-dtd parse-ns)))
(if children
(setq children (append (list bit) children))
(if (stringp bit)
@ -392,30 +390,32 @@ specify that the name shouldn't be given a namespace."
"Parse the tag at point.
If PARSE-DTD is non-nil, the DTD of the document, if any, is parsed and
returned as the first element in the list.
If PARSE-NS is non-nil, then QNAMES are expanded.
Returns one of:
If PARSE-NS is non-nil, expand QNAMES; if the value of PARSE-NS
is a list, use it as an alist mapping namespaces to URIs.
Return one of:
- a list : the matching node
- nil : the point is not looking at a tag.
- a pair : the first element is the DTD, the second is the node."
(let ((buf (current-buffer))
(pos (point)))
(with-temp-buffer
(insert-buffer-substring-no-properties buf pos)
(goto-char (point-min))
(xml-parse-tag-1 parse-dtd parse-ns))))
(defun xml-parse-tag-1 (&optional parse-dtd parse-ns)
"Like `xml-parse-tag', but possibly modify the buffer while working."
(let ((xml-validating-parser (or parse-dtd xml-validating-parser))
(xml-ns (if (consp parse-ns)
parse-ns
(if parse-ns
(list
;; Default for empty prefix is no namespace
(cons "" "")
;; "xml" namespace
(cons "xml" "http://www.w3.org/XML/1998/namespace")
;; We need to seed the xmlns namespace
(cons "xmlns" "http://www.w3.org/2000/xmlns/"))))))
(xml-ns (cond ((consp parse-ns) parse-ns)
(parse-ns xml-default-ns))))
(cond
;; Processing instructions (like the <?xml version="1.0"?> tag at the
;; beginning of a document).
;; Processing instructions, like <?xml version="1.0"?>.
((looking-at "<\\?")
(search-forward "?>")
(skip-syntax-forward " ")
(xml-parse-tag parse-dtd xml-ns))
;; Character data (CDATA) sections, in which no tag should be interpreted
(xml-parse-tag-1 parse-dtd xml-ns))
;; Character data (CDATA) sections, in which no tag should be interpreted
((looking-at "<!\\[CDATA\\[")
(let ((pos (match-end 0)))
(unless (search-forward "]]>" nil t)
@ -423,33 +423,32 @@ Returns one of:
(concat
(buffer-substring-no-properties pos (match-beginning 0))
(xml-parse-string))))
;; DTD for the document
;; DTD for the document
((looking-at "<!DOCTYPE[ \t\n\r]")
(let ((dtd (xml-parse-dtd parse-ns)))
(skip-syntax-forward " ")
(if xml-validating-parser
(cons dtd (xml-parse-tag nil xml-ns))
(xml-parse-tag nil xml-ns))))
;; skip comments
(cons dtd (xml-parse-tag-1 nil xml-ns))
(xml-parse-tag-1 nil xml-ns))))
;; skip comments
((looking-at "<!--")
(search-forward "-->")
;; FIXME: This loses the skipped-over spaces.
(skip-syntax-forward " ")
(unless (eobp)
(let ((xml-sub-parser t))
(xml-parse-tag parse-dtd xml-ns))))
;; end tag
(xml-parse-tag-1 parse-dtd xml-ns))))
;; end tag
((looking-at "</")
'())
;; opening tag
((looking-at "<\\([^/>[:space:]]+\\)")
;; opening tag
((looking-at (eval-when-compile (concat "<\\(" xml-name-re "\\)")))
(goto-char (match-end 1))
;; Parse this node
(let* ((node-name (match-string-no-properties 1))
;; Parse the attribute list.
(attrs (xml-parse-attlist xml-ns))
children)
;; add the xmlns:* attrs to our cache
(when (consp xml-ns)
(dolist (attr attrs)
@ -458,70 +457,114 @@ Returns one of:
(caar attr)))
(push (cons (cdar attr) (cdr attr))
xml-ns))))
(setq children (list attrs (xml-maybe-do-ns node-name "" xml-ns)))
(cond
;; is this an empty element ?
((looking-at "/>")
(forward-char 2)
(nreverse children))
;; is this a valid start tag ?
((eq (char-after) ?>)
(forward-char 1)
;; Now check that we have the right end-tag.
(let ((end (concat "</" node-name "\\s-*>")))
(while (not (looking-at end))
(cond
((eobp)
(error "XML: (Not Well-Formed) End of buffer while reading element `%s'"
node-name))
((looking-at "</")
(forward-char 2)
(error "XML: (Not Well-Formed) Invalid end tag `%s' (expecting `%s')"
(let ((pos (point)))
(buffer-substring pos (if (re-search-forward "\\s-*>" nil t)
(match-beginning 0)
(point-max))))
node-name))
;; Read a sub-element and push it onto CHILDREN.
((= (char-after) ?<)
(let ((tag (xml-parse-tag-1 nil xml-ns)))
(when tag
(push tag children))))
;; Read some character data.
(t
(let ((expansion (xml-parse-string)))
(push (if (stringp (car children))
;; If two strings were separated by a
;; comment, concat them.
(concat (pop children) expansion)
expansion)
children)))))
;; Move point past the end-tag.
(goto-char (match-end 0))
(nreverse children)))
;; Otherwise this was an invalid start tag (expected ">" not found.)
(t
(error "XML: (Well-Formed) Couldn't parse tag: %s"
(buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
;; is this an empty element ?
(if (looking-at "/>")
(progn
(forward-char 2)
(nreverse children))
;; is this a valid start tag ?
(if (eq (char-after) ?>)
(progn
(forward-char 1)
;; Now check that we have the right end-tag. Note that this
;; one might contain spaces after the tag name
(let ((end (concat "</" node-name "\\s-*>")))
(while (not (looking-at end))
(cond
((looking-at "</")
(error "XML: (Not Well-Formed) Invalid end tag (expecting %s) at pos %d"
node-name (point)))
((= (char-after) ?<)
(let ((tag (xml-parse-tag nil xml-ns)))
(when tag
(push tag children))))
(t
(let ((expansion (xml-parse-string)))
(setq children
(if (stringp expansion)
(if (stringp (car children))
;; The two strings were separated by a comment.
(setq children (append (list (concat (car children) expansion))
(cdr children)))
(setq children (append (list expansion) children)))
(setq children (append expansion children))))))))
(goto-char (match-end 0))
(nreverse children)))
;; This was an invalid start tag (Expected ">", but didn't see it.)
(error "XML: (Well-Formed) Couldn't parse tag: %s"
(buffer-substring-no-properties (- (point) 10) (+ (point) 1)))))))
(t ;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
(unless xml-sub-parser ; Usually, we error out.
;; (Not one of PI, CDATA, Comment, End tag, or Start tag)
(t
(unless xml-sub-parser ; Usually, we error out.
(error "XML: (Well-Formed) Invalid character"))
;; However, if we're parsing incrementally, then we need to deal
;; with stray CDATA.
(xml-parse-string)))))
(defun xml-parse-string ()
"Parse the next whatever. Could be a string, or an element."
(let* ((pos (point))
(string (progn (skip-chars-forward "^<")
(buffer-substring-no-properties pos (point)))))
;; Clean up the string. As per XML specifications, the XML
;; processor should always pass the whole string to the
;; application. But \r's should be replaced:
;; http://www.w3.org/TR/2000/REC-xml-20001006#sec-line-ends
(setq pos 0)
(while (string-match "\r\n?" string pos)
(setq string (replace-match "\n" t t string))
(setq pos (1+ (match-beginning 0))))
(xml-substitute-special string)))
"Parse character data at point, and return it as a string.
Leave point at the start of the next thing to parse. This
function can modify the buffer by expanding entity and character
references."
(let ((start (point))
ref val)
(while (and (not (eobp))
(not (looking-at "<")))
;; Find the next < or & character.
(skip-chars-forward "^<&")
(when (eq (char-after) ?&)
;; If we find an entity or character reference, expand it.
(unless (looking-at (eval-when-compile
(concat "&\\(?:#\\([0-9]+\\)\\|#x\\([0-9a-fA-F]+\\)\\|\\("
xml-name-re "\\)\\);")))
(error "XML: (Not Well-Formed) Invalid entity reference"))
;; For a character reference, the next entity or character
;; reference must be after the replacement. [4.6] "Numerical
;; character references are expanded immediately when
;; recognized and MUST be treated as character data."
(cond ((setq ref (match-string 1))
;; Decimal character reference
(setq val (save-match-data
(decode-char 'ucs (string-to-number ref))))
(and (null val)
xml-validating-parser
(error "XML: (Validity) Invalid character `%s'" ref))
(replace-match (or (string val) xml-undefined-entity) t t))
;; Hexadecimal character reference
((setq ref (match-string 2))
(setq val (save-match-data
(decode-char 'ucs (string-to-number ref 16))))
(and (null val)
xml-validating-parser
(error "XML: (Validity) Invalid character `x%s'" ref))
(replace-match (or (string val) xml-undefined-entity) t t))
;; For an entity reference, search again from the start
;; of the replaced text, since the replacement can
;; contain entity or character references, or markup.
((setq ref (match-string 3))
(setq val (assoc ref xml-entity-alist))
(and (null val)
xml-validating-parser
(error "XML: (Validity) Undefined entity `%s'" ref))
(replace-match (cdr val) t t)
(goto-char (match-beginning 0))))))
;; [2.11] Clean up line breaks.
(let ((end-marker (point-marker)))
(goto-char start)
(while (re-search-forward "\r\n?" end-marker t)
(replace-match "\n" t t))
(goto-char end-marker)
(buffer-substring start (point)))))
(defun xml-parse-attlist (&optional xml-ns)
"Return the attribute-list after point.
@ -564,15 +607,11 @@ Leave point at the first non-blank character after the tag."
(skip-syntax-forward " "))
(nreverse attlist)))
;;*******************************************************************
;;**
;;** The DTD (document type declaration)
;;** The following functions know how to skip or parse the DTD of
;;** a document
;;**
;;*******************************************************************
;;; DTD (document type declaration)
;; Fixme: This fails at least if the DTD contains conditional sections.
;; The following functions know how to skip or parse the DTD of a
;; document. FIXME: it fails at least if the DTD contains conditional
;; sections.
(defun xml-skip-dtd ()
"Skip the DTD at point.
@ -789,9 +828,10 @@ references and parameter-entity references."
;; Parameter entity reference
((setq ref (match-string 3 string))
(setq val (assoc ref xml-parameter-entity-alist))
(if val
(push (cdr val) children)
(push (concat "%" ref ";") children))))
(and (null val)
xml-validating-parser
(error "XML: (Validity) Undefined parameter entity `%s'" ref))
(push (or (cdr val) xml-undefined-entity) children)))
(setq string remainder)))
(mapconcat 'identity (nreverse (cons string children)) "")))
@ -828,79 +868,40 @@ references and parameter-entity references."
(t
elem))))
;;*******************************************************************
;;**
;;** Substituting special XML sequences
;;**
;;*******************************************************************
;;; Substituting special XML sequences
(defun xml-substitute-special (string)
"Return STRING, after substituting entity references."
;; This originally made repeated passes through the string from the
;; beginning, which isn't correct, since then either "&amp;amp;" or
;; "&#38;amp;" won't DTRT.
(let ((point 0)
children end-point)
(while (string-match "&\\([^;]*\\);" string point)
(setq end-point (match-end 0))
(let* ((this-part (match-string-no-properties 1 string))
(prev-part (substring string point (match-beginning 0)))
(entity (assoc this-part xml-entity-alist))
(expansion
(cond ((string-match "#\\([0-9]+\\)" this-part)
(let ((c (decode-char
'ucs
(string-to-number (match-string-no-properties 1 this-part)))))
(if c (string c))))
((string-match "#x\\([[:xdigit:]]+\\)" this-part)
(let ((c (decode-char
'ucs
(string-to-number (match-string-no-properties 1 this-part) 16))))
(if c (string c))))
(entity
(cdr entity))
((eq (length this-part) 0)
(error "XML: (Not Well-Formed) No entity given"))
(t
(if xml-validating-parser
(error "XML: (Validity) Undefined entity `%s'"
this-part)
xml-undefined-entity)))))
(cond ((null children)
;; FIXME: If we have an entity that expands into XML, this won't work.
(setq children
(concat prev-part expansion)))
((stringp children)
(if (stringp expansion)
(setq children (concat children prev-part expansion))
(setq children (list expansion (concat prev-part children)))))
((and (stringp expansion)
(stringp (car children)))
(setcar children (concat prev-part expansion (car children))))
((stringp expansion)
(setq children (append (concat prev-part expansion)
children)))
((stringp (car children))
(setcar children (concat (car children) prev-part))
(setq children (append expansion children)))
(t
(setq children (list expansion
prev-part
children))))
(setq point end-point)))
(cond ((stringp children)
(concat children (substring string point)))
((stringp (car (last children)))
(concat (car (last children)) (substring string point)))
((null children)
string)
(t
(concat (mapconcat 'identity
(nreverse children)
"")
(substring string point))))))
"Return STRING, after substituting entity and character references.
STRING is assumed to occur in an XML attribute value."
(let ((ref-re (eval-when-compile
(concat "&\\(?:#\\(x\\)?\\([0-9]+\\)\\|\\("
xml-name-re "\\)\\);")))
children)
(while (string-match ref-re string)
(push (substring string 0 (match-beginning 0)) children)
(let* ((remainder (substring string (match-end 0)))
(ref (match-string 2 string)))
(if ref
;; [4.6] Character references are included as
;; character data.
(let ((val (decode-char 'ucs (string-to-number
ref (if (match-string 1 string) 16)))))
(push (cond (val (string val))
(xml-validating-parser
(error "XML: (Validity) Undefined character `x%s'" ref))
(t xml-undefined-entity))
children)
(setq string remainder))
;; [4.4.5] Entity references are "included in literal".
;; Note that we don't need do anything special to treat
;; quotes as normal data characters.
(setq ref (match-string 3 string))
(let ((val (or (cdr (assoc ref xml-entity-alist))
(if xml-validating-parser
(error "XML: (Validity) Undefined entity `%s'" ref)
xml-undefined-entity))))
(setq string (concat val remainder))))))
(mapconcat 'identity (nreverse (cons string children)) "")))
(defun xml-substitute-numeric-entities (string)
"Substitute SGML numeric entities by their respective utf characters.
@ -921,12 +922,7 @@ by \"*\"."
string)
nil))
;;*******************************************************************
;;**
;;** Printing a tree.
;;** This function is intended mainly for debugging purposes.
;;**
;;*******************************************************************
;;; Printing a parse tree (mainly for debugging).
(defun xml-debug-print (xml &optional indent-string)
"Outputs the XML in the current buffer.

View file

@ -1,3 +1,8 @@
2012-07-02 Chong Yidong <cyd@gnu.org>
* automated/xml-parse-tests.el (xml-parse-tests--data): More
testcases.
2012-07-01 Chong Yidong <cyd@gnu.org>
* automated/xml-parse-tests.el: New file.

View file

@ -33,15 +33,26 @@
'(;; General entity substitution
("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY ent \"AbC\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" .
((foo ((a . "b")) (bar nil "AbC;"))))
("<?xml version=\"1.0\"?><foo>&amp;amp;&#38;apos;&apos;&lt;&gt;&quot;</foo>" .
((foo () "&amp;&apos;'<>\"")))
;; Parameter entity substitution
("<?xml version=\"1.0\"?><!DOCTYPE foo SYSTEM \"bar.dtd\" [<!ENTITY % pent \"AbC\"><!ENTITY ent \"%pent;\">]><foo a=\"b\"><bar>&ent;;</bar></foo>" .
((foo ((a . "b")) (bar nil "AbC;"))))
;; Tricky parameter entity substitution (like XML spec Appendix D)
("<?xml version='1.0'?><!DOCTYPE foo [ <!ENTITY % xx '&#37;zz;'><!ENTITY % zz '&#60;!ENTITY ent \"b\" >' > %xx; ]><foo>A&ent;C</foo>" .
((foo nil "AbC")))
((foo () "AbC")))
;; Bug#7172
("<?xml version=\"1.0\"?><!DOCTYPE foo [ <!ELEMENT EXAM_PLE EMPTY> ]><foo></foo>" .
((foo nil))))
((foo ())))
;; Entities referencing entities, in character data
("<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">]><foo>&abc;</foo>" .
((foo () "aBc")))
;; Entities referencing entities, in attribute values
("<!DOCTYPE foo [ <!ENTITY b \"B\"><!ENTITY abc \"a&b;c\">]><foo a=\"-&abc;-\">1</foo>" .
((foo ((a . "-aBc-")) "1")))
;; Character references must be treated as character data
("<foo>AT&amp;T;</foo>" . ((foo () "AT&T;")))
("<foo>&#38;amp;</foo>" . ((foo () "&amp;"))))
"Alist of XML strings and their expected parse trees.")
(ert-deftest xml-parse-tests ()