;;; xmltok.el --- XML tokenization ;; Copyright (C) 2003 Free Software Foundation, Inc. ;; Author: James Clark ;; Keywords: XML ;; This program is free software; you can redistribute it and/or ;; modify it under the terms of the GNU General Public License as ;; published by the Free Software Foundation; either version 2 of ;; the License, or (at your option) any later version. ;; This program is distributed in the hope that it will be ;; useful, but WITHOUT ANY WARRANTY; without even the implied ;; warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. See the GNU General Public License for more details. ;; You should have received a copy of the GNU General Public ;; License along with this program; if not, write to the Free ;; Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, ;; MA 02111-1307 USA ;;; Commentary: ;; This implements an XML 1.0 parser. It also implements the XML ;; Namespaces Recommendation. It is designed to be conforming, but it ;; works a bit differently from a normal XML parser. An XML document ;; consists of the prolog and an instance. The prolog is parsed as a ;; single unit using `xmltok-forward-prolog'. The instance is ;; considered as a sequence of tokens, where a token is something like ;; a start-tag, a comment, a chunk of data or a CDATA section. The ;; tokenization of the instance is stateless: the tokenization of one ;; part of the instance does not depend on tokenization of the ;; preceding part of the instance. This allows the instance to be ;; parsed incrementally. The main entry point is `xmltok-forward': ;; this can be called at any point in the instance provided it is ;; between tokens. The other entry point is `xmltok-forward-special' ;; which skips over tokens other comments, processing instructions or ;; CDATA sections (i.e. the constructs in an instance that can contain ;; less than signs that don't start a token). ;; ;; This is a non-validating XML 1.0 processor. It does not resolve ;; parameter entities (including the external DTD subset) and it does ;; not resolve external general entities. ;; ;; It is non-conformant by design in the following respects. ;; ;; 1. It expects the client to detect aspects of well-formedness that ;; are not internal to a single token, specifically checking that ;; end-tags match start-tags and that the instance contains exactly ;; one element. ;; ;; 2. It expects the client to detect duplicate attributes. Detection ;; of duplicate attributes after expansion of namespace prefixes ;; requires the namespace processing state. Detection of duplicate ;; attributes before expansion of namespace prefixes does not, but is ;; redundant given that the client will do detection of duplicate ;; attributes after expansion of namespace prefixes. ;; ;; 3. It allows the client to recover from well-formedness errors. ;; This is essential for use in applications where the document is ;; being parsed during the editing process. ;; ;; 4. It does not support documents that do not conform to the lexical ;; requirements of the XML Namespaces Recommendation (e.g. a document ;; with a colon in an entity name). ;; ;; There are also a number of things that have not yet been ;; implemented that make it non-conformant. ;; ;; 1. It does not implement default attributes. ATTLIST declarations ;; are parsed, but no checking is done on the content of attribute ;; value literals specifying default attribute values, and default ;; attribute values are not reported to the client. ;; ;; 2. It does not implement internal entities containing elements. If ;; an internal entity is referenced and parsing its replacement text ;; yields one or more tags, then it will skip the reference and ;; report this to the client. ;; ;; 3. It does not check the syntax of public identifiers in the DTD. ;; ;; 4. It allows some non-ASCII characters in certain situations where ;; it should not. For example, it only enforces XML 1.0's ;; restrictions on name characters strictly for ASCII characters. The ;; problem here is XML's character model is based squarely on Unicode, ;; whereas Emacs's is not (as of version 21). It is not clear what ;; the right thing to do is. ;;; Code: (defvar xmltok-type nil) (defvar xmltok-start nil) (defvar xmltok-name-colon nil) (defvar xmltok-name-end nil) (defvar xmltok-replacement nil "String containing replacement for a character or entity reference.") (defvar xmltok-attributes nil "List containing attributes of last scanned element. Each member of the list is a vector representing an attribute, which can be accessed using the functions `xmltok-attribute-name-start', `xmltok-attribute-name-colon', `xmltok-attribute-name-end', `xmltok-attribute-value-start', `xmltok-attribute-value-end', `xmltok-attribute-raw-normalized-value', `xmltok-attribute-refs'.") (defvar xmltok-namespace-attributes nil "List containing namespace declarations of last scanned element. List has same format as `xmltok-attributes'.") (defvar xmltok-dtd nil "Information about the DTD used by `xmltok-forward'. `xmltok-forward-prolog' sets this up. It consists of an alist of general entity names vs definitions. The first member of the alist is t if references to entities not in the alist are well-formed \(e.g. because there's an external subset that wasn't parsed). Each general entity name is a string. The definition is either nil, a symbol, a string, a cons cell. If the definition is nil, then it means that it's an internal entity but the result of parsing it is unknown. If it is a symbol, then the symbol is either `unparsed', meaning the entity is an unparsed entity, `external', meaning the entity is or references an external entity, `element', meaning the entity includes one or more elements, or `not-well-formed', meaning the replacement text is not well-formed. If the definition is a string, then the replacement text of the entity is that string; this happens only during the parsing of the prolog. If the definition is a cons cell \(ER . AR), then ER specifies the string that results from referencing the entity in element content and AR is either nil, meaning the replacement text included a <, or a string which is the normalized attribute value.") (defvar xmltok-dependent-regions nil "List of descriptors of regions that a parsed token depends on. A token depends on a region if the region occurs after the token and a change in the region may require the token to be reparsed. This only happens with markup that is not well-formed. For example, if a , then the then the buffer must be reparsed from the space-count 0) (setq xmltok-type 'space)) (t (goto-char (1+ (point))) (xmltok-scan-after-lt)))) ((eq ch ?\&) (cond ((> space-count 0) (setq xmltok-type 'space)) (t (goto-char (1+ (point))) (xmltok-scan-after-amp (lambda (start end) (xmltok-handle-entity start end)))))) ((re-search-forward "[<&]\\|\\(]]>\\)" nil t) (cond ((not (match-beginning 1)) (goto-char (match-beginning 0)) ;; must have got a non-space char (setq xmltok-type 'data)) ((= (match-beginning 1) xmltok-start) (xmltok-add-error "Found `]]>' not closing a CDATA section") (setq xmltok-type 'not-well-formed)) (t (goto-char (match-beginning 0)) (setq xmltok-type (if (= (point) (+ xmltok-start space-count)) 'space 'data))))) ((eq ch nil) (setq xmltok-type (if (> space-count 0) 'space nil))) (t (goto-char (point-max)) (setq xmltok-type 'data))))) (defun xmltok-forward-special (bound) "Scan forward past the first special token starting at or after point. Return nil if there is no special token that starts before BOUND. CDATA sections, processing instructions and comments (and indeed anything starting with < following by ? or !) count as special. Return the type of the token." (when (re-search-forward "<[?!]" (1+ bound) t) (setq xmltok-start (match-beginning 0)) (goto-char (1+ xmltok-start)) (let ((case-fold-search nil)) (xmltok-scan-after-lt)))) (eval-when-compile ;; A symbolic regexp is represented by a list whose CAR is the string ;; containing the regexp and whose cdr is a list of symbolic names ;; for the groups in the string. ;; Construct a symbolic regexp from a regexp. (defun xmltok-r (str) (cons str nil)) ;; Concatenate zero of more regexps and symbolic regexps. (defun xmltok+ (&rest args) (let (strs names) (while args (let ((arg (car args))) (if (stringp arg) (setq strs (cons arg strs)) (setq strs (cons (car arg) strs)) (setq names (cons (cdr arg) names))) (setq args (cdr args)))) (cons (apply 'concat (nreverse strs)) (apply 'append (nreverse names)))))) (eval-when-compile ;; Make a symbolic group named NAME from the regexp R. ;; R may be a symbolic regexp or an ordinary regexp. (defmacro xmltok-g (name &rest r) (let ((sym (make-symbol "r"))) `(let ((,sym (xmltok+ ,@r))) (if (stringp ,sym) (cons (concat "\\(" ,sym "\\)") (cons ',name nil)) (cons (concat "\\(" (car ,sym) "\\)") (cons ',name (cdr ,sym))))))) (defun xmltok-p (&rest r) (xmltok+ "\\(?:" (apply 'xmltok+ r) "\\)")) ;; Get the group index of ELEM in a LIST of symbols. (defun xmltok-get-index (elem list) (or elem (error "Missing group name")) (let ((found nil) (i 1)) (while list (cond ((eq elem (car list)) (setq found i) (setq list nil)) (t (setq i (1+ i)) (setq list (cdr list))))) (or found (error "Bad group name %s" elem)))) ;; Define a macro SYM using a symbolic regexp R. ;; SYM can be called in three ways: ;; (SYM regexp) ;; expands to the regexp in R ;; (SYM start G) ;; expands to ;; (match-beginning N) ;; where N is the group index of G in R. ;; (SYM end G) ;; expands to ;; (match-end N) ;; where N is the group index of G in R. (defmacro xmltok-defregexp (sym r) `(defalias ',sym (let ((r ,r)) `(macro lambda (action &optional group-name) (cond ((eq action 'regexp) ,(car r)) ((or (eq action 'start) (eq action 'beginning)) (list 'match-beginning (xmltok-get-index group-name ',(cdr r)))) ((eq action 'end) (list 'match-end (xmltok-get-index group-name ',(cdr r)))) ((eq action 'string) (list 'match-string (xmltok-get-index group-name ',(cdr r)))) ((eq action 'string-no-properties) (list 'match-string-no-properties (xmltok-get-index group-name ',(cdr r)))) (t (error "Invalid action: %s" action)))))))) (eval-when-compile (let* ((or "\\|") (open "\\(?:") (gopen "\\(") (close "\\)") (name-start-char "[_[:alpha:]]") (name-continue-not-start-char "[-.[:digit:]]") (name-continue-char "[-._[:alnum:]]") (* "*") (+ "+") (opt "?") (question "\\?") (s "[ \r\t\n]") (s+ (concat s +)) (s* (concat s *)) (ncname (concat name-start-char name-continue-char *)) (entity-ref (xmltok+ (xmltok-g entity-name ncname) (xmltok-g entity-ref-close ";") opt)) (decimal-ref (xmltok+ (xmltok-g decimal "[0-9]" +) (xmltok-g decimal-ref-close ";") opt)) (hex-ref (xmltok+ "x" open (xmltok-g hex "[0-9a-fA-F]" +) (xmltok-g hex-ref-close ";") opt close opt)) (char-ref (xmltok+ (xmltok-g number-sign "#") open decimal-ref or hex-ref close opt)) (start-tag-close (xmltok+ open (xmltok-g start-tag-close s* ">") or open (xmltok-g empty-tag-slash s* "/") (xmltok-g empty-tag-close ">") opt close or (xmltok-g start-tag-s s+) close)) (start-tag (xmltok+ (xmltok-g start-tag-name ncname (xmltok-g start-tag-colon ":" ncname) opt) start-tag-close opt)) (end-tag (xmltok+ (xmltok-g end-tag-slash "/") open (xmltok-g end-tag-name ncname (xmltok-g end-tag-colon ":" ncname) opt) (xmltok-g end-tag-close s* ">") opt close opt)) (comment (xmltok+ (xmltok-g markup-declaration "!") (xmltok-g comment-first-dash "-" (xmltok-g comment-open "-") opt) opt)) (cdata-section (xmltok+ "!" (xmltok-g marked-section-open "\\[") open "C" open "D" open "A" open "T" open "A" (xmltok-g cdata-section-open "\\[" ) opt close opt ; A close opt ; T close opt ; A close opt ; D close opt)) ; C (processing-instruction (xmltok-g processing-instruction-question question))) (xmltok-defregexp xmltok-ncname (xmltok+ open ncname close)) (xmltok-defregexp xmltok-after-amp (xmltok+ entity-ref or char-ref)) (xmltok-defregexp xmltok-after-lt (xmltok+ start-tag or end-tag ;; cdata-section must come before comment ;; because we treat "))) (xmltok-defregexp xmltok-prolog (let* ((single-char (xmltok-g single-char "[[|,(\"'>]")) (internal-subset-close (xmltok-g internal-subset-close "][ \t\r\n]*>")) (starts-with-close-paren (xmltok-g close-paren ")" (xmltok-p (xmltok-g close-paren-occur "[+?]") or (xmltok-g close-paren-star "\\*")) opt)) (starts-with-percent (xmltok-g percent "%" (xmltok-g param-entity-ref ncname (xmltok-g param-entity-ref-close ";") opt) opt)) (starts-with-nmtoken-not-name (xmltok-g nmtoken (xmltok-p name-continue-not-start-char or ":") (xmltok-p name-continue-char or ":") *)) (nmtoken-after-colon (xmltok+ (xmltok-p name-continue-not-start-char or ":") (xmltok-p name-continue-char or ":") * or name-start-char name-continue-char * ":" (xmltok-p name-continue-char or ":") *)) (after-ncname (xmltok+ (xmltok-g ncname-nmtoken ":" (xmltok-p nmtoken-after-colon)) or (xmltok-p (xmltok-g colon ":" ncname) (xmltok-g colon-name-occur "[?+*]") opt) or (xmltok-g ncname-occur "[?+*]") or (xmltok-g ncname-colon ":"))) (starts-with-name (xmltok-g name ncname (xmltok-p after-ncname) opt)) (starts-with-hash (xmltok-g pound "#" (xmltok-g hash-name ncname))) (markup-declaration (xmltok-g markup-declaration "!" (xmltok-p (xmltok-g comment-first-dash "-" (xmltok-g comment-open "-") opt) or (xmltok-g named-markup-declaration ncname)) opt)) (after-lt (xmltok+ markup-declaration or (xmltok-g processing-instruction-question question) or (xmltok-g instance-start ncname))) (starts-with-lt (xmltok-g less-than "<" (xmltok-p after-lt) opt))) (xmltok+ starts-with-lt or single-char or starts-with-close-paren or starts-with-percent or starts-with-name or starts-with-nmtoken-not-name or starts-with-hash or internal-subset-close))))) (defconst xmltok-ncname-regexp (xmltok-ncname regexp)) (defun xmltok-scan-after-lt () (cond ((not (looking-at (xmltok-after-lt regexp))) (xmltok-add-error "`<' that is not markup must be entered as `<'") (setq xmltok-type 'not-well-formed)) (t (goto-char (match-end 0)) (cond ((xmltok-after-lt start start-tag-close) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-attributes nil) (setq xmltok-namespace-attributes nil) (setq xmltok-type 'start-tag)) ((xmltok-after-lt start end-tag-close) (setq xmltok-name-end (xmltok-after-lt end end-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start end-tag-colon)) (setq xmltok-type 'end-tag)) ((xmltok-after-lt start start-tag-s) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-namespace-attributes nil) (setq xmltok-attributes nil) (xmltok-scan-attributes) xmltok-type) ((xmltok-after-lt start empty-tag-close) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-attributes nil) (setq xmltok-namespace-attributes nil) (setq xmltok-type 'empty-element)) ((xmltok-after-lt start cdata-section-open) (setq xmltok-type (if (search-forward "]]>" nil t) 'cdata-section (xmltok-add-error "No closing ]]>") (xmltok-add-dependent 'xmltok-unclosed-reparse-p nil nil "]]>") 'not-well-formed))) ((xmltok-after-lt start processing-instruction-question) (xmltok-scan-after-processing-instruction-open)) ((xmltok-after-lt start comment-open) (xmltok-scan-after-comment-open)) ((xmltok-after-lt start empty-tag-slash) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-attributes nil) (setq xmltok-namespace-attributes nil) (xmltok-add-error "Expected `/>'" (1- (point))) (setq xmltok-type 'partial-empty-element)) ((xmltok-after-lt start start-tag-name) (xmltok-add-error "Missing `>'" nil (1+ xmltok-start)) (setq xmltok-name-end (xmltok-after-lt end start-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start start-tag-colon)) (setq xmltok-namespace-attributes nil) (setq xmltok-attributes nil) (setq xmltok-type 'partial-start-tag)) ((xmltok-after-lt start end-tag-name) (setq xmltok-name-end (xmltok-after-lt end end-tag-name)) (setq xmltok-name-colon (xmltok-after-lt start end-tag-colon)) (cond ((and (not xmltok-name-colon) (eq (char-after) ?:)) (goto-char (1+ (point))) (xmltok-add-error "Expected name following `:'" (1- (point)))) (t (xmltok-add-error "Missing `>'" nil (1+ xmltok-start)))) (setq xmltok-type 'partial-end-tag)) ((xmltok-after-lt start end-tag-slash) (xmltok-add-error "Expected name following `) in unclosed PI (defun xmltok-scan-after-processing-instruction-open () (cond ((not (search-forward "?>" nil t)) (xmltok-add-error "No closing ?>" xmltok-start (+ xmltok-start 2)) (xmltok-add-dependent 'xmltok-unclosed-reparse-p nil nil "?>") (setq xmltok-type 'not-well-formed)) (t (cond ((not (save-excursion (goto-char (+ 2 xmltok-start)) (and (looking-at (xmltok-ncname regexp)) (setq xmltok-name-end (match-end 0))))) (setq xmltok-name-end (+ xmltok-start 2)) (xmltok-add-error "") (xmltok-add-dependent 'xmltok-unclosed-reparse-p nil nil ;; not --> because ;; -- is not allowed ;; in comments in XML "--") 'not-well-formed) ((eq (char-after) ?>) (goto-char (1+ (point))) 'comment) (t (xmltok-add-dependent 'xmltok-semi-closed-reparse-p nil (point) "--" 2) ;; just include the