-*******************************************************************************
-merge from trunk to acl6 branch (for 6.1.beta)
-command: ../../join.sh trunk trunk_to_acl6_merge2 trunk_to_acl6_merge3 xmlutils
-*******************************************************************************
+2003-02-13 Kevin Layer <layer@crikey>
-*******************************************************************************
-merge from trunk to acl6 branch
-command: ../../join.sh trunk trunk_to_acl6_merge1 trunk_to_acl6_merge2 xmlutils
-*******************************************************************************
+ from jkf:
+ * phtml.cl: bug13050: parse-html close tag closes consecutive
+ identical open tags
*******************************************************************************
-merge from trunk to acl6 branch
-command: ../../join.sh trunk acl6 trunk_to_acl6_merge1 xmlutils
+Mon Jun 17 11:29:36 PDT 2002
+merge from trunk to acl62 branch (for 6.2)
+command: ../../join.sh trunk acl62 trunk_to_acl62_merge1 xmlutils
*******************************************************************************
+2002-05-14 John Foderaro <jkf@tiger.franz.com>
+
+ * phtml.cl: - add :parse-entities arg to parse-html. If true then
+ entities are converted to the character they represent.
+
+2002-04-29 John Foderaro <jkf@tiger.franz.com>
+
+ * pxml1.cl (check-xmldecl): - if the xml tag specifies an
+ encoding then set lisp's external format to that encoding.
+
2001-06-08 Steve Haflich <smh@romeo>
* pxml.htm: Added mention that it is necessary to load or require
-;; $Id: build.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $
+;; $Id: build.cl,v 1.2 2003/06/20 02:21:23 kevin Exp $
(in-package :user)
dh_clean -k
# Add here commands to install the package into debian/xmlutils.
dh_installdirs $(clc-systems) $(clc-xmlutils) $(doc-dir)
- dh_install debian/xmlutils.asd phtml.cl $(wildcard pxml[0-3].cl) build.cl $(clc-xmlutils)
+ dh_install xmlutils.asd phtml.cl $(wildcard pxml[0-3].cl) build.cl $(clc-xmlutils)
dh_install $(shell echo *.html) $(doc-dir)
dh_link $(clc-xmlutils)/xmlutils.asd $(clc-systems)/xmlutils.asd
+++ /dev/null
-;;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Base: 10 -*-
-;;;; *************************************************************************
-;;;; FILE IDENTIFICATION
-;;;;
-;;;; Name: xmlutils.asd
-;;;; Purpose: ASDF definition file for Xmlutils
-;;;; Programmer: Kevin M. Rosenberg
-;;;; Date Started: Sep 2002
-;;;;
-;;;; $Id: xmlutils.asd,v 1.4 2002/11/08 16:51:40 kevin Exp $
-;;;;
-;;;; This file, part of cl-xmlutils, is Copyright (c) 2002 by Kevin M. Rosenberg
-;;;;
-;;;; cl-xmlutils users are granted the rights to distribute and use this software
-;;;; as governed by the terms of the GNU Lesser General Public License
-;;;; (http://www.gnu.org/licenses/lgpl.html)
-;;;; *************************************************************************
-
-(in-package :asdf)
-
-#-allegro (require :acl-compat)
-
-#+lispworks
-(eval-when (:compile-toplevel :load-toplevel :execute)
- (defvar system::*stack-overflow-behavior* :warn)
- (setq system::*stack-overflow-behavior* :warn))
-
-(defsystem :xmlutils
- :name "cl-xmlutils"
- :author "Franz, Inc"
- :maintainer "Kevin M. Rosenberg <kmr@debian.org>"
- :licence "GNU Lesser General Public License"
- :description "Franz's Test Harness Package"
- :long-description "Xmlutils provides a library for parsing HTML and XML documents."
-
- :perform (load-op :after (op xmlutils)
- (pushnew :xmlutils cl:*features*))
-
- :components
- ((:file "phtml")
- (:file "pxml0")
- (:file "pxml1" :depends-on ("pxml0"))
- (:file "pxml2" :depends-on ("pxml1"))
- (:file "pxml3" :depends-on ("pxml2"))
- ))
-
-(defmethod source-file-type ((c cl-source-file) (s (eql (find-system :xmlutils))))
- "cl")
-
;; Suite 330, Boston, MA 02111-1307 USA
;;
-;; $Id: phtml-test.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $
+;; $Id: phtml-test.cl,v 1.2 2003/06/20 02:21:23 kevin Exp $
(eval-when (compile load eval)
(require :tester))
+(sys:defpatch "phtml" 1
+ "parse-html close tag closes consecutive identical open tags."
+ :type :system
+ :post-loadable t)
+
;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA
;;
;; This code is free software; you can redistribute it and/or
;; Suite 330, Boston, MA 02111-1307 USA
;;
-;; $Id: phtml.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $
+;; $Id: phtml.cl,v 1.2 2003/06/20 02:21:23 kevin Exp $
;; phtml.cl - parse html
;; Change Log
+;; 05/14/02 - add :parse-entities arg to parse-html. If true then
+;; entities are converted to the character they represent.
;;
;; 02/05/01 symbols mapped to preferred case at runtime (as opposed to
;; a compile time macro determining the case mapping)
(not (zerop (logand (svref *characteristics* code) bit))))))
+(defvar *html-entity-to-code*
+ (let ((table (make-hash-table :test #'equal)))
+ (dolist (ent '(("nbsp" . 160)
+ ("iexcl" . 161)
+ ("cent" . 162)
+ ("pound" . 163)
+ ("curren" . 164)
+ ("yen" . 165)
+ ("brvbar" . 166)
+ ("sect" . 167)
+ ("uml" . 168)
+ ("copy" . 169)
+ ("ordf" . 170)
+ ("laquo" . 171)
+ ("not" . 172)
+ ("shy" . 173)
+ ("reg" . 174)
+ ("macr" . 175)
+ ("deg" . 176)
+ ("plusmn" . 177)
+ ("sup2" . 178)
+ ("sup3" . 179)
+ ("acute" . 180)
+ ("micro" . 181)
+ ("para" . 182)
+ ("middot" . 183)
+ ("cedil" . 184)
+ ("sup1" . 185)
+ ("ordm" . 186)
+ ("raquo" . 187)
+ ("frac14" . 188)
+ ("frac12" . 189)
+ ("frac34" . 190)
+ ("iquest" . 191)
+ ("Agrave" . 192)
+ ("Aacute" . 193)
+ ("Acirc" . 194)
+ ("Atilde" . 195)
+ ("Auml" . 196)
+ ("Aring" . 197)
+ ("AElig" . 198)
+ ("Ccedil" . 199)
+ ("Egrave" . 200)
+ ("Eacute" . 201)
+ ("Ecirc" . 202)
+ ("Euml" . 203)
+ ("Igrave" . 204)
+ ("Iacute" . 205)
+ ("Icirc" . 206)
+ ("Iuml" . 207)
+ ("ETH" . 208)
+ ("Ntilde" . 209)
+ ("Ograve" . 210)
+ ("Oacute" . 211)
+ ("Ocirc" . 212)
+ ("Otilde" . 213)
+ ("Ouml" . 214)
+ ("times" . 215)
+ ("Oslash" . 216)
+ ("Ugrave" . 217)
+ ("Uacute" . 218)
+ ("Ucirc" . 219)
+ ("Uuml" . 220)
+ ("Yacute" . 221)
+ ("THORN" . 222)
+ ("szlig" . 223)
+ ("agrave" . 224)
+ ("aacute" . 225)
+ ("acirc" . 226)
+ ("atilde" . 227)
+ ("auml" . 228)
+ ("aring" . 229)
+ ("aelig" . 230)
+ ("ccedil" . 231)
+ ("egrave" . 232)
+ ("eacute" . 233)
+ ("ecirc" . 234)
+ ("euml" . 235)
+ ("igrave" . 236)
+ ("iacute" . 237)
+ ("icirc" . 238)
+ ("iuml" . 239)
+ ("eth" . 240)
+ ("ntilde" . 241)
+ ("ograve" . 242)
+ ("oacute" . 243)
+ ("ocirc" . 244)
+ ("otilde" . 245)
+ ("ouml" . 246)
+ ("divide" . 247)
+ ("oslash" . 248)
+ ("ugrave" . 249)
+ ("uacute" . 250)
+ ("ucirc" . 251)
+ ("uuml" . 252)
+ ("yacute" . 253)
+ ("thorn" . 254)
+ ("yuml" . 255)
+ ("fnof" . 402)
+ ("Alpha" . 913)
+ ("Beta" . 914)
+ ("Gamma" . 915)
+ ("Delta" . 916)
+ ("Epsilon" . 917)
+ ("Zeta" . 918)
+ ("Eta" . 919)
+ ("Theta" . 920)
+ ("Iota" . 921)
+ ("Kappa" . 922)
+ ("Lambda" . 923)
+ ("Mu" . 924)
+ ("Nu" . 925)
+ ("Xi" . 926)
+ ("Omicron" . 927)
+ ("Pi" . 928)
+ ("Rho" . 929)
+ ("Sigma" . 931)
+ ("Tau" . 932)
+ ("Upsilon" . 933)
+ ("Phi" . 934)
+ ("Chi" . 935)
+ ("Psi" . 936)
+ ("Omega" . 937)
+ ("alpha" . 945)
+ ("beta" . 946)
+ ("gamma" . 947)
+ ("delta" . 948)
+ ("epsilon" . 949)
+ ("zeta" . 950)
+ ("eta" . 951)
+ ("theta" . 952)
+ ("iota" . 953)
+ ("kappa" . 954)
+ ("lambda" . 955)
+ ("mu" . 956)
+ ("nu" . 957)
+ ("xi" . 958)
+ ("omicron" . 959)
+ ("pi" . 960)
+ ("rho" . 961)
+ ("sigmaf" . 962)
+ ("sigma" . 963)
+ ("tau" . 964)
+ ("upsilon" . 965)
+ ("phi" . 966)
+ ("chi" . 967)
+ ("psi" . 968)
+ ("omega" . 969)
+ ("thetasym" . 977)
+ ("upsih" . 978)
+ ("piv" . 982)
+ ("bull" . 8226)
+ ("hellip" . 8230)
+ ("prime" . 8242)
+ ("Prime" . 8243)
+ ("oline" . 8254)
+ ("frasl" . 8260)
+ ("weierp" . 8472)
+ ("image" . 8465)
+ ("real" . 8476)
+ ("trade" . 8482)
+ ("alefsym" . 8501)
+ ("larr" . 8592)
+ ("uarr" . 8593)
+ ("rarr" . 8594)
+ ("darr" . 8595)
+ ("harr" . 8596)
+ ("crarr" . 8629)
+ ("lArr" . 8656)
+ ("uArr" . 8657)
+ ("rArr" . 8658)
+ ("dArr" . 8659)
+ ("hArr" . 8660)
+ ("forall" . 8704)
+ ("part" . 8706)
+ ("exist" . 8707)
+ ("empty" . 8709)
+ ("nabla" . 8711)
+ ("isin" . 8712)
+ ("notin" . 8713)
+ ("ni" . 8715)
+ ("prod" . 8719)
+ ("sum" . 8721)
+ ("minus" . 8722)
+ ("lowast" . 8727)
+ ("radic" . 8730)
+ ("prop" . 8733)
+ ("infin" . 8734)
+ ("ang" . 8736)
+ ("and" . 8743)
+ ("or" . 8744)
+ ("cap" . 8745)
+ ("cup" . 8746)
+ ("int" . 8747)
+ ("there4" . 8756)
+ ("sim" . 8764)
+ ("cong" . 8773)
+ ("asymp" . 8776)
+ ("ne" . 8800)
+ ("equiv" . 8801)
+ ("le" . 8804)
+ ("ge" . 8805)
+ ("sub" . 8834)
+ ("sup" . 8835)
+ ("nsub" . 8836)
+ ("sube" . 8838)
+ ("supe" . 8839)
+ ("oplus" . 8853)
+ ("otimes" . 8855)
+ ("perp" . 8869)
+ ("sdot" . 8901)
+ ("lceil" . 8968)
+ ("rceil" . 8969)
+ ("lfloor" . 8970)
+ ("rfloor" . 8971)
+ ("lang" . 9001)
+ ("rang" . 9002)
+ ("loz" . 9674)
+ ("spades" . 9824)
+ ("clubs" . 9827)
+ ("hearts" . 9829)
+ ("diams" . 9830)
+ ("quot" . 34)
+ ("amp" . 38)
+ ("lt" . 60)
+ ("gt" . 62)
+ ("OElig" . 338)
+ ("oelig" . 339)
+ ("Scaron" . 352)
+ ("scaron" . 353)
+ ("Yuml" . 376)
+ ("circ" . 710)
+ ("tilde" . 732)
+ ("ensp" . 8194)
+ ("emsp" . 8195)
+ ("thinsp" . 8201)
+ ("zwnj" . 8204)
+ ("zwj" . 8205)
+ ("lrm" . 8206)
+ ("rlm" . 8207)
+ ("ndash" . 8211)
+ ("mdash" . 8212)
+ ("lsquo" . 8216)
+ ("rsquo" . 8217)
+ ("sbquo" . 8218)
+ ("ldquo" . 8220)
+ ("rdquo" . 8221)
+ ("bdquo" . 8222)
+ ("dagger" . 8224)
+ ("Dagger" . 8225)
+ ("permil" . 8240)
+ ("lsaquo" . 8249)
+ ("rsaquo" . 8250)
+ ("euro" . 8364)
+ ))
+ (setf (gethash (car ent) table) (cdr ent)))
+ table))
+
+
+
(defstruct tokenbuf
cur ;; next index to use to grab from tokenbuf
max ;; index one beyond last character
(defun next-token (stream ignore-strings raw-mode-delimiter
- read-sequence-func tokenbuf)
+ read-sequence-func tokenbuf parse-entities)
(declare (optimize (speed 3) (safety 1)))
;; return two values:
;; the next token from the stream.
(return)
else ; collect a tag
(setq state state-readtagfirst))
+ elseif (and parse-entities (eq ch #\&))
+ then ; reading an entity. entity ends at semicolon
+ (let (res (max 10))
+ (loop (let ((ch (next-char stream)))
+ (if* (null ch)
+ then (error "End of file after & entity marker")
+ elseif (eq ch #\;)
+ then (return)
+ elseif (zerop (decf max))
+ then (error "No semicolon found after entity starting: &~{~a~}" (nreverse res))
+ else (push ch res))))
+ (setq res (nreverse res))
+ (if* (eq (car res) #\#)
+ then ; decimal entity
+ (let ((count 0))
+ (dolist (ch (cdr res))
+ (let ((code (char-code ch)))
+ (if* (<= #.(char-code #\0)
+ code
+ #.(char-code #\9))
+ then (setq count
+ (+ (* 10 count)
+ (- code
+ #.(char-code #\0))))
+ else (error "non decimal digit after &# - ~s" ch)
+ )))
+ (add-to-coll coll (code-char count)))
+ else (let ((name (make-array (length res)
+ :element-type 'character
+ :initial-contents res)))
+ (let ((ch (gethash name *html-entity-to-code*)))
+ (if* ch
+ then (add-to-coll coll (code-char ch))
+ else (error "No such entity as ~s" name))))))
+
else ; we will check for & here eventually
(if* (not (eq ch #\return))
then (add-to-coll coll ch))))
(#.state-readtag
(when (null tag-to-return)
- (error "unexpected end of input encountered"))
+ (error "unexpected end of input encountered"))
;; we've read a tag with no attributes
(put-back-collector coll)
(values tag-to-return
(defmethod parse-html ((p stream) &key callback-only callbacks collect-rogue-tags
- no-body-tags)
+ no-body-tags
+ parse-entities)
(declare (optimize (speed 3) (safety 1)))
(phtml-internal p nil callback-only callbacks collect-rogue-tags
- no-body-tags))
+ no-body-tags parse-entities))
(defmacro tag-callback (tag)
`(rest (assoc ,tag callbacks)))
-(defun phtml-internal (p read-sequence-func callback-only callbacks collect-rogue-tags
-
- no-body-tags)
+(defun phtml-internal (p read-sequence-func callback-only
+ callbacks collect-rogue-tags
+ no-body-tags
+ parse-entities)
(declare (optimize (speed 3) (safety 1)))
(let ((raw-mode-delimiter nil)
(pending nil)
(guts)
(rogue-tags)
)
- (labels ((close-off-tags (name stop-at collect-rogues)
+ (labels ((close-off-tags (name stop-at collect-rogues once-only)
;; close off an open 'name' tag, but search no further
;; than a 'stop-at' tag.
+ #+ignore (format t "close off name ~s, stop at ~s, ct ~s~%"
+ name stop-at current-tag)
(if* (member (tag-name current-tag) name :test #'eq)
then ;; close current tag(s)
(loop
*known-tags*)))
(push (tag-name current-tag) rogue-tags))
(close-current-tag)
- (when (or (member (tag-name current-tag)
- *ch-format*)
- (not (member
- (tag-name current-tag) name :test #'eq)))
- (return)))
+ (if* (or once-only
+ (member (tag-name current-tag)
+ *ch-format*)
+ (not (member
+ (tag-name current-tag) name :test #'eq)))
+ then (return)))
elseif (member (tag-name current-tag) stop-at :test #'eq)
then nil
else ; search if there is a tag to close
(push element guts))))
(save-state ()
- ;; push the current tag state since we're starting
+ ;; push the current tag state since we're starting:
;; a new open tag
- (push (cons current-tag guts) pending))
+ (push (cons current-tag guts) pending)
+ #+ignore (format t "state saved, pending ~s~%" pending)
+ )
(strip-rev-pcdata (stuff)
(if* (eq kind :start-tag) then (push val new-opens)
elseif (member val new-opens :test #'eq) then
(setf new-opens (remove val new-opens :count 1))
- else (close-off-tags (list val) nil nil)
+ else (close-off-tags (list val) nil nil nil)
)))))
(get-next-token (force)
(if* (or force (null (tokenbuf-first-pass tokenbuf))) then
(multiple-value-bind (val kind)
(next-token p nil raw-mode-delimiter read-sequence-func
- tokenbuf)
- (values val kind))
+ tokenbuf parse-entities)
+ (values val kind))
else
(let ((val (first (tokenbuf-first-pass tokenbuf)))
(kind (second (tokenbuf-first-pass tokenbuf))))
(loop
(multiple-value-bind (val kind)
(get-next-token nil)
- ;;(format t "val: ~s kind: ~s~%" val kind)
+ #+ignore (format t "val: ~s kind: ~s last-tag ~s pending ~s~%" val kind
+ last-tag pending)
(case kind
(:pcdata
(when (or (and callback-only current-callback-tags)
(when (and (= (length raw-mode-delimiter) 1) ;; xml tag...
(or (and callback-only current-callback-tags)
(not callback-only)))
- (close-off-tags (list last-tag) nil nil))
+ (close-off-tags (list last-tag) nil nil t))
(setf raw-mode-delimiter nil)
)
then "</STYLE>"
else "</style>"))
elseif (or (eq last-tag :script)
- (and (listp last-tag) (eq (first last-tag) :script)))
+ (and (listp last-tag) (eq (first last-tag) :script)))
then
(setf raw-mode-delimiter
(if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
(not callback-only))
(if* auto-close
then (setq auto-close-stop (tag-auto-close-stop name))
- (close-off-tags auto-close auto-close-stop nil))
+ (close-off-tags auto-close auto-close-stop nil nil))
(when (and pending-ch-format (not no-end))
(if* (member name *ch-format* :test #'eq) then nil
elseif (member name *in-line* :test #'eq) then
(check-in-line name)
else ;; close ALL pending char tags and then reopen
(dolist (this-tag (reverse pending-ch-format))
- (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil))
+ (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil nil))
))
(if* no-end
then ; this is a singleton tag
(setf raw-mode-delimiter nil)
(when (or (and callback-only current-callback-tags)
(not callback-only))
- (close-off-tags (list val) nil nil)
+ (close-off-tags (list val) nil nil t)
(when (member val *ch-format* :test #'eq)
(setf pending-ch-format
(remove val pending-ch-format :count 1
;; close off all tags
(when (or (and callback-only current-callback-tags)
(not callback-only))
- (close-off-tags '(:start-parse) nil collect-rogue-tags))
+ (close-off-tags '(:start-parse) nil collect-rogue-tags nil))
(put-back-tokenbuf tokenbuf)
(if collect-rogue-tags
(return (values (cdar guts) rogue-tags))
(defmethod parse-html (file &key callback-only callbacks collect-rogue-tags
- no-body-tags)
+ no-body-tags parse-entities)
(declare (optimize (speed 3) (safety 1)))
(with-open-file (p file :direction :input)
(parse-html p :callback-only callback-only :callbacks callbacks
:collect-rogue-tags collect-rogue-tags
- :no-body-tags no-body-tags)))
+ :no-body-tags no-body-tags
+ :parse-entities parse-entities
+ )))
(defmethod parse-html ((str string) &key callback-only callbacks collect-rogue-tags
- no-body-tags)
+ no-body-tags parse-entities)
(declare (optimize (speed 3) (safety 1)))
(parse-html (make-string-input-stream str)
:callback-only callback-only :callbacks callbacks
:collect-rogue-tags collect-rogue-tags
- :no-body-tags no-body-tags))
+ :no-body-tags no-body-tags
+ :parse-entities parse-entities
+ ))
--- /dev/null
+<html>
+
+<head>
+<title>A Lisp Based HTML Parser</title>
+<meta name="GENERATOR" content="Microsoft FrontPage 3.0">
+</head>
+
+<body>
+
+<p><big><strong><big>A Lisp Based HTML Parser</big></strong></big></p>
+
+<p><a href="#intro">Introduction/Simple Example</a><br>
+<a href="#lhtml">LHTML parse output format</a><br>
+<a href="#case">Case mode notes</a><br>
+<a href="#comment">Parsing HTML comments</a><br>
+<a href="#script">Parsing <SCRIPT> and <STYLE> tags</a><br>
+<a href="#sgml">Parsing SGML <! tags</a><br>
+<a href="#illegal">Parsing Illegal and Deprecated Tags</a><br>
+<a href="#default">Default Attribute Values</a><br>
+<a href="#char">Parsing Interleaved Character Formatting Tags</a><br>
+<a href="#reference">parse-html reference</a><br>
+ <a href="#methods">methods</a><br>
+ <a href="#internal">phtml-internal</a></p>
+
+<p><a name="intro"></a>The <strong>parse-html</strong> generic function processes HTML
+input, returning a list of HTML tags, attributes, and text. Here is a simple example:<br>
+<br>
+(parse-html "<HTML><br>
+
+<HEAD><br>
+
+<TITLE>Example HTML input</TITLE><br>
+
+<BODY><br>
+
+<P>Here is some text with a <B>bold</B> word<br>and a <A
+HREF=\"help.html\">link</P><br>
+
+</HTML>")</p>
+
+<p>generates:<br>
+<br>
+((:html (:head (:title "Example HTML input"))<br>
+ (:body (:p "Here is some text with a " (:b "bold") "
+word" :br "and a " <br>
+
+((:a :href "help.html") "link")))))<br>
+</p>
+
+<p>The output format is known as LHTML format; it is the same format that the<br>
+aserve htmlgen macro accepts. <br>
+<br>
+<a name="lhtml"></a><strong><big>LHTML format</big></strong><br>
+<br>
+LHTML is a list representation of HTML tags and content.<br>
+<br>
+Each list member may be:
+
+<ol>
+ <li>a string containing text content, such as "Here is some text with a "<br>
+ </li>
+ <li>a keyword package symbol representing a HTML tag with no associated attributes <br>
+ or content, such as :br.<br>
+ </li>
+ <li>a list representing an HTML tag with associated attributes and/or content,<br>
+ such as (:b "bold") or ((:a :href "help.html") "link"). If
+ the HTML tag<br>
+ does not have associated attributes, then the first list member will be a<br>
+ keyword package symbol representing the HTML tag, and the other elements will <br>
+ represent the content, which can be a string (text content), a keyword package symbol
+ (HTML<br>
+ tag with no attributes or content), or list (nested HTML tag with<br>
+ associated attributes and/or content). If there are associated attributes,<br>
+ then the first list member will be a list containing a keyword package symbol<br>
+ followed by two list members for each associated attribute; the first member is a keyword<br>
+ package symbol representing the attribute, and the next member is a string corresponding<br>
+ to the attribute value.<br>
+ </li>
+</ol>
+
+<p><a name="case"></a><strong>Case Mode and LHTML</strong></p>
+
+<p>If excl:*current-case-mode* is :CASE-INSENSITIVE-UPPER, keyword package symbols will be<br>
+in upper case; otherwise, they will be in lower case.</p>
+
+<p><a name="comment"></a><strong>HTML Comments</strong></p>
+
+<p>HTML comments are represented use a :comment symbol. For example,<br>
+<br>
+(parse-html "<!-- this is a comment-->")<br>
+<br>
+--> ((:comment " this is a comment"))</p>
+
+<p><a name="script"></a><strong>HTML <SCRIPT> and <STYLE> tags</strong></p>
+
+<p>All <SCRIPT> and <STYLE> content is not parsed; it is returned as text
+content.<br>
+<br>
+For example,<br>
+<br>
+(parse-html "<SCRIPT>this <B>will not</B> be
+parsed</SCRIPT>")<br>
+<br>
+--> ((:script "this <B>will not</B> be parsed"))</p>
+
+<p><a name="sgml"></a><strong>XML and SGML <! tags</strong></p>
+
+<p>Since, some HTML pages contain special XML/SGML tags, non-comment tags<br>
+starting with '<!' are treated specially:<br>
+<br>
+(parse-html "<!doctype this is some text>")<br>
+<br>
+--> ((:!doctype " this is some text"))</p>
+
+<p><a name="illegal"></a><strong>Illegal and Deprecated HTML</strong></p>
+
+<p>There is plenty of illegal and deprecated HTML on the web that popular browsers<br>
+nonetheless successfully display. The parse-html parser is generous - it will not<br>
+raise an error condition upon encountering most input. In particular, it does not<br>
+maintain a list of legal HTML tags and will successfully parse nonsense input.<br>
+<br>
+For example,<br>
+<br>
+(parse-html "<this> <is> <some> <nonsense>
+<input>")<br>
+<br>
+--> ((:this (:is (:some (:nonsense :input)))))<br>
+<br>
+In some situations, you may prefer a two-pass parse that results in a parse where<br>
+deep nesting related to unrecognized tags is minimized:<br>
+<br>
+(let ((string "<this> <is> <some> <nonsense> </some>
+<input>"))<br>
+ (multiple-value-bind (res rogues)<br>
+ (parse-html string
+:collect-rogue-tags t)<br>
+ (declare (ignorable
+res))<br>
+ (parse-html string
+:no-body-tags rogues)))<br>
+<br>
+--> (:this :is (:some (:nonsense)) :input)<br>
+<br>
+See the <strong>:collect-rogue-tags</strong> and <strong>:no-body-tags</strong> argument
+descriptions in the reference<br>
+section below for more information.</p>
+
+<p><a name="default"></a><strong>Default Attribute values</strong></p>
+
+<p>As per the HTML 4.0 specification, attributes without specified values are given a
+lower case<br>
+string value that matches the attribute name.<br>
+<br>
+For example,<br>
+<br>
+(parse-html "<P here ARE some attributes>")<br>
+<br>
+--> (((:p :here "here" :are "are" :some "some"
+:attributes "attributes")))</p>
+
+<p><a name="char"></a><strong>Interleaved Character Formatting Tags</strong></p>
+
+<p>Existing HTML pages often have character format tags that are interleaved among<br>
+other tags. Such interleaving is removed in a manner consistent with the HTML 4.0<br>
+specification.<br>
+<br>
+For example,<br>
+<br>
+(parse-html "<P>Here is <B>bold text<P>that spans</B>two
+paragraphs")<br>
+<br>
+--> ((:p "Here is " (:b "bold text")) (:p (:b "that
+spans") "two paragraphs"))</p>
+
+<hr>
+
+<p><a name="reference"></a><strong><big>parse-html Reference</big></strong><br>
+<br>
+parse-html [Generic function]<br>
+<br>
+Arguments: input-source &key callbacks callback-only<br>
+ collect-rogue-tags
+no-body-tags parse-entities<br>
+<br>
+Returns LHTML output, as described above.<br>
+<br>
+The callbacks argument, if non-nil, should be an association list. Each list member's<br>
+car (first) element specifies a keyword package symbol, and each list member's cdr (rest)<br>
+element specifies a function object or a symbol naming a function. The function should<br>
+expect one argument. The function will be invoked once for each time the HTML tag<br>
+corresponding to the specified keyword package symbol is encountered in the HTML input;
+the<br>
+argument will be an LHTML list containing the tag, along with associated attributes and<br>
+content. The default callbacks argument value is nil.<br>
+<br>
+The callback-only argument, if non-nil, directs parse-html to not generate a complete
+LHTML<br>
+output. Instead, LHTML lists will only be generated when necessary as arguments for
+functions<br>
+specified in the callbacks association list. This results in faster parser execution. The
+default<br>
+callback-only argument value is nil.<br>
+<br>
+The collect-rogue-tags argument, if non-nil, directs parse-html to return an additional
+value, <br>
+a list containing any unrecognized tags closed by the end of input.<br>
+<br>
+The no-body-tags argument, if non-nil, should be a list containing unknown tags that, if<br>
+encountered, will be treated as a tag with no body or content, and thus, no associated end<br>
+tag. Typically, the argument is a list or modified list resulting from an earlier
+parse-html<br>
+execution with the :collect-rogue-tags argument specified as non-nil.</p>
+
+<p>If the parse-entities argument is true then entities are converted to the character
+they name. Thus for example the &lt; entity is converted to the less than sign.<br>
+<br>
+<a name="methods"></a><strong>parse-html Methods</strong><br>
+<br>
+parse-html (p stream) &key callbacks callback-only<br>
+ collect-rogue-tags
+no-body-tags parse-entities<br>
+<br>
+parse-html (str string) &key callbacks callback-only<br>
+ collect-rogue-tags
+no-body-tags parse-entities<br>
+<br>
+parse-html (file t) &key callbacks callback-only<br>
+ collect-rogue-tags
+no-body-tags parse-entities<br>
+<br>
+The t method assumes the argument is a pathname suitable<br>
+for use with the with-open-file macro.<br>
+<br>
+<br>
+<a name="internal"></a><strong>phtml-internal [Function]</strong><br>
+<br>
+Arguments: stream read-sequence-func callback-only callbacks<br>
+collect-rogue-tags no-body-tags parse-entities<br>
+<br>
+This function may be used when more control is needed for supplying<br>
+the HTML input. The read-sequence-func argument, if non-nil, should be a function<br>
+object or a symbol naming a function. When phtml-internal requires another buffer<br>
+of HTML input, it will invoke the read-sequence-func function with two arguments -<br>
+the first argument is an internal buffer character array and the second argument is<br>
+the phtml-internal stream argument. If read-sequence-fun is nil, phtml-internal<br>
+will invoke read-sequence to fill the buffer. The read-sequence-func function must<br>
+return the number of character array elements successfully stored in the buffer.<br>
+<br>
+<br>
+<br>
+<br>
+<br>
+<br>
+<br>
+</p>
+</body>
+</html>
--- /dev/null
+<html>
+
+<head>
+<title>A Lisp Based XML Parser</title>
+<meta name="GENERATOR" content="Microsoft FrontPage 3.0">
+</head>
+
+<body>
+
+<p><strong><big><big>A Lisp Based XML Parser</big></big></strong></p>
+
+<p><a href="#intro">Introduction/Simple Example</a><br>
+<a href="#lxml">LXML parse output format</a><br>
+<a href="#props">parse-xml non-validating parser properties</a><br>
+<a href="#modern">case and international character support issues</a><br>
+<a href="#keyword">parse-xml and packages</a><br>
+<a href="#namespace">parse-xml, the XML Namespace specification, and packages</a><br>
+<a href="#unicode-scalar">ACL does not support Unicode 4 byte scalar values</a><br>
+<a href="#big-endian">only little-endian Unicode tested in ACL 6.0 beta</a><br>
+<a href="#debug">debugging aids</a><br>
+<a href="#conformance">XML Conformance test results</a><br>
+<a href="#build">Compiling and Loading the parser</a><br>
+<a href="#reference">parse-xml reference</a></p>
+
+<p><a name="intro"></a>The <strong>parse-xml </strong>generic function processes XML
+input, returning a list of XML tags,<br>
+attributes, and text. Here is a simple example:<br>
+<br>
+(parse-xml "<item1><item2 att1='one'/>this is some
+text</item1>")<br>
+<br>
+--><br>
+<br>
+((item1 ((item2 att1 "one")) "this is some text"))<br>
+<br>
+The output format is known as LXML format.<br>
+<br>
+<a name="lxml"></a><strong>LXML Format</strong><br>
+<br>
+LXML is a list representation of XML tags and content.<br>
+<br>
+Each list member may be:<br>
+<br>
+a. a string containing text content, such as "Here is some text with a "<br>
+<br>
+b. a list representing a XML tag with associated attributes and/or content,
+such as ('item1 "text") or (('item1 :att1 "help.html")
+"link"). If the XML tag
+does not have associated attributes, then the first list member will be a
+symbol representing the XML tag, and the other elements will
+represent the content, which can be a string (text content), a symbol (XML
+tag with no attributes or content), or list (nested XML tag with
+associated attributes and/or content). If there are associated attributes,
+then the first list member will be a list containing a symbol
+followed by two list members for each associated attribute; the first member is a
+symbol representing the attribute, and the next member is a string corresponding
+to the attribute value.<br>
+<br>
+c. XML comments and or processing instructions - see the more detailed example below for
+further information.</p>
+
+<p><a name="props"></a><strong>Non Validating Parser Properties</strong></p>
+
+<p>Parse-xml is a non-validating XML parser. It will detect non-well-formed XML input.
+When<br>
+processing valid XML input, parse-xml will optionally produce the same output as a
+validating <br>
+parser would, including the processing of an external DTD subset and external entity
+declarations.<br>
+<br>
+By default, parse-xml outputs a DTD parse along with the parsed XML contents. The DTD
+parse may<br>
+be optionally suppressed. The following example shows DTD parsed output components:</p>
+
+<p>(defvar *xml-example-external-url*<br>
+ "<!ENTITY ext1 'this is some external entity %param1;'>")<br>
+<br>
+(defun example-callback (var-name token &optional public)<br>
+ (declare (ignorable token public))<br>
+ (setf var-name (uri-path var-name))<br>
+ (if* (equal var-name "null") then nil<br>
+ else<br>
+ (let ((string (eval (intern var-name (find-package
+:user)))))<br>
+ (make-string-input-stream string))))<br>
+<br>
+(defvar *xml-example-string*<br>
+"<?xml version='1.0' encoding='utf-8'?><br>
+<!-- the following XML input is well-formed but its validity has not been checked ...
+--><br>
+<?piexample this is an example processing instruction tag ?><br>
+<!DOCTYPE example SYSTEM '*xml-example-external-url*' [<br>
+ <!ELEMENT item1 (item2* | (item3+ , item4))><br>
+ <!ELEMENT item2 ANY><br>
+ <!ELEMENT item3 (#PCDATA)><br>
+ <!ELEMENT item4 (#PCDATA)><br>
+ <!ATTLIST item1<br>
+ att1 CDATA #FIXED 'att1-default'<br>
+ att2 ID #REQUIRED<br>
+ att3 ( one | two | three ) 'one'<br>
+ att4 NOTATION ( four | five ) 'four' ><br>
+ <!ENTITY % param1 'text'><br>
+ <!ENTITY nentity SYSTEM 'null' NDATA somedata><br>
+ <!NOTATION notation SYSTEM 'notation-processor'><br>
+ ]><br>
+<item1 att2='1'><item3>&ext1;</item3></item1>")<br>
+<br>
+(pprint (parse-xml *xml-example-string* :external-callback 'example-callback))<br>
+<br>
+--><br>
+<br>
+((:xml :version "1.0" :encoding "utf-8")<br>
+ (:comment " the following XML input is well-formed but may or may not be valid
+")<br>
+ (:pi :piexample "this is an example processing instruction tag ")<br>
+ (:DOCTYPE :example<br>
+ (:[ (:ELEMENT :item1 (:choice (:* :item2) (:seq (:+ :item3) :item4))) <br>
+ (:ELEMENT :item2 :ANY)<br>
+ (:ELEMENT :item3 :PCDATA) (:ELEMENT :item4
+:PCDATA)<br>
+ (:ATTLIST item1 (att1 :CDATA :FIXED
+"att1-default") (att2 :ID :REQUIRED)<br>
+ (att3
+(:enumeration :one :two :three) "one") <br>
+ (att4 (:NOTATION
+:four :five) "four"))<br>
+ (:ENTITY :param1 :param "text") <br>
+ (:ENTITY :nentity :SYSTEM "null"
+:NDATA :somedata)<br>
+ (:NOTATION :notation :SYSTEM
+"notation-processor"))<br>
+ (:external (:ENTITY :ext1 "this is some external entity
+text")))<br>
+ ((item1 att1 "att1-default" att2 "1" att3 "one"
+att4 "four") <br>
+ (item3 "this is some external entity
+text")))<br>
+<br>
+<br>
+<strong><big>Usage Notes</big></strong><br>
+<br>
+<ol>
+<li><a name="modern"></a>The parse-xml function has been primarily compiled and tested in a
+modern ACL. However, in an ANSI Lisp with wide character support, it DOES pass the valid
+component of the conformance suite in the same manner as it does in a Modern Lisp. The
+parser's successful operation in all potential situations depends on wide character support.
+<br><br>
+</li>
+<li><a name="keyword"></a>The parser uses the keyword package for DTD tokens and other
+special XML tokens. Since element and attribute token symbols are usually interned
+in the current package, it is not recommended to execute parse-xml
+when the current package is the keyword package.
+<br><br>
+</li>
+<li><a name="namespace"></a>The XML parser supports the XML Namespaces specification. The
+parser recognizes a "xmlns" attribute and attribute names starting with
+"xmlns:".
+As per the specification, the parser expects that the associated value
+is an URI string. The parser then associates XML Namespace prefixes with a
+Lisp package provided via the parse-xml :uri-to-package option or, if
+necessary, a package created on the fly. The following example demonstrates
+this behavior:<br>
+
+<p>(setf *xml-example-string4*<br>
+ "<bibliography<br>
+ xmlns:bib='http://www.bibliography.org/XML/bib.ns'<br>
+ xmlns='urn:com:books-r-us'><br>
+ <bib:book owner='Smith'><br>
+ <bib:title>A Tale of Two Cities</bib:title><br>
+ <bib:bibliography<br>
+ xmlns:bib='http://www.franz.com/XML/bib.ns'<br>
+ xmlns='urn:com:books-r-us'><br>
+ <bib:library branch='Main'>UK
+Library</bib:library><br>
+ <bib:date calendar='Julian'>1999</bib:date><br>
+ </bib:bibliography><br>
+ <bib:date calendar='Julian'>1999</bib:date><br>
+ </bib:book><br>
+</bibliography>")<br>
+<br>
+(setf *uri-to-package* nil)<br>
+(setf *uri-to-package*<br>
+ (acons (parse-uri <a href="http://www.bibliography.org/XML/bib.ns">"http://www.bibliography.org/XML/bib.ns"</a>)<br>
+ (make-package "bib") *uri-to-package*))<br>
+(setf *uri-to-package*<br>
+ (acons (parse-uri <a href="http://www.bibliography.org/XML/bib.ns">"</a>urn:com:books-r-us<a
+href="http://www.bibliography.org/XML/bib.ns">"</a>)<br>
+ (make-package "royal") *uri-to-package*))<br>
+(setf *uri-to-package*<br>
+ (acons (parse-uri <a href="http://www.bibliography.org/XML/bib.ns">"</a>http://www.franz.com/XML/bib.ns<a
+href="http://www.bibliography.org/XML/bib.ns">"</a>)<br>
+ (make-package "franz-ns") *uri-to-package*))<br>
+(pprint (multiple-value-list<br>
+ (parse-xml
+*xml-example-string4*<br>
+ :uri-to-package
+*uri-to-package*)))<br>
+<br>
+--><br>
+((((bibliography |xmlns:bib| <a href="http://www.bibliography.org/XML/bib.ns">"http://www.bibliography.org/XML/bib.ns"</a><br>
+ xmlns "urn:com:books-r-us")<br>
+ "<br>
+ "<br>
+ ((bib::book royal::owner "Smith") "<br>
+ " (bib::title "A Tale of Two
+Cities") "<br>
+ "<br>
+ ((bib::bibliography royal::|xmlns:bib|<br>
+ "http://www.franz.com/XML/bib.ns" royal::xmlns<br>
+ "urn:com:books-r-us")<br>
+ "<br>
+ " ((franz-ns::library royal::branch
+"Main") "UK Library") "<br>
+ " ((franz-ns::date royal::calendar
+"Julian") "1999") "<br>
+ ")<br>
+ "<br>
+ " ((bib::date royal::calendar
+"Julian") "1999") "<br>
+ ")<br>
+ "<br>
+ "))<br>
+((#<uri http://www.franz.com/XML/bib.ns> . #<The franz-ns package>)<br>
+ (#<uri urn:com:books-r-us> . #<The royal package>)<br>
+ (#<uri http://www.bibliography.org/XML/bib.ns> . #<The bib package>)))<br>
+<br>
+</li>
+<li>In the absence of XML Namespace attributes, element and attribute symbols are interned
+in the current package. Note that this implies that attributes and elements referenced
+in DTD content will be interned in the current package.
+</li>
+<li>The parse-xml function has been tested using the OASIS conformance test suite (see
+details below). The test suite has wide coverage across possible XML and DTD syntax,
+but there may be some syntax paths that have not yet been tested or completely
+supported. Here is a list of currently known syntax parsing issues:
+<ul>
+<li><a name="unicode-scalar"></a>ACL does not support 4 byte Unicode scalar values, so
+input containing such data
+will not be processed correctly. (Note, however, that parse-xml does correctly detect
+and process wide Unicode input.)
+</li>
+<li><a name="big-endian"></a>The OASIS tests that contain wide Unicode all use a
+little-endian encoded Unicode.
+Changes to the unicode-check function are required to also support big-endian encoded
+Unicode. (Note also that this issue may be resolved by an ACL 6.0 final release change.)
+</li>
+<li>An initial <?xml declaration in external entity files is skipped without a check
+being made to see if the <?xml declaration is itself incorrect.
+</li>
+</ul>
+</li>
+<li><a name="debug"></a>When investigating possible parser errors or examining more closely
+where the parser
+determined that the input was non-well-formed, the net.xml.parser internal symbols
+*debug-xml* and *debug-dtd* are useful. When not bound to nil, these variables cause
+lexical analysis and intermediate parsing results to be output to *standard-output*.
+</li>
+<li><a name="loading"></a>It is necessary to load the <b>pxml</b> module before using it.
+Typically this can be done by evaluating <b>(require :pxml)</b>.
+</li>
+</ol>
+<a name="conformance"></a><strong>XML Conformance Test Suite</strong><br>
+<br>
+Using the OASIS test suite <a href="http://www.oasis-open.org">(http://www.oasis-open.org)</a>,
+here are the current parse-xml results:<br>
+<br>
+xmltest/invalid: Not tested, since parse-xml is a non-validating parser<br>
+<br>
+not-wf/<br>
+<br>
+ ext.sa: 3 tests; all pass<br>
+ not-sa: 8 tests; all pass<br>
+ sa: 186 tests; the following fail:<br>
+<br>
+ 170.xml: fails because ACL does not support 4
+byte Unicode scalar values<br>
+<br>
+valid/<br>
+<br>
+ ext-sa: 14 tests; all pass<br>
+ not-sa: 31 tests; all pass<br>
+ sa: 119 tests: the following fail:<br>
+<br>
+ 052.xml, 064.xml, 089.xml: fails because ACL
+does not support 4 byte <br>
+
+Unicode scalar values<br>
+<br>
+<a name="build"></a><big><strong>Compiling and Loading</strong></big><br>
+<br>
+Load build.cl into a modern ACL session will result in a pxml.fasl file that can
+subsequently be<br>
+loaded in a modern ACL to provide XML parsing functionality.<br>
+<br>
+-------------------------------------------------------------------------------------------<br>
+<br>
+<a name="reference"></a><big><strong>parse-xml reference</strong></big><br>
+<br>
+parse-xml [Generic
+function]<br>
+<br>
+Arguments: input-source &key external-callback content-only <br>
+ general-entities
+parameter-entities<br>
+ uri-to-package<br>
+<br>
+Returns multiple values:<br>
+<ol>
+<li>LXML and parsed DTD output, as described above.</li>
+<li>An association list containing the uri-to-package argument conses (if any)
+and conses associated with any XML Namespace packages created during the
+parse (see uri-to-package argument description, below).</li>
+</ol>
+The external-callback argument, if specified, is a function object or symbol
+that parse-xml will execute when encountering an external DTD subset
+or external entity DTD declaration. Here is an example which shows that
+arguments the function should expect, and the value it should return:
+<br><pre>
+(defun file-callback (uri-object token &optional public)
+ ;; The uri-object is an ACL URI object created from
+ ;; the XML input. In this example, this function
+ ;; assumes that all uri's will be file specifications.
+ ;;
+ ;; The token argument identifies what token is associated
+ ;; with the external parse (for example :DOCTYPE for external
+ ;; DTD subset
+ ;;
+ ;; The public argument contains the associated PUBLIC string,
+ ;; when present
+ ;;
+ (declare (ignorable token public))
+ ;; An open stream is returned on success,
+ ;; a nil return value indicates that the external
+ ;; parse should not occur.
+ ;; Note that parse-xml will close the open stream before exiting.
+ (ignore-errors (open (uri-path uri-object))))
+</pre>
+<p>
+The general-entities argument is an association list containing general entity symbol
+and replacement text pairs. The entity symbols should be in the keyword package.
+Note that this option may be useful in generating desirable parse results in
+situations where you do not wish to parse external entities or the external DTD subset.
+<p>
+The parameter-entities argument is an association list containing parameter entity symbol
+and replacement text pairs. The entity symbols should be in the keyword package.
+Note that this option may be useful in generating desirable parse results in
+situations where you do not wish to parse external entities or the external DTD subset.
+<p>
+The uri-to-package argument is an association list containing uri objects and package
+objects. Typically, the uri objects correspond to XML Namespace attribute values, and
+the package objects correspond to the desired package for interning symbols associated
+with the uri namespace. If the parser encounters an uri object not contained in this list,
+it will generate a new package. The first generated package will be named
+net.xml.namespace.0,
+the second will be named net.xml.namespace.1, and so on.
+<h3>parse-xml methods</h3>
+<pre>
+(parse-xml (p stream) &key
+ external-callback content-only
+ general-entities
+ parameter-entities
+ uri-to-package)
+
+(parse-xml (str string) &key
+ external-callback content-only
+ general-entities
+ parameter-entities
+ uri-to-package)
+</pre>
+An easy way to parse a file containing XML input:
+<pre>
+(with-open-file (p "example.xml")
+ (parse-xml p :content-only p))
+</pre>
+<h3>net.xml.parser unexported special variables:</h3>
+<p>
+*debug-xml*<br>
+<br>
+When true, parse-xml generates XML lexical state and intermediary
+parse result debugging output.
+<p>
+*debug-dtd*<br>
+<br>
+When true, parse-xml generates DTD lexical state and intermediary
+parse result debugging output.
+</body>
+</html>
;; version) or write to the Free Software Foundation, Inc., 59 Temple Place,
;; Suite 330, Boston, MA 02111-1307 USA
;;
-;; $Id: pxml0.cl,v 1.3 2002/10/16 03:45:52 kevin Exp $
-
;; pxml.cl - parse xml
;;
;; Change Log
(in-package :net.xml.parser)
-
-#-allegro
-(eval-when (:compile-toplevel :load-toplevel :execute)
- (defvar excl::*dribble-bug-hooks* nil)
-#+ignore (export '*dribble-bug-hooks* 'excl))
-
(unless (fboundp 'pxml-dribble-bug-hook)
(let ((pxml-version-strings nil))
(defun pxml-dribble-bug-hook (stream-or-string)
do (write-string string stream-or-string)
(terpri stream-or-string))))
- (push 'pxml-dribble-bug-hook excl::*dribble-bug-hooks*)))
+ (push 'pxml-dribble-bug-hook excl:*dribble-bug-hooks*)))
-(funcall 'pxml-dribble-bug-hook "$Id: pxml0.cl,v 1.3 2002/10/16 03:45:52 kevin Exp $")
+(funcall 'pxml-dribble-bug-hook "$Id: pxml0.cl,v 1.4 2003/06/20 02:21:23 kevin Exp $")
(defun xml-char-p (char)
(declare (optimize (speed 3) (safety 1)))
;; version) or write to the Free Software Foundation, Inc., 59 Temple Place,
;; Suite 330, Boston, MA 02111-1307 USA
;;
-;; $Id: pxml1.cl,v 1.2 2002/10/16 03:45:52 kevin Exp $
-
-;; Change Log
+;; Change Log
;;
;; 10/14/00 add namespace support; xml-error fix
(in-package :net.xml.parser)
-(pxml-dribble-bug-hook "$Id: pxml1.cl,v 1.2 2002/10/16 03:45:52 kevin Exp $")
+(pxml-dribble-bug-hook "$Id: pxml1.cl,v 1.3 2003/06/20 02:21:23 kevin Exp $")
(defparameter *collectors* (list nil nil nil nil nil nil nil nil))
(declare (ignorable tokenbuf) (optimize (speed 3) (safety 1)))
;; need no-OO check because external format support isn't completely done yet
(when (not (typep p 'string-input-simple-stream))
- #+(and allegro (version>= 6 0 pre-final 1))
+ #+(version>= 6 0 pre-final 1)
(let ((format (ignore-errors (excl:sniff-for-unicode p))))
(if* (eq format (find-external-format :unicode))
then
- #+allegro (setf (stream-external-format p) format)
+ (setf (stream-external-format p) format)
else
- #+allegro (setf (stream-external-format p) (find-external-format :utf8))))
- #-(and allegro (version>= 6 0 pre-final 1))
+ (setf (stream-external-format p) (find-external-format :utf8))))
+ #-(version>= 6 0 pre-final 1)
(let* ((c (read-char p nil)) c2
(c-code (if c (char-code c) nil)))
(if* (eq #xFF c-code) then
(setf c-code (if c (char-code c2) nil))
(if* (eq #xFE c-code) then
(format t "set unicode~%")
- #+allegro (setf (stream-external-format p)
- (find-external-format #+(and allegro (version>= 6 0 pre-final 1)) :unicode
- #-(and allegro (version>= 6 0 pre-final 1)) :fat-little))
+ (setf (stream-external-format p)
+ (find-external-format #+(version>= 6 0 pre-final 1) :unicode
+ #-(version>= 6 0 pre-final 1) :fat-little))
else
(xml-error "stream has incomplete Unicode marker"))
- else #+allegro (setf (stream-external-format p)
+ else (setf (stream-external-format p)
(find-external-format :utf8))
(when c
(push c (iostruct-unget-char tokenbuf))
(xml-error "XML declaration tag does not include correct 'encoding' or 'standalone' attribute"))
(when (and (fourth val) (string= "standalone" (symbol-name (fourth val))))
(if* (equal (fifth val) "yes") then
- (setf (iostruct-standalonep tokenbuf) t)
+ (setf (iostruct-standalonep tokenbuf) t)
elseif (not (equal (fifth val) "no")) then
(xml-error "XML declaration tag does not include correct 'standalone' attribute value")))
(dotimes (i (length (third val)))
(not (member c '(#\. #\_ #\- #\:)))
)
(xml-error "XML declaration tag does not include correct 'version' attribute value"))))
- (when (and (fourth val) (eql :encoding (fourth val)))
- (dotimes (i (length (fifth val)))
- (let ((c (schar (fifth val) i)))
- (when (and (not (alpha-char-p c))
- (if* (> i 0) then
- (and (not (digit-char-p c))
- (not (member c '(#\. #\_ #\-))))
- else t))
- (xml-error "XML declaration tag does not include correct 'encoding' attribute value")))))
- )
+ (if* (and (fourth val) (eql :encoding (fourth val)))
+ then (dotimes (i (length (fifth val)))
+ (let ((c (schar (fifth val) i)))
+ (when (and (not (alpha-char-p c))
+ (if* (> i 0) then
+ (and (not (digit-char-p c))
+ (not (member c '(#\. #\_ #\-))))
+ else t))
+ (xml-error "XML declaration tag does not include correct 'encoding' attribute value"))))
+ ;; jkf 3/26/02
+ ;; if we have a stream we're reading from set its external-format
+ ;; to the encoding
+ ;; note - tokenbuf is really an iostruct, not a tokenbuf
+ (if* (tokenbuf-stream (iostruct-tokenbuf tokenbuf))
+ then (setf (stream-external-format
+ (tokenbuf-stream (iostruct-tokenbuf tokenbuf)))
+ (find-external-format (fifth val))))
+
+
+ ))
(defun xml-error (text)
(declare (optimize (speed 3) (safety 1)))
;; version) or write to the Free Software Foundation, Inc., 59 Temple Place,
;; Suite 330, Boston, MA 02111-1307 USA
;;
-;; $Id: pxml2.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $
-
-;; Change Log
+;; Change Log
;;
;; 10/14/00 add namespace support
(in-package :net.xml.parser)
-(pxml-dribble-bug-hook "$Id: pxml2.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $")
+(pxml-dribble-bug-hook "$Id: pxml2.cl,v 1.2 2003/06/20 02:21:23 kevin Exp $")
;; state titles can be better chosen and explained
;; version) or write to the Free Software Foundation, Inc., 59 Temple Place,
;; Suite 330, Boston, MA 02111-1307 USA
;;
-;; $Id: pxml3.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $
(in-package :net.xml.parser)
-(pxml-dribble-bug-hook "$Id: pxml3.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $")
+(pxml-dribble-bug-hook "$Id: pxml3.cl,v 1.2 2003/06/20 02:21:23 kevin Exp $")
(defvar *debug-dtd* nil)
--- /dev/null
+;;;; -*- Mode: LISP; Syntax: ANSI-Common-Lisp; Base: 10 -*-
+;;;; *************************************************************************
+;;;; FILE IDENTIFICATION
+;;;;
+;;;; Name: xmlutils.asd
+;;;; Purpose: ASDF definition file for Xmlutils
+;;;; Programmer: Kevin M. Rosenberg
+;;;; Date Started: Sep 2002
+;;;;
+;;;; $Id: xmlutils.asd,v 1.1 2003/06/20 02:21:23 kevin Exp $
+;;;;
+;;;; This file, part of cl-xmlutils, is Copyright (c) 2002 by Kevin M. Rosenberg
+;;;;
+;;;; cl-xmlutils users are granted the rights to distribute and use this software
+;;;; as governed by the terms of the GNU Lesser General Public License
+;;;; (http://www.gnu.org/licenses/lgpl.html)
+;;;; *************************************************************************
+
+(in-package #:cl-user)
+(defpackage #:xmlutils-system (:use #:asdf #:cl))
+(in-package #:xmlutils-system)
+
+#-allegro (require :acl-compat)
+
+#+lispworks
+(eval-when (:compile-toplevel :load-toplevel :execute)
+ (defvar system::*stack-overflow-behavior* :warn)
+ (setq system::*stack-overflow-behavior* :warn))
+
+(defsystem xmlutils
+ :name "cl-xmlutils"
+ :author "Franz, Inc"
+ :maintainer "Kevin M. Rosenberg <kmr@debian.org>"
+ :licence "GNU Lesser General Public License"
+ :description "Franz's Test Harness Package"
+ :long-description "Xmlutils provides a library for parsing HTML and XML documents."
+
+ :components
+ ((:file "phtml")
+ (:file "pxml0")
+ (:file "pxml1" :depends-on ("pxml0"))
+ (:file "pxml2" :depends-on ("pxml1"))
+ (:file "pxml3" :depends-on ("pxml2"))
+ ))
+
+(defmethod source-file-type ((c cl-source-file) (s (eql (find-system 'xmlutils))))
+ "cl")
+