From b5da6339c28ee272d0a32eb5c26a9f7446e71d9f Mon Sep 17 00:00:00 2001 From: "Kevin M. Rosenberg" Date: Tue, 15 Oct 2002 12:23:03 +0000 Subject: [PATCH] r3027: *** empty log message *** --- ChangeLog | 282 ++++++ build.cl | 28 + phtml-test.cl | 406 ++++++++ phtml.cl | 1079 +++++++++++++++++++++ phtml.htm | 254 +++++ phtml.html | 254 +++++ phtml.txt | 191 ++++ pxml-test.cl | 161 ++++ pxml.htm | 387 ++++++++ pxml.html | 387 ++++++++ pxml.txt | 345 +++++++ pxml0.cl | 241 +++++ pxml1.cl | 437 +++++++++ pxml2.cl | 2093 +++++++++++++++++++++++++++++++++++++++++ pxml3.cl | 2510 +++++++++++++++++++++++++++++++++++++++++++++++++ 15 files changed, 9055 insertions(+) create mode 100644 ChangeLog create mode 100644 build.cl create mode 100644 phtml-test.cl create mode 100644 phtml.cl create mode 100644 phtml.htm create mode 100644 phtml.html create mode 100644 phtml.txt create mode 100644 pxml-test.cl create mode 100644 pxml.htm create mode 100644 pxml.html create mode 100644 pxml.txt create mode 100644 pxml0.cl create mode 100644 pxml1.cl create mode 100644 pxml2.cl create mode 100644 pxml3.cl diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..2c39fab --- /dev/null +++ b/ChangeLog @@ -0,0 +1,282 @@ +******************************************************************************* +merge from trunk to acl6 branch (for 6.1.beta) +command: ../../join.sh trunk trunk_to_acl6_merge2 trunk_to_acl6_merge3 xmlutils +******************************************************************************* + +******************************************************************************* +merge from trunk to acl6 branch +command: ../../join.sh trunk trunk_to_acl6_merge1 trunk_to_acl6_merge2 xmlutils +******************************************************************************* + +******************************************************************************* +merge from trunk to acl6 branch +command: ../../join.sh trunk acl6 trunk_to_acl6_merge1 xmlutils +******************************************************************************* + +2001-06-08 Steve Haflich + + * pxml.htm: Added mention that it is necessary to load or require + the module. Cleaned up a little html formatting. + +2001-05-30 John Foderaro + + * phtml.cl - add :,_,- and . to valid attribute name characters. + +2001-03-23 Steve Haflich + + * pxml3.cl: state-dtd-!-element-type-paren-choice-name2 was + missing code to handle decls such as + + which was presumably overlooked in the merge of + *-choice and *-seq into a single set of parser states + It is quite clear that the dtd parsr does not return correct + tree structure in all cases, but this fix may allow it at + least to accept legal dtds without signalling error. + Also made minor *debug-dtd* enhancements. + * pxml[0,1,2,3]: Added dribble-bug version number tracking. + +2001-02-05 Steve Jacobson + + phtml.cl: symbols mapped to preferred case at runtime (as opposed to + a compile time macro determining the case mapping) + +2000-12-20 Steve Jacobson + + pxml-test.cl: NameSpace example change to reflect URI module fix + pxml.htm: NameSpace example change + pxml2.cl: bug10165 fix + pxml3.cl: bug10165 fix + +2000-12-05 Steve Jacobson + + phtml.cl: add user visible change comment; fix comment spelling error + pxml.htm: change ANSI notes to reflect code changes + pxml1.cl: use symbol-name where needed to support ANSI case usage + pxml2.cl: use symbol-name where needed to support ANSI case usage + pxml3.cl: add two declarations to prevent forward referencing compile warnings + +2000-10-27 Steve Jacobson + + phtml.cl: check callbacks arg when tag has no body + REMEMBER TO ADD TEST TO PHTML-TEST!!! + +2000-10-14 Steve Jacobson + + phtml-test.cl: add test related to raw mode infinite loop + phtml.cl: remove risk of negative buffer index + caused by multiple un-next-char calls in raw mode + fixed it by moving first-pass parse buffer to tokenbuf + removed :script from *in-line* to prevent infinite loop + (it should not have been there, anyway) + fixed :table tag-auto-close-stop typo + don't reopen char format tags within raw mode tags + pxml1.cl: changes required by 6.0 unicode changes + pxml2.cl: show char code in debug output + + +2000-09-05 Steve Jacobson + + New files: + + phtml.htm: HTML version of doc file + pxml.htm: HTML version of doc file + + Changed files: + + pxml-test.cl: Add namespace example; change because of xml-error change + pxml.txt: Namespace changes and other edits + pxml1.cl: namespace support; add format string to xml-error 'error call + pxml2.cl: namespace support + +2000-08-16 Steve Jacobson + + pxml-test.cl: cleanup for distribution + *.cl: add AllegroServe license text + +2000-08-10 Steve Jacobson + + phtml-test.cl: add tests for latest changes + phtml.cl: allow underscore as tag character + fix --> (:! "if ..]") bug ('[' lost) + add collect-rogue-tags & no-body-tags arguments to + support 2 pass parse for really bad pages (e.g. New + York Times page) + special inline character formatting close/reopen strategy + preserves any attributes (smh reported bug) + phtml.txt: new argument and + + * phtml.cl: handle pair with no text between the tags + correctly. Change works for any "raw" mode tag. + + * phtml-test.cl: add test for the above change + +2000-07-17 Kevin Layer + + * *.cl: add rcs id's + +2000-07-17 Steve Jacobson + + * phtml.cl: more robust handling of illegal attribute value HTML: + parse-html "") => ((frame :src "foo.html")) + (skip spaces after '=' to look for attribute value) + + is not a paired tag; it's standalone like + + * phtml-test.cl: add tests for the above two changes + + +2000-06-29 Steve Jacobson + + * phtml: export phtml-internal + + * phtml.txt: phtml-internal now exported; + describe read-sequence-func return value + +2000-06-26 Steve Jacobson + + * phtml.cl: let colon be permissable tag name character; + parse xml type tags in raw mode with no contents; + parse :script & :style correctly when there are attributes + present + + * phtml-test.cl: add tests for above changes + +2000-06-23 Steve Jacobson + + * phtml.cl: add autoclose property to :p tag + + * phtml-test.cl: adjust test to reflect that

tags can't nest + + * phtml.txt: new file: preliminary documentation + +2000-06-20 Steve Jacobson + + * phtml.cl: identify end of input errors + + * phtml-test.cl: add end of input error test + +2000-06-10 Steve Jacobson + + * phtml.cl: made input buffer usage thread-safe, without increasing + consing. + +Fri May 26 22:55:52 PST 2000 Duane Rettig + + * Makefile: set SHELL variable + +2000-05-24 Steve Jacobson + + * phtml-test.cl: added more character format (,,etc.) tests + test changes to reflect new callback API + + * phtml.cl: removed element-callback support; replaced it with + parse-html :callbacks argument + took *entity-mapping* out of source until we decide to + add entity processing + added some more tags to "character formating" group + changed "character formating" tag parsing to both + coerce parse results to HTML 4.0 spec and also + prevent generating syntax equivalent yet unexpected + results + +2000-05-17 Steve Jacobson + + * phtml.cl: output keyword symbols in upper case when phtml.cl is compiled in + :CASE-INSENSITIVE-UPPER lisp. + runtime raw mode ( + this is some title text + this is some body text + with some text + + lmcelroy@performigence.com + +
+ this is some more text + tests parser 'looseness' + +

    +
  • item 1 +
  • item 2
+
+
a term +
its definition +
another term +
another definition
+ + + + + + + + + + + + +
this cell is aligned right + this cell is centered +
this cell is aligned right + this cell is centered +
this cell is aligned right + this cell is centered +
this cell is aligned right + this cell is centered
+ + + Navigate the site: + + + + WWW is an abbreviation + force + whitespace only") + +(setf *expected-result* + '((:html + (:comment "this should be

one

string") + (:head + (:style "this should be

one

string") + (:title "this is some title text")) + (:body + "this is some body text" + ((:a :name "this is an anchor") "with some text") + (:comment "testing allowing looser attribute parsing") + ((:a :href "mailto:lmcelroy@performigence.com") + "lmcelroy@performigence.com") + :br + "this is some more text" + (:bogus "tests parser 'looseness'") + (:select + (:option "1") + (:option "2")) + (:ul + (:li "item 1") + (:li "item 2")) + (:dl + (:dt "a term") + (:dd "its definition") + (:dt "another term") + (:dd "another definition")) + (:table + (:colgroup + ((:col :align "right")) + ((:col :align "center"))) + (:thead + (:tr + (:th "this cell is aligned right") + (:th "this cell is centered"))) + (:tfoot + (:tr + (:th "this cell is aligned right") + (:th "this cell is centered"))) + (:tbody + (:tr + (:td "this cell is aligned right") + (:td "this cell is centered"))) + (:tbody + (:tr + (:td "this cell is aligned right") + (:td "this cell is centered")))) + (:pp + (:object + (:pp "Navigate the site:" + ((:map :name "mainmap") + ((:area :shape "rect" :coords "0,100,100,200")) + ((:area :shape "rect" :coords "100,100,100,200")))))) + (:abbr "WWW") + "is an abbreviation" + (:b "force") + (:pp "whitespace only") + )))) + +(setf *test-string2* + "text more text + + + text more text + text

more text yet more text

+
  • text
  • more text
+ prevbarbaz + foobarbaz + foobarbaz + foobaz + foobarbaz + + + " + ) + +(setf *expected-result2* + '((:i ((:b :id "1") "text")) ((:b :id "1") " more text") + (:!doctype "this is some text") + (:! "[if xxx]") + (:i (:b "text")) (:b) " more text" + (:b "text") (:p (:b "more text") " yet more text") + (:ul (:li (:b "text")) (:li (:b "more text"))) (:b) + "prev" (:b ((:a :href "foo") "bar") "baz") + (:b "foo" (:a "bar") "baz") + (:b "foo") (:a (:b "bar") "baz") + (:b "foo") (:script "bar") (:b (:a "baz")) + (:b "foo" (:i "bar") "baz") + ((:script :a "b") " some text if (year < 1000) year += 1900; more text ") + ((:script :a "b")) + (:frameset ((:frame :foo "foo")) ((:frame :bar "bar"))) + )) + +(setf *test-string3* + " + + + + + +
+ + +E-Mail Updates from NYTimes.com +") + +(setf *expected-result3* + '(((:icmeta :url "nytimes.html")) ((:nyt_header :version "1.0" :type "homepage")) + ((:body :bgcolor "#ffffff" :background "back5.gif" :vlink "4" :link "6") + ((:nyt_banner :version "1.0" :type "homepage")) + ((:table :border "0" :cellspacing "0" :cellpadding "0") + (:tr + ((:td :bgcolor "0" :rowspan "4" :width "126" :align "left" :valign "center") + ((:nyt_ad :version "1.0" :location "") + ((:a :href "ads.gif" :target "top") + ((:img :src "http://ads2.gif" :border "0" :width "120" :height "90" :alt + "E-Mail Updates from NYTimes.com")))))))))) + + +(defmethod lhtml-equal ((a t) (b t)) + (equal a b)) + +(defmethod lhtml-equal ((a list) (b list)) + (let ((i 0) (j 0)) + (loop + (if* (and (= i (length a)) (= j (length b))) then (return t) + elseif (and (< i (length a)) (white-space-p (nth i a))) then + (incf i) + elseif (white-space-p (nth j b)) then + (incf j) + elseif (and (= i (length a)) (/= j (length b))) then + (return + (loop + (when (= j (length b)) (return t)) + (when (not (white-space-p (nth j b))) (return nil)) + (incf j))) + elseif (and (/= i (length a)) (= j (length b))) then + (return + (loop + (when (= i (length a)) (return t)) + (when (not (white-space-p (nth i a))) (return nil)) + (incf i))) + elseif (not (lhtml-equal (nth i a) (nth j b))) then + (return nil) + else + (incf i) + (incf j))))) + +(defmethod lhtml-equal ((a string) (b string)) + (let ((i 0) (j 0)) + ;; skip white space in beginning + (loop + (let ((char (elt a i))) + (when (and (not (eq char #\space)) + (not (eq char #\tab)) + (not (eq char #\return)) + (not (eq char #\linefeed))) + (return))) + (incf i)) + (loop + (let ((char (elt b j))) + (when (and (not (eq char #\space)) + (not (eq char #\tab)) + (not (eq char #\return)) + (not (eq char #\linefeed))) + (return))) + (incf j)) + (loop + (when (and (= i (length a)) (= j (length b))) (return t)) + (when (and (= i (length a)) (/= j (length b))) + (return + (loop + (when (= j (length b)) (return t)) + (let ((char (elt b j))) + (when (and (not (eq char #\space)) + (not (eq char #\tab)) + (not (eq char #\return)) + (not (eq char #\linefeed))) + (return t))) + (incf j)))) + (when (and (/= i (length a)) (= j (length b))) + (return + (loop + (when (= i (length a)) (return t)) + (let ((char (elt a i))) + (when (and (not (eq char #\space)) + (not (eq char #\tab)) + (not (eq char #\return)) + (not (eq char #\linefeed))) + (return t))) + (incf i)))) + (when (not (eq (elt a i) (elt b j))) (return nil)) + (incf i) + (incf j)))) + +(defmethod white-space-p ((a t)) + nil) + +(defmethod white-space-p ((a string)) + (let ((i 0) + (length (length a))) + (loop + (when (= i length) (return t)) + (let ((char (elt a i))) + (when (and (not (eq char #\space)) + (not (eq char #\tab)) + (not (eq char #\return)) + (not (eq char #\linefeed))) + (return nil))) + (incf i)))) + +;;------------------------------------------------ + +(defvar *callback-called* 0) + +(let ((*pass* 0)) + (defun callback-test-func (arg) + ;; incf *callback-called* so we know exactly how many times this is + ;; called + (incf *callback-called*) + (if* (= *pass* 0) + then + (incf *pass*) + (test t (lhtml-equal arg + '((:a :name "this is an anchor") + "with some text"))) + else + (setf *pass* 0) + (test t (lhtml-equal arg + '((:a :href + "mailto:lmcelroy@performigence.com") + "lmcelroy@performigence.com")))))) + +(let ((*pass* 0)) + (defun nested-callback (arg) + ;; incf *callback-called* so we know exactly how many times this is + ;; called + (incf *callback-called*) + (if* (= *pass* 0) + then + (incf *pass*) + (test t (lhtml-equal arg + '(:pp "Navigate the site:" + ((:map :name "mainmap") + ((:area :shape "rect" :coords "0,100,100,200")) + ((:area :shape "rect" :coords "100,100,100,200")))))) + elseif (= *pass* 1) + then + (incf *pass*) + (test t (lhtml-equal arg + '(:pp + (:object + (:pp "Navigate the site:" + ((:map :name "mainmap") + ((:area :shape "rect" :coords "0,100,100,200")) + ((:area :shape "rect" + :coords "100,100,100,200")))))))) + else + (setf *pass* 0) + (test t (lhtml-equal arg + '(:pp "whitespace only")))))) + +(defun testit () + (let ((util.test:*test-errors* 0) + (util.test:*test-successes* 0)) + (test t (lhtml-equal (parse-html *test-string2*) *expected-result2*)) + (setf *callback-called* 0) + (test t (lhtml-equal (parse-html *test-string*) *expected-result*)) + (test 0 *callback-called*) + ;;(setf (element-callback :a) 'callback-test-func) + (setf *callback-called* 0) + (test t (lhtml-equal (parse-html *test-string* + :callbacks (acons :a 'callback-test-func nil)) + *expected-result*)) + (test 2 *callback-called*) + (setf *callback-called* 0) + (test t (lhtml-equal (parse-html *test-string*) *expected-result*)) + (test 0 *callback-called*) + (setf *callback-called* 0) + ;; make sure function is OK arg + ;;(setf (element-callback :a) (symbol-function 'callback-test-func)) + (test t (lhtml-equal + (parse-html *test-string* + :callbacks (acons :a (symbol-function 'callback-test-func) nil)) + *expected-result*)) + (test 2 *callback-called*) + ;; try with :callback-only t + (setf *callback-called* 0) + ;;(setf (element-callback :a) 'callback-test-func) + (parse-html *test-string* :callback-only t + :callbacks (acons :a 'callback-test-func nil)) ;; won't return parse output + (test 2 *callback-called*) + ;; try nested callback + (setf *callback-called* 0) + ;;(setf (element-callback :p) 'nested-callback) + (test t (lhtml-equal (parse-html *test-string* + :callbacks (acons :pp 'nested-callback nil)) + *expected-result*)) + (test 3 *callback-called*) + (setf *callback-called* 0) + (parse-html *test-string* :callback-only t + :callbacks (acons :pp 'nested-callback nil)) + (test 3 *callback-called*) + (test-error (parse-html "b ,var mmax)) + ,@body)) + + (addit (index charistic) + `(setf (svref arr ,index) + (logior (svref arr ,index) + ,charistic))) + ) + + (with-range (i #\A #\Z) + (addit i (+ char-tagcharacter + char-attribnamechar + char-attribundelimattribvalue))) + + (with-range (i #\a #\z) + (addit i (+ char-tagcharacter + char-attribnamechar + char-attribundelimattribvalue))) + + (with-range (i #\0 #\9) + (addit i (+ char-tagcharacter + char-attribnamechar + char-attribundelimattribvalue))) + + ;; let colon be legal tag character + (addit (char-code #\:) (+ char-attribnamechar + char-tagcharacter)) + + ;; NY times special tags have _ + (addit (char-code #\_) (+ char-attribnamechar + char-tagcharacter)) + + ; now the unusual cases + (addit (char-code #\-) (+ char-attribnamechar + char-attribundelimattribvalue)) + (addit (char-code #\.) (+ char-attribnamechar + char-attribundelimattribvalue)) + + ;; adding all typeable chars except for whitespace and > + (addit (char-code #\:) char-attribundelimattribvalue) + (addit (char-code #\@) char-attribundelimattribvalue) + (addit (char-code #\/) char-attribundelimattribvalue) + (addit (char-code #\!) char-attribundelimattribvalue) + (addit (char-code #\#) char-attribundelimattribvalue) + (addit (char-code #\$) char-attribundelimattribvalue) + (addit (char-code #\%) char-attribundelimattribvalue) + (addit (char-code #\^) char-attribundelimattribvalue) + (addit (char-code #\&) char-attribundelimattribvalue) + (addit (char-code #\() char-attribundelimattribvalue) + (addit (char-code #\)) char-attribundelimattribvalue) + (addit (char-code #\_) char-attribundelimattribvalue) + (addit (char-code #\=) char-attribundelimattribvalue) + (addit (char-code #\+) char-attribundelimattribvalue) + (addit (char-code #\\) char-attribundelimattribvalue) + (addit (char-code #\|) char-attribundelimattribvalue) + (addit (char-code #\{) char-attribundelimattribvalue) + (addit (char-code #\}) char-attribundelimattribvalue) + (addit (char-code #\[) char-attribundelimattribvalue) + (addit (char-code #\]) char-attribundelimattribvalue) + (addit (char-code #\;) char-attribundelimattribvalue) + (addit (char-code #\') char-attribundelimattribvalue) + (addit (char-code #\") char-attribundelimattribvalue) + (addit (char-code #\,) char-attribundelimattribvalue) + (addit (char-code #\<) char-attribundelimattribvalue) + (addit (char-code #\?) char-attribundelimattribvalue) + + ; i'm not sure what can be in a tag name but we know that + ; ! and - must be there since it's used in comments + + (addit (char-code #\-) char-tagcharacter) + (addit (char-code #\!) char-tagcharacter) + + ; spaces + (addit (char-code #\space) char-spacechar) + (addit (char-code #\tab) char-spacechar) + (addit (char-code #\return) char-spacechar) + (addit (char-code #\linefeed) char-spacechar) + + ) + + + + arr)) + + +(defun char-characteristic (char bit) + (declare (optimize (speed 3) (safety 1))) + ;; return true if the given char has the given bit set in + ;; the characteristic array + (let ((code (char-code char))) + (if* (<= 0 code 127) + then ; in range + (not (zerop (logand (svref *characteristics* code) bit)))))) + + +(defstruct tokenbuf + cur ;; next index to use to grab from tokenbuf + max ;; index one beyond last character + data ;; character array + first-pass ;; previously parsed tokens + ) + +;; cache of tokenbuf structs +(defparameter *tokenbufs* (list nil nil nil nil)) + +(defun get-tokenbuf () + (declare (optimize (speed 3) (safety 1))) + (let (buf) + (mp::without-scheduling + (do* ((bufs *tokenbufs* (cdr bufs)) + (this (car bufs) (car bufs))) + ((null bufs)) + (if* this + then (setf (car bufs) nil) + (setq buf this) + (return)))) + (if* buf + then (setf (tokenbuf-cur buf) 0) + (setf (tokenbuf-max buf) 0) + buf + else (make-tokenbuf + :cur 0 + :max 0 + :data (make-array 1024 :element-type 'character))))) + +(defun put-back-tokenbuf (buf) + (declare (optimize (speed 3) (safety 1))) + (mp::without-scheduling + (do ((bufs *tokenbufs* (cdr bufs))) + ((null bufs) + ; toss it away + nil) + (if* (null (car bufs)) + then (setf (car bufs) buf) + (return))))) + +(defun to-preferred-case (ch) + (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER) + then (char-upcase ch) + else (char-downcase ch))) + + +(defun next-token (stream ignore-strings raw-mode-delimiter + read-sequence-func tokenbuf) + (declare (optimize (speed 3) (safety 1))) + ;; return two values: + ;; the next token from the stream. + ;; the kind of token (:pcdata, :start-tag, :end-tag, :eof) + ;; + ;; if read-sequence-func is non-nil, + ;; read-sequence-func is called to fetch the next character + (macrolet ((next-char (stream) + `(let ((cur (tokenbuf-cur tokenbuf)) + (tb (tokenbuf-data tokenbuf))) + (if* (>= cur (tokenbuf-max tokenbuf)) + then ; fill buffer + (if* (zerop (setf (tokenbuf-max tokenbuf) + (if* read-sequence-func + then (funcall read-sequence-func tb stream) + else (read-sequence tb stream)))) + then (setq cur nil) ; eof + else (setq cur 0))) + (if* cur + then (prog1 (schar tb cur) + (setf (tokenbuf-cur tokenbuf) (1+ cur)))))) + + + (un-next-char (stream ch) + `(decf (tokenbuf-cur tokenbuf))) + + (clear-coll (coll) + `(setf (collector-next coll) 0)) + + (add-to-coll (coll ch) + `(let ((.next. (collector-next ,coll))) + (if* (>= .next. (collector-max ,coll)) + then (grow-and-add ,coll ,ch) + else (setf (schar (collector-data ,coll) .next.) + ,ch) + (setf (collector-next ,coll) (1+ .next.))))) + + ) + + (let ((state (if* raw-mode-delimiter then state-rawdata else state-pcdata)) + (coll (get-collector)) + (ch) + + (value-delim) + + (tag-to-return) + (attribs-to-return) + + (end-tag) + + (attrib-name) + (attrib-value) + + (name-length 0) ;; count only when it could be a comment + + (raw-length 0) + (xml-bailout) + ) + + (loop + + (setq ch (next-char stream)) + ;;(format t "ch: ~s state: ~s~%" ch state) + + (if* (null ch) + then (return) ; eof -- exit loop + ) + + + (case state + (#.state-pcdata + ; collect everything until we see a < + (if* (eq ch #\<) + then ; if we've collected nothing then get a tag + (if* (> (collector-next coll) 0) + then ; have collected something, return this string + (un-next-char stream ch) ; push back the < + (return) + else ; collect a tag + (setq state state-readtagfirst)) + else ; we will check for & here eventually + (if* (not (eq ch #\return)) + then (add-to-coll coll ch)))) + + (#.state-readtagfirst + ; starting to read a tag name + (if* (eq #\/ ch) + then ; end tag + (setq end-tag t) + else (if* (eq #\! ch) ; possible comment + then (setf xml-bailout t) + (setq name-length 0)) + (un-next-char stream ch)) + (setq state state-readtag)) + + (#.state-readtag + ;; reading the whole tag name + (if* (char-characteristic ch char-tagcharacter) + then (add-to-coll coll (to-preferred-case ch)) + (incf name-length) + (if* (and (eq name-length 3) + (coll-has-comment coll)) + then (clear-coll coll) + (setq state state-readcomment)) + + else (setq tag-to-return (compute-tag coll)) + (clear-coll coll) + (if* (eq ch #\>) + then (return) ; we're done + elseif xml-bailout then + (un-next-char stream ch) + (return) + else (if* (eq tag-to-return :!--) + then ; a comment + (setq state state-readcomment) + else (un-next-char stream ch) + (setq state state-findattribname))))) + + (#.state-findattribname + ;; search until we find the start of an attribute name + ;; or the end of the tag + (if* (eq ch #\>) + then ; end of the line + (return) + elseif (eq ch #\=) + then ; value for previous attribute name + ; (syntax "foo = bar" is bogus I think but it's + ; used some places, here is where we handle this + (pop attribs-to-return) + (setq attrib-name (pop attribs-to-return)) + (setq state state-findvalue) + elseif (char-characteristic ch char-attribnamechar) + then (un-next-char stream ch) + (setq state state-attribname) + else nil ; ignore other things + )) + + (#.state-findvalue + ;; find the start of the value + (if* (char-characteristic ch char-spacechar) + thenret ; keep looking + elseif (eq ch #\>) + then ; no value, set the value to be the + ; name as a string + (setq attrib-value + (string-downcase (string attrib-name))) + + (push attrib-name attribs-to-return) + (push attrib-value attribs-to-return) + (un-next-char stream ch) + (setq state state-findattribname) + else (un-next-char stream ch) + (setq state state-attribstartvalue))) + + + (#.state-attribname + ;; collect attribute name + + (if* (char-characteristic ch char-attribnamechar) + then (add-to-coll coll (to-preferred-case ch)) + elseif (eq #\= ch) + then ; end of attribute name, value is next + (setq attrib-name (compute-tag coll)) + (clear-coll coll) + (setq state state-attribstartvalue) + else ; end of attribute name with no value, + (setq attrib-name (compute-tag coll)) + (clear-coll coll) + (setq attrib-value + (string-downcase (string attrib-name))) + (push attrib-name attribs-to-return) + (push attrib-value attribs-to-return) + (un-next-char stream ch) + (setq state state-findattribname))) + + (#.state-attribstartvalue + ;; begin to collect value + (if* (or (eq ch #\") + (eq ch #\')) + then (setq value-delim ch) + (setq state state-attribvaluedelim) + ;; gobble spaces; assume since we've seen a '=' there really is a value + elseif (eq #\space ch) then nil + else (un-next-char stream ch) + (setq state state-attribvaluenodelim))) + + (#.state-attribvaluedelim + (if* (eq ch value-delim) + then (setq attrib-value (compute-coll-string coll)) + (clear-coll coll) + (push attrib-name attribs-to-return) + (push attrib-value attribs-to-return) + (setq state state-findattribname) + else (add-to-coll coll ch))) + + (#.state-attribvaluenodelim + ;; an attribute value not delimited by ' or " and thus restricted + ;; in the possible characters + (if* (char-characteristic ch char-attribundelimattribvalue) + then (add-to-coll coll ch) + else (un-next-char stream ch) + (setq attrib-value (compute-coll-string coll)) + (clear-coll coll) + (push attrib-name attribs-to-return) + (push attrib-value attribs-to-return) + (setq state state-findattribname))) + + (#.state-readcomment + ;; a comment ends on the first --, but we'll look for --> + ;; since that's what most people expect + (if* (eq ch #\-) + then (setq state state-readcomment-one) + else (add-to-coll coll ch))) + + (#.state-readcomment-one + ;; seen one -, looking for -> + + (if* (eq ch #\-) + then (setq state state-readcomment-two) + else ; not a comment end, put back the -'s + (add-to-coll coll #\-) + (add-to-coll coll ch) + (setq state state-readcomment))) + + (#.state-readcomment-two + ;; seen two -'s, looking for > + + (if* (eq ch #\>) + then ; end of the line + (return) + elseif (eq ch #\-) + then ; still at two -'s, have to put out first + (add-to-coll coll #\-) + else ; put out two hypens and back to looking for a hypen + (add-to-coll coll #\-) + (add-to-coll coll #\-) + (setq state state-readcomment))) + + (#.state-rawdata + ;; collect everything until we see the delimiter + (if* (eq (to-preferred-case ch) (elt raw-mode-delimiter raw-length)) + then + (incf raw-length) + (when (= raw-length (length raw-mode-delimiter)) + ;; push the end tag back so it can then be lexed + ;; but don't do it for xml stuff + (when (/= (length raw-mode-delimiter) 1) + (push :end-tag (tokenbuf-first-pass tokenbuf)) + (if* (equal raw-mode-delimiter "") + then (push :STYLE (tokenbuf-first-pass tokenbuf)) + elseif (equal raw-mode-delimiter "") + then (push :style (tokenbuf-first-pass tokenbuf)) + elseif (equal raw-mode-delimiter "") + then (push :SCRIPT (tokenbuf-first-pass tokenbuf)) + elseif (equal raw-mode-delimiter "") + then (push :script (tokenbuf-first-pass tokenbuf)) + else (error "unexpected raw-mode-delimiter")) + ) + ;; set state to state-pcdata for next section + (return)) + else + ;; push partial matches into data string + (dotimes (i raw-length) + (add-to-coll coll (elt raw-mode-delimiter i))) + (setf raw-length 0) + (add-to-coll coll ch))) + + )) + + + ;; out of the loop. + ;; if we're in certain states then it means we should return a value + ;; + (case state + ((#.state-pcdata #.state-rawdata) + ;; return the buffer as a string + (if* (zerop (collector-next coll)) + then (values nil (if (eq state state-pcdata) :eof :pcdata)) + else (values (prog1 + (if* (null ignore-strings) + then (compute-coll-string coll)) + (put-back-collector coll)) + :pcdata))) + + (#.state-readtag + (when (null tag-to-return) + (error "unexpected end of input encountered")) + ;; we've read a tag with no attributes + (put-back-collector coll) + (values tag-to-return + (if* end-tag + then :end-tag + else (if* xml-bailout then :xml else :start-tag)) + )) + + (#.state-findattribname + ;; returning a tag with possible attributes + (put-back-collector coll) + (if* end-tag + then ; ignore any attributes + (values tag-to-return :end-tag) + elseif attribs-to-return + then (values (cons tag-to-return + (nreverse attribs-to-return)) + :start-tag) + else (values tag-to-return :start-tag))) + + (#.state-readcomment-two + ;; returning a comment + (values (prog1 (if* (null ignore-strings) + then (compute-coll-string coll)) + (put-back-collector coll)) + :comment)) + + (t + (if* (null ch) then (error "unexpected end of input encountered") + else (error "internal error, can't be here in state ~d" state))))))) + + +(defvar *kwd-package* (find-package :keyword)) + +(defun compute-tag (coll) + (declare (optimize (speed 3) (safety 1))) + ;; compute the symbol named by what's in the collector + (excl::intern* (collector-data coll) (collector-next coll) *kwd-package*)) + + + +(defun compute-coll-string (coll) + (declare (optimize (speed 3) (safety 1))) + ;; return the string that's in the collection + (let ((str (make-string (collector-next coll))) + (from (collector-data coll))) + (dotimes (i (collector-next coll)) + (setf (schar str i) (schar from i))) + + str)) + +(defun coll-has-comment (coll) + (declare (optimize (speed 3) (safety 1))) + ;; true if the collector has exactly "!--" in it + (and (eq 3 (collector-next coll)) + (let ((data (collector-data coll))) + (and (eq #\! (schar data 0)) + (eq #\- (schar data 1)) + (eq #\- (schar data 2)))))) + + +;;;;;;;;;;; quick and dirty parse + +; the elements with no body and thus no end tag +(dolist (opt '(:area :base :basefont :bgsound :br :button :col + ;;:colgroup - no, this is an element with contents + :embed :hr :img :frame + :input :isindex :keygen :link :meta + :plaintext :spacer :wbr)) + (setf (tag-no-end opt) t)) + +(defvar *in-line* '(:tt :i :b :big :small :em :strong :dfn :code :samp :kbd + :var :cite :abbr :acronym :a :img :object :br :map + :q :sub :sup :span :bdo :input :select :textarea :label :button :font)) + +(defvar *ch-format* '(:i :b :tt :big :small :strike :s :u + :em :strong :font)) + +(defvar *known-tags* '(:!doctype :a :acronym :address :applet :area :b :base :basefont + :bdo :bgsound :big :blink :blockquote :body :br :button :caption + :center :cite :code :col :colgroup :comment :dd :del :dfn :dir + :div :dl :dt :em :embed :fieldset :font :form :frame :frameset + :h1 :h2 :h3 :h4 :h5 :h6 :head :hr :html :i :iframe :img :input + :ins :isindex :kbd :label :layer :legend :li :link :listing :map + :marquee :menu :meta :multicol :nobr :noframes :noscript :object + :ol :option :p :param :plaintext :pre :q :samp :script :select + :small :spacer :span :s :strike :strong :style :sub :sup :table + :tbody :td :textarea :tfoot :th :thead :title :tr :tt :u :ul :var + :wbr :xmp)) + +; the elements whose start tag can end a previous tag + +(setf (tag-auto-close :tr) '(:tr :td :th :colgroup)) +(setf (tag-auto-close-stop :tr) '(:table)) + +(setf (tag-auto-close :td) '(:td :th)) +(setf (tag-auto-close-stop :td) '(:table)) + +(setf (tag-auto-close :th) '(:td :th)) +(setf (tag-auto-close-stop :th) '(:table)) + +(setf (tag-auto-close :dt) '(:dt :dd)) +(setf (tag-auto-close-stop :dt) '(:dl)) + +(setf (tag-auto-close :li) '(:li)) +(setf (tag-auto-close-stop :li) '(:ul :ol)) + +;; new stuff to close off tags with optional close tags +(setf (tag-auto-close :address) '(:head :p)) +(setf (tag-auto-close :blockquote) '(:head :p)) +(setf (tag-auto-close :body) '(:body :frameset :head)) + +(setf (tag-auto-close :dd) '(:dd :dt)) +(setf (tag-auto-close-stop :dd) '(:dl)) + +(setf (tag-auto-close :dl) '(:head :p)) +(setf (tag-auto-close :div) '(:head :p)) +(setf (tag-auto-close :fieldset) '(:head :p)) +(setf (tag-auto-close :form) '(:head :p)) +(setf (tag-auto-close :frameset) '(:body :frameset :head)) +(setf (tag-auto-close :hr) '(:head :p)) +(setf (tag-auto-close :h1) '(:head :p)) +(setf (tag-auto-close :h2) '(:head :p)) +(setf (tag-auto-close :h3) '(:head :p)) +(setf (tag-auto-close :h4) '(:head :p)) +(setf (tag-auto-close :h5) '(:head :p)) +(setf (tag-auto-close :h6) '(:head :p)) +(setf (tag-auto-close :noscript) '(:head :p)) +(setf (tag-auto-close :ol) '(:head :p)) + +(setf (tag-auto-close :option) '(:option)) +(setf (tag-auto-close-stop :option) '(:select)) + +(setf (tag-auto-close :p) '(:head :p)) + +(setf (tag-auto-close :pre) '(:head :p)) +(setf (tag-auto-close :table) '(:head :p)) + +(setf (tag-auto-close :tbody) '(:colgroup :tfoot :tbody :thead)) +(setf (tag-auto-close-stop :tbody) '(:table)) + +(setf (tag-auto-close :tfoot) '(:colgroup :tfoot :tbody :thead)) +(setf (tag-auto-close-stop :tfoot) '(:table)) + +(setf (tag-auto-close :thead) '(:colgroup :tfoot :tbody :thead)) +(setf (tag-auto-close-stop :thead) '(:table)) + +(setf (tag-auto-close :ul) '(:head :p)) + +(setf (tag-no-pcdata :table) t) +(setf (tag-no-pcdata :tr) t) + + +(defmethod parse-html ((p stream) &key callback-only callbacks collect-rogue-tags + no-body-tags) + (declare (optimize (speed 3) (safety 1))) + (phtml-internal p nil callback-only callbacks collect-rogue-tags + no-body-tags)) + +(defmacro tag-callback (tag) + `(rest (assoc ,tag callbacks))) + +(defun phtml-internal (p read-sequence-func callback-only callbacks collect-rogue-tags + + no-body-tags) + (declare (optimize (speed 3) (safety 1))) + (let ((raw-mode-delimiter nil) + (pending nil) + (current-tag :start-parse) + (last-tag :start-parse) + (current-callback-tags nil) + (pending-ch-format nil) + (closed-pending-ch-format nil) + (new-opens nil) + (tokenbuf (get-tokenbuf)) + (guts) + (rogue-tags) + ) + (labels ((close-off-tags (name stop-at collect-rogues) + ;; close off an open 'name' tag, but search no further + ;; than a 'stop-at' tag. + (if* (member (tag-name current-tag) name :test #'eq) + then ;; close current tag(s) + (loop + (when (and collect-rogues + (not (member (tag-name current-tag) + *known-tags*))) + (push (tag-name current-tag) rogue-tags)) + (close-current-tag) + (when (or (member (tag-name current-tag) + *ch-format*) + (not (member + (tag-name current-tag) name :test #'eq))) + (return))) + elseif (member (tag-name current-tag) stop-at :test #'eq) + then nil + else ; search if there is a tag to close + (dolist (ent pending) + (if* (member (tag-name (car ent)) name :test #'eq) + then ; found one to close + (loop + (when (and collect-rogues + (not (member (tag-name current-tag) + *known-tags*))) + (push (tag-name current-tag) rogue-tags)) + (close-current-tag) + (if* (member (tag-name current-tag) name + :test #'eq) + then (close-current-tag) + (return))) + (return) + elseif (member (tag-name (car ent)) stop-at + :test #'eq) + then (return) ;; do nothing + )))) + + (close-current-tag () + ;; close off the current tag and open the pending tag + (when (member (tag-name current-tag) *ch-format* :test #'eq) + (push current-tag closed-pending-ch-format) + ) + (let (element) + (if* (tag-no-pcdata (tag-name current-tag)) + then (setq element `(,current-tag + ,@(strip-rev-pcdata guts))) + else (setq element `(,current-tag ,@(nreverse guts)))) + (let ((callback (tag-callback (tag-name current-tag)))) + (when callback + (setf current-callback-tags (rest current-callback-tags)) + (funcall callback element))) + (let* ((prev (pop pending))) + (setq current-tag (car prev) + guts (cdr prev)) + (push element guts)))) + + (save-state () + ;; push the current tag state since we're starting + ;; a new open tag + (push (cons current-tag guts) pending)) + + + (strip-rev-pcdata (stuff) + ;; reverse the list stuff, omitting all the strings + (let (res) + (dolist (st stuff) + (if* (not (stringp st)) then (push st res))) + res)) + (check-in-line (check-tag) + (setf new-opens nil) + (let (val kind (i 0) + (length (length (tokenbuf-first-pass tokenbuf)))) + (loop + (if* (< i length) then + (setf val (nth i (tokenbuf-first-pass tokenbuf))) + (setf kind (nth (+ i 1) (tokenbuf-first-pass tokenbuf))) + (setf i (+ i 2)) + (if* (= i length) then (setf (tokenbuf-first-pass tokenbuf) + (nreverse (tokenbuf-first-pass tokenbuf)))) + else + (multiple-value-setq (val kind) + (get-next-token t)) + (push val (tokenbuf-first-pass tokenbuf)) + (push kind (tokenbuf-first-pass tokenbuf)) + ) + (when (eq kind :eof) + (if* (= i length) then + (setf (tokenbuf-first-pass tokenbuf) + (nreverse (tokenbuf-first-pass tokenbuf)))) + (return)) + (when (and (eq val check-tag) (eq kind :end-tag)) + (if* (= i length) then + (setf (tokenbuf-first-pass tokenbuf) + (nreverse (tokenbuf-first-pass tokenbuf)))) + (return)) + (when (member val *ch-format* :test #'eq) + (if* (eq kind :start-tag) then (push val new-opens) + elseif (member val new-opens :test #'eq) then + (setf new-opens (remove val new-opens :count 1)) + else (close-off-tags (list val) nil nil) + ))))) + + (get-next-token (force) + (if* (or force (null (tokenbuf-first-pass tokenbuf))) then + (multiple-value-bind (val kind) + (next-token p nil raw-mode-delimiter read-sequence-func + tokenbuf) + (values val kind)) + else + (let ((val (first (tokenbuf-first-pass tokenbuf))) + (kind (second (tokenbuf-first-pass tokenbuf)))) + (setf (tokenbuf-first-pass tokenbuf) + (rest (rest (tokenbuf-first-pass tokenbuf)))) + (values val kind)))) + ) + (loop + (multiple-value-bind (val kind) + (get-next-token nil) + ;;(format t "val: ~s kind: ~s~%" val kind) + (case kind + (:pcdata + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (if* (member last-tag *in-line*) + then + (push val guts) + else + (when (dotimes (i (length val) nil) + (when (not (char-characteristic (elt val i) + char-spacechar)) + (return t))) + (push val guts)))) + (when (and (= (length raw-mode-delimiter) 1) ;; xml tag... + (or (and callback-only current-callback-tags) + (not callback-only))) + (close-off-tags (list last-tag) nil nil)) + (setf raw-mode-delimiter nil) + ) + + (:xml + (setf last-tag val) + (setf raw-mode-delimiter ">") + (let* ((name (tag-name val))) + (when (and callback-only (tag-callback name)) + (push name current-callback-tags)) + (save-state) + (setq current-tag val) + (setq guts nil) + )) + + (:start-tag + (setf last-tag val) + (if* (or (eq last-tag :style) + (and (listp last-tag) (eq (first last-tag) :style))) + then + (setf raw-mode-delimiter + (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER) + then "" + else "")) + elseif (or (eq last-tag :script) + (and (listp last-tag) (eq (first last-tag) :script))) + then + (setf raw-mode-delimiter + (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER) + then "" + else ""))) + ; maybe this is an end tag too + (let* ((name (tag-name val)) + (auto-close (tag-auto-close name)) + (auto-close-stop nil) + (no-end (or (tag-no-end name) (member name no-body-tags)))) + (when (and callback-only (tag-callback name)) + (push name current-callback-tags)) + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (if* auto-close + then (setq auto-close-stop (tag-auto-close-stop name)) + (close-off-tags auto-close auto-close-stop nil)) + (when (and pending-ch-format (not no-end)) + (if* (member name *ch-format* :test #'eq) then nil + elseif (member name *in-line* :test #'eq) then + ;; close off only tags that are within *in-line* block + (check-in-line name) + else ;; close ALL pending char tags and then reopen + (dolist (this-tag (reverse pending-ch-format)) + (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil)) + )) + (if* no-end + then ; this is a singleton tag + (let ((callback (tag-callback (tag-name (if* (atom val) + then val + else (first val)))))) + (when callback + (funcall callback (if* (atom val) + then val + else (list val))))) + (push (if* (atom val) + then val + else (list val)) + guts) + else (save-state) + (setq current-tag val) + (setq guts nil)) + (if* (member name *ch-format* :test #'eq) + then (push val pending-ch-format) + else (when (not + (or (eq last-tag :style) + (and (listp last-tag) (eq (first last-tag) :style)) + (eq last-tag :script) + (and (listp last-tag) (eq (first last-tag) :script)))) + (dolist (tmp (reverse closed-pending-ch-format)) + (save-state) + (setf current-tag tmp) + (setf guts nil))) + ) + (when (not + (or (eq last-tag :style) + (and (listp last-tag) (eq (first last-tag) :style)) + (eq last-tag :script) + (and (listp last-tag) (eq (first last-tag) :script)))) + (setf closed-pending-ch-format nil)) + ))) + + (:end-tag + (setf raw-mode-delimiter nil) + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (close-off-tags (list val) nil nil) + (when (member val *ch-format* :test #'eq) + (setf pending-ch-format + (remove val pending-ch-format :count 1 + :test #'(lambda (x y) (eq x (if (listp y) (first y) y))))) + (setf closed-pending-ch-format + (remove val closed-pending-ch-format :count 1 + :test #'(lambda (x y) (eq x (if (listp y) (first y) y))))) + ) + (dolist (tmp (reverse closed-pending-ch-format)) + (save-state) + (setf current-tag tmp) + (setf guts nil)) + (setf closed-pending-ch-format nil) + )) + + (:comment + (setf raw-mode-delimiter nil) + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (push `(:comment ,val) guts))) + + (:eof + (setf raw-mode-delimiter nil) + ;; close off all tags + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (close-off-tags '(:start-parse) nil collect-rogue-tags)) + (put-back-tokenbuf tokenbuf) + (if collect-rogue-tags + (return (values (cdar guts) rogue-tags)) + (return (cdar guts)))))))))) + + + +(defmethod parse-html (file &key callback-only callbacks collect-rogue-tags + no-body-tags) + (declare (optimize (speed 3) (safety 1))) + (with-open-file (p file :direction :input) + (parse-html p :callback-only callback-only :callbacks callbacks + :collect-rogue-tags collect-rogue-tags + :no-body-tags no-body-tags))) + + +(defmethod parse-html ((str string) &key callback-only callbacks collect-rogue-tags + no-body-tags) + (declare (optimize (speed 3) (safety 1))) + (parse-html (make-string-input-stream str) + :callback-only callback-only :callbacks callbacks + :collect-rogue-tags collect-rogue-tags + :no-body-tags no-body-tags)) + + + + + + + + + +;;;;;;;;;;;; test + +;;;(defun doit (ignore-data) +;;; (with-open-file (p "readme.htm") +;;; (loop +;;; (multiple-value-bind (val kind) (next-token p ignore-data) +;;; ;(format t "~s -> ~s~%" kind val) +;;; +;;; (if* (eq kind :eof) then (return)))))) +;;; +;;;(defun pdoit (&optional (file "testa.html")) +;;; (with-open-file (p file) +;;; (parse-html p))) +;;; +;;; +;;;;; requires http client module to work +;;;(defun getparse (host path) +;;; (parse-html (httpr-body +;;; (parse-response +;;; (simple-get host path))))) + +(provide :phtml) diff --git a/phtml.htm b/phtml.htm new file mode 100644 index 0000000..255dcf2 --- /dev/null +++ b/phtml.htm @@ -0,0 +1,254 @@ + + + +A Lisp Based HTML Parser + + + + + +

A Lisp Based HTML Parser

+ +

Introduction/Simple Example
+LHTML  parse output format
+Case mode notes
+Parsing HTML comments
+Parsing <SCRIPT> and <STYLE> tags
+Parsing SGML <! tags
+Parsing Illegal and Deprecated Tags
+Default Attribute Values
+Parsing Interleaved Character Formatting Tags
+parse-html reference
+   methods
+   phtml-internal

+ +

The parse-html generic function processes HTML +input, returning a list of HTML tags, attributes, and text. Here is a simple example:
+
+(parse-html "<HTML>
+                    +<HEAD>
+                    +<TITLE>Example HTML input</TITLE>
+                    +<BODY>
+                    +<P>Here is some text with a <B>bold</B> word<br>and a <A +HREF=\"help.html\">link</P>
+                    +</HTML>")

+ +

generates:
+
+((:html (:head (:title "Example HTML input"))
+  (:body (:p "Here is some text with a " (:b "bold") " +word" :br "and a "
+                  +((:a :href "help.html") "link")))))
+

+ +

The output format is known as LHTML format; it is the same format that the
+aserve htmlgen macro accepts.
+
+LHTML format
+
+LHTML is a list representation of HTML tags and content.
+
+Each list member may be: + +

    +
  1. a string containing text content, such as "Here is some text with a "
    +
  2. +
  3. a keyword package symbol representing a HTML tag with no associated attributes
    + or content, such as :br.
    +
  4. +
  5. a list representing an HTML tag with associated attributes and/or content,
    + such as (:b "bold") or ((:a :href "help.html") "link"). If + the HTML tag
    + does not have associated attributes, then the first list member will be a
    + keyword package symbol representing the HTML tag, and the other elements will
    + represent the content, which can be a string (text content), a keyword package symbol + (HTML
    + tag with no attributes or content), or list (nested HTML tag with
    + associated attributes and/or content). If there are associated attributes,
    + then the first list member will be a list containing a keyword package symbol
    + followed by two list members for each associated attribute; the first member is a keyword
    + package symbol representing the attribute, and the next member is a string corresponding
    + to the attribute value.
    +
  6. +
+ +

Case Mode and LHTML

+ +

If excl:*current-case-mode* is :CASE-INSENSITIVE-UPPER, keyword package symbols will be
+in upper case; otherwise, they will be in lower case.

+ +

HTML Comments

+ +

HTML comments are represented use a :comment symbol. For example,
+
+(parse-html "<!-- this is a comment-->")
+
+--> ((:comment " this is a comment"))

+ +

HTML <SCRIPT> and <STYLE> tags

+ +

All <SCRIPT> and <STYLE> content is not parsed; it is returned as text +content.
+
+For example,
+
+(parse-html "<SCRIPT>this <B>will not</B> be +parsed</SCRIPT>")
+
+--> ((:script "this <B>will not</B> be parsed"))

+ +

XML and SGML <! tags

+ +

Since, some HTML pages contain special XML/SGML tags, non-comment tags
+starting with '<!' are treated specially:
+
+(parse-html "<!doctype this is some text>")
+
+--> ((:!doctype " this is some text"))

+ +

Illegal and Deprecated HTML

+ +

There is plenty of illegal and deprecated HTML on the web that popular browsers
+nonetheless successfully display. The parse-html parser is generous - it will not
+raise an error condition upon encountering most input. In particular, it does not
+maintain a list of legal HTML tags and will successfully parse nonsense input.
+
+For example,
+
+(parse-html "<this> <is> <some> <nonsense> +<input>")
+
+--> ((:this (:is (:some (:nonsense :input)))))
+
+In some situations, you may prefer a two-pass parse that results in a parse where
+deep nesting related to unrecognized tags is minimized:
+
+(let ((string "<this> <is> <some> <nonsense> </some> +<input>"))
+        (multiple-value-bind (res rogues)
+          (parse-html string +:collect-rogue-tags t)
+            (declare (ignorable +res))
+            (parse-html string +:no-body-tags rogues)))
+
+--> (:this :is (:some (:nonsense)) :input)
+
+See the :collect-rogue-tags and :no-body-tags argument +descriptions in the reference
+section below for more information.

+ +

Default Attribute values

+ +

As per the HTML 4.0 specification, attributes without specified values are given a +lower case
+string value that matches the attribute name.
+
+For example,
+
+(parse-html "<P here ARE some attributes>")
+
+--> (((:p :here "here" :are "are" :some "some" +:attributes "attributes")))

+ +

Interleaved Character Formatting Tags

+ +

Existing HTML pages often have character format tags that are interleaved among
+other tags. Such interleaving is removed in a manner consistent with the HTML 4.0
+specification.
+
+For example,
+
+(parse-html "<P>Here is <B>bold text<P>that spans</B>two +paragraphs")
+
+--> ((:p "Here is " (:b "bold text")) (:p (:b "that +spans") "two paragraphs"))

+ +
+ +

parse-html Reference
+
+parse-html [Generic function]
+
+Arguments: input-source &key callbacks callback-only
+            collect-rogue-tags +no-body-tags
+
+Returns LHTML output, as described above.
+
+The callbacks argument, if non-nil, should be an association list. Each list member's
+car (first) element specifies a keyword package symbol, and each list member's cdr (rest)
+element specifies a function object or a symbol naming a function. The function should
+expect one argument. The function will be invoked once for each time the HTML tag
+corresponding to the specified keyword package symbol is encountered in the HTML input; +the
+argument will be an LHTML list containing the tag, along with associated attributes and
+content. The default callbacks argument value is nil.
+
+The callback-only argument, if non-nil, directs parse-html to not generate a complete +LHTML
+output. Instead, LHTML lists will only be generated when necessary as arguments for +functions
+specified in the callbacks association list. This results in faster parser execution. The +default
+callback-only argument value is nil.
+
+The collect-rogue-tags argument, if non-nil, directs parse-html to return an additional +value,
+a list containing any unrecognized tags closed by the end of input.
+
+The no-body-tags argument, if non-nil, should be a list containing unknown tags that, if
+encountered, will be treated as a tag with no body or content, and thus, no associated end
+tag. Typically, the argument is a list or modified list resulting from an earlier +parse-html
+execution with the :collect-rogue-tags argument specified as non-nil.
+
+parse-html Methods
+
+parse-html (p stream) &key callbacks callback-only
+            collect-rogue-tags +no-body-tags
+
+parse-html (str string) &key callbacks callback-only
+            collect-rogue-tags +no-body-tags
+
+parse-html (file t) &key callbacks callback-only
+            collect-rogue-tags +no-body-tags
+
+The t method assumes the argument is a pathname suitable
+for use with the with-open-file macro.
+
+
+phtml-internal [Function]
+
+Arguments: stream read-sequence-func callback-only callbacks
+collect-rogue-tags no-body-tags
+
+This function may be used when more control is needed for supplying
+the HTML input. The read-sequence-func argument, if non-nil, should be a function
+object or a symbol naming a function. When phtml-internal requires another buffer
+of HTML input, it will invoke the read-sequence-func function with two arguments -
+the first argument is an internal buffer character array and the second argument is
+the phtml-internal stream argument. If read-sequence-fun is nil, phtml-internal
+will invoke read-sequence to fill the buffer. The read-sequence-func function must
+return the number of character array elements successfully stored in the buffer.
+
+
+
+
+
+
+
+

+ + diff --git a/phtml.html b/phtml.html new file mode 100644 index 0000000..255dcf2 --- /dev/null +++ b/phtml.html @@ -0,0 +1,254 @@ + + + +A Lisp Based HTML Parser + + + + + +

A Lisp Based HTML Parser

+ +

Introduction/Simple Example
+LHTML  parse output format
+Case mode notes
+Parsing HTML comments
+Parsing <SCRIPT> and <STYLE> tags
+Parsing SGML <! tags
+Parsing Illegal and Deprecated Tags
+Default Attribute Values
+Parsing Interleaved Character Formatting Tags
+parse-html reference
+   methods
+   phtml-internal

+ +

The parse-html generic function processes HTML +input, returning a list of HTML tags, attributes, and text. Here is a simple example:
+
+(parse-html "<HTML>
+                    +<HEAD>
+                    +<TITLE>Example HTML input</TITLE>
+                    +<BODY>
+                    +<P>Here is some text with a <B>bold</B> word<br>and a <A +HREF=\"help.html\">link</P>
+                    +</HTML>")

+ +

generates:
+
+((:html (:head (:title "Example HTML input"))
+  (:body (:p "Here is some text with a " (:b "bold") " +word" :br "and a "
+                  +((:a :href "help.html") "link")))))
+

+ +

The output format is known as LHTML format; it is the same format that the
+aserve htmlgen macro accepts.
+
+LHTML format
+
+LHTML is a list representation of HTML tags and content.
+
+Each list member may be: + +

    +
  1. a string containing text content, such as "Here is some text with a "
    +
  2. +
  3. a keyword package symbol representing a HTML tag with no associated attributes
    + or content, such as :br.
    +
  4. +
  5. a list representing an HTML tag with associated attributes and/or content,
    + such as (:b "bold") or ((:a :href "help.html") "link"). If + the HTML tag
    + does not have associated attributes, then the first list member will be a
    + keyword package symbol representing the HTML tag, and the other elements will
    + represent the content, which can be a string (text content), a keyword package symbol + (HTML
    + tag with no attributes or content), or list (nested HTML tag with
    + associated attributes and/or content). If there are associated attributes,
    + then the first list member will be a list containing a keyword package symbol
    + followed by two list members for each associated attribute; the first member is a keyword
    + package symbol representing the attribute, and the next member is a string corresponding
    + to the attribute value.
    +
  6. +
+ +

Case Mode and LHTML

+ +

If excl:*current-case-mode* is :CASE-INSENSITIVE-UPPER, keyword package symbols will be
+in upper case; otherwise, they will be in lower case.

+ +

HTML Comments

+ +

HTML comments are represented use a :comment symbol. For example,
+
+(parse-html "<!-- this is a comment-->")
+
+--> ((:comment " this is a comment"))

+ +

HTML <SCRIPT> and <STYLE> tags

+ +

All <SCRIPT> and <STYLE> content is not parsed; it is returned as text +content.
+
+For example,
+
+(parse-html "<SCRIPT>this <B>will not</B> be +parsed</SCRIPT>")
+
+--> ((:script "this <B>will not</B> be parsed"))

+ +

XML and SGML <! tags

+ +

Since, some HTML pages contain special XML/SGML tags, non-comment tags
+starting with '<!' are treated specially:
+
+(parse-html "<!doctype this is some text>")
+
+--> ((:!doctype " this is some text"))

+ +

Illegal and Deprecated HTML

+ +

There is plenty of illegal and deprecated HTML on the web that popular browsers
+nonetheless successfully display. The parse-html parser is generous - it will not
+raise an error condition upon encountering most input. In particular, it does not
+maintain a list of legal HTML tags and will successfully parse nonsense input.
+
+For example,
+
+(parse-html "<this> <is> <some> <nonsense> +<input>")
+
+--> ((:this (:is (:some (:nonsense :input)))))
+
+In some situations, you may prefer a two-pass parse that results in a parse where
+deep nesting related to unrecognized tags is minimized:
+
+(let ((string "<this> <is> <some> <nonsense> </some> +<input>"))
+        (multiple-value-bind (res rogues)
+          (parse-html string +:collect-rogue-tags t)
+            (declare (ignorable +res))
+            (parse-html string +:no-body-tags rogues)))
+
+--> (:this :is (:some (:nonsense)) :input)
+
+See the :collect-rogue-tags and :no-body-tags argument +descriptions in the reference
+section below for more information.

+ +

Default Attribute values

+ +

As per the HTML 4.0 specification, attributes without specified values are given a +lower case
+string value that matches the attribute name.
+
+For example,
+
+(parse-html "<P here ARE some attributes>")
+
+--> (((:p :here "here" :are "are" :some "some" +:attributes "attributes")))

+ +

Interleaved Character Formatting Tags

+ +

Existing HTML pages often have character format tags that are interleaved among
+other tags. Such interleaving is removed in a manner consistent with the HTML 4.0
+specification.
+
+For example,
+
+(parse-html "<P>Here is <B>bold text<P>that spans</B>two +paragraphs")
+
+--> ((:p "Here is " (:b "bold text")) (:p (:b "that +spans") "two paragraphs"))

+ +
+ +

parse-html Reference
+
+parse-html [Generic function]
+
+Arguments: input-source &key callbacks callback-only
+            collect-rogue-tags +no-body-tags
+
+Returns LHTML output, as described above.
+
+The callbacks argument, if non-nil, should be an association list. Each list member's
+car (first) element specifies a keyword package symbol, and each list member's cdr (rest)
+element specifies a function object or a symbol naming a function. The function should
+expect one argument. The function will be invoked once for each time the HTML tag
+corresponding to the specified keyword package symbol is encountered in the HTML input; +the
+argument will be an LHTML list containing the tag, along with associated attributes and
+content. The default callbacks argument value is nil.
+
+The callback-only argument, if non-nil, directs parse-html to not generate a complete +LHTML
+output. Instead, LHTML lists will only be generated when necessary as arguments for +functions
+specified in the callbacks association list. This results in faster parser execution. The +default
+callback-only argument value is nil.
+
+The collect-rogue-tags argument, if non-nil, directs parse-html to return an additional +value,
+a list containing any unrecognized tags closed by the end of input.
+
+The no-body-tags argument, if non-nil, should be a list containing unknown tags that, if
+encountered, will be treated as a tag with no body or content, and thus, no associated end
+tag. Typically, the argument is a list or modified list resulting from an earlier +parse-html
+execution with the :collect-rogue-tags argument specified as non-nil.
+
+parse-html Methods
+
+parse-html (p stream) &key callbacks callback-only
+            collect-rogue-tags +no-body-tags
+
+parse-html (str string) &key callbacks callback-only
+            collect-rogue-tags +no-body-tags
+
+parse-html (file t) &key callbacks callback-only
+            collect-rogue-tags +no-body-tags
+
+The t method assumes the argument is a pathname suitable
+for use with the with-open-file macro.
+
+
+phtml-internal [Function]
+
+Arguments: stream read-sequence-func callback-only callbacks
+collect-rogue-tags no-body-tags
+
+This function may be used when more control is needed for supplying
+the HTML input. The read-sequence-func argument, if non-nil, should be a function
+object or a symbol naming a function. When phtml-internal requires another buffer
+of HTML input, it will invoke the read-sequence-func function with two arguments -
+the first argument is an internal buffer character array and the second argument is
+the phtml-internal stream argument. If read-sequence-fun is nil, phtml-internal
+will invoke read-sequence to fill the buffer. The read-sequence-func function must
+return the number of character array elements successfully stored in the buffer.
+
+
+
+
+
+
+
+

+ + diff --git a/phtml.txt b/phtml.txt new file mode 100644 index 0000000..f6528b5 --- /dev/null +++ b/phtml.txt @@ -0,0 +1,191 @@ +Preliminary HTML Parser documentation + +Pending tasks: + + . integrate with aserve components, such as htmlgen and LHTML description + + +Description + +The parse-html function processes HTML input, returning a list of HTML tags, +attributes, and text. Here is a simple example: + +(parse-html " + + Example HTML input + +

Here is some text with a bold word
and a link

+ ") + +--> + +((:html (:head (:title "Example HTML input")) + (:body (:p "Here is some text with a " (:b "bold") " word" :br "and a " + ((:a :href "help.html") "link"))))) + + +The output format is known as LHTML format; it is the same format that the +aserve htmlgen macro accepts. + +Here is a description of LHTML: + +LHTML is a list representation of HTML tags and content. + +Each list member may be: + +a. a string containing text content, such as "Here is some text with a " + +b. a keyword package symbol representing a HTML tag with no associated attributes + or content, such as :br. + +c. a list representing an HTML tag with associated attributes and/or content, + such as (:b "bold") or ((:a :href "help.html") "link"). If the HTML tag + does not have associated attributes, then the first list member will be a + keyword package symbol representing the HTML tag, and the other elements will + represent the content, which can be a string (text content), a keyword package symbol (HTML + tag with no attributes or content), or list (nested HTML tag with + associated attributes and/or content). If there are associated attributes, + then the first list member will be a list containing a keyword package symbol + followed by two list members for each associated attribute; the first member is a keyword + package symbol representing the attribute, and the next member is a string corresponding + to the attribute value. + +Here are some additional details about parse-html output: + +1. If excl:*current-case-mode* is :CASE-INSENSITIVE-UPPER, keyword package symbols will be + in upper case; otherwise, they will be in lower case. + +2. HTML comments are represented use a :comment symbol. For example, + + (parse-html "") + +--> ((:comment " this is a comment")) + +3. All ") + +--> ((:script "this will not be parsed")) + +4. Since, some HTML pages contain special XML/SGML tags, non-comment tags + starting with '") + +--> ((:!doctype " this is some text")) + +5. There is plenty of illegal and deprecated HTML on the web that popular browsers + nonetheless successfully display. The parse-html parser is generous - it will not + raise an error condition upon encountering most input. In particular, it does not + maintain a list of legal HTML tags and will successfully parse nonsense input. + + For example, + + (parse-html " ") + +--> ((:this (:is (:some (:nonsense :input))))) + + In some situations, you may prefer a two-pass parse that results in a parse where + deep nesting related to unrecognized tags is minimized: + + (let ((string " ")) + (multiple-value-bind (res rogues) + (parse-html string :collect-rogue-tags t) + (declare (ignorable res)) + (parse-html string :no-body-tags rogues))) + +--> (:this :is (:some (:nonsense)) :input) + + See the :collect-rogue-tags and :no-body-tags argument descriptions in the reference + section below for more information. + +6. As per the HTML 4.0 specification, attributes without specified values are given a lower case + string value that matches the attribute name. + + For example, + + (parse-html "

") + +--> (((:p :here "here" :are "are" :some "some" :attributes "attributes"))) + +7. Existing HTML pages often have character format tags that are interleaved among + other tags. Such interleaving is removed in a manner consistent with the HTML 4.0 + specification. + + For example, + + (parse-html "

Here is bold text

that spanstwo paragraphs") + +--> ((:p "Here is " (:b "bold text")) (:p (:b "that spans") "two paragraphs")) + +----------------------------------------------------- + +parse-html reference + +parse-html [Generic function] + +Arguments: input-source &key callbacks callback-only + collect-rogue-tags no-body-tags + +Returns LHTML output, as described above. + +The callbacks argument, if non-nil, should be an association list. Each list member's +car (first) element specifies a keyword package symbol, and each list member's cdr (rest) +element specifies a function object or a symbol naming a function. The function should +expect one argument. The function will be invoked once for each time the HTML tag +corresponding to the specified keyword package symbol is encountered in the HTML input; the +argument will be an LHTML list containing the tag, along with associated attributes and +content. The default callbacks argument value is nil. + +The callback-only argument, if non-nil, directs parse-html to not generate a complete LHTML +output. Instead, LHTML lists will only be generated when necessary as arguments for functions +specified in the callbacks association list. This results in faster parser execution. The default +callback-only argument value is nil. + +The collect-rogue-tags argument, if non-nil, directs parse-html to return an additional value, +a list containing any unrecognized tags closed by the end of input. + +The no-body-tags argument, if non-nil, should be a list containing unknown tags that, if +encountered, will be treated as a tag with no body or content, and thus, no associated end +tag. Typically, the argument is a list or modified list resulting from an earlier parse-html +execution with the :collect-rogue-tags argument specified as non-nil. + + + +parse-html Methods + +parse-html (p stream) &key callbacks callback-only + collect-rogue-tags no-body-tags + +parse-html (str string) &key callbacks callback-only + collect-rogue-tags no-body-tags + +parse-html (file t) &key callbacks callback-only + collect-rogue-tags no-body-tags + +The t method assumes the argument is a pathname suitable +for use with the with-open-file macro. + + +phtml-internal [Function] + +Arguments: stream read-sequence-func callback-only callbacks + collect-rogue-tags no-body-tags + +This function may be used when more control is needed for supplying +the HTML input. The read-sequence-func argument, if non-nil, should be a function +object or a symbol naming a function. When phtml-internal requires another buffer +of HTML input, it will invoke the read-sequence-func function with two arguments - +the first argument is an internal buffer character array and the second argument is +the phtml-internal stream argument. If read-sequence-fun is nil, phtml-internal +will invoke read-sequence to fill the buffer. The read-sequence-func function must +return the number of character array elements successfully stored in the buffer. + + + + + + + diff --git a/pxml-test.cl b/pxml-test.cl new file mode 100644 index 0000000..0392f70 --- /dev/null +++ b/pxml-test.cl @@ -0,0 +1,161 @@ +;; +;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA +;; +;; This code is free software; you can redistribute it and/or +;; modify it under the terms of the version 2.1 of +;; the GNU Lesser General Public License as published by +;; the Free Software Foundation, as clarified by the AllegroServe +;; prequel found in license-allegroserve.txt. +;; +;; This code is distributed in the hope that it will be useful, +;; but without any warranty; without even the implied warranty of +;; merchantability or fitness for a particular purpose. See the GNU +;; Lesser General Public License for more details. +;; +;; Version 2.1 of the GNU Lesser General Public License is in the file +;; license-lgpl.txt that was distributed with this file. +;; If it is not present, you can access it from +;; http://www.gnu.org/copyleft/lesser.txt (until superseded by a newer +;; version) or write to the Free Software Foundation, Inc., 59 Temple Place, +;; Suite 330, Boston, MA 02111-1307 USA +;; + +;; Change Log +;; +;; 10/14/00 add namespace example; xml-error related change + +(eval-when (compile load eval) + (require :tester)) + +(defpackage :user (:use :net.uri :net.xml.parser)) ;; assumes pxml.cl loaded +(in-package :user) + +;; these functions are used in the OASIS xmltest subdirectories +;; see pxml.txt for more information + +(defun file-callback (filename token &optional public) + (declare (ignorable token public)) + ;;(format t "filename: ~s token: ~s public: ~s~%" filename token public) + (ignore-errors (open (uri-path filename)))) + +(defun test-one-file (int external-callback) + (let ((filename (concatenate 'string (format nil "~3,'0d" int) ".xml"))) + (equalp (with-open-file (p filename) + (parse-xml p :external-callback external-callback + :content-only t)) + (with-open-file (p (concatenate 'string "out/" filename)) + (parse-xml p))))) + +(defun test-some-files (max &key skip-list external-callback) + (dotimes (i max) + (if* (member (+ 1 i) skip-list) then + (format t "i: ~s skipping...~%" (+ 1 i)) + else + (format t "i: ~s equalp: ~s~%" (+ 1 i) (test-one-file (+ 1 i) external-callback))))) + +;; have to be in valid/sa directory when this is run +(defun test-sa-files () + (test-some-files 119 :external-callback 'file-callback :skip-list (list 52 64 89))) + +;; have to be in valid/ext-sa directory when this is run +(defun test-ext-sa-files () + (test-some-files 14 :external-callback 'file-callback )) + +;; have to be in valid/not-sa directory when this is run +(defun test-not-sa-files () + (test-some-files 31 :external-callback 'file-callback )) + +(defun test-one-bad-file (filename external-callback) + (ignore-errors + (with-open-file (p filename) + (parse-xml p :external-callback external-callback + :content-only t)))) + +(defun test-some-bad-files (max external-callback) + (dotimes (i max) + (let* ((index (+ 1 i)) + (filename (concatenate 'string (format nil "~3,'0d" index) ".xml"))) + (multiple-value-bind (val error) + (test-one-bad-file filename external-callback) + (format t "i: ~s error: ~s~%" + index (if error + (simple-condition-format-arguments error) val)))))) + +;; have to be in not-wf/sa directory when this is run +(defun test-not-wf-sa-files () + (test-some-bad-files 186 'file-callback)) + +;; have to be in not-wf/ext-sa directory when this is run +(defun test-not-wf-ext-sa-files () + (test-some-bad-files 3 'file-callback)) + +;; have to be in not-wf/not-sa directory when this is run +(defun test-not-wf-not-sa-files () + (test-some-bad-files 8 'file-callback)) + +;; the next stuff is used in the .txt file for documentation + +(defvar *xml-example-external-url* + "") + +(defun example-callback (var-name token &optional public) + (declare (ignorable token public)) + (setf var-name (uri-path var-name)) + (if* (equal var-name "null") then nil + else + (let ((string (eval (intern var-name (find-package :user))))) + (make-string-input-stream string)))) + +(defvar *xml-example-string* + " + + + + + + + + + + +]> +&ext1;") + +(defvar *xml-example-string2*) +(defvar *xml-example-string3*) + +;; bug fix testing +(setf *xml-example-string2* + " +]> +") + +(setf *xml-example-string3* + " +]> +") + +(defvar *xml-example-string4*) + +(setf *xml-example-string4* + " + + A Tale of Two Cities + + UK Library + 1999 + + 1999 + + ") \ No newline at end of file diff --git a/pxml.htm b/pxml.htm new file mode 100644 index 0000000..2cf26d5 --- /dev/null +++ b/pxml.htm @@ -0,0 +1,387 @@ + + + +A Lisp Based XML Parser + + + + + +

A Lisp Based XML Parser

+ +

Introduction/Simple Example
+LXML parse output format
+parse-xml non-validating parser properties
+case and international character support issues
+parse-xml and packages
+parse-xml, the XML Namespace specification, and packages
+ACL does not support Unicode 4 byte scalar values
+only little-endian Unicode tested in ACL 6.0 beta
+debugging aids
+XML Conformance test results
+Compiling and Loading the parser
+parse-xml reference

+ +

The parse-xml generic function processes XML +input, returning a list of XML tags,
+attributes, and text. Here is a simple example:
+
+(parse-xml "<item1><item2 att1='one'/>this is some +text</item1>")
+
+-->
+
+((item1 ((item2 att1 "one")) "this is some text"))
+
+The output format is known as LXML format.
+
+LXML Format
+
+LXML is a list representation of XML tags and content.
+
+Each list member may be:
+
+a. a string containing text content, such as "Here is some text with a "
+
+b. a list representing a XML tag with associated attributes and/or content, +such as ('item1 "text") or (('item1 :att1 "help.html") +"link"). If the XML tag +does not have associated attributes, then the first list member will be a +symbol representing the XML tag, and the other elements will +represent the content, which can be a string (text content), a symbol (XML +tag with no attributes or content), or list (nested XML tag with +associated attributes and/or content). If there are associated attributes, +then the first list member will be a list containing a symbol +followed by two list members for each associated attribute; the first member is a +symbol representing the attribute, and the next member is a string corresponding +to the attribute value.
+
+c. XML comments and or processing instructions - see the more detailed example below for +further information.

+ +

Non Validating Parser Properties

+ +

Parse-xml is a non-validating XML parser. It will detect non-well-formed XML input. +When
+processing valid XML input, parse-xml will optionally produce the same output as a +validating
+parser would, including the processing of an external DTD subset and external entity +declarations.
+
+By default, parse-xml outputs a DTD parse along with the parsed XML contents. The DTD +parse may
+be optionally suppressed. The following example shows DTD parsed output components:

+ +

(defvar *xml-example-external-url*
+   "<!ENTITY ext1 'this is some external entity %param1;'>")
+
+(defun example-callback (var-name token &optional public)
+  (declare (ignorable token public))
+  (setf var-name (uri-path var-name))
+  (if* (equal var-name "null") then nil
+    else
+      (let ((string (eval (intern var-name (find-package +:user)))))
+      (make-string-input-stream string))))
+
+(defvar *xml-example-string*
+"<?xml version='1.0' encoding='utf-8'?>
+<!-- the following XML input is well-formed but its validity has not been checked ... +-->
+<?piexample this is an example processing instruction tag ?>
+<!DOCTYPE example SYSTEM '*xml-example-external-url*' [
+   <!ELEMENT item1 (item2* | (item3+ , item4))>
+   <!ELEMENT item2 ANY>
+   <!ELEMENT item3 (#PCDATA)>
+   <!ELEMENT item4 (#PCDATA)>
+   <!ATTLIST item1
+      att1 CDATA #FIXED 'att1-default'
+      att2 ID #REQUIRED
+      att3 ( one | two | three ) 'one'
+      att4 NOTATION ( four | five ) 'four' >
+   <!ENTITY % param1 'text'>
+   <!ENTITY nentity SYSTEM 'null' NDATA somedata>
+   <!NOTATION notation SYSTEM 'notation-processor'>
+   ]>
+<item1 att2='1'><item3>&ext1;</item3></item1>")
+
+(pprint (parse-xml *xml-example-string* :external-callback 'example-callback))
+
+-->
+
+((:xml :version "1.0" :encoding "utf-8")
+  (:comment " the following XML input is well-formed but may or may not be valid +")
+  (:pi :piexample "this is an example processing instruction tag ")
+  (:DOCTYPE :example
+    (:[ (:ELEMENT :item1 (:choice (:* :item2) (:seq (:+ :item3) :item4)))
+        (:ELEMENT :item2 :ANY)
+        (:ELEMENT :item3 :PCDATA) (:ELEMENT :item4 +:PCDATA)
+        (:ATTLIST item1 (att1 :CDATA :FIXED +"att1-default") (att2 :ID :REQUIRED)
+             (att3 +(:enumeration :one :two :three) "one")
+             (att4 (:NOTATION +:four :five) "four"))
+        (:ENTITY :param1 :param "text")
+        (:ENTITY :nentity :SYSTEM "null" +:NDATA :somedata)
+        (:NOTATION :notation :SYSTEM +"notation-processor"))
+    (:external (:ENTITY :ext1 "this is some external entity +text")))
+   ((item1 att1 "att1-default" att2 "1" att3 "one" +att4 "four")
+       (item3 "this is some external entity +text")))
+
+
+Usage Notes
+
+

    +
  1. The parse-xml function has been primarily compiled and tested in a +modern ACL. However, in an ANSI Lisp with wide character support, it DOES pass the valid +component of the conformance suite in the same manner as it does in a Modern Lisp. The +parser's successful operation in all potential situations depends on wide character support. +

    +
  2. +
  3. The parser uses the keyword package for DTD tokens and other +special XML tokens. Since element and attribute token symbols are usually interned +in the current package, it is not recommended to execute parse-xml +when the current package is the keyword package. +

    +
  4. +
  5. The XML parser supports the XML Namespaces specification. The +parser recognizes a "xmlns" attribute and attribute names starting with +"xmlns:". +As per the specification, the parser expects that the associated value +is an URI string. The parser then associates XML Namespace prefixes with a +Lisp package provided via the parse-xml :uri-to-package option or, if +necessary, a package created on the fly. The following example demonstrates +this behavior:
    + +

    (setf *xml-example-string4*
    +   "<bibliography
    +      xmlns:bib='http://www.bibliography.org/XML/bib.ns'
    +      xmlns='urn:com:books-r-us'>
    +   <bib:book owner='Smith'>
    +      <bib:title>A Tale of Two Cities</bib:title>
    +      <bib:bibliography
    +         xmlns:bib='http://www.franz.com/XML/bib.ns'
    +         xmlns='urn:com:books-r-us'>
    +      <bib:library branch='Main'>UK +Library</bib:library>
    +      <bib:date calendar='Julian'>1999</bib:date>
    +      </bib:bibliography>
    +   <bib:date calendar='Julian'>1999</bib:date>
    +   </bib:book>
    +</bibliography>")
    +
    +(setf *uri-to-package* nil)
    +(setf *uri-to-package*
    +   (acons (parse-uri "http://www.bibliography.org/XML/bib.ns")
    +      (make-package "bib") *uri-to-package*))
    +(setf *uri-to-package*
    +   (acons (parse-uri "urn:com:books-r-us")
    +      (make-package "royal") *uri-to-package*))
    +(setf *uri-to-package*
    +   (acons (parse-uri "http://www.franz.com/XML/bib.ns")
    +      (make-package "franz-ns") *uri-to-package*))
    +(pprint (multiple-value-list
    +             (parse-xml +*xml-example-string4*
    +                  :uri-to-package +*uri-to-package*)))
    +
    +-->
    +((((bibliography |xmlns:bib| "http://www.bibliography.org/XML/bib.ns"
    +     xmlns "urn:com:books-r-us")
    +    "
    +    "
    +   ((bib::book royal::owner "Smith") "
    +        " (bib::title "A Tale of Two +Cities") "
    +        "
    +    ((bib::bibliography royal::|xmlns:bib|
    +      "http://www.franz.com/XML/bib.ns" royal::xmlns
    +      "urn:com:books-r-us")
    +     "
    +         " ((franz-ns::library royal::branch +"Main") "UK Library") "
    +         " ((franz-ns::date royal::calendar +"Julian") "1999") "
    +         ")
    +     "
    +         " ((bib::date royal::calendar +"Julian") "1999") "
    +         ")
    +     "
    +         "))
    +((#<uri http://www.franz.com/XML/bib.ns> . #<The franz-ns package>)
    +  (#<uri urn:com:books-r-us> . #<The royal package>)
    +  (#<uri http://www.bibliography.org/XML/bib.ns> . #<The bib package>)))
    +
    +

  6. +
  7. In the absence of XML Namespace attributes, element and attribute symbols are interned +in the current package. Note that this implies that attributes and elements referenced +in DTD content will be interned in the current package. +
  8. +
  9. The parse-xml function has been tested using the OASIS conformance test suite (see +details below). The test suite has wide coverage across possible XML and DTD syntax, +but there may be some syntax paths that have not yet been tested or completely +supported. Here is a list of currently known syntax parsing issues: +
      +
    • ACL does not support 4 byte Unicode scalar values, so +input containing such data +will not be processed correctly. (Note, however, that parse-xml does correctly detect +and process wide Unicode input.) +
    • +
    • The OASIS tests that contain wide Unicode all use a +little-endian encoded Unicode. +Changes to the unicode-check function are required to also support big-endian encoded +Unicode. (Note also that this issue may be resolved by an ACL 6.0 final release change.) +
    • +
    • An initial <?xml declaration in external entity files is skipped without a check +being made to see if the <?xml declaration is itself incorrect. +
    • +
    +
  10. +
  11. When investigating possible parser errors or examining more closely +where the parser +determined that the input was non-well-formed, the net.xml.parser internal symbols +*debug-xml* and *debug-dtd* are useful. When not bound to nil, these variables cause +lexical analysis and intermediate parsing results to be output to *standard-output*. +
  12. +
  13. It is necessary to load the pxml module before using it. +Typically this can be done by evaluating (require :pxml). +
  14. +
+XML Conformance Test Suite
+
+Using the OASIS test suite (http://www.oasis-open.org), +here are the current parse-xml results:
+
+xmltest/invalid:    Not tested, since parse-xml is a non-validating parser
+
+not-wf/
+
+    ext.sa: 3 tests; all pass
+    not-sa: 8 tests; all pass
+    sa: 186 tests; the following fail:
+
+        170.xml: fails because ACL does not support 4 +byte Unicode scalar values
+
+valid/
+
+    ext-sa: 14 tests; all pass
+    not-sa: 31 tests; all pass
+    sa: 119 tests: the following fail:
+
+        052.xml, 064.xml, 089.xml: fails because ACL +does not support 4 byte
+                    +Unicode scalar values
+
+Compiling and Loading
+
+Load build.cl into a modern ACL session will result in a pxml.fasl file that can +subsequently be
+loaded in a modern ACL to provide XML parsing functionality.
+
+-------------------------------------------------------------------------------------------
+
+parse-xml reference
+
+parse-xml            [Generic +function]
+
+Arguments: input-source &key external-callback content-only
+            general-entities +parameter-entities
+            uri-to-package
+
+Returns multiple values:
+
    +
  1. LXML and parsed DTD output, as described above.
  2. +
  3. An association list containing the uri-to-package argument conses (if any) +and conses associated with any XML Namespace packages created during the +parse (see uri-to-package argument description, below).
  4. +
+The external-callback argument, if specified, is a function object or symbol +that parse-xml will execute when encountering an external DTD subset +or external entity DTD declaration. Here is an example which shows that +arguments the function should expect, and the value it should return: +
+(defun file-callback (uri-object token &optional public)
+  ;; The uri-object is an ACL URI object created from
+  ;; the XML input. In this example, this function
+  ;; assumes that all uri's will be file specifications.
+  ;;
+  ;; The token argument identifies what token is associated
+  ;; with the external parse (for example :DOCTYPE for external
+  ;; DTD subset
+  ;;
+  ;; The public argument contains the associated PUBLIC string,
+  ;; when present
+  ;;
+  (declare (ignorable token public))
+  ;; An open stream is returned on success,
+  ;; a nil return value indicates that the external
+  ;; parse should not occur.
+  ;; Note that parse-xml will close the open stream before exiting.
+  (ignore-errors (open (uri-path uri-object))))
+
+

+The general-entities argument is an association list containing general entity symbol +and replacement text pairs. The entity symbols should be in the keyword package. +Note that this option may be useful in generating desirable parse results in +situations where you do not wish to parse external entities or the external DTD subset. +

+The parameter-entities argument is an association list containing parameter entity symbol +and replacement text pairs. The entity symbols should be in the keyword package. +Note that this option may be useful in generating desirable parse results in +situations where you do not wish to parse external entities or the external DTD subset. +

+The uri-to-package argument is an association list containing uri objects and package +objects. Typically, the uri objects correspond to XML Namespace attribute values, and +the package objects correspond to the desired package for interning symbols associated +with the uri namespace. If the parser encounters an uri object not contained in this list, +it will generate a new package. The first generated package will be named +net.xml.namespace.0, +the second will be named net.xml.namespace.1, and so on. +

parse-xml methods

+
+(parse-xml (p stream) &key
+                      external-callback content-only
+                      general-entities
+                      parameter-entities
+                      uri-to-package)
+
+(parse-xml (str string) &key
+                        external-callback content-only
+                        general-entities
+                        parameter-entities
+                        uri-to-package)
+
+An easy way to parse a file containing XML input: +
+(with-open-file (p "example.xml")
+  (parse-xml p :content-only p))
+
+

net.xml.parser unexported special variables:

+

+*debug-xml*
+
+When true, parse-xml generates XML lexical state and intermediary +parse result debugging output. +

+*debug-dtd*
+
+When true, parse-xml generates DTD lexical state and intermediary +parse result debugging output. + + diff --git a/pxml.html b/pxml.html new file mode 100644 index 0000000..2cf26d5 --- /dev/null +++ b/pxml.html @@ -0,0 +1,387 @@ + + + +A Lisp Based XML Parser + + + + + +

A Lisp Based XML Parser

+ +

Introduction/Simple Example
+LXML parse output format
+parse-xml non-validating parser properties
+case and international character support issues
+parse-xml and packages
+parse-xml, the XML Namespace specification, and packages
+ACL does not support Unicode 4 byte scalar values
+only little-endian Unicode tested in ACL 6.0 beta
+debugging aids
+XML Conformance test results
+Compiling and Loading the parser
+parse-xml reference

+ +

The parse-xml generic function processes XML +input, returning a list of XML tags,
+attributes, and text. Here is a simple example:
+
+(parse-xml "<item1><item2 att1='one'/>this is some +text</item1>")
+
+-->
+
+((item1 ((item2 att1 "one")) "this is some text"))
+
+The output format is known as LXML format.
+
+LXML Format
+
+LXML is a list representation of XML tags and content.
+
+Each list member may be:
+
+a. a string containing text content, such as "Here is some text with a "
+
+b. a list representing a XML tag with associated attributes and/or content, +such as ('item1 "text") or (('item1 :att1 "help.html") +"link"). If the XML tag +does not have associated attributes, then the first list member will be a +symbol representing the XML tag, and the other elements will +represent the content, which can be a string (text content), a symbol (XML +tag with no attributes or content), or list (nested XML tag with +associated attributes and/or content). If there are associated attributes, +then the first list member will be a list containing a symbol +followed by two list members for each associated attribute; the first member is a +symbol representing the attribute, and the next member is a string corresponding +to the attribute value.
+
+c. XML comments and or processing instructions - see the more detailed example below for +further information.

+ +

Non Validating Parser Properties

+ +

Parse-xml is a non-validating XML parser. It will detect non-well-formed XML input. +When
+processing valid XML input, parse-xml will optionally produce the same output as a +validating
+parser would, including the processing of an external DTD subset and external entity +declarations.
+
+By default, parse-xml outputs a DTD parse along with the parsed XML contents. The DTD +parse may
+be optionally suppressed. The following example shows DTD parsed output components:

+ +

(defvar *xml-example-external-url*
+   "<!ENTITY ext1 'this is some external entity %param1;'>")
+
+(defun example-callback (var-name token &optional public)
+  (declare (ignorable token public))
+  (setf var-name (uri-path var-name))
+  (if* (equal var-name "null") then nil
+    else
+      (let ((string (eval (intern var-name (find-package +:user)))))
+      (make-string-input-stream string))))
+
+(defvar *xml-example-string*
+"<?xml version='1.0' encoding='utf-8'?>
+<!-- the following XML input is well-formed but its validity has not been checked ... +-->
+<?piexample this is an example processing instruction tag ?>
+<!DOCTYPE example SYSTEM '*xml-example-external-url*' [
+   <!ELEMENT item1 (item2* | (item3+ , item4))>
+   <!ELEMENT item2 ANY>
+   <!ELEMENT item3 (#PCDATA)>
+   <!ELEMENT item4 (#PCDATA)>
+   <!ATTLIST item1
+      att1 CDATA #FIXED 'att1-default'
+      att2 ID #REQUIRED
+      att3 ( one | two | three ) 'one'
+      att4 NOTATION ( four | five ) 'four' >
+   <!ENTITY % param1 'text'>
+   <!ENTITY nentity SYSTEM 'null' NDATA somedata>
+   <!NOTATION notation SYSTEM 'notation-processor'>
+   ]>
+<item1 att2='1'><item3>&ext1;</item3></item1>")
+
+(pprint (parse-xml *xml-example-string* :external-callback 'example-callback))
+
+-->
+
+((:xml :version "1.0" :encoding "utf-8")
+  (:comment " the following XML input is well-formed but may or may not be valid +")
+  (:pi :piexample "this is an example processing instruction tag ")
+  (:DOCTYPE :example
+    (:[ (:ELEMENT :item1 (:choice (:* :item2) (:seq (:+ :item3) :item4)))
+        (:ELEMENT :item2 :ANY)
+        (:ELEMENT :item3 :PCDATA) (:ELEMENT :item4 +:PCDATA)
+        (:ATTLIST item1 (att1 :CDATA :FIXED +"att1-default") (att2 :ID :REQUIRED)
+             (att3 +(:enumeration :one :two :three) "one")
+             (att4 (:NOTATION +:four :five) "four"))
+        (:ENTITY :param1 :param "text")
+        (:ENTITY :nentity :SYSTEM "null" +:NDATA :somedata)
+        (:NOTATION :notation :SYSTEM +"notation-processor"))
+    (:external (:ENTITY :ext1 "this is some external entity +text")))
+   ((item1 att1 "att1-default" att2 "1" att3 "one" +att4 "four")
+       (item3 "this is some external entity +text")))
+
+
+Usage Notes
+
+

    +
  1. The parse-xml function has been primarily compiled and tested in a +modern ACL. However, in an ANSI Lisp with wide character support, it DOES pass the valid +component of the conformance suite in the same manner as it does in a Modern Lisp. The +parser's successful operation in all potential situations depends on wide character support. +

    +
  2. +
  3. The parser uses the keyword package for DTD tokens and other +special XML tokens. Since element and attribute token symbols are usually interned +in the current package, it is not recommended to execute parse-xml +when the current package is the keyword package. +

    +
  4. +
  5. The XML parser supports the XML Namespaces specification. The +parser recognizes a "xmlns" attribute and attribute names starting with +"xmlns:". +As per the specification, the parser expects that the associated value +is an URI string. The parser then associates XML Namespace prefixes with a +Lisp package provided via the parse-xml :uri-to-package option or, if +necessary, a package created on the fly. The following example demonstrates +this behavior:
    + +

    (setf *xml-example-string4*
    +   "<bibliography
    +      xmlns:bib='http://www.bibliography.org/XML/bib.ns'
    +      xmlns='urn:com:books-r-us'>
    +   <bib:book owner='Smith'>
    +      <bib:title>A Tale of Two Cities</bib:title>
    +      <bib:bibliography
    +         xmlns:bib='http://www.franz.com/XML/bib.ns'
    +         xmlns='urn:com:books-r-us'>
    +      <bib:library branch='Main'>UK +Library</bib:library>
    +      <bib:date calendar='Julian'>1999</bib:date>
    +      </bib:bibliography>
    +   <bib:date calendar='Julian'>1999</bib:date>
    +   </bib:book>
    +</bibliography>")
    +
    +(setf *uri-to-package* nil)
    +(setf *uri-to-package*
    +   (acons (parse-uri "http://www.bibliography.org/XML/bib.ns")
    +      (make-package "bib") *uri-to-package*))
    +(setf *uri-to-package*
    +   (acons (parse-uri "urn:com:books-r-us")
    +      (make-package "royal") *uri-to-package*))
    +(setf *uri-to-package*
    +   (acons (parse-uri "http://www.franz.com/XML/bib.ns")
    +      (make-package "franz-ns") *uri-to-package*))
    +(pprint (multiple-value-list
    +             (parse-xml +*xml-example-string4*
    +                  :uri-to-package +*uri-to-package*)))
    +
    +-->
    +((((bibliography |xmlns:bib| "http://www.bibliography.org/XML/bib.ns"
    +     xmlns "urn:com:books-r-us")
    +    "
    +    "
    +   ((bib::book royal::owner "Smith") "
    +        " (bib::title "A Tale of Two +Cities") "
    +        "
    +    ((bib::bibliography royal::|xmlns:bib|
    +      "http://www.franz.com/XML/bib.ns" royal::xmlns
    +      "urn:com:books-r-us")
    +     "
    +         " ((franz-ns::library royal::branch +"Main") "UK Library") "
    +         " ((franz-ns::date royal::calendar +"Julian") "1999") "
    +         ")
    +     "
    +         " ((bib::date royal::calendar +"Julian") "1999") "
    +         ")
    +     "
    +         "))
    +((#<uri http://www.franz.com/XML/bib.ns> . #<The franz-ns package>)
    +  (#<uri urn:com:books-r-us> . #<The royal package>)
    +  (#<uri http://www.bibliography.org/XML/bib.ns> . #<The bib package>)))
    +
    +

  6. +
  7. In the absence of XML Namespace attributes, element and attribute symbols are interned +in the current package. Note that this implies that attributes and elements referenced +in DTD content will be interned in the current package. +
  8. +
  9. The parse-xml function has been tested using the OASIS conformance test suite (see +details below). The test suite has wide coverage across possible XML and DTD syntax, +but there may be some syntax paths that have not yet been tested or completely +supported. Here is a list of currently known syntax parsing issues: +
      +
    • ACL does not support 4 byte Unicode scalar values, so +input containing such data +will not be processed correctly. (Note, however, that parse-xml does correctly detect +and process wide Unicode input.) +
    • +
    • The OASIS tests that contain wide Unicode all use a +little-endian encoded Unicode. +Changes to the unicode-check function are required to also support big-endian encoded +Unicode. (Note also that this issue may be resolved by an ACL 6.0 final release change.) +
    • +
    • An initial <?xml declaration in external entity files is skipped without a check +being made to see if the <?xml declaration is itself incorrect. +
    • +
    +
  10. +
  11. When investigating possible parser errors or examining more closely +where the parser +determined that the input was non-well-formed, the net.xml.parser internal symbols +*debug-xml* and *debug-dtd* are useful. When not bound to nil, these variables cause +lexical analysis and intermediate parsing results to be output to *standard-output*. +
  12. +
  13. It is necessary to load the pxml module before using it. +Typically this can be done by evaluating (require :pxml). +
  14. +
+XML Conformance Test Suite
+
+Using the OASIS test suite (http://www.oasis-open.org), +here are the current parse-xml results:
+
+xmltest/invalid:    Not tested, since parse-xml is a non-validating parser
+
+not-wf/
+
+    ext.sa: 3 tests; all pass
+    not-sa: 8 tests; all pass
+    sa: 186 tests; the following fail:
+
+        170.xml: fails because ACL does not support 4 +byte Unicode scalar values
+
+valid/
+
+    ext-sa: 14 tests; all pass
+    not-sa: 31 tests; all pass
+    sa: 119 tests: the following fail:
+
+        052.xml, 064.xml, 089.xml: fails because ACL +does not support 4 byte
+                    +Unicode scalar values
+
+Compiling and Loading
+
+Load build.cl into a modern ACL session will result in a pxml.fasl file that can +subsequently be
+loaded in a modern ACL to provide XML parsing functionality.
+
+-------------------------------------------------------------------------------------------
+
+parse-xml reference
+
+parse-xml            [Generic +function]
+
+Arguments: input-source &key external-callback content-only
+            general-entities +parameter-entities
+            uri-to-package
+
+Returns multiple values:
+
    +
  1. LXML and parsed DTD output, as described above.
  2. +
  3. An association list containing the uri-to-package argument conses (if any) +and conses associated with any XML Namespace packages created during the +parse (see uri-to-package argument description, below).
  4. +
+The external-callback argument, if specified, is a function object or symbol +that parse-xml will execute when encountering an external DTD subset +or external entity DTD declaration. Here is an example which shows that +arguments the function should expect, and the value it should return: +
+(defun file-callback (uri-object token &optional public)
+  ;; The uri-object is an ACL URI object created from
+  ;; the XML input. In this example, this function
+  ;; assumes that all uri's will be file specifications.
+  ;;
+  ;; The token argument identifies what token is associated
+  ;; with the external parse (for example :DOCTYPE for external
+  ;; DTD subset
+  ;;
+  ;; The public argument contains the associated PUBLIC string,
+  ;; when present
+  ;;
+  (declare (ignorable token public))
+  ;; An open stream is returned on success,
+  ;; a nil return value indicates that the external
+  ;; parse should not occur.
+  ;; Note that parse-xml will close the open stream before exiting.
+  (ignore-errors (open (uri-path uri-object))))
+
+

+The general-entities argument is an association list containing general entity symbol +and replacement text pairs. The entity symbols should be in the keyword package. +Note that this option may be useful in generating desirable parse results in +situations where you do not wish to parse external entities or the external DTD subset. +

+The parameter-entities argument is an association list containing parameter entity symbol +and replacement text pairs. The entity symbols should be in the keyword package. +Note that this option may be useful in generating desirable parse results in +situations where you do not wish to parse external entities or the external DTD subset. +

+The uri-to-package argument is an association list containing uri objects and package +objects. Typically, the uri objects correspond to XML Namespace attribute values, and +the package objects correspond to the desired package for interning symbols associated +with the uri namespace. If the parser encounters an uri object not contained in this list, +it will generate a new package. The first generated package will be named +net.xml.namespace.0, +the second will be named net.xml.namespace.1, and so on. +

parse-xml methods

+
+(parse-xml (p stream) &key
+                      external-callback content-only
+                      general-entities
+                      parameter-entities
+                      uri-to-package)
+
+(parse-xml (str string) &key
+                        external-callback content-only
+                        general-entities
+                        parameter-entities
+                        uri-to-package)
+
+An easy way to parse a file containing XML input: +
+(with-open-file (p "example.xml")
+  (parse-xml p :content-only p))
+
+

net.xml.parser unexported special variables:

+

+*debug-xml*
+
+When true, parse-xml generates XML lexical state and intermediary +parse result debugging output. +

+*debug-dtd*
+
+When true, parse-xml generates DTD lexical state and intermediary +parse result debugging output. + + diff --git a/pxml.txt b/pxml.txt new file mode 100644 index 0000000..520cf2b --- /dev/null +++ b/pxml.txt @@ -0,0 +1,345 @@ +Description + +The parse-xml function processes XML input, returning a list of XML tags, +attributes, and text. Here is a simple example: + +(parse-xml "this is some text") + +--> + +((item1 ((item2 att1 "one")) "this is some text")) + +The output format is known as LXML format. + +Here is a description of LXML: + +LXML is a list representation of XML tags and content. + +Each list member may be: + +a. a string containing text content, such as "Here is some text with a " + +b. a list representing a XML tag with associated attributes and/or content, + such as ('item1 "text") or (('item1 :att1 "help.html") "link"). If the XML tag + does not have associated attributes, then the first list member will be a + symbol representing the XML tag, and the other elements will + represent the content, which can be a string (text content), a symbol (XML + tag with no attributes or content), or list (nested XML tag with + associated attributes and/or content). If there are associated attributes, + then the first list member will be a list containing a symbol + followed by two list members for each associated attribute; the first member is a + symbol representing the attribute, and the next member is a string corresponding + to the attribute value. + +c. XML comments and or processing instructions - see the more detailed example below for + further information. + +Parse-xml is a non-validating XML parser. It will detect non-well-formed XML input. When +processing valid XML input, parse-xml will optionally produce the same output as a validating +parser would, including the processing of an external DTD subset and external entity declarations. + +By default, parse-xml outputs a DTD parse along with the parsed XML contents. The DTD parse may +be optionally suppressed. The following example shows DTD parsed output components: + +(defvar *xml-example-external-url* + "") + +(defun example-callback (var-name token &optional public) + (declare (ignorable token public)) + (setf var-name (uri-path var-name)) + (if* (equal var-name "null") then nil + else + (let ((string (eval (intern var-name (find-package :user))))) + (make-string-input-stream string)))) + +(defvar *xml-example-string* + " + + + + + + + + + + +]> +&ext1;") + +(pprint (parse-xml *xml-example-string* :external-callback 'example-callback)) + +--> + +((:xml :version "1.0" :encoding "utf-8") + (:comment " the following XML input is well-formed but may or may not be valid ") + (:pi :piexample "this is an example processing instruction tag ") + (:DOCTYPE :example + (:[ (:ELEMENT :item1 (:choice (:* :item2) (:seq (:+ :item3) :item4))) + (:ELEMENT :item2 :ANY) + (:ELEMENT :item3 :PCDATA) (:ELEMENT :item4 :PCDATA) + (:ATTLIST item1 (att1 :CDATA :FIXED "att1-default") (att2 :ID :REQUIRED) + (att3 (:enumeration :one :two :three) "one") + (att4 (:NOTATION :four :five) "four")) + (:ENTITY :param1 :param "text") + (:ENTITY :nentity :SYSTEM "null" :NDATA :somedata) + (:NOTATION :notation :SYSTEM "notation-processor")) + (:external (:ENTITY :ext1 "this is some external entity text"))) + ((item1 att1 "att1-default" att2 "1" att3 "one" att4 "four") + (item3 "this is some external entity text"))) + + +Usage Notes: + +1. The parse-xml function has been compiled and tested only in a + modern ACL. Its successful operation depends on both the mixed + case support and wide character support found in modern ACL. + +2. The parser uses the keyword package for DTD tokens and other + special XML tokens. Since element and attribute token symbols are usually interned + in the current package, it is not recommended to execute parse-xml + when the current package is the keyword package. + +3. The XML parser supports the XML Namespaces specification. The parser + recognizes a "xmlns" attribute and attribute names starting with "xmlns:". + As per the specification, the parser expects that the associated value + is an URI string. The parser then associates XML Namespace prefixes with a + Lisp package provided via the parse-xml :uri-to-package option or, if + necessary, a package created on the fly. The following example demonstrates + this behavior: + + (setf *xml-example-string4* + " + + A Tale of Two Cities + + UK Library + 1999 + + 1999 + + ") + + (setf *uri-to-package* nil) + (setf *uri-to-package* + (acons (parse-uri "http://www.bibliography.org/XML/bib.ns") + (make-package "bib") *uri-to-package*)) + (setf *uri-to-package* + (acons (parse-uri "urn:royal-mail.gov.uk/XML/ns/postal.ns,1999") + (make-package "royal") *uri-to-package*)) + (setf *uri-to-package* + (acons (parse-uri "http://www.franz.com/XML/bib.ns") + (make-package "franz-ns") *uri-to-package*)) + (pprint (multiple-value-list + (parse-xml *xml-example-string4* + :uri-to-package *uri-to-package*))) + +--> + +((((bibliography |xmlns:bib| "http://www.bibliography.org/XML/bib.ns" xmlns + "urn:royal-mail.gov.uk/XML/ns/postal.ns,1999") + " + " + ((bib::book royal::owner "Smith") " + " (bib::title "A Tale of Two Cities") " + " + ((bib::bibliography royal::|xmlns:bib| "http://www.franz.com/XML/bib.ns" royal::xmlns + "urn:royal-mail2.gov.uk/XML/ns/postal.ns,1999") + " + " ((franz-ns::library net.xml.namespace.0::branch "Main") "UK Library") " + " ((franz-ns::date net.xml.namespace.0::calendar "Julian") "1999") " + ") + " + " ((bib::date royal::calendar "Julian") "1999") " + ") + " + ")) + ((# . #) + (# . #) + (# . #) + (# . #))) + + In the absence of XML Namespace attributes, element and attribute symbols are interned + in the current package. Note that this implies that attributes and elements referenced + in DTD content will be interned in the current package. + +4. The ACL 6.0 beta does not contain a little-endian Unicode external format. To + process XML input containing Unicode characters correctly: + + a. Place the following in a file called ef-fat-little.cl in the ACL code + directory: + +(provide :ef-fat-little) + +(in-package :excl) + +(def-external-format :fat-little-base + :size 2) + +(def-char-to-octets-macro :fat-little-base (char + state + &key put-next-octet external-format) + (declare (ignore external-format state)) + `(let ((code (char-code ,char))) + (,put-next-octet (ldb (byte 8 0) code)) + (,put-next-octet (ldb (byte 8 8) code)))) + +(def-octets-to-char-macro :fat-little-base (state-loc + &key get-next-octet external-format + octets-count-loc unget-octets) + (declare (ignore external-format state-loc unget-octets)) + `(let ((lo ,get-next-octet) + (hi (progn (incf ,octets-count-loc) + ,get-next-octet))) + (code-char (+ (ash hi 8) lo)))) + +(create-newline-ef :name :fat-little :base-name :fat-little-base + :nicknames '(:unicode-little)) + + + b. Compile the file using a modern ACL. + +5. The parse-xml function has been tested using the OASIS conformance test suite (see + details below). The test suite has wide coverage across possible XML and DTD syntax, + but there may be some syntax paths that have not yet been tested or completely + supported. Here is a list of currently known syntax parsing issues: + + a. ACL does not support 4 byte Unicode scalar values, so input containing such data + will not be processed correctly. (Note, however, that parse-xml does correctly detect + and process wide Unicode input.) + + b. The OASIS tests that contain wide Unicode all use a little-endian encoded Unicode. + Changes to the unicode-check function are required to also support big-endian encoded + Unicode. (Note also that this issue may be resolved by an ACL 6.0 final release change.) + + c. An initial colon-index -1) then + (let ((string1 (make-string colon-index)) + new-package string2) + (dotimes (i colon-index) + (setf (schar string1 i) (schar data i))) + (setf new-package (assoc string1 ns-to-package :test 'string=)) + (if* new-package + then + (setf string2 (make-string (- (collector-next coll) + (+ 1 colon-index)))) + (dotimes (i (- (collector-next coll) + (+ 1 colon-index))) + (setf (schar string2 i) + (schar data (+ colon-index 1 i)))) + (excl::intern string2 (rest new-package)) + else + (excl::intern* (collector-data coll) + (collector-next coll) package))) + else + (let ((new-package (assoc :none ns-to-package))) + (when new-package + (setf package (rest new-package)))) + (excl::intern* (collector-data coll) + (collector-next coll) package))) + )) + )) + +(defun compute-coll-string (coll) + (declare (optimize (speed 3) (safety 1))) + ;; return the string that's in the collection + (let ((str (make-string (collector-next coll))) + (from (collector-data coll))) + (dotimes (i (collector-next coll)) + (setf (schar str i) (schar from i))) + + str)) + +(defun grow-and-add (coll ch) + (declare (optimize (speed 3) (safety 1))) + ;; increase the size of the data portion of the collector and then + ;; add the given char at the end + (let* ((odata (collector-data coll)) + (ndata (make-string (* 2 (length odata))))) + (dotimes (i (length odata)) + (setf (schar ndata i) (schar odata i))) + (setf (collector-data coll) ndata) + (setf (collector-max coll) (length ndata)) + (let ((next (collector-next coll))) + (setf (schar ndata next) ch) + (setf (collector-next coll) (1+ next))))) + +(defun put-back-tokenbuf (buf) + (declare (optimize (speed 3) (safety 1))) + (mp::without-scheduling + (do ((bufs *tokenbufs* (cdr bufs))) + ((null bufs) + ; toss it away + nil) + (if* (null (car bufs)) + then (setf (car bufs) buf) + (return))))) + +(defun get-collector () + (declare (optimize (speed 3) (safety 1))) + (let (col) + (mp::without-scheduling + (do* ((cols *collectors* (cdr cols)) + (this (car cols) (car cols))) + ((null cols)) + (if* this + then (setf (car cols) nil) + (setq col this) + (return)))) + (if* col + then (setf (collector-next col) 0) + col + else (make-collector + :next 0 + :max 100 + :data (make-string 100))))) + +(defmacro next-char (tokenbuf read-sequence-func) + `(let ((cur (tokenbuf-cur ,tokenbuf)) + (tb (tokenbuf-data ,tokenbuf))) + (if* (>= cur (tokenbuf-max ,tokenbuf)) + then ;; fill buffer + (if* (or (not (tokenbuf-stream ,tokenbuf)) + (zerop (setf (tokenbuf-max ,tokenbuf) + (if* ,read-sequence-func + then (funcall ,read-sequence-func tb + (tokenbuf-stream ,tokenbuf)) + else (read-sequence tb (tokenbuf-stream ,tokenbuf)))))) + then (setq cur nil) ;; eof + else (setq cur 0))) + (if* cur + then (prog1 + (let ((cc (schar tb cur))) + (if (and (tokenbuf-stream ,tokenbuf) (eq #\return cc)) #\newline cc)) + (setf (tokenbuf-cur ,tokenbuf) (1+ cur)))))) + +(defun get-next-char (iostruct) + (declare (optimize (speed 3) (safety 1))) + (let* (from-stream (tmp-char + (let (char) + (if* (iostruct-unget-char iostruct) then + ;; from-stream is used to do input CR/LF normalization + (setf from-stream t) + (setf char (first (iostruct-unget-char iostruct))) + (setf (iostruct-unget-char iostruct) (rest (iostruct-unget-char iostruct))) + char + elseif (iostruct-entity-bufs iostruct) then + (let (entity-buf) + (loop + (setf entity-buf (first (iostruct-entity-bufs iostruct))) + (if* (streamp (tokenbuf-stream entity-buf)) + then (setf from-stream t) + else (setf from-stream nil)) + (setf char (next-char entity-buf (iostruct-read-sequence-func iostruct))) + (when char (return)) + (when (streamp (tokenbuf-stream entity-buf)) + (close (tokenbuf-stream entity-buf)) + (put-back-tokenbuf entity-buf)) + (setf (iostruct-entity-bufs iostruct) (rest (iostruct-entity-bufs iostruct))) + (setf (iostruct-entity-names iostruct) (rest (iostruct-entity-names iostruct))) + (when (not (iostruct-entity-bufs iostruct)) (return)))) + (if* char then char + else (next-char (iostruct-tokenbuf iostruct) + (iostruct-read-sequence-func iostruct))) + else (setf from-stream t) + (next-char (iostruct-tokenbuf iostruct) + (iostruct-read-sequence-func iostruct)))))) + (if* (and from-stream (eq tmp-char #\return)) then #\newline else tmp-char))) + +(defun unicode-check (p tokenbuf) + (declare (ignorable tokenbuf) (optimize (speed 3) (safety 1))) + ;; need no-OO check because external format support isn't completely done yet + (when (not (typep p 'string-input-simple-stream)) + #+(version>= 6 0 pre-final 1) + (let ((format (ignore-errors (excl:sniff-for-unicode p)))) + (if* (eq format (find-external-format :unicode)) + then + (setf (stream-external-format p) format) + else + (setf (stream-external-format p) (find-external-format :utf8)))) + #-(version>= 6 0 pre-final 1) + (let* ((c (read-char p nil)) c2 + (c-code (if c (char-code c) nil))) + (if* (eq #xFF c-code) then + (setf c2 (read-char p nil)) + (setf c-code (if c (char-code c2) nil)) + (if* (eq #xFE c-code) then + (format t "set unicode~%") + (setf (stream-external-format p) + (find-external-format #+(version>= 6 0 pre-final 1) :unicode + #-(version>= 6 0 pre-final 1) :fat-little)) + else + (xml-error "stream has incomplete Unicode marker")) + else (setf (stream-external-format p) + (find-external-format :utf8)) + (when c + (push c (iostruct-unget-char tokenbuf)) + #+ignore (unread-char c p) ;; bug when there is single ^M in file + ))))) + +(defun add-default-values (val attlist-data) + (declare (ignorable old-coll) (optimize (speed 3) (safety 1))) + (if* (symbolp val) + then + (let* ((tag-defaults (assoc val attlist-data)) defaults) + (dolist (def (rest tag-defaults)) + (if* (stringp (third def)) then + (push (first def) defaults) + (push (if (eq (second def) :CDATA) (third def) + (normalize-attrib-value (third def))) defaults) + elseif (and (eq (third def) :FIXED) (stringp (fourth def))) then + (push (first def) defaults) + (push (if (eq (second def) :CDATA) (fourth def) + (normalize-attrib-value (fourth def))) defaults) + )) + (if* defaults then + (setf val (append (list val) (nreverse defaults))) + else val) + ) + else + ;; first make sure there are no errors in given list + (let ((pairs (rest val))) + (loop + (when (null pairs) (return)) + (let ((this-one (first pairs))) + (setf pairs (rest (rest pairs))) + (when (member this-one pairs) + (xml-error (concatenate 'string "Entity: " + (string (first val)) + " has multiple " + (string this-one) + " attribute values")))))) + (let ((tag-defaults (assoc (first val) attlist-data)) defaults) + (dolist (def (rest tag-defaults)) + (let ((old (member (first def) (rest val)))) + (if* (not old) then + (if* (stringp (third def)) then + (push (first def) defaults) + (push (third def) defaults) + elseif (and (eq (third def) :FIXED) (stringp (fourth def))) then + (push (first def) defaults) + (push (fourth def) defaults)) + else + (push (first old) defaults) + (push (second old) defaults)))) + (if* defaults then + ;; now look for attributes in original list that weren't in dtd + (let ((tmp-val (rest val)) att att-val) + (loop + (when (null tmp-val) (return)) + (setf att (first tmp-val)) + (setf att-val (second tmp-val)) + (setf tmp-val (rest (rest tmp-val))) + (when (not (member att defaults)) + (push att defaults) + (push att-val defaults)))) + (setf val (append (list (first val)) (nreverse defaults))) + else val)) + )) + +(defun normalize-public-value (public-value) + (setf public-value (string-trim '(#\space) public-value)) + (let ((count 0) (stop (length public-value)) (last-ch nil) cch) + (loop + (when (= count stop) (return public-value)) + (setf cch (schar public-value count)) + (if* (and (eq cch #\space) (eq last-ch #\space)) then + (setf public-value + (remove #\space public-value :start count :count 1)) + (decf stop) + else (incf count) + (setf last-ch cch))))) + + +(defun normalize-attrib-value (attrib-value &optional first-pass) + (declare (optimize (speed 3) (safety 1))) + (when first-pass + (let ((count 0) (stop (length attrib-value)) (last-ch nil) cch) + (loop + (when (= count stop) (return)) + (setf cch (schar attrib-value count)) + (if* (or (eq cch #\return) (eq cch #\tab)) then (setf (schar attrib-value count) #\space) + elseif (and (eq cch #\newline) (not (eq last-ch #\return))) then + (setf (schar attrib-value count) #\space) + elseif (and (eq cch #\newline) (eq last-ch #\return)) then + (setf attrib-value + (remove #\space attrib-value :start count :count 1)) + (decf stop)) + (incf count) + (setf last-ch cch)))) + (setf attrib-value (string-trim '(#\space) attrib-value)) + (let ((count 0) (stop (length attrib-value)) (last-ch nil) cch) + (loop + (when (= count stop) (return attrib-value)) + (setf cch (schar attrib-value count)) + (if* (and (eq cch #\space) (eq last-ch #\space)) then + (setf attrib-value + (remove #\space attrib-value :start count :count 1)) + (decf stop) + else (incf count) + (setf last-ch cch))))) + +(defun check-xmldecl (val tokenbuf) + (declare (ignorable old-coll) (optimize (speed 3) (safety 1))) + (when (not (and (symbolp (second val)) (string= "version" (symbol-name (second val))))) + (xml-error "XML declaration tag does not include correct 'version' attribute")) + (when (and (fourth val) + (or (not (symbolp (fourth val))) + (and (not (string= "standalone" (symbol-name (fourth val)))) + (not (string= "encoding" (symbol-name (fourth val))))))) + (xml-error "XML declaration tag does not include correct 'encoding' or 'standalone' attribute")) + (when (and (fourth val) (string= "standalone" (symbol-name (fourth val)))) + (if* (equal (fifth val) "yes") then + (setf (iostruct-standalonep tokenbuf) t) + elseif (not (equal (fifth val) "no")) then + (xml-error "XML declaration tag does not include correct 'standalone' attribute value"))) + (dotimes (i (length (third val))) + (let ((c (schar (third val) i))) + (when (and (not (alpha-char-p c)) + (not (digit-char-p c)) + (not (member c '(#\. #\_ #\- #\:))) + ) + (xml-error "XML declaration tag does not include correct 'version' attribute value")))) + (when (and (fourth val) (eql :encoding (fourth val))) + (dotimes (i (length (fifth val))) + (let ((c (schar (fifth val) i))) + (when (and (not (alpha-char-p c)) + (if* (> i 0) then + (and (not (digit-char-p c)) + (not (member c '(#\. #\_ #\-)))) + else t)) + (xml-error "XML declaration tag does not include correct 'encoding' attribute value"))))) + ) + +(defun xml-error (text) + (declare (optimize (speed 3) (safety 1))) + (funcall 'error "~a" (concatenate 'string "XML not well-formed - " text))) diff --git a/pxml2.cl b/pxml2.cl new file mode 100644 index 0000000..27e2bf7 --- /dev/null +++ b/pxml2.cl @@ -0,0 +1,2093 @@ +;; +;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA +;; +;; This code is free software; you can redistribute it and/or +;; modify it under the terms of the version 2.1 of +;; the GNU Lesser General Public License as published by +;; the Free Software Foundation, as clarified by the AllegroServe +;; prequel found in license-allegroserve.txt. +;; +;; This code is distributed in the hope that it will be useful, +;; but without any warranty; without even the implied warranty of +;; merchantability or fitness for a particular purpose. See the GNU +;; Lesser General Public License for more details. +;; +;; Version 2.1 of the GNU Lesser General Public License is in the file +;; license-lgpl.txt that was distributed with this file. +;; If it is not present, you can access it from +;; http://www.gnu.org/copyleft/lesser.txt (until superseded by a newer +;; version) or write to the Free Software Foundation, Inc., 59 Temple Place, +;; Suite 330, Boston, MA 02111-1307 USA +;; +;; $Id: pxml2.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $ + +;; Change Log +;; +;; 10/14/00 add namespace support + +(in-package :net.xml.parser) + +(pxml-dribble-bug-hook "$Id: pxml2.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $") + +;; state titles can be better chosen and explained + +(defvar *debug-xml* nil) + +(defmethod parse-xml ((str string) &key external-callback general-entities parameter-entities + content-only uri-to-package) + (declare (optimize (speed 3) (safety 1))) + (parse-xml (make-string-input-stream str) :external-callback external-callback + :general-entities general-entities + :parameter-entities parameter-entities :content-only content-only + :uri-to-package uri-to-package)) + +(defmethod parse-xml ((p stream) &key external-callback general-entities + parameter-entities content-only uri-to-package) + (declare (optimize (speed 3) (safety 1))) + (pxml-internal0 p nil external-callback general-entities parameter-entities content-only + uri-to-package)) + +(eval-when (compile load eval) + (defconstant state-docstart 0) ;; looking for XMLdecl, Misc, doctypedecl, 1st element + (defconstant state-docstart-misc 1) ;; looking for Misc, doctypedecl, 1st element + (defconstant state-docstart-misc2 2) ;; looking for Misc, 1st element + (defconstant state-element-done 3) ;; looking for Misc + (defconstant state-element-contents 4) ;; looking for element content + ) + +(defun all-xml-whitespace-p (val) + (dotimes (i (length val) t) + (when (not (xml-space-p (elt val i))) (return nil)))) + +(defun pxml-internal0 (p read-sequence-func external-callback + general-entities parameter-entities content-only uri-to-package) + (declare (optimize (speed 3) (safety 1))) + (let ((tokenbuf (make-iostruct :tokenbuf (get-tokenbuf) + :do-entity t + :read-sequence-func read-sequence-func))) + ;; set up stream right + (setf (tokenbuf-stream (iostruct-tokenbuf tokenbuf)) p) + ;; set up user specified entities + (setf (iostruct-parameter-entities tokenbuf) parameter-entities) + (setf (iostruct-general-entities tokenbuf) general-entities) + (setf (iostruct-uri-to-package tokenbuf) uri-to-package) + ;; look for Unicode file + (unicode-check p tokenbuf) + (unwind-protect + (values (pxml-internal tokenbuf external-callback content-only) + (iostruct-uri-to-package tokenbuf)) + (dolist (entity-buf (iostruct-entity-bufs tokenbuf)) + (when (streamp (tokenbuf-stream entity-buf)) + (close (tokenbuf-stream entity-buf)) + (put-back-tokenbuf entity-buf)))) + )) + +(defun pxml-internal (tokenbuf external-callback content-only) + (declare (optimize (speed 3) (safety 1))) + (let ((state state-docstart) + (guts) + (pending) + (attlist-data) + (public-string) + (system-string) + (entity-open-tags) + ) + + (loop + (multiple-value-bind (val kind kind2) + (next-token tokenbuf external-callback attlist-data) + (when *debug-xml* + (format t "val: ~s kind: ~s kind2: ~s state: ~s~%" val kind kind2 state)) + (case state + (#.state-docstart + (if* (and (listp val) (eq :xml (first val)) (eq kind :xml) (eq kind2 :end-tag)) + then + (check-xmldecl val tokenbuf) + (when (not content-only) (push val guts)) + (setf state state-docstart-misc) + elseif (eq kind :comment) + then + (when (not content-only) (push val guts)) + (setf state state-docstart-misc) + elseif (and (listp val) (eq :DOCTYPE (first val))) + then + (if* (eq (third val) :SYSTEM) then + (setf system-string (fourth val)) + (setf val (remove (third val) val)) + (setf val (remove (third val) val)) + elseif (eq (third val) :PUBLIC) then + (setf public-string (normalize-public-value (fourth val))) + (setf system-string (fifth val)) + (setf val (remove (third val) val)) + (setf val (remove (third val) val)) + (setf val (remove (third val) val))) + (when system-string + (if* external-callback then + (let ((ext-stream (apply external-callback + (list (parse-uri system-string) + :DOCTYPE + public-string + )))) + (when ext-stream + (let (ext-io (entity-buf (get-tokenbuf))) + (setf (tokenbuf-stream entity-buf) ext-stream) + (setf ext-io (make-iostruct :tokenbuf entity-buf + :do-entity + (iostruct-do-entity tokenbuf) + :read-sequence-func + (iostruct-read-sequence-func tokenbuf))) + (unicode-check ext-stream ext-io) + (setf (iostruct-parameter-entities ext-io) + (iostruct-parameter-entities tokenbuf)) + (setf (iostruct-general-entities ext-io) + (iostruct-general-entities tokenbuf)) + (unwind-protect + (setf val (append val + (list (append + (list :external) + (parse-dtd + ext-io + t external-callback))))) + (setf (iostruct-seen-any-dtd tokenbuf) t) + (setf (iostruct-seen-external-dtd tokenbuf) t) + (setf (iostruct-seen-parameter-reference tokenbuf) + (iostruct-seen-parameter-reference ext-io)) + (setf (iostruct-general-entities tokenbuf) + (iostruct-general-entities ext-io)) + (setf (iostruct-parameter-entities tokenbuf) + (iostruct-parameter-entities ext-io)) + (setf (iostruct-do-entity tokenbuf) + (iostruct-do-entity ext-io)) + (dolist (entity-buf2 (iostruct-entity-bufs ext-io)) + (when (streamp (tokenbuf-stream entity-buf2)) + (close (tokenbuf-stream entity-buf2)) + (put-back-tokenbuf entity-buf2))) + (close (tokenbuf-stream entity-buf)) + (put-back-tokenbuf entity-buf)) + ))) + else + (setf (iostruct-do-entity tokenbuf) nil))) + (setf attlist-data + (process-attlist (rest (rest val)) attlist-data)) + (when (not content-only) (push val guts)) + (setf state state-docstart-misc2) + elseif (eq kind :pi) + then + (push val guts) + (setf state state-docstart-misc) + elseif (eq kind :pcdata) + then + (when (or (not kind2) (not (all-xml-whitespace-p val))) + (if* (not kind2) then + (xml-error "An entity reference occured where only whitespace or the first element may occur") + else + (xml-error (concatenate 'string + "unrecognized content '" + (subseq val 0 (min (length val) 40)) "'")))) + (setf state state-docstart-misc) + elseif (or (symbolp val) + (and (listp val) (symbolp (first val)))) + then + (when (eq kind :start-tag) + (setf val (add-default-values val attlist-data))) + (if* (and (eq kind :start-tag) (eq kind2 :end-tag)) + then (push (list val) guts) + (setf state state-element-done) + elseif (eq kind :start-tag) + then (push (list val) pending) + ;;(format t "pending: ~s guts: ~s <1>~%" pending guts) + (when (iostruct-entity-bufs tokenbuf) + (push (if (symbolp val) val (first val)) entity-open-tags)) + (setf state state-element-contents) + else (xml-error (concatenate 'string + "encountered token at illegal syntax position: '" + (string kind) "'" + (if* (null guts) then + " at start of contents" + else + (concatenate 'string + " following: '" + (format nil "~s" (first guts)) + "'"))))) + else + (print (list val kind kind2)) + (break "need to check for other allowable docstarts"))) + (#.state-docstart-misc2 + (if* (eq kind :pcdata) + then + (when (or (not kind2) (not (all-xml-whitespace-p val))) + (if* (not kind2) then + (xml-error "An entity reference occured where only whitespace or the first element may occur") + else + (xml-error (concatenate 'string + "unrecognized content '" + (subseq val 0 (min (length val) 40)) "'")))) + elseif (and (listp val) (eq :comment (first val))) + then + (when (not content-only) (push val guts)) + elseif (eq kind :pi) + then + (push val guts) + elseif (eq kind :eof) + then + (xml-error "unexpected end of file encountered") + elseif (or (symbolp val) + (and (listp val) (symbolp (first val)))) + then + (when (eq kind :start-tag) + (setf val (add-default-values val attlist-data))) + (if* (and (eq kind :start-tag) (eq kind2 :end-tag)) + then (push (list val) guts) + (setf state state-element-done) + elseif (eq kind :start-tag) + then (push (list val) pending) + ;;(format t "pending: ~s guts: ~s <2>~%" pending guts) + (when (iostruct-entity-bufs tokenbuf) + (push (if (symbolp val) val (first val)) entity-open-tags)) + (setf state state-element-contents) + else (xml-error (concatenate 'string + "encountered token at illegal syntax position: '" + (string kind) "'" + (if* (null guts) then + " at start of contents" + else + (concatenate 'string + " following: '" + (format nil "~s" (first guts)) + "'"))))) + else + (error "this branch unexpected <1>"))) + (#.state-docstart-misc + (if* (eq kind :pcdata) + then + (when (or (not kind2) (not (all-xml-whitespace-p val))) + (if* (not kind2) then + (xml-error "An entity reference occured where only whitespace or the first element may occur") + else + (xml-error (concatenate 'string + "unrecognized content '" + (subseq val 0 (min (length val) 40)) "'")))) + elseif (and (listp val) (eq :DOCTYPE (first val))) + then + (if* (eq (third val) :SYSTEM) then + (setf system-string (fourth val)) + (setf val (remove (third val) val)) + (setf val (remove (third val) val)) + elseif (eq (third val) :PUBLIC) then + (setf public-string (normalize-public-value (fourth val))) + (setf system-string (fifth val)) + (setf val (remove (third val) val)) + (setf val (remove (third val) val)) + (setf val (remove (third val) val))) + (when system-string + (if* external-callback then + (let ((ext-stream (apply external-callback + (list (parse-uri system-string) + :DOCTYPE + public-string + )))) + (when ext-stream + (let (ext-io (entity-buf (get-tokenbuf))) + (setf (tokenbuf-stream entity-buf) ext-stream) + (setf ext-io (make-iostruct :tokenbuf entity-buf + :do-entity + (iostruct-do-entity tokenbuf) + :read-sequence-func + (iostruct-read-sequence-func tokenbuf))) + (unicode-check ext-stream ext-io) + (setf (iostruct-parameter-entities ext-io) + (iostruct-parameter-entities tokenbuf)) + (setf (iostruct-general-entities ext-io) + (iostruct-general-entities tokenbuf)) + (unwind-protect + (setf val (append val + (list (append + (list :external) + (parse-dtd + ext-io + t external-callback))))) + (setf (iostruct-seen-any-dtd tokenbuf) t) + (setf (iostruct-seen-external-dtd tokenbuf) t) + (setf (iostruct-seen-parameter-reference tokenbuf) + (iostruct-seen-parameter-reference ext-io)) + (setf (iostruct-general-entities tokenbuf) + (iostruct-general-entities ext-io)) + (setf (iostruct-parameter-entities tokenbuf) + (iostruct-parameter-entities ext-io)) + (setf (iostruct-do-entity tokenbuf) + (iostruct-do-entity ext-io)) + (dolist (entity-buf2 (iostruct-entity-bufs ext-io)) + (when (streamp (tokenbuf-stream entity-buf2)) + (close (tokenbuf-stream entity-buf2)) + (put-back-tokenbuf entity-buf2))) + (close (tokenbuf-stream entity-buf)) + (put-back-tokenbuf entity-buf)) + ))) + else + (setf (iostruct-do-entity tokenbuf) nil))) + (setf attlist-data + (process-attlist (rest (rest val)) attlist-data)) + (when (not content-only) (push val guts)) + (setf state state-docstart-misc2) + elseif (and (listp val) (eq :comment (first val))) + then + (when (not content-only) (push val guts)) + elseif (eq kind :pi) + then + (push val guts) + elseif (or (symbolp val) + (and (listp val) (symbolp (first val)))) + then + (when (eq kind :start-tag) + (setf val (add-default-values val attlist-data))) + (if* (and (eq kind :start-tag) (eq kind2 :end-tag)) + then (push (list val) guts) + (setf state state-element-done) + elseif (eq kind :start-tag) + then (push (list val) pending) + ;;(format t "pending: ~s guts: ~s <3>~%" pending guts) + (when (iostruct-entity-bufs tokenbuf) + (push (if (symbolp val) val (first val)) entity-open-tags)) + (setf state state-element-contents) + else (xml-error (concatenate 'string + "encountered token at illegal syntax position: '" + (string kind) "'" + (concatenate 'string + " following: '" + (format nil "~s" (first guts)) + "'")))) + else + (print (list val kind kind2)) + (break "check for other docstart-misc states"))) + (#.state-element-contents + (if* (or (symbolp val) + (and (listp val) (symbolp (first val)))) + then + (when (eq kind :start-tag) + (setf val (add-default-values val attlist-data))) + (if* (eq kind :end-tag) + then (let ((candidate (first (first pending)))) + (when (listp candidate) (setf candidate (first candidate))) + (if* (eq candidate val) + then + (if* (iostruct-entity-bufs tokenbuf) then + (when (not (eq (first entity-open-tags) val)) + (xml-error + (concatenate 'string + (string val) + " element closed in entity that did not open it"))) + (setf entity-open-tags (rest entity-open-tags)) + else + (when (eq (first entity-open-tags) val) + (xml-error + (concatenate 'string + (string val) + " element closed outside of entity that did not open it"))) + ) + (if* (= (length pending) 1) + then + (push (first pending) guts) + (setf state state-element-done) + else + (setf (second pending) + (append (second pending) (list (first pending))))) + (setf pending (rest pending)) + ;;(format t "pending: ~s guts: ~s <4>~%" pending guts) + else (xml-error (format nil + "encountered end tag: ~s expected: ~s" + val candidate)))) + elseif (and (eq kind :start-tag) (eq kind2 :end-tag)) + then + (setf (first pending) + (append (first pending) (list (list val)))) + ;;(format t "pending: ~s guts: ~s <5>~%" pending guts) + elseif (eq kind :start-tag) + then + (push (list val) pending) + ;;(format t "pending: ~s guts: ~s <6>~%" pending guts) + (when (iostruct-entity-bufs tokenbuf) + (push (if (symbolp val) val (first val)) entity-open-tags)) + elseif (eq kind :cdata) then + (setf (first pending) + (append (first pending) (rest val))) + (let ((old (first pending)) + (new)) + (dolist (item old) + (if* (and (stringp (first new)) (stringp item)) then + (setf (first new) + (concatenate 'string (first new) item)) + else (push item new))) + (setf (first pending) (reverse new))) + elseif (eq kind :comment) then + (when (not content-only) (push val guts)) + elseif (eq kind :pi) + then + (setf (first pending) + (append (first pending) (list val))) + elseif (eq kind :eof) + then + (xml-error "unexpected end of file encountered") + else (xml-error (format nil "unexpected token: ~s" val))) + elseif (eq kind :pcdata) + then + (setf (first pending) + (append (first pending) (list val))) + (let ((old (first pending)) + (new)) + (dolist (item old) + (if* (and (stringp (first new)) (stringp item)) then + (setf (first new) + (concatenate 'string (first new) item)) + else (push item new))) + (setf (first pending) (reverse new))) + else (xml-error (format nil "unexpected token: ~s" val)))) + (#.state-element-done + (if* (eq kind :pcdata) + then + (when (or (not kind2) (not (all-xml-whitespace-p val))) + (if* (not kind2) then + (xml-error "An entity reference occured where only whitespace or the first element may occur") + else + (xml-error (concatenate 'string + "unrecognized content '" + (subseq val 0 (min (length val) 40)) "'")))) + elseif (eq kind :eof) then + (put-back-tokenbuf (iostruct-tokenbuf tokenbuf)) + (return (nreverse guts)) + elseif (eq kind :comment) then + (when (not content-only) (push val guts)) + elseif (eq kind :pi) + then (push val guts) + else + (xml-error (concatenate 'string + "encountered token at illegal syntax position: '" + (string kind) "'" + (concatenate 'string + " following: '" + (format nil "~s" (first guts)) + "'"))) + )) + (t + (error "need to support state:~s token:~s kind:~s kind2:~s " state val kind kind2))) + )))) + +(eval-when (compile load eval) + (defconstant state-pcdata 0) ;;looking for < (tag start), & (reference); all else is string data + (defconstant state-readtagfirst 1) ;; seen < looking for /,?,!,name start + (defconstant state-readtag-? 2) ;; seen + (defconstant state-findattributename 6) ;; found ,space,name start + (defconstant state-readpi 7) + (defconstant state-noattributename 8) + (defconstant state-attribname 9) ;; found or contents + (defconstant state-!-contents 21) ;; found ,[,space + (defconstant state-!-doctype 22) ;; found ,[,name + (defconstant state-begin-dtd 23) + (defconstant state-!-doctype-ext 24) ;; found ,[ + (defconstant state-!-doctype-ext3 30) ;; processed DTD looking for space,> + (defconstant state-!-doctype-public2 31) ;; found ,attrib name + (defconstant state-readtag3 34) ;; found + (defconstant state-readtag4 35) ;; found + (defconstant state-readtag-end3 44) ;; found + (defconstant state-pcdata2 45) ;; seen & looking for name start + (defconstant state-pcdata3 46) ;; seen &# looking for character reference code + (defconstant state-pcdata4 47) ;; working on entity reference name looking for ; + (defconstant state-pcdata5 48) ;; working on hex character code reference + (defconstant state-pcdata6 49) ;; working on decimal character code reference + (defconstant state-findattributename0 50) + (defconstant state-readtag6a 51) + (defconstant state-readtag-!-conditional4 52) + (defconstant state-readtag-!-conditional5 53) + (defconstant state-readtag-!-conditional6 54) + (defconstant state-readtag-!-conditional7 55) + ;;(defconstant state-pcdata-parsed 56) + (defconstant state-pcdata7 57) + (defconstant state-pcdata8 58) + (defconstant state-readtag12 59) + (defconstant state-attribname2 60) + ) + +(defun next-token (tokenbuf external-callback attlist-data) + (declare (optimize (speed 3) (safety 1))) + ;; return two values: + ;; the next token from the stream. + ;; the kind of token + ;; + ;; if read-sequence-func is non-nil, + ;; read-sequence-func is called to fetch the next character + (macrolet ((add-to-entity-buf (entity-symbol p-value) + `(progn + (push (make-tokenbuf :cur 0 :max (length p-value) :data p-value) + (iostruct-entity-bufs tokenbuf)))) + + (un-next-char (ch) + `(push ,ch (iostruct-unget-char tokenbuf))) + + (clear-coll (coll) + `(setf (collector-next ,coll) 0)) + + (add-to-coll (coll ch) + `(let ((.next. (collector-next ,coll))) + (if* (>= .next. (collector-max ,coll)) + then (grow-and-add ,coll ,ch) + else (setf (schar (collector-data ,coll) .next.) + ,ch) + (setf (collector-next ,coll) (1+ .next.))))) + + (to-preferred-case (ch) + ;; should check the case mode + `(char-downcase ,ch)) + + ) + + (let ((state state-pcdata) + (coll (get-collector)) + (entity (get-collector)) + (tag-to-return) + (tag-to-return-string) + (attrib-name) + (empty-delim) + (value-delim) + (attrib-value) + (attribs-to-return) + (contents-to-return) + (char-code 0) + (special-tag-count 0) + (attrib-value-tokenbuf) + (last-ch) + (cdatap t) + (pcdatap t) + (entity-source) + (ns-token) + (ch)) + + (loop + + (setq ch (get-next-char tokenbuf)) + (when *debug-xml* (format t "ch: ~s code: ~x state:~s entity-names:~s~%" + ch (char-code ch) state (iostruct-entity-names tokenbuf))) + (if* (null ch) + then (return) ; eof -- exit loop + ) + + + (case state + (#.state-pcdata + (if* (eq ch #\<) + then + (setf entity-source (first (iostruct-entity-bufs tokenbuf))) + (if* (> (collector-next coll) 0) + then ; have collected something, return this string + (un-next-char ch) ; push back the < + (return) + else ; collect a tag + (setq state state-readtagfirst)) + elseif (eq #\& ch) + then (setf state state-pcdata2) + (setf entity-source (first (iostruct-entity-bufs tokenbuf))) + (setf pcdatap nil) + elseif (eq #\] ch) then (setf state state-pcdata7) + elseif (not (xml-char-p ch)) then + (xml-error (concatenate 'string + "Illegal character: " + (string ch) + " detected in input")) + else + (add-to-coll coll ch) + #+ignore + (if* (not (eq ch #\return)) + then (add-to-coll coll ch)))) + + (#.state-pcdata7 + (if* (eq #\] ch) then (setf state state-pcdata8) + else (setf state state-pcdata) + (add-to-coll coll #\]) (un-next-char ch))) + + (#.state-pcdata8 + (if* (eq #\> ch) then + (add-to-coll coll #\]) + (add-to-coll coll #\]) + (add-to-coll coll #\>) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "content cannot contain ']]>':'" + (compute-coll-string coll) + "'")) + elseif (eq #\] ch) then + (add-to-coll coll #\]) + else (setf state state-pcdata) + (add-to-coll coll #\]) (add-to-coll coll #\]) (un-next-char ch))) + + (#.state-pcdata2 + (if* (eq #\# ch) + then (setf state state-pcdata3) + elseif (xml-name-start-char-p ch) + then (setf state state-pcdata4) + (un-next-char ch) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal reference name, starting at: '&" + (compute-coll-string coll) + "'")) + )) + + (#.state-pcdata3 + (if* (eq #\x ch) + then (setf state state-pcdata5) + elseif (<= (char-code #\0) (char-code ch) (char-code #\9)) + then (setf state state-pcdata6) + (un-next-char ch) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal character reference code, starting at: '&#" + (compute-coll-string coll) + "'")) + )) + + (#.state-pcdata4 + (if* (xml-name-char-p ch) + then (add-to-coll entity ch) + elseif (eq #\; ch) + then (let ((entity-symbol (compute-tag entity))) + (clear-coll entity) + (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then + (xml-error + (concatenate 'string + (string entity-symbol) + " reference cannot be constructed from entity reference/character data sequence")) + else + (setf entity-source nil)) + (if* (string= (symbol-name entity-symbol) "amp") then (add-to-coll coll #\&) + elseif (string= (symbol-name entity-symbol) "lt") then (add-to-coll coll #\<) + elseif (string= (symbol-name entity-symbol) "gt") then (add-to-coll coll #\>) + elseif (string= (symbol-name entity-symbol) "apos") then (add-to-coll coll #\') + elseif (string= (symbol-name entity-symbol) "quot") then (add-to-coll coll #\") + else + (let (p-value) + (if* (and (iostruct-do-entity tokenbuf) + (setf p-value + (assoc entity-symbol + (iostruct-general-entities tokenbuf)))) then + (setf p-value (rest p-value)) + (when (member entity-symbol (iostruct-entity-names tokenbuf)) + (xml-error (concatenate 'string + "entity:" + (string entity-symbol) + " in recursive reference"))) + (push entity-symbol (iostruct-entity-names tokenbuf)) + (if* (stringp p-value) then + (add-to-entity-buf entity-symbol p-value) + elseif (null external-callback) then + (setf (iostruct-do-entity tokenbuf) nil) + elseif p-value then + (let ((entity-stream (apply external-callback p-value))) + (if* entity-stream then + (let ((entity-buf (get-tokenbuf))) + (setf (tokenbuf-stream entity-buf) entity-stream) + (unicode-check entity-stream tokenbuf) + (push entity-buf + (iostruct-entity-bufs tokenbuf)) + ;; check for possible external textdecl + (let ((count 0) cch + (string " ch) then + (let ((tag-string (compute-coll-string coll))) + (when (and (iostruct-ns-scope tokenbuf) + (string= tag-string + (first (first (iostruct-ns-scope tokenbuf))))) + (dolist (item (second (first (iostruct-ns-scope tokenbuf)))) + (setf (iostruct-ns-to-package tokenbuf) + (remove (assoc item (iostruct-ns-to-package tokenbuf)) + (iostruct-ns-to-package tokenbuf)))) + (setf (iostruct-ns-scope tokenbuf) + (rest (iostruct-ns-scope tokenbuf))))) + (setq tag-to-return (compute-tag coll *package* + (iostruct-ns-to-package tokenbuf))) + (return) + elseif (xml-space-p ch) then (setf state state-readtag-end3) + (let ((tag-string (compute-coll-string coll))) + (when (and (iostruct-ns-scope tokenbuf) + (string= tag-string + (first (first (iostruct-ns-scope tokenbuf))))) + (setf (iostruct-ns-scope tokenbuf) + (rest (iostruct-ns-scope tokenbuf))))) + (setq tag-to-return (compute-tag coll *package* + (iostruct-ns-to-package tokenbuf))) + else (let ((tmp (compute-coll-string coll))) + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal end tag name, starting at: ' ch) then (return) + else (let ((tmp (compute-coll-string coll))) + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal end tag name, starting at: '" + (compute-coll-string coll) + "' end tag name: " tmp ))) + )) + + (#.state-readtagfirst + ; starting to read a tag name + (if* (eq #\/ ch) + then (setf state state-readtag-end) + elseif (eq #\? ch) + then (setf state state-readtag-?) + (setf empty-delim #\?) + elseif (eq #\! ch) + then (setf state state-readtag-!) + (setf empty-delim nil) + elseif (xml-name-start-char-p ch) + then (setf state state-readtag) + (setf empty-delim #\/) + (un-next-char ch) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal character following '<', starting at '" + (compute-coll-string coll) + "'")) + )) + + (#.state-readtag-! + (if* (xml-name-start-char-p ch) + then + (setf state state-readtag-!-name) + (un-next-char ch) + elseif (eq #\[ ch) + then + (setf state state-readtag-!-conditional) + elseif (eq #\- ch) + then + (setf state state-readtag-!-comment) + else + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal character following ' ch) + then + (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then + (xml-error + "CDATA cannot be constructed from entity reference/character data sequence") + else + (setf entity-source nil)) + (return) + elseif (eq #\] ch) then + (add-to-coll coll #\]) ;; come back here to check again + else (setf state state-readtag-!-conditional5) + (add-to-coll coll #\]) + (add-to-coll coll #\]) + (add-to-coll coll ch))) + + (#.state-readtag-!-comment + (if* (eq #\- ch) + then (setf state state-readtag-!-readcomment) + (setf tag-to-return :comment) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal token following ' ch) + then (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then + (xml-error + (concatenate 'string + (string tag-to-return) + " tag cannot be constructed from entity reference/character data sequence")) + else + (setf entity-source nil)) + (return) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal token following '--' comment terminator, starting at '--" + (compute-coll-string coll) + "'")) + )) + + (#.state-readtag + (if* (xml-name-char-p ch) ;; starting char already passed more restrictive test + then + (add-to-coll coll ch) + else + (if* (xml-space-p ch) then + (setf tag-to-return-string (compute-coll-string coll)) + (setq tag-to-return + (compute-tag coll *package* + (iostruct-ns-to-package tokenbuf))) + (clear-coll coll) + (setf state state-readtag2) + elseif (eq #\> ch) then + (setq tag-to-return + (compute-tag coll *package* + (iostruct-ns-to-package tokenbuf))) + (clear-coll coll) + (return) + elseif (eq #\/ ch) then + (setq tag-to-return + (compute-tag coll *package* + (iostruct-ns-to-package tokenbuf))) + (clear-coll coll) + (setf state state-readtag3) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "illegal token name, starting at '" + (compute-coll-string coll) + "'")) + ))) + + (#.state-readtag2 + (if* (xml-space-p ch) then nil + elseif (eq #\> ch) then (return) + elseif (eq #\/ ch) then (setf state state-readtag3) + elseif (xml-name-start-char-p ch) then + (un-next-char ch) + (setf state state-readtag4) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "illegal token, starting at '" + (compute-coll-string coll) + "' following element token start: " (string tag-to-return))) + )) + + (#.state-readtag4 + (if* (xml-name-char-p ch) ;; starting char already passed more restrictive test + then + (add-to-coll coll ch) + elseif (eq #\= ch) then + (setq attrib-name (compute-tag coll *package* + (iostruct-ns-to-package tokenbuf))) + (clear-coll coll) + (let ((name (symbol-name attrib-name))) + (when (and (>= (length name) 5) + (string= name "xmlns" :end1 5)) + (if* (= (length name) 5) + then + (setf ns-token :none) + elseif (eq (schar name 5) #\:) + then + (setf ns-token (subseq name 6))))) + (setf state state-readtag5) + elseif (xml-space-p ch) then + (setq attrib-name (compute-tag coll *package* + (iostruct-ns-to-package tokenbuf))) + (clear-coll coll) + (let ((name (symbol-name attrib-name))) + (when (and (>= (length name) 5) + (string= name "xmlns" :end1 5)) + (if* (= (length name) 5) + then + (setf ns-token :none) + else + (setf ns-token (subseq name 6))))) + (setf state state-readtag12) + else (let ((tmp (compute-coll-string coll))) + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "looking for attribute '=', found: '" + (compute-coll-string coll) + "' following attribute name: " tmp))) + )) + + (#.state-readtag12 + (if* (xml-space-p ch) then nil + elseif (eq #\= ch) then (setf state state-readtag5) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "looking for attribute '=', found: '" + (compute-coll-string coll) + "' following attribute name: " (string attrib-name))))) + + (#.state-readtag5 + ;; begin to collect attribute value + (if* (or (eq ch #\") + (eq ch #\')) + then (setq value-delim ch) + (let* ((tag-defaults (assoc tag-to-return attlist-data)) + (this-attrib (assoc attrib-name tag-defaults))) + (when (and (second this-attrib) (not (eq (second this-attrib) :CDATA))) + (setf cdatap nil)) + ) + (setq state state-readtag6) + elseif (xml-space-p ch) then nil + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "attribute value not delimited by ' or \" : '" + (compute-coll-string coll) + "' following attribute: " (string attrib-name))) + )) + + (#.state-readtag6 + (let ((from-entity (and attrib-value-tokenbuf + (eq attrib-value-tokenbuf + (first (iostruct-entity-bufs tokenbuf)))))) + (when (not from-entity) (setf attrib-value-tokenbuf nil)) + (if* from-entity then + (if* (eq #\newline ch) then (setf ch #\space) + elseif (eq #\return ch) then (setf ch #\space) + elseif (eq #\tab ch) then (setf ch #\space) + )) + (if* (and (not from-entity) (eq ch value-delim)) + then (setq attrib-value (compute-coll-string coll)) + (when (not cdatap) + (setf attrib-value (normalize-attrib-value attrib-value))) + (clear-coll coll) + (push attrib-name attribs-to-return) + (push attrib-value attribs-to-return) + (when ns-token + (let ((package (assoc (parse-uri attrib-value) + (iostruct-uri-to-package tokenbuf) + :test 'uri=))) + (if* package then (setf package (rest package)) + else + (setf package + (let ((i 0) new-package) + (loop + (let* ((candidate (concatenate 'string + "net.xml.namespace." + (format nil "~s" i))) + (exists (find-package candidate))) + (if* exists + then (incf i) + else (setf new-package (make-package candidate)) + (setf (iostruct-uri-to-package tokenbuf) + (acons (parse-uri attrib-value) new-package + (iostruct-uri-to-package tokenbuf))) + (return new-package))))))) + (setf (iostruct-ns-to-package tokenbuf) + (acons ns-token package (iostruct-ns-to-package tokenbuf))) + ) + (if* (and (first (iostruct-ns-scope tokenbuf)) + (string= (first (first (iostruct-ns-scope tokenbuf))) + tag-to-return-string)) + then + (push ns-token (second (first (iostruct-ns-scope tokenbuf)))) + else + (push (list tag-to-return-string (list ns-token)) + (iostruct-ns-scope tokenbuf))) + (setf ns-token nil)) + (setq state state-readtag6a) + elseif (eq #\newline ch) then + (when (not (eq #\return last-ch)) (add-to-coll coll #\space)) + elseif (or (eq #\tab ch) (eq #\return ch)) then + (add-to-coll coll #\space) + elseif (eq #\& ch) + then (setq state state-readtag7) + (setf entity-source (first (iostruct-entity-bufs tokenbuf))) + elseif (and (xml-char-p ch) (not (eq #\< ch))) + then (add-to-coll coll ch) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "attribute value cannot contain '<': '" + (compute-coll-string coll) + "' following attribute: " (string attrib-name))) + ) + (setf last-ch ch))) + + (#.state-readtag6a + (if* (xml-space-p ch) then (setf state state-readtag2) + elseif (eq #\> ch) then (setf state state-readtag2) + (return) + elseif (eq #\/ ch) then (setf state state-readtag3) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "illegal token, starting at '" + (compute-coll-string coll) + "' following element token start: " (string tag-to-return))) + )) + + (#.state-readtag7 + (if* (eq #\# ch) + then (setf state state-readtag8) + elseif (xml-name-start-char-p ch) + then (setf state state-readtag9) + (un-next-char ch) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "attribute value contains illegal reference name: '&" + (compute-coll-string coll) + "' in attribute value for: " (string attrib-name))) + )) + + (#.state-readtag8 + (if* (eq #\x ch) + then (setf state state-readtag10) + elseif (<= (char-code #\0) (char-code ch) (char-code #\9)) + then (setf state state-readtag11) + (un-next-char ch) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "attribute value contains illegal character reference code: '" + (compute-coll-string coll) + "' in attribute value for: " (string attrib-name))) + )) + + (#.state-readtag10 + (let ((code (char-code ch))) + (if* (eq #\; ch) + then (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then + (xml-error + (concatenate 'string + (string (code-char char-code)) + " reference cannot be constructed from entity reference/character data sequence")) + else + (setf entity-source nil)) + (add-to-coll coll (code-char char-code)) + (setf char-code 0) + (setq state state-readtag6) + elseif (<= (char-code #\0) code (char-code #\9)) + then (setf char-code (+ (* char-code 16) (- code (char-code #\0)))) + elseif (<= (char-code #\A) code (char-code #\F)) + then (setf char-code (+ 10 (* char-code 16) (- code (char-code #\A)))) + elseif (<= (char-code #\a) code (char-code #\f)) + then (setf char-code (+ 10 (* char-code 16) (- code (char-code #\a)))) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "attribute value contains illegal hexidecimal character reference code: '" + (compute-coll-string coll) + "' in attribute value for: " (string attrib-name))) + ))) + + (#.state-readtag11 + (let ((code (char-code ch))) + (if* (eq #\; ch) + then (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then + (xml-error + (concatenate 'string + (string (code-char char-code)) + " reference cannot be constructed from entity reference/character data sequence")) + else + (setf entity-source nil)) + (add-to-coll coll (code-char char-code)) + (setf char-code 0) + (setq state state-readtag6) + elseif (<= (char-code #\0) code (char-code #\9)) + then (setf char-code (+ (* char-code 10) (- code (char-code #\0)))) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "attribute value contains illegal decimal character reference code: '" + (compute-coll-string coll) + "' in attribute value for: " (string attrib-name))) + ))) + + (#.state-readtag9 + (if* (xml-name-char-p ch) + then (add-to-coll entity ch) + elseif (eq #\; ch) + then (let ((entity-symbol (compute-tag entity))) + (clear-coll entity) + (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then + (xml-error + (concatenate 'string + (string entity-symbol) + " reference cannot be constructed from entity reference/character data sequence")) + else + (setf entity-source nil)) + (if* (string= (symbol-name entity-symbol) "amp") then (add-to-coll coll #\&) + elseif (string= (symbol-name entity-symbol) "lt") then (add-to-coll coll #\<) + elseif (string= (symbol-name entity-symbol) "gt") then (add-to-coll coll #\>) + elseif (string= (symbol-name entity-symbol) "apos") then (add-to-coll coll #\') + elseif (string= (symbol-name entity-symbol) "quot") then (add-to-coll coll #\") + else (let (p-value) + (if* (and (iostruct-do-entity tokenbuf) + (setf p-value + (assoc entity-symbol + (iostruct-general-entities tokenbuf)))) then + (setf p-value (rest p-value)) + (when (member entity-symbol (iostruct-entity-names tokenbuf)) + (xml-error (concatenate 'string + "entity:" + (string entity-symbol) + " in recursive reference"))) + (push entity-symbol (iostruct-entity-names tokenbuf)) + (if* (stringp p-value) then + (add-to-entity-buf entity-symbol p-value) + (when (not attrib-value-tokenbuf) + (setf attrib-value-tokenbuf + (first (iostruct-entity-bufs tokenbuf)))) + elseif (null external-callback) then + (setf (iostruct-do-entity tokenbuf) nil) + elseif p-value then + (let ((entity-stream (apply external-callback p-value))) + (if* entity-stream then + (let ((entity-buf (get-tokenbuf))) + (setf (tokenbuf-stream entity-buf) entity-stream) + (unicode-check entity-stream tokenbuf) + (push entity-buf + (iostruct-entity-bufs tokenbuf)) + ;; check for possible external textdecl + (let ((count 0) cch + (string " ch) then (return) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "expected '>' found '" + (compute-coll-string coll) + "' in element: " (string tag-to-return))) + )) + + (#.state-readtag-!-name + (if* (xml-name-char-p ch) ;; starting char already passed more restrictive test + then + (add-to-coll coll ch) + else + (when (not (xml-space-p ch)) + (xml-error (concatenate 'string + "expecting whitespace following: ' ch) + then (return) + else (un-next-char ch) + (setf state state-!-contents))) + + (#.state-begin-dtd + (un-next-char ch) + (let ((val (parse-dtd tokenbuf nil external-callback))) + (setf (iostruct-seen-any-dtd tokenbuf) t) + (push (append (list :[) val) + contents-to-return)) + (setf state state-!-doctype-ext3)) + + (#.state-!-contents + (if* (xml-name-char-p ch) + then (add-to-coll coll ch) + elseif (eq #\> ch) + then (push (compute-coll-string coll) contents-to-return) + (clear-coll coll) + (return) + elseif (eq #\[ ch) + then (push (compute-tag coll) contents-to-return) + (clear-coll coll) + (setf state state-begin-dtd) + elseif (and (xml-space-p ch) (eq tag-to-return :DOCTYPE)) + ;; look at tag-to-return and set state accordingly + then (push (compute-tag coll) contents-to-return) + (clear-coll coll) + (setf state state-!-doctype) + else (xml-error + (concatenate 'string + "illegal name: '" + (string tag-to-return) + "' in ch) then (return) + elseif (eq #\[ ch) + then (setf state state-begin-dtd) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "illegal char in DOCTYPE token: '" + (compute-coll-string coll) "'")) + )) + + (#.state-!-doctype-ext3 + (if* (xml-space-p ch) then nil + elseif (eq #\> ch) then (return) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "illegal char in DOCTYPE token following dtd: '" + (compute-coll-string coll) "'")) + )) + + (#.state-!-doctype + ;; skip whitespace; possible exits: >, SYSTEM, PUBLIC, [ + (if* (xml-space-p ch) then nil + elseif (xml-name-start-char-p ch) + then + (setf state state-!-doctype-ext) + (un-next-char ch) + elseif (eq #\> ch) then (return) + elseif (eq #\[ ch) + then (setf state state-begin-dtd) + else (xml-error + (concatenate 'string + "illegal character: '" + (string ch) + "' in ch) + then (return) + elseif (eq #\? ch) then + (add-to-coll coll #\?) ;; come back here to try again + else (setf state state-readpi) + (add-to-coll coll #\?) + (add-to-coll coll ch))) + + (#.state-findattributename0 + (if* (xml-space-p ch) then (setf state state-findattributename) + elseif (eq ch empty-delim) + then (setf state state-noattributename) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "expected space or tag end before: '" + (compute-coll-string coll) "'")))) + (#.state-findattributename + ;; search until we find the start of an attribute name + ;; or the end of the tag + (if* (eq ch empty-delim) + then (setf state state-noattributename) + elseif (xml-space-p ch) + then nil ;; skip whitespace + elseif (xml-name-start-char-p ch) + then + (un-next-char ch) + (setf state state-attribname) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "illegal char in ch) + then + (return) ;; ready to build return token + else + (xml-error + (concatenate 'string + "expected '>' found: '" (string ch) "' in :~s ~s ~s ~s" state + tag-to-return + contents-to-return + ret)))) + ))) + +(defun swallow-xml-token (tokenbuf external-callback) + (declare (ignorable old-coll) (optimize (speed 3) (safety 1))) + (let ((xml (next-token tokenbuf external-callback nil))) + (if* (and (eq (fourth xml) :standalone) (stringp (fifth xml)) + (equal (fifth xml) "yes")) then + (xml-error "external XML entity cannot be standalone document") + elseif (and (eq (sixth xml) :standalone) (stringp (seventh xml)) + (equal (seventh xml) "yes")) then + (xml-error "external XML entity cannot be standalone document")))) + +;; return the string with entity references replaced by text +;; normalizing will happen later +;; we're ok on different types - just ignore IMPLIED & REQUIRED; and possibly skip FIXED +(defun parse-default-value (value-list tokenbuf external-callback) + (declare (optimize (speed 3) (safety 1))) + (let (value-string) + (if* (stringp (first value-list)) then (setf value-string (first value-list)) + elseif (eq (first value-list) :FIXED) then (setf value-string (second value-list))) + (let ((tmp-result (parse-xml + (concatenate 'string + "") + :external-callback external-callback + :general-entities + (iostruct-general-entities tokenbuf)))) + (if* (stringp (first value-list)) then + (setf (first value-list) + (third (first (first tmp-result)))) + elseif (eq (first value-list) :FIXED) then + (setf (second value-list) + (third (first (first tmp-result))))))) + value-list) + +(defun process-attlist (args attlist-data) + (declare (optimize (speed 3) (safety 1))) + (dolist (arg1 args attlist-data) + ;;(format t "arg1: ~s~%" arg1) + (dolist (item (rest arg1)) + ;;(format t "item: ~s~%" item) + (when (eq :ATTLIST (first item)) + (let* ((name (second item)) + (name-data (assoc name attlist-data)) + (new-name-data (rest name-data))) + ;;(format t "name: ~s name-data: ~s new-name-data: ~s~%" name name-data new-name-data) + (dolist (attrib-data (rest (rest item))) + ;;(format t "attrib-data: ~s~%" attrib-data) + #+ignore + (setf (rest (rest attrib-data)) + (parse-default-value (rest (rest attrib-data)) tokenbuf external-callback)) + (when (not (assoc (first attrib-data) new-name-data)) + (setf new-name-data (acons (first attrib-data) (rest attrib-data) new-name-data)))) + (if* name-data then + (rplacd (assoc name attlist-data) (nreverse new-name-data)) + else (setf attlist-data (acons name (nreverse new-name-data) attlist-data)))))))) + +(provide :pxml) diff --git a/pxml3.cl b/pxml3.cl new file mode 100644 index 0000000..aefd3d3 --- /dev/null +++ b/pxml3.cl @@ -0,0 +1,2510 @@ +;; +;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA +;; +;; This code is free software; you can redistribute it and/or +;; modify it under the terms of the version 2.1 of +;; the GNU Lesser General Public License as published by +;; the Free Software Foundation, as clarified by the AllegroServe +;; prequel found in license-allegroserve.txt. +;; +;; This code is distributed in the hope that it will be useful, +;; but without any warranty; without even the implied warranty of +;; merchantability or fitness for a particular purpose. See the GNU +;; Lesser General Public License for more details. +;; +;; Version 2.1 of the GNU Lesser General Public License is in the file +;; license-lgpl.txt that was distributed with this file. +;; If it is not present, you can access it from +;; http://www.gnu.org/copyleft/lesser.txt (until superseded by a newer +;; version) or write to the Free Software Foundation, Inc., 59 Temple Place, +;; Suite 330, Boston, MA 02111-1307 USA +;; +;; $Id: pxml3.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $ + +(in-package :net.xml.parser) + +(pxml-dribble-bug-hook "$Id: pxml3.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $") + +(defvar *debug-dtd* nil) + +(defun parse-dtd (tokenbuf + external external-callback) + (declare (optimize (speed 3) (safety 1))) + (let ((guts) + (include-count 0)) + (loop + (multiple-value-bind (val kind) + (next-dtd-token tokenbuf + external include-count external-callback) + (if* (eq kind :end-dtd) then + (return (nreverse guts)) + elseif (eq kind :include) then + (incf include-count) + elseif (eq kind :ignore) then nil + elseif (eq kind :include-end) then + (if* (> include-count 0) then (decf include-count) + else (xml-error "unexpected ']]>' token")) + else (when (iostruct-do-entity tokenbuf) (push val guts))))))) + +(defparameter dtd-parser-states ()) + +(macrolet ((def-dtd-parser-state (var val) + `(progn (eval-when (compile load eval) (defconstant ,var ,val)) + (pushnew '(,val . ,var) dtd-parser-states :key #'car)))) + (def-dtd-parser-state state-dtdstart 0) + (def-dtd-parser-state state-tokenstart 1) + (def-dtd-parser-state state-dtd-? 2) + (def-dtd-parser-state state-dtd-! 3) + (def-dtd-parser-state state-dtd-comment 4) + (def-dtd-parser-state state-dtd-!-token 5) + (def-dtd-parser-state state-dtd-!-element 6) + (def-dtd-parser-state state-dtd-!-element-name 7) + (def-dtd-parser-state state-dtd-!-element-content 8) + (def-dtd-parser-state state-dtd-!-element-type 9) + (def-dtd-parser-state state-dtd-!-element-type-paren 10) + (def-dtd-parser-state state-dtd-!-element-type-token 11) + (def-dtd-parser-state state-dtd-!-element-type-end 12) + (def-dtd-parser-state state-dtd-!-element-type-paren-name 13) + (def-dtd-parser-state state-dtd-!-element-type-paren-pcd 14) + (def-dtd-parser-state state-dtd-!-element-type-paren-pcd2 15) + (def-dtd-parser-state state-dtd-!-element-type-paren-pcd3 16) + (def-dtd-parser-state state-dtd-!-element-type-paren-pcd4 17) + (def-dtd-parser-state state-dtd-!-element-type-paren-pcd5 18) + (def-dtd-parser-state state-dtd-!-element-type-paren-pcd6 19) + (def-dtd-parser-state state-dtd-!-element-type-paren-pcd7 20) + (def-dtd-parser-state state-dtd-!-element-type-paren-pcd8 21) + (def-dtd-parser-state state-dtd-!-element-type-paren-pcd9 22) + (def-dtd-parser-state state-dtd-!-element-type-paren-name2 23) + ;;(def-dtd-parser-state state-dtd-!-element-type-paren-seq 24) folded into choice + (def-dtd-parser-state state-dtd-!-element-type-paren-choice 25) + (def-dtd-parser-state state-dtd-!-element-type-paren2 26) + (def-dtd-parser-state state-dtd-!-element-type-paren-choice-name 27) + (def-dtd-parser-state state-dtd-!-element-type-paren-choice-paren 28) + (def-dtd-parser-state state-dtd-!-element-type-paren-choice-name2 29) + (def-dtd-parser-state state-dtd-!-element-type-paren3 30) + (def-dtd-parser-state state-dtd-!-element-type-paren-choice-name3 31) + (def-dtd-parser-state state-dtd-!-attlist 32) + (def-dtd-parser-state state-dtd-!-attlist-name 33) + (def-dtd-parser-state state-dtd-!-attdef 34) + (def-dtd-parser-state state-dtd-!-attdef-name 35) + (def-dtd-parser-state state-dtd-!-attdef-type 36) + ;;(def-dtd-parser-state state-dtd-!-attdef-enumeration 37) + (def-dtd-parser-state state-dtd-!-attdef-decl 38) + (def-dtd-parser-state state-dtd-!-attdef-decl-type 39) + (def-dtd-parser-state state-dtd-!-attdef-decl-value 40) + (def-dtd-parser-state state-dtd-!-attdef-decl-value2 41) + (def-dtd-parser-state state-dtd-!-attdef-decl-value3 42) + (def-dtd-parser-state state-dtd-!-attdef-decl-value4 43) + (def-dtd-parser-state state-dtd-!-attdef-decl-value5 44) + (def-dtd-parser-state state-dtd-!-attdef-decl-value6 45) + (def-dtd-parser-state state-dtd-!-attdef-decl-value7 46) + (def-dtd-parser-state state-dtd-!-attdef-notation 47) + (def-dtd-parser-state state-dtd-!-attdef-notation2 48) + (def-dtd-parser-state state-dtd-!-attdef-notation3 49) + (def-dtd-parser-state state-dtd-!-attdef-notation4 50) + (def-dtd-parser-state state-dtd-!-attdef-type2 51) + (def-dtd-parser-state state-dtd-!-entity 52) + (def-dtd-parser-state state-dtd-!-entity2 53) + (def-dtd-parser-state state-dtd-!-entity3 54) + (def-dtd-parser-state state-dtd-!-entity4 55) + (def-dtd-parser-state state-dtd-!-entity-value 56) + (def-dtd-parser-state state-dtd-!-entity5 57) + (def-dtd-parser-state state-dtd-!-entity6 58) + (def-dtd-parser-state state-!-dtd-system 59) + (def-dtd-parser-state state-!-dtd-public 60) + (def-dtd-parser-state state-!-dtd-system2 61) + (def-dtd-parser-state state-!-dtd-system3 62) + (def-dtd-parser-state state-!-dtd-system4 63) + (def-dtd-parser-state state-!-dtd-system5 64) + (def-dtd-parser-state state-!-dtd-system6 65) + (def-dtd-parser-state state-!-dtd-system7 66) + (def-dtd-parser-state state-!-dtd-public2 67) + (def-dtd-parser-state state-dtd-!-notation 68) + (def-dtd-parser-state state-dtd-!-notation2 69) + (def-dtd-parser-state state-dtd-!-notation3 70) + (def-dtd-parser-state state-dtd-?-2 71) + (def-dtd-parser-state state-dtd-?-3 72) + (def-dtd-parser-state state-dtd-?-4 73) + (def-dtd-parser-state state-dtd-comment2 74) + (def-dtd-parser-state state-dtd-comment3 75) + (def-dtd-parser-state state-dtd-comment4 76) + (def-dtd-parser-state state-dtd-!-entity7 77) + (def-dtd-parser-state state-dtd-!-attdef-notation5 78) + (def-dtd-parser-state state-!-dtd-public3 79) + (def-dtd-parser-state state-dtd-!-cond 80) + (def-dtd-parser-state state-dtd-!-cond2 81) + (def-dtd-parser-state state-dtd-!-include 82) + (def-dtd-parser-state state-dtd-!-ignore 83) + (def-dtd-parser-state state-dtd-!-include2 84) + (def-dtd-parser-state state-dtd-!-include3 85) + (def-dtd-parser-state state-dtd-!-include4 86) + (def-dtd-parser-state state-dtd-!-ignore2 87) + (def-dtd-parser-state state-dtd-!-ignore3 88) + (def-dtd-parser-state state-dtd-!-ignore4 89) + (def-dtd-parser-state state-dtd-!-ignore5 90) + (def-dtd-parser-state state-dtd-!-ignore6 91) + (def-dtd-parser-state state-dtd-!-ignore7 92) + ) + +(defun next-dtd-token (tokenbuf + external include-count external-callback) + (declare (:fbound parse-default-value) (optimize (speed 3) (safety 1))) + (macrolet ((add-to-entity-buf (entity-symbol p-value) + `(progn + (push (make-tokenbuf :cur 0 :max (length ,p-value) :data ,p-value) + (iostruct-entity-bufs tokenbuf)))) + + (un-next-char (ch) + `(push ,ch (iostruct-unget-char tokenbuf))) + + (clear-coll (coll) + `(setf (collector-next ,coll) 0)) + + (add-to-coll (coll ch) + `(let ((.next. (collector-next ,coll))) + (if* (>= .next. (collector-max ,coll)) + then (grow-and-add ,coll ,ch) + else (setf (schar (collector-data ,coll) .next.) + ,ch) + (setf (collector-next ,coll) (1+ .next.))))) + + (to-preferred-case (ch) + ;; should check the case mode + `(char-downcase ,ch)) + + ) + (let ((state state-dtdstart) + (coll (get-collector)) + (entity (get-collector)) + (tag-to-return) + (contents-to-return) + (pending (list nil)) + (pending-type) + (value-delim) + (public-string) + (char-code 0) + (check-count 0) + (ignore-count 0) + (reference-save-state) + (prefp) + (entityp) + (pentityp) + (prev-state) + (ch)) + (loop + (setq ch (get-next-char tokenbuf)) + (when *debug-dtd* + (format t "~@~%" + ch (or (cdr (assoc state dtd-parser-states)) state) + contents-to-return pending pending-type + (iostruct-entity-names tokenbuf))) + (if* (null ch) + then (setf prev-state state) + (setf state :eof) + (return) ;; eof -- exit loop + ) + + (case state + (#.state-dtdstart + (if* (and (eq #\] ch) + external (> include-count 0)) then + (setf state state-dtd-!-include3) + elseif (and (eq #\] ch) (not external)) then (return) + elseif (eq #\< ch) then (setf state state-tokenstart) + elseif (xml-space-p ch) then nil + elseif (eq #\% ch) then (external-param-reference tokenbuf coll external-callback) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD characters, starting at: '" + (compute-coll-string coll) + "'")) + )) + (#.state-dtd-!-include3 + (if* (eq #\] ch) then (setf state state-dtd-!-include4) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD token, starting at: ']" + (compute-coll-string coll) + "'")))) + (#.state-dtd-!-include4 + (if* (eq #\> ch) then (return) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD token, starting at: ']]" + (compute-coll-string coll) + "'")))) + #+ignore + (#.state-dtd-pref + (if* (xml-name-start-char-p ch) then + (add-to-coll coll ch) + (setf state state-dtd-pref2) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD parameter reference name, starting at: '" + (compute-coll-string coll) + "'")) + )) + (#.state-tokenstart + (if* (eq #\? ch) then (setf state state-dtd-?) + elseif (eq #\! ch) then (setf state state-dtd-!) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD characters, starting at: '<" + (compute-coll-string coll) + "'")) + )) + (#.state-dtd-? + (if* (xml-name-char-p ch) + then + (add-to-coll coll ch) + elseif (and external (eq #\% ch)) then + (external-param-reference tokenbuf coll external-callback) + else + (when (not (xml-space-p ch)) + (xml-error (concatenate 'string + "expecting name following: ' ch) + then + (push (compute-coll-string coll) contents-to-return) + (clear-coll coll) + (return) + else (setf state state-dtd-?-3) + (add-to-coll coll #\?) + (add-to-coll coll ch))) + (#.state-dtd-! + (if* (eq #\- ch) then (setf state state-dtd-comment) + elseif (xml-name-start-char-p ch) then (setf state state-dtd-!-token) + (un-next-char ch) + elseif (and (eq #\[ ch) external) then + (setf state state-dtd-!-cond) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD characters, starting at: ' ch) then (decf ignore-count) + (when (= ignore-count 0) (return)) + else (un-next-char ch) + (setf state state-dtd-!-ignore3))) + (#.state-dtd-!-include + (if* (and (eq check-count 6) (eq ch #\E)) then + (setf state state-dtd-!-include2) + elseif (eq ch (elt "INCLUD" check-count)) then + (incf check-count) + else (xml-error " ch) + then (push (compute-coll-string coll) contents-to-return) + (clear-coll coll) + (return) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal token following '--' comment terminator, starting at '--" + (compute-coll-string coll) + "'")) + )) + (#.state-dtd-!-token + (if* (xml-name-char-p ch) then (add-to-coll coll ch) + elseif (and external (eq #\% ch)) then + (external-param-reference tokenbuf coll external-callback) + elseif (xml-space-p ch) then + (setf tag-to-return (compute-tag coll)) + (clear-coll coll) + (if* (eq tag-to-return :ELEMENT) then (setf state state-dtd-!-element) + elseif (eq tag-to-return :ATTLIST) then + (setf state state-dtd-!-attlist) + elseif (eq tag-to-return :ENTITY) then + (setf entityp t) + (setf state state-dtd-!-entity) + elseif (eq tag-to-return :NOTATION) then + (setf state state-dtd-!-notation) + else + (xml-error (concatenate 'string + "illegal DTD characters, starting at: ' ch)) then + (setf state state-!-dtd-system) + (return) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error + (concatenate 'string + "Expected space before: '" + (compute-coll-string coll) "'")) + )) + (#.state-!-dtd-system + (if* (xml-space-p ch) then nil + elseif (and external (eq #\% ch)) then + (external-param-reference tokenbuf coll external-callback) + elseif (or (eq #\" ch) (eq #\' ch)) then + (setf state state-!-dtd-system2) + (setf value-delim ch) + elseif (and (not entityp) + (eq #\> ch)) then (return) + else (xml-error + (concatenate 'string + "expected quote or double-quote got: '" + (string ch) + "' in ch) then (return) + else + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD ch) then (return) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD + elseif (eq #\> ch) then + (push (compute-tag coll) contents-to-return) + (clear-coll coll) + (return) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD (length tmp) 0) + (when (null (first pending)) (setf pending (rest pending))) + (push tmp pending))) + (if* (> (length pending) 1) then + (push (nreverse pending) contents-to-return) + else (push (first pending) contents-to-return)) + (setf pending (list nil)) + (setf state state-dtd-!-entity5) + (clear-coll coll) + (if* pentityp then + (when (not (assoc (third contents-to-return) + (iostruct-parameter-entities tokenbuf))) + (setf (iostruct-parameter-entities tokenbuf) + (acons (third contents-to-return) + (first contents-to-return) + (iostruct-parameter-entities tokenbuf)))) + else + (when (not (assoc (second contents-to-return) + (iostruct-general-entities tokenbuf))) + (setf (iostruct-general-entities tokenbuf) + (acons (second contents-to-return) + (first contents-to-return) + (iostruct-general-entities tokenbuf))))) + elseif (eq #\& ch) then + (setf reference-save-state state-dtd-!-entity-value) + (setf state state-dtd-!-attdef-decl-value3) + elseif (eq #\% ch) then + (setf prefp t) + (setf reference-save-state state-dtd-!-entity-value) + (setf state state-dtd-!-attdef-decl-value3) + elseif (xml-char-p ch) + then (add-to-coll coll ch) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD ch) then (return) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD contents following ch) then + (push (compute-tag coll *package*) + contents-to-return) + (clear-coll coll) + (return) + else (push (compute-tag coll) + contents-to-return) + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD ch) then (return) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD )) then + (setf count 6) + else (setf last-ch cch)) + elseif (< count 6) then + (when (and (= count 5) + (xml-space-p cch)) + (setf cch #\space)) + (if* (not (eq cch + (schar string count) + )) then + (loop + (when (= tmp-count count) + (return)) + (add-to-coll coll + (schar string + tmp-count)) + (incf tmp-count)) + (add-to-coll coll cch) + (setf count 10) + else (incf count)) + elseif (= count 6) then + (dotimes (i 6) + (add-to-coll coll (schar string i))) + (setf count 10) + else (add-to-coll coll cch)))) + (setf (iostruct-entity-names tokenbuf) + (rest (iostruct-entity-names tokenbuf))) + (close entity-stream) + (put-back-tokenbuf tmp-buf))))) + ) + (setf state state-dtdstart) + else nil + ))) + (setf state reference-save-state) + else (let ((tmp (compute-coll-string entity))) + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "reference not terminated by ';', starting at: '&" + tmp + (compute-coll-string coll) + "'"))) + )) + (#.state-dtd-!-attdef-decl-value6 + (let ((code (char-code ch))) + (if* (eq #\; ch) + then (add-to-coll coll (code-char char-code)) + (setf char-code 0) + (setq state state-dtd-!-attdef-decl-value) + elseif (<= (char-code #\0) code (char-code #\9)) + then (setf char-code (+ (* char-code 16) (- code (char-code #\0)))) + elseif (<= (char-code #\A) code (char-code #\F)) + then (setf char-code (+ 10 (* char-code 16) (- code (char-code #\A)))) + elseif (<= (char-code #\a) code (char-code #\f)) + then (setf char-code (+ 10 (* char-code 16) (- code (char-code #\a)))) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal hexidecimal character reference code, starting at: '" + (compute-coll-string coll) + "', calculated char code: " + (format nil "~s" char-code))) + ))) + (#.state-dtd-!-attdef-decl-value7 + (let ((code (char-code ch))) + (if* (eq #\; ch) + then (add-to-coll coll (code-char char-code)) + (setf char-code 0) + (setq state reference-save-state) + elseif (<= (char-code #\0) code (char-code #\9)) + then (setf char-code (+ (* char-code 10) (- code (char-code #\0)))) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal decimal character reference code, starting at: '" + (compute-coll-string coll) + "', calculated char code: " + (format nil "~s" char-code))) + ))) + (#.state-dtd-!-attdef-decl-type + (if* (xml-name-char-p ch) then (add-to-coll coll ch) + elseif (and external (eq #\% ch)) then + (external-param-reference tokenbuf coll external-callback) + elseif (or (xml-space-p ch) (eq #\> ch)) then + (let ((token (compute-tag coll))) + (when (and (not (eq :REQUIRED token)) + (not (eq :IMPLIED token)) + (not (eq :FIXED token))) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD ch) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD ch) then + (setf contents-to-return + (append contents-to-return (list (nreverse pending)))) + (return) + else (setf contents-to-return + (append contents-to-return (list (nreverse pending)))) + (setf pending (list nil)) + (setf state state-dtd-!-attdef))) + (clear-coll coll) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD ch) then + ;; there only one name... + (setf (first contents-to-return) (first (first contents-to-return))) + (return) + elseif (eq #\* ch) then + (setf state state-dtd-!-element-type-paren-pcd5) + (setf (first contents-to-return) (nreverse (first contents-to-return))) + (if* (> (length (first contents-to-return)) 1) then + (setf (first contents-to-return) + (list (append (list :choice) + (first contents-to-return)))) + elseif (listp (first (first contents-to-return))) then + (setf (first contents-to-return) + (first (first contents-to-return)))) + (push :* (first contents-to-return)) + elseif (eq #\? ch) then + (setf state state-dtd-!-element-type-paren-pcd5) + (setf (first contents-to-return) (nreverse (first contents-to-return))) + (if* (> (length (first contents-to-return)) 1) then + (setf (first contents-to-return) + (list (append (list :choice) + (first contents-to-return)))) + elseif (listp (first (first contents-to-return))) then + (setf (first contents-to-return) + (first (first contents-to-return)))) + (push :? (first contents-to-return)) + elseif (eq #\+ ch) then + (setf state state-dtd-!-element-type-paren-pcd5) + (setf (first contents-to-return) (nreverse (first contents-to-return))) + (if* (> (length (first contents-to-return)) 1) then + (setf (first contents-to-return) + (list (append (list :choice) + (first contents-to-return)))) + elseif (listp (first (first contents-to-return))) then + (setf (first contents-to-return) + (first (first contents-to-return)))) + (push :+ (first contents-to-return)) + elseif (and external (eq #\% ch)) then + (external-param-reference tokenbuf coll external-callback) + elseif (xml-space-p ch) then + (setf state state-dtd-!-element-type-paren-pcd5) + (setf (first contents-to-return) (nreverse (first contents-to-return))) + (when (> (length (first contents-to-return)) 1) + (setf (first contents-to-return) + (list (append (list :\choice) + (first contents-to-return))))) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD (length (first pending)) 1) then + (push (first pending-type) (first pending)) + (setf pending-type (rest pending-type)) + else (setf (first pending) (first (first pending)))) + (push (first pending) contents-to-return) + (setf state state-dtd-!-element-type-paren3) + else (setf (first pending) (nreverse (first pending))) + (if* (> (length (first pending)) 1) then + (push (first pending-type) (first pending)) + (setf pending-type (rest pending-type)) + else (setf (first pending) (first (first pending)))) + (if* (second pending) then + (push (first pending) (second pending)) + else (setf (second pending) (list (first pending)))) + (setf pending (rest pending)) + (setf state state-dtd-!-element-type-paren-choice-name3) + ) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD (length (first pending)) 1) then + (push (first pending-type) (first pending)) + (setf pending-type (rest pending-type)) + else (setf (first pending) (first (first pending)))) + (push (first pending) contents-to-return) + (setf state state-dtd-!-element-type-paren3) + else (setf (first pending) (nreverse (first pending))) + (push (first pending-type) (first pending)) + (setf pending-type (rest pending-type)) + (if* (second pending) then + (push (first pending) (second pending)) + else (setf (second pending) + ;; (list (first pending)) ;2001-03-22 + (first pending) ;2001-03-22 + )) + (setf pending (rest pending)) + (setf state state-dtd-!-element-type-paren-choice-name3) + ) + elseif (eq #\, ch) then + (when (and (first pending) (not (eq :seq (first pending-type)))) + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal '|' and ',' mix starting at '" + (compute-coll-string coll) + "'"))) + (push (compute-tag coll) (first pending)) + (clear-coll coll) + (push :seq pending-type) + (setf state state-dtd-!-element-type-paren-choice) + elseif (eq #\| ch) then + (when (and (first pending) (not (eq :choice (first pending-type)))) + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal '|' and ',' mix starting at '" + (compute-coll-string coll) + "'"))) + (push (compute-tag coll) (first pending)) + (clear-coll coll) + (push :choice pending-type) + (setf state state-dtd-!-element-type-paren-choice) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD (length (first pending)) 1) then + (push (first pending-type) (first pending)) + (setf pending-type (rest pending-type)) + else (setf (first pending) (first (first pending)))) + (push (first pending) contents-to-return) + (setf state state-dtd-!-element-type-paren3) + else (setf (first pending) (nreverse (first pending))) + (push (first pending-type) (first pending)) + (setf pending-type (rest pending-type)) + (if* (second pending) then + (push (first pending) (second pending)) + else (setf (second pending) (list (first pending)))) + (setf state state-dtd-!-element-type-paren-choice-name3) + ) + (setf pending (rest pending)) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD (length (first pending)) 1) then + (push (first pending-type) (first pending)) + (setf pending-type (rest pending-type)) + else (setf (first pending) (first (first pending)))) + (push (first pending) contents-to-return) + (setf pending (rest pending)) + (setf state state-dtd-!-element-type-paren3) + else (setf (first pending) (nreverse (first pending))) + (if* (> (length (first pending)) 1) then + (push (first pending-type) (first pending)) + (setf pending-type (rest pending-type)) + else (setf (first pending) (first (first pending)))) + (if* (second pending) then + (push (first pending) (second pending)) + else (setf (second pending) (list (first pending)))) + (setf pending (rest pending)) + (setf state state-dtd-!-element-type-paren-choice) + ) + elseif (eq #\, ch) then + (when (and (rest (first pending)) (not (eq :seq (first pending-type)))) + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal '|' and ',' mix starting at '" + (compute-coll-string coll) + "'"))) + (push :seq pending-type) + (setf state state-dtd-!-element-type-paren-choice) + elseif (eq #\| ch) then + (when (and (rest (first pending)) (not (eq :choice (first pending-type)))) + (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal '|' and ',' mix starting at '" + (compute-coll-string coll) + "'"))) + (push :choice pending-type) + (setf state state-dtd-!-element-type-paren-choice) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD ch) then (return) + else (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD ch) then (return) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD contents following ch) then (return) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD contents following ch) then (return) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD contents following (length (first contents-to-return)) 1) + (setf (first contents-to-return) + (list (append (list :choice) + (first contents-to-return))))) + (push :* (first contents-to-return)) + else (clear-coll coll) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "illegal DTD contents in ch) then + (let ((token (compute-tag coll))) + (when (not (or (eq token :EMPTY) (eq token :ANY))) + (xml-error (concatenate 'string + "illegal DTD ch) then (return) + else (xml-error (concatenate 'string + "expected '>', got '" + (string ch) + "' in DTD include-count 0) (not (eq prev-state state-dtdstart))) then + (xml-error "unexpected end of input while processing external DTD")) + (values nil :end-dtd)) + (t + (print (list tag-to-return contents-to-return)) + (error "need to support dtd state:~s" state))) + ) + )) + +(defun external-param-reference (tokenbuf old-coll external-callback) + (declare (:fbound next-token) (ignorable old-coll) (optimize (speed 3) (safety 1))) + (setf (iostruct-seen-parameter-reference tokenbuf) t) + (macrolet ((add-to-entity-buf (entity-symbol p-value) + `(progn + (push (make-tokenbuf :cur 0 :max (length ,p-value) :data ,p-value) + (iostruct-entity-bufs tokenbuf)))) + (clear-coll (coll) + `(setf (collector-next ,coll) 0)) + (un-next-char (ch) + `(push ,ch (iostruct-unget-char tokenbuf))) + (add-to-coll (coll ch) + `(let ((.next. (collector-next ,coll))) + (if* (>= .next. (collector-max ,coll)) + then (grow-and-add ,coll ,ch) + else (setf (schar (collector-data ,coll) .next.) + ,ch) + (setf (collector-next ,coll) (1+ .next.)))))) + (let ((ch (get-next-char tokenbuf)) + (coll (get-collector)) + p-value entity-symbol) + (add-to-coll coll ch) + (when (not (xml-name-start-char-p ch)) + (dotimes (i 15) + (add-to-coll coll ch) + (setq ch (get-next-char tokenbuf)) + (if* (null ch) + then (return))) + (xml-error (concatenate 'string + "Illegal DTD parameter entity name starting at: " + (compute-coll-string coll)))) + (loop + (setf ch (get-next-char tokenbuf)) + (if* (eq #\; ch) then + (setf entity-symbol (compute-tag coll)) + (clear-coll coll) + #+ignore (format t "entity symbol: ~s entities: ~s match: ~s~%" + entity-symbol (iostruct-parameter-entities tokenbuf) + (assoc entity-symbol + (iostruct-parameter-entities tokenbuf))) + (if* (and (iostruct-do-entity tokenbuf) + (setf p-value + (assoc entity-symbol + (iostruct-parameter-entities tokenbuf)))) then + (setf p-value (rest p-value)) + (when (member entity-symbol (iostruct-entity-names tokenbuf)) + (xml-error (concatenate 'string + "entity:" + (string entity-symbol) + " in recursive reference"))) + (push entity-symbol (iostruct-entity-names tokenbuf)) + (if* (stringp p-value) then + (setf p-value (concatenate 'string " " p-value " ")) + (add-to-entity-buf entity-symbol p-value) + elseif (null external-callback) then + (setf (iostruct-do-entity tokenbuf) nil) + elseif p-value then + (let ((entity-stream (apply external-callback p-value))) + (when entity-stream + (let ((entity-buf (get-tokenbuf))) + (setf (tokenbuf-stream entity-buf) entity-stream) + (unicode-check entity-stream tokenbuf) + (add-to-entity-buf entity-symbol " ") + (push entity-buf + (iostruct-entity-bufs tokenbuf)) + (let ((count 0) cch + (string "