2 ;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA
4 ;; This code is free software; you can redistribute it and/or
5 ;; modify it under the terms of the version 2.1 of
6 ;; the GNU Lesser General Public License as published by
7 ;; the Free Software Foundation, as clarified by the AllegroServe
8 ;; prequel found in license-allegroserve.txt.
10 ;; This code is distributed in the hope that it will be useful,
11 ;; but without any warranty; without even the implied warranty of
12 ;; merchantability or fitness for a particular purpose. See the GNU
13 ;; Lesser General Public License for more details.
15 ;; Version 2.1 of the GNU Lesser General Public License is in the file
16 ;; license-lgpl.txt that was distributed with this file.
17 ;; If it is not present, you can access it from
18 ;; http://www.gnu.org/copyleft/lesser.txt (until superseded by a newer
19 ;; version) or write to the Free Software Foundation, Inc., 59 Temple Place,
20 ;; Suite 330, Boston, MA 02111-1307 USA
24 ;; 10/14/00 add namespace support
26 (in-package :net.xml.parser)
28 (pxml-dribble-bug-hook "$Id$")
30 ;; state titles can be better chosen and explained
32 (defvar *debug-xml* nil)
34 (defmethod parse-xml ((str string) &key external-callback general-entities parameter-entities
35 content-only uri-to-package)
36 (declare (optimize (speed 3) (safety 1)))
37 (parse-xml (make-string-input-stream str) :external-callback external-callback
38 :general-entities general-entities
39 :parameter-entities parameter-entities :content-only content-only
40 :uri-to-package uri-to-package))
42 (defmethod parse-xml ((p stream) &key external-callback general-entities
43 parameter-entities content-only uri-to-package)
44 (declare (optimize (speed 3) (safety 1)))
45 (pxml-internal0 p nil external-callback general-entities parameter-entities content-only
48 (eval-when (compile load eval)
49 (defconstant state-docstart 0) ;; looking for XMLdecl, Misc, doctypedecl, 1st element
50 (defconstant state-docstart-misc 1) ;; looking for Misc, doctypedecl, 1st element
51 (defconstant state-docstart-misc2 2) ;; looking for Misc, 1st element
52 (defconstant state-element-done 3) ;; looking for Misc
53 (defconstant state-element-contents 4) ;; looking for element content
56 (defun all-xml-whitespace-p (val)
57 (dotimes (i (length val) t)
58 (when (not (xml-space-p (elt val i))) (return nil))))
60 (defun pxml-internal0 (p read-sequence-func external-callback
61 general-entities parameter-entities content-only uri-to-package)
62 (declare (optimize (speed 3) (safety 1)))
63 (let ((tokenbuf (make-iostruct :tokenbuf (get-tokenbuf)
65 :read-sequence-func read-sequence-func)))
66 ;; set up stream right
67 (setf (tokenbuf-stream (iostruct-tokenbuf tokenbuf)) p)
68 ;; set up user specified entities
69 (setf (iostruct-parameter-entities tokenbuf) parameter-entities)
70 (setf (iostruct-general-entities tokenbuf) general-entities)
71 (setf (iostruct-uri-to-package tokenbuf) uri-to-package)
72 ;; look for Unicode file
73 (unicode-check p tokenbuf)
75 (values (pxml-internal tokenbuf external-callback content-only)
76 (iostruct-uri-to-package tokenbuf))
77 (dolist (entity-buf (iostruct-entity-bufs tokenbuf))
78 (when (streamp (tokenbuf-stream entity-buf))
79 (close (tokenbuf-stream entity-buf))
80 (put-back-tokenbuf entity-buf))))
83 (defun pxml-internal (tokenbuf external-callback content-only)
84 (declare (optimize (speed 3) (safety 1)))
85 (let ((state state-docstart)
95 (multiple-value-bind (val kind kind2)
96 (next-token tokenbuf external-callback attlist-data)
98 (format t "val: ~s kind: ~s kind2: ~s state: ~s~%" val kind kind2 state))
101 (if* (and (listp val) (eq :xml (first val)) (eq kind :xml) (eq kind2 :end-tag))
103 (check-xmldecl val tokenbuf)
104 (when (not content-only) (push val guts))
105 (setf state state-docstart-misc)
106 elseif (eq kind :comment)
108 (when (not content-only) (push val guts))
109 (setf state state-docstart-misc)
110 elseif (and (listp val) (eq :DOCTYPE (first val)))
112 (if* (eq (third val) :SYSTEM) then
113 (setf system-string (fourth val))
114 (setf val (remove (third val) val))
115 (setf val (remove (third val) val))
116 elseif (eq (third val) :PUBLIC) then
117 (setf public-string (normalize-public-value (fourth val)))
118 (setf system-string (fifth val))
119 (setf val (remove (third val) val))
120 (setf val (remove (third val) val))
121 (setf val (remove (third val) val)))
123 (if* external-callback then
124 (let ((ext-stream (apply external-callback
125 (list (parse-uri system-string)
130 (let (ext-io (entity-buf (get-tokenbuf)))
131 (setf (tokenbuf-stream entity-buf) ext-stream)
132 (setf ext-io (make-iostruct :tokenbuf entity-buf
134 (iostruct-do-entity tokenbuf)
136 (iostruct-read-sequence-func tokenbuf)))
137 (unicode-check ext-stream ext-io)
138 (setf (iostruct-parameter-entities ext-io)
139 (iostruct-parameter-entities tokenbuf))
140 (setf (iostruct-general-entities ext-io)
141 (iostruct-general-entities tokenbuf))
143 (setf val (append val
148 t external-callback)))))
149 (setf (iostruct-seen-any-dtd tokenbuf) t)
150 (setf (iostruct-seen-external-dtd tokenbuf) t)
151 (setf (iostruct-seen-parameter-reference tokenbuf)
152 (iostruct-seen-parameter-reference ext-io))
153 (setf (iostruct-general-entities tokenbuf)
154 (iostruct-general-entities ext-io))
155 (setf (iostruct-parameter-entities tokenbuf)
156 (iostruct-parameter-entities ext-io))
157 (setf (iostruct-do-entity tokenbuf)
158 (iostruct-do-entity ext-io))
159 (dolist (entity-buf2 (iostruct-entity-bufs ext-io))
160 (when (streamp (tokenbuf-stream entity-buf2))
161 (close (tokenbuf-stream entity-buf2))
162 (put-back-tokenbuf entity-buf2)))
163 (close (tokenbuf-stream entity-buf))
164 (put-back-tokenbuf entity-buf))
167 (setf (iostruct-do-entity tokenbuf) nil)))
169 (process-attlist (rest (rest val)) attlist-data))
170 (when (not content-only) (push val guts))
171 (setf state state-docstart-misc2)
175 (setf state state-docstart-misc)
176 elseif (eq kind :pcdata)
178 (when (or (not kind2) (not (all-xml-whitespace-p val)))
179 (if* (not kind2) then
180 (xml-error "An entity reference occured where only whitespace or the first element may occur")
182 (xml-error (concatenate 'string
183 "unrecognized content '"
184 (subseq val 0 (min (length val) 40)) "'"))))
185 (setf state state-docstart-misc)
186 elseif (or (symbolp val)
187 (and (listp val) (symbolp (first val))))
189 (when (eq kind :start-tag)
190 (setf val (add-default-values val attlist-data)))
191 (if* (and (eq kind :start-tag) (eq kind2 :end-tag))
192 then (push (list val) guts)
193 (setf state state-element-done)
194 elseif (eq kind :start-tag)
195 then (push (list val) pending)
196 ;;(format t "pending: ~s guts: ~s <1>~%" pending guts)
197 (when (iostruct-entity-bufs tokenbuf)
198 (push (if (symbolp val) val (first val)) entity-open-tags))
199 (setf state state-element-contents)
200 else (xml-error (concatenate 'string
201 "encountered token at illegal syntax position: '"
203 (if* (null guts) then
204 " at start of contents"
208 (format nil "~s" (first guts))
211 (print (list val kind kind2))
212 (break "need to check for other allowable docstarts")))
213 (#.state-docstart-misc2
214 (if* (eq kind :pcdata)
216 (when (or (not kind2) (not (all-xml-whitespace-p val)))
217 (if* (not kind2) then
218 (xml-error "An entity reference occured where only whitespace or the first element may occur")
220 (xml-error (concatenate 'string
221 "unrecognized content '"
222 (subseq val 0 (min (length val) 40)) "'"))))
223 elseif (and (listp val) (eq :comment (first val)))
225 (when (not content-only) (push val guts))
229 elseif (eq kind :eof)
231 (xml-error "unexpected end of file encountered")
232 elseif (or (symbolp val)
233 (and (listp val) (symbolp (first val))))
235 (when (eq kind :start-tag)
236 (setf val (add-default-values val attlist-data)))
237 (if* (and (eq kind :start-tag) (eq kind2 :end-tag))
238 then (push (list val) guts)
239 (setf state state-element-done)
240 elseif (eq kind :start-tag)
241 then (push (list val) pending)
242 ;;(format t "pending: ~s guts: ~s <2>~%" pending guts)
243 (when (iostruct-entity-bufs tokenbuf)
244 (push (if (symbolp val) val (first val)) entity-open-tags))
245 (setf state state-element-contents)
246 else (xml-error (concatenate 'string
247 "encountered token at illegal syntax position: '"
249 (if* (null guts) then
250 " at start of contents"
254 (format nil "~s" (first guts))
257 (error "this branch unexpected <1>")))
258 (#.state-docstart-misc
259 (if* (eq kind :pcdata)
261 (when (or (not kind2) (not (all-xml-whitespace-p val)))
262 (if* (not kind2) then
263 (xml-error "An entity reference occured where only whitespace or the first element may occur")
265 (xml-error (concatenate 'string
266 "unrecognized content '"
267 (subseq val 0 (min (length val) 40)) "'"))))
268 elseif (and (listp val) (eq :DOCTYPE (first val)))
270 (if* (eq (third val) :SYSTEM) then
271 (setf system-string (fourth val))
272 (setf val (remove (third val) val))
273 (setf val (remove (third val) val))
274 elseif (eq (third val) :PUBLIC) then
275 (setf public-string (normalize-public-value (fourth val)))
276 (setf system-string (fifth val))
277 (setf val (remove (third val) val))
278 (setf val (remove (third val) val))
279 (setf val (remove (third val) val)))
281 (if* external-callback then
282 (let ((ext-stream (apply external-callback
283 (list (parse-uri system-string)
288 (let (ext-io (entity-buf (get-tokenbuf)))
289 (setf (tokenbuf-stream entity-buf) ext-stream)
290 (setf ext-io (make-iostruct :tokenbuf entity-buf
292 (iostruct-do-entity tokenbuf)
294 (iostruct-read-sequence-func tokenbuf)))
295 (unicode-check ext-stream ext-io)
296 (setf (iostruct-parameter-entities ext-io)
297 (iostruct-parameter-entities tokenbuf))
298 (setf (iostruct-general-entities ext-io)
299 (iostruct-general-entities tokenbuf))
301 (setf val (append val
306 t external-callback)))))
307 (setf (iostruct-seen-any-dtd tokenbuf) t)
308 (setf (iostruct-seen-external-dtd tokenbuf) t)
309 (setf (iostruct-seen-parameter-reference tokenbuf)
310 (iostruct-seen-parameter-reference ext-io))
311 (setf (iostruct-general-entities tokenbuf)
312 (iostruct-general-entities ext-io))
313 (setf (iostruct-parameter-entities tokenbuf)
314 (iostruct-parameter-entities ext-io))
315 (setf (iostruct-do-entity tokenbuf)
316 (iostruct-do-entity ext-io))
317 (dolist (entity-buf2 (iostruct-entity-bufs ext-io))
318 (when (streamp (tokenbuf-stream entity-buf2))
319 (close (tokenbuf-stream entity-buf2))
320 (put-back-tokenbuf entity-buf2)))
321 (close (tokenbuf-stream entity-buf))
322 (put-back-tokenbuf entity-buf))
325 (setf (iostruct-do-entity tokenbuf) nil)))
327 (process-attlist (rest (rest val)) attlist-data))
328 (when (not content-only) (push val guts))
329 (setf state state-docstart-misc2)
330 elseif (and (listp val) (eq :comment (first val)))
332 (when (not content-only) (push val guts))
336 elseif (or (symbolp val)
337 (and (listp val) (symbolp (first val))))
339 (when (eq kind :start-tag)
340 (setf val (add-default-values val attlist-data)))
341 (if* (and (eq kind :start-tag) (eq kind2 :end-tag))
342 then (push (list val) guts)
343 (setf state state-element-done)
344 elseif (eq kind :start-tag)
345 then (push (list val) pending)
346 ;;(format t "pending: ~s guts: ~s <3>~%" pending guts)
347 (when (iostruct-entity-bufs tokenbuf)
348 (push (if (symbolp val) val (first val)) entity-open-tags))
349 (setf state state-element-contents)
350 else (xml-error (concatenate 'string
351 "encountered token at illegal syntax position: '"
355 (format nil "~s" (first guts))
358 (print (list val kind kind2))
359 (break "check for other docstart-misc states")))
360 (#.state-element-contents
361 (if* (or (symbolp val)
362 (and (listp val) (symbolp (first val))))
364 (when (eq kind :start-tag)
365 (setf val (add-default-values val attlist-data)))
366 (if* (eq kind :end-tag)
367 then (let ((candidate (first (first pending))))
368 (when (listp candidate) (setf candidate (first candidate)))
369 (if* (eq candidate val)
371 (if* (iostruct-entity-bufs tokenbuf) then
372 (when (not (eq (first entity-open-tags) val))
376 " element closed in entity that did not open it")))
377 (setf entity-open-tags (rest entity-open-tags))
379 (when (eq (first entity-open-tags) val)
383 " element closed outside of entity that did not open it")))
385 (if* (= (length pending) 1)
387 (push (first pending) guts)
388 (setf state state-element-done)
390 (setf (second pending)
391 (append (second pending) (list (first pending)))))
392 (setf pending (rest pending))
393 ;;(format t "pending: ~s guts: ~s <4>~%" pending guts)
394 else (xml-error (format nil
395 "encountered end tag: ~s expected: ~s"
397 elseif (and (eq kind :start-tag) (eq kind2 :end-tag))
399 (setf (first pending)
400 (append (first pending) (list (list val))))
401 ;;(format t "pending: ~s guts: ~s <5>~%" pending guts)
402 elseif (eq kind :start-tag)
404 (push (list val) pending)
405 ;;(format t "pending: ~s guts: ~s <6>~%" pending guts)
406 (when (iostruct-entity-bufs tokenbuf)
407 (push (if (symbolp val) val (first val)) entity-open-tags))
408 elseif (eq kind :cdata) then
409 (setf (first pending)
410 (append (first pending) (rest val)))
411 (let ((old (first pending))
414 (if* (and (stringp (first new)) (stringp item)) then
416 (concatenate 'string (first new) item))
417 else (push item new)))
418 (setf (first pending) (reverse new)))
419 elseif (eq kind :comment) then
420 (when (not content-only) (push val guts))
423 (setf (first pending)
424 (append (first pending) (list val)))
425 elseif (eq kind :eof)
427 (xml-error "unexpected end of file encountered")
428 else (xml-error (format nil "unexpected token: ~s" val)))
429 elseif (eq kind :pcdata)
431 (setf (first pending)
432 (append (first pending) (list val)))
433 (let ((old (first pending))
436 (if* (and (stringp (first new)) (stringp item)) then
438 (concatenate 'string (first new) item))
439 else (push item new)))
440 (setf (first pending) (reverse new)))
441 else (xml-error (format nil "unexpected token: ~s" val))))
442 (#.state-element-done
443 (if* (eq kind :pcdata)
445 (when (or (not kind2) (not (all-xml-whitespace-p val)))
446 (if* (not kind2) then
447 (xml-error "An entity reference occured where only whitespace or the first element may occur")
449 (xml-error (concatenate 'string
450 "unrecognized content '"
451 (subseq val 0 (min (length val) 40)) "'"))))
452 elseif (eq kind :eof) then
453 (put-back-tokenbuf (iostruct-tokenbuf tokenbuf))
454 (return (nreverse guts))
455 elseif (eq kind :comment) then
456 (when (not content-only) (push val guts))
460 (xml-error (concatenate 'string
461 "encountered token at illegal syntax position: '"
465 (format nil "~s" (first guts))
469 (error "need to support state:~s token:~s kind:~s kind2:~s <parse>" state val kind kind2)))
472 (eval-when (compile load eval)
473 (defconstant state-pcdata 0) ;;looking for < (tag start), & (reference); all else is string data
474 (defconstant state-readtagfirst 1) ;; seen < looking for /,?,!,name start
475 (defconstant state-readtag-? 2) ;; seen <? looking for space,char
476 (defconstant state-readtag-! 3) ;; seen <! looking for name,[,-
477 (defconstant state-readtag-end 4) ;; found </ looking for tag name
478 (defconstant state-readtag 5) ;; found < name start looking for more name, /, >
479 (defconstant state-findattributename 6) ;; found <?xml space looking for ?,>,space,name start
480 (defconstant state-readpi 7)
481 (defconstant state-noattributename 8)
482 (defconstant state-attribname 9) ;; found <?xml space name start looking for more name,=
483 (defconstant state-attribstartvalue 10) ;; found <?xml space name= looking for ',"
484 (defconstant state-attribvaluedelim 11)
485 (defconstant state-readtag-!-name 12) ;; seen <!name(start) looking for more name chars or space
486 (defconstant state-readtag-!-conditional 13) ;; found <![ looking for CDATA, INCLUDE, IGNORE
487 (defconstant state-readtag-!-comment 14)
488 (defconstant state-readtag-!-readcomment 15)
489 (defconstant state-readtag-!-readcomment2 16)
490 (defconstant state-readtag-end-bracket 17)
491 (defconstant state-readpi2 18) ;; found <?name space char looking for char,?
492 (defconstant state-prereadpi 19);; found <?name space looking for space,character
493 (defconstant state-pre-!-contents 20) ;; found <!name space looking for > or contents
494 (defconstant state-!-contents 21) ;; found <!name space name start looking for more name,>,[,space
495 (defconstant state-!-doctype 22) ;; found <!DOCTYPE space looking for space,>,[,name
496 (defconstant state-begin-dtd 23)
497 (defconstant state-!-doctype-ext 24) ;; found <!DOCTYPE space name space name start looking for name,space
498 (defconstant state-!-doctype-system 25) ;; found <!DOCTYPE name SYSTEM looking for ',"
499 (defconstant state-!-doctype-public 26) ;; found <!DOCTYPE name PUBLIC looking for ',"
500 (defconstant state-!-doctype-system2 27) ;; found <!DOCTYPE name SYSTEM " looking for chars,"
501 (defconstant state-!-doctype-system3 28) ;; found <!DOCTYPE name SYSTEM ' looking for chars,'
502 (defconstant state-!-doctype-ext2 29) ;; found <!DOCTYPE name SYSTEM/PUBLIC etc. looking for space,>,[
503 (defconstant state-!-doctype-ext3 30) ;; processed DTD looking for space,>
504 (defconstant state-!-doctype-public2 31) ;; found <!DOCTYPE name PUBLIC " looking for text or "
505 (defconstant state-!-doctype-public3 32) ;; found <!DOCTYPE name PUBLIC ' looking for text or '
506 (defconstant state-readtag2 33) ;; found <name looking for space,/,>,attrib name
507 (defconstant state-readtag3 34) ;; found <name/ or <name / looking for >
508 (defconstant state-readtag4 35) ;; found <name attrib-name start looking for more name,=
509 (defconstant state-readtag5 36) ;; found attrib= looking for ',"
510 (defconstant state-readtag6 37) ;; found attrib=['"] looking for end delimiter,value,reference
511 (defconstant state-readtag7 38) ;; found & inside attribute value, looking for # or name start
512 (defconstant state-readtag8 39) ;; found &# in attribute value looking for char code
513 (defconstant state-readtag9 40) ;; found &name start looking for more name,;
514 (defconstant state-readtag10 41) ;; found &#x in attribute value looking for hex code
515 (defconstant state-readtag11 42) ;; found &#[0-9] looking for more digits,;
516 (defconstant state-readtag-end2 43) ;; found </ & tag name start looking for more tag, space, >
517 (defconstant state-readtag-end3 44) ;; found </ end tag name space looking for >
518 (defconstant state-pcdata2 45) ;; seen & looking for name start
519 (defconstant state-pcdata3 46) ;; seen &# looking for character reference code
520 (defconstant state-pcdata4 47) ;; working on entity reference name looking for ;
521 (defconstant state-pcdata5 48) ;; working on hex character code reference
522 (defconstant state-pcdata6 49) ;; working on decimal character code reference
523 (defconstant state-findattributename0 50)
524 (defconstant state-readtag6a 51)
525 (defconstant state-readtag-!-conditional4 52)
526 (defconstant state-readtag-!-conditional5 53)
527 (defconstant state-readtag-!-conditional6 54)
528 (defconstant state-readtag-!-conditional7 55)
529 ;;(defconstant state-pcdata-parsed 56)
530 (defconstant state-pcdata7 57)
531 (defconstant state-pcdata8 58)
532 (defconstant state-readtag12 59)
533 (defconstant state-attribname2 60)
536 (defun next-token (tokenbuf external-callback attlist-data)
537 (declare (optimize (speed 3) (safety 1)))
538 ;; return two values:
539 ;; the next token from the stream.
542 ;; if read-sequence-func is non-nil,
543 ;; read-sequence-func is called to fetch the next character
544 (macrolet ((add-to-entity-buf (entity-symbol p-value)
546 (push (make-tokenbuf :cur 0 :max (length p-value) :data p-value)
547 (iostruct-entity-bufs tokenbuf))))
550 `(push ,ch (iostruct-unget-char tokenbuf)))
553 `(setf (collector-next ,coll) 0))
555 (add-to-coll (coll ch)
556 `(let ((.next. (collector-next ,coll)))
557 (if* (>= .next. (collector-max ,coll))
558 then (grow-and-add ,coll ,ch)
559 else (setf (schar (collector-data ,coll) .next.)
561 (setf (collector-next ,coll) (1+ .next.)))))
563 (to-preferred-case (ch)
564 ;; should check the case mode
565 `(char-downcase ,ch))
569 (let ((state state-pcdata)
570 (coll (get-collector))
571 (entity (get-collector))
573 (tag-to-return-string)
581 (special-tag-count 0)
582 (attrib-value-tokenbuf)
592 (setq ch (get-next-char tokenbuf))
593 (when *debug-xml* (format t "ch: ~s code: ~x state:~s entity-names:~s~%"
594 ch (char-code ch) state (iostruct-entity-names tokenbuf)))
596 then (return) ; eof -- exit loop
604 (setf entity-source (first (iostruct-entity-bufs tokenbuf)))
605 (if* (> (collector-next coll) 0)
606 then ; have collected something, return this string
607 (un-next-char ch) ; push back the <
610 (setq state state-readtagfirst))
612 then (setf state state-pcdata2)
613 (setf entity-source (first (iostruct-entity-bufs tokenbuf)))
615 elseif (eq #\] ch) then (setf state state-pcdata7)
616 elseif (not (xml-char-p ch)) then
617 (xml-error (concatenate 'string
618 "Illegal character: "
620 " detected in input"))
622 (add-to-coll coll ch)
624 (if* (not (eq ch #\return))
625 then (add-to-coll coll ch))))
628 (if* (eq #\] ch) then (setf state state-pcdata8)
629 else (setf state state-pcdata)
630 (add-to-coll coll #\]) (un-next-char ch)))
633 (if* (eq #\> ch) then
634 (add-to-coll coll #\])
635 (add-to-coll coll #\])
636 (add-to-coll coll #\>)
638 (add-to-coll coll ch)
639 (setq ch (get-next-char tokenbuf))
642 (xml-error (concatenate 'string
643 "content cannot contain ']]>':'"
644 (compute-coll-string coll)
646 elseif (eq #\] ch) then
647 (add-to-coll coll #\])
648 else (setf state state-pcdata)
649 (add-to-coll coll #\]) (add-to-coll coll #\]) (un-next-char ch)))
653 then (setf state state-pcdata3)
654 elseif (xml-name-start-char-p ch)
655 then (setf state state-pcdata4)
657 else (clear-coll coll)
659 (add-to-coll coll ch)
660 (setq ch (get-next-char tokenbuf))
663 (xml-error (concatenate 'string
664 "illegal reference name, starting at: '&"
665 (compute-coll-string coll)
671 then (setf state state-pcdata5)
672 elseif (<= (char-code #\0) (char-code ch) (char-code #\9))
673 then (setf state state-pcdata6)
675 else (clear-coll coll)
677 (add-to-coll coll ch)
678 (setq ch (get-next-char tokenbuf))
681 (xml-error (concatenate 'string
682 "illegal character reference code, starting at: '&#"
683 (compute-coll-string coll)
688 (if* (xml-name-char-p ch)
689 then (add-to-coll entity ch)
691 then (let ((entity-symbol (compute-tag entity)))
693 (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then
696 (string entity-symbol)
697 " reference cannot be constructed from entity reference/character data sequence"))
699 (setf entity-source nil))
700 (if* (string= (symbol-name entity-symbol) "amp") then (add-to-coll coll #\&)
701 elseif (string= (symbol-name entity-symbol) "lt") then (add-to-coll coll #\<)
702 elseif (string= (symbol-name entity-symbol) "gt") then (add-to-coll coll #\>)
703 elseif (string= (symbol-name entity-symbol) "apos") then (add-to-coll coll #\')
704 elseif (string= (symbol-name entity-symbol) "quot") then (add-to-coll coll #\")
707 (if* (and (iostruct-do-entity tokenbuf)
710 (iostruct-general-entities tokenbuf)))) then
711 (setf p-value (rest p-value))
712 (when (member entity-symbol (iostruct-entity-names tokenbuf))
713 (xml-error (concatenate 'string
715 (string entity-symbol)
716 " in recursive reference")))
717 (push entity-symbol (iostruct-entity-names tokenbuf))
718 (if* (stringp p-value) then
719 (add-to-entity-buf entity-symbol p-value)
720 elseif (null external-callback) then
721 (setf (iostruct-do-entity tokenbuf) nil)
723 (let ((entity-stream (apply external-callback p-value)))
724 (if* entity-stream then
725 (let ((entity-buf (get-tokenbuf)))
726 (setf (tokenbuf-stream entity-buf) entity-stream)
727 (unicode-check entity-stream tokenbuf)
729 (iostruct-entity-bufs tokenbuf))
730 ;; check for possible external textdecl
733 (if* (dotimes (i (length string) t)
734 (setf cch (get-next-char tokenbuf))
739 (schar string count)))
744 (when (< count 0) (return))
745 (un-next-char (schar string count))
747 ;; swallow <?xml token
755 (when (< count 0) (return))
756 (un-next-char (schar string count))
760 (xml-error (concatenate 'string
761 "Reference to unparsed entity "
762 (string entity-symbol)))
765 elseif (or (not (iostruct-seen-any-dtd tokenbuf))
766 (iostruct-standalonep tokenbuf)
767 (and (iostruct-seen-any-dtd tokenbuf)
768 (not (iostruct-seen-external-dtd tokenbuf))
769 (not (iostruct-seen-parameter-reference tokenbuf))))
771 (xml-error (concatenate 'string
772 (string entity-symbol)
773 " must have entity declaration before being referenced"))
776 (setq state state-pcdata)
777 else (let ((tmp (compute-coll-string entity)))
780 (add-to-coll coll ch)
781 (setq ch (get-next-char tokenbuf))
784 (xml-error (concatenate 'string
785 "reference not terminated by ';', starting at: '&"
787 (compute-coll-string coll)
792 (let ((code (char-code ch)))
794 then (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then
797 (string (code-char char-code))
798 " reference cannot be constructed from entity reference/character data sequence"))
800 (setf entity-source nil))
801 (when (not (xml-char-p (code-char char-code)))
804 "Character reference: "
805 (format nil "~s" char-code)
806 " (decimal) is not valid XML input character")))
807 (add-to-coll coll (code-char char-code))
809 (setq state state-pcdata)
810 elseif (<= (char-code #\0) code (char-code #\9))
811 then (setf char-code (+ (* char-code 16) (- code (char-code #\0))))
812 elseif (<= (char-code #\A) code (char-code #\F))
813 then (setf char-code (+ 10 (* char-code 16) (- code (char-code #\A))))
814 elseif (<= (char-code #\a) code (char-code #\f))
815 then (setf char-code (+ 10 (* char-code 16) (- code (char-code #\a))))
816 else (clear-coll coll)
818 (add-to-coll coll ch)
819 (setq ch (get-next-char tokenbuf))
822 (xml-error (concatenate 'string
823 "illegal hexidecimal character reference code, starting at: '"
824 (compute-coll-string coll)
825 "', calculated char code: "
826 (format nil "~s" char-code)))
830 (let ((code (char-code ch)))
832 then (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then
835 (string (code-char char-code))
836 " reference cannot be constructed from entity reference/character data sequence"))
838 (setf entity-source nil))
839 (when (not (xml-char-p (code-char char-code)))
842 "Character reference: "
843 (format nil "~s" char-code)
844 " (decimal) is not valid XML input character")))
845 (add-to-coll coll (code-char char-code))
847 (setq state state-pcdata)
848 elseif (<= (char-code #\0) code (char-code #\9))
849 then (setf char-code (+ (* char-code 10) (- code (char-code #\0))))
850 else (clear-coll coll)
852 (add-to-coll coll ch)
853 (setq ch (get-next-char tokenbuf))
856 (xml-error (concatenate 'string
857 "illegal decimal character reference code, starting at: '"
858 (compute-coll-string coll)
859 "', calculated char code: "
860 (format nil "~s" char-code)))
864 (if* (xml-name-start-char-p ch)
865 then (setf state state-readtag-end2)
867 else (clear-coll coll)
869 (add-to-coll coll ch)
870 (setq ch (get-next-char tokenbuf))
873 (xml-error (concatenate 'string
874 "illegal end tag name, starting at: '</"
875 (compute-coll-string coll)
879 (#.state-readtag-end2
880 (if* (xml-name-char-p ch)
881 then (add-to-coll coll ch)
882 elseif (eq #\> ch) then
883 (let ((tag-string (compute-coll-string coll)))
884 (when (and (iostruct-ns-scope tokenbuf)
886 (first (first (iostruct-ns-scope tokenbuf)))))
887 (dolist (item (second (first (iostruct-ns-scope tokenbuf))))
888 (setf (iostruct-ns-to-package tokenbuf)
889 (remove (assoc item (iostruct-ns-to-package tokenbuf))
890 (iostruct-ns-to-package tokenbuf))))
891 (setf (iostruct-ns-scope tokenbuf)
892 (rest (iostruct-ns-scope tokenbuf)))))
893 (setq tag-to-return (compute-tag coll *package*
894 (iostruct-ns-to-package tokenbuf)))
896 elseif (xml-space-p ch) then (setf state state-readtag-end3)
897 (let ((tag-string (compute-coll-string coll)))
898 (when (and (iostruct-ns-scope tokenbuf)
900 (first (first (iostruct-ns-scope tokenbuf)))))
901 (setf (iostruct-ns-scope tokenbuf)
902 (rest (iostruct-ns-scope tokenbuf)))))
903 (setq tag-to-return (compute-tag coll *package*
904 (iostruct-ns-to-package tokenbuf)))
905 else (let ((tmp (compute-coll-string coll)))
908 (add-to-coll coll ch)
909 (setq ch (get-next-char tokenbuf))
912 (xml-error (concatenate 'string
913 "illegal end tag name, starting at: '</"
915 (compute-coll-string coll)
919 (#.state-readtag-end3
920 (if* (xml-space-p ch) then nil
921 elseif (eq #\> ch) then (return)
922 else (let ((tmp (compute-coll-string coll)))
925 (add-to-coll coll ch)
926 (setq ch (get-next-char tokenbuf))
929 (xml-error (concatenate 'string
930 "illegal end tag name, starting at: '"
931 (compute-coll-string coll)
932 "' end tag name: " tmp )))
935 (#.state-readtagfirst
936 ; starting to read a tag name
938 then (setf state state-readtag-end)
940 then (setf state state-readtag-?)
941 (setf empty-delim #\?)
943 then (setf state state-readtag-!)
944 (setf empty-delim nil)
945 elseif (xml-name-start-char-p ch)
946 then (setf state state-readtag)
947 (setf empty-delim #\/)
949 else (clear-coll coll)
951 (add-to-coll coll ch)
952 (setq ch (get-next-char tokenbuf))
955 (xml-error (concatenate 'string
956 "illegal character following '<', starting at '"
957 (compute-coll-string coll)
962 (if* (xml-name-start-char-p ch)
964 (setf state state-readtag-!-name)
968 (setf state state-readtag-!-conditional)
971 (setf state state-readtag-!-comment)
975 (add-to-coll coll ch)
976 (setq ch (get-next-char tokenbuf))
979 (xml-error (concatenate 'string
980 "illegal character following '<!', starting at '<!"
981 (compute-coll-string coll)
985 (#.state-readtag-!-conditional
986 (if* (eq #\C ch) then
987 (setf state state-readtag-!-conditional4)
988 (setf special-tag-count 1)
989 else (clear-coll coll)
991 (add-to-coll coll ch)
992 (setq ch (get-next-char tokenbuf))
995 (xml-error (concatenate 'string
996 "illegal character following '<![', starting at '<!["
997 (compute-coll-string coll)
1001 (#.state-readtag-!-conditional4
1002 (if* (not (eq (elt "CDATA[" special-tag-count) ch))
1003 then (clear-coll coll)
1005 (add-to-coll coll ch)
1006 (setq ch (get-next-char tokenbuf))
1009 (xml-error (concatenate 'string
1010 "illegal token following '<![', starting at '<!["
1011 (subseq "CDATA[" 0 special-tag-count)
1012 (compute-coll-string coll)
1014 elseif (eq #\[ ch) then (setf state state-readtag-!-conditional5)
1015 else (incf special-tag-count)))
1017 (#.state-readtag-!-conditional5
1019 then (setf state state-readtag-!-conditional6)
1020 elseif (not (xml-char-p ch)) then
1021 (xml-error (concatenate 'string
1022 "Illegal character: "
1024 " detected in CDATA input"))
1025 else (add-to-coll coll ch)))
1027 (#.state-readtag-!-conditional6
1029 then (setf state state-readtag-!-conditional7)
1030 else (setf state state-readtag-!-conditional5)
1031 (add-to-coll coll #\])
1032 (add-to-coll coll ch)))
1034 (#.state-readtag-!-conditional7
1037 (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then
1039 "CDATA cannot be constructed from entity reference/character data sequence")
1041 (setf entity-source nil))
1043 elseif (eq #\] ch) then
1044 (add-to-coll coll #\]) ;; come back here to check again
1045 else (setf state state-readtag-!-conditional5)
1046 (add-to-coll coll #\])
1047 (add-to-coll coll #\])
1048 (add-to-coll coll ch)))
1050 (#.state-readtag-!-comment
1052 then (setf state state-readtag-!-readcomment)
1053 (setf tag-to-return :comment)
1054 else (clear-coll coll)
1056 (add-to-coll coll ch)
1057 (setq ch (get-next-char tokenbuf))
1060 (xml-error (concatenate 'string
1061 "illegal token following '<![-', starting at '<!-"
1062 (compute-coll-string coll)
1066 (#.state-readtag-!-readcomment
1068 then (setf state state-readtag-!-readcomment2)
1069 elseif (not (xml-char-p ch)) then
1070 (xml-error (concatenate 'string
1071 "Illegal character: "
1073 " detected in input"))
1074 else (add-to-coll coll ch)))
1076 (#.state-readtag-!-readcomment2
1078 then (setf state state-readtag-end-bracket)
1079 else (setf state state-readtag-!-readcomment)
1080 (add-to-coll coll #\-) (add-to-coll coll ch)))
1082 (#.state-readtag-end-bracket
1084 then (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then
1086 (concatenate 'string
1087 (string tag-to-return)
1088 " tag cannot be constructed from entity reference/character data sequence"))
1090 (setf entity-source nil))
1092 else (clear-coll coll)
1094 (add-to-coll coll ch)
1095 (setq ch (get-next-char tokenbuf))
1098 (xml-error (concatenate 'string
1099 "illegal token following '--' comment terminator, starting at '--"
1100 (compute-coll-string coll)
1105 (if* (xml-name-char-p ch) ;; starting char already passed more restrictive test
1107 (add-to-coll coll ch)
1109 (if* (xml-space-p ch) then
1110 (setf tag-to-return-string (compute-coll-string coll))
1112 (compute-tag coll *package*
1113 (iostruct-ns-to-package tokenbuf)))
1115 (setf state state-readtag2)
1116 elseif (eq #\> ch) then
1118 (compute-tag coll *package*
1119 (iostruct-ns-to-package tokenbuf)))
1122 elseif (eq #\/ ch) then
1124 (compute-tag coll *package*
1125 (iostruct-ns-to-package tokenbuf)))
1127 (setf state state-readtag3)
1128 else (dotimes (i 15)
1129 (add-to-coll coll ch)
1130 (setq ch (get-next-char tokenbuf))
1134 (concatenate 'string
1135 "illegal token name, starting at '"
1136 (compute-coll-string coll)
1141 (if* (xml-space-p ch) then nil
1142 elseif (eq #\> ch) then (return)
1143 elseif (eq #\/ ch) then (setf state state-readtag3)
1144 elseif (xml-name-start-char-p ch) then
1146 (setf state state-readtag4)
1147 else (clear-coll coll)
1149 (add-to-coll coll ch)
1150 (setq ch (get-next-char tokenbuf))
1154 (concatenate 'string
1155 "illegal token, starting at '"
1156 (compute-coll-string coll)
1157 "' following element token start: " (string tag-to-return)))
1161 (if* (xml-name-char-p ch) ;; starting char already passed more restrictive test
1163 (add-to-coll coll ch)
1164 elseif (eq #\= ch) then
1165 (setq attrib-name (compute-tag coll *package*
1166 (iostruct-ns-to-package tokenbuf)))
1168 (let ((name (symbol-name attrib-name)))
1169 (when (and (>= (length name) 5)
1170 (string= name "xmlns" :end1 5))
1171 (if* (= (length name) 5)
1173 (setf ns-token :none)
1174 elseif (eq (schar name 5) #\:)
1176 (setf ns-token (subseq name 6)))))
1177 (setf state state-readtag5)
1178 elseif (xml-space-p ch) then
1179 (setq attrib-name (compute-tag coll *package*
1180 (iostruct-ns-to-package tokenbuf)))
1182 (let ((name (symbol-name attrib-name)))
1183 (when (and (>= (length name) 5)
1184 (string= name "xmlns" :end1 5))
1185 (if* (= (length name) 5)
1187 (setf ns-token :none)
1189 (setf ns-token (subseq name 6)))))
1190 (setf state state-readtag12)
1191 else (let ((tmp (compute-coll-string coll)))
1194 (add-to-coll coll ch)
1195 (setq ch (get-next-char tokenbuf))
1199 (concatenate 'string
1200 "looking for attribute '=', found: '"
1201 (compute-coll-string coll)
1202 "' following attribute name: " tmp)))
1206 (if* (xml-space-p ch) then nil
1207 elseif (eq #\= ch) then (setf state state-readtag5)
1210 (add-to-coll coll ch)
1211 (setq ch (get-next-char tokenbuf))
1215 (concatenate 'string
1216 "looking for attribute '=', found: '"
1217 (compute-coll-string coll)
1218 "' following attribute name: " (string attrib-name)))))
1221 ;; begin to collect attribute value
1222 (if* (or (eq ch #\")
1224 then (setq value-delim ch)
1225 (let* ((tag-defaults (assoc tag-to-return attlist-data))
1226 (this-attrib (assoc attrib-name tag-defaults)))
1227 (when (and (second this-attrib) (not (eq (second this-attrib) :CDATA)))
1230 (setq state state-readtag6)
1231 elseif (xml-space-p ch) then nil
1234 (add-to-coll coll ch)
1235 (setq ch (get-next-char tokenbuf))
1239 (concatenate 'string
1240 "attribute value not delimited by ' or \" : '"
1241 (compute-coll-string coll)
1242 "' following attribute: " (string attrib-name)))
1246 (let ((from-entity (and attrib-value-tokenbuf
1247 (eq attrib-value-tokenbuf
1248 (first (iostruct-entity-bufs tokenbuf))))))
1249 (when (not from-entity) (setf attrib-value-tokenbuf nil))
1250 (if* from-entity then
1251 (if* (eq #\newline ch) then (setf ch #\space)
1252 elseif (eq #\return ch) then (setf ch #\space)
1253 elseif (eq #\tab ch) then (setf ch #\space)
1255 (if* (and (not from-entity) (eq ch value-delim))
1256 then (setq attrib-value (compute-coll-string coll))
1258 (setf attrib-value (normalize-attrib-value attrib-value)))
1260 (push attrib-name attribs-to-return)
1261 (push attrib-value attribs-to-return)
1263 (let ((package (assoc (parse-uri attrib-value)
1264 (iostruct-uri-to-package tokenbuf)
1266 (if* package then (setf package (rest package))
1269 (let ((i 0) new-package)
1271 (let* ((candidate (concatenate 'string
1272 "net.xml.namespace."
1273 (format nil "~s" i)))
1274 (exists (find-package candidate)))
1277 else (setf new-package (make-package candidate))
1278 (setf (iostruct-uri-to-package tokenbuf)
1279 (acons (parse-uri attrib-value) new-package
1280 (iostruct-uri-to-package tokenbuf)))
1281 (return new-package)))))))
1282 (setf (iostruct-ns-to-package tokenbuf)
1283 (acons ns-token package (iostruct-ns-to-package tokenbuf)))
1285 (if* (and (first (iostruct-ns-scope tokenbuf))
1286 (string= (first (first (iostruct-ns-scope tokenbuf)))
1287 tag-to-return-string))
1289 (push ns-token (second (first (iostruct-ns-scope tokenbuf))))
1291 (push (list tag-to-return-string (list ns-token))
1292 (iostruct-ns-scope tokenbuf)))
1293 (setf ns-token nil))
1294 (setq state state-readtag6a)
1295 elseif (eq #\newline ch) then
1296 (when (not (eq #\return last-ch)) (add-to-coll coll #\space))
1297 elseif (or (eq #\tab ch) (eq #\return ch)) then
1298 (add-to-coll coll #\space)
1300 then (setq state state-readtag7)
1301 (setf entity-source (first (iostruct-entity-bufs tokenbuf)))
1302 elseif (and (xml-char-p ch) (not (eq #\< ch)))
1303 then (add-to-coll coll ch)
1306 (add-to-coll coll ch)
1307 (setq ch (get-next-char tokenbuf))
1311 (concatenate 'string
1312 "attribute value cannot contain '<': '"
1313 (compute-coll-string coll)
1314 "' following attribute: " (string attrib-name)))
1319 (if* (xml-space-p ch) then (setf state state-readtag2)
1320 elseif (eq #\> ch) then (setf state state-readtag2)
1322 elseif (eq #\/ ch) then (setf state state-readtag3)
1323 else (clear-coll coll)
1325 (add-to-coll coll ch)
1326 (setq ch (get-next-char tokenbuf))
1330 (concatenate 'string
1331 "illegal token, starting at '"
1332 (compute-coll-string coll)
1333 "' following element token start: " (string tag-to-return)))
1338 then (setf state state-readtag8)
1339 elseif (xml-name-start-char-p ch)
1340 then (setf state state-readtag9)
1342 else (clear-coll coll)
1344 (add-to-coll coll ch)
1345 (setq ch (get-next-char tokenbuf))
1349 (concatenate 'string
1350 "attribute value contains illegal reference name: '&"
1351 (compute-coll-string coll)
1352 "' in attribute value for: " (string attrib-name)))
1357 then (setf state state-readtag10)
1358 elseif (<= (char-code #\0) (char-code ch) (char-code #\9))
1359 then (setf state state-readtag11)
1361 else (clear-coll coll)
1363 (add-to-coll coll ch)
1364 (setq ch (get-next-char tokenbuf))
1368 (concatenate 'string
1369 "attribute value contains illegal character reference code: '"
1370 (compute-coll-string coll)
1371 "' in attribute value for: " (string attrib-name)))
1375 (let ((code (char-code ch)))
1377 then (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then
1379 (concatenate 'string
1380 (string (code-char char-code))
1381 " reference cannot be constructed from entity reference/character data sequence"))
1383 (setf entity-source nil))
1384 (add-to-coll coll (code-char char-code))
1386 (setq state state-readtag6)
1387 elseif (<= (char-code #\0) code (char-code #\9))
1388 then (setf char-code (+ (* char-code 16) (- code (char-code #\0))))
1389 elseif (<= (char-code #\A) code (char-code #\F))
1390 then (setf char-code (+ 10 (* char-code 16) (- code (char-code #\A))))
1391 elseif (<= (char-code #\a) code (char-code #\f))
1392 then (setf char-code (+ 10 (* char-code 16) (- code (char-code #\a))))
1393 else (clear-coll coll)
1395 (add-to-coll coll ch)
1396 (setq ch (get-next-char tokenbuf))
1400 (concatenate 'string
1401 "attribute value contains illegal hexidecimal character reference code: '"
1402 (compute-coll-string coll)
1403 "' in attribute value for: " (string attrib-name)))
1407 (let ((code (char-code ch)))
1409 then (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then
1411 (concatenate 'string
1412 (string (code-char char-code))
1413 " reference cannot be constructed from entity reference/character data sequence"))
1415 (setf entity-source nil))
1416 (add-to-coll coll (code-char char-code))
1418 (setq state state-readtag6)
1419 elseif (<= (char-code #\0) code (char-code #\9))
1420 then (setf char-code (+ (* char-code 10) (- code (char-code #\0))))
1421 else (clear-coll coll)
1423 (add-to-coll coll ch)
1424 (setq ch (get-next-char tokenbuf))
1428 (concatenate 'string
1429 "attribute value contains illegal decimal character reference code: '"
1430 (compute-coll-string coll)
1431 "' in attribute value for: " (string attrib-name)))
1435 (if* (xml-name-char-p ch)
1436 then (add-to-coll entity ch)
1438 then (let ((entity-symbol (compute-tag entity)))
1440 (if* (not (eq entity-source (first (iostruct-entity-bufs tokenbuf)))) then
1442 (concatenate 'string
1443 (string entity-symbol)
1444 " reference cannot be constructed from entity reference/character data sequence"))
1446 (setf entity-source nil))
1447 (if* (string= (symbol-name entity-symbol) "amp") then (add-to-coll coll #\&)
1448 elseif (string= (symbol-name entity-symbol) "lt") then (add-to-coll coll #\<)
1449 elseif (string= (symbol-name entity-symbol) "gt") then (add-to-coll coll #\>)
1450 elseif (string= (symbol-name entity-symbol) "apos") then (add-to-coll coll #\')
1451 elseif (string= (symbol-name entity-symbol) "quot") then (add-to-coll coll #\")
1453 (if* (and (iostruct-do-entity tokenbuf)
1455 (assoc entity-symbol
1456 (iostruct-general-entities tokenbuf)))) then
1457 (setf p-value (rest p-value))
1458 (when (member entity-symbol (iostruct-entity-names tokenbuf))
1459 (xml-error (concatenate 'string
1461 (string entity-symbol)
1462 " in recursive reference")))
1463 (push entity-symbol (iostruct-entity-names tokenbuf))
1464 (if* (stringp p-value) then
1465 (add-to-entity-buf entity-symbol p-value)
1466 (when (not attrib-value-tokenbuf)
1467 (setf attrib-value-tokenbuf
1468 (first (iostruct-entity-bufs tokenbuf))))
1469 elseif (null external-callback) then
1470 (setf (iostruct-do-entity tokenbuf) nil)
1472 (let ((entity-stream (apply external-callback p-value)))
1473 (if* entity-stream then
1474 (let ((entity-buf (get-tokenbuf)))
1475 (setf (tokenbuf-stream entity-buf) entity-stream)
1476 (unicode-check entity-stream tokenbuf)
1478 (iostruct-entity-bufs tokenbuf))
1479 ;; check for possible external textdecl
1482 (if* (dotimes (i (length string) t)
1483 (setf cch (get-next-char tokenbuf))
1488 (schar string count)))
1493 (when (< count 0) (return))
1494 (un-next-char (schar string count))
1496 ;; swallow <?xml token
1504 (when (< count 0) (return))
1505 (un-next-char (schar string count))
1509 (xml-error (concatenate 'string
1510 "Reference to unparsed entity "
1511 (string entity-symbol)))
1514 elseif (or (not (iostruct-seen-any-dtd tokenbuf))
1515 (and (iostruct-seen-any-dtd tokenbuf)
1516 (not (iostruct-seen-external-dtd tokenbuf))
1517 (not (iostruct-seen-parameter-reference tokenbuf))))
1519 (xml-error (concatenate 'string
1520 (string entity-symbol)
1521 " must have entity declaration before being referenced"))
1524 (setq state state-readtag6)
1525 else (dotimes (i 15)
1526 (add-to-coll coll ch)
1527 (setq ch (get-next-char tokenbuf))
1531 (concatenate 'string
1532 "attribute value contains illegal reference name: '&"
1533 (compute-coll-string coll)
1534 "' in attribute value for: " (string attrib-name)))
1538 (if* (eq #\> ch) then (return)
1539 else (clear-coll coll)
1541 (add-to-coll coll ch)
1542 (setq ch (get-next-char tokenbuf))
1546 (concatenate 'string
1547 "expected '>' found '"
1548 (compute-coll-string coll)
1549 "' in element: " (string tag-to-return)))
1552 (#.state-readtag-!-name
1553 (if* (xml-name-char-p ch) ;; starting char already passed more restrictive test
1555 (add-to-coll coll ch)
1557 (when (not (xml-space-p ch))
1558 (xml-error (concatenate 'string
1559 "expecting whitespace following: '<!"
1560 (compute-coll-string coll)
1561 "' ; got: '" (string ch) "'")))
1562 (setq tag-to-return (compute-tag coll))
1564 (setf state state-pre-!-contents)))
1567 (if* (xml-name-char-p ch)
1569 (add-to-coll coll ch)
1571 (when (and (not (xml-space-p ch)) (not (eq #\? ch)))
1572 (xml-error (concatenate 'string
1573 "expecting name following: '<?"
1574 (compute-coll-string coll)
1575 "' ; got: '" (string ch) "'"))
1577 (when (= (collector-next coll) 0)
1578 (xml-error "null <? token"))
1579 (if* (and (= (collector-next coll) 3)
1580 (eq (elt (collector-data coll) 0) #\x)
1581 (eq (elt (collector-data coll) 1) #\m)
1582 (eq (elt (collector-data coll) 2) #\l)
1585 (when (eq #\? ch) (xml-error "null <?xml token"))
1586 (setq tag-to-return :xml)
1587 (setf state state-findattributename)
1588 elseif (and (= (collector-next coll) 3)
1589 (or (eq (elt (collector-data coll) 0) #\x)
1590 (eq (elt (collector-data coll) 0) #\X))
1591 (or (eq (elt (collector-data coll) 1) #\m)
1592 (eq (elt (collector-data coll) 1) #\M))
1593 (or (eq (elt (collector-data coll) 2) #\l)
1594 (eq (elt (collector-data coll) 2) #\L))
1596 (xml-error "<?xml tag must be all lower case")
1598 (setq tag-to-return (compute-tag coll))
1599 (when (eq #\? ch) (un-next-char ch))
1600 (setf state state-prereadpi))
1603 (#.state-pre-!-contents
1604 (if* (xml-space-p ch)
1606 elseif (not (xml-char-p ch))
1607 then (xml-error (concatenate 'string ;; no test for this...
1608 "illegal character '"
1610 " following <!" (string tag-to-return)))
1613 else (un-next-char ch)
1614 (setf state state-!-contents)))
1618 (let ((val (parse-dtd tokenbuf nil external-callback)))
1619 (setf (iostruct-seen-any-dtd tokenbuf) t)
1620 (push (append (list :[) val)
1621 contents-to-return))
1622 (setf state state-!-doctype-ext3))
1625 (if* (xml-name-char-p ch)
1626 then (add-to-coll coll ch)
1628 then (push (compute-coll-string coll) contents-to-return)
1632 then (push (compute-tag coll) contents-to-return)
1634 (setf state state-begin-dtd)
1635 elseif (and (xml-space-p ch) (eq tag-to-return :DOCTYPE))
1636 ;; look at tag-to-return and set state accordingly
1637 then (push (compute-tag coll) contents-to-return)
1639 (setf state state-!-doctype)
1641 (concatenate 'string
1643 (string tag-to-return)
1647 (#.state-!-doctype-ext
1648 (if* (xml-name-char-p ch) ;; starting char already passed more restrictive test
1650 (add-to-coll coll ch)
1652 (when (not (xml-space-p ch))
1654 (add-to-coll coll ch)
1655 (setq ch (get-next-char tokenbuf))
1659 (concatenate 'string
1660 "illegal character in '"
1661 (compute-coll-string coll)
1662 "' in <! tag: " (string tag-to-return) " "
1663 (string (first contents-to-return))
1666 (let ((token (compute-tag coll)))
1667 (push token contents-to-return)
1669 (if* (eq :SYSTEM token) then (setf state state-!-doctype-system)
1670 elseif (eq :PUBLIC token) then (setf state state-!-doctype-public)
1672 (concatenate 'string
1673 "expected 'SYSTEM' or 'PUBLIC' got '"
1674 (string (first contents-to-return))
1675 "' in <! tag: " (string tag-to-return) " "
1676 (string (second contents-to-return))))
1680 (#.state-!-doctype-public
1681 (if* (xml-space-p ch) then nil
1682 elseif (eq #\" ch) then (setf state state-!-doctype-public2)
1683 elseif (eq #\' ch) then (setf state state-!-doctype-public3)
1685 (concatenate 'string
1686 "expected quote or double-quote got: '"
1688 "' in <! tag: " (string tag-to-return) " "
1689 (string (second contents-to-return)) " "
1690 (string (first contents-to-return))
1694 (#.state-!-doctype-system
1695 (if* (xml-space-p ch) then nil
1696 elseif (eq #\" ch) then (setf state state-!-doctype-system2)
1697 elseif (eq #\' ch) then (setf state state-!-doctype-system3)
1699 (concatenate 'string
1700 "expected quote or double-quote got: '"
1702 "' in <! tag: " (string tag-to-return) " "
1703 (string (second contents-to-return)) " "
1704 (string (first contents-to-return))
1708 (#.state-!-doctype-public2
1709 (if* (eq #\" ch) then (push (compute-coll-string coll)
1712 (setf state state-!-doctype-system)
1713 elseif (pub-id-char-p ch) then (add-to-coll coll ch)
1714 else (dotimes (i 15)
1715 (add-to-coll coll ch)
1716 (setq ch (get-next-char tokenbuf))
1720 (concatenate 'string
1721 "illegal character in DOCTYPE PUBLIC string: '"
1722 (compute-coll-string coll) "'"))
1725 (#.state-!-doctype-public3
1726 (if* (eq #\' ch) then (push (compute-coll-string coll)
1729 (setf state state-!-doctype-system)
1730 elseif (pub-id-char-p ch) then (add-to-coll coll ch)
1731 else (dotimes (i 15)
1732 (add-to-coll coll ch)
1733 (setq ch (get-next-char tokenbuf))
1737 (concatenate 'string
1738 "illegal character in DOCTYPE PUBLIC string: '"
1739 (compute-coll-string coll) "'"))
1742 (#.state-!-doctype-system2
1743 (when (not (xml-char-p ch))
1744 (xml-error "XML is not well formed")) ;; not tested
1745 (if* (eq #\" ch) then (push (compute-coll-string coll)
1748 (setf state state-!-doctype-ext2)
1749 else (add-to-coll coll ch)))
1751 (#.state-!-doctype-system3
1752 (when (not (xml-char-p ch))
1753 (xml-error "XML is not well formed")) ;; not tested
1754 (if* (eq #\' ch) then (push (compute-coll-string coll)
1757 (setf state state-!-doctype-ext2)
1758 else (add-to-coll coll ch)))
1760 (#.state-!-doctype-ext2
1761 (if* (xml-space-p ch) then nil
1762 elseif (eq #\> ch) then (return)
1764 then (setf state state-begin-dtd)
1767 (add-to-coll coll ch)
1768 (setq ch (get-next-char tokenbuf))
1772 (concatenate 'string
1773 "illegal char in DOCTYPE token: '"
1774 (compute-coll-string coll) "'"))
1777 (#.state-!-doctype-ext3
1778 (if* (xml-space-p ch) then nil
1779 elseif (eq #\> ch) then (return)
1782 (add-to-coll coll ch)
1783 (setq ch (get-next-char tokenbuf))
1787 (concatenate 'string
1788 "illegal char in DOCTYPE token following dtd: '"
1789 (compute-coll-string coll) "'"))
1793 ;; skip whitespace; possible exits: >, SYSTEM, PUBLIC, [
1794 (if* (xml-space-p ch) then nil
1795 elseif (xml-name-start-char-p ch)
1797 (setf state state-!-doctype-ext)
1799 elseif (eq #\> ch) then (return)
1801 then (setf state state-begin-dtd)
1803 (concatenate 'string
1804 "illegal character: '"
1806 "' in <! tag: " (string tag-to-return) " "
1807 (string (first contents-to-return))))
1811 (if* (xml-space-p ch)
1813 elseif (not (xml-char-p ch))
1814 then (xml-error "XML is not well formed") ;; no test
1815 else (un-next-char ch)
1816 (setf state state-readpi)))
1820 then (setf state state-readpi2)
1821 elseif (not (xml-char-p ch))
1822 then (xml-error "XML is not well formed") ;; no test
1823 else (add-to-coll coll ch)))
1828 elseif (eq #\? ch) then
1829 (add-to-coll coll #\?) ;; come back here to try again
1830 else (setf state state-readpi)
1831 (add-to-coll coll #\?)
1832 (add-to-coll coll ch)))
1834 (#.state-findattributename0
1835 (if* (xml-space-p ch) then (setf state state-findattributename)
1836 elseif (eq ch empty-delim)
1837 then (setf state state-noattributename)
1840 (add-to-coll coll ch)
1841 (setq ch (get-next-char tokenbuf))
1845 (concatenate 'string
1846 "expected space or tag end before: '"
1847 (compute-coll-string coll) "'"))))
1848 (#.state-findattributename
1849 ;; search until we find the start of an attribute name
1850 ;; or the end of the tag
1851 (if* (eq ch empty-delim)
1852 then (setf state state-noattributename)
1853 elseif (xml-space-p ch)
1854 then nil ;; skip whitespace
1855 elseif (xml-name-start-char-p ch)
1858 (setf state state-attribname)
1861 (add-to-coll coll ch)
1862 (setq ch (get-next-char tokenbuf))
1866 (concatenate 'string
1867 "illegal char in <?xml token: '"
1868 (compute-coll-string coll) "'"))
1872 ;; collect attribute name
1873 (if* (xml-name-char-p ch) ;; starting char already passed more restrictive test
1875 (add-to-coll coll ch)
1876 elseif (xml-space-p ch) then
1877 (setq attrib-name (compute-tag coll))
1879 (setq state state-attribname2)
1881 (when (not (eq #\= ch))
1883 (add-to-coll coll ch)
1884 (setq ch (get-next-char tokenbuf))
1888 (concatenate 'string
1889 "illegal char in <?xml attribute token: '"
1890 (compute-coll-string coll) "'"))
1892 (setq attrib-name (compute-tag coll))
1894 (setq state state-attribstartvalue)))
1896 (#.state-attribname2
1897 (if* (eq #\= ch) then (setq state state-attribstartvalue)
1898 elseif (xml-space-p ch) then nil
1902 (add-to-coll coll ch)
1903 (setq ch (get-next-char tokenbuf))
1907 (concatenate 'string
1908 "illegal char in <?xml attribute token: '"
1909 (compute-coll-string coll) "'"))))
1910 (#.state-attribstartvalue
1911 ;; begin to collect value
1912 (if* (or (eq ch #\")
1914 then (setq value-delim ch)
1915 (setq state state-attribvaluedelim)
1916 elseif (xml-space-p ch) then nil
1919 (add-to-coll coll ch)
1920 (setq ch (get-next-char tokenbuf))
1924 (concatenate 'string
1925 "expected ' or \" before <?xml attribute token value: '"
1926 (compute-coll-string coll) "'"))
1929 (#.state-attribvaluedelim
1930 (if* (eq ch value-delim)
1931 then (setq attrib-value (compute-coll-string coll))
1933 (push attrib-name attribs-to-return)
1934 (push attrib-value attribs-to-return)
1935 (setq state state-findattributename0)
1936 elseif (and (xml-char-p ch) (not (eq #\< ch)))
1937 then (add-to-coll coll ch)
1940 (add-to-coll coll ch)
1941 (setq ch (get-next-char tokenbuf))
1945 (concatenate 'string
1946 "illegal character in attribute token value: '"
1947 (compute-coll-string coll) "'"))
1950 (#.state-noattributename
1953 (return) ;; ready to build return token
1956 (concatenate 'string
1957 "expected '>' found: '" (string ch) "' in <?xml token"))
1961 (error "need to support state:~s" state))
1963 (put-back-collector entity)
1965 (#.state-noattributename ;; it's a bug if this state occurs with a non-empty element
1966 (put-back-collector coll)
1967 (if* attribs-to-return
1968 then (values (cons tag-to-return
1969 (nreverse attribs-to-return))
1970 (if (eq tag-to-return :xml) :xml :start-tag) :end-tag)
1972 (values tag-to-return :start-tag :end-tag)
1974 (#.state-readtag-end-bracket
1975 ;; this is a :commant tag
1976 (let ((ret (compute-coll-string coll)))
1977 (put-back-collector coll)
1978 (values (cons tag-to-return (list ret)) :comment :nil)))
1980 (let ((next-char (collector-next coll)))
1981 (put-back-collector coll)
1982 (if* (zerop next-char)
1983 then (values nil :eof nil)
1984 else (values (compute-coll-string coll) :pcdata pcdatap))))
1986 (let ((ret (compute-coll-string coll)))
1987 (put-back-collector coll)
1988 (values (append (list :pi tag-to-return) (list ret)) :pi nil)))
1989 ((#.state-readtag-!-conditional)
1990 (put-back-collector coll)
1991 (values (append (list tag-to-return) contents-to-return) :start-tag
1993 ((#.state-!-contents
1995 #.state-!-doctype-ext2
1996 #.state-!-doctype-ext3)
1997 (put-back-collector coll)
1998 (values (append (list tag-to-return) (nreverse contents-to-return)) :start-tag
2001 (put-back-collector coll)
2002 (values (if* attribs-to-return
2003 then (cons tag-to-return
2004 (nreverse attribs-to-return))
2005 else tag-to-return) :start-tag :end-tag))
2008 (put-back-collector coll)
2009 (values (if* attribs-to-return
2010 then (cons tag-to-return
2011 (nreverse attribs-to-return))
2012 else tag-to-return) :start-tag nil))
2013 ((#.state-readtag-end2
2014 #.state-readtag-end3)
2015 (put-back-collector coll)
2016 (values tag-to-return :end-tag nil))
2017 (#.state-readtag-!-conditional7
2018 (let ((ret (compute-coll-string coll)))
2019 (put-back-collector coll)
2020 (values (append (list :cdata) (list ret)) :cdata nil)))
2022 ;; if ch is null that means we encountered unexpected EOF
2024 (put-back-collector coll)
2025 (xml-error "unexpected end of input"))
2026 (print (list tag-to-return attribs-to-return))
2027 (let ((ret (compute-coll-string coll)))
2028 (put-back-collector coll)
2029 (error "need to support state <post>:~s ~s ~s ~s" state
2035 (defun swallow-xml-token (tokenbuf external-callback)
2036 (declare (ignorable old-coll) (optimize (speed 3) (safety 1)))
2037 (let ((xml (next-token tokenbuf external-callback nil)))
2038 (if* (and (eq (fourth xml) :standalone) (stringp (fifth xml))
2039 (equal (fifth xml) "yes")) then
2040 (xml-error "external XML entity cannot be standalone document")
2041 elseif (and (eq (sixth xml) :standalone) (stringp (seventh xml))
2042 (equal (seventh xml) "yes")) then
2043 (xml-error "external XML entity cannot be standalone document"))))
2045 ;; return the string with entity references replaced by text
2046 ;; normalizing will happen later
2047 ;; we're ok on different types - just ignore IMPLIED & REQUIRED; and possibly skip FIXED
2048 (defun parse-default-value (value-list tokenbuf external-callback)
2049 (declare (optimize (speed 3) (safety 1)))
2051 (if* (stringp (first value-list)) then (setf value-string (first value-list))
2052 elseif (eq (first value-list) :FIXED) then (setf value-string (second value-list)))
2053 (let ((tmp-result (parse-xml
2054 (concatenate 'string
2058 :external-callback external-callback
2060 (iostruct-general-entities tokenbuf))))
2061 (if* (stringp (first value-list)) then
2062 (setf (first value-list)
2063 (third (first (first tmp-result))))
2064 elseif (eq (first value-list) :FIXED) then
2065 (setf (second value-list)
2066 (third (first (first tmp-result)))))))
2069 (defun process-attlist (args attlist-data)
2070 (declare (optimize (speed 3) (safety 1)))
2071 (dolist (arg1 args attlist-data)
2072 ;;(format t "arg1: ~s~%" arg1)
2073 (dolist (item (rest arg1))
2074 ;;(format t "item: ~s~%" item)
2075 (when (eq :ATTLIST (first item))
2076 (let* ((name (second item))
2077 (name-data (assoc name attlist-data))
2078 (new-name-data (rest name-data)))
2079 ;;(format t "name: ~s name-data: ~s new-name-data: ~s~%" name name-data new-name-data)
2080 (dolist (attrib-data (rest (rest item)))
2081 ;;(format t "attrib-data: ~s~%" attrib-data)
2083 (setf (rest (rest attrib-data))
2084 (parse-default-value (rest (rest attrib-data)) tokenbuf external-callback))
2085 (when (not (assoc (first attrib-data) new-name-data))
2086 (setf new-name-data (acons (first attrib-data) (rest attrib-data) new-name-data))))
2088 (rplacd (assoc name attlist-data) (nreverse new-name-data))
2089 else (setf attlist-data (acons name (nreverse new-name-data) attlist-data))))))))