1 ;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA
3 ;; This code is free software; you can redistribute it and/or
4 ;; modify it under the terms of the version 2.1 of
5 ;; the GNU Lesser General Public License as published by
6 ;; the Free Software Foundation, as clarified by the AllegroServe
7 ;; prequel found in license-allegroserve.txt.
9 ;; This code is distributed in the hope that it will be useful,
10 ;; but without any warranty; without even the implied warranty of
11 ;; merchantability or fitness for a particular purpose. See the GNU
12 ;; Lesser General Public License for more details.
14 ;; Version 2.1 of the GNU Lesser General Public License is in the file
15 ;; license-lgpl.txt that was distributed with this file.
16 ;; If it is not present, you can access it from
17 ;; http://www.gnu.org/copyleft/lesser.txt (until superseded by a newer
18 ;; version) or write to the Free Software Foundation, Inc., 59 Temple Place,
19 ;; Suite 330, Boston, MA 02111-1307 USA
22 ;; $Id: phtml.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $
24 ;; phtml.cl - parse html
28 ;; 02/05/01 symbols mapped to preferred case at runtime (as opposed to
29 ;; a compile time macro determining the case mapping)
31 ;; 10/27/00 :callbacks arg now processed correctly for tags with no body
33 ;; 10/14/00 add first-pass member to tokenbuf structure; used to remove
34 ;; multiple un-next-char calls in raw mode
35 ;; removed :script from *in-line* (incorect and led to infinite loop)
36 ;; char format reopen not done in :script and :style
37 ;; fixed :table/:th tag-auto-close-stop typo
40 ; do character entity stuff
43 (defpackage net.html.parser
44 (:use :lisp :clos :excl)
49 (in-package :net.html.parser)
51 (defmacro tag-auto-close (tag) `(get ,tag 'tag-auto-close))
52 (defmacro tag-auto-close-stop (tag) `(get ,tag 'tag-auto-close-stop))
53 (defmacro tag-no-end (tag) `(get ,tag 'tag-no-end))
55 ; only subelements allowed in this element, no strings
56 (defmacro tag-no-pcdata (tag) `(get ,tag 'tag-no-pcdata))
58 ;; given :foo or (:foo ...) return :foo
59 (defmacro tag-name (expr)
69 (eval-when (compile load eval)
70 (defconstant state-pcdata 0) ; scanning for chars or a tag
71 (defconstant state-readtagfirst 1)
72 (defconstant state-readtag 2)
73 (defconstant state-findattribname 3)
74 (defconstant state-attribname 4)
75 (defconstant state-attribstartvalue 5)
76 (defconstant state-attribvaluedelim 6)
77 (defconstant state-attribvaluenodelim 7)
78 (defconstant state-readcomment 8)
79 (defconstant state-readcomment-one 9)
80 (defconstant state-readcomment-two 10)
81 (defconstant state-findvalue 11)
82 (defconstant state-rawdata 12)
87 next ; next index to set
88 max ; 1+max index to set
92 ;; keep a cache of collectors on this list
94 (defparameter *collectors* (list nil nil nil nil))
96 (defun get-collector ()
97 (declare (optimize (speed 3) (safety 1)))
99 (mp::without-scheduling
100 (do* ((cols *collectors* (cdr cols))
101 (this (car cols) (car cols)))
104 then (setf (car cols) nil)
108 then (setf (collector-next col) 0)
113 :data (make-string 100)))))
115 (defun put-back-collector (col)
116 (declare (optimize (speed 3) (safety 1)))
117 (mp::without-scheduling
118 (do ((cols *collectors* (cdr cols)))
122 (if* (null (car cols))
123 then (setf (car cols) col)
128 (defun grow-and-add (coll ch)
129 (declare (optimize (speed 3) (safety 1)))
130 ;; increase the size of the data portion of the collector and then
131 ;; add the given char at the end
132 (let* ((odata (collector-data coll))
133 (ndata (make-string (* 2 (length odata)))))
134 (dotimes (i (length odata))
135 (setf (schar ndata i) (schar odata i)))
136 (setf (collector-data coll) ndata)
137 (setf (collector-max coll) (length ndata))
138 (let ((next (collector-next coll)))
139 (setf (schar ndata next) ch)
140 (setf (collector-next coll) (1+ next)))))
148 ;; character characteristics
149 (defconstant char-tagcharacter 1) ; valid char for a tag
150 (defconstant char-attribnamechar 2) ; valid char for an attribute name
151 (defconstant char-attribundelimattribvalue 4) ; valid for undelimited value
152 (defconstant char-spacechar 8)
154 (defparameter *characteristics*
155 ;; array of bits describing character characteristics
156 (let ((arr (make-array 128 :initial-element 0)))
157 (declare (optimize (speed 3) (safety 1)))
158 (macrolet ((with-range ((var from to) &rest body)
159 `(do ((,var (char-code ,from) (1+ ,var))
160 (mmax (char-code ,to)))
164 (addit (index charistic)
165 `(setf (svref arr ,index)
166 (logior (svref arr ,index)
170 (with-range (i #\A #\Z)
171 (addit i (+ char-tagcharacter
173 char-attribundelimattribvalue)))
175 (with-range (i #\a #\z)
176 (addit i (+ char-tagcharacter
178 char-attribundelimattribvalue)))
180 (with-range (i #\0 #\9)
181 (addit i (+ char-tagcharacter
183 char-attribundelimattribvalue)))
185 ;; let colon be legal tag character
186 (addit (char-code #\:) (+ char-attribnamechar
189 ;; NY times special tags have _
190 (addit (char-code #\_) (+ char-attribnamechar
193 ; now the unusual cases
194 (addit (char-code #\-) (+ char-attribnamechar
195 char-attribundelimattribvalue))
196 (addit (char-code #\.) (+ char-attribnamechar
197 char-attribundelimattribvalue))
199 ;; adding all typeable chars except for whitespace and >
200 (addit (char-code #\:) char-attribundelimattribvalue)
201 (addit (char-code #\@) char-attribundelimattribvalue)
202 (addit (char-code #\/) char-attribundelimattribvalue)
203 (addit (char-code #\!) char-attribundelimattribvalue)
204 (addit (char-code #\#) char-attribundelimattribvalue)
205 (addit (char-code #\$) char-attribundelimattribvalue)
206 (addit (char-code #\%) char-attribundelimattribvalue)
207 (addit (char-code #\^) char-attribundelimattribvalue)
208 (addit (char-code #\&) char-attribundelimattribvalue)
209 (addit (char-code #\() char-attribundelimattribvalue)
210 (addit (char-code #\)) char-attribundelimattribvalue)
211 (addit (char-code #\_) char-attribundelimattribvalue)
212 (addit (char-code #\=) char-attribundelimattribvalue)
213 (addit (char-code #\+) char-attribundelimattribvalue)
214 (addit (char-code #\\) char-attribundelimattribvalue)
215 (addit (char-code #\|) char-attribundelimattribvalue)
216 (addit (char-code #\{) char-attribundelimattribvalue)
217 (addit (char-code #\}) char-attribundelimattribvalue)
218 (addit (char-code #\[) char-attribundelimattribvalue)
219 (addit (char-code #\]) char-attribundelimattribvalue)
220 (addit (char-code #\;) char-attribundelimattribvalue)
221 (addit (char-code #\') char-attribundelimattribvalue)
222 (addit (char-code #\") char-attribundelimattribvalue)
223 (addit (char-code #\,) char-attribundelimattribvalue)
224 (addit (char-code #\<) char-attribundelimattribvalue)
225 (addit (char-code #\?) char-attribundelimattribvalue)
227 ; i'm not sure what can be in a tag name but we know that
228 ; ! and - must be there since it's used in comments
230 (addit (char-code #\-) char-tagcharacter)
231 (addit (char-code #\!) char-tagcharacter)
234 (addit (char-code #\space) char-spacechar)
235 (addit (char-code #\tab) char-spacechar)
236 (addit (char-code #\return) char-spacechar)
237 (addit (char-code #\linefeed) char-spacechar)
246 (defun char-characteristic (char bit)
247 (declare (optimize (speed 3) (safety 1)))
248 ;; return true if the given char has the given bit set in
249 ;; the characteristic array
250 (let ((code (char-code char)))
253 (not (zerop (logand (svref *characteristics* code) bit))))))
257 cur ;; next index to use to grab from tokenbuf
258 max ;; index one beyond last character
259 data ;; character array
260 first-pass ;; previously parsed tokens
263 ;; cache of tokenbuf structs
264 (defparameter *tokenbufs* (list nil nil nil nil))
266 (defun get-tokenbuf ()
267 (declare (optimize (speed 3) (safety 1)))
269 (mp::without-scheduling
270 (do* ((bufs *tokenbufs* (cdr bufs))
271 (this (car bufs) (car bufs)))
274 then (setf (car bufs) nil)
278 then (setf (tokenbuf-cur buf) 0)
279 (setf (tokenbuf-max buf) 0)
284 :data (make-array 1024 :element-type 'character)))))
286 (defun put-back-tokenbuf (buf)
287 (declare (optimize (speed 3) (safety 1)))
288 (mp::without-scheduling
289 (do ((bufs *tokenbufs* (cdr bufs)))
293 (if* (null (car bufs))
294 then (setf (car bufs) buf)
297 (defun to-preferred-case (ch)
298 (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
299 then (char-upcase ch)
300 else (char-downcase ch)))
303 (defun next-token (stream ignore-strings raw-mode-delimiter
304 read-sequence-func tokenbuf)
305 (declare (optimize (speed 3) (safety 1)))
306 ;; return two values:
307 ;; the next token from the stream.
308 ;; the kind of token (:pcdata, :start-tag, :end-tag, :eof)
310 ;; if read-sequence-func is non-nil,
311 ;; read-sequence-func is called to fetch the next character
312 (macrolet ((next-char (stream)
313 `(let ((cur (tokenbuf-cur tokenbuf))
314 (tb (tokenbuf-data tokenbuf)))
315 (if* (>= cur (tokenbuf-max tokenbuf))
317 (if* (zerop (setf (tokenbuf-max tokenbuf)
318 (if* read-sequence-func
319 then (funcall read-sequence-func tb stream)
320 else (read-sequence tb stream))))
321 then (setq cur nil) ; eof
324 then (prog1 (schar tb cur)
325 (setf (tokenbuf-cur tokenbuf) (1+ cur))))))
328 (un-next-char (stream ch)
329 `(decf (tokenbuf-cur tokenbuf)))
332 `(setf (collector-next coll) 0))
334 (add-to-coll (coll ch)
335 `(let ((.next. (collector-next ,coll)))
336 (if* (>= .next. (collector-max ,coll))
337 then (grow-and-add ,coll ,ch)
338 else (setf (schar (collector-data ,coll) .next.)
340 (setf (collector-next ,coll) (1+ .next.)))))
344 (let ((state (if* raw-mode-delimiter then state-rawdata else state-pcdata))
345 (coll (get-collector))
358 (name-length 0) ;; count only when it could be a comment
366 (setq ch (next-char stream))
367 ;;(format t "ch: ~s state: ~s~%" ch state)
370 then (return) ; eof -- exit loop
376 ; collect everything until we see a <
378 then ; if we've collected nothing then get a tag
379 (if* (> (collector-next coll) 0)
380 then ; have collected something, return this string
381 (un-next-char stream ch) ; push back the <
384 (setq state state-readtagfirst))
385 else ; we will check for & here eventually
386 (if* (not (eq ch #\return))
387 then (add-to-coll coll ch))))
389 (#.state-readtagfirst
390 ; starting to read a tag name
394 else (if* (eq #\! ch) ; possible comment
395 then (setf xml-bailout t)
396 (setq name-length 0))
397 (un-next-char stream ch))
398 (setq state state-readtag))
401 ;; reading the whole tag name
402 (if* (char-characteristic ch char-tagcharacter)
403 then (add-to-coll coll (to-preferred-case ch))
405 (if* (and (eq name-length 3)
406 (coll-has-comment coll))
407 then (clear-coll coll)
408 (setq state state-readcomment))
410 else (setq tag-to-return (compute-tag coll))
413 then (return) ; we're done
414 elseif xml-bailout then
415 (un-next-char stream ch)
417 else (if* (eq tag-to-return :!--)
419 (setq state state-readcomment)
420 else (un-next-char stream ch)
421 (setq state state-findattribname)))))
423 (#.state-findattribname
424 ;; search until we find the start of an attribute name
425 ;; or the end of the tag
427 then ; end of the line
430 then ; value for previous attribute name
431 ; (syntax "foo = bar" is bogus I think but it's
432 ; used some places, here is where we handle this
433 (pop attribs-to-return)
434 (setq attrib-name (pop attribs-to-return))
435 (setq state state-findvalue)
436 elseif (char-characteristic ch char-attribnamechar)
437 then (un-next-char stream ch)
438 (setq state state-attribname)
439 else nil ; ignore other things
443 ;; find the start of the value
444 (if* (char-characteristic ch char-spacechar)
445 thenret ; keep looking
447 then ; no value, set the value to be the
450 (string-downcase (string attrib-name)))
452 (push attrib-name attribs-to-return)
453 (push attrib-value attribs-to-return)
454 (un-next-char stream ch)
455 (setq state state-findattribname)
456 else (un-next-char stream ch)
457 (setq state state-attribstartvalue)))
461 ;; collect attribute name
463 (if* (char-characteristic ch char-attribnamechar)
464 then (add-to-coll coll (to-preferred-case ch))
466 then ; end of attribute name, value is next
467 (setq attrib-name (compute-tag coll))
469 (setq state state-attribstartvalue)
470 else ; end of attribute name with no value,
471 (setq attrib-name (compute-tag coll))
474 (string-downcase (string attrib-name)))
475 (push attrib-name attribs-to-return)
476 (push attrib-value attribs-to-return)
477 (un-next-char stream ch)
478 (setq state state-findattribname)))
480 (#.state-attribstartvalue
481 ;; begin to collect value
484 then (setq value-delim ch)
485 (setq state state-attribvaluedelim)
486 ;; gobble spaces; assume since we've seen a '=' there really is a value
487 elseif (eq #\space ch) then nil
488 else (un-next-char stream ch)
489 (setq state state-attribvaluenodelim)))
491 (#.state-attribvaluedelim
492 (if* (eq ch value-delim)
493 then (setq attrib-value (compute-coll-string coll))
495 (push attrib-name attribs-to-return)
496 (push attrib-value attribs-to-return)
497 (setq state state-findattribname)
498 else (add-to-coll coll ch)))
500 (#.state-attribvaluenodelim
501 ;; an attribute value not delimited by ' or " and thus restricted
502 ;; in the possible characters
503 (if* (char-characteristic ch char-attribundelimattribvalue)
504 then (add-to-coll coll ch)
505 else (un-next-char stream ch)
506 (setq attrib-value (compute-coll-string coll))
508 (push attrib-name attribs-to-return)
509 (push attrib-value attribs-to-return)
510 (setq state state-findattribname)))
513 ;; a comment ends on the first --, but we'll look for -->
514 ;; since that's what most people expect
516 then (setq state state-readcomment-one)
517 else (add-to-coll coll ch)))
519 (#.state-readcomment-one
520 ;; seen one -, looking for ->
523 then (setq state state-readcomment-two)
524 else ; not a comment end, put back the -'s
525 (add-to-coll coll #\-)
526 (add-to-coll coll ch)
527 (setq state state-readcomment)))
529 (#.state-readcomment-two
530 ;; seen two -'s, looking for >
533 then ; end of the line
536 then ; still at two -'s, have to put out first
537 (add-to-coll coll #\-)
538 else ; put out two hypens and back to looking for a hypen
539 (add-to-coll coll #\-)
540 (add-to-coll coll #\-)
541 (setq state state-readcomment)))
544 ;; collect everything until we see the delimiter
545 (if* (eq (to-preferred-case ch) (elt raw-mode-delimiter raw-length))
548 (when (= raw-length (length raw-mode-delimiter))
549 ;; push the end tag back so it can then be lexed
550 ;; but don't do it for xml stuff
551 (when (/= (length raw-mode-delimiter) 1)
552 (push :end-tag (tokenbuf-first-pass tokenbuf))
553 (if* (equal raw-mode-delimiter "</STYLE>")
554 then (push :STYLE (tokenbuf-first-pass tokenbuf))
555 elseif (equal raw-mode-delimiter "</style>")
556 then (push :style (tokenbuf-first-pass tokenbuf))
557 elseif (equal raw-mode-delimiter "</SCRIPT>")
558 then (push :SCRIPT (tokenbuf-first-pass tokenbuf))
559 elseif (equal raw-mode-delimiter "</script>")
560 then (push :script (tokenbuf-first-pass tokenbuf))
561 else (error "unexpected raw-mode-delimiter"))
563 ;; set state to state-pcdata for next section
566 ;; push partial matches into data string
567 (dotimes (i raw-length)
568 (add-to-coll coll (elt raw-mode-delimiter i)))
570 (add-to-coll coll ch)))
576 ;; if we're in certain states then it means we should return a value
579 ((#.state-pcdata #.state-rawdata)
580 ;; return the buffer as a string
581 (if* (zerop (collector-next coll))
582 then (values nil (if (eq state state-pcdata) :eof :pcdata))
584 (if* (null ignore-strings)
585 then (compute-coll-string coll))
586 (put-back-collector coll))
590 (when (null tag-to-return)
591 (error "unexpected end of input encountered"))
592 ;; we've read a tag with no attributes
593 (put-back-collector coll)
594 (values tag-to-return
597 else (if* xml-bailout then :xml else :start-tag))
600 (#.state-findattribname
601 ;; returning a tag with possible attributes
602 (put-back-collector coll)
604 then ; ignore any attributes
605 (values tag-to-return :end-tag)
606 elseif attribs-to-return
607 then (values (cons tag-to-return
608 (nreverse attribs-to-return))
610 else (values tag-to-return :start-tag)))
612 (#.state-readcomment-two
613 ;; returning a comment
614 (values (prog1 (if* (null ignore-strings)
615 then (compute-coll-string coll))
616 (put-back-collector coll))
620 (if* (null ch) then (error "unexpected end of input encountered")
621 else (error "internal error, can't be here in state ~d" state)))))))
624 (defvar *kwd-package* (find-package :keyword))
626 (defun compute-tag (coll)
627 (declare (optimize (speed 3) (safety 1)))
628 ;; compute the symbol named by what's in the collector
629 (excl::intern* (collector-data coll) (collector-next coll) *kwd-package*))
633 (defun compute-coll-string (coll)
634 (declare (optimize (speed 3) (safety 1)))
635 ;; return the string that's in the collection
636 (let ((str (make-string (collector-next coll)))
637 (from (collector-data coll)))
638 (dotimes (i (collector-next coll))
639 (setf (schar str i) (schar from i)))
643 (defun coll-has-comment (coll)
644 (declare (optimize (speed 3) (safety 1)))
645 ;; true if the collector has exactly "!--" in it
646 (and (eq 3 (collector-next coll))
647 (let ((data (collector-data coll)))
648 (and (eq #\! (schar data 0))
649 (eq #\- (schar data 1))
650 (eq #\- (schar data 2))))))
653 ;;;;;;;;;;; quick and dirty parse
655 ; the elements with no body and thus no end tag
656 (dolist (opt '(:area :base :basefont :bgsound :br :button :col
657 ;;:colgroup - no, this is an element with contents
658 :embed :hr :img :frame
659 :input :isindex :keygen :link :meta
660 :plaintext :spacer :wbr))
661 (setf (tag-no-end opt) t))
663 (defvar *in-line* '(:tt :i :b :big :small :em :strong :dfn :code :samp :kbd
664 :var :cite :abbr :acronym :a :img :object :br :map
665 :q :sub :sup :span :bdo :input :select :textarea :label :button :font))
667 (defvar *ch-format* '(:i :b :tt :big :small :strike :s :u
670 (defvar *known-tags* '(:!doctype :a :acronym :address :applet :area :b :base :basefont
671 :bdo :bgsound :big :blink :blockquote :body :br :button :caption
672 :center :cite :code :col :colgroup :comment :dd :del :dfn :dir
673 :div :dl :dt :em :embed :fieldset :font :form :frame :frameset
674 :h1 :h2 :h3 :h4 :h5 :h6 :head :hr :html :i :iframe :img :input
675 :ins :isindex :kbd :label :layer :legend :li :link :listing :map
676 :marquee :menu :meta :multicol :nobr :noframes :noscript :object
677 :ol :option :p :param :plaintext :pre :q :samp :script :select
678 :small :spacer :span :s :strike :strong :style :sub :sup :table
679 :tbody :td :textarea :tfoot :th :thead :title :tr :tt :u :ul :var
682 ; the elements whose start tag can end a previous tag
684 (setf (tag-auto-close :tr) '(:tr :td :th :colgroup))
685 (setf (tag-auto-close-stop :tr) '(:table))
687 (setf (tag-auto-close :td) '(:td :th))
688 (setf (tag-auto-close-stop :td) '(:table))
690 (setf (tag-auto-close :th) '(:td :th))
691 (setf (tag-auto-close-stop :th) '(:table))
693 (setf (tag-auto-close :dt) '(:dt :dd))
694 (setf (tag-auto-close-stop :dt) '(:dl))
696 (setf (tag-auto-close :li) '(:li))
697 (setf (tag-auto-close-stop :li) '(:ul :ol))
699 ;; new stuff to close off tags with optional close tags
700 (setf (tag-auto-close :address) '(:head :p))
701 (setf (tag-auto-close :blockquote) '(:head :p))
702 (setf (tag-auto-close :body) '(:body :frameset :head))
704 (setf (tag-auto-close :dd) '(:dd :dt))
705 (setf (tag-auto-close-stop :dd) '(:dl))
707 (setf (tag-auto-close :dl) '(:head :p))
708 (setf (tag-auto-close :div) '(:head :p))
709 (setf (tag-auto-close :fieldset) '(:head :p))
710 (setf (tag-auto-close :form) '(:head :p))
711 (setf (tag-auto-close :frameset) '(:body :frameset :head))
712 (setf (tag-auto-close :hr) '(:head :p))
713 (setf (tag-auto-close :h1) '(:head :p))
714 (setf (tag-auto-close :h2) '(:head :p))
715 (setf (tag-auto-close :h3) '(:head :p))
716 (setf (tag-auto-close :h4) '(:head :p))
717 (setf (tag-auto-close :h5) '(:head :p))
718 (setf (tag-auto-close :h6) '(:head :p))
719 (setf (tag-auto-close :noscript) '(:head :p))
720 (setf (tag-auto-close :ol) '(:head :p))
722 (setf (tag-auto-close :option) '(:option))
723 (setf (tag-auto-close-stop :option) '(:select))
725 (setf (tag-auto-close :p) '(:head :p))
727 (setf (tag-auto-close :pre) '(:head :p))
728 (setf (tag-auto-close :table) '(:head :p))
730 (setf (tag-auto-close :tbody) '(:colgroup :tfoot :tbody :thead))
731 (setf (tag-auto-close-stop :tbody) '(:table))
733 (setf (tag-auto-close :tfoot) '(:colgroup :tfoot :tbody :thead))
734 (setf (tag-auto-close-stop :tfoot) '(:table))
736 (setf (tag-auto-close :thead) '(:colgroup :tfoot :tbody :thead))
737 (setf (tag-auto-close-stop :thead) '(:table))
739 (setf (tag-auto-close :ul) '(:head :p))
741 (setf (tag-no-pcdata :table) t)
742 (setf (tag-no-pcdata :tr) t)
745 (defmethod parse-html ((p stream) &key callback-only callbacks collect-rogue-tags
747 (declare (optimize (speed 3) (safety 1)))
748 (phtml-internal p nil callback-only callbacks collect-rogue-tags
751 (defmacro tag-callback (tag)
752 `(rest (assoc ,tag callbacks)))
754 (defun phtml-internal (p read-sequence-func callback-only callbacks collect-rogue-tags
757 (declare (optimize (speed 3) (safety 1)))
758 (let ((raw-mode-delimiter nil)
760 (current-tag :start-parse)
761 (last-tag :start-parse)
762 (current-callback-tags nil)
763 (pending-ch-format nil)
764 (closed-pending-ch-format nil)
766 (tokenbuf (get-tokenbuf))
770 (labels ((close-off-tags (name stop-at collect-rogues)
771 ;; close off an open 'name' tag, but search no further
772 ;; than a 'stop-at' tag.
773 (if* (member (tag-name current-tag) name :test #'eq)
774 then ;; close current tag(s)
776 (when (and collect-rogues
777 (not (member (tag-name current-tag)
779 (push (tag-name current-tag) rogue-tags))
781 (when (or (member (tag-name current-tag)
784 (tag-name current-tag) name :test #'eq)))
786 elseif (member (tag-name current-tag) stop-at :test #'eq)
788 else ; search if there is a tag to close
789 (dolist (ent pending)
790 (if* (member (tag-name (car ent)) name :test #'eq)
791 then ; found one to close
793 (when (and collect-rogues
794 (not (member (tag-name current-tag)
796 (push (tag-name current-tag) rogue-tags))
798 (if* (member (tag-name current-tag) name
800 then (close-current-tag)
803 elseif (member (tag-name (car ent)) stop-at
805 then (return) ;; do nothing
808 (close-current-tag ()
809 ;; close off the current tag and open the pending tag
810 (when (member (tag-name current-tag) *ch-format* :test #'eq)
811 (push current-tag closed-pending-ch-format)
814 (if* (tag-no-pcdata (tag-name current-tag))
815 then (setq element `(,current-tag
816 ,@(strip-rev-pcdata guts)))
817 else (setq element `(,current-tag ,@(nreverse guts))))
818 (let ((callback (tag-callback (tag-name current-tag))))
820 (setf current-callback-tags (rest current-callback-tags))
821 (funcall callback element)))
822 (let* ((prev (pop pending)))
823 (setq current-tag (car prev)
825 (push element guts))))
828 ;; push the current tag state since we're starting
830 (push (cons current-tag guts) pending))
833 (strip-rev-pcdata (stuff)
834 ;; reverse the list stuff, omitting all the strings
837 (if* (not (stringp st)) then (push st res)))
839 (check-in-line (check-tag)
842 (length (length (tokenbuf-first-pass tokenbuf))))
844 (if* (< i length) then
845 (setf val (nth i (tokenbuf-first-pass tokenbuf)))
846 (setf kind (nth (+ i 1) (tokenbuf-first-pass tokenbuf)))
848 (if* (= i length) then (setf (tokenbuf-first-pass tokenbuf)
849 (nreverse (tokenbuf-first-pass tokenbuf))))
851 (multiple-value-setq (val kind)
853 (push val (tokenbuf-first-pass tokenbuf))
854 (push kind (tokenbuf-first-pass tokenbuf))
857 (if* (= i length) then
858 (setf (tokenbuf-first-pass tokenbuf)
859 (nreverse (tokenbuf-first-pass tokenbuf))))
861 (when (and (eq val check-tag) (eq kind :end-tag))
862 (if* (= i length) then
863 (setf (tokenbuf-first-pass tokenbuf)
864 (nreverse (tokenbuf-first-pass tokenbuf))))
866 (when (member val *ch-format* :test #'eq)
867 (if* (eq kind :start-tag) then (push val new-opens)
868 elseif (member val new-opens :test #'eq) then
869 (setf new-opens (remove val new-opens :count 1))
870 else (close-off-tags (list val) nil nil)
873 (get-next-token (force)
874 (if* (or force (null (tokenbuf-first-pass tokenbuf))) then
875 (multiple-value-bind (val kind)
876 (next-token p nil raw-mode-delimiter read-sequence-func
880 (let ((val (first (tokenbuf-first-pass tokenbuf)))
881 (kind (second (tokenbuf-first-pass tokenbuf))))
882 (setf (tokenbuf-first-pass tokenbuf)
883 (rest (rest (tokenbuf-first-pass tokenbuf))))
887 (multiple-value-bind (val kind)
889 ;;(format t "val: ~s kind: ~s~%" val kind)
892 (when (or (and callback-only current-callback-tags)
894 (if* (member last-tag *in-line*)
898 (when (dotimes (i (length val) nil)
899 (when (not (char-characteristic (elt val i)
903 (when (and (= (length raw-mode-delimiter) 1) ;; xml tag...
904 (or (and callback-only current-callback-tags)
905 (not callback-only)))
906 (close-off-tags (list last-tag) nil nil))
907 (setf raw-mode-delimiter nil)
912 (setf raw-mode-delimiter ">")
913 (let* ((name (tag-name val)))
914 (when (and callback-only (tag-callback name))
915 (push name current-callback-tags))
917 (setq current-tag val)
923 (if* (or (eq last-tag :style)
924 (and (listp last-tag) (eq (first last-tag) :style)))
926 (setf raw-mode-delimiter
927 (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
930 elseif (or (eq last-tag :script)
931 (and (listp last-tag) (eq (first last-tag) :script)))
933 (setf raw-mode-delimiter
934 (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
937 ; maybe this is an end tag too
938 (let* ((name (tag-name val))
939 (auto-close (tag-auto-close name))
940 (auto-close-stop nil)
941 (no-end (or (tag-no-end name) (member name no-body-tags))))
942 (when (and callback-only (tag-callback name))
943 (push name current-callback-tags))
944 (when (or (and callback-only current-callback-tags)
947 then (setq auto-close-stop (tag-auto-close-stop name))
948 (close-off-tags auto-close auto-close-stop nil))
949 (when (and pending-ch-format (not no-end))
950 (if* (member name *ch-format* :test #'eq) then nil
951 elseif (member name *in-line* :test #'eq) then
952 ;; close off only tags that are within *in-line* block
954 else ;; close ALL pending char tags and then reopen
955 (dolist (this-tag (reverse pending-ch-format))
956 (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil))
959 then ; this is a singleton tag
960 (let ((callback (tag-callback (tag-name (if* (atom val)
962 else (first val))))))
964 (funcall callback (if* (atom val)
967 (push (if* (atom val)
972 (setq current-tag val)
974 (if* (member name *ch-format* :test #'eq)
975 then (push val pending-ch-format)
977 (or (eq last-tag :style)
978 (and (listp last-tag) (eq (first last-tag) :style))
979 (eq last-tag :script)
980 (and (listp last-tag) (eq (first last-tag) :script))))
981 (dolist (tmp (reverse closed-pending-ch-format))
983 (setf current-tag tmp)
987 (or (eq last-tag :style)
988 (and (listp last-tag) (eq (first last-tag) :style))
989 (eq last-tag :script)
990 (and (listp last-tag) (eq (first last-tag) :script))))
991 (setf closed-pending-ch-format nil))
995 (setf raw-mode-delimiter nil)
996 (when (or (and callback-only current-callback-tags)
998 (close-off-tags (list val) nil nil)
999 (when (member val *ch-format* :test #'eq)
1000 (setf pending-ch-format
1001 (remove val pending-ch-format :count 1
1002 :test #'(lambda (x y) (eq x (if (listp y) (first y) y)))))
1003 (setf closed-pending-ch-format
1004 (remove val closed-pending-ch-format :count 1
1005 :test #'(lambda (x y) (eq x (if (listp y) (first y) y)))))
1007 (dolist (tmp (reverse closed-pending-ch-format))
1009 (setf current-tag tmp)
1011 (setf closed-pending-ch-format nil)
1015 (setf raw-mode-delimiter nil)
1016 (when (or (and callback-only current-callback-tags)
1017 (not callback-only))
1018 (push `(:comment ,val) guts)))
1021 (setf raw-mode-delimiter nil)
1022 ;; close off all tags
1023 (when (or (and callback-only current-callback-tags)
1024 (not callback-only))
1025 (close-off-tags '(:start-parse) nil collect-rogue-tags))
1026 (put-back-tokenbuf tokenbuf)
1027 (if collect-rogue-tags
1028 (return (values (cdar guts) rogue-tags))
1029 (return (cdar guts))))))))))
1033 (defmethod parse-html (file &key callback-only callbacks collect-rogue-tags
1035 (declare (optimize (speed 3) (safety 1)))
1036 (with-open-file (p file :direction :input)
1037 (parse-html p :callback-only callback-only :callbacks callbacks
1038 :collect-rogue-tags collect-rogue-tags
1039 :no-body-tags no-body-tags)))
1042 (defmethod parse-html ((str string) &key callback-only callbacks collect-rogue-tags
1044 (declare (optimize (speed 3) (safety 1)))
1045 (parse-html (make-string-input-stream str)
1046 :callback-only callback-only :callbacks callbacks
1047 :collect-rogue-tags collect-rogue-tags
1048 :no-body-tags no-body-tags))
1060 ;;;(defun doit (ignore-data)
1061 ;;; (with-open-file (p "readme.htm")
1063 ;;; (multiple-value-bind (val kind) (next-token p ignore-data)
1064 ;;; ;(format t "~s -> ~s~%" kind val)
1066 ;;; (if* (eq kind :eof) then (return))))))
1068 ;;;(defun pdoit (&optional (file "testa.html"))
1069 ;;; (with-open-file (p file)
1070 ;;; (parse-html p)))
1073 ;;;;; requires http client module to work
1074 ;;;(defun getparse (host path)
1075 ;;; (parse-html (httpr-body
1077 ;;; (simple-get host path)))))