2 (sys:defpatch "phtml" 1
3 "parse-html close tag closes consecutive identical open tags."
7 ;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA
9 ;; This code is free software; you can redistribute it and/or
10 ;; modify it under the terms of the version 2.1 of
11 ;; the GNU Lesser General Public License as published by
12 ;; the Free Software Foundation, as clarified by the AllegroServe
13 ;; prequel found in license-allegroserve.txt.
15 ;; This code is distributed in the hope that it will be useful,
16 ;; but without any warranty; without even the implied warranty of
17 ;; merchantability or fitness for a particular purpose. See the GNU
18 ;; Lesser General Public License for more details.
20 ;; Version 2.1 of the GNU Lesser General Public License is in the file
21 ;; license-lgpl.txt that was distributed with this file.
22 ;; If it is not present, you can access it from
23 ;; http://www.gnu.org/copyleft/lesser.txt (until superseded by a newer
24 ;; version) or write to the Free Software Foundation, Inc., 59 Temple Place,
25 ;; Suite 330, Boston, MA 02111-1307 USA
30 ;; phtml.cl - parse html
33 ;; 05/14/02 - add :parse-entities arg to parse-html. If true then
34 ;; entities are converted to the character they represent.
36 ;; 02/05/01 symbols mapped to preferred case at runtime (as opposed to
37 ;; a compile time macro determining the case mapping)
39 ;; 10/27/00 :callbacks arg now processed correctly for tags with no body
41 ;; 10/14/00 add first-pass member to tokenbuf structure; used to remove
42 ;; multiple un-next-char calls in raw mode
43 ;; removed :script from *in-line* (incorect and led to infinite loop)
44 ;; char format reopen not done in :script and :style
45 ;; fixed :table/:th tag-auto-close-stop typo
48 ; do character entity stuff
51 (defpackage net.html.parser
52 (:use :cl #+allegro :clos :excl #+allegro :mp #-allegro :acl-mp)
57 (in-package :net.html.parser)
59 (defmacro tag-auto-close (tag) `(get ,tag 'tag-auto-close))
60 (defmacro tag-auto-close-stop (tag) `(get ,tag 'tag-auto-close-stop))
61 (defmacro tag-no-end (tag) `(get ,tag 'tag-no-end))
63 ; only subelements allowed in this element, no strings
64 (defmacro tag-no-pcdata (tag) `(get ,tag 'tag-no-pcdata))
66 ;; given :foo or (:foo ...) return :foo
67 (defmacro tag-name (expr)
77 (eval-when (compile load eval)
78 (defconstant state-pcdata 0) ; scanning for chars or a tag
79 (defconstant state-readtagfirst 1)
80 (defconstant state-readtag 2)
81 (defconstant state-findattribname 3)
82 (defconstant state-attribname 4)
83 (defconstant state-attribstartvalue 5)
84 (defconstant state-attribvaluedelim 6)
85 (defconstant state-attribvaluenodelim 7)
86 (defconstant state-readcomment 8)
87 (defconstant state-readcomment-one 9)
88 (defconstant state-readcomment-two 10)
89 (defconstant state-findvalue 11)
90 (defconstant state-rawdata 12)
95 next ; next index to set
96 max ; 1+max index to set
100 ;; keep a cache of collectors on this list
102 (defparameter *collectors* (list nil nil nil nil))
104 (defun get-collector ()
105 (declare (optimize (speed 3) (safety 1)))
108 (do* ((cols *collectors* (cdr cols))
109 (this (car cols) (car cols)))
112 then (setf (car cols) nil)
116 then (setf (collector-next col) 0)
121 :data (make-string 100)))))
123 (defun put-back-collector (col)
124 (declare (optimize (speed 3) (safety 1)))
126 (do ((cols *collectors* (cdr cols)))
130 (if* (null (car cols))
131 then (setf (car cols) col)
136 (defun grow-and-add (coll ch)
137 (declare (optimize (speed 3) (safety 1)))
138 ;; increase the size of the data portion of the collector and then
139 ;; add the given char at the end
140 (let* ((odata (collector-data coll))
141 (ndata (make-string (* 2 (length odata)))))
142 (dotimes (i (length odata))
143 (setf (schar ndata i) (schar odata i)))
144 (setf (collector-data coll) ndata)
145 (setf (collector-max coll) (length ndata))
146 (let ((next (collector-next coll)))
147 (setf (schar ndata next) ch)
148 (setf (collector-next coll) (1+ next)))))
156 ;; character characteristics
157 (defconstant char-tagcharacter 1) ; valid char for a tag
158 (defconstant char-attribnamechar 2) ; valid char for an attribute name
159 (defconstant char-attribundelimattribvalue 4) ; valid for undelimited value
160 (defconstant char-spacechar 8)
162 (defparameter *characteristics*
163 ;; array of bits describing character characteristics
164 (let ((arr (make-array 128 :initial-element 0)))
165 (declare (optimize (speed 3) (safety 1)))
166 (macrolet ((with-range ((var from to) &rest body)
167 `(do ((,var (char-code ,from) (1+ ,var))
168 (mmax (char-code ,to)))
172 (addit (index charistic)
173 `(setf (svref arr ,index)
174 (logior (svref arr ,index)
178 (with-range (i #\A #\Z)
179 (addit i (+ char-tagcharacter
181 char-attribundelimattribvalue)))
183 (with-range (i #\a #\z)
184 (addit i (+ char-tagcharacter
186 char-attribundelimattribvalue)))
188 (with-range (i #\0 #\9)
189 (addit i (+ char-tagcharacter
191 char-attribundelimattribvalue)))
193 ;; let colon be legal tag character
194 (addit (char-code #\:) (+ char-attribnamechar
197 ;; NY times special tags have _
198 (addit (char-code #\_) (+ char-attribnamechar
201 ; now the unusual cases
202 (addit (char-code #\-) (+ char-attribnamechar
203 char-attribundelimattribvalue))
204 (addit (char-code #\.) (+ char-attribnamechar
205 char-attribundelimattribvalue))
207 ;; adding all typeable chars except for whitespace and >
208 (addit (char-code #\:) char-attribundelimattribvalue)
209 (addit (char-code #\@) char-attribundelimattribvalue)
210 (addit (char-code #\/) char-attribundelimattribvalue)
211 (addit (char-code #\!) char-attribundelimattribvalue)
212 (addit (char-code #\#) char-attribundelimattribvalue)
213 (addit (char-code #\$) char-attribundelimattribvalue)
214 (addit (char-code #\%) char-attribundelimattribvalue)
215 (addit (char-code #\^) char-attribundelimattribvalue)
216 (addit (char-code #\&) char-attribundelimattribvalue)
217 (addit (char-code #\() char-attribundelimattribvalue)
218 (addit (char-code #\)) char-attribundelimattribvalue)
219 (addit (char-code #\_) char-attribundelimattribvalue)
220 (addit (char-code #\=) char-attribundelimattribvalue)
221 (addit (char-code #\+) char-attribundelimattribvalue)
222 (addit (char-code #\\) char-attribundelimattribvalue)
223 (addit (char-code #\|) char-attribundelimattribvalue)
224 (addit (char-code #\{) char-attribundelimattribvalue)
225 (addit (char-code #\}) char-attribundelimattribvalue)
226 (addit (char-code #\[) char-attribundelimattribvalue)
227 (addit (char-code #\]) char-attribundelimattribvalue)
228 (addit (char-code #\;) char-attribundelimattribvalue)
229 (addit (char-code #\') char-attribundelimattribvalue)
230 (addit (char-code #\") char-attribundelimattribvalue)
231 (addit (char-code #\,) char-attribundelimattribvalue)
232 (addit (char-code #\<) char-attribundelimattribvalue)
233 (addit (char-code #\?) char-attribundelimattribvalue)
235 ; i'm not sure what can be in a tag name but we know that
236 ; ! and - must be there since it's used in comments
238 (addit (char-code #\-) char-tagcharacter)
239 (addit (char-code #\!) char-tagcharacter)
242 (addit (char-code #\space) char-spacechar)
243 (addit (char-code #\tab) char-spacechar)
244 (addit (char-code #\return) char-spacechar)
245 (addit (char-code #\linefeed) char-spacechar)
254 (defun char-characteristic (char bit)
255 (declare (optimize (speed 3) (safety 1)))
256 ;; return true if the given char has the given bit set in
257 ;; the characteristic array
258 (let ((code (char-code char)))
261 (not (zerop (logand (svref *characteristics* code) bit))))))
264 (defvar *html-entity-to-code*
265 (let ((table (make-hash-table :test #'equal)))
266 (dolist (ent '(("nbsp" . 160)
519 (setf (gethash (car ent) table) (cdr ent)))
525 cur ;; next index to use to grab from tokenbuf
526 max ;; index one beyond last character
527 data ;; character array
528 first-pass ;; previously parsed tokens
531 ;; cache of tokenbuf structs
532 (defparameter *tokenbufs* (list nil nil nil nil))
534 (defun get-tokenbuf ()
535 (declare (optimize (speed 3) (safety 1)))
538 (do* ((bufs *tokenbufs* (cdr bufs))
539 (this (car bufs) (car bufs)))
542 then (setf (car bufs) nil)
546 then (setf (tokenbuf-cur buf) 0)
547 (setf (tokenbuf-max buf) 0)
552 :data (make-array 1024 :element-type 'character)))))
554 (defun put-back-tokenbuf (buf)
555 (declare (optimize (speed 3) (safety 1)))
557 (do ((bufs *tokenbufs* (cdr bufs)))
561 (if* (null (car bufs))
562 then (setf (car bufs) buf)
565 (defun to-preferred-case (ch)
566 (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
567 then (char-upcase ch)
568 else (char-downcase ch)))
571 (defun next-token (stream ignore-strings raw-mode-delimiter
572 read-sequence-func tokenbuf parse-entities)
573 (declare (optimize (speed 3) (safety 1)))
574 ;; return two values:
575 ;; the next token from the stream.
576 ;; the kind of token (:pcdata, :start-tag, :end-tag, :eof)
578 ;; if read-sequence-func is non-nil,
579 ;; read-sequence-func is called to fetch the next character
580 (macrolet ((next-char (stream)
581 `(let ((cur (tokenbuf-cur tokenbuf))
582 (tb (tokenbuf-data tokenbuf)))
583 (if* (>= cur (tokenbuf-max tokenbuf))
585 (if* (zerop (setf (tokenbuf-max tokenbuf)
586 (if* read-sequence-func
587 then (funcall read-sequence-func tb stream)
588 else (read-sequence tb stream))))
589 then (setq cur nil) ; eof
592 then (prog1 (schar tb cur)
593 (setf (tokenbuf-cur tokenbuf) (1+ cur))))))
596 (un-next-char (stream ch)
597 `(decf (tokenbuf-cur tokenbuf)))
600 `(setf (collector-next coll) 0))
602 (add-to-coll (coll ch)
603 `(let ((.next. (collector-next ,coll)))
604 (if* (>= .next. (collector-max ,coll))
605 then (grow-and-add ,coll ,ch)
606 else (setf (schar (collector-data ,coll) .next.)
608 (setf (collector-next ,coll) (1+ .next.)))))
612 (let ((state (if* raw-mode-delimiter then state-rawdata else state-pcdata))
613 (coll (get-collector))
626 (name-length 0) ;; count only when it could be a comment
634 (setq ch (next-char stream))
635 ;;(format t "ch: ~s state: ~s~%" ch state)
638 then (return) ; eof -- exit loop
644 ; collect everything until we see a <
646 then ; if we've collected nothing then get a tag
647 (if* (> (collector-next coll) 0)
648 then ; have collected something, return this string
649 (un-next-char stream ch) ; push back the <
652 (setq state state-readtagfirst))
653 elseif (and parse-entities (eq ch #\&))
654 then ; reading an entity. entity ends at semicolon
656 (loop (let ((ch (next-char stream)))
658 then (error "End of file after & entity marker")
661 elseif (zerop (decf max))
662 then (error "No semicolon found after entity starting: &~{~a~}" (nreverse res))
663 else (push ch res))))
664 (setq res (nreverse res))
665 (if* (eq (car res) #\#)
666 then ; decimal entity
668 (dolist (ch (cdr res))
669 (let ((code (char-code ch)))
670 (if* (<= #.(char-code #\0)
677 else (error "non decimal digit after &# - ~s" ch)
679 (add-to-coll coll (code-char count)))
680 else (let ((name (make-array (length res)
681 :element-type 'character
682 :initial-contents res)))
683 (let ((ch (gethash name *html-entity-to-code*)))
685 then (add-to-coll coll (code-char ch))
686 else (error "No such entity as ~s" name))))))
688 else ; we will check for & here eventually
689 (if* (not (eq ch #\return))
690 then (add-to-coll coll ch))))
692 (#.state-readtagfirst
693 ; starting to read a tag name
697 else (if* (eq #\! ch) ; possible comment
698 then (setf xml-bailout t)
699 (setq name-length 0))
700 (un-next-char stream ch))
701 (setq state state-readtag))
704 ;; reading the whole tag name
705 (if* (char-characteristic ch char-tagcharacter)
706 then (add-to-coll coll (to-preferred-case ch))
708 (if* (and (eq name-length 3)
709 (coll-has-comment coll))
710 then (clear-coll coll)
711 (setq state state-readcomment))
713 else (setq tag-to-return (compute-tag coll))
716 then (return) ; we're done
717 elseif xml-bailout then
718 (un-next-char stream ch)
720 else (if* (eq tag-to-return :!--)
722 (setq state state-readcomment)
723 else (un-next-char stream ch)
724 (setq state state-findattribname)))))
726 (#.state-findattribname
727 ;; search until we find the start of an attribute name
728 ;; or the end of the tag
730 then ; end of the line
733 then ; value for previous attribute name
734 ; (syntax "foo = bar" is bogus I think but it's
735 ; used some places, here is where we handle this
736 (pop attribs-to-return)
737 (setq attrib-name (pop attribs-to-return))
738 (setq state state-findvalue)
739 elseif (char-characteristic ch char-attribnamechar)
740 then (un-next-char stream ch)
741 (setq state state-attribname)
742 else nil ; ignore other things
746 ;; find the start of the value
747 (if* (char-characteristic ch char-spacechar)
748 thenret ; keep looking
750 then ; no value, set the value to be the
753 (string-downcase (string attrib-name)))
755 (push attrib-name attribs-to-return)
756 (push attrib-value attribs-to-return)
757 (un-next-char stream ch)
758 (setq state state-findattribname)
759 else (un-next-char stream ch)
760 (setq state state-attribstartvalue)))
764 ;; collect attribute name
766 (if* (char-characteristic ch char-attribnamechar)
767 then (add-to-coll coll (to-preferred-case ch))
769 then ; end of attribute name, value is next
770 (setq attrib-name (compute-tag coll))
772 (setq state state-attribstartvalue)
773 else ; end of attribute name with no value,
774 (setq attrib-name (compute-tag coll))
777 (string-downcase (string attrib-name)))
778 (push attrib-name attribs-to-return)
779 (push attrib-value attribs-to-return)
780 (un-next-char stream ch)
781 (setq state state-findattribname)))
783 (#.state-attribstartvalue
784 ;; begin to collect value
787 then (setq value-delim ch)
788 (setq state state-attribvaluedelim)
789 ;; gobble spaces; assume since we've seen a '=' there really is a value
790 elseif (eq #\space ch) then nil
791 else (un-next-char stream ch)
792 (setq state state-attribvaluenodelim)))
794 (#.state-attribvaluedelim
795 (if* (eq ch value-delim)
796 then (setq attrib-value (compute-coll-string coll))
798 (push attrib-name attribs-to-return)
799 (push attrib-value attribs-to-return)
800 (setq state state-findattribname)
801 else (add-to-coll coll ch)))
803 (#.state-attribvaluenodelim
804 ;; an attribute value not delimited by ' or " and thus restricted
805 ;; in the possible characters
806 (if* (char-characteristic ch char-attribundelimattribvalue)
807 then (add-to-coll coll ch)
808 else (un-next-char stream ch)
809 (setq attrib-value (compute-coll-string coll))
811 (push attrib-name attribs-to-return)
812 (push attrib-value attribs-to-return)
813 (setq state state-findattribname)))
816 ;; a comment ends on the first --, but we'll look for -->
817 ;; since that's what most people expect
819 then (setq state state-readcomment-one)
820 else (add-to-coll coll ch)))
822 (#.state-readcomment-one
823 ;; seen one -, looking for ->
826 then (setq state state-readcomment-two)
827 else ; not a comment end, put back the -'s
828 (add-to-coll coll #\-)
829 (add-to-coll coll ch)
830 (setq state state-readcomment)))
832 (#.state-readcomment-two
833 ;; seen two -'s, looking for >
836 then ; end of the line
839 then ; still at two -'s, have to put out first
840 (add-to-coll coll #\-)
841 else ; put out two hypens and back to looking for a hypen
842 (add-to-coll coll #\-)
843 (add-to-coll coll #\-)
844 (setq state state-readcomment)))
847 ;; collect everything until we see the delimiter
848 (if* (eq (to-preferred-case ch) (elt raw-mode-delimiter raw-length))
851 (when (= raw-length (length raw-mode-delimiter))
852 ;; push the end tag back so it can then be lexed
853 ;; but don't do it for xml stuff
854 (when (/= (length raw-mode-delimiter) 1)
855 (push :end-tag (tokenbuf-first-pass tokenbuf))
856 (if* (equal raw-mode-delimiter "</STYLE>")
857 then (push :STYLE (tokenbuf-first-pass tokenbuf))
858 elseif (equal raw-mode-delimiter "</style>")
859 then (push :style (tokenbuf-first-pass tokenbuf))
860 elseif (equal raw-mode-delimiter "</SCRIPT>")
861 then (push :SCRIPT (tokenbuf-first-pass tokenbuf))
862 elseif (equal raw-mode-delimiter "</script>")
863 then (push :script (tokenbuf-first-pass tokenbuf))
864 else (error "unexpected raw-mode-delimiter"))
866 ;; set state to state-pcdata for next section
869 ;; push partial matches into data string
870 (dotimes (i raw-length)
871 (add-to-coll coll (elt raw-mode-delimiter i)))
873 (add-to-coll coll ch)))
879 ;; if we're in certain states then it means we should return a value
882 ((#.state-pcdata #.state-rawdata)
883 ;; return the buffer as a string
884 (if* (zerop (collector-next coll))
885 then (values nil (if (eq state state-pcdata) :eof :pcdata))
887 (if* (null ignore-strings)
888 then (compute-coll-string coll))
889 (put-back-collector coll))
893 (when (null tag-to-return)
894 (error "unexpected end of input encountered"))
895 ;; we've read a tag with no attributes
896 (put-back-collector coll)
897 (values tag-to-return
900 else (if* xml-bailout then :xml else :start-tag))
903 (#.state-findattribname
904 ;; returning a tag with possible attributes
905 (put-back-collector coll)
907 then ; ignore any attributes
908 (values tag-to-return :end-tag)
909 elseif attribs-to-return
910 then (values (cons tag-to-return
911 (nreverse attribs-to-return))
913 else (values tag-to-return :start-tag)))
915 (#.state-readcomment-two
916 ;; returning a comment
917 (values (prog1 (if* (null ignore-strings)
918 then (compute-coll-string coll))
919 (put-back-collector coll))
923 (if* (null ch) then (error "unexpected end of input encountered")
924 else (error "internal error, can't be here in state ~d" state)))))))
927 (defvar *kwd-package* (find-package :keyword))
929 (defun compute-tag (coll)
930 (declare (optimize (speed 3) (safety 1)))
931 ;; compute the symbol named by what's in the collector
932 (excl::intern* (collector-data coll) (collector-next coll) *kwd-package*))
936 (defun compute-coll-string (coll)
937 (declare (optimize (speed 3) (safety 1)))
938 ;; return the string that's in the collection
939 (let ((str (make-string (collector-next coll)))
940 (from (collector-data coll)))
941 (dotimes (i (collector-next coll))
942 (setf (schar str i) (schar from i)))
946 (defun coll-has-comment (coll)
947 (declare (optimize (speed 3) (safety 1)))
948 ;; true if the collector has exactly "!--" in it
949 (and (eq 3 (collector-next coll))
950 (let ((data (collector-data coll)))
951 (and (eq #\! (schar data 0))
952 (eq #\- (schar data 1))
953 (eq #\- (schar data 2))))))
956 ;;;;;;;;;;; quick and dirty parse
958 ; the elements with no body and thus no end tag
959 (dolist (opt '(:area :base :basefont :bgsound :br :button :col
960 ;;:colgroup - no, this is an element with contents
961 :embed :hr :img :frame
962 :input :isindex :keygen :link :meta
963 :plaintext :spacer :wbr))
964 (setf (tag-no-end opt) t))
966 (defvar *in-line* '(:tt :i :b :big :small :em :strong :dfn :code :samp :kbd
967 :var :cite :abbr :acronym :a :img :object :br :map
968 :q :sub :sup :span :bdo :input :select :textarea :label :button :font))
970 (defvar *ch-format* '(:i :b :tt :big :small :strike :s :u
973 (defvar *known-tags* '(:!doctype :a :acronym :address :applet :area :b :base :basefont
974 :bdo :bgsound :big :blink :blockquote :body :br :button :caption
975 :center :cite :code :col :colgroup :comment :dd :del :dfn :dir
976 :div :dl :dt :em :embed :fieldset :font :form :frame :frameset
977 :h1 :h2 :h3 :h4 :h5 :h6 :head :hr :html :i :iframe :img :input
978 :ins :isindex :kbd :label :layer :legend :li :link :listing :map
979 :marquee :menu :meta :multicol :nobr :noframes :noscript :object
980 :ol :option :p :param :plaintext :pre :q :samp :script :select
981 :small :spacer :span :s :strike :strong :style :sub :sup :table
982 :tbody :td :textarea :tfoot :th :thead :title :tr :tt :u :ul :var
985 ; the elements whose start tag can end a previous tag
987 (setf (tag-auto-close :tr) '(:tr :td :th :colgroup))
988 (setf (tag-auto-close-stop :tr) '(:table))
990 (setf (tag-auto-close :td) '(:td :th))
991 (setf (tag-auto-close-stop :td) '(:table))
993 (setf (tag-auto-close :th) '(:td :th))
994 (setf (tag-auto-close-stop :th) '(:table))
996 (setf (tag-auto-close :dt) '(:dt :dd))
997 (setf (tag-auto-close-stop :dt) '(:dl))
999 (setf (tag-auto-close :li) '(:li))
1000 (setf (tag-auto-close-stop :li) '(:ul :ol))
1002 ;; new stuff to close off tags with optional close tags
1003 (setf (tag-auto-close :address) '(:head :p))
1004 (setf (tag-auto-close :blockquote) '(:head :p))
1005 (setf (tag-auto-close :body) '(:body :frameset :head))
1007 (setf (tag-auto-close :dd) '(:dd :dt))
1008 (setf (tag-auto-close-stop :dd) '(:dl))
1010 (setf (tag-auto-close :dl) '(:head :p))
1011 (setf (tag-auto-close :div) '(:head :p))
1012 (setf (tag-auto-close :fieldset) '(:head :p))
1013 (setf (tag-auto-close :form) '(:head :p))
1014 (setf (tag-auto-close :frameset) '(:body :frameset :head))
1015 (setf (tag-auto-close :hr) '(:head :p))
1016 (setf (tag-auto-close :h1) '(:head :p))
1017 (setf (tag-auto-close :h2) '(:head :p))
1018 (setf (tag-auto-close :h3) '(:head :p))
1019 (setf (tag-auto-close :h4) '(:head :p))
1020 (setf (tag-auto-close :h5) '(:head :p))
1021 (setf (tag-auto-close :h6) '(:head :p))
1022 (setf (tag-auto-close :noscript) '(:head :p))
1023 (setf (tag-auto-close :ol) '(:head :p))
1025 (setf (tag-auto-close :option) '(:option))
1026 (setf (tag-auto-close-stop :option) '(:select))
1028 (setf (tag-auto-close :p) '(:head :p))
1030 (setf (tag-auto-close :pre) '(:head :p))
1031 (setf (tag-auto-close :table) '(:head :p))
1033 (setf (tag-auto-close :tbody) '(:colgroup :tfoot :tbody :thead))
1034 (setf (tag-auto-close-stop :tbody) '(:table))
1036 (setf (tag-auto-close :tfoot) '(:colgroup :tfoot :tbody :thead))
1037 (setf (tag-auto-close-stop :tfoot) '(:table))
1039 (setf (tag-auto-close :thead) '(:colgroup :tfoot :tbody :thead))
1040 (setf (tag-auto-close-stop :thead) '(:table))
1042 (setf (tag-auto-close :ul) '(:head :p))
1044 (setf (tag-no-pcdata :table) t)
1045 (setf (tag-no-pcdata :tr) t)
1048 (defmethod parse-html ((p stream) &key callback-only callbacks collect-rogue-tags
1051 (declare (optimize (speed 3) (safety 1)))
1052 (phtml-internal p nil callback-only callbacks collect-rogue-tags
1053 no-body-tags parse-entities))
1055 (defmacro tag-callback (tag)
1056 `(rest (assoc ,tag callbacks)))
1058 (defun phtml-internal (p read-sequence-func callback-only
1059 callbacks collect-rogue-tags
1062 (declare (optimize (speed 3) (safety 1)))
1063 (let ((raw-mode-delimiter nil)
1065 (current-tag :start-parse)
1066 (last-tag :start-parse)
1067 (current-callback-tags nil)
1068 (pending-ch-format nil)
1069 (closed-pending-ch-format nil)
1071 (tokenbuf (get-tokenbuf))
1075 (labels ((close-off-tags (name stop-at collect-rogues once-only)
1076 ;; close off an open 'name' tag, but search no further
1077 ;; than a 'stop-at' tag.
1078 #+ignore (format t "close off name ~s, stop at ~s, ct ~s~%"
1079 name stop-at current-tag)
1080 (if* (member (tag-name current-tag) name :test #'eq)
1081 then ;; close current tag(s)
1083 (when (and collect-rogues
1084 (not (member (tag-name current-tag)
1086 (push (tag-name current-tag) rogue-tags))
1089 (member (tag-name current-tag)
1092 (tag-name current-tag) name :test #'eq)))
1094 elseif (member (tag-name current-tag) stop-at :test #'eq)
1096 else ; search if there is a tag to close
1097 (dolist (ent pending)
1098 (if* (member (tag-name (car ent)) name :test #'eq)
1099 then ; found one to close
1101 (when (and collect-rogues
1102 (not (member (tag-name current-tag)
1104 (push (tag-name current-tag) rogue-tags))
1106 (if* (member (tag-name current-tag) name
1108 then (close-current-tag)
1111 elseif (member (tag-name (car ent)) stop-at
1113 then (return) ;; do nothing
1116 (close-current-tag ()
1117 ;; close off the current tag and open the pending tag
1118 (when (member (tag-name current-tag) *ch-format* :test #'eq)
1119 (push current-tag closed-pending-ch-format)
1122 (if* (tag-no-pcdata (tag-name current-tag))
1123 then (setq element `(,current-tag
1124 ,@(strip-rev-pcdata guts)))
1125 else (setq element `(,current-tag ,@(nreverse guts))))
1126 (let ((callback (tag-callback (tag-name current-tag))))
1128 (setf current-callback-tags (rest current-callback-tags))
1129 (funcall callback element)))
1130 (let* ((prev (pop pending)))
1131 (setq current-tag (car prev)
1133 (push element guts))))
1136 ;; push the current tag state since we're starting:
1138 (push (cons current-tag guts) pending)
1139 #+ignore (format t "state saved, pending ~s~%" pending)
1143 (strip-rev-pcdata (stuff)
1144 ;; reverse the list stuff, omitting all the strings
1147 (if* (not (stringp st)) then (push st res)))
1149 (check-in-line (check-tag)
1150 (setf new-opens nil)
1151 (let (val kind (i 0)
1152 (length (length (tokenbuf-first-pass tokenbuf))))
1154 (if* (< i length) then
1155 (setf val (nth i (tokenbuf-first-pass tokenbuf)))
1156 (setf kind (nth (+ i 1) (tokenbuf-first-pass tokenbuf)))
1158 (if* (= i length) then (setf (tokenbuf-first-pass tokenbuf)
1159 (nreverse (tokenbuf-first-pass tokenbuf))))
1161 (multiple-value-setq (val kind)
1163 (push val (tokenbuf-first-pass tokenbuf))
1164 (push kind (tokenbuf-first-pass tokenbuf))
1166 (when (eq kind :eof)
1167 (if* (= i length) then
1168 (setf (tokenbuf-first-pass tokenbuf)
1169 (nreverse (tokenbuf-first-pass tokenbuf))))
1171 (when (and (eq val check-tag) (eq kind :end-tag))
1172 (if* (= i length) then
1173 (setf (tokenbuf-first-pass tokenbuf)
1174 (nreverse (tokenbuf-first-pass tokenbuf))))
1176 (when (member val *ch-format* :test #'eq)
1177 (if* (eq kind :start-tag) then (push val new-opens)
1178 elseif (member val new-opens :test #'eq) then
1179 (setf new-opens (remove val new-opens :count 1))
1180 else (close-off-tags (list val) nil nil nil)
1183 (get-next-token (force)
1184 (if* (or force (null (tokenbuf-first-pass tokenbuf))) then
1185 (multiple-value-bind (val kind)
1186 (next-token p nil raw-mode-delimiter read-sequence-func
1187 tokenbuf parse-entities)
1190 (let ((val (first (tokenbuf-first-pass tokenbuf)))
1191 (kind (second (tokenbuf-first-pass tokenbuf))))
1192 (setf (tokenbuf-first-pass tokenbuf)
1193 (rest (rest (tokenbuf-first-pass tokenbuf))))
1194 (values val kind))))
1197 (multiple-value-bind (val kind)
1198 (get-next-token nil)
1199 #+ignore (format t "val: ~s kind: ~s last-tag ~s pending ~s~%" val kind
1203 (when (or (and callback-only current-callback-tags)
1204 (not callback-only))
1205 (if* (member last-tag *in-line*)
1209 (when (dotimes (i (length val) nil)
1210 (when (not (char-characteristic (elt val i)
1214 (when (and (= (length raw-mode-delimiter) 1) ;; xml tag...
1215 (or (and callback-only current-callback-tags)
1216 (not callback-only)))
1217 (close-off-tags (list last-tag) nil nil t))
1218 (setf raw-mode-delimiter nil)
1223 (setf raw-mode-delimiter ">")
1224 (let* ((name (tag-name val)))
1225 (when (and callback-only (tag-callback name))
1226 (push name current-callback-tags))
1228 (setq current-tag val)
1234 (if* (or (eq last-tag :style)
1235 (and (listp last-tag) (eq (first last-tag) :style)))
1237 (setf raw-mode-delimiter
1238 (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
1241 elseif (or (eq last-tag :script)
1242 (and (listp last-tag) (eq (first last-tag) :script)))
1244 (setf raw-mode-delimiter
1245 (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
1248 ; maybe this is an end tag too
1249 (let* ((name (tag-name val))
1250 (auto-close (tag-auto-close name))
1251 (auto-close-stop nil)
1252 (no-end (or (tag-no-end name) (member name no-body-tags))))
1253 (when (and callback-only (tag-callback name))
1254 (push name current-callback-tags))
1255 (when (or (and callback-only current-callback-tags)
1256 (not callback-only))
1258 then (setq auto-close-stop (tag-auto-close-stop name))
1259 (close-off-tags auto-close auto-close-stop nil nil))
1260 (when (and pending-ch-format (not no-end))
1261 (if* (member name *ch-format* :test #'eq) then nil
1262 elseif (member name *in-line* :test #'eq) then
1263 ;; close off only tags that are within *in-line* block
1264 (check-in-line name)
1265 else ;; close ALL pending char tags and then reopen
1266 (dolist (this-tag (reverse pending-ch-format))
1267 (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil nil))
1270 then ; this is a singleton tag
1271 (let ((callback (tag-callback (tag-name (if* (atom val)
1273 else (first val))))))
1275 (funcall callback (if* (atom val)
1278 (push (if* (atom val)
1283 (setq current-tag val)
1285 (if* (member name *ch-format* :test #'eq)
1286 then (push val pending-ch-format)
1288 (or (eq last-tag :style)
1289 (and (listp last-tag) (eq (first last-tag) :style))
1290 (eq last-tag :script)
1291 (and (listp last-tag) (eq (first last-tag) :script))))
1292 (dolist (tmp (reverse closed-pending-ch-format))
1294 (setf current-tag tmp)
1298 (or (eq last-tag :style)
1299 (and (listp last-tag) (eq (first last-tag) :style))
1300 (eq last-tag :script)
1301 (and (listp last-tag) (eq (first last-tag) :script))))
1302 (setf closed-pending-ch-format nil))
1306 (setf raw-mode-delimiter nil)
1307 (when (or (and callback-only current-callback-tags)
1308 (not callback-only))
1309 (close-off-tags (list val) nil nil t)
1310 (when (member val *ch-format* :test #'eq)
1311 (setf pending-ch-format
1312 (remove val pending-ch-format :count 1
1313 :test #'(lambda (x y) (eq x (if (listp y) (first y) y)))))
1314 (setf closed-pending-ch-format
1315 (remove val closed-pending-ch-format :count 1
1316 :test #'(lambda (x y) (eq x (if (listp y) (first y) y)))))
1318 (dolist (tmp (reverse closed-pending-ch-format))
1320 (setf current-tag tmp)
1322 (setf closed-pending-ch-format nil)
1326 (setf raw-mode-delimiter nil)
1327 (when (or (and callback-only current-callback-tags)
1328 (not callback-only))
1329 (push `(:comment ,val) guts)))
1332 (setf raw-mode-delimiter nil)
1333 ;; close off all tags
1334 (when (or (and callback-only current-callback-tags)
1335 (not callback-only))
1336 (close-off-tags '(:start-parse) nil collect-rogue-tags nil))
1337 (put-back-tokenbuf tokenbuf)
1338 (if collect-rogue-tags
1339 (return (values (cdar guts) rogue-tags))
1340 (return (cdar guts))))))))))
1344 (defmethod parse-html (file &key callback-only callbacks collect-rogue-tags
1345 no-body-tags parse-entities)
1346 (declare (optimize (speed 3) (safety 1)))
1347 (with-open-file (p file :direction :input)
1348 (parse-html p :callback-only callback-only :callbacks callbacks
1349 :collect-rogue-tags collect-rogue-tags
1350 :no-body-tags no-body-tags
1351 :parse-entities parse-entities
1355 (defmethod parse-html ((str string) &key callback-only callbacks collect-rogue-tags
1356 no-body-tags parse-entities)
1357 (declare (optimize (speed 3) (safety 1)))
1358 (parse-html (make-string-input-stream str)
1359 :callback-only callback-only :callbacks callbacks
1360 :collect-rogue-tags collect-rogue-tags
1361 :no-body-tags no-body-tags
1362 :parse-entities parse-entities
1375 ;;;(defun doit (ignore-data)
1376 ;;; (with-open-file (p "readme.htm")
1378 ;;; (multiple-value-bind (val kind) (next-token p ignore-data)
1379 ;;; ;(format t "~s -> ~s~%" kind val)
1381 ;;; (if* (eq kind :eof) then (return))))))
1383 ;;;(defun pdoit (&optional (file "testa.html"))
1384 ;;; (with-open-file (p file)
1385 ;;; (parse-html p)))
1388 ;;;;; requires http client module to work
1389 ;;;(defun getparse (host path)
1390 ;;; (parse-html (httpr-body
1392 ;;; (simple-get host path)))))