1 (sys:defpatch "phtml" 1
2 "parse-html close tag closes consecutive identical open tags."
6 ;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA
8 ;; This code is free software; you can redistribute it and/or
9 ;; modify it under the terms of the version 2.1 of
10 ;; the GNU Lesser General Public License as published by
11 ;; the Free Software Foundation, as clarified by the AllegroServe
12 ;; prequel found in license-allegroserve.txt.
14 ;; This code is distributed in the hope that it will be useful,
15 ;; but without any warranty; without even the implied warranty of
16 ;; merchantability or fitness for a particular purpose. See the GNU
17 ;; Lesser General Public License for more details.
19 ;; Version 2.1 of the GNU Lesser General Public License is in the file
20 ;; license-lgpl.txt that was distributed with this file.
21 ;; If it is not present, you can access it from
22 ;; http://www.gnu.org/copyleft/lesser.txt (until superseded by a newer
23 ;; version) or write to the Free Software Foundation, Inc., 59 Temple Place,
24 ;; Suite 330, Boston, MA 02111-1307 USA
27 ;; $Id: phtml.cl,v 1.2 2003/06/20 02:21:23 kevin Exp $
29 ;; phtml.cl - parse html
32 ;; 05/14/02 - add :parse-entities arg to parse-html. If true then
33 ;; entities are converted to the character they represent.
35 ;; 02/05/01 symbols mapped to preferred case at runtime (as opposed to
36 ;; a compile time macro determining the case mapping)
38 ;; 10/27/00 :callbacks arg now processed correctly for tags with no body
40 ;; 10/14/00 add first-pass member to tokenbuf structure; used to remove
41 ;; multiple un-next-char calls in raw mode
42 ;; removed :script from *in-line* (incorect and led to infinite loop)
43 ;; char format reopen not done in :script and :style
44 ;; fixed :table/:th tag-auto-close-stop typo
47 ; do character entity stuff
50 (defpackage net.html.parser
51 (:use :lisp :clos :excl)
56 (in-package :net.html.parser)
58 (defmacro tag-auto-close (tag) `(get ,tag 'tag-auto-close))
59 (defmacro tag-auto-close-stop (tag) `(get ,tag 'tag-auto-close-stop))
60 (defmacro tag-no-end (tag) `(get ,tag 'tag-no-end))
62 ; only subelements allowed in this element, no strings
63 (defmacro tag-no-pcdata (tag) `(get ,tag 'tag-no-pcdata))
65 ;; given :foo or (:foo ...) return :foo
66 (defmacro tag-name (expr)
76 (eval-when (compile load eval)
77 (defconstant state-pcdata 0) ; scanning for chars or a tag
78 (defconstant state-readtagfirst 1)
79 (defconstant state-readtag 2)
80 (defconstant state-findattribname 3)
81 (defconstant state-attribname 4)
82 (defconstant state-attribstartvalue 5)
83 (defconstant state-attribvaluedelim 6)
84 (defconstant state-attribvaluenodelim 7)
85 (defconstant state-readcomment 8)
86 (defconstant state-readcomment-one 9)
87 (defconstant state-readcomment-two 10)
88 (defconstant state-findvalue 11)
89 (defconstant state-rawdata 12)
94 next ; next index to set
95 max ; 1+max index to set
99 ;; keep a cache of collectors on this list
101 (defparameter *collectors* (list nil nil nil nil))
103 (defun get-collector ()
104 (declare (optimize (speed 3) (safety 1)))
106 (mp::without-scheduling
107 (do* ((cols *collectors* (cdr cols))
108 (this (car cols) (car cols)))
111 then (setf (car cols) nil)
115 then (setf (collector-next col) 0)
120 :data (make-string 100)))))
122 (defun put-back-collector (col)
123 (declare (optimize (speed 3) (safety 1)))
124 (mp::without-scheduling
125 (do ((cols *collectors* (cdr cols)))
129 (if* (null (car cols))
130 then (setf (car cols) col)
135 (defun grow-and-add (coll ch)
136 (declare (optimize (speed 3) (safety 1)))
137 ;; increase the size of the data portion of the collector and then
138 ;; add the given char at the end
139 (let* ((odata (collector-data coll))
140 (ndata (make-string (* 2 (length odata)))))
141 (dotimes (i (length odata))
142 (setf (schar ndata i) (schar odata i)))
143 (setf (collector-data coll) ndata)
144 (setf (collector-max coll) (length ndata))
145 (let ((next (collector-next coll)))
146 (setf (schar ndata next) ch)
147 (setf (collector-next coll) (1+ next)))))
155 ;; character characteristics
156 (defconstant char-tagcharacter 1) ; valid char for a tag
157 (defconstant char-attribnamechar 2) ; valid char for an attribute name
158 (defconstant char-attribundelimattribvalue 4) ; valid for undelimited value
159 (defconstant char-spacechar 8)
161 (defparameter *characteristics*
162 ;; array of bits describing character characteristics
163 (let ((arr (make-array 128 :initial-element 0)))
164 (declare (optimize (speed 3) (safety 1)))
165 (macrolet ((with-range ((var from to) &rest body)
166 `(do ((,var (char-code ,from) (1+ ,var))
167 (mmax (char-code ,to)))
171 (addit (index charistic)
172 `(setf (svref arr ,index)
173 (logior (svref arr ,index)
177 (with-range (i #\A #\Z)
178 (addit i (+ char-tagcharacter
180 char-attribundelimattribvalue)))
182 (with-range (i #\a #\z)
183 (addit i (+ char-tagcharacter
185 char-attribundelimattribvalue)))
187 (with-range (i #\0 #\9)
188 (addit i (+ char-tagcharacter
190 char-attribundelimattribvalue)))
192 ;; let colon be legal tag character
193 (addit (char-code #\:) (+ char-attribnamechar
196 ;; NY times special tags have _
197 (addit (char-code #\_) (+ char-attribnamechar
200 ; now the unusual cases
201 (addit (char-code #\-) (+ char-attribnamechar
202 char-attribundelimattribvalue))
203 (addit (char-code #\.) (+ char-attribnamechar
204 char-attribundelimattribvalue))
206 ;; adding all typeable chars except for whitespace and >
207 (addit (char-code #\:) char-attribundelimattribvalue)
208 (addit (char-code #\@) char-attribundelimattribvalue)
209 (addit (char-code #\/) char-attribundelimattribvalue)
210 (addit (char-code #\!) char-attribundelimattribvalue)
211 (addit (char-code #\#) char-attribundelimattribvalue)
212 (addit (char-code #\$) char-attribundelimattribvalue)
213 (addit (char-code #\%) char-attribundelimattribvalue)
214 (addit (char-code #\^) char-attribundelimattribvalue)
215 (addit (char-code #\&) char-attribundelimattribvalue)
216 (addit (char-code #\() char-attribundelimattribvalue)
217 (addit (char-code #\)) char-attribundelimattribvalue)
218 (addit (char-code #\_) char-attribundelimattribvalue)
219 (addit (char-code #\=) char-attribundelimattribvalue)
220 (addit (char-code #\+) char-attribundelimattribvalue)
221 (addit (char-code #\\) char-attribundelimattribvalue)
222 (addit (char-code #\|) char-attribundelimattribvalue)
223 (addit (char-code #\{) char-attribundelimattribvalue)
224 (addit (char-code #\}) char-attribundelimattribvalue)
225 (addit (char-code #\[) char-attribundelimattribvalue)
226 (addit (char-code #\]) char-attribundelimattribvalue)
227 (addit (char-code #\;) char-attribundelimattribvalue)
228 (addit (char-code #\') char-attribundelimattribvalue)
229 (addit (char-code #\") char-attribundelimattribvalue)
230 (addit (char-code #\,) char-attribundelimattribvalue)
231 (addit (char-code #\<) char-attribundelimattribvalue)
232 (addit (char-code #\?) char-attribundelimattribvalue)
234 ; i'm not sure what can be in a tag name but we know that
235 ; ! and - must be there since it's used in comments
237 (addit (char-code #\-) char-tagcharacter)
238 (addit (char-code #\!) char-tagcharacter)
241 (addit (char-code #\space) char-spacechar)
242 (addit (char-code #\tab) char-spacechar)
243 (addit (char-code #\return) char-spacechar)
244 (addit (char-code #\linefeed) char-spacechar)
253 (defun char-characteristic (char bit)
254 (declare (optimize (speed 3) (safety 1)))
255 ;; return true if the given char has the given bit set in
256 ;; the characteristic array
257 (let ((code (char-code char)))
260 (not (zerop (logand (svref *characteristics* code) bit))))))
263 (defvar *html-entity-to-code*
264 (let ((table (make-hash-table :test #'equal)))
265 (dolist (ent '(("nbsp" . 160)
518 (setf (gethash (car ent) table) (cdr ent)))
524 cur ;; next index to use to grab from tokenbuf
525 max ;; index one beyond last character
526 data ;; character array
527 first-pass ;; previously parsed tokens
530 ;; cache of tokenbuf structs
531 (defparameter *tokenbufs* (list nil nil nil nil))
533 (defun get-tokenbuf ()
534 (declare (optimize (speed 3) (safety 1)))
536 (mp::without-scheduling
537 (do* ((bufs *tokenbufs* (cdr bufs))
538 (this (car bufs) (car bufs)))
541 then (setf (car bufs) nil)
545 then (setf (tokenbuf-cur buf) 0)
546 (setf (tokenbuf-max buf) 0)
551 :data (make-array 1024 :element-type 'character)))))
553 (defun put-back-tokenbuf (buf)
554 (declare (optimize (speed 3) (safety 1)))
555 (mp::without-scheduling
556 (do ((bufs *tokenbufs* (cdr bufs)))
560 (if* (null (car bufs))
561 then (setf (car bufs) buf)
564 (defun to-preferred-case (ch)
565 (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
566 then (char-upcase ch)
567 else (char-downcase ch)))
570 (defun next-token (stream ignore-strings raw-mode-delimiter
571 read-sequence-func tokenbuf parse-entities)
572 (declare (optimize (speed 3) (safety 1)))
573 ;; return two values:
574 ;; the next token from the stream.
575 ;; the kind of token (:pcdata, :start-tag, :end-tag, :eof)
577 ;; if read-sequence-func is non-nil,
578 ;; read-sequence-func is called to fetch the next character
579 (macrolet ((next-char (stream)
580 `(let ((cur (tokenbuf-cur tokenbuf))
581 (tb (tokenbuf-data tokenbuf)))
582 (if* (>= cur (tokenbuf-max tokenbuf))
584 (if* (zerop (setf (tokenbuf-max tokenbuf)
585 (if* read-sequence-func
586 then (funcall read-sequence-func tb stream)
587 else (read-sequence tb stream))))
588 then (setq cur nil) ; eof
591 then (prog1 (schar tb cur)
592 (setf (tokenbuf-cur tokenbuf) (1+ cur))))))
595 (un-next-char (stream ch)
596 `(decf (tokenbuf-cur tokenbuf)))
599 `(setf (collector-next coll) 0))
601 (add-to-coll (coll ch)
602 `(let ((.next. (collector-next ,coll)))
603 (if* (>= .next. (collector-max ,coll))
604 then (grow-and-add ,coll ,ch)
605 else (setf (schar (collector-data ,coll) .next.)
607 (setf (collector-next ,coll) (1+ .next.)))))
611 (let ((state (if* raw-mode-delimiter then state-rawdata else state-pcdata))
612 (coll (get-collector))
625 (name-length 0) ;; count only when it could be a comment
633 (setq ch (next-char stream))
634 ;;(format t "ch: ~s state: ~s~%" ch state)
637 then (return) ; eof -- exit loop
643 ; collect everything until we see a <
645 then ; if we've collected nothing then get a tag
646 (if* (> (collector-next coll) 0)
647 then ; have collected something, return this string
648 (un-next-char stream ch) ; push back the <
651 (setq state state-readtagfirst))
652 elseif (and parse-entities (eq ch #\&))
653 then ; reading an entity. entity ends at semicolon
655 (loop (let ((ch (next-char stream)))
657 then (error "End of file after & entity marker")
660 elseif (zerop (decf max))
661 then (error "No semicolon found after entity starting: &~{~a~}" (nreverse res))
662 else (push ch res))))
663 (setq res (nreverse res))
664 (if* (eq (car res) #\#)
665 then ; decimal entity
667 (dolist (ch (cdr res))
668 (let ((code (char-code ch)))
669 (if* (<= #.(char-code #\0)
676 else (error "non decimal digit after &# - ~s" ch)
678 (add-to-coll coll (code-char count)))
679 else (let ((name (make-array (length res)
680 :element-type 'character
681 :initial-contents res)))
682 (let ((ch (gethash name *html-entity-to-code*)))
684 then (add-to-coll coll (code-char ch))
685 else (error "No such entity as ~s" name))))))
687 else ; we will check for & here eventually
688 (if* (not (eq ch #\return))
689 then (add-to-coll coll ch))))
691 (#.state-readtagfirst
692 ; starting to read a tag name
696 else (if* (eq #\! ch) ; possible comment
697 then (setf xml-bailout t)
698 (setq name-length 0))
699 (un-next-char stream ch))
700 (setq state state-readtag))
703 ;; reading the whole tag name
704 (if* (char-characteristic ch char-tagcharacter)
705 then (add-to-coll coll (to-preferred-case ch))
707 (if* (and (eq name-length 3)
708 (coll-has-comment coll))
709 then (clear-coll coll)
710 (setq state state-readcomment))
712 else (setq tag-to-return (compute-tag coll))
715 then (return) ; we're done
716 elseif xml-bailout then
717 (un-next-char stream ch)
719 else (if* (eq tag-to-return :!--)
721 (setq state state-readcomment)
722 else (un-next-char stream ch)
723 (setq state state-findattribname)))))
725 (#.state-findattribname
726 ;; search until we find the start of an attribute name
727 ;; or the end of the tag
729 then ; end of the line
732 then ; value for previous attribute name
733 ; (syntax "foo = bar" is bogus I think but it's
734 ; used some places, here is where we handle this
735 (pop attribs-to-return)
736 (setq attrib-name (pop attribs-to-return))
737 (setq state state-findvalue)
738 elseif (char-characteristic ch char-attribnamechar)
739 then (un-next-char stream ch)
740 (setq state state-attribname)
741 else nil ; ignore other things
745 ;; find the start of the value
746 (if* (char-characteristic ch char-spacechar)
747 thenret ; keep looking
749 then ; no value, set the value to be the
752 (string-downcase (string attrib-name)))
754 (push attrib-name attribs-to-return)
755 (push attrib-value attribs-to-return)
756 (un-next-char stream ch)
757 (setq state state-findattribname)
758 else (un-next-char stream ch)
759 (setq state state-attribstartvalue)))
763 ;; collect attribute name
765 (if* (char-characteristic ch char-attribnamechar)
766 then (add-to-coll coll (to-preferred-case ch))
768 then ; end of attribute name, value is next
769 (setq attrib-name (compute-tag coll))
771 (setq state state-attribstartvalue)
772 else ; end of attribute name with no value,
773 (setq attrib-name (compute-tag coll))
776 (string-downcase (string attrib-name)))
777 (push attrib-name attribs-to-return)
778 (push attrib-value attribs-to-return)
779 (un-next-char stream ch)
780 (setq state state-findattribname)))
782 (#.state-attribstartvalue
783 ;; begin to collect value
786 then (setq value-delim ch)
787 (setq state state-attribvaluedelim)
788 ;; gobble spaces; assume since we've seen a '=' there really is a value
789 elseif (eq #\space ch) then nil
790 else (un-next-char stream ch)
791 (setq state state-attribvaluenodelim)))
793 (#.state-attribvaluedelim
794 (if* (eq ch value-delim)
795 then (setq attrib-value (compute-coll-string coll))
797 (push attrib-name attribs-to-return)
798 (push attrib-value attribs-to-return)
799 (setq state state-findattribname)
800 else (add-to-coll coll ch)))
802 (#.state-attribvaluenodelim
803 ;; an attribute value not delimited by ' or " and thus restricted
804 ;; in the possible characters
805 (if* (char-characteristic ch char-attribundelimattribvalue)
806 then (add-to-coll coll ch)
807 else (un-next-char stream ch)
808 (setq attrib-value (compute-coll-string coll))
810 (push attrib-name attribs-to-return)
811 (push attrib-value attribs-to-return)
812 (setq state state-findattribname)))
815 ;; a comment ends on the first --, but we'll look for -->
816 ;; since that's what most people expect
818 then (setq state state-readcomment-one)
819 else (add-to-coll coll ch)))
821 (#.state-readcomment-one
822 ;; seen one -, looking for ->
825 then (setq state state-readcomment-two)
826 else ; not a comment end, put back the -'s
827 (add-to-coll coll #\-)
828 (add-to-coll coll ch)
829 (setq state state-readcomment)))
831 (#.state-readcomment-two
832 ;; seen two -'s, looking for >
835 then ; end of the line
838 then ; still at two -'s, have to put out first
839 (add-to-coll coll #\-)
840 else ; put out two hypens and back to looking for a hypen
841 (add-to-coll coll #\-)
842 (add-to-coll coll #\-)
843 (setq state state-readcomment)))
846 ;; collect everything until we see the delimiter
847 (if* (eq (to-preferred-case ch) (elt raw-mode-delimiter raw-length))
850 (when (= raw-length (length raw-mode-delimiter))
851 ;; push the end tag back so it can then be lexed
852 ;; but don't do it for xml stuff
853 (when (/= (length raw-mode-delimiter) 1)
854 (push :end-tag (tokenbuf-first-pass tokenbuf))
855 (if* (equal raw-mode-delimiter "</STYLE>")
856 then (push :STYLE (tokenbuf-first-pass tokenbuf))
857 elseif (equal raw-mode-delimiter "</style>")
858 then (push :style (tokenbuf-first-pass tokenbuf))
859 elseif (equal raw-mode-delimiter "</SCRIPT>")
860 then (push :SCRIPT (tokenbuf-first-pass tokenbuf))
861 elseif (equal raw-mode-delimiter "</script>")
862 then (push :script (tokenbuf-first-pass tokenbuf))
863 else (error "unexpected raw-mode-delimiter"))
865 ;; set state to state-pcdata for next section
868 ;; push partial matches into data string
869 (dotimes (i raw-length)
870 (add-to-coll coll (elt raw-mode-delimiter i)))
872 (add-to-coll coll ch)))
878 ;; if we're in certain states then it means we should return a value
881 ((#.state-pcdata #.state-rawdata)
882 ;; return the buffer as a string
883 (if* (zerop (collector-next coll))
884 then (values nil (if (eq state state-pcdata) :eof :pcdata))
886 (if* (null ignore-strings)
887 then (compute-coll-string coll))
888 (put-back-collector coll))
892 (when (null tag-to-return)
893 (error "unexpected end of input encountered"))
894 ;; we've read a tag with no attributes
895 (put-back-collector coll)
896 (values tag-to-return
899 else (if* xml-bailout then :xml else :start-tag))
902 (#.state-findattribname
903 ;; returning a tag with possible attributes
904 (put-back-collector coll)
906 then ; ignore any attributes
907 (values tag-to-return :end-tag)
908 elseif attribs-to-return
909 then (values (cons tag-to-return
910 (nreverse attribs-to-return))
912 else (values tag-to-return :start-tag)))
914 (#.state-readcomment-two
915 ;; returning a comment
916 (values (prog1 (if* (null ignore-strings)
917 then (compute-coll-string coll))
918 (put-back-collector coll))
922 (if* (null ch) then (error "unexpected end of input encountered")
923 else (error "internal error, can't be here in state ~d" state)))))))
926 (defvar *kwd-package* (find-package :keyword))
928 (defun compute-tag (coll)
929 (declare (optimize (speed 3) (safety 1)))
930 ;; compute the symbol named by what's in the collector
931 (excl::intern* (collector-data coll) (collector-next coll) *kwd-package*))
935 (defun compute-coll-string (coll)
936 (declare (optimize (speed 3) (safety 1)))
937 ;; return the string that's in the collection
938 (let ((str (make-string (collector-next coll)))
939 (from (collector-data coll)))
940 (dotimes (i (collector-next coll))
941 (setf (schar str i) (schar from i)))
945 (defun coll-has-comment (coll)
946 (declare (optimize (speed 3) (safety 1)))
947 ;; true if the collector has exactly "!--" in it
948 (and (eq 3 (collector-next coll))
949 (let ((data (collector-data coll)))
950 (and (eq #\! (schar data 0))
951 (eq #\- (schar data 1))
952 (eq #\- (schar data 2))))))
955 ;;;;;;;;;;; quick and dirty parse
957 ; the elements with no body and thus no end tag
958 (dolist (opt '(:area :base :basefont :bgsound :br :button :col
959 ;;:colgroup - no, this is an element with contents
960 :embed :hr :img :frame
961 :input :isindex :keygen :link :meta
962 :plaintext :spacer :wbr))
963 (setf (tag-no-end opt) t))
965 (defvar *in-line* '(:tt :i :b :big :small :em :strong :dfn :code :samp :kbd
966 :var :cite :abbr :acronym :a :img :object :br :map
967 :q :sub :sup :span :bdo :input :select :textarea :label :button :font))
969 (defvar *ch-format* '(:i :b :tt :big :small :strike :s :u
972 (defvar *known-tags* '(:!doctype :a :acronym :address :applet :area :b :base :basefont
973 :bdo :bgsound :big :blink :blockquote :body :br :button :caption
974 :center :cite :code :col :colgroup :comment :dd :del :dfn :dir
975 :div :dl :dt :em :embed :fieldset :font :form :frame :frameset
976 :h1 :h2 :h3 :h4 :h5 :h6 :head :hr :html :i :iframe :img :input
977 :ins :isindex :kbd :label :layer :legend :li :link :listing :map
978 :marquee :menu :meta :multicol :nobr :noframes :noscript :object
979 :ol :option :p :param :plaintext :pre :q :samp :script :select
980 :small :spacer :span :s :strike :strong :style :sub :sup :table
981 :tbody :td :textarea :tfoot :th :thead :title :tr :tt :u :ul :var
984 ; the elements whose start tag can end a previous tag
986 (setf (tag-auto-close :tr) '(:tr :td :th :colgroup))
987 (setf (tag-auto-close-stop :tr) '(:table))
989 (setf (tag-auto-close :td) '(:td :th))
990 (setf (tag-auto-close-stop :td) '(:table))
992 (setf (tag-auto-close :th) '(:td :th))
993 (setf (tag-auto-close-stop :th) '(:table))
995 (setf (tag-auto-close :dt) '(:dt :dd))
996 (setf (tag-auto-close-stop :dt) '(:dl))
998 (setf (tag-auto-close :li) '(:li))
999 (setf (tag-auto-close-stop :li) '(:ul :ol))
1001 ;; new stuff to close off tags with optional close tags
1002 (setf (tag-auto-close :address) '(:head :p))
1003 (setf (tag-auto-close :blockquote) '(:head :p))
1004 (setf (tag-auto-close :body) '(:body :frameset :head))
1006 (setf (tag-auto-close :dd) '(:dd :dt))
1007 (setf (tag-auto-close-stop :dd) '(:dl))
1009 (setf (tag-auto-close :dl) '(:head :p))
1010 (setf (tag-auto-close :div) '(:head :p))
1011 (setf (tag-auto-close :fieldset) '(:head :p))
1012 (setf (tag-auto-close :form) '(:head :p))
1013 (setf (tag-auto-close :frameset) '(:body :frameset :head))
1014 (setf (tag-auto-close :hr) '(:head :p))
1015 (setf (tag-auto-close :h1) '(:head :p))
1016 (setf (tag-auto-close :h2) '(:head :p))
1017 (setf (tag-auto-close :h3) '(:head :p))
1018 (setf (tag-auto-close :h4) '(:head :p))
1019 (setf (tag-auto-close :h5) '(:head :p))
1020 (setf (tag-auto-close :h6) '(:head :p))
1021 (setf (tag-auto-close :noscript) '(:head :p))
1022 (setf (tag-auto-close :ol) '(:head :p))
1024 (setf (tag-auto-close :option) '(:option))
1025 (setf (tag-auto-close-stop :option) '(:select))
1027 (setf (tag-auto-close :p) '(:head :p))
1029 (setf (tag-auto-close :pre) '(:head :p))
1030 (setf (tag-auto-close :table) '(:head :p))
1032 (setf (tag-auto-close :tbody) '(:colgroup :tfoot :tbody :thead))
1033 (setf (tag-auto-close-stop :tbody) '(:table))
1035 (setf (tag-auto-close :tfoot) '(:colgroup :tfoot :tbody :thead))
1036 (setf (tag-auto-close-stop :tfoot) '(:table))
1038 (setf (tag-auto-close :thead) '(:colgroup :tfoot :tbody :thead))
1039 (setf (tag-auto-close-stop :thead) '(:table))
1041 (setf (tag-auto-close :ul) '(:head :p))
1043 (setf (tag-no-pcdata :table) t)
1044 (setf (tag-no-pcdata :tr) t)
1047 (defmethod parse-html ((p stream) &key callback-only callbacks collect-rogue-tags
1050 (declare (optimize (speed 3) (safety 1)))
1051 (phtml-internal p nil callback-only callbacks collect-rogue-tags
1052 no-body-tags parse-entities))
1054 (defmacro tag-callback (tag)
1055 `(rest (assoc ,tag callbacks)))
1057 (defun phtml-internal (p read-sequence-func callback-only
1058 callbacks collect-rogue-tags
1061 (declare (optimize (speed 3) (safety 1)))
1062 (let ((raw-mode-delimiter nil)
1064 (current-tag :start-parse)
1065 (last-tag :start-parse)
1066 (current-callback-tags nil)
1067 (pending-ch-format nil)
1068 (closed-pending-ch-format nil)
1070 (tokenbuf (get-tokenbuf))
1074 (labels ((close-off-tags (name stop-at collect-rogues once-only)
1075 ;; close off an open 'name' tag, but search no further
1076 ;; than a 'stop-at' tag.
1077 #+ignore (format t "close off name ~s, stop at ~s, ct ~s~%"
1078 name stop-at current-tag)
1079 (if* (member (tag-name current-tag) name :test #'eq)
1080 then ;; close current tag(s)
1082 (when (and collect-rogues
1083 (not (member (tag-name current-tag)
1085 (push (tag-name current-tag) rogue-tags))
1088 (member (tag-name current-tag)
1091 (tag-name current-tag) name :test #'eq)))
1093 elseif (member (tag-name current-tag) stop-at :test #'eq)
1095 else ; search if there is a tag to close
1096 (dolist (ent pending)
1097 (if* (member (tag-name (car ent)) name :test #'eq)
1098 then ; found one to close
1100 (when (and collect-rogues
1101 (not (member (tag-name current-tag)
1103 (push (tag-name current-tag) rogue-tags))
1105 (if* (member (tag-name current-tag) name
1107 then (close-current-tag)
1110 elseif (member (tag-name (car ent)) stop-at
1112 then (return) ;; do nothing
1115 (close-current-tag ()
1116 ;; close off the current tag and open the pending tag
1117 (when (member (tag-name current-tag) *ch-format* :test #'eq)
1118 (push current-tag closed-pending-ch-format)
1121 (if* (tag-no-pcdata (tag-name current-tag))
1122 then (setq element `(,current-tag
1123 ,@(strip-rev-pcdata guts)))
1124 else (setq element `(,current-tag ,@(nreverse guts))))
1125 (let ((callback (tag-callback (tag-name current-tag))))
1127 (setf current-callback-tags (rest current-callback-tags))
1128 (funcall callback element)))
1129 (let* ((prev (pop pending)))
1130 (setq current-tag (car prev)
1132 (push element guts))))
1135 ;; push the current tag state since we're starting:
1137 (push (cons current-tag guts) pending)
1138 #+ignore (format t "state saved, pending ~s~%" pending)
1142 (strip-rev-pcdata (stuff)
1143 ;; reverse the list stuff, omitting all the strings
1146 (if* (not (stringp st)) then (push st res)))
1148 (check-in-line (check-tag)
1149 (setf new-opens nil)
1150 (let (val kind (i 0)
1151 (length (length (tokenbuf-first-pass tokenbuf))))
1153 (if* (< i length) then
1154 (setf val (nth i (tokenbuf-first-pass tokenbuf)))
1155 (setf kind (nth (+ i 1) (tokenbuf-first-pass tokenbuf)))
1157 (if* (= i length) then (setf (tokenbuf-first-pass tokenbuf)
1158 (nreverse (tokenbuf-first-pass tokenbuf))))
1160 (multiple-value-setq (val kind)
1162 (push val (tokenbuf-first-pass tokenbuf))
1163 (push kind (tokenbuf-first-pass tokenbuf))
1165 (when (eq kind :eof)
1166 (if* (= i length) then
1167 (setf (tokenbuf-first-pass tokenbuf)
1168 (nreverse (tokenbuf-first-pass tokenbuf))))
1170 (when (and (eq val check-tag) (eq kind :end-tag))
1171 (if* (= i length) then
1172 (setf (tokenbuf-first-pass tokenbuf)
1173 (nreverse (tokenbuf-first-pass tokenbuf))))
1175 (when (member val *ch-format* :test #'eq)
1176 (if* (eq kind :start-tag) then (push val new-opens)
1177 elseif (member val new-opens :test #'eq) then
1178 (setf new-opens (remove val new-opens :count 1))
1179 else (close-off-tags (list val) nil nil nil)
1182 (get-next-token (force)
1183 (if* (or force (null (tokenbuf-first-pass tokenbuf))) then
1184 (multiple-value-bind (val kind)
1185 (next-token p nil raw-mode-delimiter read-sequence-func
1186 tokenbuf parse-entities)
1189 (let ((val (first (tokenbuf-first-pass tokenbuf)))
1190 (kind (second (tokenbuf-first-pass tokenbuf))))
1191 (setf (tokenbuf-first-pass tokenbuf)
1192 (rest (rest (tokenbuf-first-pass tokenbuf))))
1193 (values val kind))))
1196 (multiple-value-bind (val kind)
1197 (get-next-token nil)
1198 #+ignore (format t "val: ~s kind: ~s last-tag ~s pending ~s~%" val kind
1202 (when (or (and callback-only current-callback-tags)
1203 (not callback-only))
1204 (if* (member last-tag *in-line*)
1208 (when (dotimes (i (length val) nil)
1209 (when (not (char-characteristic (elt val i)
1213 (when (and (= (length raw-mode-delimiter) 1) ;; xml tag...
1214 (or (and callback-only current-callback-tags)
1215 (not callback-only)))
1216 (close-off-tags (list last-tag) nil nil t))
1217 (setf raw-mode-delimiter nil)
1222 (setf raw-mode-delimiter ">")
1223 (let* ((name (tag-name val)))
1224 (when (and callback-only (tag-callback name))
1225 (push name current-callback-tags))
1227 (setq current-tag val)
1233 (if* (or (eq last-tag :style)
1234 (and (listp last-tag) (eq (first last-tag) :style)))
1236 (setf raw-mode-delimiter
1237 (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
1240 elseif (or (eq last-tag :script)
1241 (and (listp last-tag) (eq (first last-tag) :script)))
1243 (setf raw-mode-delimiter
1244 (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
1247 ; maybe this is an end tag too
1248 (let* ((name (tag-name val))
1249 (auto-close (tag-auto-close name))
1250 (auto-close-stop nil)
1251 (no-end (or (tag-no-end name) (member name no-body-tags))))
1252 (when (and callback-only (tag-callback name))
1253 (push name current-callback-tags))
1254 (when (or (and callback-only current-callback-tags)
1255 (not callback-only))
1257 then (setq auto-close-stop (tag-auto-close-stop name))
1258 (close-off-tags auto-close auto-close-stop nil nil))
1259 (when (and pending-ch-format (not no-end))
1260 (if* (member name *ch-format* :test #'eq) then nil
1261 elseif (member name *in-line* :test #'eq) then
1262 ;; close off only tags that are within *in-line* block
1263 (check-in-line name)
1264 else ;; close ALL pending char tags and then reopen
1265 (dolist (this-tag (reverse pending-ch-format))
1266 (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil nil))
1269 then ; this is a singleton tag
1270 (let ((callback (tag-callback (tag-name (if* (atom val)
1272 else (first val))))))
1274 (funcall callback (if* (atom val)
1277 (push (if* (atom val)
1282 (setq current-tag val)
1284 (if* (member name *ch-format* :test #'eq)
1285 then (push val pending-ch-format)
1287 (or (eq last-tag :style)
1288 (and (listp last-tag) (eq (first last-tag) :style))
1289 (eq last-tag :script)
1290 (and (listp last-tag) (eq (first last-tag) :script))))
1291 (dolist (tmp (reverse closed-pending-ch-format))
1293 (setf current-tag tmp)
1297 (or (eq last-tag :style)
1298 (and (listp last-tag) (eq (first last-tag) :style))
1299 (eq last-tag :script)
1300 (and (listp last-tag) (eq (first last-tag) :script))))
1301 (setf closed-pending-ch-format nil))
1305 (setf raw-mode-delimiter nil)
1306 (when (or (and callback-only current-callback-tags)
1307 (not callback-only))
1308 (close-off-tags (list val) nil nil t)
1309 (when (member val *ch-format* :test #'eq)
1310 (setf pending-ch-format
1311 (remove val pending-ch-format :count 1
1312 :test #'(lambda (x y) (eq x (if (listp y) (first y) y)))))
1313 (setf closed-pending-ch-format
1314 (remove val closed-pending-ch-format :count 1
1315 :test #'(lambda (x y) (eq x (if (listp y) (first y) y)))))
1317 (dolist (tmp (reverse closed-pending-ch-format))
1319 (setf current-tag tmp)
1321 (setf closed-pending-ch-format nil)
1325 (setf raw-mode-delimiter nil)
1326 (when (or (and callback-only current-callback-tags)
1327 (not callback-only))
1328 (push `(:comment ,val) guts)))
1331 (setf raw-mode-delimiter nil)
1332 ;; close off all tags
1333 (when (or (and callback-only current-callback-tags)
1334 (not callback-only))
1335 (close-off-tags '(:start-parse) nil collect-rogue-tags nil))
1336 (put-back-tokenbuf tokenbuf)
1337 (if collect-rogue-tags
1338 (return (values (cdar guts) rogue-tags))
1339 (return (cdar guts))))))))))
1343 (defmethod parse-html (file &key callback-only callbacks collect-rogue-tags
1344 no-body-tags parse-entities)
1345 (declare (optimize (speed 3) (safety 1)))
1346 (with-open-file (p file :direction :input)
1347 (parse-html p :callback-only callback-only :callbacks callbacks
1348 :collect-rogue-tags collect-rogue-tags
1349 :no-body-tags no-body-tags
1350 :parse-entities parse-entities
1354 (defmethod parse-html ((str string) &key callback-only callbacks collect-rogue-tags
1355 no-body-tags parse-entities)
1356 (declare (optimize (speed 3) (safety 1)))
1357 (parse-html (make-string-input-stream str)
1358 :callback-only callback-only :callbacks callbacks
1359 :collect-rogue-tags collect-rogue-tags
1360 :no-body-tags no-body-tags
1361 :parse-entities parse-entities
1374 ;;;(defun doit (ignore-data)
1375 ;;; (with-open-file (p "readme.htm")
1377 ;;; (multiple-value-bind (val kind) (next-token p ignore-data)
1378 ;;; ;(format t "~s -> ~s~%" kind val)
1380 ;;; (if* (eq kind :eof) then (return))))))
1382 ;;;(defun pdoit (&optional (file "testa.html"))
1383 ;;; (with-open-file (p file)
1384 ;;; (parse-html p)))
1387 ;;;;; requires http client module to work
1388 ;;;(defun getparse (host path)
1389 ;;; (parse-html (httpr-body
1391 ;;; (simple-get host path)))))