+(sys:defpatch "phtml" 1
+ "parse-html close tag closes consecutive identical open tags."
+ :type :system
+ :post-loadable t)
+
;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA
;;
;; This code is free software; you can redistribute it and/or
;; Suite 330, Boston, MA 02111-1307 USA
;;
-;; $Id: phtml.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $
+;; $Id: phtml.cl,v 1.2 2003/06/20 02:21:23 kevin Exp $
;; phtml.cl - parse html
;; Change Log
+;; 05/14/02 - add :parse-entities arg to parse-html. If true then
+;; entities are converted to the character they represent.
;;
;; 02/05/01 symbols mapped to preferred case at runtime (as opposed to
;; a compile time macro determining the case mapping)
(not (zerop (logand (svref *characteristics* code) bit))))))
+(defvar *html-entity-to-code*
+ (let ((table (make-hash-table :test #'equal)))
+ (dolist (ent '(("nbsp" . 160)
+ ("iexcl" . 161)
+ ("cent" . 162)
+ ("pound" . 163)
+ ("curren" . 164)
+ ("yen" . 165)
+ ("brvbar" . 166)
+ ("sect" . 167)
+ ("uml" . 168)
+ ("copy" . 169)
+ ("ordf" . 170)
+ ("laquo" . 171)
+ ("not" . 172)
+ ("shy" . 173)
+ ("reg" . 174)
+ ("macr" . 175)
+ ("deg" . 176)
+ ("plusmn" . 177)
+ ("sup2" . 178)
+ ("sup3" . 179)
+ ("acute" . 180)
+ ("micro" . 181)
+ ("para" . 182)
+ ("middot" . 183)
+ ("cedil" . 184)
+ ("sup1" . 185)
+ ("ordm" . 186)
+ ("raquo" . 187)
+ ("frac14" . 188)
+ ("frac12" . 189)
+ ("frac34" . 190)
+ ("iquest" . 191)
+ ("Agrave" . 192)
+ ("Aacute" . 193)
+ ("Acirc" . 194)
+ ("Atilde" . 195)
+ ("Auml" . 196)
+ ("Aring" . 197)
+ ("AElig" . 198)
+ ("Ccedil" . 199)
+ ("Egrave" . 200)
+ ("Eacute" . 201)
+ ("Ecirc" . 202)
+ ("Euml" . 203)
+ ("Igrave" . 204)
+ ("Iacute" . 205)
+ ("Icirc" . 206)
+ ("Iuml" . 207)
+ ("ETH" . 208)
+ ("Ntilde" . 209)
+ ("Ograve" . 210)
+ ("Oacute" . 211)
+ ("Ocirc" . 212)
+ ("Otilde" . 213)
+ ("Ouml" . 214)
+ ("times" . 215)
+ ("Oslash" . 216)
+ ("Ugrave" . 217)
+ ("Uacute" . 218)
+ ("Ucirc" . 219)
+ ("Uuml" . 220)
+ ("Yacute" . 221)
+ ("THORN" . 222)
+ ("szlig" . 223)
+ ("agrave" . 224)
+ ("aacute" . 225)
+ ("acirc" . 226)
+ ("atilde" . 227)
+ ("auml" . 228)
+ ("aring" . 229)
+ ("aelig" . 230)
+ ("ccedil" . 231)
+ ("egrave" . 232)
+ ("eacute" . 233)
+ ("ecirc" . 234)
+ ("euml" . 235)
+ ("igrave" . 236)
+ ("iacute" . 237)
+ ("icirc" . 238)
+ ("iuml" . 239)
+ ("eth" . 240)
+ ("ntilde" . 241)
+ ("ograve" . 242)
+ ("oacute" . 243)
+ ("ocirc" . 244)
+ ("otilde" . 245)
+ ("ouml" . 246)
+ ("divide" . 247)
+ ("oslash" . 248)
+ ("ugrave" . 249)
+ ("uacute" . 250)
+ ("ucirc" . 251)
+ ("uuml" . 252)
+ ("yacute" . 253)
+ ("thorn" . 254)
+ ("yuml" . 255)
+ ("fnof" . 402)
+ ("Alpha" . 913)
+ ("Beta" . 914)
+ ("Gamma" . 915)
+ ("Delta" . 916)
+ ("Epsilon" . 917)
+ ("Zeta" . 918)
+ ("Eta" . 919)
+ ("Theta" . 920)
+ ("Iota" . 921)
+ ("Kappa" . 922)
+ ("Lambda" . 923)
+ ("Mu" . 924)
+ ("Nu" . 925)
+ ("Xi" . 926)
+ ("Omicron" . 927)
+ ("Pi" . 928)
+ ("Rho" . 929)
+ ("Sigma" . 931)
+ ("Tau" . 932)
+ ("Upsilon" . 933)
+ ("Phi" . 934)
+ ("Chi" . 935)
+ ("Psi" . 936)
+ ("Omega" . 937)
+ ("alpha" . 945)
+ ("beta" . 946)
+ ("gamma" . 947)
+ ("delta" . 948)
+ ("epsilon" . 949)
+ ("zeta" . 950)
+ ("eta" . 951)
+ ("theta" . 952)
+ ("iota" . 953)
+ ("kappa" . 954)
+ ("lambda" . 955)
+ ("mu" . 956)
+ ("nu" . 957)
+ ("xi" . 958)
+ ("omicron" . 959)
+ ("pi" . 960)
+ ("rho" . 961)
+ ("sigmaf" . 962)
+ ("sigma" . 963)
+ ("tau" . 964)
+ ("upsilon" . 965)
+ ("phi" . 966)
+ ("chi" . 967)
+ ("psi" . 968)
+ ("omega" . 969)
+ ("thetasym" . 977)
+ ("upsih" . 978)
+ ("piv" . 982)
+ ("bull" . 8226)
+ ("hellip" . 8230)
+ ("prime" . 8242)
+ ("Prime" . 8243)
+ ("oline" . 8254)
+ ("frasl" . 8260)
+ ("weierp" . 8472)
+ ("image" . 8465)
+ ("real" . 8476)
+ ("trade" . 8482)
+ ("alefsym" . 8501)
+ ("larr" . 8592)
+ ("uarr" . 8593)
+ ("rarr" . 8594)
+ ("darr" . 8595)
+ ("harr" . 8596)
+ ("crarr" . 8629)
+ ("lArr" . 8656)
+ ("uArr" . 8657)
+ ("rArr" . 8658)
+ ("dArr" . 8659)
+ ("hArr" . 8660)
+ ("forall" . 8704)
+ ("part" . 8706)
+ ("exist" . 8707)
+ ("empty" . 8709)
+ ("nabla" . 8711)
+ ("isin" . 8712)
+ ("notin" . 8713)
+ ("ni" . 8715)
+ ("prod" . 8719)
+ ("sum" . 8721)
+ ("minus" . 8722)
+ ("lowast" . 8727)
+ ("radic" . 8730)
+ ("prop" . 8733)
+ ("infin" . 8734)
+ ("ang" . 8736)
+ ("and" . 8743)
+ ("or" . 8744)
+ ("cap" . 8745)
+ ("cup" . 8746)
+ ("int" . 8747)
+ ("there4" . 8756)
+ ("sim" . 8764)
+ ("cong" . 8773)
+ ("asymp" . 8776)
+ ("ne" . 8800)
+ ("equiv" . 8801)
+ ("le" . 8804)
+ ("ge" . 8805)
+ ("sub" . 8834)
+ ("sup" . 8835)
+ ("nsub" . 8836)
+ ("sube" . 8838)
+ ("supe" . 8839)
+ ("oplus" . 8853)
+ ("otimes" . 8855)
+ ("perp" . 8869)
+ ("sdot" . 8901)
+ ("lceil" . 8968)
+ ("rceil" . 8969)
+ ("lfloor" . 8970)
+ ("rfloor" . 8971)
+ ("lang" . 9001)
+ ("rang" . 9002)
+ ("loz" . 9674)
+ ("spades" . 9824)
+ ("clubs" . 9827)
+ ("hearts" . 9829)
+ ("diams" . 9830)
+ ("quot" . 34)
+ ("amp" . 38)
+ ("lt" . 60)
+ ("gt" . 62)
+ ("OElig" . 338)
+ ("oelig" . 339)
+ ("Scaron" . 352)
+ ("scaron" . 353)
+ ("Yuml" . 376)
+ ("circ" . 710)
+ ("tilde" . 732)
+ ("ensp" . 8194)
+ ("emsp" . 8195)
+ ("thinsp" . 8201)
+ ("zwnj" . 8204)
+ ("zwj" . 8205)
+ ("lrm" . 8206)
+ ("rlm" . 8207)
+ ("ndash" . 8211)
+ ("mdash" . 8212)
+ ("lsquo" . 8216)
+ ("rsquo" . 8217)
+ ("sbquo" . 8218)
+ ("ldquo" . 8220)
+ ("rdquo" . 8221)
+ ("bdquo" . 8222)
+ ("dagger" . 8224)
+ ("Dagger" . 8225)
+ ("permil" . 8240)
+ ("lsaquo" . 8249)
+ ("rsaquo" . 8250)
+ ("euro" . 8364)
+ ))
+ (setf (gethash (car ent) table) (cdr ent)))
+ table))
+
+
+
(defstruct tokenbuf
cur ;; next index to use to grab from tokenbuf
max ;; index one beyond last character
(defun next-token (stream ignore-strings raw-mode-delimiter
- read-sequence-func tokenbuf)
+ read-sequence-func tokenbuf parse-entities)
(declare (optimize (speed 3) (safety 1)))
;; return two values:
;; the next token from the stream.
(return)
else ; collect a tag
(setq state state-readtagfirst))
+ elseif (and parse-entities (eq ch #\&))
+ then ; reading an entity. entity ends at semicolon
+ (let (res (max 10))
+ (loop (let ((ch (next-char stream)))
+ (if* (null ch)
+ then (error "End of file after & entity marker")
+ elseif (eq ch #\;)
+ then (return)
+ elseif (zerop (decf max))
+ then (error "No semicolon found after entity starting: &~{~a~}" (nreverse res))
+ else (push ch res))))
+ (setq res (nreverse res))
+ (if* (eq (car res) #\#)
+ then ; decimal entity
+ (let ((count 0))
+ (dolist (ch (cdr res))
+ (let ((code (char-code ch)))
+ (if* (<= #.(char-code #\0)
+ code
+ #.(char-code #\9))
+ then (setq count
+ (+ (* 10 count)
+ (- code
+ #.(char-code #\0))))
+ else (error "non decimal digit after &# - ~s" ch)
+ )))
+ (add-to-coll coll (code-char count)))
+ else (let ((name (make-array (length res)
+ :element-type 'character
+ :initial-contents res)))
+ (let ((ch (gethash name *html-entity-to-code*)))
+ (if* ch
+ then (add-to-coll coll (code-char ch))
+ else (error "No such entity as ~s" name))))))
+
else ; we will check for & here eventually
(if* (not (eq ch #\return))
then (add-to-coll coll ch))))
(#.state-readtag
(when (null tag-to-return)
- (error "unexpected end of input encountered"))
+ (error "unexpected end of input encountered"))
;; we've read a tag with no attributes
(put-back-collector coll)
(values tag-to-return
(defmethod parse-html ((p stream) &key callback-only callbacks collect-rogue-tags
- no-body-tags)
+ no-body-tags
+ parse-entities)
(declare (optimize (speed 3) (safety 1)))
(phtml-internal p nil callback-only callbacks collect-rogue-tags
- no-body-tags))
+ no-body-tags parse-entities))
(defmacro tag-callback (tag)
`(rest (assoc ,tag callbacks)))
-(defun phtml-internal (p read-sequence-func callback-only callbacks collect-rogue-tags
-
- no-body-tags)
+(defun phtml-internal (p read-sequence-func callback-only
+ callbacks collect-rogue-tags
+ no-body-tags
+ parse-entities)
(declare (optimize (speed 3) (safety 1)))
(let ((raw-mode-delimiter nil)
(pending nil)
(guts)
(rogue-tags)
)
- (labels ((close-off-tags (name stop-at collect-rogues)
+ (labels ((close-off-tags (name stop-at collect-rogues once-only)
;; close off an open 'name' tag, but search no further
;; than a 'stop-at' tag.
+ #+ignore (format t "close off name ~s, stop at ~s, ct ~s~%"
+ name stop-at current-tag)
(if* (member (tag-name current-tag) name :test #'eq)
then ;; close current tag(s)
(loop
*known-tags*)))
(push (tag-name current-tag) rogue-tags))
(close-current-tag)
- (when (or (member (tag-name current-tag)
- *ch-format*)
- (not (member
- (tag-name current-tag) name :test #'eq)))
- (return)))
+ (if* (or once-only
+ (member (tag-name current-tag)
+ *ch-format*)
+ (not (member
+ (tag-name current-tag) name :test #'eq)))
+ then (return)))
elseif (member (tag-name current-tag) stop-at :test #'eq)
then nil
else ; search if there is a tag to close
(push element guts))))
(save-state ()
- ;; push the current tag state since we're starting
+ ;; push the current tag state since we're starting:
;; a new open tag
- (push (cons current-tag guts) pending))
+ (push (cons current-tag guts) pending)
+ #+ignore (format t "state saved, pending ~s~%" pending)
+ )
(strip-rev-pcdata (stuff)
(if* (eq kind :start-tag) then (push val new-opens)
elseif (member val new-opens :test #'eq) then
(setf new-opens (remove val new-opens :count 1))
- else (close-off-tags (list val) nil nil)
+ else (close-off-tags (list val) nil nil nil)
)))))
(get-next-token (force)
(if* (or force (null (tokenbuf-first-pass tokenbuf))) then
(multiple-value-bind (val kind)
(next-token p nil raw-mode-delimiter read-sequence-func
- tokenbuf)
- (values val kind))
+ tokenbuf parse-entities)
+ (values val kind))
else
(let ((val (first (tokenbuf-first-pass tokenbuf)))
(kind (second (tokenbuf-first-pass tokenbuf))))
(loop
(multiple-value-bind (val kind)
(get-next-token nil)
- ;;(format t "val: ~s kind: ~s~%" val kind)
+ #+ignore (format t "val: ~s kind: ~s last-tag ~s pending ~s~%" val kind
+ last-tag pending)
(case kind
(:pcdata
(when (or (and callback-only current-callback-tags)
(when (and (= (length raw-mode-delimiter) 1) ;; xml tag...
(or (and callback-only current-callback-tags)
(not callback-only)))
- (close-off-tags (list last-tag) nil nil))
+ (close-off-tags (list last-tag) nil nil t))
(setf raw-mode-delimiter nil)
)
then "</STYLE>"
else "</style>"))
elseif (or (eq last-tag :script)
- (and (listp last-tag) (eq (first last-tag) :script)))
+ (and (listp last-tag) (eq (first last-tag) :script)))
then
(setf raw-mode-delimiter
(if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER)
(not callback-only))
(if* auto-close
then (setq auto-close-stop (tag-auto-close-stop name))
- (close-off-tags auto-close auto-close-stop nil))
+ (close-off-tags auto-close auto-close-stop nil nil))
(when (and pending-ch-format (not no-end))
(if* (member name *ch-format* :test #'eq) then nil
elseif (member name *in-line* :test #'eq) then
(check-in-line name)
else ;; close ALL pending char tags and then reopen
(dolist (this-tag (reverse pending-ch-format))
- (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil))
+ (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil nil))
))
(if* no-end
then ; this is a singleton tag
(setf raw-mode-delimiter nil)
(when (or (and callback-only current-callback-tags)
(not callback-only))
- (close-off-tags (list val) nil nil)
+ (close-off-tags (list val) nil nil t)
(when (member val *ch-format* :test #'eq)
(setf pending-ch-format
(remove val pending-ch-format :count 1
;; close off all tags
(when (or (and callback-only current-callback-tags)
(not callback-only))
- (close-off-tags '(:start-parse) nil collect-rogue-tags))
+ (close-off-tags '(:start-parse) nil collect-rogue-tags nil))
(put-back-tokenbuf tokenbuf)
(if collect-rogue-tags
(return (values (cdar guts) rogue-tags))
(defmethod parse-html (file &key callback-only callbacks collect-rogue-tags
- no-body-tags)
+ no-body-tags parse-entities)
(declare (optimize (speed 3) (safety 1)))
(with-open-file (p file :direction :input)
(parse-html p :callback-only callback-only :callbacks callbacks
:collect-rogue-tags collect-rogue-tags
- :no-body-tags no-body-tags)))
+ :no-body-tags no-body-tags
+ :parse-entities parse-entities
+ )))
(defmethod parse-html ((str string) &key callback-only callbacks collect-rogue-tags
- no-body-tags)
+ no-body-tags parse-entities)
(declare (optimize (speed 3) (safety 1)))
(parse-html (make-string-input-stream str)
:callback-only callback-only :callbacks callbacks
:collect-rogue-tags collect-rogue-tags
- :no-body-tags no-body-tags))
+ :no-body-tags no-body-tags
+ :parse-entities parse-entities
+ ))