X-Git-Url: http://git.kpe.io/?p=xmlutils.git;a=blobdiff_plain;f=phtml.cl;fp=phtml.cl;h=14cbb3a4b74ee5450aa473379702d764b5251339;hp=f763ac478abb9ed14331f313486f32ee1a2050fa;hb=1e8aa1df433841c85c5a0b44fbd92964672e18b5;hpb=2d40f4169cc89aaecf1a762cae1e2d7cd55587ab diff --git a/phtml.cl b/phtml.cl index f763ac4..14cbb3a 100644 --- a/phtml.cl +++ b/phtml.cl @@ -1,3 +1,8 @@ +(sys:defpatch "phtml" 1 + "parse-html close tag closes consecutive identical open tags." + :type :system + :post-loadable t) + ;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA ;; ;; This code is free software; you can redistribute it and/or @@ -19,11 +24,13 @@ ;; Suite 330, Boston, MA 02111-1307 USA ;; -;; $Id: phtml.cl,v 1.1 2002/10/15 12:23:03 kevin Exp $ +;; $Id: phtml.cl,v 1.2 2003/06/20 02:21:23 kevin Exp $ ;; phtml.cl - parse html ;; Change Log +;; 05/14/02 - add :parse-entities arg to parse-html. If true then +;; entities are converted to the character they represent. ;; ;; 02/05/01 symbols mapped to preferred case at runtime (as opposed to ;; a compile time macro determining the case mapping) @@ -253,6 +260,266 @@ (not (zerop (logand (svref *characteristics* code) bit)))))) +(defvar *html-entity-to-code* + (let ((table (make-hash-table :test #'equal))) + (dolist (ent '(("nbsp" . 160) + ("iexcl" . 161) + ("cent" . 162) + ("pound" . 163) + ("curren" . 164) + ("yen" . 165) + ("brvbar" . 166) + ("sect" . 167) + ("uml" . 168) + ("copy" . 169) + ("ordf" . 170) + ("laquo" . 171) + ("not" . 172) + ("shy" . 173) + ("reg" . 174) + ("macr" . 175) + ("deg" . 176) + ("plusmn" . 177) + ("sup2" . 178) + ("sup3" . 179) + ("acute" . 180) + ("micro" . 181) + ("para" . 182) + ("middot" . 183) + ("cedil" . 184) + ("sup1" . 185) + ("ordm" . 186) + ("raquo" . 187) + ("frac14" . 188) + ("frac12" . 189) + ("frac34" . 190) + ("iquest" . 191) + ("Agrave" . 192) + ("Aacute" . 193) + ("Acirc" . 194) + ("Atilde" . 195) + ("Auml" . 196) + ("Aring" . 197) + ("AElig" . 198) + ("Ccedil" . 199) + ("Egrave" . 200) + ("Eacute" . 201) + ("Ecirc" . 202) + ("Euml" . 203) + ("Igrave" . 204) + ("Iacute" . 205) + ("Icirc" . 206) + ("Iuml" . 207) + ("ETH" . 208) + ("Ntilde" . 209) + ("Ograve" . 210) + ("Oacute" . 211) + ("Ocirc" . 212) + ("Otilde" . 213) + ("Ouml" . 214) + ("times" . 215) + ("Oslash" . 216) + ("Ugrave" . 217) + ("Uacute" . 218) + ("Ucirc" . 219) + ("Uuml" . 220) + ("Yacute" . 221) + ("THORN" . 222) + ("szlig" . 223) + ("agrave" . 224) + ("aacute" . 225) + ("acirc" . 226) + ("atilde" . 227) + ("auml" . 228) + ("aring" . 229) + ("aelig" . 230) + ("ccedil" . 231) + ("egrave" . 232) + ("eacute" . 233) + ("ecirc" . 234) + ("euml" . 235) + ("igrave" . 236) + ("iacute" . 237) + ("icirc" . 238) + ("iuml" . 239) + ("eth" . 240) + ("ntilde" . 241) + ("ograve" . 242) + ("oacute" . 243) + ("ocirc" . 244) + ("otilde" . 245) + ("ouml" . 246) + ("divide" . 247) + ("oslash" . 248) + ("ugrave" . 249) + ("uacute" . 250) + ("ucirc" . 251) + ("uuml" . 252) + ("yacute" . 253) + ("thorn" . 254) + ("yuml" . 255) + ("fnof" . 402) + ("Alpha" . 913) + ("Beta" . 914) + ("Gamma" . 915) + ("Delta" . 916) + ("Epsilon" . 917) + ("Zeta" . 918) + ("Eta" . 919) + ("Theta" . 920) + ("Iota" . 921) + ("Kappa" . 922) + ("Lambda" . 923) + ("Mu" . 924) + ("Nu" . 925) + ("Xi" . 926) + ("Omicron" . 927) + ("Pi" . 928) + ("Rho" . 929) + ("Sigma" . 931) + ("Tau" . 932) + ("Upsilon" . 933) + ("Phi" . 934) + ("Chi" . 935) + ("Psi" . 936) + ("Omega" . 937) + ("alpha" . 945) + ("beta" . 946) + ("gamma" . 947) + ("delta" . 948) + ("epsilon" . 949) + ("zeta" . 950) + ("eta" . 951) + ("theta" . 952) + ("iota" . 953) + ("kappa" . 954) + ("lambda" . 955) + ("mu" . 956) + ("nu" . 957) + ("xi" . 958) + ("omicron" . 959) + ("pi" . 960) + ("rho" . 961) + ("sigmaf" . 962) + ("sigma" . 963) + ("tau" . 964) + ("upsilon" . 965) + ("phi" . 966) + ("chi" . 967) + ("psi" . 968) + ("omega" . 969) + ("thetasym" . 977) + ("upsih" . 978) + ("piv" . 982) + ("bull" . 8226) + ("hellip" . 8230) + ("prime" . 8242) + ("Prime" . 8243) + ("oline" . 8254) + ("frasl" . 8260) + ("weierp" . 8472) + ("image" . 8465) + ("real" . 8476) + ("trade" . 8482) + ("alefsym" . 8501) + ("larr" . 8592) + ("uarr" . 8593) + ("rarr" . 8594) + ("darr" . 8595) + ("harr" . 8596) + ("crarr" . 8629) + ("lArr" . 8656) + ("uArr" . 8657) + ("rArr" . 8658) + ("dArr" . 8659) + ("hArr" . 8660) + ("forall" . 8704) + ("part" . 8706) + ("exist" . 8707) + ("empty" . 8709) + ("nabla" . 8711) + ("isin" . 8712) + ("notin" . 8713) + ("ni" . 8715) + ("prod" . 8719) + ("sum" . 8721) + ("minus" . 8722) + ("lowast" . 8727) + ("radic" . 8730) + ("prop" . 8733) + ("infin" . 8734) + ("ang" . 8736) + ("and" . 8743) + ("or" . 8744) + ("cap" . 8745) + ("cup" . 8746) + ("int" . 8747) + ("there4" . 8756) + ("sim" . 8764) + ("cong" . 8773) + ("asymp" . 8776) + ("ne" . 8800) + ("equiv" . 8801) + ("le" . 8804) + ("ge" . 8805) + ("sub" . 8834) + ("sup" . 8835) + ("nsub" . 8836) + ("sube" . 8838) + ("supe" . 8839) + ("oplus" . 8853) + ("otimes" . 8855) + ("perp" . 8869) + ("sdot" . 8901) + ("lceil" . 8968) + ("rceil" . 8969) + ("lfloor" . 8970) + ("rfloor" . 8971) + ("lang" . 9001) + ("rang" . 9002) + ("loz" . 9674) + ("spades" . 9824) + ("clubs" . 9827) + ("hearts" . 9829) + ("diams" . 9830) + ("quot" . 34) + ("amp" . 38) + ("lt" . 60) + ("gt" . 62) + ("OElig" . 338) + ("oelig" . 339) + ("Scaron" . 352) + ("scaron" . 353) + ("Yuml" . 376) + ("circ" . 710) + ("tilde" . 732) + ("ensp" . 8194) + ("emsp" . 8195) + ("thinsp" . 8201) + ("zwnj" . 8204) + ("zwj" . 8205) + ("lrm" . 8206) + ("rlm" . 8207) + ("ndash" . 8211) + ("mdash" . 8212) + ("lsquo" . 8216) + ("rsquo" . 8217) + ("sbquo" . 8218) + ("ldquo" . 8220) + ("rdquo" . 8221) + ("bdquo" . 8222) + ("dagger" . 8224) + ("Dagger" . 8225) + ("permil" . 8240) + ("lsaquo" . 8249) + ("rsaquo" . 8250) + ("euro" . 8364) + )) + (setf (gethash (car ent) table) (cdr ent))) + table)) + + + (defstruct tokenbuf cur ;; next index to use to grab from tokenbuf max ;; index one beyond last character @@ -301,7 +568,7 @@ (defun next-token (stream ignore-strings raw-mode-delimiter - read-sequence-func tokenbuf) + read-sequence-func tokenbuf parse-entities) (declare (optimize (speed 3) (safety 1))) ;; return two values: ;; the next token from the stream. @@ -382,6 +649,41 @@ (return) else ; collect a tag (setq state state-readtagfirst)) + elseif (and parse-entities (eq ch #\&)) + then ; reading an entity. entity ends at semicolon + (let (res (max 10)) + (loop (let ((ch (next-char stream))) + (if* (null ch) + then (error "End of file after & entity marker") + elseif (eq ch #\;) + then (return) + elseif (zerop (decf max)) + then (error "No semicolon found after entity starting: &~{~a~}" (nreverse res)) + else (push ch res)))) + (setq res (nreverse res)) + (if* (eq (car res) #\#) + then ; decimal entity + (let ((count 0)) + (dolist (ch (cdr res)) + (let ((code (char-code ch))) + (if* (<= #.(char-code #\0) + code + #.(char-code #\9)) + then (setq count + (+ (* 10 count) + (- code + #.(char-code #\0)))) + else (error "non decimal digit after &# - ~s" ch) + ))) + (add-to-coll coll (code-char count))) + else (let ((name (make-array (length res) + :element-type 'character + :initial-contents res))) + (let ((ch (gethash name *html-entity-to-code*))) + (if* ch + then (add-to-coll coll (code-char ch)) + else (error "No such entity as ~s" name)))))) + else ; we will check for & here eventually (if* (not (eq ch #\return)) then (add-to-coll coll ch)))) @@ -588,7 +890,7 @@ (#.state-readtag (when (null tag-to-return) - (error "unexpected end of input encountered")) + (error "unexpected end of input encountered")) ;; we've read a tag with no attributes (put-back-collector coll) (values tag-to-return @@ -743,17 +1045,19 @@ (defmethod parse-html ((p stream) &key callback-only callbacks collect-rogue-tags - no-body-tags) + no-body-tags + parse-entities) (declare (optimize (speed 3) (safety 1))) (phtml-internal p nil callback-only callbacks collect-rogue-tags - no-body-tags)) + no-body-tags parse-entities)) (defmacro tag-callback (tag) `(rest (assoc ,tag callbacks))) -(defun phtml-internal (p read-sequence-func callback-only callbacks collect-rogue-tags - - no-body-tags) +(defun phtml-internal (p read-sequence-func callback-only + callbacks collect-rogue-tags + no-body-tags + parse-entities) (declare (optimize (speed 3) (safety 1))) (let ((raw-mode-delimiter nil) (pending nil) @@ -767,9 +1071,11 @@ (guts) (rogue-tags) ) - (labels ((close-off-tags (name stop-at collect-rogues) + (labels ((close-off-tags (name stop-at collect-rogues once-only) ;; close off an open 'name' tag, but search no further ;; than a 'stop-at' tag. + #+ignore (format t "close off name ~s, stop at ~s, ct ~s~%" + name stop-at current-tag) (if* (member (tag-name current-tag) name :test #'eq) then ;; close current tag(s) (loop @@ -778,11 +1084,12 @@ *known-tags*))) (push (tag-name current-tag) rogue-tags)) (close-current-tag) - (when (or (member (tag-name current-tag) - *ch-format*) - (not (member - (tag-name current-tag) name :test #'eq))) - (return))) + (if* (or once-only + (member (tag-name current-tag) + *ch-format*) + (not (member + (tag-name current-tag) name :test #'eq))) + then (return))) elseif (member (tag-name current-tag) stop-at :test #'eq) then nil else ; search if there is a tag to close @@ -825,9 +1132,11 @@ (push element guts)))) (save-state () - ;; push the current tag state since we're starting + ;; push the current tag state since we're starting: ;; a new open tag - (push (cons current-tag guts) pending)) + (push (cons current-tag guts) pending) + #+ignore (format t "state saved, pending ~s~%" pending) + ) (strip-rev-pcdata (stuff) @@ -867,15 +1176,15 @@ (if* (eq kind :start-tag) then (push val new-opens) elseif (member val new-opens :test #'eq) then (setf new-opens (remove val new-opens :count 1)) - else (close-off-tags (list val) nil nil) + else (close-off-tags (list val) nil nil nil) ))))) (get-next-token (force) (if* (or force (null (tokenbuf-first-pass tokenbuf))) then (multiple-value-bind (val kind) (next-token p nil raw-mode-delimiter read-sequence-func - tokenbuf) - (values val kind)) + tokenbuf parse-entities) + (values val kind)) else (let ((val (first (tokenbuf-first-pass tokenbuf))) (kind (second (tokenbuf-first-pass tokenbuf)))) @@ -886,7 +1195,8 @@ (loop (multiple-value-bind (val kind) (get-next-token nil) - ;;(format t "val: ~s kind: ~s~%" val kind) + #+ignore (format t "val: ~s kind: ~s last-tag ~s pending ~s~%" val kind + last-tag pending) (case kind (:pcdata (when (or (and callback-only current-callback-tags) @@ -903,7 +1213,7 @@ (when (and (= (length raw-mode-delimiter) 1) ;; xml tag... (or (and callback-only current-callback-tags) (not callback-only))) - (close-off-tags (list last-tag) nil nil)) + (close-off-tags (list last-tag) nil nil t)) (setf raw-mode-delimiter nil) ) @@ -928,7 +1238,7 @@ then "" else "")) elseif (or (eq last-tag :script) - (and (listp last-tag) (eq (first last-tag) :script))) + (and (listp last-tag) (eq (first last-tag) :script))) then (setf raw-mode-delimiter (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER) @@ -945,7 +1255,7 @@ (not callback-only)) (if* auto-close then (setq auto-close-stop (tag-auto-close-stop name)) - (close-off-tags auto-close auto-close-stop nil)) + (close-off-tags auto-close auto-close-stop nil nil)) (when (and pending-ch-format (not no-end)) (if* (member name *ch-format* :test #'eq) then nil elseif (member name *in-line* :test #'eq) then @@ -953,7 +1263,7 @@ (check-in-line name) else ;; close ALL pending char tags and then reopen (dolist (this-tag (reverse pending-ch-format)) - (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil)) + (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil nil)) )) (if* no-end then ; this is a singleton tag @@ -995,7 +1305,7 @@ (setf raw-mode-delimiter nil) (when (or (and callback-only current-callback-tags) (not callback-only)) - (close-off-tags (list val) nil nil) + (close-off-tags (list val) nil nil t) (when (member val *ch-format* :test #'eq) (setf pending-ch-format (remove val pending-ch-format :count 1 @@ -1022,7 +1332,7 @@ ;; close off all tags (when (or (and callback-only current-callback-tags) (not callback-only)) - (close-off-tags '(:start-parse) nil collect-rogue-tags)) + (close-off-tags '(:start-parse) nil collect-rogue-tags nil)) (put-back-tokenbuf tokenbuf) (if collect-rogue-tags (return (values (cdar guts) rogue-tags)) @@ -1031,21 +1341,25 @@ (defmethod parse-html (file &key callback-only callbacks collect-rogue-tags - no-body-tags) + no-body-tags parse-entities) (declare (optimize (speed 3) (safety 1))) (with-open-file (p file :direction :input) (parse-html p :callback-only callback-only :callbacks callbacks :collect-rogue-tags collect-rogue-tags - :no-body-tags no-body-tags))) + :no-body-tags no-body-tags + :parse-entities parse-entities + ))) (defmethod parse-html ((str string) &key callback-only callbacks collect-rogue-tags - no-body-tags) + no-body-tags parse-entities) (declare (optimize (speed 3) (safety 1))) (parse-html (make-string-input-stream str) :callback-only callback-only :callbacks callbacks :collect-rogue-tags collect-rogue-tags - :no-body-tags no-body-tags)) + :no-body-tags no-body-tags + :parse-entities parse-entities + ))