X-Git-Url: http://git.kpe.io/?p=xmlutils.git;a=blobdiff_plain;f=phtml.cl;fp=phtml.cl;h=b9c2fc1109281e9f3d641907d4a8988195aeb6ec;hp=444218326d52bbdca8478e28183102c578967f5e;hb=2e566ae3baa533146fbdb77af653adfda5356b76;hpb=96edd80309cfaea1949768cd4b3a5f7e0dc203d5 diff --git a/phtml.cl b/phtml.cl index 4442183..b9c2fc1 100644 --- a/phtml.cl +++ b/phtml.cl @@ -4,11 +4,11 @@ :type :system :post-loadable t) -;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA +;; copyright (c) 1986-2000 Franz Inc, Berkeley, CA ;; ;; This code is free software; you can redistribute it and/or ;; modify it under the terms of the version 2.1 of -;; the GNU Lesser General Public License as published by +;; the GNU Lesser General Public License as published by ;; the Free Software Foundation, as clarified by the AllegroServe ;; prequel found in license-allegroserve.txt. ;; @@ -17,11 +17,11 @@ ;; merchantability or fitness for a particular purpose. See the GNU ;; Lesser General Public License for more details. ;; -;; Version 2.1 of the GNU Lesser General Public License is in the file +;; Version 2.1 of the GNU Lesser General Public License is in the file ;; license-lgpl.txt that was distributed with this file. ;; If it is not present, you can access it from ;; http://www.gnu.org/copyleft/lesser.txt (until superseded by a newer -;; version) or write to the Free Software Foundation, Inc., 59 Temple Place, +;; version) or write to the Free Software Foundation, Inc., 59 Temple Place, ;; Suite 330, Boston, MA 02111-1307 USA ;; @@ -31,7 +31,7 @@ ;; Change Log ;; 05/14/02 - add :parse-entities arg to parse-html. If true then -;; entities are converted to the character they represent. +;; entities are converted to the character they represent. ;; ;; 02/05/01 symbols mapped to preferred case at runtime (as opposed to ;; a compile time macro determining the case mapping) @@ -67,8 +67,8 @@ (defmacro tag-name (expr) `(let ((.xx. ,expr)) (if* (consp .xx.) - then (car .xx.) - else .xx.))) + then (car .xx.) + else .xx.))) @@ -91,7 +91,7 @@ ) -(defstruct collector +(defstruct collector next ; next index to set max ; 1+max index to set data ; string vector @@ -106,31 +106,31 @@ (let (col) (without-scheduling (do* ((cols *collectors* (cdr cols)) - (this (car cols) (car cols))) - ((null cols)) - (if* this - then (setf (car cols) nil) - (setq col this) - (return)))) + (this (car cols) (car cols))) + ((null cols)) + (if* this + then (setf (car cols) nil) + (setq col this) + (return)))) (if* col then (setf (collector-next col) 0) - col + col else (make-collector - :next 0 - :max 100 - :data (make-string 100))))) + :next 0 + :max 100 + :data (make-string 100))))) (defun put-back-collector (col) (declare (optimize (speed 3) (safety 1))) - (without-scheduling + (without-scheduling (do ((cols *collectors* (cdr cols))) - ((null cols) - ; toss it away - nil) + ((null cols) + ; toss it away + nil) (if* (null (car cols)) - then (setf (car cols) col) - (return))))) - + then (setf (car cols) col) + (return))))) + (defun grow-and-add (coll ch) @@ -138,7 +138,7 @@ ;; increase the size of the data portion of the collector and then ;; add the given char at the end (let* ((odata (collector-data coll)) - (ndata (make-string (* 2 (length odata))))) + (ndata (make-string (* 2 (length odata))))) (dotimes (i (length odata)) (setf (schar ndata i) (schar odata i))) (setf (collector-data coll) ndata) @@ -147,376 +147,376 @@ (setf (schar ndata next) ch) (setf (collector-next coll) (1+ next))))) - - - - + + + + ;; character characteristics (defconstant char-tagcharacter 1) ; valid char for a tag (defconstant char-attribnamechar 2) ; valid char for an attribute name (defconstant char-attribundelimattribvalue 4) ; valid for undelimited value (defconstant char-spacechar 8) -(defparameter *characteristics* +(defparameter *characteristics* ;; array of bits describing character characteristics (let ((arr (make-array 128 :initial-element 0))) (declare (optimize (speed 3) (safety 1))) (macrolet ((with-range ((var from to) &rest body) - `(do ((,var (char-code ,from) (1+ ,var)) - (mmax (char-code ,to))) - ((> ,var mmax)) - ,@body)) - - (addit (index charistic) - `(setf (svref arr ,index) - (logior (svref arr ,index) - ,charistic))) - ) - - (with-range (i #\A #\Z) - (addit i (+ char-tagcharacter - char-attribnamechar - char-attribundelimattribvalue))) - - (with-range (i #\a #\z) - (addit i (+ char-tagcharacter - char-attribnamechar - char-attribundelimattribvalue))) - - (with-range (i #\0 #\9) - (addit i (+ char-tagcharacter - char-attribnamechar - char-attribundelimattribvalue))) - - ;; let colon be legal tag character - (addit (char-code #\:) (+ char-attribnamechar - char-tagcharacter)) - - ;; NY times special tags have _ - (addit (char-code #\_) (+ char-attribnamechar - char-tagcharacter)) - - ; now the unusual cases - (addit (char-code #\-) (+ char-attribnamechar - char-attribundelimattribvalue)) - (addit (char-code #\.) (+ char-attribnamechar - char-attribundelimattribvalue)) - - ;; adding all typeable chars except for whitespace and > - (addit (char-code #\:) char-attribundelimattribvalue) - (addit (char-code #\@) char-attribundelimattribvalue) - (addit (char-code #\/) char-attribundelimattribvalue) - (addit (char-code #\!) char-attribundelimattribvalue) - (addit (char-code #\#) char-attribundelimattribvalue) - (addit (char-code #\$) char-attribundelimattribvalue) - (addit (char-code #\%) char-attribundelimattribvalue) - (addit (char-code #\^) char-attribundelimattribvalue) - (addit (char-code #\&) char-attribundelimattribvalue) - (addit (char-code #\() char-attribundelimattribvalue) - (addit (char-code #\)) char-attribundelimattribvalue) - (addit (char-code #\_) char-attribundelimattribvalue) - (addit (char-code #\=) char-attribundelimattribvalue) - (addit (char-code #\+) char-attribundelimattribvalue) - (addit (char-code #\\) char-attribundelimattribvalue) - (addit (char-code #\|) char-attribundelimattribvalue) - (addit (char-code #\{) char-attribundelimattribvalue) - (addit (char-code #\}) char-attribundelimattribvalue) - (addit (char-code #\[) char-attribundelimattribvalue) - (addit (char-code #\]) char-attribundelimattribvalue) - (addit (char-code #\;) char-attribundelimattribvalue) - (addit (char-code #\') char-attribundelimattribvalue) - (addit (char-code #\") char-attribundelimattribvalue) - (addit (char-code #\,) char-attribundelimattribvalue) - (addit (char-code #\<) char-attribundelimattribvalue) - (addit (char-code #\?) char-attribundelimattribvalue) - - ; i'm not sure what can be in a tag name but we know that - ; ! and - must be there since it's used in comments - - (addit (char-code #\-) char-tagcharacter) - (addit (char-code #\!) char-tagcharacter) - - ; spaces - (addit (char-code #\space) char-spacechar) - (addit (char-code #\tab) char-spacechar) - (addit (char-code #\return) char-spacechar) - (addit (char-code #\linefeed) char-spacechar) - - ) - - - + `(do ((,var (char-code ,from) (1+ ,var)) + (mmax (char-code ,to))) + ((> ,var mmax)) + ,@body)) + + (addit (index charistic) + `(setf (svref arr ,index) + (logior (svref arr ,index) + ,charistic))) + ) + + (with-range (i #\A #\Z) + (addit i (+ char-tagcharacter + char-attribnamechar + char-attribundelimattribvalue))) + + (with-range (i #\a #\z) + (addit i (+ char-tagcharacter + char-attribnamechar + char-attribundelimattribvalue))) + + (with-range (i #\0 #\9) + (addit i (+ char-tagcharacter + char-attribnamechar + char-attribundelimattribvalue))) + + ;; let colon be legal tag character + (addit (char-code #\:) (+ char-attribnamechar + char-tagcharacter)) + + ;; NY times special tags have _ + (addit (char-code #\_) (+ char-attribnamechar + char-tagcharacter)) + + ; now the unusual cases + (addit (char-code #\-) (+ char-attribnamechar + char-attribundelimattribvalue)) + (addit (char-code #\.) (+ char-attribnamechar + char-attribundelimattribvalue)) + + ;; adding all typeable chars except for whitespace and > + (addit (char-code #\:) char-attribundelimattribvalue) + (addit (char-code #\@) char-attribundelimattribvalue) + (addit (char-code #\/) char-attribundelimattribvalue) + (addit (char-code #\!) char-attribundelimattribvalue) + (addit (char-code #\#) char-attribundelimattribvalue) + (addit (char-code #\$) char-attribundelimattribvalue) + (addit (char-code #\%) char-attribundelimattribvalue) + (addit (char-code #\^) char-attribundelimattribvalue) + (addit (char-code #\&) char-attribundelimattribvalue) + (addit (char-code #\() char-attribundelimattribvalue) + (addit (char-code #\)) char-attribundelimattribvalue) + (addit (char-code #\_) char-attribundelimattribvalue) + (addit (char-code #\=) char-attribundelimattribvalue) + (addit (char-code #\+) char-attribundelimattribvalue) + (addit (char-code #\\) char-attribundelimattribvalue) + (addit (char-code #\|) char-attribundelimattribvalue) + (addit (char-code #\{) char-attribundelimattribvalue) + (addit (char-code #\}) char-attribundelimattribvalue) + (addit (char-code #\[) char-attribundelimattribvalue) + (addit (char-code #\]) char-attribundelimattribvalue) + (addit (char-code #\;) char-attribundelimattribvalue) + (addit (char-code #\') char-attribundelimattribvalue) + (addit (char-code #\") char-attribundelimattribvalue) + (addit (char-code #\,) char-attribundelimattribvalue) + (addit (char-code #\<) char-attribundelimattribvalue) + (addit (char-code #\?) char-attribundelimattribvalue) + + ; i'm not sure what can be in a tag name but we know that + ; ! and - must be there since it's used in comments + + (addit (char-code #\-) char-tagcharacter) + (addit (char-code #\!) char-tagcharacter) + + ; spaces + (addit (char-code #\space) char-spacechar) + (addit (char-code #\tab) char-spacechar) + (addit (char-code #\return) char-spacechar) + (addit (char-code #\linefeed) char-spacechar) + + ) + + + arr)) - + (defun char-characteristic (char bit) (declare (optimize (speed 3) (safety 1))) - ;; return true if the given char has the given bit set in + ;; return true if the given char has the given bit set in ;; the characteristic array (let ((code (char-code char))) (if* (<= 0 code 127) then ; in range - (not (zerop (logand (svref *characteristics* code) bit)))))) + (not (zerop (logand (svref *characteristics* code) bit)))))) -(defvar *html-entity-to-code* +(defvar *html-entity-to-code* (let ((table (make-hash-table :test #'equal))) (dolist (ent '(("nbsp" . 160) - ("iexcl" . 161) - ("cent" . 162) - ("pound" . 163) - ("curren" . 164) - ("yen" . 165) - ("brvbar" . 166) - ("sect" . 167) - ("uml" . 168) - ("copy" . 169) - ("ordf" . 170) - ("laquo" . 171) - ("not" . 172) - ("shy" . 173) - ("reg" . 174) - ("macr" . 175) - ("deg" . 176) - ("plusmn" . 177) - ("sup2" . 178) - ("sup3" . 179) - ("acute" . 180) - ("micro" . 181) - ("para" . 182) - ("middot" . 183) - ("cedil" . 184) - ("sup1" . 185) - ("ordm" . 186) - ("raquo" . 187) - ("frac14" . 188) - ("frac12" . 189) - ("frac34" . 190) - ("iquest" . 191) - ("Agrave" . 192) - ("Aacute" . 193) - ("Acirc" . 194) - ("Atilde" . 195) - ("Auml" . 196) - ("Aring" . 197) - ("AElig" . 198) - ("Ccedil" . 199) - ("Egrave" . 200) - ("Eacute" . 201) - ("Ecirc" . 202) - ("Euml" . 203) - ("Igrave" . 204) - ("Iacute" . 205) - ("Icirc" . 206) - ("Iuml" . 207) - ("ETH" . 208) - ("Ntilde" . 209) - ("Ograve" . 210) - ("Oacute" . 211) - ("Ocirc" . 212) - ("Otilde" . 213) - ("Ouml" . 214) - ("times" . 215) - ("Oslash" . 216) - ("Ugrave" . 217) - ("Uacute" . 218) - ("Ucirc" . 219) - ("Uuml" . 220) - ("Yacute" . 221) - ("THORN" . 222) - ("szlig" . 223) - ("agrave" . 224) - ("aacute" . 225) - ("acirc" . 226) - ("atilde" . 227) - ("auml" . 228) - ("aring" . 229) - ("aelig" . 230) - ("ccedil" . 231) - ("egrave" . 232) - ("eacute" . 233) - ("ecirc" . 234) - ("euml" . 235) - ("igrave" . 236) - ("iacute" . 237) - ("icirc" . 238) - ("iuml" . 239) - ("eth" . 240) - ("ntilde" . 241) - ("ograve" . 242) - ("oacute" . 243) - ("ocirc" . 244) - ("otilde" . 245) - ("ouml" . 246) - ("divide" . 247) - ("oslash" . 248) - ("ugrave" . 249) - ("uacute" . 250) - ("ucirc" . 251) - ("uuml" . 252) - ("yacute" . 253) - ("thorn" . 254) - ("yuml" . 255) - ("fnof" . 402) - ("Alpha" . 913) - ("Beta" . 914) - ("Gamma" . 915) - ("Delta" . 916) - ("Epsilon" . 917) - ("Zeta" . 918) - ("Eta" . 919) - ("Theta" . 920) - ("Iota" . 921) - ("Kappa" . 922) - ("Lambda" . 923) - ("Mu" . 924) - ("Nu" . 925) - ("Xi" . 926) - ("Omicron" . 927) - ("Pi" . 928) - ("Rho" . 929) - ("Sigma" . 931) - ("Tau" . 932) - ("Upsilon" . 933) - ("Phi" . 934) - ("Chi" . 935) - ("Psi" . 936) - ("Omega" . 937) - ("alpha" . 945) - ("beta" . 946) - ("gamma" . 947) - ("delta" . 948) - ("epsilon" . 949) - ("zeta" . 950) - ("eta" . 951) - ("theta" . 952) - ("iota" . 953) - ("kappa" . 954) - ("lambda" . 955) - ("mu" . 956) - ("nu" . 957) - ("xi" . 958) - ("omicron" . 959) - ("pi" . 960) - ("rho" . 961) - ("sigmaf" . 962) - ("sigma" . 963) - ("tau" . 964) - ("upsilon" . 965) - ("phi" . 966) - ("chi" . 967) - ("psi" . 968) - ("omega" . 969) - ("thetasym" . 977) - ("upsih" . 978) - ("piv" . 982) - ("bull" . 8226) - ("hellip" . 8230) - ("prime" . 8242) - ("Prime" . 8243) - ("oline" . 8254) - ("frasl" . 8260) - ("weierp" . 8472) - ("image" . 8465) - ("real" . 8476) - ("trade" . 8482) - ("alefsym" . 8501) - ("larr" . 8592) - ("uarr" . 8593) - ("rarr" . 8594) - ("darr" . 8595) - ("harr" . 8596) - ("crarr" . 8629) - ("lArr" . 8656) - ("uArr" . 8657) - ("rArr" . 8658) - ("dArr" . 8659) - ("hArr" . 8660) - ("forall" . 8704) - ("part" . 8706) - ("exist" . 8707) - ("empty" . 8709) - ("nabla" . 8711) - ("isin" . 8712) - ("notin" . 8713) - ("ni" . 8715) - ("prod" . 8719) - ("sum" . 8721) - ("minus" . 8722) - ("lowast" . 8727) - ("radic" . 8730) - ("prop" . 8733) - ("infin" . 8734) - ("ang" . 8736) - ("and" . 8743) - ("or" . 8744) - ("cap" . 8745) - ("cup" . 8746) - ("int" . 8747) - ("there4" . 8756) - ("sim" . 8764) - ("cong" . 8773) - ("asymp" . 8776) - ("ne" . 8800) - ("equiv" . 8801) - ("le" . 8804) - ("ge" . 8805) - ("sub" . 8834) - ("sup" . 8835) - ("nsub" . 8836) - ("sube" . 8838) - ("supe" . 8839) - ("oplus" . 8853) - ("otimes" . 8855) - ("perp" . 8869) - ("sdot" . 8901) - ("lceil" . 8968) - ("rceil" . 8969) - ("lfloor" . 8970) - ("rfloor" . 8971) - ("lang" . 9001) - ("rang" . 9002) - ("loz" . 9674) - ("spades" . 9824) - ("clubs" . 9827) - ("hearts" . 9829) - ("diams" . 9830) - ("quot" . 34) - ("amp" . 38) - ("lt" . 60) - ("gt" . 62) - ("OElig" . 338) - ("oelig" . 339) - ("Scaron" . 352) - ("scaron" . 353) - ("Yuml" . 376) - ("circ" . 710) - ("tilde" . 732) - ("ensp" . 8194) - ("emsp" . 8195) - ("thinsp" . 8201) - ("zwnj" . 8204) - ("zwj" . 8205) - ("lrm" . 8206) - ("rlm" . 8207) - ("ndash" . 8211) - ("mdash" . 8212) - ("lsquo" . 8216) - ("rsquo" . 8217) - ("sbquo" . 8218) - ("ldquo" . 8220) - ("rdquo" . 8221) - ("bdquo" . 8222) - ("dagger" . 8224) - ("Dagger" . 8225) - ("permil" . 8240) - ("lsaquo" . 8249) - ("rsaquo" . 8250) - ("euro" . 8364) - )) - (setf (gethash (car ent) table) (cdr ent))) + ("iexcl" . 161) + ("cent" . 162) + ("pound" . 163) + ("curren" . 164) + ("yen" . 165) + ("brvbar" . 166) + ("sect" . 167) + ("uml" . 168) + ("copy" . 169) + ("ordf" . 170) + ("laquo" . 171) + ("not" . 172) + ("shy" . 173) + ("reg" . 174) + ("macr" . 175) + ("deg" . 176) + ("plusmn" . 177) + ("sup2" . 178) + ("sup3" . 179) + ("acute" . 180) + ("micro" . 181) + ("para" . 182) + ("middot" . 183) + ("cedil" . 184) + ("sup1" . 185) + ("ordm" . 186) + ("raquo" . 187) + ("frac14" . 188) + ("frac12" . 189) + ("frac34" . 190) + ("iquest" . 191) + ("Agrave" . 192) + ("Aacute" . 193) + ("Acirc" . 194) + ("Atilde" . 195) + ("Auml" . 196) + ("Aring" . 197) + ("AElig" . 198) + ("Ccedil" . 199) + ("Egrave" . 200) + ("Eacute" . 201) + ("Ecirc" . 202) + ("Euml" . 203) + ("Igrave" . 204) + ("Iacute" . 205) + ("Icirc" . 206) + ("Iuml" . 207) + ("ETH" . 208) + ("Ntilde" . 209) + ("Ograve" . 210) + ("Oacute" . 211) + ("Ocirc" . 212) + ("Otilde" . 213) + ("Ouml" . 214) + ("times" . 215) + ("Oslash" . 216) + ("Ugrave" . 217) + ("Uacute" . 218) + ("Ucirc" . 219) + ("Uuml" . 220) + ("Yacute" . 221) + ("THORN" . 222) + ("szlig" . 223) + ("agrave" . 224) + ("aacute" . 225) + ("acirc" . 226) + ("atilde" . 227) + ("auml" . 228) + ("aring" . 229) + ("aelig" . 230) + ("ccedil" . 231) + ("egrave" . 232) + ("eacute" . 233) + ("ecirc" . 234) + ("euml" . 235) + ("igrave" . 236) + ("iacute" . 237) + ("icirc" . 238) + ("iuml" . 239) + ("eth" . 240) + ("ntilde" . 241) + ("ograve" . 242) + ("oacute" . 243) + ("ocirc" . 244) + ("otilde" . 245) + ("ouml" . 246) + ("divide" . 247) + ("oslash" . 248) + ("ugrave" . 249) + ("uacute" . 250) + ("ucirc" . 251) + ("uuml" . 252) + ("yacute" . 253) + ("thorn" . 254) + ("yuml" . 255) + ("fnof" . 402) + ("Alpha" . 913) + ("Beta" . 914) + ("Gamma" . 915) + ("Delta" . 916) + ("Epsilon" . 917) + ("Zeta" . 918) + ("Eta" . 919) + ("Theta" . 920) + ("Iota" . 921) + ("Kappa" . 922) + ("Lambda" . 923) + ("Mu" . 924) + ("Nu" . 925) + ("Xi" . 926) + ("Omicron" . 927) + ("Pi" . 928) + ("Rho" . 929) + ("Sigma" . 931) + ("Tau" . 932) + ("Upsilon" . 933) + ("Phi" . 934) + ("Chi" . 935) + ("Psi" . 936) + ("Omega" . 937) + ("alpha" . 945) + ("beta" . 946) + ("gamma" . 947) + ("delta" . 948) + ("epsilon" . 949) + ("zeta" . 950) + ("eta" . 951) + ("theta" . 952) + ("iota" . 953) + ("kappa" . 954) + ("lambda" . 955) + ("mu" . 956) + ("nu" . 957) + ("xi" . 958) + ("omicron" . 959) + ("pi" . 960) + ("rho" . 961) + ("sigmaf" . 962) + ("sigma" . 963) + ("tau" . 964) + ("upsilon" . 965) + ("phi" . 966) + ("chi" . 967) + ("psi" . 968) + ("omega" . 969) + ("thetasym" . 977) + ("upsih" . 978) + ("piv" . 982) + ("bull" . 8226) + ("hellip" . 8230) + ("prime" . 8242) + ("Prime" . 8243) + ("oline" . 8254) + ("frasl" . 8260) + ("weierp" . 8472) + ("image" . 8465) + ("real" . 8476) + ("trade" . 8482) + ("alefsym" . 8501) + ("larr" . 8592) + ("uarr" . 8593) + ("rarr" . 8594) + ("darr" . 8595) + ("harr" . 8596) + ("crarr" . 8629) + ("lArr" . 8656) + ("uArr" . 8657) + ("rArr" . 8658) + ("dArr" . 8659) + ("hArr" . 8660) + ("forall" . 8704) + ("part" . 8706) + ("exist" . 8707) + ("empty" . 8709) + ("nabla" . 8711) + ("isin" . 8712) + ("notin" . 8713) + ("ni" . 8715) + ("prod" . 8719) + ("sum" . 8721) + ("minus" . 8722) + ("lowast" . 8727) + ("radic" . 8730) + ("prop" . 8733) + ("infin" . 8734) + ("ang" . 8736) + ("and" . 8743) + ("or" . 8744) + ("cap" . 8745) + ("cup" . 8746) + ("int" . 8747) + ("there4" . 8756) + ("sim" . 8764) + ("cong" . 8773) + ("asymp" . 8776) + ("ne" . 8800) + ("equiv" . 8801) + ("le" . 8804) + ("ge" . 8805) + ("sub" . 8834) + ("sup" . 8835) + ("nsub" . 8836) + ("sube" . 8838) + ("supe" . 8839) + ("oplus" . 8853) + ("otimes" . 8855) + ("perp" . 8869) + ("sdot" . 8901) + ("lceil" . 8968) + ("rceil" . 8969) + ("lfloor" . 8970) + ("rfloor" . 8971) + ("lang" . 9001) + ("rang" . 9002) + ("loz" . 9674) + ("spades" . 9824) + ("clubs" . 9827) + ("hearts" . 9829) + ("diams" . 9830) + ("quot" . 34) + ("amp" . 38) + ("lt" . 60) + ("gt" . 62) + ("OElig" . 338) + ("oelig" . 339) + ("Scaron" . 352) + ("scaron" . 353) + ("Yuml" . 376) + ("circ" . 710) + ("tilde" . 732) + ("ensp" . 8194) + ("emsp" . 8195) + ("thinsp" . 8201) + ("zwnj" . 8204) + ("zwj" . 8205) + ("lrm" . 8206) + ("rlm" . 8207) + ("ndash" . 8211) + ("mdash" . 8212) + ("lsquo" . 8216) + ("rsquo" . 8217) + ("sbquo" . 8218) + ("ldquo" . 8220) + ("rdquo" . 8221) + ("bdquo" . 8222) + ("dagger" . 8224) + ("Dagger" . 8225) + ("permil" . 8240) + ("lsaquo" . 8249) + ("rsaquo" . 8250) + ("euro" . 8364) + )) + (setf (gethash (car ent) table) (cdr ent))) table)) @@ -536,392 +536,392 @@ (let (buf) (without-scheduling (do* ((bufs *tokenbufs* (cdr bufs)) - (this (car bufs) (car bufs))) - ((null bufs)) - (if* this - then (setf (car bufs) nil) - (setq buf this) - (return)))) + (this (car bufs) (car bufs))) + ((null bufs)) + (if* this + then (setf (car bufs) nil) + (setq buf this) + (return)))) (if* buf then (setf (tokenbuf-cur buf) 0) - (setf (tokenbuf-max buf) 0) - buf + (setf (tokenbuf-max buf) 0) + buf else (make-tokenbuf - :cur 0 - :max 0 - :data (make-array 1024 :element-type 'character))))) + :cur 0 + :max 0 + :data (make-array 1024 :element-type 'character))))) (defun put-back-tokenbuf (buf) (declare (optimize (speed 3) (safety 1))) - (without-scheduling + (without-scheduling (do ((bufs *tokenbufs* (cdr bufs))) - ((null bufs) - ; toss it away - nil) + ((null bufs) + ; toss it away + nil) (if* (null (car bufs)) - then (setf (car bufs) buf) - (return))))) + then (setf (car bufs) buf) + (return))))) (defun to-preferred-case (ch) (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER) then (char-upcase ch) else (char-downcase ch))) - - + + (defun next-token (stream ignore-strings raw-mode-delimiter - read-sequence-func tokenbuf parse-entities) + read-sequence-func tokenbuf parse-entities) (declare (optimize (speed 3) (safety 1))) - ;; return two values: + ;; return two values: ;; the next token from the stream. - ;; the kind of token (:pcdata, :start-tag, :end-tag, :eof) + ;; the kind of token (:pcdata, :start-tag, :end-tag, :eof) ;; ;; if read-sequence-func is non-nil, ;; read-sequence-func is called to fetch the next character (macrolet ((next-char (stream) - `(let ((cur (tokenbuf-cur tokenbuf)) - (tb (tokenbuf-data tokenbuf))) - (if* (>= cur (tokenbuf-max tokenbuf)) - then ; fill buffer - (if* (zerop (setf (tokenbuf-max tokenbuf) - (if* read-sequence-func - then (funcall read-sequence-func tb stream) - else (read-sequence tb stream)))) - then (setq cur nil) ; eof - else (setq cur 0))) - (if* cur - then (prog1 (schar tb cur) - (setf (tokenbuf-cur tokenbuf) (1+ cur)))))) - - - (un-next-char (stream ch) - `(decf (tokenbuf-cur tokenbuf))) - - (clear-coll (coll) - `(setf (collector-next coll) 0)) - - (add-to-coll (coll ch) - `(let ((.next. (collector-next ,coll))) - (if* (>= .next. (collector-max ,coll)) - then (grow-and-add ,coll ,ch) - else (setf (schar (collector-data ,coll) .next.) - ,ch) - (setf (collector-next ,coll) (1+ .next.))))) - - ) - + `(let ((cur (tokenbuf-cur tokenbuf)) + (tb (tokenbuf-data tokenbuf))) + (if* (>= cur (tokenbuf-max tokenbuf)) + then ; fill buffer + (if* (zerop (setf (tokenbuf-max tokenbuf) + (if* read-sequence-func + then (funcall read-sequence-func tb stream) + else (read-sequence tb stream)))) + then (setq cur nil) ; eof + else (setq cur 0))) + (if* cur + then (prog1 (schar tb cur) + (setf (tokenbuf-cur tokenbuf) (1+ cur)))))) + + + (un-next-char (stream ch) + `(decf (tokenbuf-cur tokenbuf))) + + (clear-coll (coll) + `(setf (collector-next coll) 0)) + + (add-to-coll (coll ch) + `(let ((.next. (collector-next ,coll))) + (if* (>= .next. (collector-max ,coll)) + then (grow-and-add ,coll ,ch) + else (setf (schar (collector-data ,coll) .next.) + ,ch) + (setf (collector-next ,coll) (1+ .next.))))) + + ) + (let ((state (if* raw-mode-delimiter then state-rawdata else state-pcdata)) - (coll (get-collector)) - (ch) - - (value-delim) - - (tag-to-return) - (attribs-to-return) - - (end-tag) - - (attrib-name) - (attrib-value) - - (name-length 0) ;; count only when it could be a comment - - (raw-length 0) + (coll (get-collector)) + (ch) + + (value-delim) + + (tag-to-return) + (attribs-to-return) + + (end-tag) + + (attrib-name) + (attrib-value) + + (name-length 0) ;; count only when it could be a comment + + (raw-length 0) (xml-bailout) - ) - + ) + (loop - - (setq ch (next-char stream)) - ;;(format t "ch: ~s state: ~s~%" ch state) - - (if* (null ch) - then (return) ; eof -- exit loop - ) - - - (case state - (#.state-pcdata - ; collect everything until we see a < - (if* (eq ch #\<) - then ; if we've collected nothing then get a tag - (if* (> (collector-next coll) 0) - then ; have collected something, return this string - (un-next-char stream ch) ; push back the < - (return) - else ; collect a tag - (setq state state-readtagfirst)) - elseif (and parse-entities (eq ch #\&)) - then ; reading an entity. entity ends at semicolon - (let (res (max 10)) - (loop (let ((ch (next-char stream))) - (if* (null ch) - then (error "End of file after & entity marker") - elseif (eq ch #\;) - then (return) - elseif (zerop (decf max)) - then (error "No semicolon found after entity starting: &~{~a~}" (nreverse res)) - else (push ch res)))) - (setq res (nreverse res)) - (if* (eq (car res) #\#) - then ; decimal entity - (let ((count 0)) - (dolist (ch (cdr res)) - (let ((code (char-code ch))) - (if* (<= #.(char-code #\0) - code - #.(char-code #\9)) - then (setq count - (+ (* 10 count) - (- code - #.(char-code #\0)))) - else (error "non decimal digit after &# - ~s" ch) - ))) - (add-to-coll coll (code-char count))) - else (let ((name (make-array (length res) - :element-type 'character - :initial-contents res))) - (let ((ch (gethash name *html-entity-to-code*))) - (if* ch - then (add-to-coll coll (code-char ch)) - else (error "No such entity as ~s" name)))))) - - else ; we will check for & here eventually - (if* (not (eq ch #\return)) - then (add-to-coll coll ch)))) - - (#.state-readtagfirst - ; starting to read a tag name - (if* (eq #\/ ch) - then ; end tag - (setq end-tag t) - else (if* (eq #\! ch) ; possible comment - then (setf xml-bailout t) - (setq name-length 0)) - (un-next-char stream ch)) - (setq state state-readtag)) - - (#.state-readtag - ;; reading the whole tag name - (if* (char-characteristic ch char-tagcharacter) - then (add-to-coll coll (to-preferred-case ch)) - (incf name-length) - (if* (and (eq name-length 3) - (coll-has-comment coll)) - then (clear-coll coll) - (setq state state-readcomment)) - - else (setq tag-to-return (compute-tag coll)) - (clear-coll coll) - (if* (eq ch #\>) - then (return) ; we're done - elseif xml-bailout then - (un-next-char stream ch) - (return) - else (if* (eq tag-to-return :!--) - then ; a comment - (setq state state-readcomment) - else (un-next-char stream ch) - (setq state state-findattribname))))) - - (#.state-findattribname - ;; search until we find the start of an attribute name - ;; or the end of the tag - (if* (eq ch #\>) - then ; end of the line - (return) - elseif (eq ch #\=) - then ; value for previous attribute name - ; (syntax "foo = bar" is bogus I think but it's - ; used some places, here is where we handle this - (pop attribs-to-return) - (setq attrib-name (pop attribs-to-return)) - (setq state state-findvalue) - elseif (char-characteristic ch char-attribnamechar) - then (un-next-char stream ch) - (setq state state-attribname) - else nil ; ignore other things - )) - - (#.state-findvalue - ;; find the start of the value - (if* (char-characteristic ch char-spacechar) - thenret ; keep looking - elseif (eq ch #\>) - then ; no value, set the value to be the - ; name as a string - (setq attrib-value - (string-downcase (string attrib-name))) - - (push attrib-name attribs-to-return) - (push attrib-value attribs-to-return) - (un-next-char stream ch) - (setq state state-findattribname) - else (un-next-char stream ch) - (setq state state-attribstartvalue))) - - - (#.state-attribname - ;; collect attribute name - - (if* (char-characteristic ch char-attribnamechar) - then (add-to-coll coll (to-preferred-case ch)) - elseif (eq #\= ch) - then ; end of attribute name, value is next - (setq attrib-name (compute-tag coll)) - (clear-coll coll) - (setq state state-attribstartvalue) - else ; end of attribute name with no value, - (setq attrib-name (compute-tag coll)) - (clear-coll coll) - (setq attrib-value - (string-downcase (string attrib-name))) - (push attrib-name attribs-to-return) - (push attrib-value attribs-to-return) - (un-next-char stream ch) - (setq state state-findattribname))) - - (#.state-attribstartvalue - ;; begin to collect value - (if* (or (eq ch #\") - (eq ch #\')) - then (setq value-delim ch) - (setq state state-attribvaluedelim) - ;; gobble spaces; assume since we've seen a '=' there really is a value - elseif (eq #\space ch) then nil - else (un-next-char stream ch) - (setq state state-attribvaluenodelim))) - - (#.state-attribvaluedelim - (if* (eq ch value-delim) - then (setq attrib-value (compute-coll-string coll)) - (clear-coll coll) - (push attrib-name attribs-to-return) - (push attrib-value attribs-to-return) - (setq state state-findattribname) - else (add-to-coll coll ch))) - - (#.state-attribvaluenodelim - ;; an attribute value not delimited by ' or " and thus restricted - ;; in the possible characters - (if* (char-characteristic ch char-attribundelimattribvalue) - then (add-to-coll coll ch) - else (un-next-char stream ch) - (setq attrib-value (compute-coll-string coll)) - (clear-coll coll) - (push attrib-name attribs-to-return) - (push attrib-value attribs-to-return) - (setq state state-findattribname))) - - (#.state-readcomment - ;; a comment ends on the first --, but we'll look for --> - ;; since that's what most people expect - (if* (eq ch #\-) - then (setq state state-readcomment-one) - else (add-to-coll coll ch))) - - (#.state-readcomment-one - ;; seen one -, looking for -> - - (if* (eq ch #\-) - then (setq state state-readcomment-two) - else ; not a comment end, put back the -'s - (add-to-coll coll #\-) - (add-to-coll coll ch) - (setq state state-readcomment))) - - (#.state-readcomment-two - ;; seen two -'s, looking for > - - (if* (eq ch #\>) - then ; end of the line - (return) - elseif (eq ch #\-) - then ; still at two -'s, have to put out first - (add-to-coll coll #\-) - else ; put out two hypens and back to looking for a hypen - (add-to-coll coll #\-) - (add-to-coll coll #\-) - (setq state state-readcomment))) - - (#.state-rawdata - ;; collect everything until we see the delimiter - (if* (eq (to-preferred-case ch) (elt raw-mode-delimiter raw-length)) - then - (incf raw-length) - (when (= raw-length (length raw-mode-delimiter)) - ;; push the end tag back so it can then be lexed - ;; but don't do it for xml stuff - (when (/= (length raw-mode-delimiter) 1) - (push :end-tag (tokenbuf-first-pass tokenbuf)) - (if* (equal raw-mode-delimiter "") - then (push :STYLE (tokenbuf-first-pass tokenbuf)) - elseif (equal raw-mode-delimiter "") - then (push :style (tokenbuf-first-pass tokenbuf)) - elseif (equal raw-mode-delimiter "") - then (push :SCRIPT (tokenbuf-first-pass tokenbuf)) - elseif (equal raw-mode-delimiter "") - then (push :script (tokenbuf-first-pass tokenbuf)) - else (error "unexpected raw-mode-delimiter")) - ) - ;; set state to state-pcdata for next section - (return)) - else - ;; push partial matches into data string - (dotimes (i raw-length) - (add-to-coll coll (elt raw-mode-delimiter i))) - (setf raw-length 0) - (add-to-coll coll ch))) - - )) - - - ;; out of the loop. + + (setq ch (next-char stream)) + ;;(format t "ch: ~s state: ~s~%" ch state) + + (if* (null ch) + then (return) ; eof -- exit loop + ) + + + (case state + (#.state-pcdata + ; collect everything until we see a < + (if* (eq ch #\<) + then ; if we've collected nothing then get a tag + (if* (> (collector-next coll) 0) + then ; have collected something, return this string + (un-next-char stream ch) ; push back the < + (return) + else ; collect a tag + (setq state state-readtagfirst)) + elseif (and parse-entities (eq ch #\&)) + then ; reading an entity. entity ends at semicolon + (let (res (max 10)) + (loop (let ((ch (next-char stream))) + (if* (null ch) + then (error "End of file after & entity marker") + elseif (eq ch #\;) + then (return) + elseif (zerop (decf max)) + then (error "No semicolon found after entity starting: &~{~a~}" (nreverse res)) + else (push ch res)))) + (setq res (nreverse res)) + (if* (eq (car res) #\#) + then ; decimal entity + (let ((count 0)) + (dolist (ch (cdr res)) + (let ((code (char-code ch))) + (if* (<= #.(char-code #\0) + code + #.(char-code #\9)) + then (setq count + (+ (* 10 count) + (- code + #.(char-code #\0)))) + else (error "non decimal digit after &# - ~s" ch) + ))) + (add-to-coll coll (code-char count))) + else (let ((name (make-array (length res) + :element-type 'character + :initial-contents res))) + (let ((ch (gethash name *html-entity-to-code*))) + (if* ch + then (add-to-coll coll (code-char ch)) + else (error "No such entity as ~s" name)))))) + + else ; we will check for & here eventually + (if* (not (eq ch #\return)) + then (add-to-coll coll ch)))) + + (#.state-readtagfirst + ; starting to read a tag name + (if* (eq #\/ ch) + then ; end tag + (setq end-tag t) + else (if* (eq #\! ch) ; possible comment + then (setf xml-bailout t) + (setq name-length 0)) + (un-next-char stream ch)) + (setq state state-readtag)) + + (#.state-readtag + ;; reading the whole tag name + (if* (char-characteristic ch char-tagcharacter) + then (add-to-coll coll (to-preferred-case ch)) + (incf name-length) + (if* (and (eq name-length 3) + (coll-has-comment coll)) + then (clear-coll coll) + (setq state state-readcomment)) + + else (setq tag-to-return (compute-tag coll)) + (clear-coll coll) + (if* (eq ch #\>) + then (return) ; we're done + elseif xml-bailout then + (un-next-char stream ch) + (return) + else (if* (eq tag-to-return :!--) + then ; a comment + (setq state state-readcomment) + else (un-next-char stream ch) + (setq state state-findattribname))))) + + (#.state-findattribname + ;; search until we find the start of an attribute name + ;; or the end of the tag + (if* (eq ch #\>) + then ; end of the line + (return) + elseif (eq ch #\=) + then ; value for previous attribute name + ; (syntax "foo = bar" is bogus I think but it's + ; used some places, here is where we handle this + (pop attribs-to-return) + (setq attrib-name (pop attribs-to-return)) + (setq state state-findvalue) + elseif (char-characteristic ch char-attribnamechar) + then (un-next-char stream ch) + (setq state state-attribname) + else nil ; ignore other things + )) + + (#.state-findvalue + ;; find the start of the value + (if* (char-characteristic ch char-spacechar) + thenret ; keep looking + elseif (eq ch #\>) + then ; no value, set the value to be the + ; name as a string + (setq attrib-value + (string-downcase (string attrib-name))) + + (push attrib-name attribs-to-return) + (push attrib-value attribs-to-return) + (un-next-char stream ch) + (setq state state-findattribname) + else (un-next-char stream ch) + (setq state state-attribstartvalue))) + + + (#.state-attribname + ;; collect attribute name + + (if* (char-characteristic ch char-attribnamechar) + then (add-to-coll coll (to-preferred-case ch)) + elseif (eq #\= ch) + then ; end of attribute name, value is next + (setq attrib-name (compute-tag coll)) + (clear-coll coll) + (setq state state-attribstartvalue) + else ; end of attribute name with no value, + (setq attrib-name (compute-tag coll)) + (clear-coll coll) + (setq attrib-value + (string-downcase (string attrib-name))) + (push attrib-name attribs-to-return) + (push attrib-value attribs-to-return) + (un-next-char stream ch) + (setq state state-findattribname))) + + (#.state-attribstartvalue + ;; begin to collect value + (if* (or (eq ch #\") + (eq ch #\')) + then (setq value-delim ch) + (setq state state-attribvaluedelim) + ;; gobble spaces; assume since we've seen a '=' there really is a value + elseif (eq #\space ch) then nil + else (un-next-char stream ch) + (setq state state-attribvaluenodelim))) + + (#.state-attribvaluedelim + (if* (eq ch value-delim) + then (setq attrib-value (compute-coll-string coll)) + (clear-coll coll) + (push attrib-name attribs-to-return) + (push attrib-value attribs-to-return) + (setq state state-findattribname) + else (add-to-coll coll ch))) + + (#.state-attribvaluenodelim + ;; an attribute value not delimited by ' or " and thus restricted + ;; in the possible characters + (if* (char-characteristic ch char-attribundelimattribvalue) + then (add-to-coll coll ch) + else (un-next-char stream ch) + (setq attrib-value (compute-coll-string coll)) + (clear-coll coll) + (push attrib-name attribs-to-return) + (push attrib-value attribs-to-return) + (setq state state-findattribname))) + + (#.state-readcomment + ;; a comment ends on the first --, but we'll look for --> + ;; since that's what most people expect + (if* (eq ch #\-) + then (setq state state-readcomment-one) + else (add-to-coll coll ch))) + + (#.state-readcomment-one + ;; seen one -, looking for -> + + (if* (eq ch #\-) + then (setq state state-readcomment-two) + else ; not a comment end, put back the -'s + (add-to-coll coll #\-) + (add-to-coll coll ch) + (setq state state-readcomment))) + + (#.state-readcomment-two + ;; seen two -'s, looking for > + + (if* (eq ch #\>) + then ; end of the line + (return) + elseif (eq ch #\-) + then ; still at two -'s, have to put out first + (add-to-coll coll #\-) + else ; put out two hypens and back to looking for a hypen + (add-to-coll coll #\-) + (add-to-coll coll #\-) + (setq state state-readcomment))) + + (#.state-rawdata + ;; collect everything until we see the delimiter + (if* (eq (to-preferred-case ch) (elt raw-mode-delimiter raw-length)) + then + (incf raw-length) + (when (= raw-length (length raw-mode-delimiter)) + ;; push the end tag back so it can then be lexed + ;; but don't do it for xml stuff + (when (/= (length raw-mode-delimiter) 1) + (push :end-tag (tokenbuf-first-pass tokenbuf)) + (if* (equal raw-mode-delimiter "") + then (push :STYLE (tokenbuf-first-pass tokenbuf)) + elseif (equal raw-mode-delimiter "") + then (push :style (tokenbuf-first-pass tokenbuf)) + elseif (equal raw-mode-delimiter "") + then (push :SCRIPT (tokenbuf-first-pass tokenbuf)) + elseif (equal raw-mode-delimiter "") + then (push :script (tokenbuf-first-pass tokenbuf)) + else (error "unexpected raw-mode-delimiter")) + ) + ;; set state to state-pcdata for next section + (return)) + else + ;; push partial matches into data string + (dotimes (i raw-length) + (add-to-coll coll (elt raw-mode-delimiter i))) + (setf raw-length 0) + (add-to-coll coll ch))) + + )) + + + ;; out of the loop. ;; if we're in certain states then it means we should return a value ;; (case state - ((#.state-pcdata #.state-rawdata) - ;; return the buffer as a string - (if* (zerop (collector-next coll)) - then (values nil (if (eq state state-pcdata) :eof :pcdata)) - else (values (prog1 - (if* (null ignore-strings) - then (compute-coll-string coll)) - (put-back-collector coll)) - :pcdata))) - - (#.state-readtag - (when (null tag-to-return) - (error "unexpected end of input encountered")) - ;; we've read a tag with no attributes - (put-back-collector coll) - (values tag-to-return - (if* end-tag - then :end-tag - else (if* xml-bailout then :xml else :start-tag)) - )) - - (#.state-findattribname - ;; returning a tag with possible attributes - (put-back-collector coll) - (if* end-tag - then ; ignore any attributes - (values tag-to-return :end-tag) - elseif attribs-to-return - then (values (cons tag-to-return - (nreverse attribs-to-return)) - :start-tag) - else (values tag-to-return :start-tag))) - - (#.state-readcomment-two - ;; returning a comment - (values (prog1 (if* (null ignore-strings) - then (compute-coll-string coll)) - (put-back-collector coll)) - :comment)) - - (t - (if* (null ch) then (error "unexpected end of input encountered") - else (error "internal error, can't be here in state ~d" state))))))) + ((#.state-pcdata #.state-rawdata) + ;; return the buffer as a string + (if* (zerop (collector-next coll)) + then (values nil (if (eq state state-pcdata) :eof :pcdata)) + else (values (prog1 + (if* (null ignore-strings) + then (compute-coll-string coll)) + (put-back-collector coll)) + :pcdata))) + + (#.state-readtag + (when (null tag-to-return) + (error "unexpected end of input encountered")) + ;; we've read a tag with no attributes + (put-back-collector coll) + (values tag-to-return + (if* end-tag + then :end-tag + else (if* xml-bailout then :xml else :start-tag)) + )) + + (#.state-findattribname + ;; returning a tag with possible attributes + (put-back-collector coll) + (if* end-tag + then ; ignore any attributes + (values tag-to-return :end-tag) + elseif attribs-to-return + then (values (cons tag-to-return + (nreverse attribs-to-return)) + :start-tag) + else (values tag-to-return :start-tag))) + + (#.state-readcomment-two + ;; returning a comment + (values (prog1 (if* (null ignore-strings) + then (compute-coll-string coll)) + (put-back-collector coll)) + :comment)) + + (t + (if* (null ch) then (error "unexpected end of input encountered") + else (error "internal error, can't be here in state ~d" state))))))) (defvar *kwd-package* (find-package :keyword)) @@ -937,10 +937,10 @@ (declare (optimize (speed 3) (safety 1))) ;; return the string that's in the collection (let ((str (make-string (collector-next coll))) - (from (collector-data coll))) + (from (collector-data coll))) (dotimes (i (collector-next coll)) (setf (schar str i) (schar from i))) - + str)) (defun coll-has-comment (coll) @@ -948,39 +948,39 @@ ;; true if the collector has exactly "!--" in it (and (eq 3 (collector-next coll)) (let ((data (collector-data coll))) - (and (eq #\! (schar data 0)) - (eq #\- (schar data 1)) - (eq #\- (schar data 2)))))) - + (and (eq #\! (schar data 0)) + (eq #\- (schar data 1)) + (eq #\- (schar data 2)))))) + ;;;;;;;;;;; quick and dirty parse ; the elements with no body and thus no end tag -(dolist (opt '(:area :base :basefont :bgsound :br :button :col - ;;:colgroup - no, this is an element with contents - :embed :hr :img :frame - :input :isindex :keygen :link :meta - :plaintext :spacer :wbr)) +(dolist (opt '(:area :base :basefont :bgsound :br :button :col + ;;:colgroup - no, this is an element with contents + :embed :hr :img :frame + :input :isindex :keygen :link :meta + :plaintext :spacer :wbr)) (setf (tag-no-end opt) t)) (defvar *in-line* '(:tt :i :b :big :small :em :strong :dfn :code :samp :kbd - :var :cite :abbr :acronym :a :img :object :br :map - :q :sub :sup :span :bdo :input :select :textarea :label :button :font)) + :var :cite :abbr :acronym :a :img :object :br :map + :q :sub :sup :span :bdo :input :select :textarea :label :button :font)) (defvar *ch-format* '(:i :b :tt :big :small :strike :s :u - :em :strong :font)) + :em :strong :font)) (defvar *known-tags* '(:!doctype :a :acronym :address :applet :area :b :base :basefont - :bdo :bgsound :big :blink :blockquote :body :br :button :caption - :center :cite :code :col :colgroup :comment :dd :del :dfn :dir - :div :dl :dt :em :embed :fieldset :font :form :frame :frameset - :h1 :h2 :h3 :h4 :h5 :h6 :head :hr :html :i :iframe :img :input - :ins :isindex :kbd :label :layer :legend :li :link :listing :map - :marquee :menu :meta :multicol :nobr :noframes :noscript :object - :ol :option :p :param :plaintext :pre :q :samp :script :select - :small :spacer :span :s :strike :strong :style :sub :sup :table - :tbody :td :textarea :tfoot :th :thead :title :tr :tt :u :ul :var - :wbr :xmp)) + :bdo :bgsound :big :blink :blockquote :body :br :button :caption + :center :cite :code :col :colgroup :comment :dd :del :dfn :dir + :div :dl :dt :em :embed :fieldset :font :form :frame :frameset + :h1 :h2 :h3 :h4 :h5 :h6 :head :hr :html :i :iframe :img :input + :ins :isindex :kbd :label :layer :legend :li :link :listing :map + :marquee :menu :meta :multicol :nobr :noframes :noscript :object + :ol :option :p :param :plaintext :pre :q :samp :script :select + :small :spacer :span :s :strike :strong :style :sub :sup :table + :tbody :td :textarea :tfoot :th :thead :title :tr :tt :u :ul :var + :wbr :xmp)) ; the elements whose start tag can end a previous tag @@ -1046,339 +1046,339 @@ (defmethod parse-html ((p stream) &key callback-only callbacks collect-rogue-tags - no-body-tags - parse-entities) + no-body-tags + parse-entities) (declare (optimize (speed 3) (safety 1))) (phtml-internal p nil callback-only callbacks collect-rogue-tags - no-body-tags parse-entities)) + no-body-tags parse-entities)) (defmacro tag-callback (tag) `(rest (assoc ,tag callbacks))) -(defun phtml-internal (p read-sequence-func callback-only - callbacks collect-rogue-tags - no-body-tags - parse-entities) +(defun phtml-internal (p read-sequence-func callback-only + callbacks collect-rogue-tags + no-body-tags + parse-entities) (declare (optimize (speed 3) (safety 1))) (let ((raw-mode-delimiter nil) - (pending nil) - (current-tag :start-parse) - (last-tag :start-parse) - (current-callback-tags nil) - (pending-ch-format nil) - (closed-pending-ch-format nil) - (new-opens nil) - (tokenbuf (get-tokenbuf)) - (guts) - (rogue-tags) - ) + (pending nil) + (current-tag :start-parse) + (last-tag :start-parse) + (current-callback-tags nil) + (pending-ch-format nil) + (closed-pending-ch-format nil) + (new-opens nil) + (tokenbuf (get-tokenbuf)) + (guts) + (rogue-tags) + ) (labels ((close-off-tags (name stop-at collect-rogues once-only) - ;; close off an open 'name' tag, but search no further - ;; than a 'stop-at' tag. - #+ignore (format t "close off name ~s, stop at ~s, ct ~s~%" - name stop-at current-tag) - (if* (member (tag-name current-tag) name :test #'eq) - then ;; close current tag(s) - (loop - (when (and collect-rogues - (not (member (tag-name current-tag) - *known-tags*))) - (push (tag-name current-tag) rogue-tags)) - (close-current-tag) - (if* (or once-only - (member (tag-name current-tag) - *ch-format*) - (not (member - (tag-name current-tag) name :test #'eq))) - then (return))) - elseif (member (tag-name current-tag) stop-at :test #'eq) - then nil - else ; search if there is a tag to close - (dolist (ent pending) - (if* (member (tag-name (car ent)) name :test #'eq) - then ; found one to close - (loop - (when (and collect-rogues - (not (member (tag-name current-tag) - *known-tags*))) - (push (tag-name current-tag) rogue-tags)) - (close-current-tag) - (if* (member (tag-name current-tag) name - :test #'eq) - then (close-current-tag) - (return))) - (return) - elseif (member (tag-name (car ent)) stop-at - :test #'eq) - then (return) ;; do nothing - )))) - - (close-current-tag () - ;; close off the current tag and open the pending tag - (when (member (tag-name current-tag) *ch-format* :test #'eq) - (push current-tag closed-pending-ch-format) - ) - (let (element) - (if* (tag-no-pcdata (tag-name current-tag)) - then (setq element `(,current-tag - ,@(strip-rev-pcdata guts))) - else (setq element `(,current-tag ,@(nreverse guts)))) - (let ((callback (tag-callback (tag-name current-tag)))) - (when callback - (setf current-callback-tags (rest current-callback-tags)) - (funcall callback element))) - (let* ((prev (pop pending))) - (setq current-tag (car prev) - guts (cdr prev)) - (push element guts)))) - - (save-state () - ;; push the current tag state since we're starting: - ;; a new open tag - (push (cons current-tag guts) pending) - #+ignore (format t "state saved, pending ~s~%" pending) - ) - - - (strip-rev-pcdata (stuff) - ;; reverse the list stuff, omitting all the strings - (let (res) - (dolist (st stuff) - (if* (not (stringp st)) then (push st res))) - res)) - (check-in-line (check-tag) - (setf new-opens nil) - (let (val kind (i 0) - (length (length (tokenbuf-first-pass tokenbuf)))) - (loop - (if* (< i length) then - (setf val (nth i (tokenbuf-first-pass tokenbuf))) - (setf kind (nth (+ i 1) (tokenbuf-first-pass tokenbuf))) - (setf i (+ i 2)) - (if* (= i length) then (setf (tokenbuf-first-pass tokenbuf) - (nreverse (tokenbuf-first-pass tokenbuf)))) - else - (multiple-value-setq (val kind) - (get-next-token t)) - (push val (tokenbuf-first-pass tokenbuf)) - (push kind (tokenbuf-first-pass tokenbuf)) - ) - (when (eq kind :eof) - (if* (= i length) then - (setf (tokenbuf-first-pass tokenbuf) - (nreverse (tokenbuf-first-pass tokenbuf)))) - (return)) - (when (and (eq val check-tag) (eq kind :end-tag)) - (if* (= i length) then - (setf (tokenbuf-first-pass tokenbuf) - (nreverse (tokenbuf-first-pass tokenbuf)))) - (return)) - (when (member val *ch-format* :test #'eq) - (if* (eq kind :start-tag) then (push val new-opens) - elseif (member val new-opens :test #'eq) then - (setf new-opens (remove val new-opens :count 1)) - else (close-off-tags (list val) nil nil nil) - ))))) - - (get-next-token (force) - (if* (or force (null (tokenbuf-first-pass tokenbuf))) then - (multiple-value-bind (val kind) - (next-token p nil raw-mode-delimiter read-sequence-func - tokenbuf parse-entities) - (values val kind)) - else - (let ((val (first (tokenbuf-first-pass tokenbuf))) - (kind (second (tokenbuf-first-pass tokenbuf)))) - (setf (tokenbuf-first-pass tokenbuf) - (rest (rest (tokenbuf-first-pass tokenbuf)))) - (values val kind)))) - ) + ;; close off an open 'name' tag, but search no further + ;; than a 'stop-at' tag. + #+ignore (format t "close off name ~s, stop at ~s, ct ~s~%" + name stop-at current-tag) + (if* (member (tag-name current-tag) name :test #'eq) + then ;; close current tag(s) + (loop + (when (and collect-rogues + (not (member (tag-name current-tag) + *known-tags*))) + (push (tag-name current-tag) rogue-tags)) + (close-current-tag) + (if* (or once-only + (member (tag-name current-tag) + *ch-format*) + (not (member + (tag-name current-tag) name :test #'eq))) + then (return))) + elseif (member (tag-name current-tag) stop-at :test #'eq) + then nil + else ; search if there is a tag to close + (dolist (ent pending) + (if* (member (tag-name (car ent)) name :test #'eq) + then ; found one to close + (loop + (when (and collect-rogues + (not (member (tag-name current-tag) + *known-tags*))) + (push (tag-name current-tag) rogue-tags)) + (close-current-tag) + (if* (member (tag-name current-tag) name + :test #'eq) + then (close-current-tag) + (return))) + (return) + elseif (member (tag-name (car ent)) stop-at + :test #'eq) + then (return) ;; do nothing + )))) + + (close-current-tag () + ;; close off the current tag and open the pending tag + (when (member (tag-name current-tag) *ch-format* :test #'eq) + (push current-tag closed-pending-ch-format) + ) + (let (element) + (if* (tag-no-pcdata (tag-name current-tag)) + then (setq element `(,current-tag + ,@(strip-rev-pcdata guts))) + else (setq element `(,current-tag ,@(nreverse guts)))) + (let ((callback (tag-callback (tag-name current-tag)))) + (when callback + (setf current-callback-tags (rest current-callback-tags)) + (funcall callback element))) + (let* ((prev (pop pending))) + (setq current-tag (car prev) + guts (cdr prev)) + (push element guts)))) + + (save-state () + ;; push the current tag state since we're starting: + ;; a new open tag + (push (cons current-tag guts) pending) + #+ignore (format t "state saved, pending ~s~%" pending) + ) + + + (strip-rev-pcdata (stuff) + ;; reverse the list stuff, omitting all the strings + (let (res) + (dolist (st stuff) + (if* (not (stringp st)) then (push st res))) + res)) + (check-in-line (check-tag) + (setf new-opens nil) + (let (val kind (i 0) + (length (length (tokenbuf-first-pass tokenbuf)))) + (loop + (if* (< i length) then + (setf val (nth i (tokenbuf-first-pass tokenbuf))) + (setf kind (nth (+ i 1) (tokenbuf-first-pass tokenbuf))) + (setf i (+ i 2)) + (if* (= i length) then (setf (tokenbuf-first-pass tokenbuf) + (nreverse (tokenbuf-first-pass tokenbuf)))) + else + (multiple-value-setq (val kind) + (get-next-token t)) + (push val (tokenbuf-first-pass tokenbuf)) + (push kind (tokenbuf-first-pass tokenbuf)) + ) + (when (eq kind :eof) + (if* (= i length) then + (setf (tokenbuf-first-pass tokenbuf) + (nreverse (tokenbuf-first-pass tokenbuf)))) + (return)) + (when (and (eq val check-tag) (eq kind :end-tag)) + (if* (= i length) then + (setf (tokenbuf-first-pass tokenbuf) + (nreverse (tokenbuf-first-pass tokenbuf)))) + (return)) + (when (member val *ch-format* :test #'eq) + (if* (eq kind :start-tag) then (push val new-opens) + elseif (member val new-opens :test #'eq) then + (setf new-opens (remove val new-opens :count 1)) + else (close-off-tags (list val) nil nil nil) + ))))) + + (get-next-token (force) + (if* (or force (null (tokenbuf-first-pass tokenbuf))) then + (multiple-value-bind (val kind) + (next-token p nil raw-mode-delimiter read-sequence-func + tokenbuf parse-entities) + (values val kind)) + else + (let ((val (first (tokenbuf-first-pass tokenbuf))) + (kind (second (tokenbuf-first-pass tokenbuf)))) + (setf (tokenbuf-first-pass tokenbuf) + (rest (rest (tokenbuf-first-pass tokenbuf)))) + (values val kind)))) + ) (loop - (multiple-value-bind (val kind) - (get-next-token nil) - #+ignore (format t "val: ~s kind: ~s last-tag ~s pending ~s~%" val kind - last-tag pending) - (case kind - (:pcdata - (when (or (and callback-only current-callback-tags) - (not callback-only)) - (if* (member last-tag *in-line*) - then - (push val guts) - else - (when (dotimes (i (length val) nil) - (when (not (char-characteristic (elt val i) - char-spacechar)) - (return t))) - (push val guts)))) - (when (and (= (length raw-mode-delimiter) 1) ;; xml tag... - (or (and callback-only current-callback-tags) - (not callback-only))) - (close-off-tags (list last-tag) nil nil t)) - (setf raw-mode-delimiter nil) - ) - - (:xml - (setf last-tag val) - (setf raw-mode-delimiter ">") - (let* ((name (tag-name val))) - (when (and callback-only (tag-callback name)) - (push name current-callback-tags)) - (save-state) - (setq current-tag val) - (setq guts nil) - )) - - (:start-tag - (setf last-tag val) - (if* (or (eq last-tag :style) - (and (listp last-tag) (eq (first last-tag) :style))) - then - (setf raw-mode-delimiter - (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER) - then "" - else "")) - elseif (or (eq last-tag :script) - (and (listp last-tag) (eq (first last-tag) :script))) - then - (setf raw-mode-delimiter - (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER) - then "" - else ""))) - ; maybe this is an end tag too - (let* ((name (tag-name val)) - (auto-close (tag-auto-close name)) - (auto-close-stop nil) - (no-end (or (tag-no-end name) (member name no-body-tags)))) - (when (and callback-only (tag-callback name)) - (push name current-callback-tags)) - (when (or (and callback-only current-callback-tags) - (not callback-only)) - (if* auto-close - then (setq auto-close-stop (tag-auto-close-stop name)) - (close-off-tags auto-close auto-close-stop nil nil)) - (when (and pending-ch-format (not no-end)) - (if* (member name *ch-format* :test #'eq) then nil - elseif (member name *in-line* :test #'eq) then - ;; close off only tags that are within *in-line* block - (check-in-line name) - else ;; close ALL pending char tags and then reopen - (dolist (this-tag (reverse pending-ch-format)) - (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil nil)) - )) - (if* no-end - then ; this is a singleton tag - (let ((callback (tag-callback (tag-name (if* (atom val) - then val - else (first val)))))) - (when callback - (funcall callback (if* (atom val) - then val - else (list val))))) - (push (if* (atom val) - then val - else (list val)) - guts) - else (save-state) - (setq current-tag val) - (setq guts nil)) - (if* (member name *ch-format* :test #'eq) - then (push val pending-ch-format) - else (when (not - (or (eq last-tag :style) - (and (listp last-tag) (eq (first last-tag) :style)) - (eq last-tag :script) - (and (listp last-tag) (eq (first last-tag) :script)))) - (dolist (tmp (reverse closed-pending-ch-format)) - (save-state) - (setf current-tag tmp) - (setf guts nil))) - ) - (when (not - (or (eq last-tag :style) - (and (listp last-tag) (eq (first last-tag) :style)) - (eq last-tag :script) - (and (listp last-tag) (eq (first last-tag) :script)))) - (setf closed-pending-ch-format nil)) - ))) - - (:end-tag - (setf raw-mode-delimiter nil) - (when (or (and callback-only current-callback-tags) - (not callback-only)) - (close-off-tags (list val) nil nil t) - (when (member val *ch-format* :test #'eq) - (setf pending-ch-format - (remove val pending-ch-format :count 1 - :test #'(lambda (x y) (eq x (if (listp y) (first y) y))))) - (setf closed-pending-ch-format - (remove val closed-pending-ch-format :count 1 - :test #'(lambda (x y) (eq x (if (listp y) (first y) y))))) - ) - (dolist (tmp (reverse closed-pending-ch-format)) - (save-state) - (setf current-tag tmp) - (setf guts nil)) - (setf closed-pending-ch-format nil) - )) - - (:comment - (setf raw-mode-delimiter nil) - (when (or (and callback-only current-callback-tags) - (not callback-only)) - (push `(:comment ,val) guts))) - - (:eof - (setf raw-mode-delimiter nil) - ;; close off all tags - (when (or (and callback-only current-callback-tags) - (not callback-only)) - (close-off-tags '(:start-parse) nil collect-rogue-tags nil)) - (put-back-tokenbuf tokenbuf) - (if collect-rogue-tags - (return (values (cdar guts) rogue-tags)) - (return (cdar guts)))))))))) - - + (multiple-value-bind (val kind) + (get-next-token nil) + #+ignore (format t "val: ~s kind: ~s last-tag ~s pending ~s~%" val kind + last-tag pending) + (case kind + (:pcdata + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (if* (member last-tag *in-line*) + then + (push val guts) + else + (when (dotimes (i (length val) nil) + (when (not (char-characteristic (elt val i) + char-spacechar)) + (return t))) + (push val guts)))) + (when (and (= (length raw-mode-delimiter) 1) ;; xml tag... + (or (and callback-only current-callback-tags) + (not callback-only))) + (close-off-tags (list last-tag) nil nil t)) + (setf raw-mode-delimiter nil) + ) + + (:xml + (setf last-tag val) + (setf raw-mode-delimiter ">") + (let* ((name (tag-name val))) + (when (and callback-only (tag-callback name)) + (push name current-callback-tags)) + (save-state) + (setq current-tag val) + (setq guts nil) + )) + + (:start-tag + (setf last-tag val) + (if* (or (eq last-tag :style) + (and (listp last-tag) (eq (first last-tag) :style))) + then + (setf raw-mode-delimiter + (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER) + then "" + else "")) + elseif (or (eq last-tag :script) + (and (listp last-tag) (eq (first last-tag) :script))) + then + (setf raw-mode-delimiter + (if* (eq excl:*current-case-mode* :CASE-INSENSITIVE-UPPER) + then "" + else ""))) + ; maybe this is an end tag too + (let* ((name (tag-name val)) + (auto-close (tag-auto-close name)) + (auto-close-stop nil) + (no-end (or (tag-no-end name) (member name no-body-tags)))) + (when (and callback-only (tag-callback name)) + (push name current-callback-tags)) + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (if* auto-close + then (setq auto-close-stop (tag-auto-close-stop name)) + (close-off-tags auto-close auto-close-stop nil nil)) + (when (and pending-ch-format (not no-end)) + (if* (member name *ch-format* :test #'eq) then nil + elseif (member name *in-line* :test #'eq) then + ;; close off only tags that are within *in-line* block + (check-in-line name) + else ;; close ALL pending char tags and then reopen + (dolist (this-tag (reverse pending-ch-format)) + (close-off-tags (list (if (listp this-tag) (first this-tag) this-tag)) nil nil nil)) + )) + (if* no-end + then ; this is a singleton tag + (let ((callback (tag-callback (tag-name (if* (atom val) + then val + else (first val)))))) + (when callback + (funcall callback (if* (atom val) + then val + else (list val))))) + (push (if* (atom val) + then val + else (list val)) + guts) + else (save-state) + (setq current-tag val) + (setq guts nil)) + (if* (member name *ch-format* :test #'eq) + then (push val pending-ch-format) + else (when (not + (or (eq last-tag :style) + (and (listp last-tag) (eq (first last-tag) :style)) + (eq last-tag :script) + (and (listp last-tag) (eq (first last-tag) :script)))) + (dolist (tmp (reverse closed-pending-ch-format)) + (save-state) + (setf current-tag tmp) + (setf guts nil))) + ) + (when (not + (or (eq last-tag :style) + (and (listp last-tag) (eq (first last-tag) :style)) + (eq last-tag :script) + (and (listp last-tag) (eq (first last-tag) :script)))) + (setf closed-pending-ch-format nil)) + ))) + + (:end-tag + (setf raw-mode-delimiter nil) + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (close-off-tags (list val) nil nil t) + (when (member val *ch-format* :test #'eq) + (setf pending-ch-format + (remove val pending-ch-format :count 1 + :test #'(lambda (x y) (eq x (if (listp y) (first y) y))))) + (setf closed-pending-ch-format + (remove val closed-pending-ch-format :count 1 + :test #'(lambda (x y) (eq x (if (listp y) (first y) y))))) + ) + (dolist (tmp (reverse closed-pending-ch-format)) + (save-state) + (setf current-tag tmp) + (setf guts nil)) + (setf closed-pending-ch-format nil) + )) + + (:comment + (setf raw-mode-delimiter nil) + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (push `(:comment ,val) guts))) + + (:eof + (setf raw-mode-delimiter nil) + ;; close off all tags + (when (or (and callback-only current-callback-tags) + (not callback-only)) + (close-off-tags '(:start-parse) nil collect-rogue-tags nil)) + (put-back-tokenbuf tokenbuf) + (if collect-rogue-tags + (return (values (cdar guts) rogue-tags)) + (return (cdar guts)))))))))) + + (defmethod parse-html (file &key callback-only callbacks collect-rogue-tags - no-body-tags parse-entities) + no-body-tags parse-entities) (declare (optimize (speed 3) (safety 1))) (with-open-file (p file :direction :input) (parse-html p :callback-only callback-only :callbacks callbacks - :collect-rogue-tags collect-rogue-tags - :no-body-tags no-body-tags - :parse-entities parse-entities - ))) - + :collect-rogue-tags collect-rogue-tags + :no-body-tags no-body-tags + :parse-entities parse-entities + ))) + (defmethod parse-html ((str string) &key callback-only callbacks collect-rogue-tags - no-body-tags parse-entities) + no-body-tags parse-entities) (declare (optimize (speed 3) (safety 1))) - (parse-html (make-string-input-stream str) - :callback-only callback-only :callbacks callbacks - :collect-rogue-tags collect-rogue-tags - :no-body-tags no-body-tags - :parse-entities parse-entities - )) - - - - - - - - - + (parse-html (make-string-input-stream str) + :callback-only callback-only :callbacks callbacks + :collect-rogue-tags collect-rogue-tags + :no-body-tags no-body-tags + :parse-entities parse-entities + )) + + + + + + + + + ;;;;;;;;;;;; test ;;;(defun doit (ignore-data) ;;; (with-open-file (p "readme.htm") ;;; (loop ;;; (multiple-value-bind (val kind) (next-token p ignore-data) -;;; ;(format t "~s -> ~s~%" kind val) -;;; -;;; (if* (eq kind :eof) then (return)))))) +;;; ;(format t "~s -> ~s~%" kind val) +;;; +;;; (if* (eq kind :eof) then (return)))))) ;;; ;;;(defun pdoit (&optional (file "testa.html")) ;;; (with-open-file (p file) @@ -1387,8 +1387,8 @@ ;;; ;;;;; requires http client module to work ;;;(defun getparse (host path) -;;; (parse-html (httpr-body -;;; (parse-response -;;; (simple-get host path))))) +;;; (parse-html (httpr-body +;;; (parse-response +;;; (simple-get host path))))) (provide :phtml)