-
- (setq ch (next-char stream))
- ;;(format t "ch: ~s state: ~s~%" ch state)
-
- (if* (null ch)
- then (return) ; eof -- exit loop
- )
-
-
- (case state
- (#.state-pcdata
- ; collect everything until we see a <
- (if* (eq ch #\<)
- then ; if we've collected nothing then get a tag
- (if* (> (collector-next coll) 0)
- then ; have collected something, return this string
- (un-next-char stream ch) ; push back the <
- (return)
- else ; collect a tag
- (setq state state-readtagfirst))
- elseif (and parse-entities (eq ch #\&))
- then ; reading an entity. entity ends at semicolon
- (let (res (max 10))
- (loop (let ((ch (next-char stream)))
- (if* (null ch)
- then (error "End of file after & entity marker")
- elseif (eq ch #\;)
- then (return)
- elseif (zerop (decf max))
- then (error "No semicolon found after entity starting: &~{~a~}" (nreverse res))
- else (push ch res))))
- (setq res (nreverse res))
- (if* (eq (car res) #\#)
- then ; decimal entity
- (let ((count 0))
- (dolist (ch (cdr res))
- (let ((code (char-code ch)))
- (if* (<= #.(char-code #\0)
- code
- #.(char-code #\9))
- then (setq count
- (+ (* 10 count)
- (- code
- #.(char-code #\0))))
- else (error "non decimal digit after &# - ~s" ch)
- )))
- (add-to-coll coll (code-char count)))
- else (let ((name (make-array (length res)
- :element-type 'character
- :initial-contents res)))
- (let ((ch (gethash name *html-entity-to-code*)))
- (if* ch
- then (add-to-coll coll (code-char ch))
- else (error "No such entity as ~s" name))))))
-
- else ; we will check for & here eventually
- (if* (not (eq ch #\return))
- then (add-to-coll coll ch))))
-
- (#.state-readtagfirst
- ; starting to read a tag name
- (if* (eq #\/ ch)
- then ; end tag
- (setq end-tag t)
- else (if* (eq #\! ch) ; possible comment
- then (setf xml-bailout t)
- (setq name-length 0))
- (un-next-char stream ch))
- (setq state state-readtag))
-
- (#.state-readtag
- ;; reading the whole tag name
- (if* (char-characteristic ch char-tagcharacter)
- then (add-to-coll coll (to-preferred-case ch))
- (incf name-length)
- (if* (and (eq name-length 3)
- (coll-has-comment coll))
- then (clear-coll coll)
- (setq state state-readcomment))
-
- else (setq tag-to-return (compute-tag coll))
- (clear-coll coll)
- (if* (eq ch #\>)
- then (return) ; we're done
- elseif xml-bailout then
- (un-next-char stream ch)
- (return)
- else (if* (eq tag-to-return :!--)
- then ; a comment
- (setq state state-readcomment)
- else (un-next-char stream ch)
- (setq state state-findattribname)))))
-
- (#.state-findattribname
- ;; search until we find the start of an attribute name
- ;; or the end of the tag
- (if* (eq ch #\>)
- then ; end of the line
- (return)
- elseif (eq ch #\=)
- then ; value for previous attribute name
- ; (syntax "foo = bar" is bogus I think but it's
- ; used some places, here is where we handle this
- (pop attribs-to-return)
- (setq attrib-name (pop attribs-to-return))
- (setq state state-findvalue)
- elseif (char-characteristic ch char-attribnamechar)
- then (un-next-char stream ch)
- (setq state state-attribname)
- else nil ; ignore other things
- ))
-
- (#.state-findvalue
- ;; find the start of the value
- (if* (char-characteristic ch char-spacechar)
- thenret ; keep looking
- elseif (eq ch #\>)
- then ; no value, set the value to be the
- ; name as a string
- (setq attrib-value
- (string-downcase (string attrib-name)))
-
- (push attrib-name attribs-to-return)
- (push attrib-value attribs-to-return)
- (un-next-char stream ch)
- (setq state state-findattribname)
- else (un-next-char stream ch)
- (setq state state-attribstartvalue)))
-
-
- (#.state-attribname
- ;; collect attribute name
-
- (if* (char-characteristic ch char-attribnamechar)
- then (add-to-coll coll (to-preferred-case ch))
- elseif (eq #\= ch)
- then ; end of attribute name, value is next
- (setq attrib-name (compute-tag coll))
- (clear-coll coll)
- (setq state state-attribstartvalue)
- else ; end of attribute name with no value,
- (setq attrib-name (compute-tag coll))
- (clear-coll coll)
- (setq attrib-value
- (string-downcase (string attrib-name)))
- (push attrib-name attribs-to-return)
- (push attrib-value attribs-to-return)
- (un-next-char stream ch)
- (setq state state-findattribname)))
-
- (#.state-attribstartvalue
- ;; begin to collect value
- (if* (or (eq ch #\")
- (eq ch #\'))
- then (setq value-delim ch)
- (setq state state-attribvaluedelim)
- ;; gobble spaces; assume since we've seen a '=' there really is a value
- elseif (eq #\space ch) then nil
- else (un-next-char stream ch)
- (setq state state-attribvaluenodelim)))
-
- (#.state-attribvaluedelim
- (if* (eq ch value-delim)
- then (setq attrib-value (compute-coll-string coll))
- (clear-coll coll)
- (push attrib-name attribs-to-return)
- (push attrib-value attribs-to-return)
- (setq state state-findattribname)
- else (add-to-coll coll ch)))
-
- (#.state-attribvaluenodelim
- ;; an attribute value not delimited by ' or " and thus restricted
- ;; in the possible characters
- (if* (char-characteristic ch char-attribundelimattribvalue)
- then (add-to-coll coll ch)
- else (un-next-char stream ch)
- (setq attrib-value (compute-coll-string coll))
- (clear-coll coll)
- (push attrib-name attribs-to-return)
- (push attrib-value attribs-to-return)
- (setq state state-findattribname)))
-
- (#.state-readcomment
- ;; a comment ends on the first --, but we'll look for -->
- ;; since that's what most people expect
- (if* (eq ch #\-)
- then (setq state state-readcomment-one)
- else (add-to-coll coll ch)))
-
- (#.state-readcomment-one
- ;; seen one -, looking for ->
-
- (if* (eq ch #\-)
- then (setq state state-readcomment-two)
- else ; not a comment end, put back the -'s
- (add-to-coll coll #\-)
- (add-to-coll coll ch)
- (setq state state-readcomment)))
-
- (#.state-readcomment-two
- ;; seen two -'s, looking for >
-
- (if* (eq ch #\>)
- then ; end of the line
- (return)
- elseif (eq ch #\-)
- then ; still at two -'s, have to put out first
- (add-to-coll coll #\-)
- else ; put out two hypens and back to looking for a hypen
- (add-to-coll coll #\-)
- (add-to-coll coll #\-)
- (setq state state-readcomment)))
-
- (#.state-rawdata
- ;; collect everything until we see the delimiter
- (if* (eq (to-preferred-case ch) (elt raw-mode-delimiter raw-length))
- then
- (incf raw-length)
- (when (= raw-length (length raw-mode-delimiter))
- ;; push the end tag back so it can then be lexed
- ;; but don't do it for xml stuff
- (when (/= (length raw-mode-delimiter) 1)
- (push :end-tag (tokenbuf-first-pass tokenbuf))
- (if* (equal raw-mode-delimiter "</STYLE>")
- then (push :STYLE (tokenbuf-first-pass tokenbuf))
- elseif (equal raw-mode-delimiter "</style>")
- then (push :style (tokenbuf-first-pass tokenbuf))
- elseif (equal raw-mode-delimiter "</SCRIPT>")
- then (push :SCRIPT (tokenbuf-first-pass tokenbuf))
- elseif (equal raw-mode-delimiter "</script>")
- then (push :script (tokenbuf-first-pass tokenbuf))
- else (error "unexpected raw-mode-delimiter"))
- )
- ;; set state to state-pcdata for next section
- (return))
- else
- ;; push partial matches into data string
- (dotimes (i raw-length)
- (add-to-coll coll (elt raw-mode-delimiter i)))
- (setf raw-length 0)
- (add-to-coll coll ch)))
-
- ))
-
-
- ;; out of the loop.
+
+ (setq ch (next-char stream))
+ ;;(format t "ch: ~s state: ~s~%" ch state)
+
+ (if* (null ch)
+ then (return) ; eof -- exit loop
+ )
+
+
+ (case state
+ (#.state-pcdata
+ ; collect everything until we see a <
+ (if* (eq ch #\<)
+ then ; if we've collected nothing then get a tag
+ (if* (> (collector-next coll) 0)
+ then ; have collected something, return this string
+ (un-next-char stream ch) ; push back the <
+ (return)
+ else ; collect a tag
+ (setq state state-readtagfirst))
+ elseif (and parse-entities (eq ch #\&))
+ then ; reading an entity. entity ends at semicolon
+ (let (res (max 10))
+ (loop (let ((ch (next-char stream)))
+ (if* (null ch)
+ then (error "End of file after & entity marker")
+ elseif (eq ch #\;)
+ then (return)
+ elseif (zerop (decf max))
+ then (error "No semicolon found after entity starting: &~{~a~}" (nreverse res))
+ else (push ch res))))
+ (setq res (nreverse res))
+ (if* (eq (car res) #\#)
+ then ; decimal entity
+ (let ((count 0))
+ (dolist (ch (cdr res))
+ (let ((code (char-code ch)))
+ (if* (<= #.(char-code #\0)
+ code
+ #.(char-code #\9))
+ then (setq count
+ (+ (* 10 count)
+ (- code
+ #.(char-code #\0))))
+ else (error "non decimal digit after &# - ~s" ch)
+ )))
+ (add-to-coll coll (code-char count)))
+ else (let ((name (make-array (length res)
+ :element-type 'character
+ :initial-contents res)))
+ (let ((ch (gethash name *html-entity-to-code*)))
+ (if* ch
+ then (add-to-coll coll (code-char ch))
+ else (error "No such entity as ~s" name))))))
+
+ else ; we will check for & here eventually
+ (if* (not (eq ch #\return))
+ then (add-to-coll coll ch))))
+
+ (#.state-readtagfirst
+ ; starting to read a tag name
+ (if* (eq #\/ ch)
+ then ; end tag
+ (setq end-tag t)
+ else (if* (eq #\! ch) ; possible comment
+ then (setf xml-bailout t)
+ (setq name-length 0))
+ (un-next-char stream ch))
+ (setq state state-readtag))
+
+ (#.state-readtag
+ ;; reading the whole tag name
+ (if* (char-characteristic ch char-tagcharacter)
+ then (add-to-coll coll (to-preferred-case ch))
+ (incf name-length)
+ (if* (and (eq name-length 3)
+ (coll-has-comment coll))
+ then (clear-coll coll)
+ (setq state state-readcomment))
+
+ else (setq tag-to-return (compute-tag coll))
+ (clear-coll coll)
+ (if* (eq ch #\>)
+ then (return) ; we're done
+ elseif xml-bailout then
+ (un-next-char stream ch)
+ (return)
+ else (if* (eq tag-to-return :!--)
+ then ; a comment
+ (setq state state-readcomment)
+ else (un-next-char stream ch)
+ (setq state state-findattribname)))))
+
+ (#.state-findattribname
+ ;; search until we find the start of an attribute name
+ ;; or the end of the tag
+ (if* (eq ch #\>)
+ then ; end of the line
+ (return)
+ elseif (eq ch #\=)
+ then ; value for previous attribute name
+ ; (syntax "foo = bar" is bogus I think but it's
+ ; used some places, here is where we handle this
+ (pop attribs-to-return)
+ (setq attrib-name (pop attribs-to-return))
+ (setq state state-findvalue)
+ elseif (char-characteristic ch char-attribnamechar)
+ then (un-next-char stream ch)
+ (setq state state-attribname)
+ else nil ; ignore other things
+ ))
+
+ (#.state-findvalue
+ ;; find the start of the value
+ (if* (char-characteristic ch char-spacechar)
+ thenret ; keep looking
+ elseif (eq ch #\>)
+ then ; no value, set the value to be the
+ ; name as a string
+ (setq attrib-value
+ (string-downcase (string attrib-name)))
+
+ (push attrib-name attribs-to-return)
+ (push attrib-value attribs-to-return)
+ (un-next-char stream ch)
+ (setq state state-findattribname)
+ else (un-next-char stream ch)
+ (setq state state-attribstartvalue)))
+
+
+ (#.state-attribname
+ ;; collect attribute name
+
+ (if* (char-characteristic ch char-attribnamechar)
+ then (add-to-coll coll (to-preferred-case ch))
+ elseif (eq #\= ch)
+ then ; end of attribute name, value is next
+ (setq attrib-name (compute-tag coll))
+ (clear-coll coll)
+ (setq state state-attribstartvalue)
+ else ; end of attribute name with no value,
+ (setq attrib-name (compute-tag coll))
+ (clear-coll coll)
+ (setq attrib-value
+ (string-downcase (string attrib-name)))
+ (push attrib-name attribs-to-return)
+ (push attrib-value attribs-to-return)
+ (un-next-char stream ch)
+ (setq state state-findattribname)))
+
+ (#.state-attribstartvalue
+ ;; begin to collect value
+ (if* (or (eq ch #\")
+ (eq ch #\'))
+ then (setq value-delim ch)
+ (setq state state-attribvaluedelim)
+ ;; gobble spaces; assume since we've seen a '=' there really is a value
+ elseif (eq #\space ch) then nil
+ else (un-next-char stream ch)
+ (setq state state-attribvaluenodelim)))
+
+ (#.state-attribvaluedelim
+ (if* (eq ch value-delim)
+ then (setq attrib-value (compute-coll-string coll))
+ (clear-coll coll)
+ (push attrib-name attribs-to-return)
+ (push attrib-value attribs-to-return)
+ (setq state state-findattribname)
+ else (add-to-coll coll ch)))
+
+ (#.state-attribvaluenodelim
+ ;; an attribute value not delimited by ' or " and thus restricted
+ ;; in the possible characters
+ (if* (char-characteristic ch char-attribundelimattribvalue)
+ then (add-to-coll coll ch)
+ else (un-next-char stream ch)
+ (setq attrib-value (compute-coll-string coll))
+ (clear-coll coll)
+ (push attrib-name attribs-to-return)
+ (push attrib-value attribs-to-return)
+ (setq state state-findattribname)))
+
+ (#.state-readcomment
+ ;; a comment ends on the first --, but we'll look for -->
+ ;; since that's what most people expect
+ (if* (eq ch #\-)
+ then (setq state state-readcomment-one)
+ else (add-to-coll coll ch)))
+
+ (#.state-readcomment-one
+ ;; seen one -, looking for ->
+
+ (if* (eq ch #\-)
+ then (setq state state-readcomment-two)
+ else ; not a comment end, put back the -'s
+ (add-to-coll coll #\-)
+ (add-to-coll coll ch)
+ (setq state state-readcomment)))
+
+ (#.state-readcomment-two
+ ;; seen two -'s, looking for >
+
+ (if* (eq ch #\>)
+ then ; end of the line
+ (return)
+ elseif (eq ch #\-)
+ then ; still at two -'s, have to put out first
+ (add-to-coll coll #\-)
+ else ; put out two hypens and back to looking for a hypen
+ (add-to-coll coll #\-)
+ (add-to-coll coll #\-)
+ (setq state state-readcomment)))
+
+ (#.state-rawdata
+ ;; collect everything until we see the delimiter
+ (if* (eq (to-preferred-case ch) (elt raw-mode-delimiter raw-length))
+ then
+ (incf raw-length)
+ (when (= raw-length (length raw-mode-delimiter))
+ ;; push the end tag back so it can then be lexed
+ ;; but don't do it for xml stuff
+ (when (/= (length raw-mode-delimiter) 1)
+ (push :end-tag (tokenbuf-first-pass tokenbuf))
+ (if* (equal raw-mode-delimiter "</STYLE>")
+ then (push :STYLE (tokenbuf-first-pass tokenbuf))
+ elseif (equal raw-mode-delimiter "</style>")
+ then (push :style (tokenbuf-first-pass tokenbuf))
+ elseif (equal raw-mode-delimiter "</SCRIPT>")
+ then (push :SCRIPT (tokenbuf-first-pass tokenbuf))
+ elseif (equal raw-mode-delimiter "</script>")
+ then (push :script (tokenbuf-first-pass tokenbuf))
+ else (error "unexpected raw-mode-delimiter"))
+ )
+ ;; set state to state-pcdata for next section
+ (return))
+ else
+ ;; push partial matches into data string
+ (dotimes (i raw-length)
+ (add-to-coll coll (elt raw-mode-delimiter i)))
+ (setf raw-length 0)
+ (add-to-coll coll ch)))
+
+ ))
+
+
+ ;; out of the loop.