1 ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: umlisp -*-
2 ;;;; *************************************************************************
3 ;;;; FILE IDENTIFICATION
5 ;;;; Name: parse-common.lisp
6 ;;;; Purpose: Common, stable parsing routines for UMLisp
7 ;;;; Programmer: Kevin M. Rosenberg
8 ;;;; Date Started: Apr 2000
10 ;;;; $Id: parse-common.lisp,v 1.2 2002/10/09 23:03:41 kevin Exp $
12 ;;;; This file, part of UMLisp, is
13 ;;;; Copyright (c) 2000-2002 by Kevin M. Rosenberg, M.D.
15 ;;;; UMLisp users are granted the rights to distribute and use this software
16 ;;;; as governed by the terms of the GNU General Public License.
17 ;;;; *************************************************************************
20 (declaim (optimize (speed 3) (safety 1) (compilation-speed 0) (debug 3)))
22 (defun umls-pathname (filename &optional (extension ""))
23 "Return pathname for a umls filename"
27 (make-pathname :name (concatenate 'string filename extension))
28 (case (char filename 0)
40 (defun read-umls-line (strm)
41 "Read a line from a UMLS stream, split into fields"
42 (let ((line (read-line strm nil 'eof)))
43 (if (stringp line) ;; ensure not 'eof
44 (let* ((len (length line))
45 (maybe-remove-terminal ;; LRWD doesn't have '|' at end of line
46 (if (char= #\| (char line (1- len)))
47 (subseq line 0 (1- len))
49 (declare (fixnum len))
50 (delimited-string-to-list maybe-remove-terminal #\|))
54 ;;; Find field lengths for LEX and NET files
56 (defun file-field-lengths (files)
59 (setq file (umls-file-fil file))
60 (let (max-field count-field num-fields (count-lines 0))
61 (with-umls-file (fields file)
63 (setq num-fields (length fields))
64 (setq max-field (make-array num-fields :element-type 'fixnum
66 (setq count-field (make-array num-fields :element-type 'number
68 (dotimes (i (length fields))
70 (let ((len (length (nth i fields))))
71 (incf (aref count-field i) len)
72 (when (> len (aref max-field i))
73 (setf (aref max-field i) len))))
75 (dotimes (i num-fields)
76 (setf (aref count-field i) (float (/ (aref count-field i) count-lines))))
77 (push (list file max-field count-field) lengths)))
80 (defun init-field-lengths ()
81 "Initial colstruct field lengths for files that don't have a measurement.
82 Currently, these are the LEX and NET files."
83 (let ((measure-files '()))
84 (dolist (file *umls-files*)
85 (let ((filename (umls-file-fil file)))
86 (unless (or (char= #\M (char filename 0))
87 (char= #\m (char filename 0)))
88 (push file measure-files))))
89 (let ((length-lists (file-field-lengths measure-files)))
90 (dolist (length-list length-lists)
91 (let* ((filename (car length-list))
92 (max-field (cadr length-list))
93 (av-field (caddr length-list))
94 (file (find-umls-file filename)))
96 (if (/= (length max-field) (length (umls-file-fields file)))
97 (format t "Warning: Number of file fields ~A doesn't match length of fields in file structure ~S"
99 (dotimes (i (max (length max-field) (length (umls-file-fields file))))
101 (let* ((field (nth i (umls-file-fields file)))
102 (col (find-umls-col field filename)))
105 (setf (umls-col-max col) (aref max-field i))
106 (setf (umls-col-av col) (aref av-field i))
107 (add-datatype-to-col col (datatype-for-col (umls-col-col col))))
108 (error "can't find column ~A" field)))))))))))
112 ;;; UMLS column/file functions
114 (defun find-col-in-columns (colname filename cols)
115 "Returns list of umls-col structure for a column name and a filename"
117 (when (and (string-equal filename (umls-col-fil col))
118 (string-equal colname (umls-col-col col)))
119 (return-from find-col-in-columns col)))
122 (defun find-or-make-col-in-columns (colname filename cols)
123 (let ((col (find-col-in-columns colname filename cols)))
126 ;; try to find column name without a terminal digit
127 (let* ((last-char (char colname (1- (length colname))))
128 (digit (- (char-code last-char) (char-code #\0))))
129 (if (and (>= digit 0) (<= digit 9))
130 (let ((base-colname (subseq colname 0 (1- (length colname)))))
131 (setq col (find-col-in-columns base-colname filename cols))
133 (let ((new-col (make-umls-col
134 :col (copy-seq colname)
135 :des (copy-seq (umls-col-des col))
136 :ref (copy-seq (umls-col-ref col))
137 :min (umls-col-min col)
138 :max (umls-col-max col)
139 :fil (copy-seq (umls-col-fil col))
140 :sqltype (copy-seq (umls-col-sqltype col))
141 :dty (copy-seq (umls-col-dty col))
142 :parsefunc (umls-col-parsefunc col)
143 :quotechar (copy-seq (umls-col-quotechar col))
144 :datatype (umls-col-datatype col)
145 :custom-value-func (umls-col-custom-value-func col))))
146 (push new-col *umls-cols*)
148 (error "Couldn't find a base column for col ~A in file ~A"
150 (let ((new-col (make-umls-col
151 :col (copy-seq colname)
159 :parsefunc #'add-sql-quotes
162 :custom-value-func nil)))
163 (push new-col *umls-cols*)
166 (defun find-umls-col (colname filename)
167 "Returns list of umls-col structure for a column name and a filename"
168 (find-or-make-col-in-columns colname filename *umls-cols*))
170 (defun find-umls-file (filename)
171 "Returns umls-file structure for a filename"
172 (find-if (lambda (f) (string-equal filename (umls-file-fil f))) *umls-files*))
174 (defun umls-cols-for-umls-file (file)
175 "Returns list of umls-cols for a file structure"
176 (let ((filename (umls-file-fil file)))
177 (mapcar (lambda (col) (find-umls-col col filename))
178 (umls-file-fields file))))
181 ;; SQL command functions
183 (defun create-table-cmd (file)
184 "Return sql command to create a table"
187 (let ((sqltype (umls-col-sqltype c)))
188 (concatenate 'string (umls-col-col c)
190 (if (or (string-equal sqltype "VARCHAR")
191 (string-equal sqltype "CHAR"))
192 (format nil "~a (~a)" sqltype (umls-col-max c))
195 (format nil "CREATE TABLE ~a (~a)" (umls-file-table file)
196 (string-trim-last-character
197 (mapcar-append-string col-func (umls-cols-for-umls-file file))))))
199 (defun create-custom-table-cmd (tablename sql-cmd)
200 "Return SQL command to create a custom table"
201 (format nil "CREATE TABLE ~a AS ~a;" tablename sql-cmd))
203 (defun insert-values-cmd (file values)
204 "Return sql insert command for a row of values"
209 (umls-col-quotechar col)
210 (if (null (umls-col-parsefunc col))
212 (format nil "~A" (funcall (umls-col-parsefunc col) value)))
213 (umls-col-quotechar col)
216 nil "INSERT INTO ~a (~a) VALUES (~a)"
217 (umls-file-table file)
218 (string-trim-last-character
219 (mapcar-append-string (lambda (c) (concatenate 'string c ","))
220 (umls-file-fields file)))
221 (string-trim-last-character
223 (mapcar2-append-string insert-func
224 (remove-custom-cols (umls-file-colstructs file))
226 (custom-col-values (custom-colstructs-for-file file) values "," t)))
229 (defun custom-col-values (colstructs values delim doquote)
230 "Returns string of column values for SQL inserts for custom columns"
232 (dolist (col colstructs)
233 (let* ((func (umls-col-custom-value-func col))
234 (custom-value (funcall func values)))
235 (string-append result
236 (if doquote (umls-col-quotechar col))
237 (escape-backslashes custom-value)
238 (if doquote (umls-col-quotechar col))
242 (defun remove-custom-cols (cols)
243 "Remove custom cols from a list col umls-cols"
244 (remove-if #'umls-col-custom-value-func cols))
246 (defun find-custom-cols-for-filename (filename)
247 (remove-if-not (lambda (x) (string-equal filename (car x))) +custom-cols+))
249 (defun find-custom-col (filename col)
250 (find-if (lambda (x) (and (string-equal filename (car x))
251 (string-equal col (cadr x)))) +custom-cols+))
254 (defun custom-colnames-for-filename (filename)
255 (mapcar #'cadr (find-custom-cols-for-filename filename)))
257 (defun custom-colstructs-for-file (file)
258 (remove-if-not #'umls-col-custom-value-func (umls-file-colstructs file)))
260 (defun noneng-lang-index-files ()
261 (remove-if-not (lambda (f) (and (> (length (umls-file-fil f)) 4)
262 (string-equal (umls-file-fil f) "MRXW." :end1 5)
263 (not (string-equal (umls-file-fil f) "MRXW.ENG"))
264 (not (string-equal (umls-file-fil f) "MRXW.NONENG"))))
267 ;;; SQL Command Functions
269 (defun create-index-cmd (colname tablename length)
270 "Return sql create index command"
271 (format nil "CREATE INDEX ~a ON ~a (~a ~a)"
272 (concatenate 'string tablename "_" colname "_X") tablename colname
273 (if (integerp length)
274 (format nil "(~d)" length)
277 (defun create-all-tables-cmdfile ()
278 "Return sql commands to create all tables. Not need for automated SQL import"
279 (mapcar (lambda (f) (format nil "~a~%~%" (create-table-cmd f))) *umls-files*))
282 ;; SQL Execution functions
284 (defun sql-drop-tables (conn)
285 "SQL Databases: drop all tables"
289 (sql-execute (format nil "DROP TABLE ~a" (umls-file-table file)) conn)))
292 (defun sql-create-tables (conn)
293 "SQL Databases: create all tables"
294 (mapcar (lambda (file) (sql-execute (create-table-cmd file) conn)) *umls-files*))
296 (defun sql-create-custom-tables (conn)
297 "SQL Databases: create all custom tables"
299 (sql-execute (create-custom-table-cmd (car ct) (cadr ct)) conn))
302 (defun sql-insert-values (conn file)
303 "SQL Databases: inserts all values for a file"
304 (with-umls-file (line (umls-file-fil file))
305 (sql-execute (insert-values-cmd file line) conn)))
307 (defun sql-insert-all-values (conn)
308 "SQL Databases: inserts all values for all files"
309 (mapcar (lambda (file) (sql-insert-values conn file)) *umls-files*))
311 (defun sql-create-indexes (conn &optional (indexes +index-cols+))
312 "SQL Databases: create all indexes"
315 (sql-execute (create-index-cmd (car idx) (cadr idx) (caddr idx)) conn))
318 (defun create-umls-db-by-insert ()
319 "SQL Databases: initializes entire database via SQL insert commands"
322 (with-sql-connection (conn)
323 ;; (sql-drop-tables conn)
324 ;; (sql-create-tables conn)
325 ;; (sql-insert-all-values conn)
326 (sql-create-indexes conn)
327 (sql-create-custom-tables conn)
328 (sql-create-indexes conn +custom-index-cols+)))
330 (defun create-umls-db (&optional (extension ".trans")
331 (copy-cmd #'mysql-copy-cmd))
332 "SQL Databases: initializes entire database via SQL copy commands"
335 (translate-all-files extension)
336 (with-sql-connection (conn)
337 (sql-drop-tables conn)
338 (sql-create-tables conn)
340 #'(lambda (file) (sql-execute (funcall copy-cmd file extension) conn))
342 (sql-create-indexes conn)
343 (sql-create-custom-tables conn)
344 (sql-create-indexes conn +custom-index-cols+)))
346 (defun translate-all-files (&optional (extension ".trans"))
347 "Copy translated files and return postgresql copy commands to import"
348 (make-noneng-index-file extension)
349 (mapcar (lambda (f) (translate-file f extension)) *umls-files*))
351 (defun translate-file (file extension)
352 "Translate a umls file into a format suitable for sql copy cmd"
353 (let ((path (umls-pathname (umls-file-fil file) extension)))
354 (if (probe-file path)
356 (format t "File ~A already exists: skipping~%" path)
358 (with-open-file (ostream path :direction :output)
359 (with-umls-file (line (umls-file-fil file))
360 (princ (umls-translate file line) ostream)
361 (princ #\newline ostream))
364 (defun make-noneng-index-file (extension)
365 "Make non-english index file"
366 (let* ((outfile (find-umls-file "MRXW.NONENG"))
367 (path (umls-pathname (umls-file-fil outfile) extension)))
369 (if (probe-file path)
371 (format t "File ~A already exists: skipping~%" path)
374 (with-open-file (ostream path :direction :output)
375 (dolist (inputfile (noneng-lang-index-files))
376 (with-umls-file (line (umls-file-fil inputfile))
377 (princ (umls-translate outfile line) ostream) ;; use outfile for custom cols
378 (princ #\newline ostream))))
381 (defun pg-copy-cmd (file extension)
382 "Return postgresql copy statement for a file"
383 (format nil "COPY ~a FROM '~a' using delimiters '|' with null as ''"
384 (umls-file-table file) (umls-pathname (umls-file-fil file) extension)))
386 (defun mysql-copy-cmd (file extension)
387 "Return mysql copy statement for a file"
388 (format nil "LOAD DATA LOCAL INFILE \"~a\" INTO TABLE ~a FIELDS TERMINATED BY \"|\""
389 (umls-pathname (umls-file-fil file) extension) (umls-file-table file)))
391 (defun umls-translate (file line)
392 "Translate a single line for sql output"
393 (string-trim-last-character
395 (mapcar2-append-string
399 (if (eq (umls-col-datatype col) 'sql-u)
400 (format nil "~d" (parse-ui value ""))
401 (escape-backslashes value))
403 (remove-custom-cols (umls-file-colstructs file))
405 (custom-col-values (custom-colstructs-for-file file) line "|" nil))))
408 (defun umls-fixed-size-waste ()
409 "Display storage waste if using all fixed size storage"
415 (dolist (file *umls-files*)
416 (dolist (col (umls-file-colstructs file))
417 (let* ((avwaste (- (umls-col-max col) (umls-col-av col)))
418 (cwaste (* avwaste (umls-file-rws file))))
419 (unless (zerop cwaste)
422 (incf totalunavoidable cwaste)
423 (setq unavoidable (append unavoidable (list (list (umls-file-fil file) (umls-col-col col) avwaste cwaste)))))
425 (incf totalavoidable cwaste)
426 (setq avoidable (append avoidable (list (list (umls-file-fil file) (umls-col-col col) avwaste cwaste))))))
427 (incf totalwaste cwaste)))))
428 (values totalwaste totalavoidable totalunavoidable avoidable unavoidable)))
430 (defun display-waste ()
433 (multiple-value-bind (tw ta tu al ul) (umls-fixed-size-waste)
434 (format t "Total waste: ~d~%" tw)
435 (format t "Total avoidable: ~d~%" ta)
436 (format t "Total unavoidable: ~d~%" tu)
437 (format t "Avoidable:~%")
439 (format t " (~a,~a): ~a,~a~%" (car w) (cadr w) (caddr w) (cadddr w)))
440 (format t "Unavoidable:~%")
442 (format t " (~a,~a): ~a,~a~%" (car w) (cadr w) (caddr w) (cadddr w)))
445 (defun max-umls-field ()
446 "Return length of longest field"
450 (declare (fixnum max))
451 (dolist (col *umls-cols*)
452 (when (> (umls-col-max col) max)
453 (setq max (umls-col-max col))))
456 (defun max-umls-row ()
457 "Return length of longest row"
459 6000 ;;; hack to use on systems without MRCOLS/MRFILES -- ok for UMLS2001
463 (let ((rowsizes '()))
464 (dolist (file *umls-files*)
466 (fields (umls-file-colstructs file)))
467 (dolist (field fields)
468 (incf row (1+ (umls-col-max field))))
469 (push row rowsizes)))
470 (car (sort rowsizes #'>))))))