2 ;;; General purpose Lisp Routines for parsing UMLS files
3 ;;; and inserting into SQL databases
5 ;;; Copyright (c) 2001 Kevin M. Rosenberg, M.D.
6 ;;; $Id: parse-common.lisp,v 1.1 2002/10/05 20:17:14 kevin Exp $
10 (defun umls-pathname (filename &optional (extension ""))
11 "Return pathname for a umls filename"
15 (make-pathname :name (concatenate 'string filename extension))
16 (case (char filename 0)
28 (defun read-umls-line (strm)
29 "Read a line from a UMLS stream, split into fields"
30 (let ((line (read-line strm nil 'eof)))
31 (if (stringp line) ;; ensure not 'eof
32 (let* ((len (length line))
33 (maybe-remove-terminal ;; LRWD doesn't have '|' at end of line
34 (if (char= #\| (char line (1- len)))
35 (subseq line 0 (1- len))
37 (declare (fixnum len))
38 (delimited-string-to-list maybe-remove-terminal #\|))
42 ;;; Find field lengths for LEX and NET files
44 (defun file-field-lengths (files)
47 (setq file (umls-file-fil file))
48 (let (max-field count-field num-fields (count-lines 0))
49 (with-umls-file (fields file)
51 (setq num-fields (length fields))
52 (setq max-field (make-array num-fields :element-type 'fixnum
54 (setq count-field (make-array num-fields :element-type 'number
56 (dotimes (i (length fields))
58 (let ((len (length (nth i fields))))
59 (incf (aref count-field i) len)
60 (when (> len (aref max-field i))
61 (setf (aref max-field i) len))))
63 (dotimes (i num-fields)
64 (setf (aref count-field i) (float (/ (aref count-field i) count-lines))))
65 (push (list file max-field count-field) lengths)))
68 (defun init-field-lengths ()
69 "Initial colstruct field lengths for files that don't have a measurement.
70 Currently, these are the LEX and NET files."
71 (let ((measure-files '()))
72 (dolist (file *umls-files*)
73 (let ((filename (umls-file-fil file)))
74 (unless (or (char= #\M (char filename 0))
75 (char= #\m (char filename 0)))
76 (push file measure-files))))
77 (let ((length-lists (file-field-lengths measure-files)))
78 (dolist (length-list length-lists)
79 (let* ((filename (car length-list))
80 (max-field (cadr length-list))
81 (av-field (caddr length-list))
82 (file (find-umls-file filename)))
84 (if (/= (length max-field) (length (umls-file-fields file)))
85 (format t "Warning: Number of file fields ~A doesn't match length of fields in file structure ~S"
87 (dotimes (i (max (length max-field) (length (umls-file-fields file))))
89 (let* ((field (nth i (umls-file-fields file)))
90 (col (find-umls-col field filename)))
93 (setf (umls-col-max col) (aref max-field i))
94 (setf (umls-col-av col) (aref av-field i))
95 (add-datatype-to-col col (datatype-for-col (umls-col-col col))))
96 (error "can't find column ~A" field)))))))))))
100 ;;; UMLS column/file functions
102 (defun find-col-in-columns (colname filename cols)
103 "Returns list of umls-col structure for a column name and a filename"
105 (when (and (string-equal filename (umls-col-fil col))
106 (string-equal colname (umls-col-col col)))
107 (return-from find-col-in-columns col)))
110 (defun find-or-make-col-in-columns (colname filename cols)
111 (let ((col (find-col-in-columns colname filename cols)))
114 ;; try to find column name without a terminal digit
115 (let* ((last-char (char colname (1- (length colname))))
116 (digit (- (char-code last-char) (char-code #\0))))
117 (if (and (>= digit 0) (<= digit 9))
118 (let ((base-colname (subseq colname 0 (1- (length colname)))))
119 (setq col (find-col-in-columns base-colname filename cols))
121 (let ((new-col (make-umls-col
122 :col (copy-seq colname)
123 :des (copy-seq (umls-col-des col))
124 :ref (copy-seq (umls-col-ref col))
125 :min (umls-col-min col)
126 :max (umls-col-max col)
127 :fil (copy-seq (umls-col-fil col))
128 :sqltype (copy-seq (umls-col-sqltype col))
129 :dty (copy-seq (umls-col-dty col))
130 :parsefunc (umls-col-parsefunc col)
131 :quotechar (copy-seq (umls-col-quotechar col))
132 :datatype (umls-col-datatype col)
133 :custom-value-func (umls-col-custom-value-func col))))
134 (push new-col *umls-cols*)
136 (error "Couldn't find a base column for col ~A in file ~A"
138 (let ((new-col (make-umls-col
139 :col (copy-seq colname)
147 :parsefunc #'add-sql-quotes
150 :custom-value-func nil)))
151 (push new-col *umls-cols*)
154 (defun find-umls-col (colname filename)
155 "Returns list of umls-col structure for a column name and a filename"
156 (find-or-make-col-in-columns colname filename *umls-cols*))
158 (defun find-umls-file (filename)
159 "Returns umls-file structure for a filename"
160 (find-if (lambda (f) (string-equal filename (umls-file-fil f))) *umls-files*))
162 (defun umls-cols-for-umls-file (file)
163 "Returns list of umls-cols for a file structure"
164 (let ((filename (umls-file-fil file)))
165 (mapcar (lambda (col) (find-umls-col col filename))
166 (umls-file-fields file))))
169 ;; SQL command functions
171 (defun create-table-cmd (file)
172 "Return sql command to create a table"
175 (let ((sqltype (umls-col-sqltype c)))
176 (concatenate 'string (umls-col-col c)
178 (if (or (string-equal sqltype "VARCHAR")
179 (string-equal sqltype "CHAR"))
180 (format nil "~a (~a)" sqltype (umls-col-max c))
183 (format nil "CREATE TABLE ~a (~a)" (umls-file-table file)
184 (string-trim-last-character
185 (mapcar-append-string col-func (umls-cols-for-umls-file file))))))
187 (defun create-custom-table-cmd (tablename sql-cmd)
188 "Return SQL command to create a custom table"
189 (format nil "CREATE TABLE ~a AS ~a;" tablename sql-cmd))
191 (defun insert-values-cmd (file values)
192 "Return sql insert command for a row of values"
197 (umls-col-quotechar col)
198 (if (null (umls-col-parsefunc col))
200 (format nil "~A" (funcall (umls-col-parsefunc col) value)))
201 (umls-col-quotechar col)
204 nil "INSERT INTO ~a (~a) VALUES (~a)"
205 (umls-file-table file)
206 (string-trim-last-character
207 (mapcar-append-string (lambda (c) (concatenate 'string c ","))
208 (umls-file-fields file)))
209 (string-trim-last-character
211 (mapcar2-append-string insert-func
212 (remove-custom-cols (umls-file-colstructs file))
214 (custom-col-values (custom-colstructs-for-file file) values "," t)))
217 (defun custom-col-values (colstructs values delim doquote)
218 "Returns string of column values for SQL inserts for custom columns"
220 (dolist (col colstructs)
221 (let* ((func (umls-col-custom-value-func col))
222 (custom-value (funcall func values)))
223 (string-append result
224 (if doquote (umls-col-quotechar col))
225 (escape-backslashes custom-value)
226 (if doquote (umls-col-quotechar col))
230 (defun remove-custom-cols (cols)
231 "Remove custom cols from a list col umls-cols"
232 (remove-if #'umls-col-custom-value-func cols))
234 (defun find-custom-cols-for-filename (filename)
235 (remove-if-not (lambda (x) (string-equal filename (car x))) +custom-cols+))
237 (defun find-custom-col (filename col)
238 (find-if (lambda (x) (and (string-equal filename (car x))
239 (string-equal col (cadr x)))) +custom-cols+))
242 (defun custom-colnames-for-filename (filename)
243 (mapcar #'cadr (find-custom-cols-for-filename filename)))
245 (defun custom-colstructs-for-file (file)
246 (remove-if-not #'umls-col-custom-value-func (umls-file-colstructs file)))
248 (defun noneng-lang-index-files ()
249 (remove-if-not (lambda (f) (and (> (length (umls-file-fil f)) 4)
250 (string-equal (umls-file-fil f) "MRXW." :end1 5)
251 (not (string-equal (umls-file-fil f) "MRXW.ENG"))
252 (not (string-equal (umls-file-fil f) "MRXW.NONENG"))))
255 ;;; SQL Command Functions
257 (defun create-index-cmd (colname tablename length)
258 "Return sql create index command"
259 (format nil "CREATE INDEX ~a ON ~a (~a ~a)"
260 (concatenate 'string tablename "_" colname "_X") tablename colname
261 (if (integerp length)
262 (format nil "(~d)" length)
265 (defun create-all-tables-cmdfile ()
266 "Return sql commands to create all tables. Not need for automated SQL import"
267 (mapcar (lambda (f) (format nil "~a~%~%" (create-table-cmd f))) *umls-files*))
270 ;; SQL Execution functions
272 (defun sql-drop-tables (conn)
273 "SQL Databases: drop all tables"
277 (sql-execute (format nil "DROP TABLE ~a" (umls-file-table file)) conn)))
280 (defun sql-create-tables (conn)
281 "SQL Databases: create all tables"
282 (mapcar (lambda (file) (sql-execute (create-table-cmd file) conn)) *umls-files*))
284 (defun sql-create-custom-tables (conn)
285 "SQL Databases: create all custom tables"
287 (sql-execute (create-custom-table-cmd (car ct) (cadr ct)) conn))
290 (defun sql-insert-values (conn file)
291 "SQL Databases: inserts all values for a file"
292 (with-umls-file (line (umls-file-fil file))
293 (sql-execute (insert-values-cmd file line) conn)))
295 (defun sql-insert-all-values (conn)
296 "SQL Databases: inserts all values for all files"
297 (mapcar (lambda (file) (sql-insert-values conn file)) *umls-files*))
299 (defun sql-create-indexes (conn &optional (indexes +index-cols+))
300 "SQL Databases: create all indexes"
303 (sql-execute (create-index-cmd (car idx) (cadr idx) (caddr idx)) conn))
306 (defun create-umls-db-by-insert ()
307 "SQL Databases: initializes entire database via SQL insert commands"
310 (with-sql-connection (conn)
311 ;; (sql-drop-tables conn)
312 ;; (sql-create-tables conn)
313 ;; (sql-insert-all-values conn)
314 (sql-create-indexes conn)
315 (sql-create-custom-tables conn)
316 (sql-create-indexes conn +custom-index-cols+)))
318 (defun create-umls-db (&optional (extension ".trans")
319 (copy-cmd #'mysql-copy-cmd))
320 "SQL Databases: initializes entire database via SQL copy commands"
323 (translate-all-files extension)
324 (with-sql-connection (conn)
325 (sql-drop-tables conn)
326 (sql-create-tables conn)
328 #'(lambda (file) (sql-execute (funcall copy-cmd file extension) conn))
330 (sql-create-indexes conn)
331 (sql-create-custom-tables conn)
332 (sql-create-indexes conn +custom-index-cols+)))
334 (defun translate-all-files (&optional (extension ".trans"))
335 "Copy translated files and return postgresql copy commands to import"
336 (make-noneng-index-file extension)
337 (mapcar (lambda (f) (translate-file f extension)) *umls-files*))
339 (defun translate-file (file extension)
340 "Translate a umls file into a format suitable for sql copy cmd"
341 (let ((path (umls-pathname (umls-file-fil file) extension)))
342 (if (probe-file path)
344 (format t "File ~A already exists: skipping~%" path)
346 (with-open-file (ostream path :direction :output)
347 (with-umls-file (line (umls-file-fil file))
348 (princ (umls-translate file line) ostream)
349 (princ #\newline ostream))
352 (defun make-noneng-index-file (extension)
353 "Make non-english index file"
354 (let* ((outfile (find-umls-file "MRXW.NONENG"))
355 (path (umls-pathname (umls-file-fil outfile) extension)))
357 (if (probe-file path)
359 (format t "File ~A already exists: skipping~%" path)
362 (with-open-file (ostream path :direction :output)
363 (dolist (inputfile (noneng-lang-index-files))
364 (with-umls-file (line (umls-file-fil inputfile))
365 (princ (umls-translate outfile line) ostream) ;; use outfile for custom cols
366 (princ #\newline ostream))))
369 (defun pg-copy-cmd (file extension)
370 "Return postgresql copy statement for a file"
371 (format nil "COPY ~a FROM '~a' using delimiters '|' with null as ''"
372 (umls-file-table file) (umls-pathname (umls-file-fil file) extension)))
374 (defun mysql-copy-cmd (file extension)
375 "Return mysql copy statement for a file"
376 (format nil "LOAD DATA LOCAL INFILE \"~a\" INTO TABLE ~a FIELDS TERMINATED BY \"|\""
377 (umls-pathname (umls-file-fil file) extension) (umls-file-table file)))
379 (defun umls-translate (file line)
380 "Translate a single line for sql output"
381 (string-trim-last-character
383 (mapcar2-append-string
387 (if (eq (umls-col-datatype col) 'sql-u)
388 (format nil "~d" (parse-ui value ""))
389 (escape-backslashes value))
391 (remove-custom-cols (umls-file-colstructs file))
393 (custom-col-values (custom-colstructs-for-file file) line "|" nil))))
396 (defun umls-fixed-size-waste ()
397 "Display storage waste if using all fixed size storage"
403 (dolist (file *umls-files*)
404 (dolist (col (umls-file-colstructs file))
405 (let* ((avwaste (- (umls-col-max col) (umls-col-av col)))
406 (cwaste (* avwaste (umls-file-rws file))))
407 (unless (zerop cwaste)
410 (incf totalunavoidable cwaste)
411 (setq unavoidable (append unavoidable (list (list (umls-file-fil file) (umls-col-col col) avwaste cwaste)))))
413 (incf totalavoidable cwaste)
414 (setq avoidable (append avoidable (list (list (umls-file-fil file) (umls-col-col col) avwaste cwaste))))))
415 (incf totalwaste cwaste)))))
416 (values totalwaste totalavoidable totalunavoidable avoidable unavoidable)))
418 (defun display-waste ()
421 (multiple-value-bind (tw ta tu al ul) (umls-fixed-size-waste)
422 (format t "Total waste: ~d~%" tw)
423 (format t "Total avoidable: ~d~%" ta)
424 (format t "Total unavoidable: ~d~%" tu)
425 (format t "Avoidable:~%")
427 (format t " (~a,~a): ~a,~a~%" (car w) (cadr w) (caddr w) (cadddr w)))
428 (format t "Unavoidable:~%")
430 (format t " (~a,~a): ~a,~a~%" (car w) (cadr w) (caddr w) (cadddr w)))
433 (defun max-umls-field ()
434 "Return length of longest field"
438 (declare (fixnum max))
439 (dolist (col *umls-cols*)
440 (when (> (umls-col-max col) max)
441 (setq max (umls-col-max col))))
444 (defun max-umls-row ()
445 "Return length of longest row"
447 6000 ;;; hack to use on systems without MRCOLS/MRFILES -- ok for UMLS2001
451 (let ((rowsizes '()))
452 (dolist (file *umls-files*)
454 (fields (umls-file-colstructs file)))
455 (dolist (field fields)
456 (incf row (1+ (umls-col-max field))))
457 (push row rowsizes)))
458 (car (sort rowsizes #'>))))))