1 ;;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; Package: umlisp -*-
2 ;;;; *************************************************************************
3 ;;;; FILE IDENTIFICATION
5 ;;;; Name: parse-2002.lisp
6 ;;;; Purpose: Parsing and SQL insertion routines for UMLisp which may
7 ;;;; change from year to year
8 ;;;; Programmer: Kevin M. Rosenberg
9 ;;;; Date Started: Apr 2000
11 ;;;; $Id: parse-2002.lisp,v 1.5 2002/11/10 22:39:15 kevin Exp $
13 ;;;; This file, part of UMLisp, is
14 ;;;; Copyright (c) 2000-2002 by Kevin M. Rosenberg, M.D.
16 ;;;; UMLisp users are granted the rights to distribute and use this software
17 ;;;; as governed by the terms of the GNU General Public License.
18 ;;;; *************************************************************************
21 (declaim (optimize (speed 3) (safety 1) (compilation-speed 0) (debug 3)))
23 ;;; Pre-read data for custom fields into hash tables
24 (defvar *parse-hash-init?* nil)
26 (eval-when (:compile-toplevel :load-toplevel :execute)
27 (let ((pfstr-hash nil) ;;; Preferred concept strings by CUI
28 (cui-lrl-hash nil) ;;; LRL by CUI
29 (lui-lrl-hash nil) ;;; LRL by LUI
30 (cuisui-lrl-hash nil) ;;; LRL by CUISUI
31 (sab-srl-hash nil)) ;;; SRL by SAB
33 (defun make-parse-hash-table ()
37 (clrhash cui-lrl-hash)
38 (clrhash lui-lrl-hash)
39 (clrhash cuisui-lrl-hash)
40 (clrhash sab-srl-hash))
42 pfstr-hash (make-hash-table :size 800000)
43 cui-lrl-hash (make-hash-table :size 800000)
44 lui-lrl-hash (make-hash-table :size 1500000)
45 cuisui-lrl-hash (make-hash-table :size 1800000)
46 sab-srl-hash (make-hash-table :size 100 :test 'equal))))
48 (defun binit-hash-table (&optional (force-read nil))
49 (when (or force-read (not *parse-hash-init?*))
50 (make-parse-hash-table)
51 (setq *parse-hash-init?* t))
52 (with-buffered-umls-file (line "MRCON")
53 (let ((cui (parse-ui (aref line 0)))
54 (lui (parse-ui (aref line 3)))
55 (sui (parse-ui (aref line 5)))
56 (lrl (parse-integer (aref line 7))))
57 (unless (gethash cui pfstr-hash) ;; if haven't stored pfstr for cui
58 (if (and (string-equal (aref line 1) "ENG") ; LAT
59 (string-equal (aref line 2) "P") ; ts
60 (string-equal (aref line 4) "PF")) ; stt
61 (setf (gethash cui pfstr-hash) (aref line 6))))
62 (set-lrl-hash cui lrl cui-lrl-hash)
63 (set-lrl-hash lui lrl lui-lrl-hash)
64 (set-lrl-hash (make-cuisui cui sui) lrl cuisui-lrl-hash)))
65 (with-buffered-umls-file (line "MRSO")
66 (let ((sab (aref line 3)))
67 (unless (gethash sab sab-srl-hash) ;; if haven't stored
68 (setf (gethash sab sab-srl-hash) (aref line 6))))))
70 (defun init-hash-table (&optional (force-read nil))
71 (when (or force-read (not *parse-hash-init?*))
72 (make-parse-hash-table)
73 (setq *parse-hash-init?* t))
74 (with-umls-file (line "MRCON")
75 (let ((cui (parse-ui (nth 0 line)))
76 (lui (parse-ui (nth 3 line)))
77 (sui (parse-ui (nth 5 line)))
78 (lrl (parse-integer (nth 7 line))))
79 (unless (gethash cui pfstr-hash) ;; if haven't stored pfstr for cui
80 (if (and (string-equal (nth 1 line) "ENG") ; LAT
81 (string-equal (nth 2 line) "P") ; ts
82 (string-equal (nth 4 line) "PF")) ; stt
83 (setf (gethash cui pfstr-hash) (nth 6 line))))
84 (set-lrl-hash cui lrl cui-lrl-hash)
85 (set-lrl-hash lui lrl lui-lrl-hash)
86 (set-lrl-hash (make-cuisui cui sui) lrl cuisui-lrl-hash)))
87 (with-umls-file (line "MRSO")
88 (let ((sab (nth 3 line)))
89 (multiple-value-bind (val found) (gethash sab sab-srl-hash)
90 (declare (ignore val))
92 (setf (gethash sab sab-srl-hash) (parse-integer (nth 6 line))))))))
94 (defun pfstr-hash (cui)
95 (gethash cui pfstr-hash))
98 (gethash cui cui-lrl-hash))
101 (gethash lui lui-lrl-hash))
103 (defun cuisui-lrl (cuisui)
104 (gethash cuisui cuisui-lrl-hash))
107 (kmrcl:aif (gethash sab sab-srl-hash) kmrcl::it 0))
110 (defun set-lrl-hash (key lrl hash)
111 "Set the least restrictive level in hash table"
112 (multiple-value-bind (hash-lrl found) (gethash key hash)
113 (if (or (not found) (< lrl hash-lrl))
114 (setf (gethash key hash) lrl))))
116 ;; UMLS file and column structures
118 (defstruct (umls-file)
119 "Record for each UMLS File"
120 fil table des fmt cls rws bts fields colstructs)
122 (defstruct (umls-col)
123 "Record for each UMLS Column in each file"
124 col des ref min av max fil sqltype
125 dty ;; new in 2002 umls: suggested SQL datatype
126 parsefunc quotechar datatype custom-value-func)
128 ;;; SQL datatypes symbols
129 ;;; sql-u - Unique identifier
130 ;;; sql-s - Small integer (16-bit)
131 ;;; sql-i - Integer (32-bit)
132 ;;; sql-l - Big integer (64-bit)
133 ;;; sql-f - Floating point
135 (defparameter +col-datatypes+
136 '(("AV" sql-f) ("BTS" sql-i) ("CLS" sql-i) ("COF" sql-i) ("CUI1" sql-u)
137 ("CUI2" sql-u) ("CUI" sql-u) ("CXN" sql-s) ("FR" sql-i) ("LRL" sql-s)
138 ("LUI" sql-u) ("MAX" sql-s) ("MIN" sql-s) ("RANK" sql-s) ("REF" sql-s)
139 ("RNK" sql-s) ("RWS" sql-i) ("SRL" sql-s) ("SUI" sql-u) ("TUI" sql-u)
141 ("KCUISUI" sql-l) ("KCUILUI" sql-l) ("KCUILRL" sql-i) ("KLUILRL" sql-i)
142 ("KSRL" sql-i) ("KLRL" sql-i)
144 ("EUI" sql-u) ("EUI2" sql-u)
145 ;;; Semantic net columns
146 ("UI" sql-u) ("UI2" sql-u) ("UI3" sql-u)
147 ;; New fields for 2002AD
148 ("RCUI" sql-u) ("VCUI" sql-u) ("CFR" sql-i) ("TFR" sql-i)
150 "SQL data types for each non-string column")
152 (defparameter +custom-tables+
155 '(("MRCONSO" "SELECT m.CUI, m.LAT, m.TS, m.LUI, m.STT, m.SUI, m.STR, m.LRL, s.SAB, s.TTY, s.SCD, s.SRL FROM MRCON m, MRSO s WHERE m.CUI=s.CUI AND m.LUI=s.LUI AND m.SUI=s.SUI")
156 ("MRCONFULL" "SELECT m.CUI, m.LAT, m.TS, m.LUI, m.STT, m.SUI, m.STR, m.LRL, s.SAB, s.TTY, s.SCD, s.SRL, t.TUI FROM MRCON m, MRSO s, MRSTY t WHERE m.CUI=s.CUI AND m.LUI=s.LUI AND m.SUI=s.SUI AND m.CUI=t.CUI AND s.CUI=t.CUI"))
157 "Custom tables to create")
159 (defparameter +custom-cols+
160 '(("MRCON" "KPFSTR" "TEXT" 1024
161 (lambda (x) (pfstr-hash (parse-ui (nth 0 x)))))
162 ("MRCON" "KCUISUI" "BIGINT" 0
163 (lambda (x) (format nil "~d" (make-cuisui (parse-ui (nth 0 x)) (parse-ui (nth 5 x))))))
164 ("MRCON" "KCUILUI" "BIGINT" 0
165 (lambda (x) (format nil "~d" (make-cuilui (parse-ui (nth 0 x)) (parse-ui (nth 3 x))))))
166 ("MRCON" "KCUILRL" "INTEGER" 0
167 (lambda (x) (format nil "~d" (cui-lrl (parse-ui (nth 0 x))))))
168 ("MRCON" "KLUILRL" "INTEGER" 0
169 (lambda (x) (format nil "~d" (lui-lrl (parse-ui (nth 3 x))))))
170 ("MRLO" "KLRL" "INTEGER" 0
171 (lambda (x) (format nil "~d"
172 (if (zerop (length (nth 4 x)))
173 (cui-lrl (parse-ui (nth 0 x)))
174 (cuisui-lrl (make-cuisui (parse-ui (nth 0 x)) (parse-ui (nth 4 x))))))))
175 ("MRSTY" "KLRL" "INTEGER" 0
176 (lambda (x) (format nil "~d" (cui-lrl (parse-ui (nth 0 x))))))
177 ("MRCOC" "KLRL" "INTEGER" 0
178 (lambda (x) (format nil "~d"
179 (max (cui-lrl (parse-ui (nth 0 x)))
180 (kmrcl:aif (cui-lrl (parse-ui (nth 1 x))) kmrcl::it 0)))))
181 ("MRSAT" "KSRL" "INTEGER" 0
182 (lambda (x) (format nil "~d" (sab-srl (nth 5 x)))))
183 ("MRREL" "KSRL" "INTEGER" 0
184 (lambda (x) (format nil "~d" (sab-srl (nth 4 x)))))
185 ("MRRANK" "KSRL" "INTEGER" 0
186 (lambda (x) (format nil "~d" (sab-srl (nth 1 x)))))
187 ("MRDEF" "KSRL" "INTEGER" 0
188 (lambda (x) (format nil "~d" (sab-srl (nth 1 x)))))
189 ("MRCXT" "KSRL" "INTEGER" 0
190 (lambda (x) (format nil "~d" (sab-srl (nth 2 x)))))
191 ("MRATX" "KSRL" "INTEGER" 0
192 (lambda (x) (format nil "~d" (sab-srl (nth 1 x)))))
193 ("MRXW.ENG" "KLRL" "INTEGER" 0
194 (lambda (x) (format nil "~d" (cuisui-lrl (make-cuisui
196 (parse-ui (nth 4 x)))))))
197 ("MRXW.NONENG" "KLRL" "INTEGER" 0
198 (lambda (x) (format nil "~d" (cuisui-lrl (make-cuisui
200 (parse-ui (nth 4 x)))))))
201 ("MRXNW.ENG" "KLRL" "INTEGER" 0
202 (lambda (x) (format nil "~d" (cuisui-lrl (make-cuisui
204 (parse-ui (nth 4 x)))))))
205 ("MRXNS.ENG" "KLRL" "INTEGER" 0
206 (lambda (x) (format nil "~d" (cuisui-lrl (make-cuisui
208 (parse-ui (nth 4 x)))))))
209 ("MRREL" "KPFSTR2" "TEXT" 1024
210 (lambda (x) (pfstr-hash (parse-ui (nth 2 x)))))
211 ("MRCOC" "KPFSTR2" "TEXT" 1024
212 (lambda (x) (pfstr-hash (parse-ui (nth 1 x)))))
213 ("MRCXT" "KCUISUI" "BIGINT" 0
214 (lambda (x) (format nil "~d" (make-cuisui (parse-ui (nth 0 x)) (parse-ui (nth 1 x))))))
215 ("MRSAT" "KCUILUI" "BIGINT" 0
216 (lambda (x) (format nil "~d" (make-cuilui (parse-ui (nth 0 x)) (parse-ui (nth 1 x))))))
217 ("MRSAT" "KCUISUI" "BIGINT" 0
218 (lambda (x) (format nil "~d" (make-cuisui (parse-ui (nth 0 x)) (parse-ui (nth 2 x))))))
219 ("MRSO" "KCUISUI" "BIGINT" 0
220 (lambda (x) (format nil "~d" (make-cuisui (parse-ui (nth 0 x)) (parse-ui (nth 2 x))))))
221 ("MRXW.ENG" "KCUISUI" "BIGINT" 0
222 (lambda (x) (format nil "~d" (make-cuisui (parse-ui (nth 2 x)) (parse-ui (nth 4 x))))))
223 ("MRXNW.ENG" "KCUISUI" "BIGINT" 0
224 (lambda (x) (format nil "~d" (make-cuisui (parse-ui (nth 2 x)) (parse-ui (nth 4 x))))))
225 ("MRXNS.ENG" "KCUISUI" "BIGINT" 0
226 (lambda (x) (format nil "~d" (make-cuisui (parse-ui (nth 2 x)) (parse-ui (nth 4 x))))))
227 ("MRXW.NONENG" "LAT" "CHAR" 3 (lambda (x) (nth 0 x)))
228 ("MRXW.NONENG" "WD" "CHAR" 200 (lambda (x) (nth 1 x)))
229 ("MRXW.NONENG" "CUI" "INTEGER" 0 (lambda (x) (nth 2 x)))
230 ("MRXW.NONENG" "LUI" "INTEGER" 0 (lambda (x) (nth 3 x)))
231 ("MRXW.NONENG" "SUI" "INTEGER" 0 (lambda (x) (nth 4 x)))
232 ("MRXW.NONENG" "KCUISUI" "BIGINT" 0
233 (lambda (x) (format nil "~d" (make-cuisui (parse-ui (nth 2 x)) (parse-ui (nth 4 x)))))))
234 "Custom columns to create.(filename, col, sqltype, value-func).")
236 (defparameter +index-cols+
237 '(("CUI" "MRATX") ("CUI1" "MRCOC") ("CUI" "MRCON") ("LUI" "MRCON")
239 ("SUI" "MRCON") ("CUI" "MRCXT") ("CUI" "MRDEF") ("CUI" "MRLO")
240 ("CUI1" "MRREL") ("CUI" "MRSAT") ("LUI" "MRSAT") ("SUI" "MRSAT")
241 ("CUI" "MRSO") ("SAB" "MRSO") ("SRL" "MRSO") ("CUI" "MRSTY")
242 ("TUI" "MRSTY") ("CUI" "MRXNS_ENG") ("NSTR" "MRXNS_ENG" 10)
243 ("CUI" "MRXNW_ENG") ("NWD" "MRXNW_ENG") ("WD" "MRXW_ENG")
244 ("KCUISUI" "MRCON") ("KCUILUI" "MRCON") ("KCUILRL" "MRCON")
245 ("KLUILRL" "MRCON") ("KCUISUI" "MRCXT")
246 ("KCUISUI" "MRSO") ("KCUISUI" "MRSAT") ("KCUILUI" "MRSAT")
247 ("KCUISUI" "MRXW_ENG") ("KCUISUI" "MRXNW_ENG")
248 ("KCUISUI" "MRXNS_ENG") ("KCUISUI" "MRXW_NONENG")
249 ("KSRL" "MRATX") ("KSRL" "MRCXT") ("KSRL" "MRDEF") ("KSRL" "MRRANK")
250 ("KSRL" "MRREL") ("KSRL" "MRSAT") ("KLRL" "MRCOC")
251 ("KLRL" "MRLO") ("KLRL" "MRSTY") ("KLRL" "MRXW_ENG") ("KLRL" "MRXNW_ENG")
252 ("KLRL" "MRXNS_ENG") ("KLRL" "MRXW_NONENG")
254 ("EUI" "LRABR") ("EUI2" "LRABR") ("EUI" "LRAGR") ("EUI" "LRCMP") ("EUI" "LRMOD")
255 ("EUI" "LRNOM") ("EUI2" "LRNOM") ("EUI" "LRPRN") ("EUI" "LRPRP") ("EUI" "LRSPL")
256 ("EUI" "LRTRM") ("EUI" "LRTYP") ("EUI" "LRWD") ("WRD" "LRWD")
258 ;; Semantic NET indices
259 ("UI" "SRSTRE1") ("UI2" "SRSTRE1") ("UI3" "SRSTRE1")
260 ("STY_RL" "SRDEF") ("RT" "SRDEF") ("STY_RL" "SRSTR") ("STY_RL2" "SRSTR")
262 ("SRL" "MRSAB") ("RSAB" "MRSAB") ("VSAB" "MRSAB") ("RCUI" "MRSAB")
263 ("VCUI" "MRSAB") ("LAT" "MRSAB"))
264 "Columns in files to index")
266 (defparameter +custom-index-cols+
269 '(("CUI" "MRCONFULL") ("SAB" "MRCONFULL") ("TUI" "MRCONFULL"))
270 "Indexes to custom tables")
272 ;; File & Column functions
274 (defun init-umls (&optional (alwaysclear nil))
275 "Initialize all UMLS file and column structures if not already initialized"
276 (when (or alwaysclear (null *umls-files*))
279 (init-field-lengths)))
281 (defun init-umls-cols ()
282 (setq *umls-cols* (append
285 (init-generic-cols "LRFLD")
286 (init-generic-cols "SRFLD"))))
288 (defun init-meta-cols ()
289 "Initialize all umls columns"
291 (with-umls-file (line "MRCOLS")
292 (destructuring-bind (col des ref min av max fil dty) line
293 (let ((c (make-umls-col
297 :min (parse-integer min)
298 :av (read-from-string av)
299 :max (parse-integer max)
301 :dty dty ;; new in 2002 UMLS
302 :sqltype "VARCHAR" ; default data type
303 :parsefunc #'add-sql-quotes
304 :custom-value-func nil
306 (add-datatype-to-col c (datatype-for-col col))
310 (defun init-custom-cols ()
311 "Initialize umls columns for custom columns"
313 (dolist (customcol +custom-cols+)
314 (let ((c (make-umls-col :col (nth 1 customcol)
318 :max (nth 3 customcol)
321 :fil (nth 0 customcol)
322 :sqltype (nth 2 customcol)
323 :parsefunc #'add-sql-quotes
324 :custom-value-func (nth 4 customcol)
326 (add-datatype-to-col c (datatype-for-col (nth 1 customcol)))
330 (defun escape-column-name (name)
331 (substitute #\_ #\/ name))
333 (defun init-generic-cols (col-filename)
334 "Initialize for generic (LEX/NET) columns"
336 (with-umls-file (line col-filename)
337 (destructuring-bind (nam des ref fil) line
338 (setq nam (escape-column-name nam))
339 (dolist (file (delimited-string-to-list fil #\,))
340 (let ((c (make-umls-col
349 :sqltype "VARCHAR" ; default data type
350 :parsefunc #'add-sql-quotes
351 :custom-value-func nil
353 (add-datatype-to-col c (datatype-for-col nam))
357 (defun init-umls-files ()
358 (setq *umls-files* (append
359 (init-generic-files "MRFILES")
360 (init-generic-files "LRFIL")
361 (init-generic-files "SRFIL")))
362 ;; need to separate this since init-custom-files depends on *umls-files*
363 (setq *umls-files* (append *umls-files* (init-custom-files))))
366 (defun umls-field-string-to-list (fmt)
367 "Converts a comma delimited list of fields into a list of field names. Will
368 append a unique number (starting at 2) onto a column name that is repeated in the list"
369 (let ((field-list (delimited-string-to-list (escape-column-name fmt) #\,))
370 (col-count (make-hash-table :test 'equal)))
371 (dotimes (i (length field-list))
373 (let ((col (nth i field-list)))
374 (multiple-value-bind (key found) (gethash col col-count)
376 (let ((next-id (1+ key)))
377 (setf (nth i field-list) (concatenate 'string
379 (format nil "~D" next-id)))
380 (setf (gethash col col-count) next-id))
381 (setf (gethash col col-count) 1)))))
384 (defun init-generic-files (files-filename)
385 "Initialize all LEX file structures"
387 (with-umls-file (line files-filename)
388 (destructuring-bind (fil des fmt cls rws bts) line
389 (let ((f (make-umls-file
391 :table (substitute #\_ #\. fil)
393 :fmt (escape-column-name fmt)
394 :cls (parse-integer cls)
395 :rws (parse-integer rws)
396 :bts (parse-integer bts)
397 :fields (concatenate 'list
398 (umls-field-string-to-list fmt)
399 (custom-colnames-for-filename fil)))))
400 (setf (umls-file-colstructs f) (umls-cols-for-umls-file f))
404 (defun init-custom-files ()
405 (let ((ffile (make-umls-file :fil "MRXW.NONENG"
406 :des "Custom NonEnglish Index"
411 :fields (umls-file-fields (find-umls-file "MRXW.ENG")))))
412 (setf (umls-file-colstructs ffile)
413 (umls-cols-for-umls-file ffile))
416 (defun datatype-for-col (colname)
417 "Return datatype for column name"
418 (car (cdr (find colname +col-datatypes+ :key #'car :test #'string-equal))))
420 (defun add-datatype-to-col (col datatype)
421 "Add data type information to column"
422 (setf (umls-col-datatype col) datatype)
424 (sql-u (setf (umls-col-sqltype col) "INTEGER"
425 (umls-col-parsefunc col) #'parse-ui
426 (umls-col-quotechar col) ""))
427 (sql-s (setf (umls-col-sqltype col) "SMALLINT"
428 (umls-col-parsefunc col) #'parse-integer
429 (umls-col-quotechar col) ""))
430 (sql-l (setf (umls-col-sqltype col) "BIGINT"
431 (umls-col-parsefunc col) #'parse-integer
432 (umls-col-quotechar col) ""))
433 (sql-i (setf (umls-col-sqltype col) "INTEGER"
434 (umls-col-parsefunc col) #'parse-integer
435 (umls-col-quotechar col) ""))
436 (sql-f (setf (umls-col-sqltype col) "NUMERIC"
437 (umls-col-parsefunc col) #'read-from-string
438 (umls-col-quotechar col) ""))
439 (t ; Default column type, optimized text storage
440 (setf (umls-col-parsefunc col) #'add-sql-quotes
441 (umls-col-quotechar col) "'")
442 (when (and (umls-col-max col) (umls-col-av col))
443 (if (> (umls-col-max col) 255)
444 (setf (umls-col-sqltype col) "TEXT")
445 (if (< (- (umls-col-max col) (umls-col-av col)) 4)
446 (setf (umls-col-sqltype col) "CHAR") ; if average bytes wasted < 4
447 (setf (umls-col-sqltype col) "VARCHAR")))))))