X-Git-Url: http://git.kpe.io/?p=umlisp.git;a=blobdiff_plain;f=create-sql.lisp;h=20964f5c4ae3a6a8f4023ecf033582a759061856;hp=fdc444c9cf2bd8fb1279674bff3f8e5c0e414cf0;hb=HEAD;hpb=612a2df000b3ff47d2454dbad0b901c1aa5558e7 diff --git a/create-sql.lisp b/create-sql.lisp index fdc444c..3a19020 100644 --- a/create-sql.lisp +++ b/create-sql.lisp @@ -7,10 +7,8 @@ ;;;; Author: Kevin M. Rosenberg ;;;; Created: Apr 2000 ;;;; -;;;; $Id$ -;;;; ;;;; This file, part of UMLisp, is -;;;; Copyright (c) 2000-2006 by Kevin M. Rosenberg, M.D. +;;;; Copyright (c) 2000-2010 by Kevin M. Rosenberg, M.D. ;;;; ;;;; UMLisp users are granted the rights to distribute and use this software ;;;; as governed by the terms of the GNU General Public License. @@ -21,33 +19,33 @@ (defun create-table-cmd (file) "Return sql command to create a table" (let ((col-func - (lambda (c) - (let ((sqltype (sqltype c))) - (case *umls-sql-type* - (:oracle - (cond - ((string-equal sqltype "VARCHAR") - (setq sqltype "VARCHAR2")) - ((string-equal sqltype "BIGINT") - (setq sqltype "VARCHAR2(20)"))))) - - (concatenate 'string - (col c) - " " - (if (or (string-equal sqltype "VARCHAR") - (string-equal sqltype "CHAR")) - (format nil "~a (~a)" sqltype (cmax c)) - sqltype)))))) + (lambda (c) + (let ((sqltype (sqltype c))) + (case *umls-sql-type* + (:oracle + (cond + ((string-equal sqltype "VARCHAR") + (setq sqltype "VARCHAR2")) + ((string-equal sqltype "BIGINT") + (setq sqltype "VARCHAR2(20)"))))) + + (concatenate 'string + (sqlname c) + " " + (if (or (string-equal sqltype "VARCHAR") + (string-equal sqltype "CHAR")) + (format nil "~a (~a)" sqltype (cmax c)) + sqltype)))))) (format nil "CREATE TABLE ~a (~{~a~^,~})~A~A" - (table file) - (mapcar col-func (ucols file)) - (if (and (eq *umls-sql-type* :mysql) - (string-equal (table file) "MRCXT")) - " MAX_ROWS=200000000" - "") - (if (eq *umls-sql-type* :mysql) - " TYPE=MYISAM DEFAULT CHARACTER latin1" - "")))) + (table file) + (mapcar col-func (ucols file)) + (if (and (eq *umls-sql-type* :mysql) + (string-equal (table file) "MRCXT")) + " MAX_ROWS=200000000" + "") + (if (eq *umls-sql-type* :mysql) + " ENGINE=MYISAM CHARACTER SET utf8 COLLATE utf8_bin" + "")))) (defun create-custom-table-cmd (tablename sql-cmd) "Return SQL command to create a custom table" @@ -61,10 +59,10 @@ (defun insert-values-cmd (file values) "Return sql insert command for a row of values" (let ((insert-func - (lambda (col value) - (concatenate 'string (quote-str col) - (insert-col-value col value) - (quote-str col))))) + (lambda (col value) + (concatenate 'string (quote-str col) + (insert-col-value col value) + (quote-str col))))) (format nil "INSERT INTO ~a (~{~a~^,~}) VALUES (~A)" (table file) @@ -78,12 +76,12 @@ (defun custom-col-value (col values doquote) (let ((custom-value (funcall (custom-value-fun col) values))) (if custom-value - (if doquote - (concatenate 'string (quote-str col) - (escape-backslashes custom-value) - (quote-str col)) - (escape-backslashes custom-value)) - ""))) + (if doquote + (concatenate 'string (quote-str col) + (escape-backslashes custom-value) + (quote-str col)) + (escape-backslashes custom-value)) + ""))) (defun custom-col-values (ucols values doquote) "Returns a list of string column values for SQL inserts for custom columns" @@ -98,7 +96,7 @@ (defun find-custom-col (filename col) (find-if (lambda (x) (and (string-equal filename (car x)) - (string-equal col (cadr x)))) +custom-cols+)) + (string-equal col (cadr x)))) +custom-cols+)) (defun custom-colnames-for-filename (filename) (mapcar #'cadr (find-custom-cols-for-filename filename))) @@ -109,9 +107,9 @@ (defun noneng-lang-index-files () (remove-if-not (lambda (f) (and (> (length (fil f)) 4) - (string-equal (fil f) "MRXW_" :end1 5) - (not (string-equal (fil f) "MRXW_ENG.RRF")) - (not (string-equal (fil f) "MRXW_NONENG.RRF")))) + (string-equal (fil f) "MRXW_" :end1 5) + (not (string-equal (fil f) "MRXW_ENG.RRF")) + (not (string-equal (fil f) "MRXW_NONENG.RRF")))) *umls-files*)) ;;; SQL Command Functions @@ -119,21 +117,21 @@ (defun create-index-cmd (colname tablename length) "Return sql create index command" (format nil "CREATE INDEX ~a ON ~a (~a)" - (concatenate 'string tablename "_" colname "_X") - tablename - (case *umls-sql-type* - (:mysql - (concatenate 'string colname - (if (integerp length) - (format nil " (~d)" length) - ""))) - ((:postgresql :postgresql-socket) - ;; FIXME: incorrect syntax - (if (integerp length) - (format nil "substr((~A)::text,1,~D)" colname length) - colname)) - (t - colname)))) + (concatenate 'string tablename "_" colname "_X") + tablename + (case *umls-sql-type* + (:mysql + (concatenate 'string colname + (if (integerp length) + (format nil " (~d)" length) + ""))) + ((:postgresql :postgresql-socket) + ;; FIXME: incorrect syntax for postgresql? + (if (integerp length) + (format nil "substr((~A)::text,1,~D)" colname length) + colname)) + (t + colname)))) (defun create-all-tables-cmdfile () "Return sql commands to create all tables. Not need for automated SQL import" @@ -158,25 +156,27 @@ (ignore-errors (execute-command "DROP TABLE KCON" :database conn)) (execute-command (format nil "CREATE TABLE KCON (CUI INTEGER, STR ~A, LRL ~A)" - (case *umls-sql-type* - (:oracle - (format nil "VARCHAR2(~D)" - (slot-value (find-ucol "STR" "MRCONSO.RRF") 'max))) - (t "TEXT")) - (case *umls-sql-type* - (:mysql "TINYINT") - ((:postgresql :postgresql-socket) "INT2") - (:oracle "NUMBER(2,0)") - (t "INTEGER"))) + (case *umls-sql-type* + (:oracle + (format nil "VARCHAR2(~D)" + (slot-value (find-ucol "STR" "MRCONSO.RRF") 'max))) + (t "TEXT")) + (case *umls-sql-type* + (:mysql "TINYINT") + ((:postgresql :postgresql-socket) "INT2") + (:oracle "NUMBER(2,0)") + (t "INTEGER"))) :database conn) + ;; KCON deprecated by KPFENG field in MRCONSO + #+nil (dolist (tuple (query "select distinct cui from MRCONSO order by cui" - :database conn)) + :database conn)) (let ((cui (car tuple))) (execute-command (format nil "INSERT into KCON VALUES (~D,'~A',~D)" - cui - (add-sql-quotes (pfstr-hash cui) ) - (cui-lrl cui)) + cui + (add-sql-quotes (pfstr-hash cui) ) + (cui-lrl cui)) :database conn)))) (defun sql-create-custom-tables (conn) @@ -200,34 +200,119 @@ (case *umls-sql-type* (:mysql (format nil "DROP INDEX ~a ON ~a" - (concatenate 'string tablename "_" colname "_X") - tablename)) + (concatenate 'string tablename "_" colname "_X") + tablename)) (t (format nil "DROP INDEX ~a" - (concatenate 'string tablename "_" colname "_X"))))) - -(defun sql-create-indexes (conn &optional (indexes +index-cols+)) + (concatenate 'string tablename "_" colname "_X"))))) + +(defun sql-create-indexes-mysql (conn indexes verbose) + (let ((tables nil) + (table-cols nil)) + (dolist (idx indexes) + (pushnew (second idx) tables :test 'string-equal) + (let ((table-col (find (second idx) table-cols :key 'car :test 'string-equal))) + (if table-col + (vector-push-extend (cons (first idx) (third idx)) (second table-col)) + (push (list (second idx) (make-array (list 1) :initial-contents (list (cons (first idx) (third idx))) + :adjustable t :fill-pointer 1)) + table-cols)))) + (dolist (table tables) + (let ((table-col (find table table-cols :key 'car :test 'string-equal)) + (first t) + (str (format nil "ALTER TABLE ~A" table))) + (loop for col across (second table-col) + do + (let ((colname (car col)) + (length (cdr col))) + (ignore-errors (sql-execute (drop-index-cmd colname table) conn)) + (setq str (concatenate 'string + str + (if first + (progn + (setq first nil) + " ") + ", ") + (format nil "ADD INDEX ~A (~A)" + (concatenate 'string table "_" colname "_X") + (concatenate 'string + colname + (if (integerp length) + (format nil " (~d)" length) + ""))))))) + (when verbose + (format t "UMLS Import: Creating indexes for columns ~A on table ~A.~%" + (mapcar 'car (coerce (second table-col) 'list)) table)) + (when conn + (sql-execute str conn)) + )))) + +(defun sql-create-indexes (conn &key (indexes +index-cols+) verbose) "SQL Databases: create all indexes" - (dolist (idx indexes) - (ignore-errors (sql-execute (drop-index-cmd (car idx) (cadr idx)) conn)) - (sql-execute (create-index-cmd (car idx) (cadr idx) (caddr idx)) conn))) + (if (eql :mysql *umls-sql-type*) + (sql-create-indexes-mysql conn indexes verbose) + (dolist (idx indexes) + (when verbose (format t "UMLS Import: Creating index for column ~A on table ~A.~%" + (first idx) (second idx))) + (ignore-errors (sql-execute (drop-index-cmd (car idx) (cadr idx)) conn)) + (sql-execute (create-index-cmd (car idx) (cadr idx) (caddr idx)) conn)))) (defun make-usrl (conn) (if (eql :mysql *umls-sql-type*) (sql-execute "drop table if exists USRL" conn) (ignore-errors (sql-execute "drop table USRL" conn))) - (sql-execute "create table USRL (sab varchar(80), srl integer)" conn) + (sql-execute + (concatenate 'string + "create table USRL (sab varchar(80), srl integer)" + (if (eq *umls-sql-type* :mysql) + " ENGINE=MYISAM CHARACTER SET utf8 COLLATE utf8_bin" + "")) + conn) (dolist (tuple (mutex-sql-query - "select distinct SAB,SRL from MRCONSO order by SAB asc")) + "select distinct SAB,SRL from MRCONSO order by SAB asc")) (sql-execute (format nil "insert into USRL (sab,srl) values ('~a',~d)" - (car tuple) (ensure-integer (cadr tuple))) - conn))) + (car tuple) (ensure-integer (cadr tuple))) + conn))) + +(defun make-ustats (conn) + (ignore-errors (sql-execute "drop table USTATS" conn)) + (sql-execute + (concatenate 'string"create table USTATS (NAME varchar(160), COUNT bigint, SRL integer)" + (if (eq *umls-sql-type* :mysql) + " ENGINE=MYISAM CHARACTER SET utf8 COLLATE utf8_bin" + "")) + conn) + + (dolist (srl '(0 1 2 3 4 9)) + (insert-ustats-count conn "Concept Count" "MRCONSO" "distinct CUI" "KCUILRL" srl) + (insert-ustats-count conn "Term Count" "MRCONSO" "distinct KCUILUI" "KCUILRL" srl) + (insert-ustats-count conn "Distinct Term Count" "MRCONSO" "distinct LUI" "KLUILRL" srl) + (insert-ustats-count conn "String Count" "MRCONSO" "*" "KSUILRL" srl) + (insert-ustats-count conn "Distinct String Count" "MRCONSO" "distinct SUI" "KSUILRL" srl) + (insert-ustats-count conn "Hierarchcy" "MRHIER" "*" "KSRL" srl) + (insert-ustats-count conn "Mappings" "MRMAP" "*" "KSRL" srl) + (insert-ustats-count conn "Simple Mappings" "MRSMAP" "*" "KSRL" srl) +#+mrcoc (insert-ustats-count conn "Co-occuring Concept Count" "MRCOC" "*" "KLRL" srl) + (insert-ustats-count conn "Definition Count" "MRDEF" "*" "KSRL" srl) + (insert-ustats-count conn "Rank Count" "MRRANK" "*" "KSRL" srl) + (insert-ustats-count conn "Relationship Count" "MRREL" "*" "KSRL" srl) + (insert-ustats-count conn "Semantic Type Count" "MRSTY" "*" "KLRL" srl) + (insert-ustats-count conn "Simple Attribute Count" "MRSAT" "*" "KSRL" srl) + (insert-ustats-count conn "Source Abbreviation Count" "MRSAB" "*" "SRL" srl) + (insert-ustats-count conn "Word Index Count" "MRXW_ENG" "*" "KLRL" srl) + (insert-ustats-count conn "Normalized Word Index Count" "MRXNW_ENG" "*" "KLRL" srl) + (insert-ustats-count conn "Normalized String Index Count" "MRXNS_ENG" "*" "KLRL" srl)) + (sql-execute "create index USTATS_SRL on USTATS (SRL)" conn) + (find-ustats-all)) + +(defun insert-ustats-count (conn name table count-variable srl-control srl) + (insert-ustats conn name (find-count-table conn table srl count-variable srl-control) srl)) (defun sql-create-special-tables (conn) (make-usrl conn) - (make-ustats)) + (make-ustats conn)) -(defun create-umls-db-by-insert () +(defun create-umls-db-by-insert (&key verbose) "SQL Databases: initializes entire database via SQL insert commands" (ensure-ucols+ufiles) (ensure-preparse) @@ -237,68 +322,128 @@ (sql-insert-all-values conn) (sql-create-indexes conn) (sql-create-custom-tables conn) - (sql-create-indexes conn +custom-index-cols+) + (sql-create-indexes conn :indexes +custom-index-cols+ :verbose verbose) (sql-create-special-tables conn))) -(defun create-umls-db (&key (extension ".trans") (skip-translation nil)) +(defun create-umls-db (&key (extension "-trans") (force-translation nil) (verbose nil)) "SQL Databases: initializes entire database via SQL copy commands. This is much faster that using create-umls-db-insert." + (when verbose (format t "UMLS Import: Starting.~%")) (ensure-ucols+ufiles) + (when verbose (format t "UMLS Import: Preparsing files.~%")) (ensure-preparse) - (unless skip-translation - (translate-all-files extension)) + (when verbose (format t "UMLS Import: Converting text UMLS files to optimized format.~%")) + (translate-all-files :extension extension :verbose verbose :force force-translation) (let ((copy-cmd - (ecase (umls-sql-type) - (:mysql #'mysql-copy-cmd) - (:postgresql #'pg-copy-cmd)))) + (ecase (umls-sql-type) + (:mysql #'mysql-copy-cmd) + (:postgresql #'pg-copy-cmd)))) (with-sql-connection (conn) (clsql:truncate-database :database conn) (sql-drop-tables conn) (sql-create-tables conn) (dolist (file *umls-files*) - (sql-execute (funcall copy-cmd file extension) conn)) - (sql-create-indexes conn) + (when verbose (format t "UMLS Import: Importing file ~A to SQL.~%" (fil file))) + (sql-execute (funcall copy-cmd file extension) conn)) + (when verbose (format t "UMLS Import: Creating SQL indices.~%")) + (sql-create-indexes conn :verbose verbose) + (when verbose (format t "UMLS Import: Creating custom tables.~%")) (sql-create-custom-tables conn) - (sql-create-indexes conn +custom-index-cols+) - (sql-create-special-tables conn)))) - -(defun translate-all-files (&optional (extension ".trans")) - "Copy translated files and return postgresql copy commands to import" - (make-noneng-index-file extension) + (when verbose (format t "UMLS Import: Creating custom indices.~%")) + (sql-create-indexes conn :indexes +custom-index-cols+ :verbose verbose) + (when verbose (format t "UMLS Import: Creating special tables.~%")) + (sql-create-special-tables conn))) + (when verbose (format t "UMLS Import: Completed.~%")) + t) + +(defun translate-all-files (&key (extension "-trans") verbose force) + "Translate all *umls-files* to optimized import format." + (when verbose (format t "UMLS Import: Translating file ~A.~%" (fil (find-ufile "MRXW_NONENG.RRF")))) + (make-noneng-index-file extension :force force) (dolist (f (remove "MRXW_NONENG.RRF" *umls-files* :test #'string= :key #'fil)) - (translate-umls-file f extension))) + (when verbose (format t "UMLS Import: Translating file ~A.~%" (fil f))) + (translate-umls-file f extension :force force))) -(defun translate-umls-file (file extension) +(defun translate-umls-file (file extension &key force) "Translate a umls file into a format suitable for sql copy cmd" - (translate-files file extension (list file))) + (translate-files file extension (list file) :force force)) -(defun make-noneng-index-file (extension) +(defun make-noneng-index-file (extension &key force) "Make non-english index file" (translate-files (find-ufile "MRXW_NONENG.RRF") - extension (noneng-lang-index-files))) - -(defun translate-files (out-ufile extension input-ufiles) + extension (noneng-lang-index-files) :force force)) + +(defun verify-translation-file (output-path input-ufiles) + "Returns t if translation file exists and is correct size. Warns and deletes incomplete translation file." + (when (probe-file output-path) + (let ((translated-lines 0) + (input-lines 0) + (eof (cons nil nil))) + (with-open-file (ts output-path :direction :input + #+(and sbcl sb-unicode) :external-format + #+(and sbcl sb-unicode) :UTF-8 + #+(and allegro ics) :external-format + #+(and allegro ics) :UTF-8 + #+lispworks :external-format + #+lispworks :UTF-8 + #+(and clisp unicode) :external-format + #+(and clisp unicode) charset:utf-8) + (do ((c (read-char ts nil eof) (read-char ts nil eof))) + ((eq c eof)) + (when (eql c #\newline) + (incf translated-lines)))) + (dolist (input-ufile input-ufiles) + (with-umls-ufile (line input-ufile) + (incf input-lines) + (when (> input-lines translated-lines) + (return)))) + (cond + ((< input-lines translated-lines) + (format t "Translated file ~A incomplete, deleting...~%" output-path) + (delete-file output-path) + nil) + ((eql input-lines translated-lines) + (format t "Translated file ~A exists and is proper number of lines: skipping...~%" output-path) + t) + ((eql input-lines 0) + (warn "The number of input lines is 0 for output file ~A." output-path) + nil) + ((> translated-lines input-lines) + (error "Shouldn't happen. Translated lines of ~A is ~D, greater than input lines ~D" + output-path translated-lines input-lines) + (delete-file output-path) + nil))))) + +(defun translate-files (out-ufile extension input-ufiles &key force) "Translate a umls file into a format suitable for sql copy cmd" (let ((output-path (ufile-pathname out-ufile extension))) - (if (probe-file output-path) - (format t "File ~A already exists: skipping~%" output-path) - (with-open-file (ostream output-path :direction :output - #+(and clisp unicode) :external-format - #+(and clisp unicode) charset:utf-8) - (dolist (input-ufile input-ufiles) - (with-umls-ufile (line input-ufile) - (translate-line out-ufile line ostream) - (princ #\newline ostream))))))) + (when (and (not force) (verify-translation-file output-path input-ufiles)) + (return-from translate-files output-path)) + (with-open-file (ostream output-path :direction :output + :if-exists :overwrite + :if-does-not-exist :create + #+(and sbcl sb-unicode) :external-format + #+(and sbcl sb-unicode) :UTF-8 + #+(and allegro ics) :external-format + #+(and allegro ics) :UTF-8 + #+lispworks :external-format + #+lispworks :UTF-8 + #+(and clisp unicode) :external-format + #+(and clisp unicode) charset:utf-8) + (dolist (input-ufile input-ufiles) + (with-umls-ufile (line input-ufile) + (translate-line out-ufile line ostream) + (princ #\newline ostream)))))) (defun translate-line (file line strm) "Translate a single line for sql output" (flet ((col-value (col value) - (if (eq (datatype col) 'sql-u) - (let ((ui (parse-ui value ""))) - (if (stringp ui) - ui - (write-to-string ui))) - (escape-backslashes value)))) + (if (eq (datatype col) 'sql-u) + (let ((ui (parse-ui value ""))) + (if (stringp ui) + ui + (write-to-string ui))) + (escape-backslashes value)))) (print-separated-strings strm "|" (mapcar #'col-value (remove-custom-cols (ucols file)) line) @@ -310,13 +455,13 @@ This is much faster that using create-umls-db-insert." nil "COPY ~a FROM '~a' using delimiters '|' with null as ''" (table file) (ufile-pathname file extension))) -(defun mysql-copy-cmd (file extension &key local-file) +(defun mysql-copy-cmd (file extension &key (local-file t)) "Return mysql copy statement for a file" (format nil - "LOAD DATA ~AINFILE \"~a\" INTO TABLE ~a FIELDS TERMINATED BY \"|\"" + "LOAD DATA ~AINFILE '~a' INTO TABLE ~a FIELDS TERMINATED BY '|'" (if local-file "LOCAL " "") - (ufile-pathname file extension) (table file))) + (namestring (ufile-pathname file extension)) (table file))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -328,29 +473,29 @@ This is much faster that using create-umls-db-insert." (defun umls-fixed-size-waste () "Display storage waste if using all fixed size storage" (let ((totalwaste 0) - (totalunavoidable 0) - (totalavoidable 0) - (unavoidable '()) - (avoidable '())) + (totalunavoidable 0) + (totalavoidable 0) + (unavoidable '()) + (avoidable '())) (dolist (file *umls-files*) (dolist (col (ucols file)) - (let* ((avwaste (- (cmax col) (av col))) - (cwaste (* avwaste (rws file)))) - (when (plusp cwaste) - (if (<= avwaste 6) - (progn - (incf totalunavoidable cwaste) - (push (list (fil file) (col col) - avwaste cwaste) - unavoidable)) - (progn - (incf totalavoidable cwaste) - (push (list (fil file) (col col) - avwaste cwaste) - avoidable))) - (incf totalwaste cwaste))))) + (let* ((avwaste (- (cmax col) (av col))) + (cwaste (* avwaste (rws file)))) + (when (plusp cwaste) + (if (<= avwaste 6) + (progn + (incf totalunavoidable cwaste) + (push (list (fil file) (col col) + avwaste cwaste) + unavoidable)) + (progn + (incf totalavoidable cwaste) + (push (list (fil file) (col col) + avwaste cwaste) + avoidable))) + (incf totalwaste cwaste))))) (values totalwaste totalavoidable totalunavoidable - (nreverse avoidable) (nreverse unavoidable)))) + (nreverse avoidable) (nreverse unavoidable)))) (defun display-waste () (ensure-ucols+ufiles) @@ -371,10 +516,10 @@ This is much faster that using create-umls-db-insert." (declare (optimize (speed 3) (space 0))) (ensure-ucols+ufiles) (let ((max 0)) - (declare (fixnum max)) + (declare (type (integer 0 1000000) max)) (dolist (ucol *umls-cols*) - (when (> (cmax ucol) max) - (setq max (cmax ucol)))) + (when (> (the (integer 0 1000000) (cmax ucol)) max) + (setq max (cmax ucol)))) max)) (defun max-umls-row () @@ -384,7 +529,11 @@ This is much faster that using create-umls-db-insert." (let ((rowsizes '())) (dolist (file *umls-files*) (let ((row 0)) - (dolist (ucol (ucols file)) - (incf row (1+ (cmax ucol)))) - (push row rowsizes))) + (declare (type (integer 0 1000000) row)) + (dolist (ucol (ucols file)) + (let* ((col-max (cmax ucol)) + (max-with-delim (1+ col-max))) + (declare (type (integer 0 1000000) col-max max-with-delim)) + (incf row max-with-delim))) + (push row rowsizes))) (car (sort rowsizes #'>))))