X-Git-Url: http://git.kpe.io/?p=umlisp.git;a=blobdiff_plain;f=create-sql.lisp;h=4a4413a1aa2e93e7f1f4431c7fb6ed66fedbfcf2;hp=fdc444c9cf2bd8fb1279674bff3f8e5c0e414cf0;hb=57c4b059639968903aec88e65103a7263cb97535;hpb=612a2df000b3ff47d2454dbad0b901c1aa5558e7

diff --git a/create-sql.lisp b/create-sql.lisp
index fdc444c..4a4413a 100644
--- a/create-sql.lisp
+++ b/create-sql.lisp
@@ -46,7 +46,7 @@
                 " MAX_ROWS=200000000"
                 "")
             (if (eq *umls-sql-type* :mysql)
-                " TYPE=MYISAM DEFAULT CHARACTER latin1"
+                " TYPE=MYISAM CHARACTER SET utf8"
                 ""))))
 
 (defun create-custom-table-cmd (tablename sql-cmd)
@@ -169,6 +169,8 @@
                      (:oracle "NUMBER(2,0)")
                      (t "INTEGER")))
            :database conn)
+  ;; KCON deprecated by KPFENG field in MRCONSO
+  #+nil
   (dolist (tuple (query "select distinct cui from MRCONSO order by cui"
                         :database conn))
     (let ((cui (car tuple)))
@@ -240,7 +242,7 @@
     (sql-create-indexes conn +custom-index-cols+)
     (sql-create-special-tables conn)))
 
-(defun create-umls-db (&key (extension ".trans") (skip-translation nil))
+(defun create-umls-db (&key (extension "-trans") (skip-translation nil))
   "SQL Databases: initializes entire database via SQL copy commands.
 This is much faster that using create-umls-db-insert."
   (ensure-ucols+ufiles)
@@ -262,7 +264,7 @@ This is much faster that using create-umls-db-insert."
       (sql-create-indexes conn +custom-index-cols+)
       (sql-create-special-tables conn))))
 
-(defun translate-all-files (&optional (extension ".trans"))
+(defun translate-all-files (&optional (extension "-trans"))
   "Copy translated files and return postgresql copy commands to import"
   (make-noneng-index-file extension)
   (dolist (f (remove "MRXW_NONENG.RRF" *umls-files* :test #'string= :key #'fil))
@@ -277,18 +279,53 @@ This is much faster that using create-umls-db-insert."
   (translate-files (find-ufile "MRXW_NONENG.RRF")
                    extension (noneng-lang-index-files)))
 
+(defun verify-translation-file (output-path input-ufiles)
+  "Returns T if the translation file OUTPUT-PATH exists and holds as many lines as WITH-UMLS-UFILE yields for INPUT-UFILES. Deletes a too-short file and returns NIL; signals an error on empty input or an over-long translation."
+  (when (probe-file output-path)        ; no file yet => NIL, caller translates from scratch
+    (let ((translated-lines 0)
+          (input-lines 0)
+          (eof (cons nil nil)))         ; fresh cons used as a unique end-of-file marker
+      (catch 'done-counting
+        (with-open-file (ts output-path :direction :input
+                         #+(and clisp unicode) :external-format
+                         #+(and clisp unicode) charset:utf-8)
+          (do ()
+              ((eq (read-line ts nil eof) eof))
+            (incf translated-lines)))
+        (dolist (input-ufile input-ufiles)
+          (with-umls-ufile (line input-ufile)
+            (incf input-lines)
+            (when (> input-lines translated-lines) ; input already longer: stop counting early
+              (throw 'done-counting 'incomplete)))))
+      (cond
+        ((eql input-lines 0)
+         (error "Input lines is 0")
+         nil)
+        ((< translated-lines input-lines) ; counting was cut short above: translation is truncated
+         (format t "Translated file ~A incomplete, deleting...~%" output-path)
+         (delete-file output-path)      ; remove it so the caller can recreate the file
+         nil)
+        ((eql input-lines translated-lines)
+         (format t "Translated file ~A already exists: skipping...~%" output-path)
+         t)
+        ((> translated-lines input-lines)
+         (error "Shouldn't happen. Translated lines of ~A is ~D, greater than input lines ~D"
+                output-path translated-lines input-lines)
+         (delete-file output-path)
+         nil)))))
+
 (defun translate-files (out-ufile extension input-ufiles)
   "Translate a umls file into a format suitable for sql copy cmd"
   (let ((output-path (ufile-pathname out-ufile extension)))
-    (if (probe-file output-path)
-        (format t "File ~A already exists: skipping~%" output-path)
-        (with-open-file (ostream output-path :direction :output
-                         #+(and clisp unicode) :external-format
-                         #+(and clisp unicode) charset:utf-8)
-          (dolist (input-ufile input-ufiles)
-            (with-umls-ufile (line input-ufile)
-              (translate-line out-ufile line ostream)
-              (princ #\newline ostream)))))))
+    (when (verify-translation-file output-path input-ufiles)
+      (return-from translate-files output-path))
+    (with-open-file (ostream output-path :direction :output
+                     #+(and clisp unicode) :external-format
+                     #+(and clisp unicode) charset:utf-8)
+      (dolist (input-ufile input-ufiles)
+        (with-umls-ufile (line input-ufile)
+          (translate-line out-ufile line ostream)
+          (princ #\newline ostream))))))
 
 (defun translate-line (file line strm)
   "Translate a single line for sql output"
@@ -310,13 +347,13 @@ This is much faster that using create-umls-db-insert."
    nil "COPY ~a FROM '~a' using delimiters '|' with null as ''"
    (table file) (ufile-pathname file extension)))
 
-(defun mysql-copy-cmd (file extension &key local-file)
+(defun mysql-copy-cmd (file extension &key (local-file t))
   "Return mysql copy statement for a file"
   (format
    nil
    "LOAD DATA ~AINFILE \"~a\" INTO TABLE ~a FIELDS TERMINATED BY \"|\""
    (if local-file "LOCAL " "")
-   (ufile-pathname file extension) (table file)))
+   (namestring (ufile-pathname file extension)) (table file)))
 
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;