From 7fd9dbbfd26f1090cca40efe3f8667f816a3c2f6 Mon Sep 17 00:00:00 2001 From: Lenz Furrer Date: Sat, 28 Jul 2018 20:58:06 +0200 Subject: [PATCH] more comprehensive word embeddings (data) --- config | 13 +++++---- log | 83 ++++++++++++++++++++++++++++----------------------------- results | 4 +-- 3 files changed, 49 insertions(+), 51 deletions(-) diff --git a/config b/config index 5a5c1e9..485181c 100644 --- a/config +++ b/config @@ -1,6 +1,4 @@ [DEFAULT] -rootpath = /home/lenz/disease-normalization -timestamp = 20180620-103131 workers = 0 [general] @@ -11,8 +9,9 @@ prediction_subset = dev [logging] format = %(asctime)s - %(message)s level = INFO -summary_fn = runs/summaries/${timestamp}.txt -prediction_fn = runs/predictions/${timestamp}.tsv +summary_fn = ${rootpath}/runs/summaries/${timestamp}.txt +prediction_fn = ${rootpath}/runs/predictions/${timestamp}.tsv +detailed_fn = ${rootpath}/runs/detailed/${timestamp}.{}.tsv [candidates] generator = SGramCosine(.5, 20) @@ -25,8 +24,8 @@ embedding_dim = 50 embedding_voc = 10000 vectorizer_cache = True tokenizer = whitespace -embedding_fn = ${rootpath}/data/embeddings/wvec_50_haodi-li-et-al.bin -trainable = True +embedding_fn = ${rootpath}/data/embeddings/wvec_200_win-30_chiu-et-al.bin +trainable = False [emb_sub] sample_size = ${emb:sample_size} @@ -39,7 +38,7 @@ embedding_fn = ${rootpath}/data/embeddings/bpe_vectors_10000_50_w2v.txt trainable = False [rank] -embeddings = ["emb_sub"] +embeddings = ["emb"] n_kernels = 50 filter_width = 3 activation = tanh diff --git a/log b/log index 339f8fd..db76fc2 100644 --- a/log +++ b/log @@ -1,42 +1,41 @@ -2018-06-20 10:31:32,824 - The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL: - https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29 - -2018-06-20 10:31:36,084 - 'pattern' package not found; tag filters are not available for English -2018-06-20 10:31:36,092 - loading terminology... -2018-06-20 10:31:36,403 - loading pretrained embeddings... -2018-06-20 10:31:36,403 - loading projection weights from /home/lenz/disease-normalization/data/embeddings/bpe_vectors_10000_50_w2v.txt -2018-06-20 10:31:37,271 - loaded (10257, 50) matrix from /home/lenz/disease-normalization/data/embeddings/bpe_vectors_10000_50_w2v.txt -2018-06-20 10:31:37,284 - loading vectorizer... -2018-06-20 10:31:37,416 - loading candidate generator... -2018-06-20 10:31:51,496 - preprocessing validation data... -2018-06-20 10:31:51,496 - loading corpus... -2018-06-20 10:31:51,505 - generating candidates with 0 workers... -2018-06-20 10:31:55,387 - generated 5671 pair-wise samples (11585 with duplicates) -2018-06-20 10:31:55,389 - compiling model architecture... -2018-06-20 10:31:56,393 - preprocessing training data... -2018-06-20 10:31:56,393 - loading corpus... -2018-06-20 10:31:56,587 - generating candidates with 0 workers... -2018-06-20 10:32:14,387 - generated 26308 pair-wise samples (71125 with duplicates) -2018-06-20 10:32:14,409 - training CNN... -2018-06-20 10:32:37,062 - Ranking accuracy: 0.491741 -2018-06-20 10:32:46,149 - Ranking accuracy: 0.550191 -2018-06-20 10:32:55,435 - Ranking accuracy: 0.645489 -2018-06-20 10:33:04,643 - Ranking accuracy: 0.664549 -2018-06-20 10:33:13,936 - Ranking accuracy: 0.669632 -2018-06-20 10:33:22,963 - Ranking accuracy: 0.673443 -2018-06-20 10:33:32,155 - Ranking accuracy: 0.691233 -2018-06-20 10:33:41,079 - Ranking accuracy: 0.696315 -2018-06-20 10:33:50,323 - Ranking accuracy: 0.715375 -2018-06-20 10:33:59,614 - Ranking accuracy: 0.729352 -2018-06-20 10:34:08,834 - Ranking accuracy: 0.733164 -2018-06-20 10:34:18,368 - Ranking accuracy: 0.747141 -2018-06-20 10:34:27,433 - Ranking accuracy: 0.747141 -2018-06-20 10:34:36,540 - Ranking accuracy: 0.750953 -2018-06-20 10:34:45,699 - Ranking accuracy: 0.750953 -2018-06-20 10:34:54,978 - Ranking accuracy: 0.752224 -2018-06-20 10:35:04,327 - Ranking accuracy: 0.750953 -2018-06-20 10:35:13,851 - Ranking accuracy: 0.756036 -2018-06-20 10:35:23,100 - Ranking accuracy: 0.756036 -2018-06-20 10:35:32,328 - Ranking accuracy: 0.756036 -2018-06-20 10:35:32,329 - Epoch 00020: early stopping -2018-06-20 10:35:32,330 - done training. +2018-07-28 20:25:01,335 - 'pattern' package not found; tag filters are not available for English +2018-07-28 20:25:01,342 - loading terminology... +2018-07-28 20:25:01,560 - loading pretrained embeddings... +2018-07-28 20:25:01,560 - loading projection weights from /mnt/storage/karr/users/furrer/prlnk/data/embeddings/wvec_200_win-30_chiu-et-al.bin +2018-07-28 20:25:47,157 - loaded (2231686, 200) matrix from /mnt/storage/karr/users/furrer/prlnk/data/embeddings/wvec_200_win-30_chiu-et-al.bin +2018-07-28 20:25:51,961 - loading vectorizer... +2018-07-28 20:25:51,962 - loading candidate generator... +2018-07-28 20:26:03,723 - preprocessing validation data... +2018-07-28 20:26:03,724 - loading corpus... +2018-07-28 20:26:03,757 - generating candidates with 0 workers... +2018-07-28 20:26:06,172 - generated 5671 pair-wise samples (11585 with duplicates) +2018-07-28 20:26:06,174 - compiling model architecture... +2018-07-28 20:26:20,043 - preprocessing training data... +2018-07-28 20:26:20,043 - loading corpus... +2018-07-28 20:26:20,100 - generating candidates with 0 workers... +2018-07-28 20:26:31,276 - generated 26308 pair-wise samples (71125 with duplicates) +2018-07-28 20:26:31,284 - training CNN... +2018-07-28 20:27:06,780 - Ranking accuracy: 0.631512 +2018-07-28 20:27:46,468 - Ranking accuracy: 0.644219 +2018-07-28 20:28:25,710 - Ranking accuracy: 0.684879 +2018-07-28 20:29:04,919 - Ranking accuracy: 0.696315 +2018-07-28 20:29:44,119 - Ranking accuracy: 0.712834 +2018-07-28 20:30:24,257 - Ranking accuracy: 0.724269 +2018-07-28 20:31:03,956 - Ranking accuracy: 0.735705 +2018-07-28 20:31:43,486 - Ranking accuracy: 0.750953 +2018-07-28 20:32:23,224 - Ranking accuracy: 0.761118 +2018-07-28 20:33:02,324 - Ranking accuracy: 0.761118 +2018-07-28 20:33:36,861 - Ranking accuracy: 0.762389 +2018-07-28 20:34:16,102 - Ranking accuracy: 0.76493 +2018-07-28 20:34:55,461 - Ranking accuracy: 0.759848 +2018-07-28 20:35:31,168 - Ranking accuracy: 0.768742 +2018-07-28 20:36:10,645 - Ranking accuracy: 0.770013 +2018-07-28 20:36:49,943 - Ranking accuracy: 0.771283 +2018-07-28 20:37:29,770 - Ranking accuracy: 0.767471 +2018-07-28 20:38:04,778 - Ranking accuracy: 0.759848 +2018-07-28 20:38:04,778 - Epoch 00018: early stopping +2018-07-28 20:38:04,779 - done training. +2018-07-28 20:38:04,785 - load best model... +2018-07-28 20:38:17,594 - predict scores for validation data... +2018-07-28 20:38:19,316 - evaluate and/or serialize... +2018-07-28 20:38:19,374 - done. diff --git a/results b/results index c665426..5c7e222 100644 --- a/results +++ b/results @@ -1,5 +1,5 @@ -accuracy 0.7560355781448539 -correct 595 +accuracy 0.7712833545108005 +correct 607 total 787 unreachable 129 nocandidates 10