From 7fd9dbbfd26f1090cca40efe3f8667f816a3c2f6 Mon Sep 17 00:00:00 2001
From: Lenz Furrer <furrer@cl.uzh.ch>
Date: Sat, 28 Jul 2018 20:58:06 +0200
Subject: [PATCH] more comprehensive word embeddings (data)

---
 config  | 13 +++++----
 log     | 83 ++++++++++++++++++++++++++++-----------------------------
 results |  4 +--
 3 files changed, 49 insertions(+), 51 deletions(-)

diff --git a/config b/config
index 5a5c1e9..485181c 100644
--- a/config
+++ b/config
@@ -1,6 +1,4 @@
 [DEFAULT]
-rootpath = /home/lenz/disease-normalization
-timestamp = 20180620-103131
 workers = 0
 
 [general]
@@ -11,8 +9,9 @@ prediction_subset = dev
 [logging]
 format = %(asctime)s - %(message)s
 level = INFO
-summary_fn = runs/summaries/${timestamp}.txt
-prediction_fn = runs/predictions/${timestamp}.tsv
+summary_fn = ${rootpath}/runs/summaries/${timestamp}.txt
+prediction_fn = ${rootpath}/runs/predictions/${timestamp}.tsv
+detailed_fn = ${rootpath}/runs/detailed/${timestamp}.{}.tsv
 
 [candidates]
 generator = SGramCosine(.5, 20)
@@ -25,8 +24,8 @@ embedding_dim = 50
 embedding_voc = 10000
 vectorizer_cache = True
 tokenizer = whitespace
-embedding_fn = ${rootpath}/data/embeddings/wvec_50_haodi-li-et-al.bin
-trainable = True
+embedding_fn = ${rootpath}/data/embeddings/wvec_200_win-30_chiu-et-al.bin
+trainable = False
 
 [emb_sub]
 sample_size = ${emb:sample_size}
@@ -39,7 +38,7 @@ embedding_fn = ${rootpath}/data/embeddings/bpe_vectors_10000_50_w2v.txt
 trainable = False
 
 [rank]
-embeddings = ["emb_sub"]
+embeddings = ["emb"]
 n_kernels = 50
 filter_width = 3
 activation = tanh
diff --git a/log b/log
index 339f8fd..db76fc2 100644
--- a/log
+++ b/log
@@ -1,42 +1,41 @@
-2018-06-20 10:31:32,824 - The cuda backend is deprecated and will be removed in the next release (v0.10).  Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
- https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29
-
-2018-06-20 10:31:36,084 - 'pattern' package not found; tag filters are not available for English
-2018-06-20 10:31:36,092 - loading terminology...
-2018-06-20 10:31:36,403 - loading pretrained embeddings...
-2018-06-20 10:31:36,403 - loading projection weights from /home/lenz/disease-normalization/data/embeddings/bpe_vectors_10000_50_w2v.txt
-2018-06-20 10:31:37,271 - loaded (10257, 50) matrix from /home/lenz/disease-normalization/data/embeddings/bpe_vectors_10000_50_w2v.txt
-2018-06-20 10:31:37,284 - loading vectorizer...
-2018-06-20 10:31:37,416 - loading candidate generator...
-2018-06-20 10:31:51,496 - preprocessing validation data...
-2018-06-20 10:31:51,496 - loading corpus...
-2018-06-20 10:31:51,505 - generating candidates with 0 workers...
-2018-06-20 10:31:55,387 - generated 5671 pair-wise samples (11585 with duplicates)
-2018-06-20 10:31:55,389 - compiling model architecture...
-2018-06-20 10:31:56,393 - preprocessing training data...
-2018-06-20 10:31:56,393 - loading corpus...
-2018-06-20 10:31:56,587 - generating candidates with 0 workers...
-2018-06-20 10:32:14,387 - generated 26308 pair-wise samples (71125 with duplicates)
-2018-06-20 10:32:14,409 - training CNN...
-2018-06-20 10:32:37,062 - Ranking accuracy: 0.491741
-2018-06-20 10:32:46,149 - Ranking accuracy: 0.550191
-2018-06-20 10:32:55,435 - Ranking accuracy: 0.645489
-2018-06-20 10:33:04,643 - Ranking accuracy: 0.664549
-2018-06-20 10:33:13,936 - Ranking accuracy: 0.669632
-2018-06-20 10:33:22,963 - Ranking accuracy: 0.673443
-2018-06-20 10:33:32,155 - Ranking accuracy: 0.691233
-2018-06-20 10:33:41,079 - Ranking accuracy: 0.696315
-2018-06-20 10:33:50,323 - Ranking accuracy: 0.715375
-2018-06-20 10:33:59,614 - Ranking accuracy: 0.729352
-2018-06-20 10:34:08,834 - Ranking accuracy: 0.733164
-2018-06-20 10:34:18,368 - Ranking accuracy: 0.747141
-2018-06-20 10:34:27,433 - Ranking accuracy: 0.747141
-2018-06-20 10:34:36,540 - Ranking accuracy: 0.750953
-2018-06-20 10:34:45,699 - Ranking accuracy: 0.750953
-2018-06-20 10:34:54,978 - Ranking accuracy: 0.752224
-2018-06-20 10:35:04,327 - Ranking accuracy: 0.750953
-2018-06-20 10:35:13,851 - Ranking accuracy: 0.756036
-2018-06-20 10:35:23,100 - Ranking accuracy: 0.756036
-2018-06-20 10:35:32,328 - Ranking accuracy: 0.756036
-2018-06-20 10:35:32,329 - Epoch 00020: early stopping
-2018-06-20 10:35:32,330 - done training.
+2018-07-28 20:25:01,335 - 'pattern' package not found; tag filters are not available for English
+2018-07-28 20:25:01,342 - loading terminology...
+2018-07-28 20:25:01,560 - loading pretrained embeddings...
+2018-07-28 20:25:01,560 - loading projection weights from /mnt/storage/karr/users/furrer/prlnk/data/embeddings/wvec_200_win-30_chiu-et-al.bin
+2018-07-28 20:25:47,157 - loaded (2231686, 200) matrix from /mnt/storage/karr/users/furrer/prlnk/data/embeddings/wvec_200_win-30_chiu-et-al.bin
+2018-07-28 20:25:51,961 - loading vectorizer...
+2018-07-28 20:25:51,962 - loading candidate generator...
+2018-07-28 20:26:03,723 - preprocessing validation data...
+2018-07-28 20:26:03,724 - loading corpus...
+2018-07-28 20:26:03,757 - generating candidates with 0 workers...
+2018-07-28 20:26:06,172 - generated 5671 pair-wise samples (11585 with duplicates)
+2018-07-28 20:26:06,174 - compiling model architecture...
+2018-07-28 20:26:20,043 - preprocessing training data...
+2018-07-28 20:26:20,043 - loading corpus...
+2018-07-28 20:26:20,100 - generating candidates with 0 workers...
+2018-07-28 20:26:31,276 - generated 26308 pair-wise samples (71125 with duplicates)
+2018-07-28 20:26:31,284 - training CNN...
+2018-07-28 20:27:06,780 - Ranking accuracy: 0.631512
+2018-07-28 20:27:46,468 - Ranking accuracy: 0.644219
+2018-07-28 20:28:25,710 - Ranking accuracy: 0.684879
+2018-07-28 20:29:04,919 - Ranking accuracy: 0.696315
+2018-07-28 20:29:44,119 - Ranking accuracy: 0.712834
+2018-07-28 20:30:24,257 - Ranking accuracy: 0.724269
+2018-07-28 20:31:03,956 - Ranking accuracy: 0.735705
+2018-07-28 20:31:43,486 - Ranking accuracy: 0.750953
+2018-07-28 20:32:23,224 - Ranking accuracy: 0.761118
+2018-07-28 20:33:02,324 - Ranking accuracy: 0.761118
+2018-07-28 20:33:36,861 - Ranking accuracy: 0.762389
+2018-07-28 20:34:16,102 - Ranking accuracy: 0.76493
+2018-07-28 20:34:55,461 - Ranking accuracy: 0.759848
+2018-07-28 20:35:31,168 - Ranking accuracy: 0.768742
+2018-07-28 20:36:10,645 - Ranking accuracy: 0.770013
+2018-07-28 20:36:49,943 - Ranking accuracy: 0.771283
+2018-07-28 20:37:29,770 - Ranking accuracy: 0.767471
+2018-07-28 20:38:04,778 - Ranking accuracy: 0.759848
+2018-07-28 20:38:04,778 - Epoch 00018: early stopping
+2018-07-28 20:38:04,779 - done training.
+2018-07-28 20:38:04,785 - load best model...
+2018-07-28 20:38:17,594 - predict scores for validation data...
+2018-07-28 20:38:19,316 - evaluate and/or serialize...
+2018-07-28 20:38:19,374 - done.
diff --git a/results b/results
index c665426..5c7e222 100644
--- a/results
+++ b/results
@@ -1,5 +1,5 @@
-accuracy     0.7560355781448539
-correct        595
+accuracy     0.7712833545108005
+correct        607
 total          787
 unreachable    129
 nocandidates    10