Skip to content

Commit

Permalink
more comprehensive word embeddings (data)
Browse files Browse the repository at this point in the history
  • Loading branch information
Lenz Furrer committed Jul 28, 2018
1 parent 263abb6 commit 7fd9dbb
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 51 deletions.
13 changes: 6 additions & 7 deletions config
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
[DEFAULT]
rootpath = /home/lenz/disease-normalization
timestamp = 20180620-103131
workers = 0

[general]
Expand All @@ -11,8 +9,9 @@ prediction_subset = dev
[logging]
format = %(asctime)s - %(message)s
level = INFO
summary_fn = runs/summaries/${timestamp}.txt
prediction_fn = runs/predictions/${timestamp}.tsv
summary_fn = ${rootpath}/runs/summaries/${timestamp}.txt
prediction_fn = ${rootpath}/runs/predictions/${timestamp}.tsv
detailed_fn = ${rootpath}/runs/detailed/${timestamp}.{}.tsv

[candidates]
generator = SGramCosine(.5, 20)
Expand All @@ -25,8 +24,8 @@ embedding_dim = 50
embedding_voc = 10000
vectorizer_cache = True
tokenizer = whitespace
embedding_fn = ${rootpath}/data/embeddings/wvec_50_haodi-li-et-al.bin
trainable = True
embedding_fn = ${rootpath}/data/embeddings/wvec_200_win-30_chiu-et-al.bin
trainable = False

[emb_sub]
sample_size = ${emb:sample_size}
Expand All @@ -39,7 +38,7 @@ embedding_fn = ${rootpath}/data/embeddings/bpe_vectors_10000_50_w2v.txt
trainable = False

[rank]
embeddings = ["emb_sub"]
embeddings = ["emb"]
n_kernels = 50
filter_width = 3
activation = tanh
Expand Down
83 changes: 41 additions & 42 deletions log
Original file line number Diff line number Diff line change
@@ -1,42 +1,41 @@
2018-06-20 10:31:32,824 - The cuda backend is deprecated and will be removed in the next release (v0.10). Please switch to the gpuarray backend. You can get more information about how to switch at this URL:
https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

2018-06-20 10:31:36,084 - 'pattern' package not found; tag filters are not available for English
2018-06-20 10:31:36,092 - loading terminology...
2018-06-20 10:31:36,403 - loading pretrained embeddings...
2018-06-20 10:31:36,403 - loading projection weights from /home/lenz/disease-normalization/data/embeddings/bpe_vectors_10000_50_w2v.txt
2018-06-20 10:31:37,271 - loaded (10257, 50) matrix from /home/lenz/disease-normalization/data/embeddings/bpe_vectors_10000_50_w2v.txt
2018-06-20 10:31:37,284 - loading vectorizer...
2018-06-20 10:31:37,416 - loading candidate generator...
2018-06-20 10:31:51,496 - preprocessing validation data...
2018-06-20 10:31:51,496 - loading corpus...
2018-06-20 10:31:51,505 - generating candidates with 0 workers...
2018-06-20 10:31:55,387 - generated 5671 pair-wise samples (11585 with duplicates)
2018-06-20 10:31:55,389 - compiling model architecture...
2018-06-20 10:31:56,393 - preprocessing training data...
2018-06-20 10:31:56,393 - loading corpus...
2018-06-20 10:31:56,587 - generating candidates with 0 workers...
2018-06-20 10:32:14,387 - generated 26308 pair-wise samples (71125 with duplicates)
2018-06-20 10:32:14,409 - training CNN...
2018-06-20 10:32:37,062 - Ranking accuracy: 0.491741
2018-06-20 10:32:46,149 - Ranking accuracy: 0.550191
2018-06-20 10:32:55,435 - Ranking accuracy: 0.645489
2018-06-20 10:33:04,643 - Ranking accuracy: 0.664549
2018-06-20 10:33:13,936 - Ranking accuracy: 0.669632
2018-06-20 10:33:22,963 - Ranking accuracy: 0.673443
2018-06-20 10:33:32,155 - Ranking accuracy: 0.691233
2018-06-20 10:33:41,079 - Ranking accuracy: 0.696315
2018-06-20 10:33:50,323 - Ranking accuracy: 0.715375
2018-06-20 10:33:59,614 - Ranking accuracy: 0.729352
2018-06-20 10:34:08,834 - Ranking accuracy: 0.733164
2018-06-20 10:34:18,368 - Ranking accuracy: 0.747141
2018-06-20 10:34:27,433 - Ranking accuracy: 0.747141
2018-06-20 10:34:36,540 - Ranking accuracy: 0.750953
2018-06-20 10:34:45,699 - Ranking accuracy: 0.750953
2018-06-20 10:34:54,978 - Ranking accuracy: 0.752224
2018-06-20 10:35:04,327 - Ranking accuracy: 0.750953
2018-06-20 10:35:13,851 - Ranking accuracy: 0.756036
2018-06-20 10:35:23,100 - Ranking accuracy: 0.756036
2018-06-20 10:35:32,328 - Ranking accuracy: 0.756036
2018-06-20 10:35:32,329 - Epoch 00020: early stopping
2018-06-20 10:35:32,330 - done training.
2018-07-28 20:25:01,335 - 'pattern' package not found; tag filters are not available for English
2018-07-28 20:25:01,342 - loading terminology...
2018-07-28 20:25:01,560 - loading pretrained embeddings...
2018-07-28 20:25:01,560 - loading projection weights from /mnt/storage/karr/users/furrer/prlnk/data/embeddings/wvec_200_win-30_chiu-et-al.bin
2018-07-28 20:25:47,157 - loaded (2231686, 200) matrix from /mnt/storage/karr/users/furrer/prlnk/data/embeddings/wvec_200_win-30_chiu-et-al.bin
2018-07-28 20:25:51,961 - loading vectorizer...
2018-07-28 20:25:51,962 - loading candidate generator...
2018-07-28 20:26:03,723 - preprocessing validation data...
2018-07-28 20:26:03,724 - loading corpus...
2018-07-28 20:26:03,757 - generating candidates with 0 workers...
2018-07-28 20:26:06,172 - generated 5671 pair-wise samples (11585 with duplicates)
2018-07-28 20:26:06,174 - compiling model architecture...
2018-07-28 20:26:20,043 - preprocessing training data...
2018-07-28 20:26:20,043 - loading corpus...
2018-07-28 20:26:20,100 - generating candidates with 0 workers...
2018-07-28 20:26:31,276 - generated 26308 pair-wise samples (71125 with duplicates)
2018-07-28 20:26:31,284 - training CNN...
2018-07-28 20:27:06,780 - Ranking accuracy: 0.631512
2018-07-28 20:27:46,468 - Ranking accuracy: 0.644219
2018-07-28 20:28:25,710 - Ranking accuracy: 0.684879
2018-07-28 20:29:04,919 - Ranking accuracy: 0.696315
2018-07-28 20:29:44,119 - Ranking accuracy: 0.712834
2018-07-28 20:30:24,257 - Ranking accuracy: 0.724269
2018-07-28 20:31:03,956 - Ranking accuracy: 0.735705
2018-07-28 20:31:43,486 - Ranking accuracy: 0.750953
2018-07-28 20:32:23,224 - Ranking accuracy: 0.761118
2018-07-28 20:33:02,324 - Ranking accuracy: 0.761118
2018-07-28 20:33:36,861 - Ranking accuracy: 0.762389
2018-07-28 20:34:16,102 - Ranking accuracy: 0.76493
2018-07-28 20:34:55,461 - Ranking accuracy: 0.759848
2018-07-28 20:35:31,168 - Ranking accuracy: 0.768742
2018-07-28 20:36:10,645 - Ranking accuracy: 0.770013
2018-07-28 20:36:49,943 - Ranking accuracy: 0.771283
2018-07-28 20:37:29,770 - Ranking accuracy: 0.767471
2018-07-28 20:38:04,778 - Ranking accuracy: 0.759848
2018-07-28 20:38:04,778 - Epoch 00018: early stopping
2018-07-28 20:38:04,779 - done training.
2018-07-28 20:38:04,785 - load best model...
2018-07-28 20:38:17,594 - predict scores for validation data...
2018-07-28 20:38:19,316 - evaluate and/or serialize...
2018-07-28 20:38:19,374 - done.
4 changes: 2 additions & 2 deletions results
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
accuracy 0.7560355781448539
correct 595
accuracy 0.7712833545108005
correct 607
total 787
unreachable 129
nocandidates 10
Expand Down

0 comments on commit 7fd9dbb

Please sign in to comment.