From 3226affe4b58805e178edd4ddb8c0ba33f90cdf4 Mon Sep 17 00:00:00 2001 From: anlausch Date: Fri, 26 Apr 2019 17:03:58 +0200 Subject: [PATCH] Status of submission --- dict2vec_en.sh | 21 ++++++++ fasttext_en_xling.sh | 38 +++++++++++++++ fasttext_multiling.sh | 13 +---- fasttext_multiling_cc.sh | 22 +++++++++ fasttext_xling.sh | 3 +- fasttext_xling2.sh | 63 ++++++++++++++++++++++++ fasttext_xling_es.sh | 79 ++++++++++++++++++++++++++++++ ft_postspecialized.sh | 25 ++++++++++ glove_en_cc.sh | 22 +++++++++ glove_en_tweets.sh | 22 +++++++++ glove_reproduction.sh | 7 +-- latex.py | 102 +++++++++++++++++++++++++++++++++++++++ utils.py | 19 ++++++-- w2v_multiling_wiki.sh | 22 +++++++++ weat.py | 81 +++++++++++++++++++++++++++---- 15 files changed, 511 insertions(+), 28 deletions(-) create mode 100644 dict2vec_en.sh create mode 100644 fasttext_en_xling.sh create mode 100644 fasttext_multiling_cc.sh create mode 100644 fasttext_xling2.sh create mode 100644 fasttext_xling_es.sh create mode 100644 ft_postspecialized.sh create mode 100644 glove_en_cc.sh create mode 100644 glove_en_tweets.sh create mode 100644 latex.py create mode 100644 w2v_multiling_wiki.sh diff --git a/dict2vec_en.sh b/dict2vec_en.sh new file mode 100644 index 0000000..f14b8bd --- /dev/null +++ b/dict2vec_en.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +for similarity_type in "cosine" "euclidean" ; do + for test_number in 6 7 8 9 10 1 2 3 4 5 ; do + for language in "en" ; do + echo $language + echo $similarity_type + echo $test_number + python weat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/dict2vec_${language}_${similarity_type}_${test_number}_cased.res \ + --lower False \ + --use_glove False \ + --is_vec_format True \ + --lang $language \ + --embeddings \ + /work/anlausch/dict2vec-vectors-dim300.vec \ + --similarity_type $similarity_type |& tee ./results/dict2vec_${language}_${similarity_type}_${test_number}_cased.out + done + done +done \ No newline at end of file diff --git a/fasttext_en_xling.sh b/fasttext_en_xling.sh new file mode 100644 index 0000000..58eb8b9 --- /dev/null +++ b/fasttext_en_xling.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +#for similarity_type in "cosine" ; do +for similarity_type in "cosine" "euclidean" ; do + targets_language="en" + attributes_language="en" + for xspace in "en-de" "en-hr" "en-it" "en-ru" "en-tr" "ru-en" "tr-en" "de-en"; do + for test_number in 1 2 3 4 5 6 7 8 9 10 ; do + echo $targets_language + echo $attributes_language + echo $similarity_type + echo $test_number + echo $xspace + + dir="/work/gglavas/data/word_embs/yacle/mappings/new/smith/fasttext/${xspace}" + + if [ -d "$dir" ]; then + python xweat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/ft_xling_space-${xspace}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.res \ + --lower True \ + --use_glove False \ + --targets_lang $targets_language \ + --attributes_lang $attributes_language \ + --targets_embedding_vocab \ + ${dir}/vocab_${xspace}.${targets_language}.yacle.train.freq.5k.pkl \ + --targets_embedding_vectors \ + ${dir}/vectors_${xspace}.${targets_language}.yacle.train.freq.5k.np \ + --attributes_embedding_vocab \ + ${dir}/vocab_${xspace}.${attributes_language}.yacle.train.freq.5k.pkl \ + --attributes_embedding_vectors \ + ${dir}/vectors_${xspace}.${attributes_language}.yacle.train.freq.5k.np \ + --similarity_type $similarity_type |& tee ./results/ft_xling_space-${xspace}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.out + fi + done + done +done \ No newline at end of file diff --git a/fasttext_multiling.sh b/fasttext_multiling.sh index 941316e..f10c816 100644 --- a/fasttext_multiling.sh +++ b/fasttext_multiling.sh @@ -1,17 +1,8 @@ #!/usr/bin/env bash -#parser.add_argument("--test_number", type=int, help="Number of the weat test to run", required=False) -#parser.add_argument("--permutation_number", type=int, default=None, -# help="Number of permutations (otherwise all will be run)", required=False) -# parser.add_argument("--output_file", type=str, default=None, help="File to store the results)", required=False) -# parser.add_argument("--lower", type=bool, default=False, help="Whether to lower the vocab", required=False) -# parser.add_argument("--similarity_type", type=str, default="cosine", help="Which similarity function to use", -# required=False) -# parser.add_argument("--embedding_file", type=str) - #for similarity_type in "cosine" "csls" ; do -for similarity_type in "cosine" "csls" ; do +for similarity_type in "euclidean" ; do for test_number in 1 2 3 4 5 6 7 8 9 10 ; do - for language in "de" "es" "hr" "it" "ru" "tr" ; do + for language in "en" "de" "es" "hr" "it" "ru" "tr" ; do echo $language echo $similarity_type echo $test_number diff --git a/fasttext_multiling_cc.sh b/fasttext_multiling_cc.sh new file mode 100644 index 0000000..6913e74 --- /dev/null +++ b/fasttext_multiling_cc.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +#for similarity_type in "cosine" "csls" ; do +for similarity_type in "cosine" "euclidean" ; do + for test_number in 3 4 5 ; do + for language in "en" ; do # "de" "es" "hr" "it" "ru" "tr" ; do + echo $language + echo $similarity_type + echo $test_number + python weat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/fasttext_cc_${language}_${similarity_type}_${test_number}.res \ + --lower True \ + --use_glove False \ + --is_vec_format True \ + --lang $language \ + --embeddings \ + /work/anlausch/fasttext_cc/cc.${language}.300.vec \ + --similarity_type $similarity_type |& tee ./results/fasttext_cc_${language}_${similarity_type}_${test_number}.out + done + done +done \ No newline at end of file diff --git a/fasttext_xling.sh b/fasttext_xling.sh index a353aba..892b9ff 100644 --- a/fasttext_xling.sh +++ b/fasttext_xling.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash -for similarity_type in "cosine" ; do +#for similarity_type in "cosine" ; do +for similarity_type in "euclidean" ; do for test_number in 1 2 3 4 5 6 7 8 9 10 ; do for targets_language in "en" "de" "hr" "it" "ru" "tr" ; do for attributes_language in "en" "de" "hr" "it" "ru" "tr" ; do diff --git a/fasttext_xling2.sh b/fasttext_xling2.sh new file mode 100644 index 0000000..3878ae9 --- /dev/null +++ b/fasttext_xling2.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +#for similarity_type in "cosine" ; do +for similarity_type in "cosine" "euclidean" ; do + for targets_language in "en" "de" "hr" "it" "ru" "tr" ; do + for attributes_language in "en" "de" "hr" "it" "ru" "tr" ; do + for test_number in 6 7 8 9 10 1 2 ; do + echo $targets_language + echo $attributes_language + echo $similarity_type + echo $test_number + + dir_1="/work/gglavas/data/word_embs/yacle/mappings/new/smith/fasttext/${targets_language}-${attributes_language}" + dir_2="/work/gglavas/data/word_embs/yacle/mappings/new/smith/fasttext/${attributes_language}-${targets_language}" + + if [ -d "$dir_1" ]; then + embedding_dir=$dir_1 + + echo $embedding_dir + + python xweat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/ft_xling2_space-${targets_language}-${attributes_language}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.res \ + --lower True \ + --use_glove False \ + --targets_lang $targets_language \ + --attributes_lang $attributes_language \ + --targets_embedding_vocab \ + ${embedding_dir}/vocab_${targets_language}-${attributes_language}.${targets_language}.yacle.train.freq.5k.pkl \ + --targets_embedding_vectors \ + ${embedding_dir}/vectors_${targets_language}-${attributes_language}.${targets_language}.yacle.train.freq.5k.np \ + --attributes_embedding_vocab \ + ${embedding_dir}/vocab_${targets_language}-${attributes_language}.${attributes_language}.yacle.train.freq.5k.pkl \ + --attributes_embedding_vectors \ + ${embedding_dir}/vectors_${targets_language}-${attributes_language}.${attributes_language}.yacle.train.freq.5k.np \ + --similarity_type $similarity_type |& tee ./results/ft_xling2_space-${targets_language}-${attributes_language}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.out + fi + if [ -d "$dir_2" ]; then + embedding_dir=$dir_2 + echo $embedding_dir + python xweat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/ft_xling2_space-${attributes_language}-${targets_language}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.res \ + --lower True \ + --use_glove False \ + --targets_lang $targets_language \ + --attributes_lang $attributes_language \ + --targets_embedding_vocab \ + ${embedding_dir}/vocab_${attributes_language}-${targets_language}.${targets_language}.yacle.train.freq.5k.pkl \ + --targets_embedding_vectors \ + ${embedding_dir}/vectors_${attributes_language}-${targets_language}.${targets_language}.yacle.train.freq.5k.np \ + --attributes_embedding_vocab \ + ${embedding_dir}/vocab_${attributes_language}-${targets_language}.${attributes_language}.yacle.train.freq.5k.pkl \ + --attributes_embedding_vectors \ + ${embedding_dir}/vectors_${attributes_language}-${targets_language}.${attributes_language}.yacle.train.freq.5k.np \ + --similarity_type $similarity_type |& tee ./results/ft_xling2_space-${attributes_language}-${targets_language}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.out + fi + done + done + done +done \ No newline at end of file diff --git a/fasttext_xling_es.sh b/fasttext_xling_es.sh new file mode 100644 index 0000000..bd446bf --- /dev/null +++ b/fasttext_xling_es.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +#!/usr/bin/env bash + +#for similarity_type in "cosine" ; do +for similarity_type in "cosine" "euclidean" ; do + targets_language="es" + for attributes_language in "en" "de" "hr" "it" "ru" "tr" ; do + for test_number in 6 7 8 9 10 1 2 ; do + xspace=${targets_language}-${attributes_language} + echo $targets_language + echo $attributes_language + echo $similarity_type + echo $test_number + + dir_1="/work/gglavas/data/word_embs/yacle/mappings/new/smith/fasttext/${xspace}" + + if [ -d "$dir_1" ]; then + embedding_dir=$dir_1 + echo $embedding_dir + + python xweat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/ft_xling_space-${xspace}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.res \ + --lower True \ + --use_glove False \ + --targets_lang $targets_language \ + --attributes_lang $attributes_language \ + --targets_embedding_vocab \ + ${embedding_dir}/${targets_language}.vocab \ + --targets_embedding_vectors \ + ${embedding_dir}/${targets_language}.vectors \ + --attributes_embedding_vocab \ + ${embedding_dir}/${attributes_language}.vocab \ + --attributes_embedding_vectors \ + ${embedding_dir}/${attributes_language}.vectors \ + --similarity_type $similarity_type |& tee ./results/ft_xling_space-${xspace}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.out + fi + done + done +done + +for similarity_type in "cosine" "euclidean" ; do + attributes_language="es" + for targets_language in "en" "de" "hr" "it" "ru" "tr" ; do + for test_number in 6 7 8 9 10 1 2 ; do + xspace=${attributes_language}-${targets_language} + echo $targets_language + echo $attributes_language + echo $similarity_type + echo $test_number + + dir_1="/work/gglavas/data/word_embs/yacle/mappings/new/smith/fasttext/${xspace}" + + if [ -d "$dir_1" ]; then + embedding_dir=$dir_1 + echo $embedding_dir + + python xweat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/ft_xling_space-${xspace}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.res \ + --lower True \ + --use_glove False \ + --targets_lang $targets_language \ + --attributes_lang $attributes_language \ + --targets_embedding_vocab \ + ${embedding_dir}/${targets_language}.vocab \ + --targets_embedding_vectors \ + ${embedding_dir}/${targets_language}.vectors \ + --attributes_embedding_vocab \ + ${embedding_dir}/${attributes_language}.vocab \ + --attributes_embedding_vectors \ + ${embedding_dir}/${attributes_language}.vectors \ + --similarity_type $similarity_type |& tee ./results/ft_xling_space-${xspace}_ta-${targets_language}-${attributes_language}_${similarity_type}_${test_number}.out + fi + done + done +done \ No newline at end of file diff --git a/ft_postspecialized.sh b/ft_postspecialized.sh new file mode 100644 index 0000000..bcadfed --- /dev/null +++ b/ft_postspecialized.sh @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +#parser.add_argument("--test_number", type=int, help="Number of the weat test to run", required=False) +#parser.add_argument("--permutation_number", type=int, default=None, +# help="Number of permutations (otherwise all will be run)", required=False) +# parser.add_argument("--output_file", type=str, default=None, help="File to store the results)", required=False) +# parser.add_argument("--lower", type=bool, default=False, help="Whether to lower the vocab", required=False) +# parser.add_argument("--similarity_type", type=str, default="cosine", help="Which similarity function to use", +# required=False) +# parser.add_argument("--embedding_file", type=str) + +#for similarity_type in "cosine" "csls" ; do +for similarity_type in "cosine" "euclidean" ; do + for test_number in 1 2 3 4 5 6 7 8 9 10 ; do + echo $similarity_type + echo $test_number + python weat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/ft_postspec_en_${similarity_type}_${test_number}.res \ + --lower True \ + --use_glove False \ + --postspec True \ + --similarity_type $similarity_type |& tee ./results/ft_postspec_en_${similarity_type}_${test_number}.out + done +done \ No newline at end of file diff --git a/glove_en_cc.sh b/glove_en_cc.sh new file mode 100644 index 0000000..80c3304 --- /dev/null +++ b/glove_en_cc.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +#for similarity_type in "cosine" "csls" ; do +for similarity_type in "cosine" "euclidean" ; do + for test_number in 6 7 8 9 10 1 2 3 4 5 ; do + for language in "en" ; do + echo $language + echo $similarity_type + echo $test_number + python weat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/glove_cc_${language}_${similarity_type}_${test_number}_cased.res \ + --lower False \ + --use_glove False \ + --is_vec_format True \ + --lang $language \ + --embeddings \ + ~/glove.840B.300d.txt \ + --similarity_type $similarity_type |& tee ./results/glove_cc_${language}_${similarity_type}_${test_number}_cased.out + done + done +done \ No newline at end of file diff --git a/glove_en_tweets.sh b/glove_en_tweets.sh new file mode 100644 index 0000000..5441bf3 --- /dev/null +++ b/glove_en_tweets.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +#for similarity_type in "cosine" "csls" ; do +for similarity_type in "cosine" "euclidean" ; do + for test_number in 6 7 8 9 10 1 2 3 4 5 ; do + for language in "en" ; do + echo $language + echo $similarity_type + echo $test_number + python weat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/glove_twitter_${language}_${similarity_type}_${test_number}_cased.res \ + --lower False \ + --use_glove False \ + --is_vec_format True \ + --lang $language \ + --embeddings \ + /work/anlausch/glove_twitter/glove.twitter.27B.200d.txt \ + --similarity_type $similarity_type |& tee ./results/glove_twitter_${language}_${similarity_type}_${test_number}_cased.out + done + done +done \ No newline at end of file diff --git a/glove_reproduction.sh b/glove_reproduction.sh index ca6f4fc..0e06f9e 100644 --- a/glove_reproduction.sh +++ b/glove_reproduction.sh @@ -9,15 +9,16 @@ # parser.add_argument("--embedding_file", type=str) for similarity_type in "cosine" ; do +#for similarity_type in "euclidean" ; do for test_number in 1 2 3 4 5 6 7 8 9 10 ; do echo $similarity_type echo $test_number python weat.py \ --test_number $test_number \ --permutation_number 1000000 \ - --output_file ./results/glove_${similarity_type}_${test_number}.res \ - --lower True \ + --output_file ./results/glove_wiki_${similarity_type}_${test_number}_cased.res \ + --lower False \ --use_glove True \ - --similarity_type $similarity_type |& tee ./results/glove_${similarity_type}_${test_number}.out + --similarity_type $similarity_type |& tee ./results/glove_wiki_${similarity_type}_${test_number}_cased.out done done \ No newline at end of file diff --git a/latex.py b/latex.py new file mode 100644 index 0000000..28babc9 --- /dev/null +++ b/latex.py @@ -0,0 +1,102 @@ +import os.path +import codecs + +def main_results(): + with codecs.open("tables.txt", "w", "utf8") as o: + for i in [1, 2, 5, 6, 7, 8, 9]: + results = {} + for attribute_language in ["en", "de", "es", "it", "hr", "ru", "tr"]: + for target_language in ["en", "de", "es", "it", "hr", "ru", "tr"]: + file = "/work/anlausch/xweat/results/ft_xling_space-" + target_language + "-" + attribute_language + "_ta-" + target_language + "-" + attribute_language + "_cosine_" + str(i) + ".res" + + if not os.path.isfile(file): + file = "/work/anlausch/xweat/results/ft_xling_space-" + attribute_language + "-" + target_language + "_ta-" + target_language + "-" + attribute_language + "_cosine_" + str( + i) + ".res" + if os.path.isfile(file): + with codecs.open(file, "r", "utf8") as f: + for j,line in enumerate(f.readlines()): + if j == 1: + tuple = line.split("Result: ")[1] + tuple = eval(tuple) + effect_size = tuple[1] + p = tuple[2] + if p < 0.05 or p > 0.95: + effect_size = str(effect_size) + else: + effect_size = str(effect_size) + "*" + results[(attribute_language, target_language)] = effect_size + o.write("XWEAT " + str( + i) + "(T/A) & \\textbf{\\textsc{en}} & \\textbf{\\textsc{de}} & \\textbf{\\textsc{es}} & \\textbf{\\textsc{it}} & \\textbf{\textsc{hr}} & \\textbf{\textsc{ru}} & \\textbf{\\textsc{tr}}) \\\\\n") + for target_language in ["en", "de", "es", "it", "hr", "ru", "tr"]: + o.write("\\textbf{\\textsc{"+ target_language+"}} ") + for attribute_language in ["en", "de", "es", "it", "hr", "ru", "tr"]: + if target_language == attribute_language: + o.write("& -- ") + else: + if (attribute_language, target_language) in results: + o.write("&" + results[(attribute_language, target_language)]) + else: + o.write("& &") + if attribute_language == "tr": + o.write("\\\\\n") + +def main(): + with codecs.open("tables_avg.txt", "w", "utf8") as o: + results = {} + for i in [1, 2, 6, 7, 8, 9]: + for attribute_language in ["en", "de", "es", "it", "hr", "ru", "tr"]: + for target_language in ["en", "de", "es", "it", "hr", "ru", "tr"]: + file = "/work/anlausch/xweat/results/ft_xling_space-" + target_language + "-" + attribute_language + "_ta-" + target_language + "-" + attribute_language + "_cosine_" + str(i) + ".res" + + if not os.path.isfile(file): + file = "/work/anlausch/xweat/results/ft_xling_space-" + attribute_language + "-" + target_language + "_ta-" + target_language + "-" + attribute_language + "_cosine_" + str( + i) + ".res" + if os.path.isfile(file): + with codecs.open(file, "r", "utf8") as f: + for j,line in enumerate(f.readlines()): + if j == 1: + tuple = line.split("Result: ")[1] + tuple = eval(tuple) + effect_size = tuple[1] + p = tuple[2] + if (attribute_language, target_language) in results: + effect_sizes, old_significance = results[(attribute_language, target_language)] + effect_sizes.append(effect_size) + if not old_significance or not (p < 0.05 or p > 0.95): + results[(attribute_language, target_language)] = (effect_sizes, False) + else: + results[(attribute_language, target_language)] = (effect_sizes, True) + elif not (attribute_language, target_language) in results and (p < 0.05 or p > 0.95): + results[(attribute_language, target_language)] = ([effect_size], True) + elif not (attribute_language, target_language) in results and not (p < 0.05 or p > 0.95): + results[(attribute_language, target_language)] = ([effect_size], False) + else: + raise NotImplementedError() + results_transformed = {} + for key, value in results.items(): + effect_sizes, significance = value + if not len(effect_sizes) == 6: + print("Problem") + assert len(effect_sizes) == 6 + effect_size = sum(effect_sizes)/len(effect_sizes) + if significance: + results_transformed[key] = str(effect_size) + else: + results_transformed[key] = str(effect_size) + "*" + o.write("XWEAT " + str( + i) + "(T/A) & \\textbf{\\textsc{en}} & \\textbf{\\textsc{de}} & \\textbf{\\textsc{es}} & \\textbf{\\textsc{it}} & \\textbf{\textsc{hr}} & \\textbf{\textsc{ru}} & \\textbf{\\textsc{tr}}) \\\\\n") + for target_language in ["en", "de", "es", "it", "hr", "ru", "tr"]: + o.write("\\textbf{\\textsc{"+ target_language+"}} ") + for attribute_language in ["en", "de", "es", "it", "hr", "ru", "tr"]: + if target_language == attribute_language: + o.write("& -- ") + else: + if (attribute_language, target_language) in results_transformed: + o.write("&" + results_transformed[(attribute_language, target_language)]) + else: + o.write("& &") + if attribute_language == "tr": + o.write("\\\\\n") + +if __name__=="__main__": + main() \ No newline at end of file diff --git a/utils.py b/utils.py index f4d2382..0e97199 100644 --- a/utils.py +++ b/utils.py @@ -74,14 +74,25 @@ def __call__(self, doc): # returns a dictionary of embeddings def load_embeddings(path, word2vec=False, rdf2vec=False): + """ + >>> load_embeddings("/work/anlausch/glove_twitter/glove.twitter.27B.200d.txt") + :param path: + :param word2vec: + :param rdf2vec: + :return: + """ embbedding_dict = {} if word2vec == False and rdf2vec == False: with codecs.open(path, "rb", "utf8", "ignore") as infile: for line in infile: - parts = line.split() - word = parts[0] - nums = [float(p) for p in parts[1:]] - embbedding_dict[word] = nums + try: + parts = line.split() + word = parts[0] + nums = [float(p) for p in parts[1:]] + embbedding_dict[word] = nums + except Exception as e: + print(line) + continue return embbedding_dict elif word2vec == True: #Load Google's pre-trained Word2Vec model. diff --git a/w2v_multiling_wiki.sh b/w2v_multiling_wiki.sh new file mode 100644 index 0000000..a964bf3 --- /dev/null +++ b/w2v_multiling_wiki.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +#for similarity_type in "cosine" "csls" ; do +for similarity_type in "cosine" "euclidean" ; do + for language in "en" "de" "es" "hr" "it" "ru" "tr" ; do + for test_number in 3 4 5 6 10; do + echo $language + echo $similarity_type + echo $test_number + python weat.py \ + --test_number $test_number \ + --permutation_number 1000000 \ + --output_file ./results/w2v_wiki_${language}_${similarity_type}_${test_number}_cased.res \ + --lower False \ + --use_glove False \ + --is_vec_format True \ + --lang $language \ + --embeddings \ + /work/gglavas/data/word_embs/yacle/cbow/cbow.wiki.${language}.300w5.vec \ + --similarity_type $similarity_type |& tee ./results/w2v_wiki_${language}_${similarity_type}_${test_number}_cased.out + done + done +done \ No newline at end of file diff --git a/weat.py b/weat.py index a811e6d..8af4500 100644 --- a/weat.py +++ b/weat.py @@ -459,27 +459,41 @@ def load_vocab_goran(path): def load_vectors_goran(path): return np.load(path) -def load_embedding_dict(vocab_path="", vector_path="", glove=False): +def load_embedding_dict(vocab_path="", vector_path="", embeddings_path="", glove=False, postspec=False): """ >>> _load_embedding_dict() :param vocab_path: :param vector_path: :return: embd_dict """ - if glove: + if glove and postspec: + raise ValueError("Glove and postspec cannot both be true") + elif glove: if os.name == "nt": embd_dict = utils.load_embeddings("C:/Users/anlausch/workspace/embedding_files/glove.6B/glove.6B.300d.txt", word2vec=False) else: embd_dict = utils.load_embeddings("/work/anlausch/glove.6B.300d.txt", word2vec=False) return embd_dict - embd_dict = {} - vocab = load_vocab_goran(vocab_path) - vectors = load_vectors_goran(vector_path) - for term, index in vocab.items(): - embd_dict[term] = vectors[index] - assert len(embd_dict) == len(vocab) - return embd_dict + elif postspec: + embd_dict_temp = utils.load_embeddings("/work/anlausch/ft_postspec.txt", word2vec=False) + embd_dict = {} + for key, value in embd_dict_temp.items(): + embd_dict[key.split("en_")[1]] = value + assert("test" in embd_dict) + assert ("house" in embd_dict) + return embd_dict + elif embeddings_path != "": + embd_dict = utils.load_embeddings(embeddings_path, word2vec=False) + return embd_dict + else: + embd_dict = {} + vocab = load_vocab_goran(vocab_path) + vectors = load_vectors_goran(vector_path) + for term, index in vocab.items(): + embd_dict[term] = vectors[index] + assert len(embd_dict) == len(vocab) + return embd_dict def translate(translation_dict, terms): translation = [] @@ -500,6 +514,46 @@ def translate(translation_dict, terms): return translation +def compute_oov_percentage(): + """ + >>> compute_oov_percentage() + :return: + """ + with codecs.open("./results/oov_short.txt", "w", "utf8") as f: + for test in range(1,11): + f.write("Test %d \n" % test) + targets_1, targets_2, attributes_1, attributes_2 = XWEAT().__getattribute__("weat_" + str(test))() + vocab = targets_1 + targets_2 + attributes_1 + attributes_2 + vocab = [t.lower() for t in vocab] + #f.write("English vocab: %s \n" % str(vocab)) + for language in ["en", "es", "de", "tr", "ru", "hr", "it"]: + if language != "en": + #f.write("Translating terms from en to %s\n" % language) + translation_dict = load_vocab_goran("./data/vocab_dict_en_" + language + ".p") + vocab_translated = translate(translation_dict, vocab) + vocab_translated = [t.lower() for t in vocab_translated] + #f.write("Translated terms %s\n" % str(vocab)) + embd_dict = load_embedding_dict(vocab_path="/work/gglavas/data/word_embs/yacle/fasttext/200K/npformat/ft.wiki."+language+".300.vocab", vector_path="/work/gglavas/data/word_embs/yacle/fasttext/200K/npformat/ft.wiki."+language+".300.vectors") + ins=[] + not_ins=[] + if language != "en": + for term in vocab_translated: + if term in embd_dict: + ins.append(term) + else: + not_ins.append(term) + else: + for term in vocab: + if term in embd_dict: + ins.append(term) + else: + not_ins.append(term) + #f.write("OOVs: %s\n" % str(not_ins)) + f.write("OOV Percentage for language %s: %s\n" % (language, (len(not_ins)/len(vocab)))) + f.write("\n") + f.close() + + def main(): def boolean_string(s): if s not in {'False', 'True', 'false', 'true'}: @@ -516,6 +570,9 @@ def boolean_string(s): parser.add_argument("--embedding_vocab", type=str, help="Vocab of the embeddings") parser.add_argument("--embedding_vectors", type=str, help="Vectors of the embeddings") parser.add_argument("--use_glove", type=boolean_string, default=False, help="Use glove") + parser.add_argument("--postspec", type=boolean_string, default=False, help="Use postspecialized fasttext") + parser.add_argument("--is_vec_format", type=boolean_string, default=False, help="Whether embeddings are in vec format") + parser.add_argument("--embeddings", type=str, help="Vectors and vocab of the embeddings") parser.add_argument("--lang", type=str, default="en", help="Language to test") args = parser.parse_args() @@ -563,6 +620,12 @@ def boolean_string(s): if args.use_glove: logging.info("Using glove") embd_dict = load_embedding_dict(glove=True) + elif args.postspec: + logging.info("Using postspecialized embeddings") + embd_dict=load_embedding_dict(postspec=True) + elif args.is_vec_format: + logging.info("Embeddings are in vec format") + embd_dict = load_embedding_dict(embeddings_path=args.embeddings, glove=False) else: embd_dict = load_embedding_dict(vocab_path=args.embedding_vocab, vector_path=args.embedding_vectors, glove=False) weat.set_embd_dict(embd_dict)