
Commit ebe94df (1 parent: 7ad7e84)

3 files changed: +22 −23 lines


test.cpp.seq2seq (+6 −12)

@@ -30,7 +30,7 @@ void toPVariable(PVariable x1, float *X){
 
 
 
-WordEmbed *load_data(string filename, int vocab_size, bool addEOS, bool addSOS){
+WordEmbed *load_data(string filename, int vocab_size, bool tokenize, bool addEOS){
 
 
     std::ifstream reading_file(filename, std::ios::in);
@@ -48,7 +48,7 @@ WordEmbed *load_data(string filename, int vocab_size, bool addEOS, bool addSOS){
 
     WordEmbed *wd = new WordEmbed(vocab_size);
 
-    wd->addSentences(sequences, false, addEOS, addSOS);
+    wd->addSentences(sequences, tokenize, addEOS);
 
     return wd;
 }
@@ -140,7 +140,6 @@ PVariable attention_hidden_state(PVariable h, PVariable a){
 
     return model.G("attention_linear_tanh")->forward(attention_plus);
 }
-///////////////////////////
 
 
 int get_max_vocab_size(vector<vector<int>> &seqs_ids, int batch_size, int k){
@@ -155,8 +154,6 @@ int get_max_vocab_size(vector<vector<int>> &seqs_ids, int batch_size, int k){
 vector<PVariable> encoder(vector<vector<int>> &seqs_ids_ja, WordEmbed *wd_ja, int batch_size, int vocab_size, int k){
 
     int max_vocab_size_ja = get_max_vocab_size(seqs_ids_ja, batch_size, k);
-    //cout << "max_vocab_size_ja:" << max_vocab_size_ja << endl;
-
 
     vector<PVariable> src_hidden_states;
 
@@ -173,7 +170,6 @@ vector<PVariable> encoder(vector<vector<int>> &seqs_ids_ja, WordEmbed *wd_ja, in
             reverse(word_ids.begin(), word_ids.end());
 
             bool ignore = false;
-            //if (word_ids[j] == wd_ja->PAD_ID) ignore = true;
             wd_ja->toOneHot(vocab_size, data_ja, word_ids[j], batch_idx, ignore);
             batch_idx++;
         }
@@ -308,7 +304,6 @@ PVariable forward_one_step(vector<vector<int>> &seqs_ids_ja, vector<vector<int>>
             wd_en->padding(word_ids, max_vocab_size_en);
 
             bool ignore = false;
-            //if (word_ids[j] == wd_en->PAD_ID) ignore = true;
             wd_en->toOneHot(vocab_size, data_en, word_ids[j], batch_idx, ignore);
             batch_idx++;
         }
@@ -342,11 +337,10 @@ int main(){
     float clip_grad_threshold = 0;
     float learning_rate = 0.001; //ADAM
 
-    int epoch = 100;
-
+    int epoch = 20;
 
-    WordEmbed *wd_ja = load_data("tanaka_corpus_j_10000.txt", vocab_size, false, false);
-    WordEmbed *wd_en = load_data("tanaka_corpus_e_10000.txt", vocab_size, true, false);
+    WordEmbed *wd_ja = load_data("tanaka_corpus_j_10000.txt.train", vocab_size, true, false);
+    WordEmbed *wd_en = load_data("tanaka_corpus_e_10000.txt.train", vocab_size, true, true);
 
     vector<vector<int>> seqs_ids_ja = wd_ja->getSequencesIds();
     vector<vector<int>> seqs_ids_en = wd_en->getSequencesIds();
@@ -471,7 +465,7 @@ int main(){
         ((FullLSTM2 *) model.G("lstm_ja"))->reset_state();
         ((FullLSTM2 *) model.G("lstm_en"))->reset_state();
     }
-
+
     delete wd_ja;
     delete wd_en;
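For reference, a minimal sketch of call sites under the reworked load_data signature, as the commit uses it in main(): the third argument now selects MeCab tokenization and the fourth appends an <eos> marker per sentence. The filenames and vocab_size value below are hypothetical stand-ins, not taken from this repository:

    // Hypothetical call sites illustrating the new parameter order.
    // Source side: tokenize, no <eos>; target side: tokenize and append
    // <eos> so the decoder can learn an end-of-sequence symbol.
    int vocab_size = 10000;  // assumed value for illustration
    WordEmbed *wd_src = load_data("corpus_src.txt.train", vocab_size,
                                  /*tokenize=*/true, /*addEOS=*/false);
    WordEmbed *wd_tgt = load_data("corpus_tgt.txt.train", vocab_size,
                                  /*tokenize=*/true, /*addEOS=*/true);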

tokenizer.h (+4 −3)

@@ -33,7 +33,8 @@ class Tokenizer {
 public:
     Tokenizer(){
         //tagger = MeCab::createTagger("-Owakati");
-        tagger = MeCab::createTagger("-xunknown");
+        //tagger = MeCab::createTagger("-xunknown -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd");
+        tagger = MeCab::createTagger("-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd");
     }
     ~Tokenizer(){
         delete tagger;
@@ -50,12 +51,12 @@ class Tokenizer {
 
         for (; node; node = node->next) {
             string feature(node->feature);
-            if (feature.find("名詞")==0 || feature.find("未知語")==0){
+            //if (feature.find("名詞")==0 || feature.find("未知語")==0){
                 strcpy(buf,node->surface);
                 buf[node->length]='\0';
                 string surface(buf);
                 result.push_back(surface);
-            }
+            //}
         }
         return result;
     }

word_embed.h (+12 −8)

@@ -89,9 +89,9 @@ class WordEmbed {
     }
 
 
-    void addSentences(vector<string> seqs, bool tokenize, bool addEOS, bool addSOS){
+    void addSentences(vector<string> seqs, bool tokenize, bool addEOS){
         for (auto s : seqs){
-            add(s, tokenize, addEOS, addSOS);
+            add(s, tokenize, addEOS);
         }
 
         vector<pair<string, int> > pairs(words_count.size());
@@ -129,23 +129,27 @@ class WordEmbed {
 
     }
 
-    void add(string sentence, bool tokenize, bool addEOS, bool addSOS){
+    void add(string sentence, bool tokenize, bool addEOS){
 
         if (sentence == "") return;
 
-        if (addSOS) sentence = "<sos> " + sentence;
-        if (addEOS) sentence += " <eos>";
 
-        vector<string> words;
+        vector<string> words, words_final;
         if (tokenize) words = token.parse(sentence);
         else words = split(sentence, ' ');
 
-        for (auto w : words) {
+        if (addEOS) words.push_back("<eos>");
+
+        for (auto w : words){
+            if (w != "") words_final.push_back(w);
+        }
+
+        for (auto w : words_final) {
             if (words_count.count(w) == 0) words_count[w] = 1;
             else words_count[w] += 1;
         }
 
-        sequences.push_back(words);
+        sequences.push_back(words_final);
     }
 
 
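To make the reworked add() flow concrete, here is a minimal standalone sketch (a plain whitespace split standing in for the MeCab path, and no WordEmbed state): tokenize first, then append <eos> as its own token rather than splicing it into the raw sentence string, then drop empty tokens before counting and storing. Everything below is illustrative; only the ordering mirrors the diff:

    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
        std::string sentence = "this is a pen";
        bool addEOS = true;

        // Stand-in for the tokenize/split step in add().
        std::vector<std::string> words;
        std::istringstream iss(sentence);
        for (std::string w; iss >> w; ) words.push_back(w);

        // <eos> is appended as a token after tokenization, so the
        // tokenizer can never split or mangle it.
        if (addEOS) words.push_back("<eos>");

        // Empty surfaces (e.g. MeCab BOS/EOS nodes) are filtered out.
        std::vector<std::string> words_final;
        for (const auto &w : words)
            if (!w.empty()) words_final.push_back(w);

        for (const auto &w : words_final) std::cout << w << " ";
        std::cout << "\n";  // prints: this is a pen <eos>
        return 0;
    }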
