From 0eefedf8cc3d01e5608ff42b414b2eb127d64fab Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Wed, 24 Mar 2021 09:13:01 +0000 Subject: [PATCH 001/135] Dynamic swap working, as long as the vocabularies are the same --- src/CMakeLists.txt | 7 +- src/command/marian_swapper.cpp | 140 +++++++++++++++++++++++++++++++++ src/common/config_parser.cpp | 3 +- src/graph/expression_graph.h | 12 +++ src/graph/parameters.h | 3 +- src/tensors/gpu/swap.cu | 10 +++ src/tensors/gpu/swap.h | 6 ++ 7 files changed, 178 insertions(+), 3 deletions(-) create mode 100644 src/command/marian_swapper.cpp create mode 100644 src/tensors/gpu/swap.cu create mode 100644 src/tensors/gpu/swap.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 64b86a695..397a0330f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -182,6 +182,7 @@ if(CUDA_FOUND) tensors/gpu/add_all.cu tensors/gpu/tensor_operators.cu tensors/gpu/cudnn_wrappers.cu + tensors/gpu/swap.cu translator/nth_element.cu translator/helpers.cu STATIC) @@ -213,6 +214,10 @@ if (NOT COMPILE_LIBRARY_ONLY) set_target_properties(marian_decoder PROPERTIES OUTPUT_NAME marian-decoder) target_compile_options(marian_decoder PRIVATE ${ALL_WARNINGS}) + add_executable(marian_swapper command/marian_swapper.cpp) + set_target_properties(marian_swapper PROPERTIES OUTPUT_NAME marian_swapper) + target_compile_options(marian_swapper PRIVATE ${ALL_WARNINGS}) + add_executable(marian_scorer command/marian_scorer.cpp) set_target_properties(marian_scorer PROPERTIES OUTPUT_NAME marian-scorer) target_compile_options(marian_scorer PRIVATE ${ALL_WARNINGS}) @@ -225,7 +230,7 @@ if (NOT COMPILE_LIBRARY_ONLY) set_target_properties(marian_conv PROPERTIES OUTPUT_NAME marian-conv) target_compile_options(marian_conv PRIVATE ${ALL_WARNINGS}) - set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv) + set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_swapper marian_scorer marian_vocab marian_conv) # 
marian.zip and marian.tgz # This combines marian, marian_decoder in a single ZIP or TAR file for diff --git a/src/command/marian_swapper.cpp b/src/command/marian_swapper.cpp new file mode 100644 index 000000000..3b9a4f75c --- /dev/null +++ b/src/command/marian_swapper.cpp @@ -0,0 +1,140 @@ +#include "marian.h" +#include "common/logging.h" +#include "data/corpus.h" +#include "data/text_input.h" +#include "translator/beam_search.h" +#include "translator/translator.h" +#include "common/io.h" +#include "common/timer.h" +#include +#include "tensors/gpu/swap.h" +namespace marian { +class SwapperTranslator { + private: + Ptr opts_; + Ptr graph_; + Ptr scorer_; + + std::vector> srcVocabs_; + Ptr trgVocab_; + + // Models to store model; + bool primary_ = true; + std::vector primaryModel_; + std::vector secondaryModel_; + + std::vector prepareItem(std::string path){ + std::vector ret = io::loadItems(path); + // Find the special element and remove it: + size_t special_idx = 0; + for (size_t i = 0; i < ret.size(); i++) { + if (ret[i].name == "special:model.yml") { + special_idx = i; + break; + } + } + ret.erase(ret.begin() + special_idx); + // Prepare the name so that it matches the named map + for (auto&& item : ret) { + item.name = "F0::" + item.name; + } + return ret; + } + + public: + SwapperTranslator(Ptr opt) : opts_(opt), + primaryModel_(prepareItem(opt->get>("models")[0])), + secondaryModel_(prepareItem(opt->get("swap-model"))) { + opts_->set("inference", true); + opts_->set("shuffle", "none"); + + // Get vocabs + auto vocabPaths = opts_->get>("vocabs"); + std::vector maxVocabs = opts_->get>("dim-vocabs"); + + for(size_t i = 0; i < vocabPaths.size() - 1; ++i) { + Ptr vocab = New(opts_, i); + vocab->load(vocabPaths[i], maxVocabs[i]); + srcVocabs_.emplace_back(vocab); + } + + trgVocab_ = New(opts_, vocabPaths.size() - 1); + trgVocab_->load(vocabPaths.back()); + + // get device IDs + auto devices = Config::getDevices(opts_); + auto numDevices = devices.size(); + std::cerr 
<< "Num devices: " << numDevices << std::endl; + + // Create graph + graph_ = New(); + auto prec = opts_->get>("precision", {"float32"}); + graph_->setDefaultElementType(typeFromString(prec[0])); + graph_->setDevice(devices[0]); + graph_->reserveWorkspaceMB(opts_->get("workspace")); + scorer_ = createScorers(opts_)[0]; + scorer_->init(graph_); + graph_->forward(); + } + + void translateTxt(std::string txt) { + std::vector instr(1, txt); + auto corpus_ = New(instr, srcVocabs_, opts_); + data::BatchGenerator batchGenerator(corpus_, opts_, nullptr, false); + + static const std::vector > scorers(1, scorer_); + auto search = New(opts_, scorers, trgVocab_); + auto printer = New(opts_, trgVocab_); + static int i = 0; + for (auto&& batch : batchGenerator) { + auto histories = search->search(graph_, batch); + for(auto history : histories) { + std::stringstream best1; + std::stringstream bestn; + printer->print(history, best1, bestn); + LOG(info, "Translation {} : {}", i, best1.str()); + i++; + } + } + } + + void swapActual(std::vector& from) { + auto namedMap = graph_->getParamsNamedMap(); + for (auto&& item : from) { + auto to = reinterpret_cast(namedMap[item.name]->val()->memory()->data()); + swapper::copyCpuToGpu(to, &item.bytes[0], item.bytes.size()); + } + } + + void swap() { + timer::Timer timer; + if (primary_) { + swapActual(secondaryModel_); + primary_ = false; + } else { + swapActual(primaryModel_); + primary_ = true; + } + LOG(info, "Swap took: {:.8f}s wall", timer.elapsed()); + } +}; +} // namespace marian + +int main(int argc, char** argv) { + using namespace marian; + auto options = parseOptions(argc, argv, cli::mode::translation); + SwapperTranslator swapper(options); + + std::string line; + while (std::getline(std::cin, line)) { + if (line == "quit") { + break; + } else if (line == "swap") { + swapper.swap(); + } else { + swapper.translateTxt(line); + } + } + + return 0; +} diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 
602509c59..530fad69a 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -689,7 +689,8 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { cli.add>("--output-approx-knn", "Use approximate knn search in output layer (currently only in transformer)") ->implicit_val("100 1024"); - + cli.add("--swap-model", + "Path to model to swap to."); #if 0 // @TODO: Ask Hany if there are any decoding-time options // add ULR settings addSuboptionsULR(cli); diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index adc0aeae9..2fa28f67b 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -232,6 +232,18 @@ class ExpressionGraph : public std::enable_shared_from_this { namespace_ = newNamespace; } + const std::unordered_map & getParamsNamedMap() const { + if (paramsByElementType_.size() != 1) { + ABORT("Expected exactly one parameter datatype, got", paramsByElementType_.size()); + } + for(auto&& kvParams : paramsByElementType_) { + auto cur_param = kvParams.second; + return cur_param->getMap(); + } + ABORT("We should never get here"); // Just to satisfy compiler warnings; + return paramsByElementType_.find(Type::float32)->second->getMap(); + } + /** * Copy all parameter objects from one graph to current graph. * @param graph a pointer to a graph object diff --git a/src/graph/parameters.h b/src/graph/parameters.h index 8b4af9dd5..d5ede0b4e 100644 --- a/src/graph/parameters.h +++ b/src/graph/parameters.h @@ -2,6 +2,7 @@ #include #include +#include #include #include "common/definitions.h" @@ -22,7 +23,7 @@ class Parameters { /** @brief List of all parameter nodes of this expression graph. 
*/ std::vector params_; - std::map named_; + std::unordered_map named_; Ptr vals_; Ptr grads_; diff --git a/src/tensors/gpu/swap.cu b/src/tensors/gpu/swap.cu new file mode 100644 index 000000000..3eb2c0df8 --- /dev/null +++ b/src/tensors/gpu/swap.cu @@ -0,0 +1,10 @@ +#include "cuda_helpers.h" +void copyCpuToGpu(const char * in, char * gpuOut); + +namespace marian { + namespace swapper { + void copyCpuToGpu(char * gpuOut, const char * in, size_t count) { + CUDA_CHECK(cudaMemcpy(gpuOut, in, count, cudaMemcpyHostToDevice)); + } + } +} diff --git a/src/tensors/gpu/swap.h b/src/tensors/gpu/swap.h new file mode 100644 index 000000000..86b3094d7 --- /dev/null +++ b/src/tensors/gpu/swap.h @@ -0,0 +1,6 @@ +#include +namespace marian { + namespace swapper { + void copyCpuToGpu(char * gpuOut, const char * in, size_t count); + } +} From 521f6343cb9ea86549e15cd11d0d670dd1ddbc7c Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 28 Mar 2021 18:09:20 +0000 Subject: [PATCH 002/135] Model and GPUSlot separation, add vocab support --- src/command/marian_swapper.cpp | 218 +++++++++++++++++---------------- 1 file changed, 115 insertions(+), 103 deletions(-) diff --git a/src/command/marian_swapper.cpp b/src/command/marian_swapper.cpp index 3b9a4f75c..7c5e92abe 100644 --- a/src/command/marian_swapper.cpp +++ b/src/command/marian_swapper.cpp @@ -9,130 +9,142 @@ #include #include "tensors/gpu/swap.h" namespace marian { -class SwapperTranslator { - private: - Ptr opts_; - Ptr graph_; - Ptr scorer_; - - std::vector> srcVocabs_; - Ptr trgVocab_; - - // Models to store model; - bool primary_ = true; - std::vector primaryModel_; - std::vector secondaryModel_; - - std::vector prepareItem(std::string path){ - std::vector ret = io::loadItems(path); - // Find the special element and remove it: - size_t special_idx = 0; - for (size_t i = 0; i < ret.size(); i++) { - if (ret[i].name == "special:model.yml") { - special_idx = i; - break; - } - } - ret.erase(ret.begin() + special_idx); - // 
Prepare the name so that it matches the named map - for (auto&& item : ret) { - item.name = "F0::" + item.name; - } - return ret; - } - public: - SwapperTranslator(Ptr opt) : opts_(opt), - primaryModel_(prepareItem(opt->get>("models")[0])), - secondaryModel_(prepareItem(opt->get("swap-model"))) { - opts_->set("inference", true); - opts_->set("shuffle", "none"); - - // Get vocabs - auto vocabPaths = opts_->get>("vocabs"); - std::vector maxVocabs = opts_->get>("dim-vocabs"); - - for(size_t i = 0; i < vocabPaths.size() - 1; ++i) { - Ptr vocab = New(opts_, i); - vocab->load(vocabPaths[i], maxVocabs[i]); - srcVocabs_.emplace_back(vocab); +/* A model loaded on the CPU and possibly on a GPU */ +class Model { + private: + std::vector parameters_; + std::vector> srcVocabs_; + Ptr trgVocab_; + + public: + Model(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) + : parameters_(io::loadItems(parameters)) { + // Load parameters. + // Find the special element and remove it: + size_t special_idx = 0; + for (size_t i = 0; i < parameters_.size(); i++) { + if (parameters_[i].name == "special:model.yml") { + special_idx = i; + break; } + } + parameters_.erase(parameters_.begin() + special_idx); + // Prepare the name so that it matches the named map + for (auto&& item : parameters_) { + item.name = "F0::" + item.name; + } - trgVocab_ = New(opts_, vocabPaths.size() - 1); - trgVocab_->load(vocabPaths.back()); + // Load source vocabs. 
+ const std::vector &maxVocabs = options->get>("dim-vocabs"); + for(size_t i = 0; i < sourceVocabPaths.size(); ++i) { + Ptr vocab = New(options, i); + vocab->load(sourceVocabPaths[i], maxVocabs[i]); + srcVocabs_.emplace_back(vocab); + } - // get device IDs - auto devices = Config::getDevices(opts_); - auto numDevices = devices.size(); - std::cerr << "Num devices: " << numDevices << std::endl; - - // Create graph - graph_ = New(); - auto prec = opts_->get>("precision", {"float32"}); - graph_->setDefaultElementType(typeFromString(prec[0])); - graph_->setDevice(devices[0]); - graph_->reserveWorkspaceMB(opts_->get("workspace")); - scorer_ = createScorers(opts_)[0]; - scorer_->init(graph_); - graph_->forward(); + // Load target vocab. + trgVocab_ = New(options, sourceVocabPaths.size()); + trgVocab_->load(targetVocabPath); } - void translateTxt(std::string txt) { - std::vector instr(1, txt); - auto corpus_ = New(instr, srcVocabs_, opts_); - data::BatchGenerator batchGenerator(corpus_, opts_, nullptr, false); - - static const std::vector > scorers(1, scorer_); - auto search = New(opts_, scorers, trgVocab_); - auto printer = New(opts_, trgVocab_); - static int i = 0; - for (auto&& batch : batchGenerator) { - auto histories = search->search(graph_, batch); - for(auto history : histories) { - std::stringstream best1; - std::stringstream bestn; - printer->print(history, best1, bestn); - LOG(info, "Translation {} : {}", i, best1.str()); - i++; - } - } + const std::vector &Parameters() const { return parameters_; } + + const std::vector> &SrcVocabs() const { return srcVocabs_; } + + Ptr TrgVocab() const { return trgVocab_; } +}; + +/* Reserved space on a GPU with which to translate */ +class GPUSlot { + private: + Ptr options_; + Ptr graph_; + std::vector > scorers_; + + // Last model used for translation. Used to skip loading. 
+ const Model *loadedModel_; + + void Load(const std::vector ¶meters) { + timer::Timer timer; + auto namedMap = graph_->getParamsNamedMap(); + for (auto&& item : parameters) { + auto to = reinterpret_cast(namedMap[item.name]->val()->memory()->data()); + swapper::copyCpuToGpu(to, &item.bytes[0], item.bytes.size()); + } + LOG(info, "Load took: {:.8f}s wall", timer.elapsed()); } - void swapActual(std::vector& from) { - auto namedMap = graph_->getParamsNamedMap(); - for (auto&& item : from) { - auto to = reinterpret_cast(namedMap[item.name]->val()->memory()->data()); - swapper::copyCpuToGpu(to, &item.bytes[0], item.bytes.size()); - } + public: + explicit GPUSlot(Ptr options) : options_(options), loadedModel_(nullptr) { + options_->set("inference", true); + options_->set("shuffle", "none"); + // get device IDs + auto devices = Config::getDevices(options_); + auto numDevices = devices.size(); + std::cerr << "Num devices: " << numDevices << std::endl; + + // Create graph + graph_ = New(); + auto prec = options_->get>("precision", {"float32"}); + graph_->setDefaultElementType(typeFromString(prec[0])); + graph_->setDevice(devices[0]); + graph_->reserveWorkspaceMB(options_->get("workspace")); + // TODO: multiple scorers. 
+ Ptr scorer = createScorers(options_)[0]; + scorer->init(graph_); + scorers_.push_back(scorer); + graph_->forward(); } - void swap() { - timer::Timer timer; - if (primary_) { - swapActual(secondaryModel_); - primary_ = false; - } else { - swapActual(primaryModel_); - primary_ = true; + void Translate(const Model &model, const std::vector &input) { + if (loadedModel_ != &model) { + Load(model.Parameters()); + loadedModel_ = &model; + } + auto corpus = New(input, model.SrcVocabs(), options_); + data::BatchGenerator batchGenerator(corpus, options_, nullptr, false); + + auto search = New(options_, scorers_, model.TrgVocab()); + auto printer = New(options_, model.TrgVocab()); + for (auto&& batch : batchGenerator) { + auto histories = search->search(graph_, batch); + for(auto history : histories) { + std::stringstream best1; + std::stringstream bestn; + printer->print(history, best1, bestn); + LOG(info, "Translation {}", best1.str()); } - LOG(info, "Swap took: {:.8f}s wall", timer.elapsed()); + } } }; + } // namespace marian +/* Demo program */ int main(int argc, char** argv) { using namespace marian; - auto options = parseOptions(argc, argv, cli::mode::translation); - SwapperTranslator swapper(options); + Ptr options = parseOptions(argc, argv, cli::mode::translation); + GPUSlot slot(options); + Model pten(options, + "/home/ubuntu/consistent-big-models/padded/pten.npz", + {"/home/ubuntu/consistent-big-models/padded/pten.vocab"}, + "/home/ubuntu/consistent-big-models/padded/pten.vocab"); + + Model enit(options, + "/home/ubuntu/consistent-big-models/padded/enit.npz", + {"/home/ubuntu/consistent-big-models/padded/enit.vocab"}, + "/home/ubuntu/consistent-big-models/padded/enit.vocab"); + const Model *model = &pten; std::string line; while (std::getline(std::cin, line)) { - if (line == "quit") { - break; - } else if (line == "swap") { - swapper.swap(); + if (line == " TRANSLATE PTEN") { + model = &pten; + } else if (line == " TRANSLATE ENIT") { + model = &enit; } else { - 
swapper.translateTxt(line); + slot.Translate(*model, {line}); } } From 67190dba9a6bbd1539d5a47c3439cd442058c84e Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 28 Mar 2021 18:34:46 +0000 Subject: [PATCH 003/135] Add vocabulary padding script --- scripts/contrib/pad_model_vocabulary.py | 52 +++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100755 scripts/contrib/pad_model_vocabulary.py diff --git a/scripts/contrib/pad_model_vocabulary.py b/scripts/contrib/pad_model_vocabulary.py new file mode 100755 index 000000000..eca73e34a --- /dev/null +++ b/scripts/contrib/pad_model_vocabulary.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# Pads a Marian model's vocabulary to have greater size. The added tokens have +# zero probability. +# ./pad_model_vocabulary.py input.npz output.npz desired_vocab_size +# +# You'll also need to separately pad your vocabulary file like so: +# old=$(wc -l input.vocab |cut -d " " -f 1) +# (cat input.vocab; seq -f "" $((desired_vocab_size-old))) >output.vocab +# +# Warning: probably only works with shared vocabulary models. +import math +import numpy as np +import sys +import yaml + +# Amend the vocab size in a raw ["special:model.yml"] data from a Marian npz. +# Returns the raw data to use for ["special:model.yml"] +def substitute_vocab_config(raw, new_size): + print("Old yml: ", raw.tostring()) + raw_yaml = raw.tostring().decode("utf-8") + #Python yaml doesn't like null bytes. 
+ if raw_yaml.endswith("\x00"): + raw_yaml = raw_yaml[:-1] + config = yaml.load(raw_yaml) + config['dim-vocabs'] = [new_size] * len(config['dim-vocabs']) + raw_yaml = yaml.dump(config) + if raw_yaml.endswith("\n"): + raw_yaml = raw_yaml[:-1] + raw_yaml += "\x00" + return np.array(bytearray(raw_yaml, 'utf-8')) + +if len(sys.argv) != 4: + print("Usage: " + sys.argv[0] + " input.npz output.npz desired_vocab_size") + sys.exit(1) + +resized_path = sys.argv[2] +new_size = int(sys.argv[3]) +old_model = np.load(sys.argv[1]) + +new_model = dict(old_model) +old_size = len(old_model["Wemb"]) +if old_size > new_size: + sys.stderr.write("New size is smaller than original. Cowardly refusing to clip vocab.\n") + sys.exit(2) +print("Before: ", new_model["decoder_ff_logit_out_b"].shape, new_model["Wemb"].shape) +bias = new_model["decoder_ff_logit_out_b"] +new_model["decoder_ff_logit_out_b"] = np.pad(bias, [(0,0),(0,new_size - bias.shape[1])], mode='constant', constant_values = -math.inf) +new_model["Wemb"] = np.pad(new_model["Wemb"], [(0,new_size - bias.shape[1]), (0,0)], mode='constant', constant_values = 0) +print("After: ", new_model["decoder_ff_logit_out_b"].shape, new_model["Wemb"].shape) +new_model["special:model.yml"] = substitute_vocab_config(new_model["special:model.yml"], new_size) +print("New yml: ", new_model["special:model.yml"].tostring()) +np.savez(resized_path, **new_model) From b165af8d6a6f1d8642c630f3fda7ff3ccfd7bab0 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 28 Mar 2021 19:31:44 +0000 Subject: [PATCH 004/135] Split code into main and library h/cpp --- src/CMakeLists.txt | 1 + src/command/marian_swapper.cpp | 134 ++------------------------------- src/translator/swappable.cpp | 96 +++++++++++++++++++++++ src/translator/swappable.h | 55 ++++++++++++++ 4 files changed, 159 insertions(+), 127 deletions(-) create mode 100644 src/translator/swappable.cpp create mode 100644 src/translator/swappable.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt 
index 397a0330f..98d5c4e98 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -100,6 +100,7 @@ set(MARIAN_SOURCES translator/nth_element.cpp translator/helpers.cpp translator/scorers.cpp + translator/swappable.cpp training/graph_group_async.cpp training/graph_group_sync.cpp diff --git a/src/command/marian_swapper.cpp b/src/command/marian_swapper.cpp index 7c5e92abe..15bdecf63 100644 --- a/src/command/marian_swapper.cpp +++ b/src/command/marian_swapper.cpp @@ -1,142 +1,22 @@ -#include "marian.h" -#include "common/logging.h" -#include "data/corpus.h" -#include "data/text_input.h" -#include "translator/beam_search.h" -#include "translator/translator.h" -#include "common/io.h" -#include "common/timer.h" -#include -#include "tensors/gpu/swap.h" -namespace marian { +#include "translator/swappable.h" +#include -/* A model loaded on the CPU and possibly on a GPU */ -class Model { - private: - std::vector parameters_; - std::vector> srcVocabs_; - Ptr trgVocab_; - - public: - Model(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) - : parameters_(io::loadItems(parameters)) { - // Load parameters. - // Find the special element and remove it: - size_t special_idx = 0; - for (size_t i = 0; i < parameters_.size(); i++) { - if (parameters_[i].name == "special:model.yml") { - special_idx = i; - break; - } - } - parameters_.erase(parameters_.begin() + special_idx); - // Prepare the name so that it matches the named map - for (auto&& item : parameters_) { - item.name = "F0::" + item.name; - } - - // Load source vocabs. - const std::vector &maxVocabs = options->get>("dim-vocabs"); - for(size_t i = 0; i < sourceVocabPaths.size(); ++i) { - Ptr vocab = New(options, i); - vocab->load(sourceVocabPaths[i], maxVocabs[i]); - srcVocabs_.emplace_back(vocab); - } - - // Load target vocab. 
- trgVocab_ = New(options, sourceVocabPaths.size()); - trgVocab_->load(targetVocabPath); - } - - const std::vector &Parameters() const { return parameters_; } - - const std::vector> &SrcVocabs() const { return srcVocabs_; } - - Ptr TrgVocab() const { return trgVocab_; } -}; - -/* Reserved space on a GPU with which to translate */ -class GPUSlot { - private: - Ptr options_; - Ptr graph_; - std::vector > scorers_; - - // Last model used for translation. Used to skip loading. - const Model *loadedModel_; - - void Load(const std::vector ¶meters) { - timer::Timer timer; - auto namedMap = graph_->getParamsNamedMap(); - for (auto&& item : parameters) { - auto to = reinterpret_cast(namedMap[item.name]->val()->memory()->data()); - swapper::copyCpuToGpu(to, &item.bytes[0], item.bytes.size()); - } - LOG(info, "Load took: {:.8f}s wall", timer.elapsed()); - } - - public: - explicit GPUSlot(Ptr options) : options_(options), loadedModel_(nullptr) { - options_->set("inference", true); - options_->set("shuffle", "none"); - // get device IDs - auto devices = Config::getDevices(options_); - auto numDevices = devices.size(); - std::cerr << "Num devices: " << numDevices << std::endl; - - // Create graph - graph_ = New(); - auto prec = options_->get>("precision", {"float32"}); - graph_->setDefaultElementType(typeFromString(prec[0])); - graph_->setDevice(devices[0]); - graph_->reserveWorkspaceMB(options_->get("workspace")); - // TODO: multiple scorers. 
- Ptr scorer = createScorers(options_)[0]; - scorer->init(graph_); - scorers_.push_back(scorer); - graph_->forward(); - } - - void Translate(const Model &model, const std::vector &input) { - if (loadedModel_ != &model) { - Load(model.Parameters()); - loadedModel_ = &model; - } - auto corpus = New(input, model.SrcVocabs(), options_); - data::BatchGenerator batchGenerator(corpus, options_, nullptr, false); - - auto search = New(options_, scorers_, model.TrgVocab()); - auto printer = New(options_, model.TrgVocab()); - for (auto&& batch : batchGenerator) { - auto histories = search->search(graph_, batch); - for(auto history : histories) { - std::stringstream best1; - std::stringstream bestn; - printer->print(history, best1, bestn); - LOG(info, "Translation {}", best1.str()); - } - } - } -}; - -} // namespace marian - -/* Demo program */ +/* Demo program: run with options for any of the models */ int main(int argc, char** argv) { using namespace marian; Ptr options = parseOptions(argc, argv, cli::mode::translation); - GPUSlot slot(options); - Model pten(options, + SwappableSlot slot(options); + SwappableModel pten(options, "/home/ubuntu/consistent-big-models/padded/pten.npz", {"/home/ubuntu/consistent-big-models/padded/pten.vocab"}, "/home/ubuntu/consistent-big-models/padded/pten.vocab"); - Model enit(options, + SwappableModel enit(options, "/home/ubuntu/consistent-big-models/padded/enit.npz", {"/home/ubuntu/consistent-big-models/padded/enit.vocab"}, "/home/ubuntu/consistent-big-models/padded/enit.vocab"); - const Model *model = &pten; + const SwappableModel *model = &pten; std::string line; while (std::getline(std::cin, line)) { if (line == " TRANSLATE PTEN") { diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp new file mode 100644 index 000000000..5f42f0748 --- /dev/null +++ b/src/translator/swappable.cpp @@ -0,0 +1,96 @@ +#include "marian.h" +#include "translator/swappable.h" +#include "common/logging.h" +#include "data/corpus.h" +#include 
"data/text_input.h" +#include "translator/beam_search.h" +#include "translator/translator.h" +#include "common/io.h" +#include "common/timer.h" +#include +#include "tensors/gpu/swap.h" +namespace marian { + +SwappableModel::SwappableModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) + : parameters_(io::loadItems(parameters)) { + // Load parameters. + // Find the special element and remove it: + size_t special_idx = 0; + for (size_t i = 0; i < parameters_.size(); i++) { + if (parameters_[i].name == "special:model.yml") { + special_idx = i; + break; + } + } + parameters_.erase(parameters_.begin() + special_idx); + // Prepare the name so that it matches the named map + for (auto&& item : parameters_) { + item.name = "F0::" + item.name; + } + + // Load source vocabs. + const std::vector &maxVocabs = options->get>("dim-vocabs"); + for(size_t i = 0; i < sourceVocabPaths.size(); ++i) { + Ptr vocab = New(options, i); + vocab->load(sourceVocabPaths[i], maxVocabs[i]); + srcVocabs_.emplace_back(vocab); + } + + // Load target vocab. 
+ trgVocab_ = New(options, sourceVocabPaths.size()); + trgVocab_->load(targetVocabPath); +} + +void SwappableSlot::Load(const std::vector ¶meters) { + timer::Timer timer; + auto namedMap = graph_->getParamsNamedMap(); + for (auto&& item : parameters) { + auto to = reinterpret_cast(namedMap[item.name]->val()->memory()->data()); + swapper::copyCpuToGpu(to, &item.bytes[0], item.bytes.size()); + } + LOG(info, "Load took: {:.8f}s wall", timer.elapsed()); +} + +SwappableSlot::SwappableSlot(Ptr options) : options_(options), loadedModel_(nullptr) { + options_->set("inference", true); + options_->set("shuffle", "none"); + // get device IDs + auto devices = Config::getDevices(options_); + auto numDevices = devices.size(); + std::cerr << "Num devices: " << numDevices << std::endl; + + // Create graph + graph_ = New(); + auto prec = options_->get>("precision", {"float32"}); + graph_->setDefaultElementType(typeFromString(prec[0])); + graph_->setDevice(devices[0]); + graph_->reserveWorkspaceMB(options_->get("workspace")); + // TODO: multiple scorers. 
+ Ptr scorer = createScorers(options_)[0]; + scorer->init(graph_); + scorers_.push_back(scorer); + graph_->forward(); +} + +void SwappableSlot::Translate(const SwappableModel &model, const std::vector &input) { + if (loadedModel_ != &model) { + Load(model.Parameters()); + loadedModel_ = &model; + } + auto corpus = New(input, model.SrcVocabs(), options_); + data::BatchGenerator batchGenerator(corpus, options_, nullptr, false); + + auto search = New(options_, scorers_, model.TrgVocab()); + auto printer = New(options_, model.TrgVocab()); + for (auto&& batch : batchGenerator) { + auto histories = search->search(graph_, batch); + for(auto history : histories) { + std::stringstream best1; + std::stringstream bestn; + printer->print(history, best1, bestn); + LOG(info, "Translation {}", best1.str()); + } + } +} + +} // namespace marian diff --git a/src/translator/swappable.h b/src/translator/swappable.h new file mode 100644 index 000000000..9ef0c871e --- /dev/null +++ b/src/translator/swappable.h @@ -0,0 +1,55 @@ +#pragma once +/* Support for swapping models in and out of a GPU, when you have more models + * than fit in the GPU's RAM. The models must have identical graphs, including + * size. They can have different parameters and different vocabularies but the + * vocabularies must have the same size. To make vocabulary the same size, pad + * using scripts/contrib/pad_model_vocabulary.py offline. + */ +#include "marian.h" +#include "common/io.h" +#include "data/corpus.h" +#include "data/text_input.h" +#include "translator/translator.h" +#include +namespace marian { + +/* A model loaded on the CPU and possibly on a GPU. + */ +class SwappableModel { + private: + std::vector parameters_; + std::vector> srcVocabs_; + Ptr trgVocab_; + + public: + // The parts of Options that relate to model and vocab are ignored. The files provided will be loaded. 
+ SwappableModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath); + + const std::vector &Parameters() const { return parameters_; } + + const std::vector> &SrcVocabs() const { return srcVocabs_; } + + Ptr TrgVocab() const { return trgVocab_; } +}; + +/* Reserved space on a GPU with which to translate. If you can afford to fit + * multiple models on 1 GPU, then each one that fits is a GPUSlot + */ +class SwappableSlot { + private: + Ptr options_; + Ptr graph_; + std::vector > scorers_; + + // Last model used for translation. Used to skip loading. + const SwappableModel *loadedModel_; + + void Load(const std::vector ¶meters); + + public: + explicit SwappableSlot(Ptr options); + + void Translate(const SwappableModel &model, const std::vector &input); +}; + +} // namespace marian From 4d8e3271f9b07d666ba933f5604448557aacf20e Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 28 Mar 2021 19:53:34 +0000 Subject: [PATCH 005/135] Restore ensemble support --- src/translator/swappable.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 5f42f0748..d09cd85d0 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -65,10 +65,12 @@ SwappableSlot::SwappableSlot(Ptr options) : options_(options), loadedMo graph_->setDefaultElementType(typeFromString(prec[0])); graph_->setDevice(devices[0]); graph_->reserveWorkspaceMB(options_->get("workspace")); - // TODO: multiple scorers. - Ptr scorer = createScorers(options_)[0]; - scorer->init(graph_); - scorers_.push_back(scorer); + + scorers_ = createScorers(options_); + for (auto scorer : scorers_) { + scorer->init(graph_); + // TODO lexical shortlists are not supported yet. 
+ } graph_->forward(); } From 203a9bb87d5de281d377df7b6925f90118543d21 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 28 Mar 2021 19:56:53 +0000 Subject: [PATCH 006/135] Minor logging improvements --- src/translator/swappable.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index d09cd85d0..0f0912085 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -48,7 +48,7 @@ void SwappableSlot::Load(const std::vector ¶meters) { auto to = reinterpret_cast(namedMap[item.name]->val()->memory()->data()); swapper::copyCpuToGpu(to, &item.bytes[0], item.bytes.size()); } - LOG(info, "Load took: {:.8f}s wall", timer.elapsed()); + LOG(info, "Swapping model from CPU to GPU took {:.8f}s wall", timer.elapsed()); } SwappableSlot::SwappableSlot(Ptr options) : options_(options), loadedModel_(nullptr) { @@ -57,7 +57,6 @@ SwappableSlot::SwappableSlot(Ptr options) : options_(options), loadedMo // get device IDs auto devices = Config::getDevices(options_); auto numDevices = devices.size(); - std::cerr << "Num devices: " << numDevices << std::endl; // Create graph graph_ = New(); From c71d48838752240297102f9c182a599cadd88811 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 28 Mar 2021 20:17:13 +0000 Subject: [PATCH 007/135] Return Histories --- src/command/marian_swapper.cpp | 15 +++++++++++++-- src/translator/swappable.cpp | 16 ++++++---------- src/translator/swappable.h | 11 +++++++---- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/src/command/marian_swapper.cpp b/src/command/marian_swapper.cpp index 15bdecf63..d0d52d78b 100644 --- a/src/command/marian_swapper.cpp +++ b/src/command/marian_swapper.cpp @@ -1,5 +1,8 @@ #include "translator/swappable.h" +#include "translator/output_printer.h" + #include +#include /* Demo program: run with options for any of the models */ int main(int argc, char** argv) { @@ -21,10 +24,18 @@ int main(int argc, 
char** argv) { while (std::getline(std::cin, line)) { if (line == " TRANSLATE PTEN") { model = &pten; + continue; } else if (line == " TRANSLATE ENIT") { model = &enit; - } else { - slot.Translate(*model, {line}); + continue; + } + marian::OutputPrinter printer(options, model->TrgVocab()); + marian::Histories histories = slot.Translate(*model, {line}); + for(auto history : histories) { + std::stringstream best1; + std::stringstream bestn; + printer.print(history, best1, bestn); + std::cout << best1.str() << '\n'; } } diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 0f0912085..475e9bb5b 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -56,7 +56,6 @@ SwappableSlot::SwappableSlot(Ptr options) : options_(options), loadedMo options_->set("shuffle", "none"); // get device IDs auto devices = Config::getDevices(options_); - auto numDevices = devices.size(); // Create graph graph_ = New(); @@ -73,7 +72,7 @@ SwappableSlot::SwappableSlot(Ptr options) : options_(options), loadedMo graph_->forward(); } -void SwappableSlot::Translate(const SwappableModel &model, const std::vector &input) { +Histories SwappableSlot::Translate(const SwappableModel &model, const std::vector &input) { if (loadedModel_ != &model) { Load(model.Parameters()); loadedModel_ = &model; @@ -82,16 +81,13 @@ void SwappableSlot::Translate(const SwappableModel &model, const std::vector batchGenerator(corpus, options_, nullptr, false); auto search = New(options_, scorers_, model.TrgVocab()); - auto printer = New(options_, model.TrgVocab()); + Histories ret; + ret.reserve(input.size()); for (auto&& batch : batchGenerator) { - auto histories = search->search(graph_, batch); - for(auto history : histories) { - std::stringstream best1; - std::stringstream bestn; - printer->print(history, best1, bestn); - LOG(info, "Translation {}", best1.str()); - } + auto result = search->search(graph_, batch); + ret.insert(ret.end(), result.begin(), result.end()); } + 
return ret; } } // namespace marian diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 9ef0c871e..f6431b04c 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -7,12 +7,15 @@ */ #include "marian.h" #include "common/io.h" -#include "data/corpus.h" -#include "data/text_input.h" -#include "translator/translator.h" +#include "data/vocab.h" +#include "translator/history.h" + +#include #include namespace marian { +class Scorer; + /* A model loaded on the CPU and possibly on a GPU. */ class SwappableModel { @@ -49,7 +52,7 @@ class SwappableSlot { public: explicit SwappableSlot(Ptr options); - void Translate(const SwappableModel &model, const std::vector &input); + Histories Translate(const SwappableModel &model, const std::vector &input); }; } // namespace marian From 47feb2b2d103ab857985b3c05bff010b0beeddcc Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 28 Mar 2021 21:52:33 +0000 Subject: [PATCH 008/135] Alignments --- src/command/marian_swapper.cpp | 91 +++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 19 deletions(-) diff --git a/src/command/marian_swapper.cpp b/src/command/marian_swapper.cpp index d0d52d78b..0d8198846 100644 --- a/src/command/marian_swapper.cpp +++ b/src/command/marian_swapper.cpp @@ -1,41 +1,94 @@ -#include "translator/swappable.h" +#include "translator/history.h" #include "translator/output_printer.h" +#include "translator/swappable.h" #include #include +#include + +namespace marian { +void LoadBig(Ptr options, std::unordered_map &to) { + to.emplace("pten", SwappableModel(options, + "/home/ubuntu/consistent-big-models/padded/pten.npz", + {"/home/ubuntu/consistent-big-models/padded/pten.vocab"}, + "/home/ubuntu/consistent-big-models/padded/pten.vocab")); + + to.emplace("iten", SwappableModel(options, + "/home/ubuntu/consistent-big-models/padded/enit.npz", + {"/home/ubuntu/consistent-big-models/padded/enit.vocab"}, + "/home/ubuntu/consistent-big-models/padded/enit.vocab")); 
+} + +void LoadTiny(Ptr options, std::unordered_map &to) { + std::vector models = {"csen", "encs", "enet", "eten", "esen", "enes"}; + for (const std::string m : models) { + std::string base = "/home/ubuntu/consistent-bergamot-students/padded/"; + base += m + "."; + to.emplace(m, SwappableModel(options, base + "npz", {base + "spm"}, base + "spm")); + } +} + +} // namespace /* Demo program: run with options for any of the models */ int main(int argc, char** argv) { using namespace marian; Ptr options = parseOptions(argc, argv, cli::mode::translation); + // You can have multiple slots. In principle these can even have different sizes, just use separate options. SwappableSlot slot(options); - SwappableModel pten(options, - "/home/ubuntu/consistent-big-models/padded/pten.npz", - {"/home/ubuntu/consistent-big-models/padded/pten.vocab"}, - "/home/ubuntu/consistent-big-models/padded/pten.vocab"); + + std::unordered_map models; +// LoadBig(options, models); + LoadTiny(options, models); - SwappableModel enit(options, - "/home/ubuntu/consistent-big-models/padded/enit.npz", - {"/home/ubuntu/consistent-big-models/padded/enit.vocab"}, - "/home/ubuntu/consistent-big-models/padded/enit.vocab"); + // begin with a space to avoid conflict with a real sentence. + const std::string kSwitchPrefix(" CHANGE "); + + bool alignments = !options->get("alignment").empty(); - const SwappableModel *model = &pten; + const SwappableModel *model = nullptr; std::string line; while (std::getline(std::cin, line)) { - if (line == " TRANSLATE PTEN") { - model = &pten; + // Switch out which model is used. + if (line.substr(0, kSwitchPrefix.size()) == kSwitchPrefix) { + std::string key = line.substr(kSwitchPrefix.size()); + auto found = models.find(key); + if (found == models.end()) { + std::cerr << "Model for " << key << " not loaded." 
<< std::endl; + return 1; + } + model = &found->second; continue; - } else if (line == " TRANSLATE ENIT") { - model = &enit; + } + if (!model) { + std::cerr << "Select a model first." << std::endl; continue; } - marian::OutputPrinter printer(options, model->TrgVocab()); + + // Actually translating with a model. marian::Histories histories = slot.Translate(*model, {line}); + // In practice there is one history because we provided one line. for(auto history : histories) { - std::stringstream best1; - std::stringstream bestn; - printer.print(history, best1, bestn); - std::cout << best1.str() << '\n'; + Result result(history->top()); + Words words = std::get<0>(result); + std::cout << model->TrgVocab()->decode(words) << std::endl; + + /* Print alignments */ + if (alignments) { + Hypothesis &hypo = *std::get<1>(result); + // [t][s] -> P(s|t) + marian::data::SoftAlignment alignment(hypo.tracebackAlignment()); + // An easier call for this is: + // std:cout << data::SoftAlignToString(alignment); + // The below is just there to show how access them programatically. 
+ // NB you can convert to hard with data::ConvertSoftAlignToHardAlign(alignment, threshold) + for (auto target : alignment) { + for (float source : target) { + std::cout << source << ' '; + } + std::cout << '\n'; + } + } } } From 8fc8d02b8080fb661ca2c84d8be26320767d11b6 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 28 Mar 2021 22:01:56 +0000 Subject: [PATCH 009/135] Fix enit --- src/command/marian_swapper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/command/marian_swapper.cpp b/src/command/marian_swapper.cpp index 0d8198846..8cc422ee7 100644 --- a/src/command/marian_swapper.cpp +++ b/src/command/marian_swapper.cpp @@ -13,7 +13,7 @@ void LoadBig(Ptr options, std::unordered_map Date: Tue, 30 Mar 2021 13:37:06 +0100 Subject: [PATCH 010/135] Add an option to force loading --- src/translator/swappable.cpp | 5 +++++ src/translator/swappable.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 475e9bb5b..3b77ed3ec 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -72,6 +72,11 @@ SwappableSlot::SwappableSlot(Ptr options) : options_(options), loadedMo graph_->forward(); } +void SwappableSlot::ForceLoad(const SwappableModel &model) { + Load(model.Parameters()); + loadedModel_ = &model; +} + Histories SwappableSlot::Translate(const SwappableModel &model, const std::vector &input) { if (loadedModel_ != &model) { Load(model.Parameters()); diff --git a/src/translator/swappable.h b/src/translator/swappable.h index f6431b04c..915824a33 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -52,6 +52,10 @@ class SwappableSlot { public: explicit SwappableSlot(Ptr options); + // Load this model even if it's already loaded. Mostly useful for timing. + void ForceLoad(const SwappableModel &model); + + // Translate using this model, loading if necessary. 
Histories Translate(const SwappableModel &model, const std::vector &input); }; From cf12178df736f25ba256ba4cd347f5afd542234e Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 30 Mar 2021 14:25:41 +0100 Subject: [PATCH 011/135] Allow CPU only compilation --- src/tensors/gpu/swap.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/tensors/gpu/swap.h b/src/tensors/gpu/swap.h index 86b3094d7..872810b0a 100644 --- a/src/tensors/gpu/swap.h +++ b/src/tensors/gpu/swap.h @@ -1,6 +1,14 @@ +#pragma once #include +#include "common/logging.h" namespace marian { namespace swapper { +#ifdef CUDA_FOUND void copyCpuToGpu(char * gpuOut, const char * in, size_t count); +#else + inline void copyCpuToGpu(char * gpuOut, const char * in, size_t count) { + ABORT("Copy from CPU to GPU memory is only available with CUDA."); + } +#endif } } From 7e06801a6e9fc86eb70429ecbfe1dc22e553ad70 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 30 Mar 2021 18:36:44 +0000 Subject: [PATCH 012/135] Add explicit gpu device index when creating the object --- src/tensors/gpu/swap.cu | 4 +++- src/tensors/gpu/swap.h | 5 +++-- src/translator/swappable.cpp | 11 +++++------ src/translator/swappable.h | 8 +++++++- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/tensors/gpu/swap.cu b/src/tensors/gpu/swap.cu index 3eb2c0df8..c0bd73a9a 100644 --- a/src/tensors/gpu/swap.cu +++ b/src/tensors/gpu/swap.cu @@ -1,9 +1,11 @@ #include "cuda_helpers.h" +#include "swap.h" void copyCpuToGpu(const char * in, char * gpuOut); namespace marian { namespace swapper { - void copyCpuToGpu(char * gpuOut, const char * in, size_t count) { + void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { + CUDA_CHECK(cudaSetDevice(deviceId.no)); CUDA_CHECK(cudaMemcpy(gpuOut, in, count, cudaMemcpyHostToDevice)); } } diff --git a/src/tensors/gpu/swap.h b/src/tensors/gpu/swap.h index 872810b0a..a020c8827 100644 --- a/src/tensors/gpu/swap.h +++ 
b/src/tensors/gpu/swap.h @@ -1,12 +1,13 @@ #pragma once #include +#include "common/definitions.h" #include "common/logging.h" namespace marian { namespace swapper { #ifdef CUDA_FOUND - void copyCpuToGpu(char * gpuOut, const char * in, size_t count); + void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId); #else - inline void copyCpuToGpu(char * gpuOut, const char * in, size_t count) { + inline void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { ABORT("Copy from CPU to GPU memory is only available with CUDA."); } #endif diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 3b77ed3ec..f9bf98bbc 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -46,22 +46,21 @@ void SwappableSlot::Load(const std::vector ¶meters) { auto namedMap = graph_->getParamsNamedMap(); for (auto&& item : parameters) { auto to = reinterpret_cast(namedMap[item.name]->val()->memory()->data()); - swapper::copyCpuToGpu(to, &item.bytes[0], item.bytes.size()); + swapper::copyCpuToGpu(to, &item.bytes[0], item.bytes.size(), myDeviceId_); } LOG(info, "Swapping model from CPU to GPU took {:.8f}s wall", timer.elapsed()); } -SwappableSlot::SwappableSlot(Ptr options) : options_(options), loadedModel_(nullptr) { +SwappableSlot::SwappableSlot(Ptr options, size_t deviceIdx /*=0*/) : options_(options), myDeviceId_(Config::getDevices(options)[deviceIdx]), loadedModel_(nullptr) { + ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); options_->set("inference", true); options_->set("shuffle", "none"); - // get device IDs - auto devices = Config::getDevices(options_); // Create graph graph_ = New(); auto prec = options_->get>("precision", {"float32"}); graph_->setDefaultElementType(typeFromString(prec[0])); - graph_->setDevice(devices[0]); + graph_->setDevice(myDeviceId_); graph_->reserveWorkspaceMB(options_->get("workspace")); 
scorers_ = createScorers(options_); @@ -83,7 +82,7 @@ Histories SwappableSlot::Translate(const SwappableModel &model, const std::vecto loadedModel_ = &model; } auto corpus = New(input, model.SrcVocabs(), options_); - data::BatchGenerator batchGenerator(corpus, options_, nullptr, false); + data::BatchGenerator batchGenerator(corpus, options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash auto search = New(options_, scorers_, model.TrgVocab()); Histories ret; diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 915824a33..00e5027c3 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -43,6 +43,7 @@ class SwappableSlot { Ptr options_; Ptr graph_; std::vector > scorers_; + const marian::DeviceId myDeviceId_; // Last model used for translation. Used to skip loading. const SwappableModel *loadedModel_; @@ -50,7 +51,12 @@ class SwappableSlot { void Load(const std::vector ¶meters); public: - explicit SwappableSlot(Ptr options); + /** + * @param options The marian options object + * @param deviceNum The index of the device you want to use for this slot. Note that this is not the deviceID but the index of the device in the + * array of supplied devices. Eg if you provide -d 0 3 5 and you want the Slot to run on GPU 3, you provide deviceNum=1. + */ + explicit SwappableSlot(Ptr options, size_t deviceIdx=0); // Load this model even if it's already loaded. Mostly useful for timing. 
void ForceLoad(const SwappableModel &model); From 635cfb06460c06bf1058d131815f699673519d52 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 30 Mar 2021 18:59:54 +0000 Subject: [PATCH 013/135] Allow multiple mini-batches --- src/translator/swappable.cpp | 14 +++++++++++++- src/translator/swappable.h | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index f9bf98bbc..d1c282012 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -51,6 +51,18 @@ void SwappableSlot::Load(const std::vector ¶meters) { LOG(info, "Swapping model from CPU to GPU took {:.8f}s wall", timer.elapsed()); } +std::string SwappableSlot::MultilineInputHack(const std::vector &input) { + if (input.size() == 1) { + return input[0]; + } else { + std::stringstream ss; + for (auto&& line : input) { + ss << line << '\n'; + } + return ss.str(); + } +} + SwappableSlot::SwappableSlot(Ptr options, size_t deviceIdx /*=0*/) : options_(options), myDeviceId_(Config::getDevices(options)[deviceIdx]), loadedModel_(nullptr) { ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); options_->set("inference", true); @@ -81,7 +93,7 @@ Histories SwappableSlot::Translate(const SwappableModel &model, const std::vecto Load(model.Parameters()); loadedModel_ = &model; } - auto corpus = New(input, model.SrcVocabs(), options_); + auto corpus = New(std::vector(1,MultilineInputHack(input)), model.SrcVocabs(), options_); // @TODO dirty hack data::BatchGenerator batchGenerator(corpus, options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash auto search = New(options_, scorers_, model.TrgVocab()); diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 00e5027c3..8d6e207de 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -50,6 +50,8 @@ class SwappableSlot { 
void Load(const std::vector ¶meters); + std::string MultilineInputHack(const std::vector &input); + public: /** * @param options The marian options object From ee6ff754471c4b56e7a64a196bda71bd11f04454 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Tue, 30 Mar 2021 19:24:55 +0000 Subject: [PATCH 014/135] No stringstreams --- src/translator/swappable.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index d1c282012..2517baec5 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -55,11 +55,13 @@ std::string SwappableSlot::MultilineInputHack(const std::vector &in if (input.size() == 1) { return input[0]; } else { - std::stringstream ss; + std::string ret; + ret.reserve(10000); for (auto&& line : input) { - ss << line << '\n'; + ret.append(line); + ret.append("\n"); } - return ss.str(); + return ret; } } From 57ddebacbe057c519cc04840dd2f0d23b25d1e1b Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Thu, 1 Apr 2021 00:02:21 +0000 Subject: [PATCH 015/135] Sort the histories before returning them --- src/translator/swappable.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 2517baec5..eec74a72d 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -105,6 +105,7 @@ Histories SwappableSlot::Translate(const SwappableModel &model, const std::vecto auto result = search->search(graph_, batch); ret.insert(ret.end(), result.begin(), result.end()); } + std::sort(ret.begin(), ret.end(),[](marian::Ptr a, marian::Ptr b){return a->getLineNum() < b->getLineNum();}); return ret; } From 4f2b218ba62f95b4af98f6681c22730a59cd7c0b Mon Sep 17 00:00:00 2001 From: Davide Caroselli Date: Thu, 1 Apr 2021 11:11:40 +0000 Subject: [PATCH 016/135] SwappableSlot: add GPU-to-GPU reset feature --- src/tensors/gpu/swap.cu | 5 +++++ src/tensors/gpu/swap.h | 5 +++++ 
src/translator/swappable.cpp | 21 +++++++++++++++++++++ src/translator/swappable.h | 4 ++++ 4 files changed, 35 insertions(+) diff --git a/src/tensors/gpu/swap.cu b/src/tensors/gpu/swap.cu index c0bd73a9a..16210e0c5 100644 --- a/src/tensors/gpu/swap.cu +++ b/src/tensors/gpu/swap.cu @@ -1,6 +1,7 @@ #include "cuda_helpers.h" #include "swap.h" void copyCpuToGpu(const char * in, char * gpuOut); +void copyGpuToGpu(const char * in, char * gpuOut); namespace marian { namespace swapper { @@ -8,5 +9,9 @@ namespace marian { CUDA_CHECK(cudaSetDevice(deviceId.no)); CUDA_CHECK(cudaMemcpy(gpuOut, in, count, cudaMemcpyHostToDevice)); } + void copyGpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { + CUDA_CHECK(cudaSetDevice(deviceId.no)); + CUDA_CHECK(cudaMemcpy(gpuOut, in, count, cudaMemcpyDeviceToDevice)); + } } } diff --git a/src/tensors/gpu/swap.h b/src/tensors/gpu/swap.h index a020c8827..7d8784266 100644 --- a/src/tensors/gpu/swap.h +++ b/src/tensors/gpu/swap.h @@ -6,10 +6,15 @@ namespace marian { namespace swapper { #ifdef CUDA_FOUND void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId); + void copyGpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId); #else inline void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { ABORT("Copy from CPU to GPU memory is only available with CUDA."); } + + inline void copyGpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { + ABORT("Copy from GPU to GPU memory is only available with CUDA."); + } #endif } } diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index eec74a72d..9f9ce3b14 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -51,6 +51,22 @@ void SwappableSlot::Load(const std::vector ¶meters) { LOG(info, "Swapping model from CPU to GPU took {:.8f}s wall", timer.elapsed()); } +void 
SwappableSlot::Load(const SwappableSlot &slot) { + timer::Timer timer; + auto toMap = graph_->getParamsNamedMap(); + auto fromMap = slot.graph_->getParamsNamedMap(); + + for (auto &it : fromMap) { + size_t size = it.second->val()->memory()->size(); + auto from = reinterpret_cast(it.second->val()->memory()->data()); + auto to = reinterpret_cast(toMap[it.first]->val()->memory()->data()); + + swapper::copyGpuToGpu(to, from, size, myDeviceId_); + } + + LOG(info, "Swapping model from GPU to GPU took {:.8f}s wall", timer.elapsed()); +} + std::string SwappableSlot::MultilineInputHack(const std::vector &input) { if (input.size() == 1) { return input[0]; @@ -90,6 +106,11 @@ void SwappableSlot::ForceLoad(const SwappableModel &model) { loadedModel_ = &model; } +void SwappableSlot::ForceLoad(const SwappableModel &model, const SwappableSlot &slot) { + Load(slot); + loadedModel_ = &model; +} + Histories SwappableSlot::Translate(const SwappableModel &model, const std::vector &input) { if (loadedModel_ != &model) { Load(model.Parameters()); diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 8d6e207de..c9b728cf4 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -50,6 +50,8 @@ class SwappableSlot { void Load(const std::vector ¶meters); + void Load(const SwappableSlot &slot); + std::string MultilineInputHack(const std::vector &input); public: @@ -63,6 +65,8 @@ class SwappableSlot { // Load this model even if it's already loaded. Mostly useful for timing. void ForceLoad(const SwappableModel &model); + void ForceLoad(const SwappableModel &model, const SwappableSlot &slot); + // Translate using this model, loading if necessary. 
Histories Translate(const SwappableModel &model, const std::vector &input); }; From e3f53884a06fa91db12299d78c530d6c8d68bd09 Mon Sep 17 00:00:00 2001 From: Nikolay Bogoychev Date: Fri, 2 Apr 2021 21:56:50 +0000 Subject: [PATCH 017/135] Separate graph from loading to GPU --- src/command/marian_swapper.cpp | 30 +++--- src/translator/swappable.cpp | 176 ++++++++++++++++++--------------- src/translator/swappable.h | 84 +++++++++------- 3 files changed, 159 insertions(+), 131 deletions(-) diff --git a/src/command/marian_swapper.cpp b/src/command/marian_swapper.cpp index 8cc422ee7..ce3c160f0 100644 --- a/src/command/marian_swapper.cpp +++ b/src/command/marian_swapper.cpp @@ -7,24 +7,24 @@ #include namespace marian { -void LoadBig(Ptr options, std::unordered_map &to) { - to.emplace("pten", SwappableModel(options, +void LoadBig(Ptr options, std::unordered_map &to) { + to.emplace("pten", CPULoadedModel(options, "/home/ubuntu/consistent-big-models/padded/pten.npz", {"/home/ubuntu/consistent-big-models/padded/pten.vocab"}, "/home/ubuntu/consistent-big-models/padded/pten.vocab")); - to.emplace("enit", SwappableModel(options, + to.emplace("enit", CPULoadedModel(options, "/home/ubuntu/consistent-big-models/padded/enit.npz", {"/home/ubuntu/consistent-big-models/padded/enit.vocab"}, "/home/ubuntu/consistent-big-models/padded/enit.vocab")); } -void LoadTiny(Ptr options, std::unordered_map &to) { +void LoadTiny(Ptr options, std::unordered_map &to) { std::vector models = {"csen", "encs", "enet", "eten", "esen", "enes"}; for (const std::string m : models) { std::string base = "/home/ubuntu/consistent-bergamot-students/padded/"; base += m + "."; - to.emplace(m, SwappableModel(options, base + "npz", {base + "spm"}, base + "spm")); + to.emplace(m, CPULoadedModel(options, base + "npz", {base + "spm"}, base + "spm")); } } @@ -34,10 +34,11 @@ void LoadTiny(Ptr options, std::unordered_map options = parseOptions(argc, argv, cli::mode::translation); - // You can have multiple slots. 
In principle these can even have different sizes, just use separate options. - SwappableSlot slot(options); - - std::unordered_map models; + + Ptr engine = New(options, 0); + GPULoadedModel slot(engine); + + std::unordered_map models; // LoadBig(options, models); LoadTiny(options, models); @@ -46,7 +47,7 @@ int main(int argc, char** argv) { bool alignments = !options->get("alignment").empty(); - const SwappableModel *model = nullptr; + bool loaded = false; std::string line; while (std::getline(std::cin, line)) { // Switch out which model is used. @@ -57,21 +58,22 @@ int main(int argc, char** argv) { std::cerr << "Model for " << key << " not loaded." << std::endl; return 1; } - model = &found->second; + slot.OverwriteFrom(found->second); + loaded = true; continue; } - if (!model) { + if (!loaded) { std::cerr << "Select a model first." << std::endl; continue; } // Actually translating with a model. - marian::Histories histories = slot.Translate(*model, {line}); + marian::Histories histories = slot.Translate({line}); // In practice there is one history because we provided one line. for(auto history : histories) { Result result(history->top()); Words words = std::get<0>(result); - std::cout << model->TrgVocab()->decode(words) << std::endl; + std::cout << slot.TrgVocab()->decode(words) << std::endl; /* Print alignments */ if (alignments) { diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 9f9ce3b14..21c9413bc 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -9,70 +9,81 @@ #include "common/timer.h" #include #include "tensors/gpu/swap.h" + namespace marian { -SwappableModel::SwappableModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) - : parameters_(io::loadItems(parameters)) { - // Load parameters. 
- // Find the special element and remove it: - size_t special_idx = 0; - for (size_t i = 0; i < parameters_.size(); i++) { - if (parameters_[i].name == "special:model.yml") { - special_idx = i; - break; - } - } - parameters_.erase(parameters_.begin() + special_idx); - // Prepare the name so that it matches the named map - for (auto&& item : parameters_) { - item.name = "F0::" + item.name; +void GPUEngine::SwapPointers(std::vector &with) { + auto write_it = graph_->params()->begin(); + auto read_it = with.begin(); + for (; read_it != with.end(); ++write_it, ++read_it) { + std::swap(*(*write_it)->val()->memory(), **read_it); } +} - // Load source vocabs. - const std::vector &maxVocabs = options->get>("dim-vocabs"); - for(size_t i = 0; i < sourceVocabPaths.size(); ++i) { - Ptr vocab = New(options, i); - vocab->load(sourceVocabPaths[i], maxVocabs[i]); - srcVocabs_.emplace_back(vocab); +GPUEngine::GPUEngine(Ptr options, size_t deviceIdx) + : options_(options), graph_(New()), myDeviceId_(Config::getDevices(options)[deviceIdx]), allocator_(myDeviceId_, 0, 128 * 1048576) { + ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); + options_->set("inference", true); + options_->set("shuffle", "none"); + + // Create graph + auto prec = options_->get>("precision", {"float32"}); + graph_->setDefaultElementType(typeFromString(prec[0])); + graph_->setDevice(myDeviceId_); + graph_->reserveWorkspaceMB(options_->get("workspace")); + + scorers_ = createScorers(options_); + for (auto scorer : scorers_) { + scorer->init(graph_); + // TODO lexical shortlists are not supported yet. } + graph_->forward(); + // TODO: reach into graph_->params() private members and free the parameter memory. +} - // Load target vocab. 
- trgVocab_ = New(options, sourceVocabPaths.size()); - trgVocab_->load(targetVocabPath); +GPUEngine::~GPUEngine() {} + +GPULoadedModel::GPULoadedModel(Ptr gpu) : engine_(gpu) { + for (auto ¶m : *engine_->graph_->params()) { + parameters_.push_back(engine_->allocator_.alloc(param->val()->memory()->size())); + } } -void SwappableSlot::Load(const std::vector ¶meters) { - timer::Timer timer; - auto namedMap = graph_->getParamsNamedMap(); - for (auto&& item : parameters) { - auto to = reinterpret_cast(namedMap[item.name]->val()->memory()->data()); - swapper::copyCpuToGpu(to, &item.bytes[0], item.bytes.size(), myDeviceId_); +GPULoadedModel::~GPULoadedModel() { + for (MemoryPiece::PtrType &p : parameters_) { + engine_->allocator_.free(p); } - LOG(info, "Swapping model from CPU to GPU took {:.8f}s wall", timer.elapsed()); } -void SwappableSlot::Load(const SwappableSlot &slot) { - timer::Timer timer; - auto toMap = graph_->getParamsNamedMap(); - auto fromMap = slot.graph_->getParamsNamedMap(); +void GPULoadedModel::OverwriteFrom(const GPULoadedModel &from) { + srcVocabs_ = from.srcVocabs_; + trgVocab_ = from.trgVocab_; - for (auto &it : fromMap) { - size_t size = it.second->val()->memory()->size(); - auto from = reinterpret_cast(it.second->val()->memory()->data()); - auto to = reinterpret_cast(toMap[it.first]->val()->memory()->data()); + ABORT_IF(engine_ != from.engine_, "TODO: copy across GPUs."); - swapper::copyGpuToGpu(to, from, size, myDeviceId_); - } + for (size_t i = 0; i < parameters_.size(); ++i) { + swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), reinterpret_cast(from.parameters_[i]->data()), parameters_[i]->size(), engine_->myDeviceId_); + } +} - LOG(info, "Swapping model from GPU to GPU took {:.8f}s wall", timer.elapsed()); +void GPULoadedModel::OverwriteFrom(const CPULoadedModel &from) { + srcVocabs_ = from.SrcVocabs(); + trgVocab_ = from.TrgVocab(); + for (size_t i = 0; i < parameters_.size(); ++i) { + 
swapper::copyCpuToGpu(reinterpret_cast(parameters_[i]->data()), from.Parameters()[i].data(), from.Parameters()[i].size(), engine_->myDeviceId_); + } } -std::string SwappableSlot::MultilineInputHack(const std::vector &input) { +std::string MultilineInputHack(const std::vector &input) { if (input.size() == 1) { return input[0]; } else { std::string ret; - ret.reserve(10000); + std::size_t size = 0; + for (auto&& line : input) { + size += line.size() + 1; + } + ret.reserve(size); for (auto&& line : input) { ret.append(line); ret.append("\n"); @@ -81,53 +92,54 @@ std::string SwappableSlot::MultilineInputHack(const std::vector &in } } -SwappableSlot::SwappableSlot(Ptr options, size_t deviceIdx /*=0*/) : options_(options), myDeviceId_(Config::getDevices(options)[deviceIdx]), loadedModel_(nullptr) { - ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); - options_->set("inference", true); - options_->set("shuffle", "none"); - - // Create graph - graph_ = New(); - auto prec = options_->get>("precision", {"float32"}); - graph_->setDefaultElementType(typeFromString(prec[0])); - graph_->setDevice(myDeviceId_); - graph_->reserveWorkspaceMB(options_->get("workspace")); - - scorers_ = createScorers(options_); - for (auto scorer : scorers_) { - scorer->init(graph_); - // TODO lexical shortlists are not supported yet. 
- } - graph_->forward(); -} - -void SwappableSlot::ForceLoad(const SwappableModel &model) { - Load(model.Parameters()); - loadedModel_ = &model; -} - -void SwappableSlot::ForceLoad(const SwappableModel &model, const SwappableSlot &slot) { - Load(slot); - loadedModel_ = &model; -} +Histories GPULoadedModel::Translate(const std::vector &input) { + engine_->SwapPointers(parameters_); -Histories SwappableSlot::Translate(const SwappableModel &model, const std::vector &input) { - if (loadedModel_ != &model) { - Load(model.Parameters()); - loadedModel_ = &model; - } - auto corpus = New(std::vector(1,MultilineInputHack(input)), model.SrcVocabs(), options_); // @TODO dirty hack - data::BatchGenerator batchGenerator(corpus, options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash + auto corpus = New(std::vector(1, MultilineInputHack(input)), srcVocabs_, engine_->options_); // @TODO dirty hack + data::BatchGenerator batchGenerator(corpus, engine_->options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash - auto search = New(options_, scorers_, model.TrgVocab()); + BeamSearch search(engine_->options_, engine_->scorers_, trgVocab_); Histories ret; ret.reserve(input.size()); for (auto&& batch : batchGenerator) { - auto result = search->search(graph_, batch); + auto result = search.search(engine_->graph_, batch); ret.insert(ret.end(), result.begin(), result.end()); } std::sort(ret.begin(), ret.end(),[](marian::Ptr a, marian::Ptr b){return a->getLineNum() < b->getLineNum();}); + engine_->SwapPointers(parameters_); return ret; } +CPULoadedModel::CPULoadedModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) + : parameters_(io::loadItems(parameters)) { + // Load parameters. 
+ // Find the special element and remove it: + size_t special_idx = 0; + for (size_t i = 0; i < parameters_.size(); i++) { + if (parameters_[i].name == "special:model.yml") { + special_idx = i; + break; + } + } + parameters_.erase(parameters_.begin() + special_idx); + // Prepare the name so that it matches the named map + for (auto&& item : parameters_) { + item.name = "F0::" + item.name; + } + // Sort by name to match params order. + std::sort(parameters_.begin(), parameters_.end(), [](const io::Item &a, const io::Item &b){return a.name < b.name;}); + + // Load source vocabs. + const std::vector &maxVocabs = options->get>("dim-vocabs"); + for(size_t i = 0; i < sourceVocabPaths.size(); ++i) { + Ptr vocab = New(options, i); + vocab->load(sourceVocabPaths[i], maxVocabs[i]); + srcVocabs_.emplace_back(vocab); + } + + // Load target vocab. + trgVocab_ = New(options, sourceVocabPaths.size()); + trgVocab_->load(targetVocabPath); +} + } // namespace marian diff --git a/src/translator/swappable.h b/src/translator/swappable.h index c9b728cf4..f9a276e46 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -16,59 +16,73 @@ namespace marian { class Scorer; -/* A model loaded on the CPU and possibly on a GPU. - */ -class SwappableModel { +class GPULoadedModel; +class CPULoadedModel; + +/* Execute on a particular device */ +class GPUEngine { + private: + friend class GPULoadedModel; + Ptr options_; + Ptr graph_; + std::vector > scorers_; + const marian::DeviceId myDeviceId_; + Allocator allocator_; + + void SwapPointers(std::vector &with); + + public: + /** + * @param options The marian options object + * @param deviceNum The index of the device you want to use for this slot. Note that this is not the deviceID but the index of the device in the + * array of supplied devices. Eg if you provide -d 0 3 5 and you want the Slot to run on GPU 3, you provide deviceNum=1. 
+ */ + explicit GPUEngine(Ptr options, size_t deviceNum); + + ~GPUEngine(); +}; + +/* A model loaded on the GPU that can be overwritten from CPU or GPU. */ +class GPULoadedModel { private: - std::vector parameters_; + Ptr engine_; + + std::vector parameters_; std::vector> srcVocabs_; Ptr trgVocab_; public: - // The parts of Options that relate to model and vocab are ignored. The files provided will be loaded. - SwappableModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath); + GPULoadedModel(Ptr gpu); - const std::vector &Parameters() const { return parameters_; } + ~GPULoadedModel(); const std::vector> &SrcVocabs() const { return srcVocabs_; } Ptr TrgVocab() const { return trgVocab_; } -}; -/* Reserved space on a GPU with which to translate. If you can afford to fit - * multiple models on 1 GPU, then each one that fits is a GPUSlot - */ -class SwappableSlot { - private: - Ptr options_; - Ptr graph_; - std::vector > scorers_; - const marian::DeviceId myDeviceId_; + // Overwrite this model with parameters from a different one. + void OverwriteFrom(const CPULoadedModel &from); + void OverwriteFrom(const GPULoadedModel &from); - // Last model used for translation. Used to skip loading. - const SwappableModel *loadedModel_; - - void Load(const std::vector ¶meters); - - void Load(const SwappableSlot &slot); + Histories Translate(const std::vector &input); +}; - std::string MultilineInputHack(const std::vector &input); +/* A model loaded on the CPU. */ +class CPULoadedModel { + private: + std::vector parameters_; + std::vector> srcVocabs_; + Ptr trgVocab_; public: - /** - * @param options The marian options object - * @param deviceNum The index of the device you want to use for this slot. Note that this is not the deviceID but the index of the device in the - * array of supplied devices. Eg if you provide -d 0 3 5 and you want the Slot to run on GPU 3, you provide deviceNum=1. 
- */ - explicit SwappableSlot(Ptr options, size_t deviceIdx=0); + // The parts of Options that relate to model and vocab are ignored. The files provided will be loaded. + CPULoadedModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath); - // Load this model even if it's already loaded. Mostly useful for timing. - void ForceLoad(const SwappableModel &model); + const std::vector &Parameters() const { return parameters_; } - void ForceLoad(const SwappableModel &model, const SwappableSlot &slot); + const std::vector> &SrcVocabs() const { return srcVocabs_; } - // Translate using this model, loading if necessary. - Histories Translate(const SwappableModel &model, const std::vector &input); + Ptr TrgVocab() const { return trgVocab_; } }; } // namespace marian From ba4d166bca752ed5565b49279f55d605ac9522d0 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 2 Apr 2021 22:14:59 +0000 Subject: [PATCH 018/135] Abort if not initialized --- src/translator/swappable.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 21c9413bc..441608390 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -93,6 +93,7 @@ std::string MultilineInputHack(const std::vector &input) { } Histories GPULoadedModel::Translate(const std::vector &input) { + ABORT_IF(!trgVocab_, "GPULoadedModel needs to be overwritten by a CPU model first."); engine_->SwapPointers(parameters_); auto corpus = New(std::vector(1, MultilineInputHack(input)), srcVocabs_, engine_->options_); // @TODO dirty hack From f8523b70fae5d235f4bbfb49422e2168de8c4890 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 2 Apr 2021 22:32:37 +0000 Subject: [PATCH 019/135] Go back to Load instead of OverwriteFrom --- src/command/marian_swapper.cpp | 2 +- src/translator/swappable.cpp | 4 ++-- src/translator/swappable.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) 
diff --git a/src/command/marian_swapper.cpp b/src/command/marian_swapper.cpp index ce3c160f0..758501d1e 100644 --- a/src/command/marian_swapper.cpp +++ b/src/command/marian_swapper.cpp @@ -58,7 +58,7 @@ int main(int argc, char** argv) { std::cerr << "Model for " << key << " not loaded." << std::endl; return 1; } - slot.OverwriteFrom(found->second); + slot.Load(found->second); loaded = true; continue; } diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 441608390..fbaac7bba 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -55,7 +55,7 @@ GPULoadedModel::~GPULoadedModel() { } } -void GPULoadedModel::OverwriteFrom(const GPULoadedModel &from) { +void GPULoadedModel::Load(const GPULoadedModel &from) { srcVocabs_ = from.srcVocabs_; trgVocab_ = from.trgVocab_; @@ -66,7 +66,7 @@ void GPULoadedModel::OverwriteFrom(const GPULoadedModel &from) { } } -void GPULoadedModel::OverwriteFrom(const CPULoadedModel &from) { +void GPULoadedModel::Load(const CPULoadedModel &from) { srcVocabs_ = from.SrcVocabs(); trgVocab_ = from.TrgVocab(); for (size_t i = 0; i < parameters_.size(); ++i) { diff --git a/src/translator/swappable.h b/src/translator/swappable.h index f9a276e46..91f92adad 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -61,8 +61,8 @@ class GPULoadedModel { Ptr TrgVocab() const { return trgVocab_; } // Overwrite this model with parameters from a different one. 
- void OverwriteFrom(const CPULoadedModel &from); - void OverwriteFrom(const GPULoadedModel &from); + void Load(const CPULoadedModel &from); + void Load(const GPULoadedModel &from); Histories Translate(const std::vector &input); }; From 8bcfdcc2aef76a5fc86a9015a594af56fa4930fc Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 2 Apr 2021 22:54:24 +0000 Subject: [PATCH 020/135] Check device index --- src/translator/swappable.cpp | 10 +++++++++- src/translator/swappable.h | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index fbaac7bba..c5c2bae05 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -20,8 +20,16 @@ void GPUEngine::SwapPointers(std::vector &with) { } } +namespace { +DeviceId LookupGPU(const Ptr options, size_t deviceIdx) { + auto devices = Config::getDevices(options); + ABORT_IF(deviceIdx >= devices.size(), "GPU device index higher than configured."); + return devices[deviceIdx]; +} +} // namespace + GPUEngine::GPUEngine(Ptr options, size_t deviceIdx) - : options_(options), graph_(New()), myDeviceId_(Config::getDevices(options)[deviceIdx]), allocator_(myDeviceId_, 0, 128 * 1048576) { + : options_(options), graph_(New()), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); options_->set("inference", true); options_->set("shuffle", "none"); diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 91f92adad..b3cb5f82f 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -26,7 +26,7 @@ class GPUEngine { Ptr options_; Ptr graph_; std::vector > scorers_; - const marian::DeviceId myDeviceId_; + const DeviceId myDeviceId_; Allocator allocator_; void SwapPointers(std::vector &with); From a893f19fab2d6d57018d4b8b76958290b56da1c3 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 12 Feb 2021 09:26:57 +0200 Subject: [PATCH 021/135] Start working on code to reproduce a bug i encountered --- src/CMakeLists.txt | 5 +++++ src/command/bug_repro.cpp | 27 +++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 src/command/bug_repro.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 98d5c4e98..a3a8008e1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -261,6 +261,11 @@ if (NOT COMPILE_LIBRARY_ONLY) add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz") add_custom_target(philly DEPENDS marian_tgz marian_zip) + add_executable(bug_repro command/bug_repro.cpp) + set_target_properties(bug_repro PROPERTIES OUTPUT_NAME bug_repro) + target_compile_options(bug_repro PRIVATE ${ALL_WARNINGS} -Wno-suggest-override) + set(EXECUTABLES ${EXECUTABLES} bug_repro) + if(COMPILE_SERVER) add_executable(marian_server command/marian_server.cpp) set_target_properties(marian_server PROPERTIES OUTPUT_NAME marian-server) diff --git a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp new file mode 100644 index 000000000..2025249e0 --- /dev/null +++ b/src/command/bug_repro.cpp @@ -0,0 +1,27 @@ +#include "../models/model_factory.h" +#include "../models/model_task.h" +#include "marian.h" + +namespace marian { + +class ReproTask : public marian::ModelServiceTask { +private: + Ptr graph_; + Ptr builder_; // Training model + +public: + ReproTask() { + graph_ = New(); + graph_->setDevice({0, DeviceType::cpu}); + graph_->reserveWorkspaceMB(128); + // builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); + } + std::string run(const std::string& json) override { + return ""; + } +}; + +int main(int argc, char **argv) { + return 0; +} +} From 7f6d01e835e95f096ae147c3167a2f1ba1e83496 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 17 Feb 2021 15:16:58 +0200 Subject: [PATCH 022/135] Build the model implement a 
simplistic training loop This doesn't work though because we're missing a lot of options because we initialize them manually instead of using the config parser. --- src/command/bug_repro.cpp | 89 +++++++++++++++++++++++++++++++++------ 1 file changed, 77 insertions(+), 12 deletions(-) diff --git a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp index 2025249e0..20abfec28 100644 --- a/src/command/bug_repro.cpp +++ b/src/command/bug_repro.cpp @@ -1,27 +1,92 @@ +#include "../common/options.h" +#include "../data/text_input.h" #include "../models/model_factory.h" #include "../models/model_task.h" +#include "../training/scheduler.h" #include "marian.h" namespace marian { -class ReproTask : public marian::ModelServiceTask { -private: - Ptr graph_; - Ptr builder_; // Training model - +class ReproTask : public marian::ModelTask { public: ReproTask() { - graph_ = New(); - graph_->setDevice({0, DeviceType::cpu}); - graph_->reserveWorkspaceMB(128); - // builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); } - std::string run(const std::string& json) override { - return ""; + void run() override { + io::InputFileStream strm("/home/rihards/exp/marian-adaptive-crash-repro/models/model.npz.repro.yml"); + YAML::Node optionsNode = YAML::Load(strm); + auto optionsBig = New(optionsNode); + auto options = New("after", "0e"); + options->merge(optionsBig); + auto builder = models::createCriterionFunctionFromOptions(options, models::usage::training); + auto optimizer = Optimizer(0.01); + + std::vector vocabPaths + = {"/home/rihards/exp/marian-adaptive-crash-repro/models/train.1-to-1.bpe.en-lv.yml", + "/home/rihards/exp/marian-adaptive-crash-repro/models/train.1-to-1.bpe.en-lv.yml"}; + std::vector maxVocabs = {500, 500}; + + std::vector> vocabs; + for(size_t i = 0; i < vocabPaths.size(); i++) { + Ptr vocab = New(options, i); + vocab->load(vocabPaths[i], maxVocabs[i]); + vocabs.emplace_back(vocab); + } + std::string sources = "del@@ e@@ tions 
affecting 13 q 14 are also the most frequent structural genetic ab@@ " + "err@@ ations in chronic lym@@ pho@@ cy@@ tic leu@@ ka@@ emia ( C@@ ll ) 6,@@ 7 , 8 " + ".\nthis region is found to be heter@@ oz@@ y@@ g@@ ously deleted in 30 ¬ 60 % and hom@@ " + "oz@@ y@@ g@@ ously deleted in 10 ¬ 20 % of C@@ ll patien@@ ts@@ 9 ."; + std::string targets + = "del@@ ē@@ cijas , kas ietekmē 13 q 14 , arī ir visbiežāk sastopa@@ mās strukturālās " + "ģenē@@ tiskās ab@@ er@@ ācijas hron@@ iskā lim@@ foc@@ ī@@ tiskajā leik@@ ēm@@ ijā ( " + "H@@ LL ) 6,@@ 7 , 8 .\n30 –@@ 60 % H@@ LL pacientu ir konstatēta šī reģiona heter@@ " + "oz@@ ig@@ ota del@@ ē@@ cija , savukārt 10 –@@ 20 % H@@ LL pacientu ir konstatēta šī " + "reģiona hom@@ oz@@ ig@@ ota del@@ ē@@ c@@ ij@@ a@@ 9 ."; + auto inputs = New(std::vector({sources, targets}), vocabs, options); + auto batches = New>(inputs, options); + + auto state = New(options->get("learn-rate")); + auto scheduler = New(options, state); + scheduler->registerTrainingObserver(scheduler); + scheduler->registerTrainingObserver(optimizer); + + Ptr graph; + + bool first = true; + scheduler->started(); + while(scheduler->keepGoing()) { + batches->prepare(); + + for(auto batch : *batches) { + if(!scheduler->keepGoing()) { + break; + } + + if(first) { + graph = New(); + graph->setDevice({0, DeviceType::cpu}); + graph->reserveWorkspaceMB(128); + first = false; + } + + auto lossNode = builder->build(graph, batch); + graph->forward(); + StaticLoss loss = *lossNode; + graph->backward(); + + optimizer->update(graph); + scheduler->update(loss, batch); + } + + if(scheduler->keepGoing()) + scheduler->increaseEpoch(); + } + scheduler->finished(); } }; +} int main(int argc, char **argv) { + auto task = marian::ReproTask(); + task.run(); return 0; } -} From f4e227e019c4b79d565971f787e8d51a53a9b05c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 17 Feb 2021 17:14:35 +0200 Subject: [PATCH 023/135] Load config using the cli parser so that 
we can have default values for options --- src/command/bug_repro.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp index 20abfec28..f69f5a74f 100644 --- a/src/command/bug_repro.cpp +++ b/src/command/bug_repro.cpp @@ -1,3 +1,4 @@ +#include "../common/config_parser.h" #include "../common/options.h" #include "../data/text_input.h" #include "../models/model_factory.h" @@ -12,11 +13,20 @@ class ReproTask : public marian::ModelTask { ReproTask() { } void run() override { - io::InputFileStream strm("/home/rihards/exp/marian-adaptive-crash-repro/models/model.npz.repro.yml"); - YAML::Node optionsNode = YAML::Load(strm); - auto optionsBig = New(optionsNode); - auto options = New("after", "0e"); - options->merge(optionsBig); + auto parser = ConfigParser(cli::mode::training); + // i'm prob leaking memory at the end of run() but i don't care + const char* argseasy[] + = {"marian", + "-c", + "/home/rihards/exp/marian-adaptive-crash-repro/models/model.npz.repro.yml"}; + int argc = sizeof(argseasy) / sizeof(char*); + // this is as close as i could get to initializing a char** in a sane manner + char** args = new char*[argc]; + for (int i = 0; i < argc; i++) { + args[i] = strdup(argseasy[i]); + } + auto options = parser.parseOptions(argc, args, false); + auto builder = models::createCriterionFunctionFromOptions(options, models::usage::training); auto optimizer = Optimizer(0.01); From dcb71223a180a0fd4cb0176fd9dd2b10b2a640af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 24 Feb 2021 13:11:16 +0200 Subject: [PATCH 024/135] Add dummy values for training sets in the config --- src/command/bug_repro.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp index f69f5a74f..6d9c48b7e 100644 --- a/src/command/bug_repro.cpp +++ b/src/command/bug_repro.cpp @@ -18,7 +18,8 @@ class ReproTask : public 
marian::ModelTask { const char* argseasy[] = {"marian", "-c", - "/home/rihards/exp/marian-adaptive-crash-repro/models/model.npz.repro.yml"}; + "/home/rihards/exp/marian-adaptive-crash-repro/models/model.npz.repro.yml", + "-t", "dummy-value", "-t", "dummy-value"}; int argc = sizeof(argseasy) / sizeof(char*); // this is as close as i could get to initializing a char** in a sane manner char** args = new char*[argc]; From fcb9a61bee28b983b505c4bb7e7531d66eacfb3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 1 Mar 2021 11:06:47 +0200 Subject: [PATCH 025/135] Repeat the graph initialization in a cycle --- src/command/bug_repro.cpp | 66 +++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp index 6d9c48b7e..10ac3cadb 100644 --- a/src/command/bug_repro.cpp +++ b/src/command/bug_repro.cpp @@ -19,7 +19,11 @@ class ReproTask : public marian::ModelTask { = {"marian", "-c", "/home/rihards/exp/marian-adaptive-crash-repro/models/model.npz.repro.yml", - "-t", "dummy-value", "-t", "dummy-value"}; + "-t", "dummy-value", "-t", "dummy-value", + "--after-batches", "20", + "--after-epochs", "20", + "--learn-rate", "0.1", + "--mini-batch", "1"}; int argc = sizeof(argseasy) / sizeof(char*); // this is as close as i could get to initializing a char** in a sane manner char** args = new char*[argc]; @@ -55,43 +59,45 @@ class ReproTask : public marian::ModelTask { auto inputs = New(std::vector({sources, targets}), vocabs, options); auto batches = New>(inputs, options); - auto state = New(options->get("learn-rate")); - auto scheduler = New(options, state); - scheduler->registerTrainingObserver(scheduler); - scheduler->registerTrainingObserver(optimizer); + for(size_t i = 0; i < 10; i++) { + auto state = New(options->get("learn-rate")); + auto scheduler = New(options, state); + scheduler->registerTrainingObserver(scheduler); + 
scheduler->registerTrainingObserver(optimizer); - Ptr graph; + Ptr graph; - bool first = true; - scheduler->started(); - while(scheduler->keepGoing()) { - batches->prepare(); + bool first = true; + scheduler->started(); + while(scheduler->keepGoing()) { + batches->prepare(); - for(auto batch : *batches) { - if(!scheduler->keepGoing()) { - break; - } + for(auto batch : *batches) { + if(!scheduler->keepGoing()) { + break; + } - if(first) { - graph = New(); - graph->setDevice({0, DeviceType::cpu}); - graph->reserveWorkspaceMB(128); - first = false; - } + if(first) { + graph = New(); + graph->setDevice({0, DeviceType::cpu}); + graph->reserveWorkspaceMB(128); + first = false; + } - auto lossNode = builder->build(graph, batch); - graph->forward(); - StaticLoss loss = *lossNode; - graph->backward(); + auto lossNode = builder->build(graph, batch); + graph->forward(); + StaticLoss loss = *lossNode; + graph->backward(); - optimizer->update(graph); - scheduler->update(loss, batch); - } + optimizer->update(graph); + scheduler->update(loss, batch); + } - if(scheduler->keepGoing()) - scheduler->increaseEpoch(); + if(scheduler->keepGoing()) + scheduler->increaseEpoch(); + } + scheduler->finished(); } - scheduler->finished(); } }; } From 6560067f96be5bb31140093adefdb521a1f2fbdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 2 Mar 2021 15:52:20 +0200 Subject: [PATCH 026/135] Add a part of the self adaptive marian's implementation --- CMakeLists.txt | 8 +- src/CMakeLists.txt | 8 + src/command/marian_adaptive.cpp | 66 +++++++ src/common/config_parser.cpp | 98 +++++---- src/common/config_parser.h | 2 +- src/translator/output_collector.cpp | 4 + src/translator/output_collector.h | 15 +- src/translator/self_adaptive.h | 296 ++++++++++++++++++++++++++++ 8 files changed, 453 insertions(+), 44 deletions(-) create mode 100644 src/command/marian_adaptive.cpp create mode 100644 src/translator/self_adaptive.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 
dffbd1ff2..a1ebded05 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,7 @@ option(COMPILE_CPU "Compile CPU version" ON) option(COMPILE_CUDA "Compile GPU version" ON) option(COMPILE_EXAMPLES "Compile examples" OFF) option(COMPILE_SERVER "Compile marian-server" OFF) +option(COMPILE_ADAPTIVE "Compile marian-ADAPTIVE" OFF) option(COMPILE_TESTS "Compile tests" OFF) option(USE_APPLE_ACCELERATE "Compile with Apple Accelerate" OFF) option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) @@ -464,7 +465,7 @@ endif(COMPILE_CPU) ############################################################################### # Find OpenSSL set(BOOST_COMPONENTS "") -if(COMPILE_SERVER) +if(COMPILE_SERVER OR COMPILE_ADAPTIVE) find_package(OpenSSL) if(OpenSSL_FOUND) message(STATUS "Found OpenSSL") @@ -479,10 +480,11 @@ if(COMPILE_SERVER) endif() set(BOOST_COMPONENTS ${BOOST_COMPONENTS} system) else(OpenSSL_FOUND) - message(WARNING "Cannot find OpenSSL library. Not compiling server.") + message(WARNING "Cannot find OpenSSL library. 
Not compiling server or marian-adaptive.") set(COMPILE_SERVER "off") + set(COMPILE_ADAPTIVE "off") endif(OpenSSL_FOUND) -endif(COMPILE_SERVER) +endif(COMPILE_SERVER OR COMPILE_ADAPTIVE) ############################################################################### # Undo static lib search and put non-static searches here: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a3a8008e1..6084f091e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -246,6 +246,7 @@ if (NOT COMPILE_LIBRARY_ONLY) "${CMAKE_BINARY_DIR}/marian-scorer" "${CMAKE_BINARY_DIR}/marian-vocab" "${CMAKE_BINARY_DIR}/marian-conv" + "${CMAKE_BINARY_DIR}/marian-adaptive" DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv) add_custom_target(marian_zip DEPENDS "${CMAKE_BINARY_DIR}/marian.zip") @@ -257,6 +258,7 @@ if (NOT COMPILE_LIBRARY_ONLY) "marian-scorer" "marian-vocab" "marian-conv" + "marian-adaptive" DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv) add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz") add_custom_target(philly DEPENDS marian_tgz marian_zip) @@ -279,6 +281,12 @@ if (NOT COMPILE_LIBRARY_ONLY) set(EXECUTABLES ${EXECUTABLES} marian_server) endif(COMPILE_SERVER) + if(COMPILE_ADAPTIVE) + add_executable(marian_adaptive command/marian_adaptive.cpp) + set_target_properties(marian_adaptive PROPERTIES OUTPUT_NAME marian-adaptive) + set(EXECUTABLES ${EXECUTABLES} marian_adaptive) + endif(COMPILE_ADAPTIVE) + foreach(exec ${EXECUTABLES}) target_link_libraries(${exec} marian) if(CUDA_FOUND) diff --git a/src/command/marian_adaptive.cpp b/src/command/marian_adaptive.cpp new file mode 100644 index 000000000..0f64a84ca --- /dev/null +++ b/src/command/marian_adaptive.cpp @@ -0,0 +1,66 @@ +#include "marian.h" + +#include "3rd_party/simple-websocket-server/server_ws.hpp" +#include "common/file_stream.h" +#include "common/timer.h" +#include "common/utils.h" +#include "training/training.h" +#include 
"translator/self_adaptive.h" + +using namespace marian; + +typedef SimpleWeb::SocketServer WSServer; + +int main(int argc, char **argv) { + auto options = parseOptions(argc, argv, cli::mode::selfadaptive); + auto task = New(options); + + if(options->has("port") && options->get("port") != 0) { + // Initialize web server + WSServer server; + server.config.port = options->get("port", 8080); + + auto &translate = server.endpoint["^/translate/?$"]; + + translate.on_message = [&task](Ptr connection, + Ptr message) { + auto sendStream = std::make_shared(); + + // Get input text + auto inputText = message->string(); + + // Translate + timer::Timer timer; + auto outputText = task->run(inputText); + LOG(info, "Best translation: {}", outputText); + *sendStream << outputText << std::endl; + LOG(info, "Translation took: {:.5f}s", timer.elapsed()); + + // Send translation back + connection->send(sendStream, [](const SimpleWeb::error_code &ec) { + if(ec) + LOG(error, "Error sending message: ({}) {}", ec.value(), ec.message()); + }); + }; + + // Error Codes for error code meanings + // http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html + translate.on_error = [](Ptr connection, const SimpleWeb::error_code &ec) { + LOG(error, "Connection error: ({}) {}", ec.value(), ec.message()); + }; + + // Start server thread + std::thread serverThread([&server]() { + LOG(info, "Server is listening on port {}", server.config.port); + server.start(); + }); + + serverThread.join(); + } else { + timer::Timer timer; + task->run(); + LOG(info, "Total time: {:.5f}s", timer.elapsed()); + } + + return 0; +} diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index b2c73b2b7..f611f9246 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -94,6 +94,11 @@ ConfigParser::ConfigParser(cli::mode mode) case cli::mode::embedding: addOptionsEmbedding(cli_); break; + case cli::mode::selfadaptive: + addOptionsTraining(cli_); + 
addOptionsValidation(cli_); + addOptionsServer(cli_); + break; default: ABORT("wrong CLI mode"); break; @@ -159,9 +164,11 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { void ConfigParser::addOptionsServer(cli::CLIWrapper& cli) { // clang-format off auto previous_group = cli.switchGroup("Server options"); + // TODO why is this needed? + size_t defaultPort = mode_ == cli::mode::selfadaptive ? 0 : 8080; cli.add("--port,-p", "Port number for web socket server", - 8080); + defaultPort); cli.switchGroup(previous_group); // clang-format on } @@ -318,7 +325,7 @@ void ConfigParser::addOptionsModel(cli::CLIWrapper& cli) { {1, 2, 3, 4, 5, 6, 7, 8}); #endif - if(mode_ == cli::mode::training) { + if(mode_ == cli::mode::training || mode_ == cli::mode::selfadaptive) { // TODO: add ->range(0,1); cli.add("--dropout-rnn", "Scaling dropout along rnn layers and time (0 = no dropout)"); @@ -370,9 +377,13 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { #endif // scheduling options + // In self-adaptive mode users would typically want less updates to happen than in regular training + size_t defaultAfterEpochs = (mode_ == cli::mode::selfadaptive) ? 2 : 0; + std::string defaultDispFreq = (mode_ == cli::mode::selfadaptive) ? "1" : "1000u"; + // @TODO: these should be re-defined as aliases for `--after` but the current frame work matches on value, so not doable. 
cli.add("--after-epochs,-e", - "Finish after this many epochs, 0 is infinity (deprecated, '--after-epochs N' corresponds to '--after Ne')"); // @TODO: replace with alias + "Finish after this many epochs, 0 is infinity (deprecated, '--after-epochs N' corresponds to '--after Ne')", defaultAfterEpochs); // @TODO: replace with alias cli.add("--after-batches", "Finish after this many batch updates, 0 is infinity (deprecated, '--after-batches N' corresponds to '--after Nu')"); // @TODO: replace with alias @@ -381,7 +392,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "0e"); cli.add("--disp-freq", "Display information every arg updates (append 't' for every arg target labels)", - "1000u"); + defaultDispFreq); cli.add("--disp-first", "Display information for the first arg updates"); cli.add("--disp-label-counts", @@ -401,31 +412,34 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { addSuboptionsTSV(cli); // data management options - cli.add("--shuffle", - "How to shuffle input data (data: shuffles data and sorted batches; batches: " - "data is read in order into batches, but batches are shuffled; none: no shuffling). 
" - "Use with '--maxi-batch-sort none' in order to achieve exact reading order", "data"); - cli.add("--no-shuffle", - "Shortcut for backwards compatiblity, equivalent to --shuffle none (deprecated)"); - cli.add("--no-restore-corpus", - "Skip restoring corpus state after training is restarted"); - cli.add("--tempdir,-T", - "Directory for temporary (shuffled) files and database", - "/tmp"); - cli.add("--sqlite", - "Use disk-based sqlite3 database for training corpus storage, default" - " is temporary with path creates persistent storage") - ->implicit_val("temporary"); - cli.add("--sqlite-drop", - "Drop existing tables in sqlite3 database"); + if (mode_ != cli::mode::selfadaptive) { + cli.add("--shuffle", + "How to shuffle input data (data: shuffles data and sorted batches; batches: " + "data is read in order into batches, but batches are shuffled; none: no shuffling). " + "Use with '--maxi-batch-sort none' in order to achieve exact reading order", "data"); + cli.add("--no-shuffle", + "Shortcut for backwards compatiblity, equivalent to --shuffle none (deprecated)"); + cli.add("--no-restore-corpus", + "Skip restoring corpus state after training is restarted"); + cli.add("--tempdir,-T", + "Directory for temporary (shuffled) files and database", + "/tmp"); + cli.add("--sqlite", + "Use disk-based sqlite3 database for training corpus storage, default" + " is temporary with path creates persistent storage") + ->implicit_val("temporary"); + cli.add("--sqlite-drop", + "Drop existing tables in sqlite3 database"); + } addSuboptionsDevices(cli); addSuboptionsBatching(cli); // optimizer options + auto defaultOptimizer = (mode_ == cli::mode::selfadaptive) ? "sgd" : "adam"; cli.add("--optimizer,-o", "Optimization algorithm: sgd, adagrad, adam", - "adam"); + defaultOptimizer); cli.add>("--optimizer-params", "Parameters for optimization algorithm, e.g. betas for Adam. 
" "Auto-adjusted to --mini-batch-words-ref if given"); @@ -635,8 +649,11 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { cli.add("--output,-o", "Path to output file, stdout by default", "stdout"); - cli.add>("--vocabs,-v", - "Paths to vocabulary files have to correspond to --input"); + // for self-adaptive mode vocabs are already added via the training options + if(mode_ != cli::mode::selfadaptive) { + cli.add>("--vocabs,-v", + "Paths to vocabulary files have to correspond to --input"); + } // decoding options cli.add("--beam-size,-b", "Beam size used during search with validating translator", @@ -668,16 +685,21 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Keep the output segmented into SentencePiece subwords"); #endif - addSuboptionsInputLength(cli); - addSuboptionsTSV(cli); - addSuboptionsDevices(cli); - addSuboptionsBatching(cli); + if(mode_ != cli::mode::selfadaptive) { + addSuboptionsInputLength(cli); + addSuboptionsTSV(cli); + addSuboptionsDevices(cli); + addSuboptionsBatching(cli); + } - cli.add("--fp16", - "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); - cli.add>("--precision", - "Mixed precision for inference, set parameter type in expression graph", - {"float32"}); + // for self-adaptive mode vocabs are already added via the training options + if(mode_ != cli::mode::selfadaptive) { + cli.add("--fp16", + "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); + cli.add>("--precision", + "Mixed precision for inference, set parameter type in expression graph", + {"float32"}); + } cli.add("--skip-cost", "Ignore model cost during translation, not recommended for beam-size > 1"); @@ -695,7 +717,8 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Path to model to swap to."); #if 0 // @TODO: Ask Hany if there are any decoding-time options // add ULR settings - addSuboptionsULR(cli); + if(mode_ != 
cli::mode::selfadaptive) + addSuboptionsULR(cli); #endif cli.switchGroup(previous_group); @@ -819,8 +842,9 @@ void ConfigParser::addSuboptionsDevices(cli::CLIWrapper& cli) { } void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { - int defaultMiniBatch = (mode_ == cli::mode::translation) ? 1 : 64; - int defaultMaxiBatch = (mode_ == cli::mode::translation) ? 1 : 100; + bool transMode = mode_ == cli::mode::translation || mode_ == cli::mode::selfadaptive; + int defaultMiniBatch = transMode ? 1 : 64; + int defaultMaxiBatch = transMode ? 1 : 100; std::string defaultMaxiBatchSort = (mode_ == cli::mode::translation) ? "none" : "trg"; // clang-format off @@ -852,7 +876,7 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { "Sorting strategy for maxi-batch: none, src, trg (not available for decoder)", defaultMaxiBatchSort); - if(mode_ == cli::mode::training) { + if(mode_ == cli::mode::training || mode_ == cli::mode::selfadaptive) { cli.add("--shuffle-in-ram", "Keep shuffled corpus in RAM, do not write to temp file"); // @TODO: Consider making the next two options options of the vocab instead, to make it more local in scope. 
diff --git a/src/common/config_parser.h b/src/common/config_parser.h index 18b6eccb7..b6b825d7d 100644 --- a/src/common/config_parser.h +++ b/src/common/config_parser.h @@ -14,7 +14,7 @@ namespace marian { namespace cli { -enum struct mode { training, translation, scoring, server, embedding }; +enum struct mode { training, translation, scoring, server, embedding, selfadaptive }; } // namespace cli /** diff --git a/src/translator/output_collector.cpp b/src/translator/output_collector.cpp index 078be232b..b74a5a54c 100644 --- a/src/translator/output_collector.cpp +++ b/src/translator/output_collector.cpp @@ -81,6 +81,10 @@ void OutputCollector::Write(long sourceId, StringCollector::StringCollector(bool quiet /*=false*/) : maxId_(-1), quiet_(quiet) {} +void StringCollector::Write(long sourceId, const std::string &best1, const std::string &bestn, bool) { + StringCollector::add(sourceId, best1, bestn); +} + void StringCollector::add(long sourceId, const std::string& best1, const std::string& bestn) { diff --git a/src/translator/output_collector.h b/src/translator/output_collector.h index 0e6bfc9f8..4b0c48f13 100644 --- a/src/translator/output_collector.h +++ b/src/translator/output_collector.h @@ -44,7 +44,12 @@ class GeometricPrinting : public PrintingStrategy { long next_{10}; }; -class OutputCollector { +struct CollectorBase { + virtual void Write(long sourceId, const std::string& best1, const std::string& bestn, bool nbest) + = 0; +}; + +class OutputCollector : public CollectorBase { public: OutputCollector(); OutputCollector(std::string outFile); @@ -57,7 +62,7 @@ class OutputCollector { void Write(long sourceId, const std::string& best1, const std::string& bestn, - bool nbest); + bool nbest) override; void setPrintingStrategy(Ptr strategy) { printing_ = strategy; @@ -72,11 +77,15 @@ class OutputCollector { std::mutex mutex_; }; -class StringCollector { +class StringCollector : public CollectorBase { public: StringCollector(bool quiet = false); 
StringCollector(const StringCollector&) = delete; + void Write(long sourceId, + const std::string& best1, + const std::string& bestn, + bool nbest) override; void add(long sourceId, const std::string& best1, const std::string& bestn); std::vector collect(bool nbest); diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h new file mode 100644 index 000000000..c03a85ea3 --- /dev/null +++ b/src/translator/self_adaptive.h @@ -0,0 +1,296 @@ +#pragma once + +#include "common/config.h" +#include "common/file_stream.h" +#include "data/batch_generator.h" +#include "data/text_input.h" +#include "models/model_task.h" +#include "training/scheduler.h" +#include "training/validator.h" + +namespace marian { + +using namespace data; + +class TrainSetReader { + std::vector> files_; + +public: + TrainSetReader(std::vector paths) { + for(auto& path : paths) + files_.emplace_back(new io::InputFileStream(path)); + } + + std::vector getSamples() { + // extracted lines for source and target corpora + std::vector samples; + // counters of number of lines extracted for source and target + std::vector counts; + + for(auto const& file : files_) { + size_t currCount = 0; + std::string lines; + std::string line; + while(io::getline(*file, line)) { + if(line.empty()) + break; + + if(currCount) + lines += "\n"; + lines += line; + currCount += 1; + } + + if(!lines.empty()) + samples.emplace_back(lines); + counts.push_back(currCount); + + // check if the same number of lines is extracted for source and target + size_t prevCount = counts[0]; + for(size_t i = 1; i < counts.size(); ++i) { + ABORT_IF(prevCount != counts[i], + "An empty source or target sentence has been encountered!"); + prevCount = counts[i]; + } + } + + return samples; + } +}; + +class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { +public: + TrainSelfAdaptive(Ptr options) : options_(options) { + + // @TODO: should probably better re-enable the shuffling related options + // in config for 
marian-adaptive + options_->set("shuffle", "none"); + // Set up translator options + optionsTrans_ = New(options_->clone()); + optionsTrans_->set("mini-batch", 1); + optionsTrans_->set("maxi-batch", 1); + optionsTrans_->set("max-length", 1000); + optionsTrans_->set("shuffle", "none"); + + // auto deviceId = Config::getDevices(options_)[0]; + + // Initialize model for training + // graph_ = New(); + // graph_->setDevice(deviceId); + // graph_->reserveWorkspaceMB(options_->get("workspace")); + builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); + + optimizer_ = Optimizer(options_); + + // Initialize model for translation + Ptr opts = New(); + opts->merge(options_); + opts->set("inference", true); + // builderTrans_ = models::createModelFromOptions(opts, models::usage::translation); + + // Initialize a scorer for translation + auto model = options_->get("model"); + // Ptr scorer = New(builderTrans_, "", 1.0f, model); + // scorers_.push_back(scorer); + + // Read vocabularies + auto vocabPaths = options_->get>("vocabs"); + std::vector maxVocabs = options_->get>("dim-vocabs"); + for(size_t i = 0; i < vocabPaths.size(); ++i) { + Ptr vocab = New(options_, i); + vocab->load(vocabPaths[i], maxVocabs[i]); + vocabs_.emplace_back(vocab); + } + + // Load model + // builder_->load(graph_, model); + } + + std::string run(const std::string& json) override { + //LOG(warn, "REMOVEME Received Json:\n{}", json); + + // Check if input is in JSON + YAML::Node yaml = YAML::Load(json); + if(!yaml["input"]) { + LOG(warn, "No 'input' node found in the request"); + return ""; + } + + // Get input sentences + auto input = yaml["input"].as(); + std::vector> srcVocabs(vocabs_.begin(), vocabs_.end() - 1); + auto testSet = New(std::vector({input}), srcVocabs, optionsTrans_); + + // Prepare batches + auto testBatches = New>(testSet, optionsTrans_); + testBatches->prepare(); + + // Initialize output printing + auto collector = New(); + auto printer = 
New(options_, vocabs_.back()); + + // Get training sentences + std::vector> contexts; + if(yaml["context"]) + contexts = yaml["context"].as>>(); + + LOG(info, "Running..."); + + size_t id = 0; + for(auto testBatch : *testBatches) { + if(contexts.size() > id && !contexts[id].empty()) { + // train(contexts[id]); + translate(testBatch, collector, printer, graphAdapt_); + } else { + LOG(info, "No context provided for sentence {}", id); + // translate(testBatch, collector, printer, graph_); + } + + // iterating by 1 is quite safe because the mini-batch size for + // translation is always 1 + ++id; + } + + auto translations = collector->collect(options_->get("n-best")); + YAML::Emitter output; + output << YAML::DoubleQuoted << YAML::Flow << utils::join(translations, "\\n"); + return "{\"output\":" + std::string(output.c_str()) + "}"; + } + + void run() override { + // Initialize input data + auto srcPaths = options_->get>("input"); + std::vector> srcVocabs(vocabs_.begin(), vocabs_.end() - 1); + auto testSet = New(srcPaths, srcVocabs, optionsTrans_); + + // Prepare batches + auto testBatches = New>(testSet, optionsTrans_); + testBatches->prepare(); + + // Initialize output printing + auto collector = New(options_->get("output")); + if(options_->get("quiet-translation")) + collector->setPrintingStrategy(New()); + auto printer = New(options_, vocabs_.back()); + + // Initialize train data + auto trainPaths = options_->get>("train-sets"); + auto trainSets = New(trainPaths); + + LOG(info, "Running..."); + + // auto state = New(options_->get("learn-rate")); + // auto scheduler = New(options_, state); + // scheduler->registerTrainingObserver(scheduler); + // scheduler->registerTrainingObserver(optimizer_); + + for(auto testBatch : *testBatches) { + auto trainSet = trainSets->getSamples(); + + if(!trainSet.empty()) { + LOG(info, "### NEW TEST BATCH"); + train(trainSet, nullptr); + translate(testBatch, collector, printer, graphAdapt_); + } else { + LOG(info, "### EMPTY TEST 
BATCH"); + // translate(testBatch, collector, printer, graph_); + } + } + } + +private: + Ptr options_; // Options for training + Ptr optionsTrans_; // Options for translator + + Ptr builder_; // Training model + // Ptr builderTrans_; // Translation model + // Ptr graph_; // A graph with original parameters + Ptr graphAdapt_; // A graph on which training is performed + + std::vector> vocabs_; + // std::vector> scorers_; + Ptr optimizer_; + + void train(std::vector trainSents, std::shared_ptr _scheduler) { + auto state = New(options_->get("learn-rate")); + auto scheduler = New(options_, state); + scheduler->registerTrainingObserver(scheduler); + scheduler->registerTrainingObserver(optimizer_); + + auto trainSet = New(trainSents, vocabs_, options_); + auto trainBatches = New>(trainSet, options_); + + bool first = true; + + scheduler->started(); + while(scheduler->keepGoing()) { + trainBatches->prepare(); + + LOG(info, "### NEW BATCHES"); + for(auto batch : *trainBatches) { + if(!scheduler->keepGoing()) + break; + + LOG(info, "### NEW BATCH"); + // Copy params from the original model + if(first) { + auto deviceId = Config::getDevices(options_)[0]; + graphAdapt_ = New(); + graphAdapt_->setDevice(deviceId); + graphAdapt_->reserveWorkspaceMB(options_->get("workspace")); + + // builder_->build(graph_, batch); + // graph_->forward(); + + // graphAdapt_ = New(); + // graphAdapt_->setDevice(graph_->getDeviceId()); + // graphAdapt_->reuseWorkspace(graph_); + + // graphAdapt_->copyParams(graph_); + first = false; + } + + // Make an update step on the copy of the model + auto lossNode = builder_->build(graphAdapt_, batch); + graphAdapt_->forward(); + StaticLoss loss = *lossNode; + graphAdapt_->backward(); + + // Notify optimizer and scheduler + optimizer_->update(graphAdapt_); + scheduler->update(loss, batch); + } + if(scheduler->keepGoing()) + scheduler->increaseEpoch(); + } + scheduler->finished(); + } + + void translate(Ptr batch, + Ptr collector, + Ptr printer, + Ptr graph) 
{ + // graph->setInference(true); + // graph->clear(); + + // { + // auto search = New(options_, + // scorers_, + // vocabs_.back()); + // auto histories = search->search(graph, batch); + + // for(auto history : histories) { + // std::stringstream best1; + // std::stringstream bestn; + // printer->print(history, best1, bestn); + // collector->Write(history->getLineNum(), + // best1.str(), + // bestn.str(), + // options_->get("n-best")); + // } + // } + + // graph->setInference(false); + } +}; +} From 7130800012b689a4806efdf4659ee76d67d6baa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 3 Mar 2021 15:29:56 +0200 Subject: [PATCH 027/135] Fix compatability issues with some new refactors in master --- src/command/bug_repro.cpp | 4 ++-- src/common/config_validator.cpp | 5 +++++ src/translator/self_adaptive.h | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp index 10ac3cadb..f37c47add 100644 --- a/src/command/bug_repro.cpp +++ b/src/command/bug_repro.cpp @@ -33,7 +33,7 @@ class ReproTask : public marian::ModelTask { auto options = parser.parseOptions(argc, args, false); auto builder = models::createCriterionFunctionFromOptions(options, models::usage::training); - auto optimizer = Optimizer(0.01); + auto optimizer = Optimizer(New("optimizer", "adam", "learn-rate", 0.01)); std::vector vocabPaths = {"/home/rihards/exp/marian-adaptive-crash-repro/models/train.1-to-1.bpe.en-lv.yml", @@ -89,7 +89,7 @@ class ReproTask : public marian::ModelTask { StaticLoss loss = *lossNode; graph->backward(); - optimizer->update(graph); + optimizer->update(graph, 1); scheduler->update(loss, batch); } diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index b24001450..92c86c553 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -35,6 +35,11 @@ void ConfigValidator::validateOptions(cli::mode mode) const { 
validateOptionsParallelData(); validateOptionsTraining(); break; + case cli::mode::selfadaptive: + validateOptionsTranslation(); + validateOptionsParallelData(); + validateOptionsTraining(); + break; default: ABORT("wrong CLI mode"); break; diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index c03a85ea3..318acff6a 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -257,7 +257,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { graphAdapt_->backward(); // Notify optimizer and scheduler - optimizer_->update(graphAdapt_); + optimizer_->update(graphAdapt_, 1); scheduler->update(loss, batch); } if(scheduler->keepGoing()) From 10cdffab9cd891eb69aada7df8df6870119df240 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 4 Mar 2021 17:12:29 +0200 Subject: [PATCH 028/135] Fix options parsing issues --- src/common/config_parser.cpp | 12 ++++++------ src/common/config_validator.cpp | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index f611f9246..79d0b7d1e 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -96,7 +96,7 @@ ConfigParser::ConfigParser(cli::mode mode) break; case cli::mode::selfadaptive: addOptionsTraining(cli_); - addOptionsValidation(cli_); + addOptionsTranslation(cli_); addOptionsServer(cli_); break; default: @@ -646,11 +646,11 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { cli.add>("--input,-i", "Paths to input file(s), stdin by default", {"stdin"}); - cli.add("--output,-o", - "Path to output file, stdout by default", - "stdout"); - // for self-adaptive mode vocabs are already added via the training options + // for self-adaptive mode these are already added via the training options if(mode_ != cli::mode::selfadaptive) { + cli.add("--output,-o", + "Path to output file, stdout by default", + "stdout"); 
cli.add>("--vocabs,-v", "Paths to vocabulary files have to correspond to --input"); } @@ -723,7 +723,7 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { cli.switchGroup(previous_group); // clang-format on -} + } void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { auto previous_group = cli.switchGroup("Scorer options"); diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index 92c86c553..cbb8c3d86 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -36,9 +36,9 @@ void ConfigValidator::validateOptions(cli::mode mode) const { validateOptionsTraining(); break; case cli::mode::selfadaptive: - validateOptionsTranslation(); - validateOptionsParallelData(); - validateOptionsTraining(); + // validateOptionsTranslation(); + // validateOptionsParallelData(); + // validateOptionsTraining(); break; default: ABORT("wrong CLI mode"); From 286a23c8e5f6426f7bf8abfb06b9fbcc308920a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 10 Mar 2021 17:20:25 +0200 Subject: [PATCH 029/135] Fix remaining input parsing issues --- src/common/config_parser.cpp | 20 +++++++++++++++----- src/common/config_parser.h | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 79d0b7d1e..b46b6a6e7 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -98,6 +98,7 @@ ConfigParser::ConfigParser(cli::mode mode) addOptionsTraining(cli_); addOptionsTranslation(cli_); addOptionsServer(cli_); + addOptionsStupid(cli_); break; default: ABORT("wrong CLI mode"); @@ -108,7 +109,16 @@ ConfigParser::ConfigParser(cli::mode mode) // clang-format on } -void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { +void ConfigParser::addOptionsStupid(cli::CLIWrapper & cli) { + auto previous_group = cli.switchGroup("Server options"); + cli.add( + "--early-stopping", + "Stop if the first validation metric 
does not improve for arg consecutive validation steps", + 10); + cli.switchGroup(previous_group); +} + +void ConfigParser::addOptionsGeneral(cli::CLIWrapper & cli) { int defaultWorkspace = (mode_ == cli::mode::translation) ? 512 : 2048; cli.switchGroup("General options"); @@ -437,7 +447,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { // optimizer options auto defaultOptimizer = (mode_ == cli::mode::selfadaptive) ? "sgd" : "adam"; - cli.add("--optimizer,-o", + cli.add("--optimizer", "Optimization algorithm: sgd, adagrad, adam", defaultOptimizer); cli.add>("--optimizer-params", @@ -646,11 +656,11 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { cli.add>("--input,-i", "Paths to input file(s), stdin by default", {"stdin"}); + cli.add("--output,-o", + "Path to output file, stdout by default", + "stdout"); // for self-adaptive mode these are already added via the training options if(mode_ != cli::mode::selfadaptive) { - cli.add("--output,-o", - "Path to output file, stdout by default", - "stdout"); cli.add>("--vocabs,-v", "Paths to vocabulary files have to correspond to --input"); } diff --git a/src/common/config_parser.h b/src/common/config_parser.h index b6b825d7d..744656458 100644 --- a/src/common/config_parser.h +++ b/src/common/config_parser.h @@ -130,6 +130,7 @@ class ConfigParser { void addOptionsTranslation(cli::CLIWrapper&); void addOptionsScoring(cli::CLIWrapper&); void addOptionsEmbedding(cli::CLIWrapper&); + void addOptionsStupid(cli::CLIWrapper&); void addAliases(cli::CLIWrapper&); From 85685c640dbc663ede34d0a1f06fd8cabef399cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 19 Mar 2021 13:48:06 +0200 Subject: [PATCH 030/135] Re-enable all of the adaptive code --- src/translator/self_adaptive.h | 106 ++++++++++++++++----------------- 1 file changed, 50 insertions(+), 56 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 318acff6a..e45602be6 100644 
--- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -72,12 +72,12 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { optionsTrans_->set("max-length", 1000); optionsTrans_->set("shuffle", "none"); - // auto deviceId = Config::getDevices(options_)[0]; + auto deviceId = Config::getDevices(options_)[0]; // Initialize model for training - // graph_ = New(); - // graph_->setDevice(deviceId); - // graph_->reserveWorkspaceMB(options_->get("workspace")); + graph_ = New(); + graph_->setDevice(deviceId); + graph_->reserveWorkspaceMB(options_->get("workspace")); builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); optimizer_ = Optimizer(options_); @@ -86,12 +86,12 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Ptr opts = New(); opts->merge(options_); opts->set("inference", true); - // builderTrans_ = models::createModelFromOptions(opts, models::usage::translation); + builderTrans_ = models::createModelFromOptions(opts, models::usage::translation); // Initialize a scorer for translation auto model = options_->get("model"); - // Ptr scorer = New(builderTrans_, "", 1.0f, model); - // scorers_.push_back(scorer); + Ptr scorer = New(builderTrans_, "", 1.0f, model); + scorers_.push_back(scorer); // Read vocabularies auto vocabPaths = options_->get>("vocabs"); @@ -103,7 +103,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { } // Load model - // builder_->load(graph_, model); + builder_->load(graph_, model); } std::string run(const std::string& json) override { @@ -139,11 +139,11 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { size_t id = 0; for(auto testBatch : *testBatches) { if(contexts.size() > id && !contexts[id].empty()) { - // train(contexts[id]); + train(contexts[id]); translate(testBatch, collector, printer, graphAdapt_); } else { LOG(info, "No context provided for sentence {}", id); - // translate(testBatch, collector, 
printer, graph_); + translate(testBatch, collector, printer, graph_); } // iterating by 1 is quite safe because the mini-batch size for @@ -179,21 +179,16 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "Running..."); - // auto state = New(options_->get("learn-rate")); - // auto scheduler = New(options_, state); - // scheduler->registerTrainingObserver(scheduler); - // scheduler->registerTrainingObserver(optimizer_); - for(auto testBatch : *testBatches) { auto trainSet = trainSets->getSamples(); if(!trainSet.empty()) { - LOG(info, "### NEW TEST BATCH"); - train(trainSet, nullptr); + LOG(info, "# NEW TEST BATCH"); + train(trainSet); translate(testBatch, collector, printer, graphAdapt_); } else { - LOG(info, "### EMPTY TEST BATCH"); - // translate(testBatch, collector, printer, graph_); + LOG(info, "# EMPTY TEST BATCH"); + translate(testBatch, collector, printer, graph_); } } } @@ -203,15 +198,15 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Ptr optionsTrans_; // Options for translator Ptr builder_; // Training model - // Ptr builderTrans_; // Translation model - // Ptr graph_; // A graph with original parameters + Ptr builderTrans_; // Translation model + Ptr graph_; // A graph with original parameters Ptr graphAdapt_; // A graph on which training is performed std::vector> vocabs_; - // std::vector> scorers_; + std::vector> scorers_; Ptr optimizer_; - void train(std::vector trainSents, std::shared_ptr _scheduler) { + void train(std::vector trainSents) { auto state = New(options_->get("learn-rate")); auto scheduler = New(options_, state); scheduler->registerTrainingObserver(scheduler); @@ -226,7 +221,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { while(scheduler->keepGoing()) { trainBatches->prepare(); - LOG(info, "### NEW BATCHES"); + LOG(info, "## NEW BATCHES"); for(auto batch : *trainBatches) { if(!scheduler->keepGoing()) break; @@ -234,19 +229,18 @@ class TrainSelfAdaptive : 
public ModelTask, public ModelServiceTask { LOG(info, "### NEW BATCH"); // Copy params from the original model if(first) { - auto deviceId = Config::getDevices(options_)[0]; - graphAdapt_ = New(); - graphAdapt_->setDevice(deviceId); - graphAdapt_->reserveWorkspaceMB(options_->get("workspace")); + builder_->build(graph_, batch); + // TODO: Why do we need to do a froward pass here? + graph_->forward(); - // builder_->build(graph_, batch); - // graph_->forward(); - - // graphAdapt_ = New(); - // graphAdapt_->setDevice(graph_->getDeviceId()); - // graphAdapt_->reuseWorkspace(graph_); + graphAdapt_ = New(); + graphAdapt_->setDevice(graph_->getDeviceId()); + graphAdapt_->reuseWorkspace(graph_); - // graphAdapt_->copyParams(graph_); + // TODO: why aren't we using a builder before this? + // it's probably because the order doesn't matter and the + // builder is used below + graphAdapt_->copyParams(graph_); first = false; } @@ -270,27 +264,27 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Ptr collector, Ptr printer, Ptr graph) { - // graph->setInference(true); - // graph->clear(); - - // { - // auto search = New(options_, - // scorers_, - // vocabs_.back()); - // auto histories = search->search(graph, batch); - - // for(auto history : histories) { - // std::stringstream best1; - // std::stringstream bestn; - // printer->print(history, best1, bestn); - // collector->Write(history->getLineNum(), - // best1.str(), - // bestn.str(), - // options_->get("n-best")); - // } - // } - - // graph->setInference(false); + graph->setInference(true); + graph->clear(); + + { + auto search = New(options_, + scorers_, + vocabs_.back()); + auto histories = search->search(graph, batch); + + for(auto history : histories) { + std::stringstream best1; + std::stringstream bestn; + printer->print(history, best1, bestn); + collector->Write(history->getLineNum(), + best1.str(), + bestn.str(), + options_->get("n-best")); + } + } + + graph->setInference(false); } }; } From 
fca5fe4b5e1e1a5dedbd0ff66dc838eb89424176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 19 Mar 2021 15:04:24 +0200 Subject: [PATCH 031/135] Some further debugging, ugh --- src/translator/self_adaptive.h | 62 +++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index e45602be6..1582bb997 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -89,7 +89,8 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { builderTrans_ = models::createModelFromOptions(opts, models::usage::translation); // Initialize a scorer for translation - auto model = options_->get("model"); + // auto model = options_->get("model"); + model = options_->get("model"); Ptr scorer = New(builderTrans_, "", 1.0f, model); scorers_.push_back(scorer); @@ -201,6 +202,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Ptr builderTrans_; // Translation model Ptr graph_; // A graph with original parameters Ptr graphAdapt_; // A graph on which training is performed + std::string model; std::vector> vocabs_; std::vector> scorers_; @@ -229,18 +231,22 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "### NEW BATCH"); // Copy params from the original model if(first) { - builder_->build(graph_, batch); - // TODO: Why do we need to do a froward pass here? - graph_->forward(); + // builder_->build(graph_, batch); + // // TODO: Why do we need to do a froward pass here? 
+ // graph_->forward(); graphAdapt_ = New(); - graphAdapt_->setDevice(graph_->getDeviceId()); - graphAdapt_->reuseWorkspace(graph_); + // graphAdapt_->setDevice(graph_->getDeviceId()); + auto deviceId = Config::getDevices(options_)[0]; + graphAdapt_->setDevice(deviceId); + // graphAdapt_->reuseWorkspace(graph_); + graphAdapt_->reserveWorkspaceMB(options_->get("workspace")); // TODO: why aren't we using a builder before this? // it's probably because the order doesn't matter and the // builder is used below - graphAdapt_->copyParams(graph_); + // graphAdapt_->copyParams(graph_); + // builder_->load(graphAdapt_, model); first = false; } @@ -264,27 +270,27 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Ptr collector, Ptr printer, Ptr graph) { - graph->setInference(true); - graph->clear(); - - { - auto search = New(options_, - scorers_, - vocabs_.back()); - auto histories = search->search(graph, batch); - - for(auto history : histories) { - std::stringstream best1; - std::stringstream bestn; - printer->print(history, best1, bestn); - collector->Write(history->getLineNum(), - best1.str(), - bestn.str(), - options_->get("n-best")); - } - } - - graph->setInference(false); + // graph->setInference(true); + // graph->clear(); + + // { + // auto search = New(options_, + // scorers_, + // vocabs_.back()); + // auto histories = search->search(graph, batch); + + // for(auto history : histories) { + // std::stringstream best1; + // std::stringstream bestn; + // printer->print(history, best1, bestn); + // collector->Write(history->getLineNum(), + // best1.str(), + // bestn.str(), + // options_->get("n-best")); + // } + // } + + // graph->setInference(false); } }; } From 37b6aa4d0d91de42ddaa41073271cc6c2a3946a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Mar 2021 11:33:44 +0300 Subject: [PATCH 032/135] Fix the way inputs are initialized --- src/command/bug_repro.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 
deletions(-) diff --git a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp index f37c47add..ee9bc905e 100644 --- a/src/command/bug_repro.cpp +++ b/src/command/bug_repro.cpp @@ -21,8 +21,9 @@ class ReproTask : public marian::ModelTask { "/home/rihards/exp/marian-adaptive-crash-repro/models/model.npz.repro.yml", "-t", "dummy-value", "-t", "dummy-value", "--after-batches", "20", - "--after-epochs", "20", + "--after-epochs", "4", "--learn-rate", "0.1", + "--shuffle", "none", "--mini-batch", "1"}; int argc = sizeof(argseasy) / sizeof(char*); // this is as close as i could get to initializing a char** in a sane manner @@ -56,8 +57,8 @@ class ReproTask : public marian::ModelTask { "H@@ LL ) 6,@@ 7 , 8 .\n30 –@@ 60 % H@@ LL pacientu ir konstatēta šī reģiona heter@@ " "oz@@ ig@@ ota del@@ ē@@ cija , savukārt 10 –@@ 20 % H@@ LL pacientu ir konstatēta šī " "reģiona hom@@ oz@@ ig@@ ota del@@ ē@@ c@@ ij@@ a@@ 9 ."; - auto inputs = New(std::vector({sources, targets}), vocabs, options); - auto batches = New>(inputs, options); + // auto inputs = New(std::vector({sources, targets}), vocabs, options); + // auto batches = New>(inputs, options); for(size_t i = 0; i < 10; i++) { auto state = New(options->get("learn-rate")); @@ -70,6 +71,11 @@ class ReproTask : public marian::ModelTask { bool first = true; scheduler->started(); while(scheduler->keepGoing()) { + // if inputs aren't initialized for each epoch, their internal istringstreams get exhausted + auto inputs + = New(std::vector({sources, targets}), vocabs, options); + auto batches = New>(inputs, options); + // auto batches = New>(inputs, options); batches->prepare(); for(auto batch : *batches) { From f67015e9a9233f81e68b537aa58bb18843e48557 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 30 Mar 2021 10:08:34 +0300 Subject: [PATCH 033/135] Output graphviz graphs for the training graph --- src/command/bug_repro.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git 
a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp index ee9bc905e..a7f63e54b 100644 --- a/src/command/bug_repro.cpp +++ b/src/command/bug_repro.cpp @@ -61,6 +61,7 @@ class ReproTask : public marian::ModelTask { // auto batches = New>(inputs, options); for(size_t i = 0; i < 10; i++) { + LOG(info, "# NEW OUTER ITER"); auto state = New(options->get("learn-rate")); auto scheduler = New(options, state); scheduler->registerTrainingObserver(scheduler); @@ -70,7 +71,12 @@ class ReproTask : public marian::ModelTask { bool first = true; scheduler->started(); + + graph = New(); + graph->setDevice({0, DeviceType::cpu}); + graph->reserveWorkspaceMB(128); while(scheduler->keepGoing()) { + LOG(info, "## NEW INNER ITER"); // if inputs aren't initialized for each epoch, their internal istringstreams get exhausted auto inputs = New(std::vector({sources, targets}), vocabs, options); @@ -79,18 +85,16 @@ class ReproTask : public marian::ModelTask { batches->prepare(); for(auto batch : *batches) { + LOG(info, "### NEW BATCH"); if(!scheduler->keepGoing()) { break; } - if(first) { - graph = New(); - graph->setDevice({0, DeviceType::cpu}); - graph->reserveWorkspaceMB(128); + auto lossNode = builder->build(graph, batch); + if (first) { + graph->graphviz("graph-" + std::to_string(i) + ".gv"); first = false; } - - auto lossNode = builder->build(graph, batch); graph->forward(); StaticLoss loss = *lossNode; graph->backward(); From 78bcce1b93abc20cbd991fda11f9d4e900135779 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 31 Mar 2021 11:00:22 +0300 Subject: [PATCH 034/135] Fix the segfault in the repro by moving the builder inside the loop Turns out that the builder is maintaining some internal state and for some reason it messes everything up when building a new expressino graph the second time. Symptoms are 1. the node ids in the graph are incremented by a constant amount in the second expression graph 2. 
the graphviz diagram for the second graph is all messed up (edges missing and such) --- src/command/bug_repro.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp index a7f63e54b..86464ff77 100644 --- a/src/command/bug_repro.cpp +++ b/src/command/bug_repro.cpp @@ -33,7 +33,7 @@ class ReproTask : public marian::ModelTask { } auto options = parser.parseOptions(argc, args, false); - auto builder = models::createCriterionFunctionFromOptions(options, models::usage::training); + // auto builder = models::createCriterionFunctionFromOptions(options, models::usage::training); auto optimizer = Optimizer(New("optimizer", "adam", "learn-rate", 0.01)); std::vector vocabPaths @@ -62,6 +62,7 @@ class ReproTask : public marian::ModelTask { for(size_t i = 0; i < 10; i++) { LOG(info, "# NEW OUTER ITER"); + auto builder = models::createCriterionFunctionFromOptions(options, models::usage::training); auto state = New(options->get("learn-rate")); auto scheduler = New(options, state); scheduler->registerTrainingObserver(scheduler); From 162a17c488a71e2a6ccd00bf60e7d26a16a30e3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 31 Mar 2021 11:24:27 +0300 Subject: [PATCH 035/135] Move the builder initialization inside run() to fix the segfault Turns out that the builder is maintaining some internal state and for some reason it messes everything up when building a new expression graph the second time. The symptoms are 1. the node ids in the graph are incremented by a constant amount in the second expression graph 2. 
the graphviz diagram for the second graph is all messed up (edges missing and such) This is meant as a workaround, it seems to be quite inefficient --- src/translator/self_adaptive.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 1582bb997..82dcf6239 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -78,7 +78,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { graph_ = New(); graph_->setDevice(deviceId); graph_->reserveWorkspaceMB(options_->get("workspace")); - builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); + // builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); optimizer_ = Optimizer(options_); @@ -104,7 +104,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { } // Load model - builder_->load(graph_, model); + // builder_->load(graph_, model); } std::string run(const std::string& json) override { @@ -231,6 +231,9 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "### NEW BATCH"); // Copy params from the original model if(first) { + builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); + builder_->load(graph_, model); + // builder_->build(graph_, batch); // // TODO: Why do we need to do a froward pass here? 
// graph_->forward(); From de49880ac8cf50ab975b579d2d5794260da6f130 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 1 Apr 2021 11:37:17 +0300 Subject: [PATCH 036/135] Use a dedicated builder for the adaptive graph to avoid segfaults This moves crashing further down the line - the crash now happens upon translation with the adaptive graph --- src/translator/self_adaptive.h | 56 ++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 82dcf6239..08a16e0d9 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -78,7 +78,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { graph_ = New(); graph_->setDevice(deviceId); graph_->reserveWorkspaceMB(options_->get("workspace")); - // builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); + builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); optimizer_ = Optimizer(options_); @@ -104,7 +104,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { } // Load model - // builder_->load(graph_, model); + builder_->load(graph_, model); } std::string run(const std::string& json) override { @@ -199,6 +199,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Ptr optionsTrans_; // Options for translator Ptr builder_; // Training model + Ptr secondBuilder_; // To not get a segfault when training model else could just use builder_ Ptr builderTrans_; // Translation model Ptr graph_; // A graph with original parameters Ptr graphAdapt_; // A graph on which training is performed @@ -231,8 +232,9 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "### NEW BATCH"); // Copy params from the original model if(first) { - builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); - 
builder_->load(graph_, model); + secondBuilder_ + = models::createCriterionFunctionFromOptions(options_, models::usage::training); + // secondBuilder->load(graph_, model); // builder_->build(graph_, batch); // // TODO: Why do we need to do a froward pass here? @@ -249,12 +251,12 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { // it's probably because the order doesn't matter and the // builder is used below // graphAdapt_->copyParams(graph_); - // builder_->load(graphAdapt_, model); + secondBuilder_->load(graphAdapt_, model); first = false; } // Make an update step on the copy of the model - auto lossNode = builder_->build(graphAdapt_, batch); + auto lossNode = secondBuilder_->build(graphAdapt_, batch); graphAdapt_->forward(); StaticLoss loss = *lossNode; graphAdapt_->backward(); @@ -273,27 +275,27 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Ptr collector, Ptr printer, Ptr graph) { - // graph->setInference(true); - // graph->clear(); - - // { - // auto search = New(options_, - // scorers_, - // vocabs_.back()); - // auto histories = search->search(graph, batch); - - // for(auto history : histories) { - // std::stringstream best1; - // std::stringstream bestn; - // printer->print(history, best1, bestn); - // collector->Write(history->getLineNum(), - // best1.str(), - // bestn.str(), - // options_->get("n-best")); - // } - // } - - // graph->setInference(false); + graph->setInference(true); + graph->clear(); + + { + auto search = New(options_, + scorers_, + vocabs_.back()); + auto histories = search->search(graph, batch); + + for(auto history : histories) { + std::stringstream best1; + std::stringstream bestn; + printer->print(history, best1, bestn); + collector->Write(history->getLineNum(), + best1.str(), + bestn.str(), + options_->get("n-best")); + } + } + + graph->setInference(false); } }; } From 29415c71f892d65ee7a025617242fee3dcb25bf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: 
Mon, 19 Apr 2021 11:32:36 +0300 Subject: [PATCH 037/135] Make a copy of all the swappable stuff to later adjust for training --- src/translator/swappable.cpp | 170 +++++++++++++++++++++++++++++------ src/translator/swappable.h | 77 ++++++++++++++++ 2 files changed, 222 insertions(+), 25 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index c5c2bae05..803fb352e 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -11,8 +11,33 @@ #include "tensors/gpu/swap.h" namespace marian { +std::string MultilineInputHack(const std::vector &input) { + if (input.size() == 1) { + return input[0]; + } else { + std::string ret; + std::size_t size = 0; + for (auto&& line : input) { + size += line.size() + 1; + } + ret.reserve(size); + for (auto&& line : input) { + ret.append(line); + ret.append("\n"); + } + return ret; + } +} -void GPUEngine::SwapPointers(std::vector &with) { +namespace { + DeviceId LookupGPU(const Ptr options, size_t deviceIdx) { + auto devices = Config::getDevices(options); + ABORT_IF(deviceIdx >= devices.size(), "GPU device index higher than configured."); + return devices[deviceIdx]; + } +} // namespace + +void GPUEngineTrain::SwapPointers(std::vector &with) { auto write_it = graph_->params()->begin(); auto read_it = with.begin(); for (; read_it != with.end(); ++write_it, ++read_it) { @@ -20,13 +45,126 @@ void GPUEngine::SwapPointers(std::vector &with) { } } -namespace { -DeviceId LookupGPU(const Ptr options, size_t deviceIdx) { - auto devices = Config::getDevices(options); - ABORT_IF(deviceIdx >= devices.size(), "GPU device index higher than configured."); - return devices[deviceIdx]; +GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) + : options_(options), graph_(New()), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { + ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); + options_->set("inference", true); 
+ options_->set("shuffle", "none"); + + // Create graph + auto prec = options_->get>("precision", {"float32"}); + graph_->setDefaultElementType(typeFromString(prec[0])); + graph_->setDevice(myDeviceId_); + graph_->reserveWorkspaceMB(options_->get("workspace")); + + scorers_ = createScorers(options_); + for (auto scorer : scorers_) { + scorer->init(graph_); + // TODO lexical shortlists are not supported yet. + } + graph_->forward(); + // TODO: reach into graph_->params() private members and free the parameter memory. +} + +GPUEngineTrain::~GPUEngineTrain() {} + +GPULoadedModelTrain::GPULoadedModelTrain(Ptr gpu) : engine_(gpu) { + for (auto ¶m : *engine_->graph_->params()) { + parameters_.push_back(engine_->allocator_.alloc(param->val()->memory()->size())); + } +} + +GPULoadedModelTrain::~GPULoadedModelTrain() { + for (MemoryPiece::PtrType &p : parameters_) { + engine_->allocator_.free(p); + } +} + +void GPULoadedModelTrain::Load(const GPULoadedModelTrain &from) { + srcVocabs_ = from.srcVocabs_; + trgVocab_ = from.trgVocab_; + + ABORT_IF(engine_ != from.engine_, "TODO: copy across GPUs."); + + for (size_t i = 0; i < parameters_.size(); ++i) { + swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), reinterpret_cast(from.parameters_[i]->data()), parameters_[i]->size(), engine_->myDeviceId_); + } +} + +void GPULoadedModelTrain::Load(const CPULoadedModelTrain &from) { + srcVocabs_ = from.SrcVocabs(); + trgVocab_ = from.TrgVocab(); + for (size_t i = 0; i < parameters_.size(); ++i) { + swapper::copyCpuToGpu(reinterpret_cast(parameters_[i]->data()), from.Parameters()[i].data(), from.Parameters()[i].size(), engine_->myDeviceId_); + } +} + +Histories GPULoadedModelTrain::Translate(const std::vector &input) { + ABORT_IF(!trgVocab_, "GPULoadedModelTrain needs to be overwritten by a CPU model first."); + engine_->SwapPointers(parameters_); + + auto corpus = New(std::vector(1, MultilineInputHack(input)), srcVocabs_, engine_->options_); // @TODO dirty hack + 
data::BatchGenerator batchGenerator(corpus, engine_->options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash + + BeamSearch search(engine_->options_, engine_->scorers_, trgVocab_); + Histories ret; + ret.reserve(input.size()); + for (auto&& batch : batchGenerator) { + auto result = search.search(engine_->graph_, batch); + ret.insert(ret.end(), result.begin(), result.end()); + } + std::sort(ret.begin(), ret.end(),[](marian::Ptr a, marian::Ptr b){return a->getLineNum() < b->getLineNum();}); + engine_->SwapPointers(parameters_); + return ret; +} + +CPULoadedModelTrain::CPULoadedModelTrain(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) + : parameters_(io::loadItems(parameters)) { + // Load parameters. + // Find the special element and remove it: + size_t special_idx = 0; + for (size_t i = 0; i < parameters_.size(); i++) { + if (parameters_[i].name == "special:model.yml") { + special_idx = i; + break; + } + } + parameters_.erase(parameters_.begin() + special_idx); + // Prepare the name so that it matches the named map + for (auto&& item : parameters_) { + item.name = "F0::" + item.name; + } + // Sort by name to match params order. + std::sort(parameters_.begin(), parameters_.end(), [](const io::Item &a, const io::Item &b){return a.name < b.name;}); + + // Load source vocabs. + const std::vector &maxVocabs = options->get>("dim-vocabs"); + for(size_t i = 0; i < sourceVocabPaths.size(); ++i) { + Ptr vocab = New(options, i); + vocab->load(sourceVocabPaths[i], maxVocabs[i]); + srcVocabs_.emplace_back(vocab); + } + + // Load target vocab. 
+ trgVocab_ = New(options, sourceVocabPaths.size()); + trgVocab_->load(targetVocabPath); +} + + + + // ##### ^ above is stuff for runtime domain adaptation + + + + + +void GPUEngine::SwapPointers(std::vector &with) { + auto write_it = graph_->params()->begin(); + auto read_it = with.begin(); + for (; read_it != with.end(); ++write_it, ++read_it) { + std::swap(*(*write_it)->val()->memory(), **read_it); + } } -} // namespace GPUEngine::GPUEngine(Ptr options, size_t deviceIdx) : options_(options), graph_(New()), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { @@ -82,24 +220,6 @@ void GPULoadedModel::Load(const CPULoadedModel &from) { } } -std::string MultilineInputHack(const std::vector &input) { - if (input.size() == 1) { - return input[0]; - } else { - std::string ret; - std::size_t size = 0; - for (auto&& line : input) { - size += line.size() + 1; - } - ret.reserve(size); - for (auto&& line : input) { - ret.append(line); - ret.append("\n"); - } - return ret; - } -} - Histories GPULoadedModel::Translate(const std::vector &input) { ABORT_IF(!trgVocab_, "GPULoadedModel needs to be overwritten by a CPU model first."); engine_->SwapPointers(parameters_); diff --git a/src/translator/swappable.h b/src/translator/swappable.h index b3cb5f82f..4b525c580 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -14,11 +14,88 @@ #include namespace marian { +class GPULoadedModelTrain; +class CPULoadedModelTrain; + class Scorer; class GPULoadedModel; class CPULoadedModel; + +/* Execute on a particular device */ +class GPUEngineTrain { +private: + friend class GPULoadedModelTrain; + Ptr options_; + Ptr graph_; + std::vector > scorers_; + const DeviceId myDeviceId_; + Allocator allocator_; + + void SwapPointers(std::vector &with); + +public: + /** + * @param options The marian options object + * @param deviceNum The index of the device you want to use for this slot. 
Note that this is not the deviceID but the index of the device in the + * array of supplied devices. Eg if you provide -d 0 3 5 and you want the Slot to run on GPU 3, you provide deviceNum=1. + */ + explicit GPUEngineTrain(Ptr options, size_t deviceNum); + + ~GPUEngineTrain(); +}; + +/* A model loaded on the GPU that can be overwritten from CPU or GPU. */ +class GPULoadedModelTrain { + private: + Ptr engine_; + + std::vector parameters_; + std::vector> srcVocabs_; + Ptr trgVocab_; + + public: + GPULoadedModelTrain(Ptr gpu); + + ~GPULoadedModelTrain(); + + const std::vector> &SrcVocabs() const { return srcVocabs_; } + + Ptr TrgVocab() const { return trgVocab_; } + + // Overwrite this model with parameters from a different one. + void Load(const CPULoadedModelTrain &from); + void Load(const GPULoadedModelTrain &from); + + Histories Translate(const std::vector &input); +}; + +/* A model loaded on the CPU. */ +class CPULoadedModelTrain { + private: + std::vector parameters_; + std::vector> srcVocabs_; + Ptr trgVocab_; + + public: + // The parts of Options that relate to model and vocab are ignored. The files provided will be loaded. 
+ CPULoadedModelTrain(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath); + + const std::vector &Parameters() const { return parameters_; } + + const std::vector> &SrcVocabs() const { return srcVocabs_; } + + Ptr TrgVocab() const { return trgVocab_; } +}; + + + +// ##### ^ above is stuff for runtime domain adaptation + + + + /* Execute on a particular device */ class GPUEngine { private: From 5b28f1f8a273a3098c2460f0bcff79cd267e11f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 19 Apr 2021 16:11:24 +0300 Subject: [PATCH 038/135] Implement training with swappable stuff --- src/translator/swappable.cpp | 93 +++++++++++++++++++++++++++++------- src/translator/swappable.h | 13 +++-- 2 files changed, 85 insertions(+), 21 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 803fb352e..b61779323 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -45,10 +45,18 @@ void GPUEngineTrain::SwapPointers(std::vector &with) { } } +void GPUEngineTrain::Initialize(Ptr batch) { + if (!initialized_) { + builder_->build(graph_, batch); + graph_->forward(); + initialized_ = true; + } +} + GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) : options_(options), graph_(New()), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); - options_->set("inference", true); + options_->set("inference", false); options_->set("shuffle", "none"); // Create graph @@ -57,13 +65,14 @@ GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) graph_->setDevice(myDeviceId_); graph_->reserveWorkspaceMB(options_->get("workspace")); - scorers_ = createScorers(options_); - for (auto scorer : scorers_) { - scorer->init(graph_); - // TODO lexical shortlists are not supported yet. 
- } - graph_->forward(); - // TODO: reach into graph_->params() private members and free the parameter memory. + builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); + // scorers_ = createScorers(options_); + // for (auto scorer : scorers_) { + // scorer->init(graph_); + // // TODO lexical shortlists are not supported yet. + // } + // graph_->forward(); + // // TODO: reach into graph_->params() private members and free the parameter memory. } GPUEngineTrain::~GPUEngineTrain() {} @@ -99,23 +108,57 @@ void GPULoadedModelTrain::Load(const CPULoadedModelTrain &from) { } } -Histories GPULoadedModelTrain::Translate(const std::vector &input) { +void GPULoadedModelTrain::Train(const std::vector &input) { ABORT_IF(!trgVocab_, "GPULoadedModelTrain needs to be overwritten by a CPU model first."); - engine_->SwapPointers(parameters_); + // engine_->SwapPointers(parameters_); + + auto state = New(engine_->options_->get("learn-rate")); + auto scheduler = New(engine_->options_, state); + auto optimizer = Optimizer(engine_->options_); + scheduler->registerTrainingObserver(scheduler); + scheduler->registerTrainingObserver(optimizer); auto corpus = New(std::vector(1, MultilineInputHack(input)), srcVocabs_, engine_->options_); // @TODO dirty hack data::BatchGenerator batchGenerator(corpus, engine_->options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash - BeamSearch search(engine_->options_, engine_->scorers_, trgVocab_); - Histories ret; - ret.reserve(input.size()); - for (auto&& batch : batchGenerator) { - auto result = search.search(engine_->graph_, batch); - ret.insert(ret.end(), result.begin(), result.end()); + bool first = true; + scheduler->started(); + while(scheduler->keepGoing()) { + batchGenerator.prepare(); + + LOG(info, "## NEW BATCHES"); + for(auto&& batch : batchGenerator) { + if(!scheduler->keepGoing()) + break; + + LOG(info, "### NEW BATCH"); + 
if(first) { + // This is a bit awkward but for some reason + // ICriterionFunction::build, which Initialize invokes underneath, + // expects a batch. So, afaik, this is the first time where i can + // invoke build and, as a result i can call SwapPointers only + // afterwards. TODO: verify last claim. + engine_->Initialize(batch); + engine_->SwapPointers(parameters_); + first = false; + } + + // Make an update step on the copy of the model + auto lossNode = engine_->builder_->build(engine_->graph_, batch); + engine_->graph_->forward(); + StaticLoss loss = *lossNode; + engine_->graph_->backward(); + + // Notify optimizer and scheduler + optimizer->update(engine_->graph_, 1); + scheduler->update(loss, batch); + } + if(scheduler->keepGoing()) + scheduler->increaseEpoch(); } - std::sort(ret.begin(), ret.end(),[](marian::Ptr a, marian::Ptr b){return a->getLineNum() < b->getLineNum();}); + scheduler->finished(); + engine_->SwapPointers(parameters_); - return ret; } CPULoadedModelTrain::CPULoadedModelTrain(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) @@ -212,6 +255,20 @@ void GPULoadedModel::Load(const GPULoadedModel &from) { } } +void GPULoadedModel::Load(const GPULoadedModelTrain &from) { + srcVocabs_ = from.srcVocabs_; + trgVocab_ = from.trgVocab_; + + ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); + + for(size_t i = 0; i < parameters_.size(); ++i) { + swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), + reinterpret_cast(from.parameters_[i]->data()), + parameters_[i]->size(), + engine_->myDeviceId_); + } +} + void GPULoadedModel::Load(const CPULoadedModel &from) { srcVocabs_ = from.SrcVocabs(); trgVocab_ = from.TrgVocab(); diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 4b525c580..b6e53a6c7 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -5,9 +5,10 @@ * vocabularies must have the same size. 
To make vocabulary the same size, pad * using scripts/contrib/pad_model_vocabulary.py offline. */ -#include "marian.h" #include "common/io.h" #include "data/vocab.h" +#include "marian.h" +#include "training/scheduler.h" #include "translator/history.h" #include @@ -27,12 +28,15 @@ class CPULoadedModel; class GPUEngineTrain { private: friend class GPULoadedModelTrain; + friend class GPULoadedModel; Ptr options_; Ptr graph_; - std::vector > scorers_; + Ptr builder_; const DeviceId myDeviceId_; Allocator allocator_; + bool initialized_ = false; + void Initialize(Ptr batch); void SwapPointers(std::vector &with); public: @@ -49,6 +53,8 @@ class GPUEngineTrain { /* A model loaded on the GPU that can be overwritten from CPU or GPU. */ class GPULoadedModelTrain { private: + friend class GPULoadedModel; + Ptr engine_; std::vector parameters_; @@ -68,7 +74,7 @@ class GPULoadedModelTrain { void Load(const CPULoadedModelTrain &from); void Load(const GPULoadedModelTrain &from); - Histories Translate(const std::vector &input); + void Train(const std::vector &input); }; /* A model loaded on the CPU. */ @@ -140,6 +146,7 @@ class GPULoadedModel { // Overwrite this model with parameters from a different one. 
void Load(const CPULoadedModel &from); void Load(const GPULoadedModel &from); + void Load(const GPULoadedModelTrain &from); Histories Translate(const std::vector &input); }; From 98b1ad1b1a6ab63bb232627c819a138b42f8990c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 20 Apr 2021 13:36:36 +0300 Subject: [PATCH 039/135] Remove CPULoadedModelTrain in favor of just using CPULoadedModel --- src/translator/swappable.cpp | 33 +-------------------------------- src/translator/swappable.h | 20 +------------------- 2 files changed, 2 insertions(+), 51 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index b61779323..e135566f1 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -100,7 +100,7 @@ void GPULoadedModelTrain::Load(const GPULoadedModelTrain &from) { } } -void GPULoadedModelTrain::Load(const CPULoadedModelTrain &from) { +void GPULoadedModelTrain::Load(const CPULoadedModel &from) { srcVocabs_ = from.SrcVocabs(); trgVocab_ = from.TrgVocab(); for (size_t i = 0; i < parameters_.size(); ++i) { @@ -161,37 +161,6 @@ void GPULoadedModelTrain::Train(const std::vector &input) { engine_->SwapPointers(parameters_); } -CPULoadedModelTrain::CPULoadedModelTrain(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) - : parameters_(io::loadItems(parameters)) { - // Load parameters. - // Find the special element and remove it: - size_t special_idx = 0; - for (size_t i = 0; i < parameters_.size(); i++) { - if (parameters_[i].name == "special:model.yml") { - special_idx = i; - break; - } - } - parameters_.erase(parameters_.begin() + special_idx); - // Prepare the name so that it matches the named map - for (auto&& item : parameters_) { - item.name = "F0::" + item.name; - } - // Sort by name to match params order. 
- std::sort(parameters_.begin(), parameters_.end(), [](const io::Item &a, const io::Item &b){return a.name < b.name;}); - - // Load source vocabs. - const std::vector &maxVocabs = options->get>("dim-vocabs"); - for(size_t i = 0; i < sourceVocabPaths.size(); ++i) { - Ptr vocab = New(options, i); - vocab->load(sourceVocabPaths[i], maxVocabs[i]); - srcVocabs_.emplace_back(vocab); - } - - // Load target vocab. - trgVocab_ = New(options, sourceVocabPaths.size()); - trgVocab_->load(targetVocabPath); -} diff --git a/src/translator/swappable.h b/src/translator/swappable.h index b6e53a6c7..3d62d98c5 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -16,7 +16,6 @@ namespace marian { class GPULoadedModelTrain; -class CPULoadedModelTrain; class Scorer; @@ -71,29 +70,12 @@ class GPULoadedModelTrain { Ptr TrgVocab() const { return trgVocab_; } // Overwrite this model with parameters from a different one. - void Load(const CPULoadedModelTrain &from); + void Load(const CPULoadedModel &from); void Load(const GPULoadedModelTrain &from); void Train(const std::vector &input); }; -/* A model loaded on the CPU. */ -class CPULoadedModelTrain { - private: - std::vector parameters_; - std::vector> srcVocabs_; - Ptr trgVocab_; - - public: - // The parts of Options that relate to model and vocab are ignored. The files provided will be loaded. - CPULoadedModelTrain(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath); - - const std::vector &Parameters() const { return parameters_; } - - const std::vector> &SrcVocabs() const { return srcVocabs_; } - - Ptr TrgVocab() const { return trgVocab_; } -}; From d14da1b6b13eb04b9ea7d92331ab5d342d1c125e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 20 Apr 2021 17:39:08 +0300 Subject: [PATCH 040/135] Adapt self_adaptive.h to use the swappable stuff Haven't tested it. 
Likely broken --- src/translator/self_adaptive.h | 186 +++++++++------------------------ src/translator/swappable.cpp | 16 +++ src/translator/swappable.h | 1 + 3 files changed, 66 insertions(+), 137 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 08a16e0d9..12d3f233d 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -7,6 +7,7 @@ #include "models/model_task.h" #include "training/scheduler.h" #include "training/validator.h" +#include "translator/swappable.h" namespace marian { @@ -74,37 +75,14 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { auto deviceId = Config::getDevices(options_)[0]; - // Initialize model for training - graph_ = New(); - graph_->setDevice(deviceId); - graph_->reserveWorkspaceMB(options_->get("workspace")); - builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); - - optimizer_ = Optimizer(options_); - - // Initialize model for translation - Ptr opts = New(); - opts->merge(options_); - opts->set("inference", true); - builderTrans_ = models::createModelFromOptions(opts, models::usage::translation); - - // Initialize a scorer for translation - // auto model = options_->get("model"); - model = options_->get("model"); - Ptr scorer = New(builderTrans_, "", 1.0f, model); - scorers_.push_back(scorer); - - // Read vocabularies + auto modelFilename = options_->get("model"); auto vocabPaths = options_->get>("vocabs"); - std::vector maxVocabs = options_->get>("dim-vocabs"); - for(size_t i = 0; i < vocabPaths.size(); ++i) { - Ptr vocab = New(options_, i); - vocab->load(vocabPaths[i], maxVocabs[i]); - vocabs_.emplace_back(vocab); - } - - // Load model - builder_->load(graph_, model); + std::vector srcVocabPaths(vocabPaths.begin(), vocabPaths.end() - 1); + cpuModel_ = New(options_, modelFilename, srcVocabPaths, vocabPaths.back()); + translateEngine_ = New(options_, deviceId.no); + translateSlot_ = New(translateEngine_); 
+ trainEngine_ = New(options_, deviceId.no); + trainSlot_ = New(trainEngine_); } std::string run(const std::string& json) override { @@ -119,8 +97,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { // Get input sentences auto input = yaml["input"].as(); - std::vector> srcVocabs(vocabs_.begin(), vocabs_.end() - 1); - auto testSet = New(std::vector({input}), srcVocabs, optionsTrans_); + auto testSet = New(std::vector({input}), cpuModel_->SrcVocabs(), optionsTrans_); // Prepare batches auto testBatches = New>(testSet, optionsTrans_); @@ -128,7 +105,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { // Initialize output printing auto collector = New(); - auto printer = New(options_, vocabs_.back()); + auto printer = New(options_, cpuModel_->TrgVocab()); // Get training sentences std::vector> contexts; @@ -140,11 +117,18 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { size_t id = 0; for(auto testBatch : *testBatches) { if(contexts.size() > id && !contexts[id].empty()) { - train(contexts[id]); - translate(testBatch, collector, printer, graphAdapt_); + trainSlot_->Load(*cpuModel_); + trainSlot_->Train(contexts[id]); + translateSlot_->Load(*trainSlot_); + translate(testBatch, collector, printer); + needsSwitching_ = true; } else { LOG(info, "No context provided for sentence {}", id); - translate(testBatch, collector, printer, graph_); + if(needsSwitching_) { + translateSlot_->Load(*cpuModel_); + needsSwitching_ = false; + } + translate(testBatch, collector, printer); } // iterating by 1 is quite safe because the mini-batch size for @@ -161,8 +145,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { void run() override { // Initialize input data auto srcPaths = options_->get>("input"); - std::vector> srcVocabs(vocabs_.begin(), vocabs_.end() - 1); - auto testSet = New(srcPaths, srcVocabs, optionsTrans_); + auto testSet = New(srcPaths, cpuModel_->SrcVocabs(), optionsTrans_); // 
Prepare batches auto testBatches = New>(testSet, optionsTrans_); @@ -172,7 +155,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { auto collector = New(options_->get("output")); if(options_->get("quiet-translation")) collector->setPrintingStrategy(New()); - auto printer = New(options_, vocabs_.back()); + auto printer = New(options_, cpuModel_->SrcVocabs().back()); // Initialize train data auto trainPaths = options_->get>("train-sets"); @@ -185,11 +168,18 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { if(!trainSet.empty()) { LOG(info, "# NEW TEST BATCH"); - train(trainSet); - translate(testBatch, collector, printer, graphAdapt_); + trainSlot_->Load(*cpuModel_); + trainSlot_->Train(trainSet); + translateSlot_->Load(*trainSlot_); + translate(testBatch, collector, printer); + needsSwitching_ = true; } else { LOG(info, "# EMPTY TEST BATCH"); - translate(testBatch, collector, printer, graph_); + if (needsSwitching_) { + translateSlot_->Load(*cpuModel_); + needsSwitching_ = false; + } + translate(testBatch, collector, printer); } } } @@ -197,105 +187,27 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { private: Ptr options_; // Options for training Ptr optionsTrans_; // Options for translator - - Ptr builder_; // Training model - Ptr secondBuilder_; // To not get a segfault when training model else could just use builder_ - Ptr builderTrans_; // Translation model - Ptr graph_; // A graph with original parameters - Ptr graphAdapt_; // A graph on which training is performed - std::string model; - - std::vector> vocabs_; - std::vector> scorers_; - Ptr optimizer_; - - void train(std::vector trainSents) { - auto state = New(options_->get("learn-rate")); - auto scheduler = New(options_, state); - scheduler->registerTrainingObserver(scheduler); - scheduler->registerTrainingObserver(optimizer_); - - auto trainSet = New(trainSents, vocabs_, options_); - auto trainBatches = New>(trainSet, options_); - - bool 
first = true; - - scheduler->started(); - while(scheduler->keepGoing()) { - trainBatches->prepare(); - - LOG(info, "## NEW BATCHES"); - for(auto batch : *trainBatches) { - if(!scheduler->keepGoing()) - break; - - LOG(info, "### NEW BATCH"); - // Copy params from the original model - if(first) { - secondBuilder_ - = models::createCriterionFunctionFromOptions(options_, models::usage::training); - // secondBuilder->load(graph_, model); - - // builder_->build(graph_, batch); - // // TODO: Why do we need to do a froward pass here? - // graph_->forward(); - - graphAdapt_ = New(); - // graphAdapt_->setDevice(graph_->getDeviceId()); - auto deviceId = Config::getDevices(options_)[0]; - graphAdapt_->setDevice(deviceId); - // graphAdapt_->reuseWorkspace(graph_); - graphAdapt_->reserveWorkspaceMB(options_->get("workspace")); - - // TODO: why aren't we using a builder before this? - // it's probably because the order doesn't matter and the - // builder is used below - // graphAdapt_->copyParams(graph_); - secondBuilder_->load(graphAdapt_, model); - first = false; - } - - // Make an update step on the copy of the model - auto lossNode = secondBuilder_->build(graphAdapt_, batch); - graphAdapt_->forward(); - StaticLoss loss = *lossNode; - graphAdapt_->backward(); - - // Notify optimizer and scheduler - optimizer_->update(graphAdapt_, 1); - scheduler->update(loss, batch); - } - if(scheduler->keepGoing()) - scheduler->increaseEpoch(); - } - scheduler->finished(); - } + Ptr cpuModel_; + Ptr trainSlot_; + Ptr translateSlot_; + Ptr trainEngine_; + Ptr translateEngine_; + bool needsSwitching_ = true; void translate(Ptr batch, Ptr collector, - Ptr printer, - Ptr graph) { - graph->setInference(true); - graph->clear(); - - { - auto search = New(options_, - scorers_, - vocabs_.back()); - auto histories = search->search(graph, batch); - - for(auto history : histories) { - std::stringstream best1; - std::stringstream bestn; - printer->print(history, best1, bestn); - 
collector->Write(history->getLineNum(), - best1.str(), - bestn.str(), - options_->get("n-best")); - } + Ptr printer) { + auto histories = translateSlot_->Translate(batch); + + for(auto history : histories) { + std::stringstream best1; + std::stringstream bestn; + printer->print(history, best1, bestn); + collector->Write(history->getLineNum(), + best1.str(), + bestn.str(), + options_->get("n-best")); } - - graph->setInference(false); } }; } diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index e135566f1..30760e67e 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -265,6 +265,22 @@ Histories GPULoadedModel::Translate(const std::vector &input) { return ret; } +Histories GPULoadedModel::Translate(const Ptr batch) { + ABORT_IF(!trgVocab_, "GPULoadedModel needs to be overwritten by a CPU model first."); + engine_->SwapPointers(parameters_); + + BeamSearch search(engine_->options_, engine_->scorers_, trgVocab_); + Histories ret; + ret.reserve(batch->size()); // TODO: input.size() was here previously, this is likely wrong + + auto result = search.search(engine_->graph_, batch); + ret.insert(ret.end(), result.begin(), result.end()); + + std::sort(ret.begin(), ret.end(),[](marian::Ptr a, marian::Ptr b){return a->getLineNum() < b->getLineNum();}); + engine_->SwapPointers(parameters_); + return ret; +} + CPULoadedModel::CPULoadedModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) : parameters_(io::loadItems(parameters)) { // Load parameters. diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 3d62d98c5..c018908f0 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -131,6 +131,7 @@ class GPULoadedModel { void Load(const GPULoadedModelTrain &from); Histories Translate(const std::vector &input); + Histories Translate(const Ptr batch); }; /* A model loaded on the CPU. 
*/ From 07658fb7221f3d15dd3f2571455642897668fc1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 21 Apr 2021 15:45:16 +0300 Subject: [PATCH 041/135] Fix some runtime issues related to configuration --- src/translator/self_adaptive.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 12d3f233d..bbdda2c84 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -76,12 +76,13 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { auto deviceId = Config::getDevices(options_)[0]; auto modelFilename = options_->get("model"); + options_->set>("models", {modelFilename}); auto vocabPaths = options_->get>("vocabs"); std::vector srcVocabPaths(vocabPaths.begin(), vocabPaths.end() - 1); cpuModel_ = New(options_, modelFilename, srcVocabPaths, vocabPaths.back()); - translateEngine_ = New(options_, deviceId.no); + translateEngine_ = New(options_, 0); translateSlot_ = New(translateEngine_); - trainEngine_ = New(options_, deviceId.no); + trainEngine_ = New(options_, 0); trainSlot_ = New(trainEngine_); } From 06ee187604753c4283140c250a7ed07b01bf180d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Sat, 8 May 2021 16:19:58 +0300 Subject: [PATCH 042/135] Fix issues woth vocab initialization and memory allocation --- src/translator/self_adaptive.h | 10 +++++++--- src/translator/swappable.cpp | 23 ++++++++++++++++++++--- src/translator/swappable.h | 1 + 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index bbdda2c84..e10920a62 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -76,14 +76,18 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { auto deviceId = Config::getDevices(options_)[0]; auto modelFilename = options_->get("model"); - options_->set>("models", 
{modelFilename}); + optionsTrans_->set>("models", {modelFilename}); + auto vocabPaths = options_->get>("vocabs"); std::vector srcVocabPaths(vocabPaths.begin(), vocabPaths.end() - 1); + // TODO: or use optionsTrans_ here? cpuModel_ is used by both, trainin and translation, code + // so i don't yet know what's the correct approach cpuModel_ = New(options_, modelFilename, srcVocabPaths, vocabPaths.back()); - translateEngine_ = New(options_, 0); + translateEngine_ = New(optionsTrans_, 0); translateSlot_ = New(translateEngine_); trainEngine_ = New(options_, 0); trainSlot_ = New(trainEngine_); + trainSlot_->AllocateParamsLike(*cpuModel_); } std::string run(const std::string& json) override { @@ -106,7 +110,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { // Initialize output printing auto collector = New(); - auto printer = New(options_, cpuModel_->TrgVocab()); + auto printer = New(optionsTrans_, cpuModel_->TrgVocab()); // Get training sentences std::vector> contexts; diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 30760e67e..95e1073c6 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -78,8 +78,15 @@ GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) GPUEngineTrain::~GPUEngineTrain() {} GPULoadedModelTrain::GPULoadedModelTrain(Ptr gpu) : engine_(gpu) { - for (auto ¶m : *engine_->graph_->params()) { - parameters_.push_back(engine_->allocator_.alloc(param->val()->memory()->size())); + // NOTE: engine_ must contain an initialized graph already at this point + // for (auto ¶m : *engine_->graph_->params()) { + // parameters_.push_back(engine_->allocator_.alloc(param->val()->memory()->size())); + // } +} + +void GPULoadedModelTrain::AllocateParamsLike(const CPULoadedModel &from) { + for (auto ¶m : from.Parameters()) { + parameters_.push_back(engine_->allocator_.alloc(param.size())); } } @@ -118,7 +125,17 @@ void GPULoadedModelTrain::Train(const std::vector &input) { 
scheduler->registerTrainingObserver(scheduler); scheduler->registerTrainingObserver(optimizer); - auto corpus = New(std::vector(1, MultilineInputHack(input)), srcVocabs_, engine_->options_); // @TODO dirty hack + // LOG(info, "GAAAH: vocabs is {}", srcVocabs_); + for (auto vocab: srcVocabs_) { + LOG(info, "GAAAH: single vocab is {}", vocab); + } + + std::vector> allVocabs; + allVocabs.reserve(srcVocabs_.size() + 1); + allVocabs.insert(allVocabs.end(), srcVocabs_.begin(), srcVocabs_.end()); + allVocabs.emplace_back(trgVocab_); + auto corpus = New(input, allVocabs, engine_->options_); // @TODO dirty hack + // auto corpus = New(input, srcVocabs_, engine_->options_); // @TODO dirty hack data::BatchGenerator batchGenerator(corpus, engine_->options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash bool first = true; diff --git a/src/translator/swappable.h b/src/translator/swappable.h index c018908f0..3615f3080 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -72,6 +72,7 @@ class GPULoadedModelTrain { // Overwrite this model with parameters from a different one. 
void Load(const CPULoadedModel &from); void Load(const GPULoadedModelTrain &from); + void AllocateParamsLike(const CPULoadedModel &from); void Train(const std::vector &input); }; From c4ff8b9496ede8408737a29b74573c898820844c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 14 May 2021 10:34:19 +0300 Subject: [PATCH 043/135] Initialize the ExpressionGraph for translation with inference=true --- src/translator/swappable.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 95e1073c6..86eb4a74e 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -196,7 +196,7 @@ void GPUEngine::SwapPointers(std::vector &with) { } GPUEngine::GPUEngine(Ptr options, size_t deviceIdx) - : options_(options), graph_(New()), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { + : options_(options), graph_(New(true)), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); options_->set("inference", true); options_->set("shuffle", "none"); From 16ec013111671f1dd9461f0f8f1159f8d2e7c5f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 14 May 2021 16:02:21 +0300 Subject: [PATCH 044/135] Seek to beginning of the istringstream when resetting text input This solves an issue where a BatchGenerator cannot be initialized with a TextInput because iterating over batches would then exhaust the TextInput and it wouldn't reset upon BatchGenerator::prepare. 
--- src/data/text_input.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/data/text_input.h b/src/data/text_input.h index b08a4fdcc..5ea0b45e9 100644 --- a/src/data/text_input.h +++ b/src/data/text_input.h @@ -45,7 +45,12 @@ class TextInput : public DatasetBase { Sample next() override; void shuffle() override {} - void reset() override {} + void reset() override { + for (auto& file : files_) { + file->clear(); + file->seekg(0); + } + } iterator begin() override { return iterator(*this); } iterator end() override { return iterator(); } From a220a2b5d5c4bff45a21fd5b4574f7dd8a37757e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 19 May 2021 11:30:02 +0300 Subject: [PATCH 045/135] When translating, directly use the trained parameters instead of loading them --- src/translator/self_adaptive.h | 4 ++-- src/translator/swappable.cpp | 13 +++---------- src/translator/swappable.h | 2 +- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index e10920a62..18e3a18dc 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -124,7 +124,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { if(contexts.size() > id && !contexts[id].empty()) { trainSlot_->Load(*cpuModel_); trainSlot_->Train(contexts[id]); - translateSlot_->Load(*trainSlot_); + translateSlot_->PointToParams(*trainSlot_); translate(testBatch, collector, printer); needsSwitching_ = true; } else { @@ -175,7 +175,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "# NEW TEST BATCH"); trainSlot_->Load(*cpuModel_); trainSlot_->Train(trainSet); - translateSlot_->Load(*trainSlot_); + translateSlot_->PointToParams(*trainSlot_); translate(testBatch, collector, printer); needsSwitching_ = true; } else { diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 86eb4a74e..4afe266b3 100644 --- 
a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -241,18 +241,11 @@ void GPULoadedModel::Load(const GPULoadedModel &from) { } } -void GPULoadedModel::Load(const GPULoadedModelTrain &from) { +void GPULoadedModel::PointToParams(const GPULoadedModelTrain &from) { + ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); srcVocabs_ = from.srcVocabs_; trgVocab_ = from.trgVocab_; - - ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); - - for(size_t i = 0; i < parameters_.size(); ++i) { - swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), - reinterpret_cast(from.parameters_[i]->data()), - parameters_[i]->size(), - engine_->myDeviceId_); - } + parameters_ = from.parameters_; } void GPULoadedModel::Load(const CPULoadedModel &from) { diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 3615f3080..d0bee4667 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -129,7 +129,7 @@ class GPULoadedModel { // Overwrite this model with parameters from a different one. 
void Load(const CPULoadedModel &from); void Load(const GPULoadedModel &from); - void Load(const GPULoadedModelTrain &from); + void PointToParams(const GPULoadedModelTrain &from); Histories Translate(const std::vector &input); Histories Translate(const Ptr batch); From 4f67aabf26b12435759f6aa69092d3b4a31ddb97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 19 May 2021 11:31:21 +0300 Subject: [PATCH 046/135] Ensure that SwapPointers is called an even number of times --- src/translator/swappable.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 4afe266b3..ee8ef767a 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -175,7 +175,9 @@ void GPULoadedModelTrain::Train(const std::vector &input) { } scheduler->finished(); - engine_->SwapPointers(parameters_); + if(!first) { + engine_->SwapPointers(parameters_); + } } From ea1380d8abdf3c5ec7f620fd71fdc735285a6c0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 26 May 2021 14:59:52 +0300 Subject: [PATCH 047/135] Get some params from the gpu memory for debugging --- src/translator/self_adaptive.h | 2 +- src/translator/swappable.cpp | 36 ++++++++++++++++++++++++++++++++++ src/translator/swappable.h | 1 + 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 18e3a18dc..b276b9d69 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -175,7 +175,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "# NEW TEST BATCH"); trainSlot_->Load(*cpuModel_); trainSlot_->Train(trainSet); - translateSlot_->PointToParams(*trainSlot_); + translateSlot_->Load(*trainSlot_); translate(testBatch, collector, printer); needsSwitching_ = true; } else { diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 
ee8ef767a..7ae584987 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -53,6 +53,11 @@ void GPUEngineTrain::Initialize(Ptr batch) { } } +void get(std::vector &v, MemoryPiece::PtrType mem, Ptr backend) { + v.resize(mem->size()); + gpu::copy(backend, mem->data(), mem->data() + mem->size(), v.data()); +} + GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) : options_(options), graph_(New()), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); @@ -118,6 +123,8 @@ void GPULoadedModelTrain::Load(const CPULoadedModel &from) { void GPULoadedModelTrain::Train(const std::vector &input) { ABORT_IF(!trgVocab_, "GPULoadedModelTrain needs to be overwritten by a CPU model first."); // engine_->SwapPointers(parameters_); + std::vector outvec; + get(outvec, parameters_[0], engine_->graph_->getBackend()); auto state = New(engine_->options_->get("learn-rate")); auto scheduler = New(engine_->options_, state); @@ -156,12 +163,16 @@ void GPULoadedModelTrain::Train(const std::vector &input) { // invoke build and, as a result i can call SwapPointers only // afterwards. TODO: verify last claim. 
engine_->Initialize(batch); + std::vector outvec; + get(outvec, parameters_[0], engine_->graph_->getBackend()); engine_->SwapPointers(parameters_); + get(outvec, parameters_[0], engine_->graph_->getBackend()); first = false; } // Make an update step on the copy of the model auto lossNode = engine_->builder_->build(engine_->graph_, batch); + // LOG(info, "Before: {}", engine_->graph_->params()->vals()->debug()); engine_->graph_->forward(); StaticLoss loss = *lossNode; engine_->graph_->backward(); @@ -169,6 +180,7 @@ void GPULoadedModelTrain::Train(const std::vector &input) { // Notify optimizer and scheduler optimizer->update(engine_->graph_, 1); scheduler->update(loss, batch); + // LOG(info, "After: {}", engine_->graph_->params()->vals()->debug()); } if(scheduler->keepGoing()) scheduler->increaseEpoch(); @@ -176,7 +188,12 @@ void GPULoadedModelTrain::Train(const std::vector &input) { scheduler->finished(); if(!first) { + std::vector outvec; + get(outvec, parameters_[0], engine_->graph_->getBackend()); engine_->SwapPointers(parameters_); + get(outvec, parameters_[0], engine_->graph_->getBackend()); + // does nothing, need a place for a breakpoint + first = false; } } @@ -243,6 +260,20 @@ void GPULoadedModel::Load(const GPULoadedModel &from) { } } +void GPULoadedModel::Load(const GPULoadedModelTrain &from) { + srcVocabs_ = from.srcVocabs_; + trgVocab_ = from.trgVocab_; + + ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); + + for(size_t i = 0; i < parameters_.size(); ++i) { + swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), + reinterpret_cast(from.parameters_[i]->data()), + parameters_[i]->size(), + engine_->myDeviceId_); + } +} + void GPULoadedModel::PointToParams(const GPULoadedModelTrain &from) { ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); srcVocabs_ = from.srcVocabs_; @@ -279,7 +310,10 @@ Histories GPULoadedModel::Translate(const std::vector &input) { Histories 
GPULoadedModel::Translate(const Ptr batch) { ABORT_IF(!trgVocab_, "GPULoadedModel needs to be overwritten by a CPU model first."); + std::vector outvec; + get(outvec, parameters_[0], engine_->graph_->getBackend()); engine_->SwapPointers(parameters_); + // LOG(info, "Before translation: {}", engine_->graph_->params()->vals()->debug()); BeamSearch search(engine_->options_, engine_->scorers_, trgVocab_); Histories ret; @@ -289,6 +323,8 @@ Histories GPULoadedModel::Translate(const Ptr batch) { ret.insert(ret.end(), result.begin(), result.end()); std::sort(ret.begin(), ret.end(),[](marian::Ptr a, marian::Ptr b){return a->getLineNum() < b->getLineNum();}); + + // LOG(info, "After translation: {}", engine_->graph_->params()->vals()->debug()); engine_->SwapPointers(parameters_); return ret; } diff --git a/src/translator/swappable.h b/src/translator/swappable.h index d0bee4667..370a2858c 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -129,6 +129,7 @@ class GPULoadedModel { // Overwrite this model with parameters from a different one. 
void Load(const CPULoadedModel &from); void Load(const GPULoadedModel &from); + void Load(const GPULoadedModelTrain &from); void PointToParams(const GPULoadedModelTrain &from); Histories Translate(const std::vector &input); From dda5995ff1f43bd4d569705515f20a5922ca2359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 16 Jun 2021 13:22:56 +0300 Subject: [PATCH 048/135] Retrieve some debugging information in SwapPointers --- src/translator/swappable.cpp | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 7ae584987..234fe28f7 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -37,12 +37,33 @@ namespace { } } // namespace -void GPUEngineTrain::SwapPointers(std::vector &with) { +void get(std::vector &v, MemoryPiece::PtrType mem, Ptr backend) { + v.resize(mem->size()); + gpu::copy(backend, mem->data(), mem->data() + mem->size(), v.data()); +} + +void GPUEngineTrain::SwapPointers( + std::vector &with /*, std::vector &with_names*/) { auto write_it = graph_->params()->begin(); auto read_it = with.begin(); - for (; read_it != with.end(); ++write_it, ++read_it) { + // auto read_it_names = with_names.begin(); + bool first = true; + std::vector outvec; + for(; read_it != with.end(); ++write_it, ++read_it /*, ++read_it_names*/ ) { + if (first){ + get(outvec, (*write_it)->val()->memory(), graph_->getBackend()); + get(outvec, *read_it, graph_->getBackend()); + } std::swap(*(*write_it)->val()->memory(), **read_it); + // *graph_->params()->get(*read_it_names)->val()->memory() = std::move(**read_it); + // assign(*graph_->params()->get(*read_it_names)->val()->memory(), **read_it); + if(first) { + get(outvec, (*write_it)->val()->memory(), graph_->getBackend()); + get(outvec, *read_it, graph_->getBackend()); + first = false; + } } + // graph_->params()->init(graph_->getBackend(), graph_->getDeviceId()); } void 
GPUEngineTrain::Initialize(Ptr batch) { @@ -53,11 +74,6 @@ void GPUEngineTrain::Initialize(Ptr batch) { } } -void get(std::vector &v, MemoryPiece::PtrType mem, Ptr backend) { - v.resize(mem->size()); - gpu::copy(backend, mem->data(), mem->data() + mem->size(), v.data()); -} - GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) : options_(options), graph_(New()), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); From 4e743bf6983e3a2792461ef3246879d5c171d11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 30 Jun 2021 16:16:24 +0300 Subject: [PATCH 049/135] Attempt to load the io::Items representing parameters directly into the training graph --- src/graph/expression_graph.h | 2 +- src/graph/parameters.h | 11 ++++ src/translator/self_adaptive.h | 9 ++-- src/translator/swappable.cpp | 92 +++++++++++++++++++--------------- src/translator/swappable.h | 10 ++-- 5 files changed, 74 insertions(+), 50 deletions(-) diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 2fa28f67b..75d89a82b 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -743,7 +743,7 @@ class ExpressionGraph : public std::enable_shared_from_this { public: /** Load model (mainly parameter objects) from array of io::Items */ - void load(std::vector& ioItems, bool markReloaded = true) { + void load(const std::vector& ioItems, bool markReloaded = true) { setReloaded(false); for(auto& item : ioItems) { std::string pName = item.name; diff --git a/src/graph/parameters.h b/src/graph/parameters.h index d5ede0b4e..40b311b7c 100644 --- a/src/graph/parameters.h +++ b/src/graph/parameters.h @@ -45,6 +45,17 @@ class Parameters { LOG(debug, "Destroyed parameter object of type {}", acceptedElementType_); } + std::vector toMemoryPieces() { + std::vector res(params_.size()); + auto read_it = begin(); + 
int i = 0; + for(; read_it != end(); ++read_it) { + i++; + res.push_back((*read_it)->val()->memory()); + } + return res; + } + auto begin() -> decltype(params_.begin()) { return params_.begin(); } auto end() -> decltype(params_.begin()) { return params_.end(); } diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index b276b9d69..8bb6f2577 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -87,7 +87,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { translateSlot_ = New(translateEngine_); trainEngine_ = New(options_, 0); trainSlot_ = New(trainEngine_); - trainSlot_->AllocateParamsLike(*cpuModel_); + // trainSlot_->AllocateParamsLike(*cpuModel_); } std::string run(const std::string& json) override { @@ -122,7 +122,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { size_t id = 0; for(auto testBatch : *testBatches) { if(contexts.size() > id && !contexts[id].empty()) { - trainSlot_->Load(*cpuModel_); + trainSlot_->Load(cpuModel_); trainSlot_->Train(contexts[id]); translateSlot_->PointToParams(*trainSlot_); translate(testBatch, collector, printer); @@ -173,9 +173,10 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { if(!trainSet.empty()) { LOG(info, "# NEW TEST BATCH"); - trainSlot_->Load(*cpuModel_); + trainSlot_->Load(cpuModel_); trainSlot_->Train(trainSet); - translateSlot_->Load(*trainSlot_); + // translateSlot_->Load(*trainSlot_); + translateSlot_->PointToParams(*trainSlot_); translate(testBatch, collector, printer); needsSwitching_ = true; } else { diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 234fe28f7..fb3f07cb0 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -105,42 +105,48 @@ GPULoadedModelTrain::GPULoadedModelTrain(Ptr gpu) : engine_(gpu) // } } -void GPULoadedModelTrain::AllocateParamsLike(const CPULoadedModel &from) { - for (auto ¶m : from.Parameters()) { - 
parameters_.push_back(engine_->allocator_.alloc(param.size())); - } -} +// void GPULoadedModelTrain::AllocateParamsLike(const CPULoadedModel &from) { +// for (auto ¶m : from.Parameters()) { +// parameters_.push_back(engine_->allocator_.alloc(param.size())); +// } +// } GPULoadedModelTrain::~GPULoadedModelTrain() { - for (MemoryPiece::PtrType &p : parameters_) { - engine_->allocator_.free(p); - } + // for (MemoryPiece::PtrType &p : parameters_) { + // engine_->allocator_.free(p); + // } } -void GPULoadedModelTrain::Load(const GPULoadedModelTrain &from) { - srcVocabs_ = from.srcVocabs_; - trgVocab_ = from.trgVocab_; +// void GPULoadedModelTrain::Load(const GPULoadedModelTrain &from) { +// srcVocabs_ = from.srcVocabs_; +// trgVocab_ = from.trgVocab_; - ABORT_IF(engine_ != from.engine_, "TODO: copy across GPUs."); +// ABORT_IF(engine_ != from.engine_, "TODO: copy across GPUs."); - for (size_t i = 0; i < parameters_.size(); ++i) { - swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), reinterpret_cast(from.parameters_[i]->data()), parameters_[i]->size(), engine_->myDeviceId_); - } -} +// for (size_t i = 0; i < parameters_.size(); ++i) { +// swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), reinterpret_cast(from.parameters_[i]->data()), parameters_[i]->size(), engine_->myDeviceId_); +// } +// } -void GPULoadedModelTrain::Load(const CPULoadedModel &from) { - srcVocabs_ = from.SrcVocabs(); - trgVocab_ = from.TrgVocab(); - for (size_t i = 0; i < parameters_.size(); ++i) { - swapper::copyCpuToGpu(reinterpret_cast(parameters_[i]->data()), from.Parameters()[i].data(), from.Parameters()[i].size(), engine_->myDeviceId_); - } +void GPULoadedModelTrain::Load(Ptr from) { + srcVocabs_ = from->SrcVocabs(); + trgVocab_ = from->TrgVocab(); + cpuModel_ = from; } +// void GPULoadedModelTrain::Load(const CPULoadedModel &from) { +// srcVocabs_ = from.SrcVocabs(); +// trgVocab_ = from.TrgVocab(); +// for (size_t i = 0; i < parameters_.size(); ++i) { +// 
swapper::copyCpuToGpu(reinterpret_cast(parameters_[i]->data()), from.Parameters()[i].data(), from.Parameters()[i].size(), engine_->myDeviceId_); +// } +// } + void GPULoadedModelTrain::Train(const std::vector &input) { ABORT_IF(!trgVocab_, "GPULoadedModelTrain needs to be overwritten by a CPU model first."); // engine_->SwapPointers(parameters_); std::vector outvec; - get(outvec, parameters_[0], engine_->graph_->getBackend()); + // get(outvec, parameters_[0], engine_->graph_->getBackend()); auto state = New(engine_->options_->get("learn-rate")); auto scheduler = New(engine_->options_, state); @@ -180,9 +186,10 @@ void GPULoadedModelTrain::Train(const std::vector &input) { // afterwards. TODO: verify last claim. engine_->Initialize(batch); std::vector outvec; - get(outvec, parameters_[0], engine_->graph_->getBackend()); - engine_->SwapPointers(parameters_); - get(outvec, parameters_[0], engine_->graph_->getBackend()); + // get(outvec, parameters_[0], engine_->graph_->getBackend()); + // engine_->SwapPointers(parameters_); + engine_->graph_->load(cpuModel_->Parameters(), false); + // get(outvec, parameters_[0], engine_->graph_->getBackend()); first = false; } @@ -193,6 +200,8 @@ void GPULoadedModelTrain::Train(const std::vector &input) { StaticLoss loss = *lossNode; engine_->graph_->backward(); + // auto out = engine_->graph_->params()->toMemoryPieces(); + // Notify optimizer and scheduler optimizer->update(engine_->graph_, 1); scheduler->update(loss, batch); @@ -205,9 +214,9 @@ void GPULoadedModelTrain::Train(const std::vector &input) { if(!first) { std::vector outvec; - get(outvec, parameters_[0], engine_->graph_->getBackend()); - engine_->SwapPointers(parameters_); - get(outvec, parameters_[0], engine_->graph_->getBackend()); + // get(outvec, parameters_[0], engine_->graph_->getBackend()); + // engine_->SwapPointers(parameters_); + // get(outvec, parameters_[0], engine_->graph_->getBackend()); // does nothing, need a place for a breakpoint first = false; } @@ 
-276,25 +285,26 @@ void GPULoadedModel::Load(const GPULoadedModel &from) { } } -void GPULoadedModel::Load(const GPULoadedModelTrain &from) { - srcVocabs_ = from.srcVocabs_; - trgVocab_ = from.trgVocab_; +// void GPULoadedModel::Load(const GPULoadedModelTrain &from) { +// srcVocabs_ = from.srcVocabs_; +// trgVocab_ = from.trgVocab_; - ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); +// ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); - for(size_t i = 0; i < parameters_.size(); ++i) { - swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), - reinterpret_cast(from.parameters_[i]->data()), - parameters_[i]->size(), - engine_->myDeviceId_); - } -} +// for(size_t i = 0; i < parameters_.size(); ++i) { +// swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), +// reinterpret_cast(from.parameters_[i]->data()), +// parameters_[i]->size(), +// engine_->myDeviceId_); +// } +// } void GPULoadedModel::PointToParams(const GPULoadedModelTrain &from) { ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); srcVocabs_ = from.srcVocabs_; trgVocab_ = from.trgVocab_; - parameters_ = from.parameters_; + // TODO: this might be wrong and could be droped in favor of using SwapPointers + parameters_ = from.engine_->graph_->params()->toMemoryPieces(); } void GPULoadedModel::Load(const CPULoadedModel &from) { diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 370a2858c..ca0ba0caa 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -56,7 +56,8 @@ class GPULoadedModelTrain { Ptr engine_; - std::vector parameters_; + // std::vector parameters_; + Ptr cpuModel_; std::vector> srcVocabs_; Ptr trgVocab_; @@ -70,9 +71,10 @@ class GPULoadedModelTrain { Ptr TrgVocab() const { return trgVocab_; } // Overwrite this model with parameters from a different one. 
- void Load(const CPULoadedModel &from); - void Load(const GPULoadedModelTrain &from); - void AllocateParamsLike(const CPULoadedModel &from); + // void Load(const CPULoadedModel &from); + void Load(Ptr from); + // void Load(const GPULoadedModelTrain &from); + // void AllocateParamsLike(const CPULoadedModel &from); void Train(const std::vector &input); }; From 9e898b07dd88db9a83a409e42f8338c74c4d4697 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 14 Jul 2021 16:15:12 +0300 Subject: [PATCH 050/135] Only reserve memory not fill it with values when initializing the memory piece vector --- src/graph/parameters.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/graph/parameters.h b/src/graph/parameters.h index 40b311b7c..8dd579af1 100644 --- a/src/graph/parameters.h +++ b/src/graph/parameters.h @@ -46,7 +46,8 @@ class Parameters { } std::vector toMemoryPieces() { - std::vector res(params_.size()); + std::vector res; + res.reserve(params_.size()); auto read_it = begin(); int i = 0; for(; read_it != end(); ++read_it) { From e7d339b4d58b2ab3543f2b08b2279e98a1ea8f6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 19 Jul 2021 13:50:15 +0300 Subject: [PATCH 051/135] Load params before building the graph, drop the F0:: prefix, clear params celarParams() seems to be unnecessary here, though. Left it in because didn't want to recompile and test if working again. This approach still doesn't work though, btw. Managed to fix the issue where toMemoryPieces() was running into some null pointer problems, but ran into a different issue afterwards - "Parameters should be allocated by their graph. 
Parameter encoder_l1_self_Wq was not" --- src/graph/expression_graph.h | 10 +++++++++- src/translator/swappable.cpp | 3 ++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 75d89a82b..75b9c28d0 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -186,6 +186,11 @@ class ExpressionGraph : public std::enable_shared_from_this { kvParams.second->clear(); } + void clearParams() { + for(auto kvParams : paramsByElementType_) + kvParams.second->clear(); + } + /** * Set device options used to run the graph. * @param deviceId a struct type which stores device no. (size_t) @@ -743,10 +748,13 @@ class ExpressionGraph : public std::enable_shared_from_this { public: /** Load model (mainly parameter objects) from array of io::Items */ - void load(const std::vector& ioItems, bool markReloaded = true) { + void load(const std::vector& ioItems, bool markReloaded = true, bool dropF0prefix = false) { setReloaded(false); for(auto& item : ioItems) { std::string pName = item.name; + if (dropF0prefix && pName.substr(0, 4) == "F0::") { + pName = pName.substr(4); + } // skip over special parameters starting with "special:" if(pName.substr(0, 8) == "special:") continue; diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index fb3f07cb0..bdaf39d4c 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -184,11 +184,12 @@ void GPULoadedModelTrain::Train(const std::vector &input) { // expects a batch. So, afaik, this is the first time where i can // invoke build and, as a result i can call SwapPointers only // afterwards. TODO: verify last claim. 
+ engine_->graph_->clearParams(); + engine_->graph_->load(cpuModel_->Parameters(), true, true); engine_->Initialize(batch); std::vector outvec; // get(outvec, parameters_[0], engine_->graph_->getBackend()); // engine_->SwapPointers(parameters_); - engine_->graph_->load(cpuModel_->Parameters(), false); // get(outvec, parameters_[0], engine_->graph_->getBackend()); first = false; } From 26e757459242f83e78d633fff8c780a6809be086 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 23 Jul 2021 14:05:14 +0300 Subject: [PATCH 052/135] Try to clear the graph before loading the parameters in an attempt to solve null parameter values It doesn't help though --- src/translator/swappable.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index bdaf39d4c..7b738b54d 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -184,6 +184,7 @@ void GPULoadedModelTrain::Train(const std::vector &input) { // expects a batch. So, afaik, this is the first time where i can // invoke build and, as a result i can call SwapPointers only // afterwards. TODO: verify last claim. 
+ engine_->graph_->clear(); engine_->graph_->clearParams(); engine_->graph_->load(cpuModel_->Parameters(), true, true); engine_->Initialize(batch); From 58851ae09f19c563ea9a4388fab4a0f67a76da15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 23 Jul 2021 17:18:17 +0300 Subject: [PATCH 053/135] Recreate the graph upon every training invocation This works, finally --- src/translator/swappable.cpp | 16 ++++++++++++++-- src/translator/swappable.h | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 7b738b54d..049e36b8f 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -96,6 +96,17 @@ GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) // // TODO: reach into graph_->params() private members and free the parameter memory. } +void GPUEngineTrain::recreateGraphAndBuilder() { + // Create graph + graph_ = New(); + auto prec = options_->get>("precision", {"float32"}); + graph_->setDefaultElementType(typeFromString(prec[0])); + graph_->setDevice(myDeviceId_); + graph_->reserveWorkspaceMB(options_->get("workspace")); + + builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); +} + GPUEngineTrain::~GPUEngineTrain() {} GPULoadedModelTrain::GPULoadedModelTrain(Ptr gpu) : engine_(gpu) { @@ -184,8 +195,9 @@ void GPULoadedModelTrain::Train(const std::vector &input) { // expects a batch. So, afaik, this is the first time where i can // invoke build and, as a result i can call SwapPointers only // afterwards. TODO: verify last claim. 
- engine_->graph_->clear(); - engine_->graph_->clearParams(); + + // Create graph + engine_->recreateGraphAndBuilder(); engine_->graph_->load(cpuModel_->Parameters(), true, true); engine_->Initialize(batch); std::vector outvec; diff --git a/src/translator/swappable.h b/src/translator/swappable.h index ca0ba0caa..1ee9001bf 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -37,6 +37,7 @@ class GPUEngineTrain { void Initialize(Ptr batch); void SwapPointers(std::vector &with); + void recreateGraphAndBuilder(); public: /** From 2765e65a0560ff88e9beb254d1e88680d9df0c31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 10 Aug 2021 14:15:21 +0300 Subject: [PATCH 054/135] The wrong vocab was being passed to the printer Or at least i think so --- src/translator/self_adaptive.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 8bb6f2577..9522e903d 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -160,7 +160,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { auto collector = New(options_->get("output")); if(options_->get("quiet-translation")) collector->setPrintingStrategy(New()); - auto printer = New(options_, cpuModel_->SrcVocabs().back()); + auto printer = New(options_, cpuModel_->TrgVocab()); // Initialize train data auto trainPaths = options_->get>("train-sets"); From c28939788eb66a6a8f51257a1baa87172302437c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 11 Aug 2021 15:08:39 +0300 Subject: [PATCH 055/135] Clean up and move memory piece extraction to a better place --- src/translator/swappable.cpp | 13 +++++-------- src/translator/swappable.h | 6 ++---- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 049e36b8f..015907461 100644 --- a/src/translator/swappable.cpp +++ 
b/src/translator/swappable.cpp @@ -145,6 +145,10 @@ void GPULoadedModelTrain::Load(Ptr from) { cpuModel_ = from; } +std::vector GPULoadedModelTrain::Parameters() const { + return engine_->graph_->params()->toMemoryPieces(); +} + // void GPULoadedModelTrain::Load(const CPULoadedModel &from) { // srcVocabs_ = from.SrcVocabs(); // trgVocab_ = from.TrgVocab(); @@ -165,17 +169,11 @@ void GPULoadedModelTrain::Train(const std::vector &input) { scheduler->registerTrainingObserver(scheduler); scheduler->registerTrainingObserver(optimizer); - // LOG(info, "GAAAH: vocabs is {}", srcVocabs_); - for (auto vocab: srcVocabs_) { - LOG(info, "GAAAH: single vocab is {}", vocab); - } - std::vector> allVocabs; allVocabs.reserve(srcVocabs_.size() + 1); allVocabs.insert(allVocabs.end(), srcVocabs_.begin(), srcVocabs_.end()); allVocabs.emplace_back(trgVocab_); auto corpus = New(input, allVocabs, engine_->options_); // @TODO dirty hack - // auto corpus = New(input, srcVocabs_, engine_->options_); // @TODO dirty hack data::BatchGenerator batchGenerator(corpus, engine_->options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash bool first = true; @@ -317,8 +315,7 @@ void GPULoadedModel::PointToParams(const GPULoadedModelTrain &from) { ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); srcVocabs_ = from.srcVocabs_; trgVocab_ = from.trgVocab_; - // TODO: this might be wrong and could be droped in favor of using SwapPointers - parameters_ = from.engine_->graph_->params()->toMemoryPieces(); + parameters_ = from.Parameters(); } void GPULoadedModel::Load(const CPULoadedModel &from) { diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 1ee9001bf..c062ab3d4 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -57,7 +57,6 @@ class GPULoadedModelTrain { Ptr engine_; - // std::vector parameters_; Ptr cpuModel_; std::vector> srcVocabs_; Ptr 
trgVocab_; @@ -72,10 +71,9 @@ class GPULoadedModelTrain { Ptr TrgVocab() const { return trgVocab_; } // Overwrite this model with parameters from a different one. - // void Load(const CPULoadedModel &from); void Load(Ptr from); - // void Load(const GPULoadedModelTrain &from); - // void AllocateParamsLike(const CPULoadedModel &from); + + std::vector Parameters() const; void Train(const std::vector &input); }; From 20c893d8be548cd0c55c501f74b1cac5c06f650e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 11 Aug 2021 15:45:12 +0300 Subject: [PATCH 056/135] Rename for readability; remove commented out code; remove debugging code --- src/translator/self_adaptive.h | 4 +- src/translator/swappable.cpp | 102 +++------------------------------ src/translator/swappable.h | 4 +- 3 files changed, 12 insertions(+), 98 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 9522e903d..f2352098b 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -122,7 +122,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { size_t id = 0; for(auto testBatch : *testBatches) { if(contexts.size() > id && !contexts[id].empty()) { - trainSlot_->Load(cpuModel_); + trainSlot_->SetModel(cpuModel_); trainSlot_->Train(contexts[id]); translateSlot_->PointToParams(*trainSlot_); translate(testBatch, collector, printer); @@ -173,7 +173,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { if(!trainSet.empty()) { LOG(info, "# NEW TEST BATCH"); - trainSlot_->Load(cpuModel_); + trainSlot_->SetModel(cpuModel_); trainSlot_->Train(trainSet); // translateSlot_->Load(*trainSlot_); translateSlot_->PointToParams(*trainSlot_); diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 015907461..06a7c31f3 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -37,33 +37,21 @@ namespace { } } // namespace -void get(std::vector &v, 
MemoryPiece::PtrType mem, Ptr backend) { - v.resize(mem->size()); - gpu::copy(backend, mem->data(), mem->data() + mem->size(), v.data()); +// For debugging memory +void get(std::vector &out, MemoryPiece::PtrType mem, Ptr backend) { + out.resize(mem->size()); + gpu::copy(backend, mem->data(), mem->data() + mem->size(), out.data()); } void GPUEngineTrain::SwapPointers( - std::vector &with /*, std::vector &with_names*/) { + std::vector &with) { auto write_it = graph_->params()->begin(); auto read_it = with.begin(); - // auto read_it_names = with_names.begin(); - bool first = true; + std::vector outvec; - for(; read_it != with.end(); ++write_it, ++read_it /*, ++read_it_names*/ ) { - if (first){ - get(outvec, (*write_it)->val()->memory(), graph_->getBackend()); - get(outvec, *read_it, graph_->getBackend()); - } + for(; read_it != with.end(); ++write_it, ++read_it) { std::swap(*(*write_it)->val()->memory(), **read_it); - // *graph_->params()->get(*read_it_names)->val()->memory() = std::move(**read_it); - // assign(*graph_->params()->get(*read_it_names)->val()->memory(), **read_it); - if(first) { - get(outvec, (*write_it)->val()->memory(), graph_->getBackend()); - get(outvec, *read_it, graph_->getBackend()); - first = false; - } } - // graph_->params()->init(graph_->getBackend(), graph_->getDeviceId()); } void GPUEngineTrain::Initialize(Ptr batch) { @@ -87,13 +75,6 @@ GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) graph_->reserveWorkspaceMB(options_->get("workspace")); builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); - // scorers_ = createScorers(options_); - // for (auto scorer : scorers_) { - // scorer->init(graph_); - // // TODO lexical shortlists are not supported yet. - // } - // graph_->forward(); - // // TODO: reach into graph_->params() private members and free the parameter memory. 
} void GPUEngineTrain::recreateGraphAndBuilder() { @@ -110,36 +91,12 @@ void GPUEngineTrain::recreateGraphAndBuilder() { GPUEngineTrain::~GPUEngineTrain() {} GPULoadedModelTrain::GPULoadedModelTrain(Ptr gpu) : engine_(gpu) { - // NOTE: engine_ must contain an initialized graph already at this point - // for (auto ¶m : *engine_->graph_->params()) { - // parameters_.push_back(engine_->allocator_.alloc(param->val()->memory()->size())); - // } } -// void GPULoadedModelTrain::AllocateParamsLike(const CPULoadedModel &from) { -// for (auto ¶m : from.Parameters()) { -// parameters_.push_back(engine_->allocator_.alloc(param.size())); -// } -// } - GPULoadedModelTrain::~GPULoadedModelTrain() { - // for (MemoryPiece::PtrType &p : parameters_) { - // engine_->allocator_.free(p); - // } } -// void GPULoadedModelTrain::Load(const GPULoadedModelTrain &from) { -// srcVocabs_ = from.srcVocabs_; -// trgVocab_ = from.trgVocab_; - -// ABORT_IF(engine_ != from.engine_, "TODO: copy across GPUs."); - -// for (size_t i = 0; i < parameters_.size(); ++i) { -// swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), reinterpret_cast(from.parameters_[i]->data()), parameters_[i]->size(), engine_->myDeviceId_); -// } -// } - -void GPULoadedModelTrain::Load(Ptr from) { +void GPULoadedModelTrain::SetModel(Ptr from) { srcVocabs_ = from->SrcVocabs(); trgVocab_ = from->TrgVocab(); cpuModel_ = from; @@ -149,19 +106,8 @@ std::vector GPULoadedModelTrain::Parameters() const { return engine_->graph_->params()->toMemoryPieces(); } -// void GPULoadedModelTrain::Load(const CPULoadedModel &from) { -// srcVocabs_ = from.SrcVocabs(); -// trgVocab_ = from.TrgVocab(); -// for (size_t i = 0; i < parameters_.size(); ++i) { -// swapper::copyCpuToGpu(reinterpret_cast(parameters_[i]->data()), from.Parameters()[i].data(), from.Parameters()[i].size(), engine_->myDeviceId_); -// } -// } - void GPULoadedModelTrain::Train(const std::vector &input) { ABORT_IF(!trgVocab_, "GPULoadedModelTrain needs to be overwritten 
by a CPU model first."); - // engine_->SwapPointers(parameters_); - std::vector outvec; - // get(outvec, parameters_[0], engine_->graph_->getBackend()); auto state = New(engine_->options_->get("learn-rate")); auto scheduler = New(engine_->options_, state); @@ -199,39 +145,23 @@ void GPULoadedModelTrain::Train(const std::vector &input) { engine_->graph_->load(cpuModel_->Parameters(), true, true); engine_->Initialize(batch); std::vector outvec; - // get(outvec, parameters_[0], engine_->graph_->getBackend()); - // engine_->SwapPointers(parameters_); - // get(outvec, parameters_[0], engine_->graph_->getBackend()); first = false; } // Make an update step on the copy of the model auto lossNode = engine_->builder_->build(engine_->graph_, batch); - // LOG(info, "Before: {}", engine_->graph_->params()->vals()->debug()); engine_->graph_->forward(); StaticLoss loss = *lossNode; engine_->graph_->backward(); - // auto out = engine_->graph_->params()->toMemoryPieces(); - // Notify optimizer and scheduler optimizer->update(engine_->graph_, 1); scheduler->update(loss, batch); - // LOG(info, "After: {}", engine_->graph_->params()->vals()->debug()); } if(scheduler->keepGoing()) scheduler->increaseEpoch(); } scheduler->finished(); - - if(!first) { - std::vector outvec; - // get(outvec, parameters_[0], engine_->graph_->getBackend()); - // engine_->SwapPointers(parameters_); - // get(outvec, parameters_[0], engine_->graph_->getBackend()); - // does nothing, need a place for a breakpoint - first = false; - } } @@ -297,20 +227,6 @@ void GPULoadedModel::Load(const GPULoadedModel &from) { } } -// void GPULoadedModel::Load(const GPULoadedModelTrain &from) { -// srcVocabs_ = from.srcVocabs_; -// trgVocab_ = from.trgVocab_; - -// ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); - -// for(size_t i = 0; i < parameters_.size(); ++i) { -// swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), -// reinterpret_cast(from.parameters_[i]->data()), -// 
parameters_[i]->size(), -// engine_->myDeviceId_); -// } -// } - void GPULoadedModel::PointToParams(const GPULoadedModelTrain &from) { ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); srcVocabs_ = from.srcVocabs_; @@ -350,7 +266,6 @@ Histories GPULoadedModel::Translate(const Ptr batch) { std::vector outvec; get(outvec, parameters_[0], engine_->graph_->getBackend()); engine_->SwapPointers(parameters_); - // LOG(info, "Before translation: {}", engine_->graph_->params()->vals()->debug()); BeamSearch search(engine_->options_, engine_->scorers_, trgVocab_); Histories ret; @@ -361,7 +276,6 @@ Histories GPULoadedModel::Translate(const Ptr batch) { std::sort(ret.begin(), ret.end(),[](marian::Ptr a, marian::Ptr b){return a->getLineNum() < b->getLineNum();}); - // LOG(info, "After translation: {}", engine_->graph_->params()->vals()->debug()); engine_->SwapPointers(parameters_); return ret; } diff --git a/src/translator/swappable.h b/src/translator/swappable.h index c062ab3d4..65a36405f 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -70,8 +70,8 @@ class GPULoadedModelTrain { Ptr TrgVocab() const { return trgVocab_; } - // Overwrite this model with parameters from a different one. 
- void Load(Ptr from); + // Change the internal pointers to vocabularies and CPULoadedModel to different ones + void SetModel(Ptr from); std::vector Parameters() const; From 724b9100215eb74ba9934f48e7255d07fa416aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 12 Aug 2021 13:36:04 +0300 Subject: [PATCH 057/135] Remove some redundant initialization code --- src/translator/swappable.cpp | 33 ++++++--------------------------- src/translator/swappable.h | 3 --- 2 files changed, 6 insertions(+), 30 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 06a7c31f3..e887d903f 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -54,27 +54,14 @@ void GPUEngineTrain::SwapPointers( } } -void GPUEngineTrain::Initialize(Ptr batch) { - if (!initialized_) { - builder_->build(graph_, batch); - graph_->forward(); - initialized_ = true; - } -} - -GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) - : options_(options), graph_(New()), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { +GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) + : options_(options), myDeviceId_(LookupGPU(options, deviceIdx)) { ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); options_->set("inference", false); options_->set("shuffle", "none"); - // Create graph - auto prec = options_->get>("precision", {"float32"}); - graph_->setDefaultElementType(typeFromString(prec[0])); - graph_->setDevice(myDeviceId_); - graph_->reserveWorkspaceMB(options_->get("workspace")); - - builder_ = models::createCriterionFunctionFromOptions(options_, models::usage::training); + // There is no need to initialize the graph or builder here because that's done before + // each Train() invokation } void GPUEngineTrain::recreateGraphAndBuilder() { @@ -134,17 +121,9 @@ void GPULoadedModelTrain::Train(const std::vector &input) { LOG(info, 
"### NEW BATCH"); if(first) { - // This is a bit awkward but for some reason - // ICriterionFunction::build, which Initialize invokes underneath, - // expects a batch. So, afaik, this is the first time where i can - // invoke build and, as a result i can call SwapPointers only - // afterwards. TODO: verify last claim. - // Create graph engine_->recreateGraphAndBuilder(); engine_->graph_->load(cpuModel_->Parameters(), true, true); - engine_->Initialize(batch); - std::vector outvec; first = false; } @@ -263,8 +242,8 @@ Histories GPULoadedModel::Translate(const std::vector &input) { Histories GPULoadedModel::Translate(const Ptr batch) { ABORT_IF(!trgVocab_, "GPULoadedModel needs to be overwritten by a CPU model first."); - std::vector outvec; - get(outvec, parameters_[0], engine_->graph_->getBackend()); + // std::vector outvec; + // get(outvec, parameters_[0], engine_->graph_->getBackend()); engine_->SwapPointers(parameters_); BeamSearch search(engine_->options_, engine_->scorers_, trgVocab_); diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 65a36405f..55f5b1def 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -32,10 +32,7 @@ class GPUEngineTrain { Ptr graph_; Ptr builder_; const DeviceId myDeviceId_; - Allocator allocator_; - bool initialized_ = false; - void Initialize(Ptr batch); void SwapPointers(std::vector &with); void recreateGraphAndBuilder(); From 7a790271207eabe9a220684bd81c2786edd26df1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 8 Sep 2021 14:14:34 +0300 Subject: [PATCH 058/135] Make method naming consistent in GPUEngineTrain --- src/translator/swappable.cpp | 4 ++-- src/translator/swappable.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index e887d903f..a2ca534ef 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -64,7 +64,7 @@ 
GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) // each Train() invokation } -void GPUEngineTrain::recreateGraphAndBuilder() { +void GPUEngineTrain::RecreateGraphAndBuilder() { // Create graph graph_ = New(); auto prec = options_->get>("precision", {"float32"}); @@ -122,7 +122,7 @@ void GPULoadedModelTrain::Train(const std::vector &input) { LOG(info, "### NEW BATCH"); if(first) { // Create graph - engine_->recreateGraphAndBuilder(); + engine_->RecreateGraphAndBuilder(); engine_->graph_->load(cpuModel_->Parameters(), true, true); first = false; } diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 55f5b1def..7aee61c9d 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -34,7 +34,7 @@ class GPUEngineTrain { const DeviceId myDeviceId_; void SwapPointers(std::vector &with); - void recreateGraphAndBuilder(); + void RecreateGraphAndBuilder(); public: /** From 1790ea11888265992db1eb7507d54e2df1976674 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 8 Sep 2021 15:14:36 +0300 Subject: [PATCH 059/135] Clean up some comments --- src/translator/self_adaptive.h | 12 ++++++------ src/translator/swappable.cpp | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index f2352098b..5b02234ce 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -68,26 +68,27 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { options_->set("shuffle", "none"); // Set up translator options optionsTrans_ = New(options_->clone()); + // We will only ever translate a single sentence at a time because dynamic + // adaptation happens per sentence optionsTrans_->set("mini-batch", 1); optionsTrans_->set("maxi-batch", 1); + // TODO: should probably un-hardcode this? 
The issue is, though, that the users + // might want separate options for training and translation optionsTrans_->set("max-length", 1000); optionsTrans_->set("shuffle", "none"); - auto deviceId = Config::getDevices(options_)[0]; - auto modelFilename = options_->get("model"); + // Training has a single "model", translation can have multiple "models" in the general case. + // Adaptive options also take a single "model" so we have to adapt translation options manually. optionsTrans_->set>("models", {modelFilename}); auto vocabPaths = options_->get>("vocabs"); std::vector srcVocabPaths(vocabPaths.begin(), vocabPaths.end() - 1); - // TODO: or use optionsTrans_ here? cpuModel_ is used by both, trainin and translation, code - // so i don't yet know what's the correct approach cpuModel_ = New(options_, modelFilename, srcVocabPaths, vocabPaths.back()); translateEngine_ = New(optionsTrans_, 0); translateSlot_ = New(translateEngine_); trainEngine_ = New(options_, 0); trainSlot_ = New(trainEngine_); - // trainSlot_->AllocateParamsLike(*cpuModel_); } std::string run(const std::string& json) override { @@ -175,7 +176,6 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "# NEW TEST BATCH"); trainSlot_->SetModel(cpuModel_); trainSlot_->Train(trainSet); - // translateSlot_->Load(*trainSlot_); translateSlot_->PointToParams(*trainSlot_); translate(testBatch, collector, printer); needsSwitching_ = true; diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index a2ca534ef..04b14350f 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -65,7 +65,6 @@ GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) } void GPUEngineTrain::RecreateGraphAndBuilder() { - // Create graph graph_ = New(); auto prec = options_->get>("precision", {"float32"}); graph_->setDefaultElementType(typeFromString(prec[0])); From f8fe981150c1c2f0eb4bd6647bc15c3679521435 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 14 Sep 2021 10:12:36 +0300 Subject: [PATCH 060/135] Simplify the training loop --- src/translator/swappable.cpp | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 04b14350f..fa66c0ae9 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -92,6 +92,7 @@ std::vector GPULoadedModelTrain::Parameters() const { return engine_->graph_->params()->toMemoryPieces(); } +// Load the initial model (dropping any previous changes) and train it on the provided input void GPULoadedModelTrain::Train(const std::vector &input) { ABORT_IF(!trgVocab_, "GPULoadedModelTrain needs to be overwritten by a CPU model first."); @@ -108,7 +109,11 @@ void GPULoadedModelTrain::Train(const std::vector &input) { auto corpus = New(input, allVocabs, engine_->options_); // @TODO dirty hack data::BatchGenerator batchGenerator(corpus, engine_->options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash - bool first = true; + // We reset the training graph to the original model parameters to prepare + // for adapting it to the new inputs + engine_->RecreateGraphAndBuilder(); + engine_->graph_->load(cpuModel_->Parameters(), true, true); + scheduler->started(); while(scheduler->keepGoing()) { batchGenerator.prepare(); @@ -119,13 +124,6 @@ void GPULoadedModelTrain::Train(const std::vector &input) { break; LOG(info, "### NEW BATCH"); - if(first) { - // Create graph - engine_->RecreateGraphAndBuilder(); - engine_->graph_->load(cpuModel_->Parameters(), true, true); - first = false; - } - // Make an update step on the copy of the model auto lossNode = engine_->builder_->build(engine_->graph_, batch); engine_->graph_->forward(); From 4a4214ab1055718aec4af2f9b4685e64daf9c3a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 14 Sep 2021 
10:12:53 +0300 Subject: [PATCH 061/135] Make CorpusBase understand that stdin is not a file --- src/data/corpus_base.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index 5be4298be..f89ad0e61 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -54,9 +54,13 @@ CorpusBase::CorpusBase(const std::vector& paths, } for(auto path : paths_) { - UPtr strm(new io::InputFileStream(path)); - ABORT_IF(strm->empty(), "File '{}' is empty", path); - files_.emplace_back(std::move(strm)); + if(path == "stdin" || path == "-") + files_.emplace_back(new std::istream(std::cin.rdbuf())); + else { + UPtr strm(new io::InputFileStream(path)); + ABORT_IF(strm->empty(), "File '{}' is empty", path); + files_.emplace_back(std::move(strm)); + } } initEOS(/*training=*/true); From 0c974ebafc4ea4c7f7551627943fc446893db0f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 16 Sep 2021 12:45:46 +0300 Subject: [PATCH 062/135] Move common training/translation stuff out into a separate method --- src/translator/self_adaptive.h | 124 +++++++++++++++++++++++---------- 1 file changed, 89 insertions(+), 35 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 5b02234ce..597873792 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -3,6 +3,7 @@ #include "common/config.h" #include "common/file_stream.h" #include "data/batch_generator.h" +#include "data/iterator_facade.h" #include "data/text_input.h" #include "models/model_task.h" #include "training/scheduler.h" @@ -13,8 +14,29 @@ namespace marian { using namespace data; +class TrainSetReader; + +class TrainSetIterator : public IteratorFacade> { +private: + TrainSetReader* trainSetReader_; + std::vector currentSamples_; +public: + // TODO: should we use a smart pointer here instead? 
The TrainSetReader::begin() method + // would make it difficult + TrainSetIterator(TrainSetReader* trainSetReader); + + bool equal(const TrainSetIterator& other) const override { + return other.trainSetReader_ == trainSetReader_; + } + + const std::vector& dereference() const override { return currentSamples_; } + + void increment() override; +}; + class TrainSetReader { std::vector> files_; + bool eof_ = false; public: TrainSetReader(std::vector paths) { @@ -22,25 +44,44 @@ class TrainSetReader { files_.emplace_back(new io::InputFileStream(path)); } + TrainSetIterator begin() { + return TrainSetIterator(this); + } + + TrainSetIterator end() { + return TrainSetIterator(nullptr); + } + + bool eof() { + return eof_; + } + std::vector getSamples() { // extracted lines for source and target corpora std::vector samples; // counters of number of lines extracted for source and target std::vector counts; + // Early exit if files are exhausted + if (eof_) return samples; + for(auto const& file : files_) { size_t currCount = 0; std::string lines; std::string line; + bool fileEnded = true; while(io::getline(*file, line)) { - if(line.empty()) + if(line.empty()) { + fileEnded = false; break; + } if(currCount) lines += "\n"; lines += line; currCount += 1; } + eof_ = fileEnded; if(!lines.empty()) samples.emplace_back(lines); @@ -59,6 +100,26 @@ class TrainSetReader { } }; +TrainSetIterator::TrainSetIterator(TrainSetReader* trainSetReader) : trainSetReader_(trainSetReader) { + if(trainSetReader) { + currentSamples_ = trainSetReader_->getSamples(); + } +} + +void TrainSetIterator::increment() { + // If the previous increment has exhausted the file, we must indicate that the we've reached + // the iterator's end + if(trainSetReader_->eof() && trainSetReader_ != nullptr) { + trainSetReader_ = nullptr; + return; + } + // If we're at the end of the iterator and increment has been called yet another time, there's + // a bug in the calling code + ABORT_IF(trainSetReader_ == nullptr, 
"Incrementing the end of the iterator isn't allowed"); + + currentSamples_ = trainSetReader_->getSamples(); +} + class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { public: TrainSelfAdaptive(Ptr options) : options_(options) { @@ -111,7 +172,6 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { // Initialize output printing auto collector = New(); - auto printer = New(optionsTrans_, cpuModel_->TrgVocab()); // Get training sentences std::vector> contexts; @@ -120,32 +180,45 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "Running..."); - size_t id = 0; + translate(testBatches, contexts.begin(), contexts.end(), collector); + + auto translations = collector->collect(options_->get("n-best")); + YAML::Emitter output; + output << YAML::DoubleQuoted << YAML::Flow << utils::join(translations, "\\n"); + return "{\"output\":" + std::string(output.c_str()) + "}"; + } + + template + void translate( + Ptr> + testBatches, + Iterator trainBegin, + Iterator trainEnd, + Ptr collector) { + auto printer = New(options_, cpuModel_->TrgVocab()); + for(auto testBatch : *testBatches) { - if(contexts.size() > id && !contexts[id].empty()) { + ABORT_IF(trainBegin == trainEnd, "Context batches ran out before test batches"); + + auto trainSet = *trainBegin; + ++trainBegin; + + if(!trainSet.empty()) { + LOG(info, "# NEW TEST BATCH"); trainSlot_->SetModel(cpuModel_); - trainSlot_->Train(contexts[id]); + trainSlot_->Train(trainSet); translateSlot_->PointToParams(*trainSlot_); translate(testBatch, collector, printer); needsSwitching_ = true; } else { - LOG(info, "No context provided for sentence {}", id); + LOG(info, "# EMPTY TEST BATCH"); if(needsSwitching_) { translateSlot_->Load(*cpuModel_); needsSwitching_ = false; } translate(testBatch, collector, printer); } - - // iterating by 1 is quite safe because the mini-batch size for - // translation is always 1 - ++id; } - - auto translations = 
collector->collect(options_->get("n-best")); - YAML::Emitter output; - output << YAML::DoubleQuoted << YAML::Flow << utils::join(translations, "\\n"); - return "{\"output\":" + std::string(output.c_str()) + "}"; } void run() override { @@ -161,7 +234,6 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { auto collector = New(options_->get("output")); if(options_->get("quiet-translation")) collector->setPrintingStrategy(New()); - auto printer = New(options_, cpuModel_->TrgVocab()); // Initialize train data auto trainPaths = options_->get>("train-sets"); @@ -169,25 +241,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "Running..."); - for(auto testBatch : *testBatches) { - auto trainSet = trainSets->getSamples(); - - if(!trainSet.empty()) { - LOG(info, "# NEW TEST BATCH"); - trainSlot_->SetModel(cpuModel_); - trainSlot_->Train(trainSet); - translateSlot_->PointToParams(*trainSlot_); - translate(testBatch, collector, printer); - needsSwitching_ = true; - } else { - LOG(info, "# EMPTY TEST BATCH"); - if (needsSwitching_) { - translateSlot_->Load(*cpuModel_); - needsSwitching_ = false; - } - translate(testBatch, collector, printer); - } - } + translate(testBatches, trainSets->begin(), trainSets->end(), collector); } private: From 1176c3fbbcfa26d965c4e10b536f907da750d025 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 16 Sep 2021 15:13:33 +0300 Subject: [PATCH 063/135] Rename TrainSet{Reader,Iterator} to AdaptiveContext{Reader,Iterator} --- src/translator/self_adaptive.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 597873792..b0085478f 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -14,18 +14,18 @@ namespace marian { using namespace data; -class TrainSetReader; +class AdaptiveContextReader; -class TrainSetIterator : public 
IteratorFacade> { +class AdaptiveContextIterator : public IteratorFacade> { private: - TrainSetReader* trainSetReader_; + AdaptiveContextReader* trainSetReader_; std::vector currentSamples_; public: // TODO: should we use a smart pointer here instead? The TrainSetReader::begin() method // would make it difficult - TrainSetIterator(TrainSetReader* trainSetReader); + AdaptiveContextIterator(AdaptiveContextReader* trainSetReader); - bool equal(const TrainSetIterator& other) const override { + bool equal(const AdaptiveContextIterator& other) const override { return other.trainSetReader_ == trainSetReader_; } @@ -34,22 +34,22 @@ class TrainSetIterator : public IteratorFacade> files_; bool eof_ = false; public: - TrainSetReader(std::vector paths) { + AdaptiveContextReader(std::vector paths) { for(auto& path : paths) files_.emplace_back(new io::InputFileStream(path)); } - TrainSetIterator begin() { - return TrainSetIterator(this); + AdaptiveContextIterator begin() { + return AdaptiveContextIterator(this); } - TrainSetIterator end() { - return TrainSetIterator(nullptr); + AdaptiveContextIterator end() { + return AdaptiveContextIterator(nullptr); } bool eof() { @@ -100,13 +100,13 @@ class TrainSetReader { } }; -TrainSetIterator::TrainSetIterator(TrainSetReader* trainSetReader) : trainSetReader_(trainSetReader) { +AdaptiveContextIterator::AdaptiveContextIterator(AdaptiveContextReader* trainSetReader) : trainSetReader_(trainSetReader) { if(trainSetReader) { currentSamples_ = trainSetReader_->getSamples(); } } -void TrainSetIterator::increment() { +void AdaptiveContextIterator::increment() { // If the previous increment has exhausted the file, we must indicate that the we've reached // the iterator's end if(trainSetReader_->eof() && trainSetReader_ != nullptr) { @@ -237,7 +237,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { // Initialize train data auto trainPaths = options_->get>("train-sets"); - auto trainSets = New(trainPaths); + auto trainSets = 
New(trainPaths); LOG(info, "Running..."); From 79002cbaee38251a8d70b81367e91a9077fd7f2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 16 Sep 2021 16:18:49 +0300 Subject: [PATCH 064/135] Add documentation comments for adaptive context reader classes --- src/translator/self_adaptive.h | 49 +++++++++++++++++++++++++++++++--- 1 file changed, 46 insertions(+), 3 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index b0085478f..dd242e354 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -16,7 +16,12 @@ using namespace data; class AdaptiveContextReader; -class AdaptiveContextIterator : public IteratorFacade> { +/** + * @brief An iterator for easier access of the context sentences produced by + * `AdaptiveContextReader::getSamples()` + */ +class AdaptiveContextIterator + : public IteratorFacade> { private: AdaptiveContextReader* trainSetReader_; std::vector currentSamples_; @@ -34,16 +39,38 @@ class AdaptiveContextIterator : public IteratorFacade> files_; + /// Indicates whether the input files have been exhausted. bool eof_ = false; public: + /** + * @brief Initializes a new reader by supplying paths to the files with + * context sentences + * + * @param paths paths to the input files. The input files contain + * newline-separated parallel sentence pairs (as usual for MT). Sentences are + * grouped by the translatable sentences (which are provided in a different + * file). Each group is delimited by a single empty line. The sentence group + * can be empty (no context is provided for the respective translatable + * sentence) in which case it is also represented by a single empty line. + */ AdaptiveContextReader(std::vector paths) { for(auto& path : paths) files_.emplace_back(new io::InputFileStream(path)); } + /** + * @brief Returns an iterator over the sets of context sentences produced by + * `getSamples()` + * + * @return the beginning of the iterator. 
+ */ AdaptiveContextIterator begin() { return AdaptiveContextIterator(this); } @@ -56,13 +83,29 @@ class AdaptiveContextReader { return eof_; } + /** + * @brief Reads the next set of samples -- the contaxt sentences -- for + * on-the-fly training in the self-adaptive translation mode. + * + * @details The input files contain newline-separated parallel sentence pairs + * (as usual for MT). Sentences are grouped by the translatable sentences + * (which are provided in a different file). Each group is delimited by a + * single empty line. The sentence group can be empty (no context is provided + * for the respective translatable sentence) in which case it is also + * represented by a single empty line. + * + * @return a vector representing a single group of context sentences. Each + * element in the vector contains newline seperated input lines comming from a + * single file, e.g., [0] could contain 3 newline separated sentences in + * English and [1] would contain their 3 respective translations in Latvian. 
+ */ std::vector getSamples() { // extracted lines for source and target corpora std::vector samples; // counters of number of lines extracted for source and target std::vector counts; - // Early exit if files are exhausted + // Early exit if input files are exhausted if (eof_) return samples; for(auto const& file : files_) { @@ -115,7 +158,7 @@ void AdaptiveContextIterator::increment() { } // If we're at the end of the iterator and increment has been called yet another time, there's // a bug in the calling code - ABORT_IF(trainSetReader_ == nullptr, "Incrementing the end of the iterator isn't allowed"); + ABORT_IF(trainSetReader_ == nullptr, "Incrementing past the end of the iterator isn't allowed"); currentSamples_ = trainSetReader_->getSamples(); } From 632d05fb310e9b3c41d887df22ccad0a817a3b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 16 Sep 2021 16:42:43 +0300 Subject: [PATCH 065/135] Move self-adaptive data stuff to a separate file --- src/data/adaptive_context.h | 156 +++++++++++++++++++++++++++++++++ src/translator/self_adaptive.h | 151 +------------------------------ 2 files changed, 157 insertions(+), 150 deletions(-) create mode 100644 src/data/adaptive_context.h diff --git a/src/data/adaptive_context.h b/src/data/adaptive_context.h new file mode 100644 index 000000000..4b61ff546 --- /dev/null +++ b/src/data/adaptive_context.h @@ -0,0 +1,156 @@ +#pragma once + +#include "common/file_stream.h" +#include "data/iterator_facade.h" + +namespace marian { + +class AdaptiveContextReader; + +/** + * @brief An iterator for easier access of the context sentences produced by + * `AdaptiveContextReader::getSamples()` + */ +class AdaptiveContextIterator + : public IteratorFacade> { +private: + AdaptiveContextReader* trainSetReader_; + std::vector currentSamples_; +public: + // TODO: should we use a smart pointer here instead? 
The TrainSetReader::begin() method + // would make it difficult + AdaptiveContextIterator(AdaptiveContextReader* trainSetReader); + + bool equal(const AdaptiveContextIterator& other) const override { + return other.trainSetReader_ == trainSetReader_; + } + + const std::vector& dereference() const override { return currentSamples_; } + + void increment() override; +}; + +/** + * @brief Reads the context sentences, that are used for on-the-fly training in + * the self-adaptive translation mode, from files. + */ +class AdaptiveContextReader { + std::vector> files_; + /// Indicates whether the input files have been exhausted. + bool eof_ = false; + +public: + /** + * @brief Initializes a new reader by supplying paths to the files with + * context sentences + * + * @param paths paths to the input files. The input files contain + * newline-separated parallel sentence pairs (as usual for MT). Sentences are + * grouped by the translatable sentences (which are provided in a different + * file). Each group is delimited by a single empty line. The sentence group + * can be empty (no context is provided for the respective translatable + * sentence) in which case it is also represented by a single empty line. + */ + AdaptiveContextReader(std::vector paths) { + for(auto& path : paths) + files_.emplace_back(new io::InputFileStream(path)); + } + + /** + * @brief Returns an iterator over the sets of context sentences produced by + * `getSamples()` + * + * @return the beginning of the iterator. + */ + AdaptiveContextIterator begin() { + return AdaptiveContextIterator(this); + } + + AdaptiveContextIterator end() { + return AdaptiveContextIterator(nullptr); + } + + bool eof() { + return eof_; + } + + /** + * @brief Reads the next set of samples -- the contaxt sentences -- for + * on-the-fly training in the self-adaptive translation mode. + * + * @details The input files contain newline-separated parallel sentence pairs + * (as usual for MT). 
Sentences are grouped by the translatable sentences + * (which are provided in a different file). Each group is delimited by a + * single empty line. The sentence group can be empty (no context is provided + * for the respective translatable sentence) in which case it is also + * represented by a single empty line. + * + * @return a vector representing a single group of context sentences. Each + * element in the vector contains newline seperated input lines comming from a + * single file, e.g., [0] could contain 3 newline separated sentences in + * English and [1] would contain their 3 respective translations in Latvian. + */ + std::vector getSamples() { + // extracted lines for source and target corpora + std::vector samples; + // counters of number of lines extracted for source and target + std::vector counts; + + // Early exit if input files are exhausted + if (eof_) return samples; + + for(auto const& file : files_) { + size_t currCount = 0; + std::string lines; + std::string line; + bool fileEnded = true; + while(io::getline(*file, line)) { + if(line.empty()) { + fileEnded = false; + break; + } + + if(currCount) + lines += "\n"; + lines += line; + currCount += 1; + } + eof_ = fileEnded; + + if(!lines.empty()) + samples.emplace_back(lines); + counts.push_back(currCount); + + // check if the same number of lines is extracted for source and target + size_t prevCount = counts[0]; + for(size_t i = 1; i < counts.size(); ++i) { + ABORT_IF(prevCount != counts[i], + "An empty source or target sentence has been encountered!"); + prevCount = counts[i]; + } + } + + return samples; + } +}; + +AdaptiveContextIterator::AdaptiveContextIterator(AdaptiveContextReader* trainSetReader) : trainSetReader_(trainSetReader) { + if(trainSetReader) { + currentSamples_ = trainSetReader_->getSamples(); + } +} + +void AdaptiveContextIterator::increment() { + // If the previous increment has exhausted the file, we must indicate that the we've reached + // the iterator's end + 
if(trainSetReader_->eof() && trainSetReader_ != nullptr) { + trainSetReader_ = nullptr; + return; + } + // If we're at the end of the iterator and increment has been called yet another time, there's + // a bug in the calling code + ABORT_IF(trainSetReader_ == nullptr, "Incrementing past the end of the iterator isn't allowed"); + + currentSamples_ = trainSetReader_->getSamples(); +} +} // namespace marian diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index dd242e354..8aa5189f7 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -3,166 +3,17 @@ #include "common/config.h" #include "common/file_stream.h" #include "data/batch_generator.h" -#include "data/iterator_facade.h" #include "data/text_input.h" #include "models/model_task.h" #include "training/scheduler.h" #include "training/validator.h" #include "translator/swappable.h" +#include "data/adaptive_context.h" namespace marian { using namespace data; -class AdaptiveContextReader; - -/** - * @brief An iterator for easier access of the context sentences produced by - * `AdaptiveContextReader::getSamples()` - */ -class AdaptiveContextIterator - : public IteratorFacade> { -private: - AdaptiveContextReader* trainSetReader_; - std::vector currentSamples_; -public: - // TODO: should we use a smart pointer here instead? The TrainSetReader::begin() method - // would make it difficult - AdaptiveContextIterator(AdaptiveContextReader* trainSetReader); - - bool equal(const AdaptiveContextIterator& other) const override { - return other.trainSetReader_ == trainSetReader_; - } - - const std::vector& dereference() const override { return currentSamples_; } - - void increment() override; -}; - -/** - * @brief Reads the context sentences, that are used for on-the-fly training in - * the self-adaptive translation mode, from files. - */ -class AdaptiveContextReader { - std::vector> files_; - /// Indicates whether the input files have been exhausted. 
- bool eof_ = false; - -public: - /** - * @brief Initializes a new reader by supplying paths to the files with - * context sentences - * - * @param paths paths to the input files. The input files contain - * newline-separated parallel sentence pairs (as usual for MT). Sentences are - * grouped by the translatable sentences (which are provided in a different - * file). Each group is delimited by a single empty line. The sentence group - * can be empty (no context is provided for the respective translatable - * sentence) in which case it is also represented by a single empty line. - */ - AdaptiveContextReader(std::vector paths) { - for(auto& path : paths) - files_.emplace_back(new io::InputFileStream(path)); - } - - /** - * @brief Returns an iterator over the sets of context sentences produced by - * `getSamples()` - * - * @return the beginning of the iterator. - */ - AdaptiveContextIterator begin() { - return AdaptiveContextIterator(this); - } - - AdaptiveContextIterator end() { - return AdaptiveContextIterator(nullptr); - } - - bool eof() { - return eof_; - } - - /** - * @brief Reads the next set of samples -- the contaxt sentences -- for - * on-the-fly training in the self-adaptive translation mode. - * - * @details The input files contain newline-separated parallel sentence pairs - * (as usual for MT). Sentences are grouped by the translatable sentences - * (which are provided in a different file). Each group is delimited by a - * single empty line. The sentence group can be empty (no context is provided - * for the respective translatable sentence) in which case it is also - * represented by a single empty line. - * - * @return a vector representing a single group of context sentences. Each - * element in the vector contains newline seperated input lines comming from a - * single file, e.g., [0] could contain 3 newline separated sentences in - * English and [1] would contain their 3 respective translations in Latvian. 
- */ - std::vector getSamples() { - // extracted lines for source and target corpora - std::vector samples; - // counters of number of lines extracted for source and target - std::vector counts; - - // Early exit if input files are exhausted - if (eof_) return samples; - - for(auto const& file : files_) { - size_t currCount = 0; - std::string lines; - std::string line; - bool fileEnded = true; - while(io::getline(*file, line)) { - if(line.empty()) { - fileEnded = false; - break; - } - - if(currCount) - lines += "\n"; - lines += line; - currCount += 1; - } - eof_ = fileEnded; - - if(!lines.empty()) - samples.emplace_back(lines); - counts.push_back(currCount); - - // check if the same number of lines is extracted for source and target - size_t prevCount = counts[0]; - for(size_t i = 1; i < counts.size(); ++i) { - ABORT_IF(prevCount != counts[i], - "An empty source or target sentence has been encountered!"); - prevCount = counts[i]; - } - } - - return samples; - } -}; - -AdaptiveContextIterator::AdaptiveContextIterator(AdaptiveContextReader* trainSetReader) : trainSetReader_(trainSetReader) { - if(trainSetReader) { - currentSamples_ = trainSetReader_->getSamples(); - } -} - -void AdaptiveContextIterator::increment() { - // If the previous increment has exhausted the file, we must indicate that the we've reached - // the iterator's end - if(trainSetReader_->eof() && trainSetReader_ != nullptr) { - trainSetReader_ = nullptr; - return; - } - // If we're at the end of the iterator and increment has been called yet another time, there's - // a bug in the calling code - ABORT_IF(trainSetReader_ == nullptr, "Incrementing past the end of the iterator isn't allowed"); - - currentSamples_ = trainSetReader_->getSamples(); -} - class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { public: TrainSelfAdaptive(Ptr options) : options_(options) { From 95ed9afc2a8423ed3318b5aeafa31851e70b6b9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 
17 Sep 2021 11:33:36 +0300 Subject: [PATCH 066/135] Move method definitions from adaptive_context.h to .cpp --- src/CMakeLists.txt | 1 + src/data/adaptive_context.cpp | 97 +++++++++++++++++++++++++++++++++++ src/data/adaptive_context.h | 87 ++++--------------------------- 3 files changed, 107 insertions(+), 78 deletions(-) create mode 100644 src/data/adaptive_context.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6084f091e..44aebe6f4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,6 +41,7 @@ set(MARIAN_SOURCES data/corpus_nbest.cpp data/text_input.cpp data/shortlist.cpp + data/adaptive_context.cpp 3rd_party/cnpy/cnpy.cpp 3rd_party/ExceptionWithCallStack.cpp diff --git a/src/data/adaptive_context.cpp b/src/data/adaptive_context.cpp new file mode 100644 index 000000000..f63a9fc54 --- /dev/null +++ b/src/data/adaptive_context.cpp @@ -0,0 +1,97 @@ +#include "data/adaptive_context.h" + +namespace marian { +namespace data { + +AdaptiveContextIterator::AdaptiveContextIterator(AdaptiveContextReader* trainSetReader) + : trainSetReader_(trainSetReader) { + if(trainSetReader) { + currentSamples_ = trainSetReader_->getSamples(); + } +} + +bool AdaptiveContextIterator::equal(const AdaptiveContextIterator& other) const { + return other.trainSetReader_ == trainSetReader_; +} + +const std::vector& AdaptiveContextIterator::dereference() const { + return currentSamples_; +} + +void AdaptiveContextIterator::increment() { + // If the previous increment has exhausted the file, we must indicate that the we've reached + // the iterator's end + if(trainSetReader_->eof() && trainSetReader_ != nullptr) { + trainSetReader_ = nullptr; + return; + } + // If we're at the end of the iterator and increment has been called yet another time, there's + // a bug in the calling code + ABORT_IF(trainSetReader_ == nullptr, "Incrementing past the end of the iterator isn't allowed"); + + currentSamples_ = trainSetReader_->getSamples(); +} + + 
+AdaptiveContextReader::AdaptiveContextReader(std::vector paths) { + for(auto& path : paths) + files_.emplace_back(new io::InputFileStream(path)); +} + +AdaptiveContextIterator AdaptiveContextReader::begin() { + return AdaptiveContextIterator(this); +} + +AdaptiveContextIterator AdaptiveContextReader::end() { + return AdaptiveContextIterator(nullptr); +} + +bool AdaptiveContextReader::eof() { + return eof_; +} + +std::vector AdaptiveContextReader::getSamples() { + // extracted lines for source and target corpora + std::vector samples; + // counters of number of lines extracted for source and target + std::vector counts; + + // Early exit if input files are exhausted + if (eof_) return samples; + + for(auto const& file : files_) { + size_t currCount = 0; + std::string lines; + std::string line; + bool fileEnded = true; + while(io::getline(*file, line)) { + if(line.empty()) { + fileEnded = false; + break; + } + + if(currCount) + lines += "\n"; + lines += line; + currCount += 1; + } + eof_ = fileEnded; + + if(!lines.empty()) + samples.emplace_back(lines); + counts.push_back(currCount); + + // check if the same number of lines is extracted for source and target + size_t prevCount = counts[0]; + for(size_t i = 1; i < counts.size(); ++i) { + ABORT_IF(prevCount != counts[i], + "An empty source or target sentence has been encountered!"); + prevCount = counts[i]; + } + } + + return samples; +} + +} // namespace data +} // namespace marian diff --git a/src/data/adaptive_context.h b/src/data/adaptive_context.h index 4b61ff546..167cd1efe 100644 --- a/src/data/adaptive_context.h +++ b/src/data/adaptive_context.h @@ -4,6 +4,7 @@ #include "data/iterator_facade.h" namespace marian { +namespace data { class AdaptiveContextReader; @@ -21,11 +22,9 @@ class AdaptiveContextIterator // would make it difficult AdaptiveContextIterator(AdaptiveContextReader* trainSetReader); - bool equal(const AdaptiveContextIterator& other) const override { - return other.trainSetReader_ == 
trainSetReader_; - } + bool equal(const AdaptiveContextIterator& other) const override; - const std::vector& dereference() const override { return currentSamples_; } + const std::vector& dereference() const override; void increment() override; }; @@ -51,10 +50,7 @@ class AdaptiveContextReader { * can be empty (no context is provided for the respective translatable * sentence) in which case it is also represented by a single empty line. */ - AdaptiveContextReader(std::vector paths) { - for(auto& path : paths) - files_.emplace_back(new io::InputFileStream(path)); - } + AdaptiveContextReader(std::vector paths); /** * @brief Returns an iterator over the sets of context sentences produced by @@ -62,17 +58,11 @@ class AdaptiveContextReader { * * @return the beginning of the iterator. */ - AdaptiveContextIterator begin() { - return AdaptiveContextIterator(this); - } + AdaptiveContextIterator begin(); - AdaptiveContextIterator end() { - return AdaptiveContextIterator(nullptr); - } + AdaptiveContextIterator end(); - bool eof() { - return eof_; - } + bool eof(); /** * @brief Reads the next set of samples -- the contaxt sentences -- for @@ -90,67 +80,8 @@ class AdaptiveContextReader { * single file, e.g., [0] could contain 3 newline separated sentences in * English and [1] would contain their 3 respective translations in Latvian. 
*/ - std::vector getSamples() { - // extracted lines for source and target corpora - std::vector samples; - // counters of number of lines extracted for source and target - std::vector counts; - - // Early exit if input files are exhausted - if (eof_) return samples; - - for(auto const& file : files_) { - size_t currCount = 0; - std::string lines; - std::string line; - bool fileEnded = true; - while(io::getline(*file, line)) { - if(line.empty()) { - fileEnded = false; - break; - } - - if(currCount) - lines += "\n"; - lines += line; - currCount += 1; - } - eof_ = fileEnded; - - if(!lines.empty()) - samples.emplace_back(lines); - counts.push_back(currCount); - - // check if the same number of lines is extracted for source and target - size_t prevCount = counts[0]; - for(size_t i = 1; i < counts.size(); ++i) { - ABORT_IF(prevCount != counts[i], - "An empty source or target sentence has been encountered!"); - prevCount = counts[i]; - } - } - - return samples; - } + std::vector getSamples(); }; -AdaptiveContextIterator::AdaptiveContextIterator(AdaptiveContextReader* trainSetReader) : trainSetReader_(trainSetReader) { - if(trainSetReader) { - currentSamples_ = trainSetReader_->getSamples(); - } -} - -void AdaptiveContextIterator::increment() { - // If the previous increment has exhausted the file, we must indicate that the we've reached - // the iterator's end - if(trainSetReader_->eof() && trainSetReader_ != nullptr) { - trainSetReader_ = nullptr; - return; - } - // If we're at the end of the iterator and increment has been called yet another time, there's - // a bug in the calling code - ABORT_IF(trainSetReader_ == nullptr, "Incrementing past the end of the iterator isn't allowed"); - - currentSamples_ = trainSetReader_->getSamples(); -} +} // namespace data } // namespace marian From 030ddb019fe74b2aaa38fcf8cf2b1f0e7bdaa8a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 17 Sep 2021 11:40:36 +0300 Subject: [PATCH 067/135] Introduce more 
whitespace for readability --- src/data/adaptive_context.cpp | 11 +++++++++++ src/data/adaptive_context.h | 8 +++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/data/adaptive_context.cpp b/src/data/adaptive_context.cpp index f63a9fc54..9ac680e83 100644 --- a/src/data/adaptive_context.cpp +++ b/src/data/adaptive_context.cpp @@ -3,6 +3,7 @@ namespace marian { namespace data { + AdaptiveContextIterator::AdaptiveContextIterator(AdaptiveContextReader* trainSetReader) : trainSetReader_(trainSetReader) { if(trainSetReader) { @@ -10,14 +11,17 @@ AdaptiveContextIterator::AdaptiveContextIterator(AdaptiveContextReader* trainSet } } + bool AdaptiveContextIterator::equal(const AdaptiveContextIterator& other) const { return other.trainSetReader_ == trainSetReader_; } + const std::vector& AdaptiveContextIterator::dereference() const { return currentSamples_; } + void AdaptiveContextIterator::increment() { // If the previous increment has exhausted the file, we must indicate that the we've reached // the iterator's end @@ -33,23 +37,29 @@ void AdaptiveContextIterator::increment() { } + + AdaptiveContextReader::AdaptiveContextReader(std::vector paths) { for(auto& path : paths) files_.emplace_back(new io::InputFileStream(path)); } + AdaptiveContextIterator AdaptiveContextReader::begin() { return AdaptiveContextIterator(this); } + AdaptiveContextIterator AdaptiveContextReader::end() { return AdaptiveContextIterator(nullptr); } + bool AdaptiveContextReader::eof() { return eof_; } + std::vector AdaptiveContextReader::getSamples() { // extracted lines for source and target corpora std::vector samples; @@ -93,5 +103,6 @@ std::vector AdaptiveContextReader::getSamples() { return samples; } + } // namespace data } // namespace marian diff --git a/src/data/adaptive_context.h b/src/data/adaptive_context.h index 167cd1efe..f0d2fe93a 100644 --- a/src/data/adaptive_context.h +++ b/src/data/adaptive_context.h @@ -6,17 +6,20 @@ namespace marian { namespace data { + class 
AdaptiveContextReader; + /** * @brief An iterator for easier access of the context sentences produced by * `AdaptiveContextReader::getSamples()` */ class AdaptiveContextIterator : public IteratorFacade> { -private: + AdaptiveContextReader* trainSetReader_; std::vector currentSamples_; + public: // TODO: should we use a smart pointer here instead? The TrainSetReader::begin() method // would make it difficult @@ -29,11 +32,13 @@ class AdaptiveContextIterator void increment() override; }; + /** * @brief Reads the context sentences, that are used for on-the-fly training in * the self-adaptive translation mode, from files. */ class AdaptiveContextReader { + std::vector> files_; /// Indicates whether the input files have been exhausted. bool eof_ = false; @@ -83,5 +88,6 @@ class AdaptiveContextReader { std::vector getSamples(); }; + } // namespace data } // namespace marian From c90a4d7c361e8911564ca2717041d05bc302a12b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 17 Sep 2021 14:35:20 +0300 Subject: [PATCH 068/135] Rename and move the adaptive translation function --- src/translator/self_adaptive.h | 70 +++++++++++++++++----------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 8aa5189f7..d816a9f1c 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -74,7 +74,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "Running..."); - translate(testBatches, contexts.begin(), contexts.end(), collector); + adaptAndTranslate(testBatches, contexts.begin(), contexts.end(), collector); auto translations = collector->collect(options_->get("n-best")); YAML::Emitter output; @@ -82,39 +82,6 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { return "{\"output\":" + std::string(output.c_str()) + "}"; } - template - void translate( - Ptr> - testBatches, - Iterator trainBegin, - 
Iterator trainEnd, - Ptr collector) { - auto printer = New(options_, cpuModel_->TrgVocab()); - - for(auto testBatch : *testBatches) { - ABORT_IF(trainBegin == trainEnd, "Context batches ran out before test batches"); - - auto trainSet = *trainBegin; - ++trainBegin; - - if(!trainSet.empty()) { - LOG(info, "# NEW TEST BATCH"); - trainSlot_->SetModel(cpuModel_); - trainSlot_->Train(trainSet); - translateSlot_->PointToParams(*trainSlot_); - translate(testBatch, collector, printer); - needsSwitching_ = true; - } else { - LOG(info, "# EMPTY TEST BATCH"); - if(needsSwitching_) { - translateSlot_->Load(*cpuModel_); - needsSwitching_ = false; - } - translate(testBatch, collector, printer); - } - } - } - void run() override { // Initialize input data auto srcPaths = options_->get>("input"); @@ -135,7 +102,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { LOG(info, "Running..."); - translate(testBatches, trainSets->begin(), trainSets->end(), collector); + adaptAndTranslate(testBatches, trainSets->begin(), trainSets->end(), collector); } private: @@ -148,6 +115,39 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Ptr translateEngine_; bool needsSwitching_ = true; + template + void adaptAndTranslate( + Ptr> + testBatches, + Iterator trainBegin, + Iterator trainEnd, + Ptr collector) { + auto printer = New(options_, cpuModel_->TrgVocab()); + + for(auto testBatch : *testBatches) { + ABORT_IF(trainBegin == trainEnd, "Context batches ran out before test batches"); + + auto trainSet = *trainBegin; + ++trainBegin; + + if(!trainSet.empty()) { + LOG(info, "# NEW TEST BATCH"); + trainSlot_->SetModel(cpuModel_); + trainSlot_->Train(trainSet); + translateSlot_->PointToParams(*trainSlot_); + translate(testBatch, collector, printer); + needsSwitching_ = true; + } else { + LOG(info, "# EMPTY TEST BATCH"); + if(needsSwitching_) { + translateSlot_->Load(*cpuModel_); + needsSwitching_ = false; + } + translate(testBatch, collector, printer); + } + } 
+ } + void translate(Ptr batch, Ptr collector, Ptr printer) { From 448de67b08340677fee459c88c43c6342c8fc6fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 17 Sep 2021 14:35:42 +0300 Subject: [PATCH 069/135] Unhardcode the maximum translation input length parameter --- src/common/config_parser.cpp | 4 ++++ src/translator/self_adaptive.h | 5 ++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index b46b6a6e7..26a4d6601 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -700,6 +700,10 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { addSuboptionsTSV(cli); addSuboptionsDevices(cli); addSuboptionsBatching(cli); + } else { + cli.add("--max-length-translate", + "Maximum input sentence length for translation", + 1000); } // for self-adaptive mode vocabs are already added via the training options diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index d816a9f1c..bf942e89f 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -27,9 +27,8 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { // adaptation happens per sentence optionsTrans_->set("mini-batch", 1); optionsTrans_->set("maxi-batch", 1); - // TODO: should probably un-hardcode this? 
The issue is, though, that the users - // might want separate options for training and translation - optionsTrans_->set("max-length", 1000); + auto maxTranslationInput = options_->get("max-length-translate"); + optionsTrans_->set("max-length", maxTranslationInput); optionsTrans_->set("shuffle", "none"); auto modelFilename = options_->get("model"); From a6639ffb220df5afef3a4534d7bc7d329f199352 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 17 Sep 2021 15:26:11 +0300 Subject: [PATCH 070/135] Compile adaptive_context.cpp conditionally --- src/CMakeLists.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 44aebe6f4..471c49770 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,7 +41,6 @@ set(MARIAN_SOURCES data/corpus_nbest.cpp data/text_input.cpp data/shortlist.cpp - data/adaptive_context.cpp 3rd_party/cnpy/cnpy.cpp 3rd_party/ExceptionWithCallStack.cpp @@ -126,6 +125,12 @@ set(MARIAN_SOURCES $ ) +if(COMPILE_ADAPTIVE) + set(MARIAN_SOURCES ${MARIAN_SOURCES} + data/adaptive_context.cpp + ) +endif(COMPILE_ADAPTIVE) + add_library(marian STATIC ${MARIAN_SOURCES}) target_compile_options(marian PRIVATE ${ALL_WARNINGS}) From bafcae1274411fd9aea709132b73a3650019fc1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 20 Sep 2021 13:15:20 +0300 Subject: [PATCH 071/135] Remove the marian_swapper executable --- src/CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 471c49770..45e7e538f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -221,10 +221,6 @@ if (NOT COMPILE_LIBRARY_ONLY) set_target_properties(marian_decoder PROPERTIES OUTPUT_NAME marian-decoder) target_compile_options(marian_decoder PRIVATE ${ALL_WARNINGS}) - add_executable(marian_swapper command/marian_swapper.cpp) - set_target_properties(marian_swapper PROPERTIES OUTPUT_NAME marian_swapper) 
- target_compile_options(marian_swapper PRIVATE ${ALL_WARNINGS}) - add_executable(marian_scorer command/marian_scorer.cpp) set_target_properties(marian_scorer PROPERTIES OUTPUT_NAME marian-scorer) target_compile_options(marian_scorer PRIVATE ${ALL_WARNINGS}) @@ -237,7 +233,7 @@ if (NOT COMPILE_LIBRARY_ONLY) set_target_properties(marian_conv PROPERTIES OUTPUT_NAME marian-conv) target_compile_options(marian_conv PRIVATE ${ALL_WARNINGS}) - set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_swapper marian_scorer marian_vocab marian_conv) + set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv) # marian.zip and marian.tgz # This combines marian, marian_decoder in a single ZIP or TAR file for From afc5e158f037599d7509a2d541250c8f6d3791c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 20 Sep 2021 15:20:31 +0300 Subject: [PATCH 072/135] Remove dead code from the model swapping code --- src/command/marian_swapper.cpp | 98 ---------------------------------- src/tensors/gpu/swap.cu | 4 -- src/tensors/gpu/swap.h | 5 -- src/translator/swappable.cpp | 58 -------------------- src/translator/swappable.h | 11 ---- 5 files changed, 176 deletions(-) delete mode 100644 src/command/marian_swapper.cpp diff --git a/src/command/marian_swapper.cpp b/src/command/marian_swapper.cpp deleted file mode 100644 index 758501d1e..000000000 --- a/src/command/marian_swapper.cpp +++ /dev/null @@ -1,98 +0,0 @@ -#include "translator/history.h" -#include "translator/output_printer.h" -#include "translator/swappable.h" - -#include -#include -#include - -namespace marian { -void LoadBig(Ptr options, std::unordered_map &to) { - to.emplace("pten", CPULoadedModel(options, - "/home/ubuntu/consistent-big-models/padded/pten.npz", - {"/home/ubuntu/consistent-big-models/padded/pten.vocab"}, - "/home/ubuntu/consistent-big-models/padded/pten.vocab")); - - to.emplace("enit", CPULoadedModel(options, - 
"/home/ubuntu/consistent-big-models/padded/enit.npz", - {"/home/ubuntu/consistent-big-models/padded/enit.vocab"}, - "/home/ubuntu/consistent-big-models/padded/enit.vocab")); -} - -void LoadTiny(Ptr options, std::unordered_map &to) { - std::vector models = {"csen", "encs", "enet", "eten", "esen", "enes"}; - for (const std::string m : models) { - std::string base = "/home/ubuntu/consistent-bergamot-students/padded/"; - base += m + "."; - to.emplace(m, CPULoadedModel(options, base + "npz", {base + "spm"}, base + "spm")); - } -} - -} // namespace - -/* Demo program: run with options for any of the models */ -int main(int argc, char** argv) { - using namespace marian; - Ptr options = parseOptions(argc, argv, cli::mode::translation); - - Ptr engine = New(options, 0); - GPULoadedModel slot(engine); - - std::unordered_map models; -// LoadBig(options, models); - LoadTiny(options, models); - - // begin with a space to avoid conflict with a real sentence. - const std::string kSwitchPrefix(" CHANGE "); - - bool alignments = !options->get("alignment").empty(); - - bool loaded = false; - std::string line; - while (std::getline(std::cin, line)) { - // Switch out which model is used. - if (line.substr(0, kSwitchPrefix.size()) == kSwitchPrefix) { - std::string key = line.substr(kSwitchPrefix.size()); - auto found = models.find(key); - if (found == models.end()) { - std::cerr << "Model for " << key << " not loaded." << std::endl; - return 1; - } - slot.Load(found->second); - loaded = true; - continue; - } - if (!loaded) { - std::cerr << "Select a model first." << std::endl; - continue; - } - - // Actually translating with a model. - marian::Histories histories = slot.Translate({line}); - // In practice there is one history because we provided one line. 
- for(auto history : histories) { - Result result(history->top()); - Words words = std::get<0>(result); - std::cout << slot.TrgVocab()->decode(words) << std::endl; - - /* Print alignments */ - if (alignments) { - Hypothesis &hypo = *std::get<1>(result); - // [t][s] -> P(s|t) - marian::data::SoftAlignment alignment(hypo.tracebackAlignment()); - // An easier call for this is: - // std:cout << data::SoftAlignToString(alignment); - // The below is just there to show how access them programatically. - // NB you can convert to hard with data::ConvertSoftAlignToHardAlign(alignment, threshold) - for (auto target : alignment) { - for (float source : target) { - std::cout << source << ' '; - } - std::cout << '\n'; - } - } - } - } - - return 0; -} diff --git a/src/tensors/gpu/swap.cu b/src/tensors/gpu/swap.cu index 16210e0c5..c16a71614 100644 --- a/src/tensors/gpu/swap.cu +++ b/src/tensors/gpu/swap.cu @@ -9,9 +9,5 @@ namespace marian { CUDA_CHECK(cudaSetDevice(deviceId.no)); CUDA_CHECK(cudaMemcpy(gpuOut, in, count, cudaMemcpyHostToDevice)); } - void copyGpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { - CUDA_CHECK(cudaSetDevice(deviceId.no)); - CUDA_CHECK(cudaMemcpy(gpuOut, in, count, cudaMemcpyDeviceToDevice)); - } } } diff --git a/src/tensors/gpu/swap.h b/src/tensors/gpu/swap.h index 7d8784266..a020c8827 100644 --- a/src/tensors/gpu/swap.h +++ b/src/tensors/gpu/swap.h @@ -6,15 +6,10 @@ namespace marian { namespace swapper { #ifdef CUDA_FOUND void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId); - void copyGpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId); #else inline void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { ABORT("Copy from CPU to GPU memory is only available with CUDA."); } - - inline void copyGpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { - 
ABORT("Copy from GPU to GPU memory is only available with CUDA."); - } #endif } } diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index fa66c0ae9..565bbcbeb 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -11,23 +11,6 @@ #include "tensors/gpu/swap.h" namespace marian { -std::string MultilineInputHack(const std::vector &input) { - if (input.size() == 1) { - return input[0]; - } else { - std::string ret; - std::size_t size = 0; - for (auto&& line : input) { - size += line.size() + 1; - } - ret.reserve(size); - for (auto&& line : input) { - ret.append(line); - ret.append("\n"); - } - return ret; - } -} namespace { DeviceId LookupGPU(const Ptr options, size_t deviceIdx) { @@ -43,17 +26,6 @@ void get(std::vector &out, MemoryPiece::PtrType mem, Ptr backe gpu::copy(backend, mem->data(), mem->data() + mem->size(), out.data()); } -void GPUEngineTrain::SwapPointers( - std::vector &with) { - auto write_it = graph_->params()->begin(); - auto read_it = with.begin(); - - std::vector outvec; - for(; read_it != with.end(); ++write_it, ++read_it) { - std::swap(*(*write_it)->val()->memory(), **read_it); - } -} - GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) : options_(options), myDeviceId_(LookupGPU(options, deviceIdx)) { ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); @@ -192,17 +164,6 @@ GPULoadedModel::~GPULoadedModel() { } } -void GPULoadedModel::Load(const GPULoadedModel &from) { - srcVocabs_ = from.srcVocabs_; - trgVocab_ = from.trgVocab_; - - ABORT_IF(engine_ != from.engine_, "TODO: copy across GPUs."); - - for (size_t i = 0; i < parameters_.size(); ++i) { - swapper::copyGpuToGpu(reinterpret_cast(parameters_[i]->data()), reinterpret_cast(from.parameters_[i]->data()), parameters_[i]->size(), engine_->myDeviceId_); - } -} - void GPULoadedModel::PointToParams(const GPULoadedModelTrain &from) { ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: 
copy across GPUs."); srcVocabs_ = from.srcVocabs_; @@ -218,25 +179,6 @@ void GPULoadedModel::Load(const CPULoadedModel &from) { } } -Histories GPULoadedModel::Translate(const std::vector &input) { - ABORT_IF(!trgVocab_, "GPULoadedModel needs to be overwritten by a CPU model first."); - engine_->SwapPointers(parameters_); - - auto corpus = New(std::vector(1, MultilineInputHack(input)), srcVocabs_, engine_->options_); // @TODO dirty hack - data::BatchGenerator batchGenerator(corpus, engine_->options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash - - BeamSearch search(engine_->options_, engine_->scorers_, trgVocab_); - Histories ret; - ret.reserve(input.size()); - for (auto&& batch : batchGenerator) { - auto result = search.search(engine_->graph_, batch); - ret.insert(ret.end(), result.begin(), result.end()); - } - std::sort(ret.begin(), ret.end(),[](marian::Ptr a, marian::Ptr b){return a->getLineNum() < b->getLineNum();}); - engine_->SwapPointers(parameters_); - return ret; -} - Histories GPULoadedModel::Translate(const Ptr batch) { ABORT_IF(!trgVocab_, "GPULoadedModel needs to be overwritten by a CPU model first."); // std::vector outvec; diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 7aee61c9d..6128b5db4 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -33,7 +33,6 @@ class GPUEngineTrain { Ptr builder_; const DeviceId myDeviceId_; - void SwapPointers(std::vector &with); void RecreateGraphAndBuilder(); public: @@ -76,13 +75,6 @@ class GPULoadedModelTrain { }; - - -// ##### ^ above is stuff for runtime domain adaptation - - - - /* Execute on a particular device */ class GPUEngine { private: @@ -126,11 +118,8 @@ class GPULoadedModel { // Overwrite this model with parameters from a different one. 
void Load(const CPULoadedModel &from); - void Load(const GPULoadedModel &from); - void Load(const GPULoadedModelTrain &from); void PointToParams(const GPULoadedModelTrain &from); - Histories Translate(const std::vector &input); Histories Translate(const Ptr batch); }; From 6aeb510ce78e0b955dff93829e764be7cc8cbe76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 21 Sep 2021 16:03:36 +0300 Subject: [PATCH 073/135] Rename some swappable classes and improve documentation --- src/data/adaptive_context.h | 2 +- src/translator/self_adaptive.h | 8 +-- src/translator/swappable.cpp | 25 ++++---- src/translator/swappable.h | 106 ++++++++++++++++++++++----------- 4 files changed, 89 insertions(+), 52 deletions(-) diff --git a/src/data/adaptive_context.h b/src/data/adaptive_context.h index f0d2fe93a..80d2213da 100644 --- a/src/data/adaptive_context.h +++ b/src/data/adaptive_context.h @@ -81,7 +81,7 @@ class AdaptiveContextReader { * represented by a single empty line. * * @return a vector representing a single group of context sentences. Each - * element in the vector contains newline seperated input lines comming from a + * element in the vector contains newline separated input lines comming from a * single file, e.g., [0] could contain 3 newline separated sentences in * English and [1] would contain their 3 respective translations in Latvian. 
*/ diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index bf942e89f..962ac164c 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -39,10 +39,10 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { auto vocabPaths = options_->get>("vocabs"); std::vector srcVocabPaths(vocabPaths.begin(), vocabPaths.end() - 1); cpuModel_ = New(options_, modelFilename, srcVocabPaths, vocabPaths.back()); - translateEngine_ = New(optionsTrans_, 0); + translateEngine_ = New(optionsTrans_, 0); translateSlot_ = New(translateEngine_); trainEngine_ = New(options_, 0); - trainSlot_ = New(trainEngine_); + trainSlot_ = New(trainEngine_); } std::string run(const std::string& json) override { @@ -108,10 +108,10 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Ptr options_; // Options for training Ptr optionsTrans_; // Options for translator Ptr cpuModel_; - Ptr trainSlot_; + Ptr trainSlot_; Ptr translateSlot_; Ptr trainEngine_; - Ptr translateEngine_; + Ptr translateEngine_; bool needsSwitching_ = true; template diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 565bbcbeb..dff2e02e4 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -48,24 +48,23 @@ void GPUEngineTrain::RecreateGraphAndBuilder() { GPUEngineTrain::~GPUEngineTrain() {} -GPULoadedModelTrain::GPULoadedModelTrain(Ptr gpu) : engine_(gpu) { +SwappableModelTrainer::SwappableModelTrainer(Ptr gpu) : engine_(gpu) { } -GPULoadedModelTrain::~GPULoadedModelTrain() { +SwappableModelTrainer::~SwappableModelTrainer() { } -void GPULoadedModelTrain::SetModel(Ptr from) { +void SwappableModelTrainer::SetModel(Ptr from) { srcVocabs_ = from->SrcVocabs(); trgVocab_ = from->TrgVocab(); cpuModel_ = from; } -std::vector GPULoadedModelTrain::Parameters() const { +std::vector SwappableModelTrainer::Parameters() const { return engine_->graph_->params()->toMemoryPieces(); } -// Load the initial model 
(dropping any previous changes) and train it on the provided input -void GPULoadedModelTrain::Train(const std::vector &input) { +void SwappableModelTrainer::Train(const std::vector &input) { ABORT_IF(!trgVocab_, "GPULoadedModelTrain needs to be overwritten by a CPU model first."); auto state = New(engine_->options_->get("learn-rate")); @@ -78,8 +77,8 @@ void GPULoadedModelTrain::Train(const std::vector &input) { allVocabs.reserve(srcVocabs_.size() + 1); allVocabs.insert(allVocabs.end(), srcVocabs_.begin(), srcVocabs_.end()); allVocabs.emplace_back(trgVocab_); - auto corpus = New(input, allVocabs, engine_->options_); // @TODO dirty hack - data::BatchGenerator batchGenerator(corpus, engine_->options_, nullptr, false); // @TODO if the asynchronous batch preparation = true, but we supply less text than the mini-batch size we crash + auto corpus = New(input, allVocabs, engine_->options_); + data::BatchGenerator batchGenerator(corpus, engine_->options_, nullptr, false); // We reset the training graph to the original model parameters to prepare // for adapting it to the new inputs @@ -121,7 +120,7 @@ void GPULoadedModelTrain::Train(const std::vector &input) { -void GPUEngine::SwapPointers(std::vector &with) { +void GPUEngineTranslate::SwapPointers(std::vector &with) { auto write_it = graph_->params()->begin(); auto read_it = with.begin(); for (; read_it != with.end(); ++write_it, ++read_it) { @@ -129,7 +128,7 @@ void GPUEngine::SwapPointers(std::vector &with) { } } -GPUEngine::GPUEngine(Ptr options, size_t deviceIdx) +GPUEngineTranslate::GPUEngineTranslate(Ptr options, size_t deviceIdx) : options_(options), graph_(New(true)), myDeviceId_(LookupGPU(options, deviceIdx)), allocator_(myDeviceId_, 0, 128 * 1048576) { ABORT_IF(myDeviceId_.type == DeviceType::cpu, "Swappable slot only works for GPU devices."); options_->set("inference", true); @@ -150,9 +149,9 @@ GPUEngine::GPUEngine(Ptr options, size_t deviceIdx) // TODO: reach into graph_->params() private members and free the 
parameter memory. } -GPUEngine::~GPUEngine() {} +GPUEngineTranslate::~GPUEngineTranslate() {} -GPULoadedModel::GPULoadedModel(Ptr gpu) : engine_(gpu) { +GPULoadedModel::GPULoadedModel(Ptr gpu) : engine_(gpu) { for (auto ¶m : *engine_->graph_->params()) { parameters_.push_back(engine_->allocator_.alloc(param->val()->memory()->size())); } @@ -164,7 +163,7 @@ GPULoadedModel::~GPULoadedModel() { } } -void GPULoadedModel::PointToParams(const GPULoadedModelTrain &from) { +void GPULoadedModel::PointToParams(const SwappableModelTrainer &from) { ABORT_IF(engine_->myDeviceId_ != from.engine_->myDeviceId_, "TODO: copy across GPUs."); srcVocabs_ = from.srcVocabs_; trgVocab_ = from.trgVocab_; diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 6128b5db4..ffa2666ee 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -15,7 +15,7 @@ #include namespace marian { -class GPULoadedModelTrain; +class SwappableModelTrainer; class Scorer; @@ -23,10 +23,13 @@ class GPULoadedModel; class CPULoadedModel; -/* Execute on a particular device */ +/** + * The class wraps an expression graph and a model builder that are used by + * `SwappableModelTrainer` for training a model. + */ class GPUEngineTrain { private: - friend class GPULoadedModelTrain; + friend class SwappableModelTrainer; friend class GPULoadedModel; Ptr options_; Ptr graph_; @@ -46,8 +49,14 @@ class GPUEngineTrain { ~GPUEngineTrain(); }; -/* A model loaded on the GPU that can be overwritten from CPU or GPU. */ -class GPULoadedModelTrain { +/** + * @brief Wraps a `GPUEngineTrain` and a `CPULoadedModel` and performs model + * training. + * + * This class is created with self-adaptive translation in mind. Each invocation + * of Train() resets the model parameters at the start of training. 
+ */ +class SwappableModelTrainer { private: friend class GPULoadedModel; @@ -58,57 +67,74 @@ class GPULoadedModelTrain { Ptr trgVocab_; public: - GPULoadedModelTrain(Ptr gpu); + SwappableModelTrainer(Ptr gpu); - ~GPULoadedModelTrain(); + ~SwappableModelTrainer(); const std::vector> &SrcVocabs() const { return srcVocabs_; } Ptr TrgVocab() const { return trgVocab_; } - // Change the internal pointers to vocabularies and CPULoadedModel to different ones + /// Change the internal pointers to vocabularies and CPULoadedModel to + /// different ones void SetModel(Ptr from); std::vector Parameters() const; + /** + * @brief resets the training graph, reloads the model parameters and trains + * the model on the provided inputs. + * + * Intended to be used in the self-adaptive translation mode -- training is + * always performed on the original model parameters, each training + * invocation resets previous changes. + * + * @param input Training data. A vector representing a parallel corpus -- + * vector elements are the different sides of a parallel corpus, each is a + * newline separated set of sentences in a single language. + */ void Train(const std::vector &input); }; +/** + * The class wraps an expression graph and scorers that are used by + * `GPULoadedModel` for translation. + */ +class GPUEngineTranslate { +private: + friend class GPULoadedModel; + Ptr options_; + Ptr graph_; + std::vector> scorers_; + const DeviceId myDeviceId_; + Allocator allocator_; -/* Execute on a particular device */ -class GPUEngine { - private: - friend class GPULoadedModel; - Ptr options_; - Ptr graph_; - std::vector > scorers_; - const DeviceId myDeviceId_; - Allocator allocator_; - - void SwapPointers(std::vector &with); + void SwapPointers(std::vector &with); - public: - /** - * @param options The marian options object - * @param deviceNum The index of the device you want to use for this slot. 
Note that this is not the deviceID but the index of the device in the - * array of supplied devices. Eg if you provide -d 0 3 5 and you want the Slot to run on GPU 3, you provide deviceNum=1. - */ - explicit GPUEngine(Ptr options, size_t deviceNum); +public: + /** + * @param options The marian options object + * @param deviceNum The index of the device you want to use for this slot. Note that this is not the deviceID but the index of the device in the + * array of supplied devices. Eg if you provide -d 0 3 5 and you want the Slot to run on GPU 3, you provide deviceNum=1. + */ + explicit GPUEngineTranslate(Ptr options, size_t deviceNum); - ~GPUEngine(); + ~GPUEngineTranslate(); }; -/* A model loaded on the GPU that can be overwritten from CPU or GPU. */ +/** A model loaded on the GPU that can be overwritten from CPU. Facilitates + * translation with the model. + */ class GPULoadedModel { private: - Ptr engine_; + Ptr engine_; std::vector parameters_; std::vector> srcVocabs_; Ptr trgVocab_; public: - GPULoadedModel(Ptr gpu); + GPULoadedModel(Ptr gpu); ~GPULoadedModel(); @@ -116,14 +142,25 @@ class GPULoadedModel { Ptr TrgVocab() const { return trgVocab_; } - // Overwrite this model with parameters from a different one. + /// Overwrite this model with parameters from a different one. void Load(const CPULoadedModel &from); - void PointToParams(const GPULoadedModelTrain &from); + /** + * @brief Set the internal shared pointers to model parameters and + * vocabularies to different ones + * + * The effect is similar to `Load()` but nothing is copied in the process. + * + * @param from Swappable model trainer from which to take the shared + * pointers to model parameters and vocabularies. + */ + void PointToParams(const SwappableModelTrainer &from); Histories Translate(const Ptr batch); }; -/* A model loaded on the CPU. */ +/** + * A model loaded on the CPU. Holds model parameters and vocabularies. 
+ */ class CPULoadedModel { private: std::vector parameters_; @@ -131,7 +168,8 @@ class CPULoadedModel { Ptr trgVocab_; public: - // The parts of Options that relate to model and vocab are ignored. The files provided will be loaded. + // The parts of Options that relate to model and vocab are ignored. The + // files provided will be loaded. CPULoadedModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath); const std::vector &Parameters() const { return parameters_; } From ad38da9ae18c6afbc8957189cfea00d31a3a7b0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 21 Sep 2021 17:02:19 +0300 Subject: [PATCH 074/135] Describe the purpose of swappable.h --- src/translator/swappable.h | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/translator/swappable.h b/src/translator/swappable.h index ffa2666ee..8bd9d84bd 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -1,9 +1,18 @@ #pragma once -/* Support for swapping models in and out of a GPU, when you have more models - * than fit in the GPU's RAM. The models must have identical graphs, including - * size. They can have different parameters and different vocabularies but the - * vocabularies must have the same size. To make vocabulary the same size, pad - * using scripts/contrib/pad_model_vocabulary.py offline. +/* Support for swapping and resetting models for the self-adaptive translation + * mode. The intended use case is to store a read-only copy of the model in + * `CPULoadedModel`, optionally train on a copy of the parameters using + * `SwappableModelTrainer` and then transfer either the trained or original + * model parameters into `GPULoadedModel` for translation. `GPUEngineTrain` and + * `GPUEngineTranslate` are used for storing the expression graphs for training + * and translation, respectively, and other related things. 
Translation on the + * CPU currently isn't supported. + * + * Originally this code was intended to allow multiple models to share a single + * GPU for translation and be swapped into GPU memory only when needed. However, + * parts of it, that weren't needed for self-adaptive translation, have been + * trimmed down since then. Look into the commit history if you want to revive + * this functionality. */ #include "common/io.h" #include "data/vocab.h" From 295040db405a95516125b221612273494ee6089b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 22 Sep 2021 13:41:37 +0300 Subject: [PATCH 075/135] Explain the purpose of self-adaptive code --- src/translator/self_adaptive.h | 8 ++++++++ src/translator/swappable.h | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 962ac164c..94fdf4853 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -14,6 +14,14 @@ namespace marian { using namespace data; +/** + * @brief Implementation of the self-adaptive translation mode. + * + * Self-adaptive translation means optionally using a set of context sentences + * (e.g., provided by a translation memory), that are similar to the + * translatable sentence, to train the model for a few iterations to fine-tune + * it before translating the given sentence. + */ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { public: TrainSelfAdaptive(Ptr options) : options_(options) { diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 8bd9d84bd..3f6864751 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -1,5 +1,6 @@ #pragma once -/* Support for swapping and resetting models for the self-adaptive translation +/** + * Support for swapping and resetting models for the self-adaptive translation * mode. 
The intended use case is to store a read-only copy of the model in * `CPULoadedModel`, optionally train on a copy of the parameters using * `SwappableModelTrainer` and then transfer either the trained or original From 6311f2bb4534475fcc9d1c0c6e13541632484312 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 22 Sep 2021 14:49:59 +0300 Subject: [PATCH 076/135] Improve comments in self-adaptive code --- src/translator/self_adaptive.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 94fdf4853..f129777f1 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -25,9 +25,6 @@ using namespace data; class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { public: TrainSelfAdaptive(Ptr options) : options_(options) { - - // @TODO: should probably better re-enable the shuffling related options - // in config for marian-adaptive options_->set("shuffle", "none"); // Set up translator options optionsTrans_ = New(options_->clone()); @@ -40,8 +37,9 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { optionsTrans_->set("shuffle", "none"); auto modelFilename = options_->get("model"); - // Training has a single "model", translation can have multiple "models" in the general case. - // Adaptive options also take a single "model" so we have to adapt translation options manually. + // Training has a single "model", translation can have multiple "models" in + // the general case. Adaptive options also take only a single "model" so we + // have to adapt translation options manually. 
optionsTrans_->set>("models", {modelFilename}); auto vocabPaths = options_->get>("vocabs"); @@ -53,6 +51,16 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { trainSlot_ = New(trainEngine_); } + /** + * @brief Implementation for self-adaptive translation where data come from a + * web request. + * + * @param json Input data in JSON. An "input" array of strings is expected to + * contain translatable sentences, each of which has a corresponding set of + * context sentences as a sub-array in the "context" array. + * + * @return JSON-encoded translations + */ std::string run(const std::string& json) override { //LOG(warn, "REMOVEME Received Json:\n{}", json); @@ -89,6 +97,10 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { return "{\"output\":" + std::string(output.c_str()) + "}"; } + /** + * @brief Implementation for self-adaptive translation where inputs and + * outputs are specified in CLI options. + */ void run() override { // Initialize input data auto srcPaths = options_->get>("input"); From 6bf3445516a6c561bfb6b1090ee585859ec3da58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 24 Sep 2021 14:59:29 +0300 Subject: [PATCH 077/135] Check that param names and sizes match upon loading --- src/translator/swappable.cpp | 32 ++++++++++++++++++++++++++++++-- src/translator/swappable.h | 1 + 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index dff2e02e4..6920d41c3 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -153,6 +153,7 @@ GPUEngineTranslate::~GPUEngineTranslate() {} GPULoadedModel::GPULoadedModel(Ptr gpu) : engine_(gpu) { for (auto ¶m : *engine_->graph_->params()) { + names_.push_back(param->name()); parameters_.push_back(engine_->allocator_.alloc(param->val()->memory()->size())); } } @@ -173,8 +174,35 @@ void GPULoadedModel::PointToParams(const SwappableModelTrainer &from) { void 
GPULoadedModel::Load(const CPULoadedModel &from) { srcVocabs_ = from.SrcVocabs(); trgVocab_ = from.TrgVocab(); - for (size_t i = 0; i < parameters_.size(); ++i) { - swapper::copyCpuToGpu(reinterpret_cast(parameters_[i]->data()), from.Parameters()[i].data(), from.Parameters()[i].size(), engine_->myDeviceId_); + auto fromParams = from.Parameters(); + + auto printParamsAndExit = [&]() { + std::ostringstream paramNames; + for(size_t i = 0; i < parameters_.size(); ++i) { + paramNames << " TO (" << names_[i] << ") size: " << parameters_[i]->size() << "\n"; + } + for(size_t i = 0; i < fromParams.size(); ++i) { + paramNames << " FROM (" << fromParams[i].name << ") size: " << fromParams[i].size() << "\n"; + } + LOG(error, + "Attempting to load parameters with mismatched names or sizes:\n{}", + paramNames.str()); + ABORT("Attempting to load parameters with mismatched names or sizes."); + }; + + // Sanity check + if (parameters_.size() != fromParams.size()) + printParamsAndExit(); + + for(size_t i = 0; i < parameters_.size(); ++i) { + // Sanity check + if (names_[i] != fromParams[i].name || parameters_[i]->size() != fromParams[i].size()) + printParamsAndExit(); + + swapper::copyCpuToGpu(reinterpret_cast(parameters_[i]->data()), + fromParams[i].data(), + fromParams[i].size(), + engine_->myDeviceId_); } } diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 3f6864751..af3cffa4c 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -139,6 +139,7 @@ class GPULoadedModel { private: Ptr engine_; + std::vector names_; std::vector parameters_; std::vector> srcVocabs_; Ptr trgVocab_; From 5cac0d1ac02673bd8bf0b0e5c4e9a16842bc6eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 24 Sep 2021 15:00:58 +0300 Subject: [PATCH 078/135] Fix amun model loading --- src/models/amun.h | 19 ++++++++++++------- src/translator/swappable.cpp | 19 ++++++++++++------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git 
a/src/models/amun.h b/src/models/amun.h index 1bfda2697..9c42d4015 100644 --- a/src/models/amun.h +++ b/src/models/amun.h @@ -35,9 +35,7 @@ class Amun : public EncoderDecoder { "use --type s2s"); } - void load(Ptr graph, - const std::string& name, - bool /*markedReloaded*/ = true) override { + static void remapIoItems(std::vector &ioItems, bool tiedEmbeddinsSrcOrAll) { std::map nameMap = {{"decoder_U", "decoder_cell1_U"}, {"decoder_Ux", "decoder_cell1_Ux"}, @@ -86,12 +84,9 @@ class Amun : public EncoderDecoder { {"encoder_r_gamma1", "encoder_bi_r_gamma1"}, {"encoder_r_gamma2", "encoder_bi_r_gamma2"}}; - if(opt("tied-embeddings-src") || opt("tied-embeddings-all")) + if (tiedEmbeddinsSrcOrAll) nameMap["Wemb"] = "Wemb"; - LOG(info, "Loading model from {}", name); - // load items from .npz file - auto ioItems = io::loadItems(name); // map names and remove a dummy matrices for(auto it = ioItems.begin(); it != ioItems.end();) { // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size @@ -116,6 +111,16 @@ class Amun : public EncoderDecoder { it++; } } + } + + void load(Ptr graph, + const std::string& name, + bool /*markedReloaded*/ = true) override { + LOG(info, "Loading model from {}", name); + // load items from .npz file + auto ioItems = io::loadItems(name); + // remap item names and remove dummy matrices + remapIoItems(ioItems, opt("tied-embeddings-src") || opt("tied-embeddings-all")); // load items into the graph graph->load(ioItems); } diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 6920d41c3..56d3dd990 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -9,6 +9,7 @@ #include "common/timer.h" #include #include "tensors/gpu/swap.h" +#include "models/amun.h" namespace marian { @@ -228,15 +229,19 @@ Histories GPULoadedModel::Translate(const Ptr batch) { CPULoadedModel::CPULoadedModel(Ptr options, const 
std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) : parameters_(io::loadItems(parameters)) { // Load parameters. + //Remap the parameter names if the model uses an older naming convention + if (options->get("type") == "amun") { + bool tied = options->get("tied-embeddings-src") || options->get("tied-embeddings-all"); + Amun::remapIoItems(parameters_, tied); + } + // Find the special element and remove it: - size_t special_idx = 0; - for (size_t i = 0; i < parameters_.size(); i++) { - if (parameters_[i].name == "special:model.yml") { - special_idx = i; - break; - } + auto pred = [](const io::Item &item) { return item.name == "special:model.yml"; }; + auto special_it = std::find_if(parameters_.begin(), parameters_.end(), pred); + if (special_it != parameters_.end()) { + parameters_.erase(special_it); } - parameters_.erase(parameters_.begin() + special_idx); + // Prepare the name so that it matches the named map for (auto&& item : parameters_) { item.name = "F0::" + item.name; From 1e1397d767751c9b3429e74a00152fedd94dff56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 24 Sep 2021 15:29:37 +0300 Subject: [PATCH 079/135] Implement parameter name remapping for nematus models --- src/models/nematus.h | 340 ++++++++++++++++++----------------- src/translator/swappable.cpp | 16 +- 2 files changed, 184 insertions(+), 172 deletions(-) diff --git a/src/models/nematus.h b/src/models/nematus.h index 730418e5f..a5c0c1c3f 100644 --- a/src/models/nematus.h +++ b/src/models/nematus.h @@ -8,7 +8,7 @@ namespace marian { class Nematus : public EncoderDecoder { public: - Nematus(Ptr graph, Ptr options) : EncoderDecoder(graph, options), nameMap_(createNameMap()) { + Nematus(Ptr graph, Ptr options) : EncoderDecoder(graph, options), nameMap_(createNameMap(options)) { ABORT_IF(options_->get("enc-type") != "bidirectional", "--type nematus does not support other encoder " "type than bidirectional, use --type s2s"); @@ 
-25,178 +25,188 @@ class Nematus : public EncoderDecoder { "--dec-cell-high-depth > 1, use --type s2s"); } - void load(Ptr graph, - const std::string& name, - bool /*markReloaded*/ = true) override { - LOG(info, "Loading model from {}", name); - // load items from .npz file - auto ioItems = io::loadItems(name); - // map names and remove a dummy matrix 'decoder_c_tt' from items to avoid creating isolated node - for(auto it = ioItems.begin(); it != ioItems.end();) { - // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size - // @TODO: consider dropping support for Nematus models - if(it->shape.size() == 1) { - int dim = it->shape[-1]; - it->shape.resize(2); - it->shape.set(0, 1); - it->shape.set(1, dim); - } - - if(it->name == "decoder_c_tt") { - it = ioItems.erase(it); - } else if(it->name == "uidx") { - it = ioItems.erase(it); - } else if(it->name == "history_errs") { - it = ioItems.erase(it); - } else { - auto pair = nameMap_.find(it->name); - if(pair != nameMap_.end()) - it->name = pair->second; - it++; - } - } - // load items into the graph - graph->load(ioItems); + static void remapIoItems(std::vector& ioItems, Ptr options) { + remapIoItems(ioItems, createNameMap(options), options); } - void save(Ptr graph, - const std::string& name, - bool saveTranslatorConfig = false) override { - LOG(info, "Saving model to {}", name); - - // prepare reversed map - if(nameMapRev_.empty()) - for(const auto& kv : nameMap_) - nameMapRev_.insert({kv.second, kv.first}); - - // get parameters from the graph to items - std::vector ioItems; - graph->save(ioItems); - // replace names to be compatible with Nematus - for(auto& item : ioItems) { - auto newItemName = nameMapRev_.find(item.name); - if(newItemName != nameMapRev_.end()) - item.name = newItemName->second; - } - // add a dummy matrix 'decoder_c_tt' required for Amun and Nematus - ioItems.emplace_back(); - ioItems.back().name 
= "decoder_c_tt"; - ioItems.back().shape = Shape({1, 0}); - ioItems.back().bytes.emplace_back((char)0); - - io::addMetaToItems(getModelParametersAsString(), "special:model.yml", ioItems); - io::saveItems(name, ioItems); - - if(saveTranslatorConfig) { - createAmunConfig(name); - createDecoderConfig(name); + void load(Ptr graph, const std::string& name, bool /*markReloaded*/ = true) + override { + LOG(info, "Loading model from {}", name); + // load items from .npz file + auto ioItems = io::loadItems(name); + + // map names and remove a dummy matrices + remapIoItems(ioItems, nameMap_, options_); + + // load items into the graph + graph->load(ioItems); } - } -private: - std::map nameMap_; - std::map nameMapRev_; - - std::map createNameMap() { - std::map nameMap - = {{"decoder_U", "decoder_cell1_U"}, - {"decoder_Ux", "decoder_cell1_Ux"}, - {"decoder_W", "decoder_cell1_W"}, - {"decoder_Wx", "decoder_cell1_Wx"}, - {"decoder_b", "decoder_cell1_b"}, - {"decoder_bx", "decoder_cell1_bx"}, - {"decoder_U_nl", "decoder_cell2_U"}, - {"decoder_Ux_nl", "decoder_cell2_Ux"}, - {"decoder_Wc", "decoder_cell2_W"}, - {"decoder_Wcx", "decoder_cell2_Wx"}, - {"decoder_b_nl", "decoder_cell2_b"}, - {"decoder_bx_nl", "decoder_cell2_bx"}, - {"ff_logit_prev_W", "decoder_ff_logit_l1_W0"}, - {"ff_logit_lstm_W", "decoder_ff_logit_l1_W1"}, - {"ff_logit_ctx_W", "decoder_ff_logit_l1_W2"}, - {"ff_logit_prev_b", "decoder_ff_logit_l1_b0"}, - {"ff_logit_lstm_b", "decoder_ff_logit_l1_b1"}, - {"ff_logit_ctx_b", "decoder_ff_logit_l1_b2"}, - {"ff_logit_W", "decoder_ff_logit_l2_W"}, - {"ff_logit_b", "decoder_ff_logit_l2_b"}, - {"ff_state_W", "decoder_ff_state_W"}, - {"ff_state_b", "decoder_ff_state_b"}, - {"Wemb_dec", "decoder_Wemb"}, - {"Wemb", "encoder_Wemb"}, - {"encoder_U", "encoder_bi_U"}, - {"encoder_Ux", "encoder_bi_Ux"}, - {"encoder_W", "encoder_bi_W"}, - {"encoder_Wx", "encoder_bi_Wx"}, - {"encoder_b", "encoder_bi_b"}, - {"encoder_bx", "encoder_bi_bx"}, - {"encoder_r_U", "encoder_bi_r_U"}, - 
{"encoder_r_Ux", "encoder_bi_r_Ux"}, - {"encoder_r_W", "encoder_bi_r_W"}, - {"encoder_r_Wx", "encoder_bi_r_Wx"}, - {"encoder_r_b", "encoder_bi_r_b"}, - {"encoder_r_bx", "encoder_bi_r_bx"}, - {"ff_state_ln_s", "decoder_ff_state_ln_s"}, - {"ff_state_ln_b", "decoder_ff_state_ln_b"}, - {"ff_logit_prev_ln_s", "decoder_ff_logit_l1_ln_s0"}, - {"ff_logit_lstm_ln_s", "decoder_ff_logit_l1_ln_s1"}, - {"ff_logit_ctx_ln_s", "decoder_ff_logit_l1_ln_s2"}, - {"ff_logit_prev_ln_b", "decoder_ff_logit_l1_ln_b0"}, - {"ff_logit_lstm_ln_b", "decoder_ff_logit_l1_ln_b1"}, - {"ff_logit_ctx_ln_b", "decoder_ff_logit_l1_ln_b2"}}; - - // add mapping for deep encoder cells - std::vector suffixes = {"_U", "_Ux", "_b", "_bx"}; - for(int i = 1; i < options_->get("enc-cell-depth"); ++i) { - std::string num1 = std::to_string(i); - std::string num2 = std::to_string(i + 1); - for(auto suf : suffixes) { - nameMap.insert({"encoder" + suf + "_drt_" + num1, "encoder_bi_cell" + num2 + suf}); - nameMap.insert({"encoder_r" + suf + "_drt_" + num1, "encoder_bi_r_cell" + num2 + suf}); + void save( + Ptr graph, const std::string& name, bool saveTranslatorConfig = false) + override { + LOG(info, "Saving model to {}", name); + + // prepare reversed map + if(nameMapRev_.empty()) + for(const auto& kv : nameMap_) + nameMapRev_.insert({kv.second, kv.first}); + + // get parameters from the graph to items + std::vector ioItems; + graph->save(ioItems); + // replace names to be compatible with Nematus + for(auto& item : ioItems) { + auto newItemName = nameMapRev_.find(item.name); + if(newItemName != nameMapRev_.end()) + item.name = newItemName->second; + } + // add a dummy matrix 'decoder_c_tt' required for Amun and Nematus + ioItems.emplace_back(); + ioItems.back().name = "decoder_c_tt"; + ioItems.back().shape = Shape({1, 0}); + ioItems.back().bytes.emplace_back((char)0); + + io::addMetaToItems(getModelParametersAsString(), "special:model.yml", ioItems); + io::saveItems(name, ioItems); + + if(saveTranslatorConfig) { + 
createAmunConfig(name); + createDecoderConfig(name); } } - // add mapping for deep decoder cells - for(int i = 3; i <= options_->get("dec-cell-base-depth"); ++i) { - std::string num1 = std::to_string(i - 2); - std::string num2 = std::to_string(i); - for(auto suf : suffixes) - nameMap.insert({"decoder" + suf + "_nl_drt_" + num1, "decoder_cell" + num2 + suf}); - } - // add mapping for normalization layers - std::map nameMapCopy(nameMap); - for(auto& kv : nameMapCopy) { - std::string prefix = kv.first.substr(0, 7); - - if(prefix == "encoder" || prefix == "decoder") { - nameMap.insert({kv.first + "_lns", kv.second + "_lns"}); - nameMap.insert({kv.first + "_lnb", kv.second + "_lnb"}); + + private: + std::map nameMap_; + std::map nameMapRev_; + + static void remapIoItems(std::vector& ioItems, std::map nameMap, Ptr options) { + // map names and remove a dummy matrix 'decoder_c_tt' from items to avoid creating isolated node + for(auto it = ioItems.begin(); it != ioItems.end();) { + // for backwards compatibility, turn one-dimensional vector into two dimensional matrix with first dimension being 1 and second dimension of the original size + // @TODO: consider dropping support for Nematus models + if(it->shape.size() == 1) { + int dim = it->shape[-1]; + it->shape.resize(2); + it->shape.set(0, 1); + it->shape.set(1, dim); + } + + if(it->name == "decoder_c_tt") { + it = ioItems.erase(it); + } else if(it->name == "uidx") { + it = ioItems.erase(it); + } else if(it->name == "history_errs") { + it = ioItems.erase(it); + } else { + auto pair = nameMap.find(it->name); + if(pair != nameMap.end()) + it->name = pair->second; + it++; + } } } - return nameMap; - } + static std::map createNameMap(Ptr options) { + std::map nameMap + = {{"decoder_U", "decoder_cell1_U"}, + {"decoder_Ux", "decoder_cell1_Ux"}, + {"decoder_W", "decoder_cell1_W"}, + {"decoder_Wx", "decoder_cell1_Wx"}, + {"decoder_b", "decoder_cell1_b"}, + {"decoder_bx", "decoder_cell1_bx"}, + {"decoder_U_nl", "decoder_cell2_U"}, 
+ {"decoder_Ux_nl", "decoder_cell2_Ux"}, + {"decoder_Wc", "decoder_cell2_W"}, + {"decoder_Wcx", "decoder_cell2_Wx"}, + {"decoder_b_nl", "decoder_cell2_b"}, + {"decoder_bx_nl", "decoder_cell2_bx"}, + {"ff_logit_prev_W", "decoder_ff_logit_l1_W0"}, + {"ff_logit_lstm_W", "decoder_ff_logit_l1_W1"}, + {"ff_logit_ctx_W", "decoder_ff_logit_l1_W2"}, + {"ff_logit_prev_b", "decoder_ff_logit_l1_b0"}, + {"ff_logit_lstm_b", "decoder_ff_logit_l1_b1"}, + {"ff_logit_ctx_b", "decoder_ff_logit_l1_b2"}, + {"ff_logit_W", "decoder_ff_logit_l2_W"}, + {"ff_logit_b", "decoder_ff_logit_l2_b"}, + {"ff_state_W", "decoder_ff_state_W"}, + {"ff_state_b", "decoder_ff_state_b"}, + {"Wemb_dec", "decoder_Wemb"}, + {"Wemb", "encoder_Wemb"}, + {"encoder_U", "encoder_bi_U"}, + {"encoder_Ux", "encoder_bi_Ux"}, + {"encoder_W", "encoder_bi_W"}, + {"encoder_Wx", "encoder_bi_Wx"}, + {"encoder_b", "encoder_bi_b"}, + {"encoder_bx", "encoder_bi_bx"}, + {"encoder_r_U", "encoder_bi_r_U"}, + {"encoder_r_Ux", "encoder_bi_r_Ux"}, + {"encoder_r_W", "encoder_bi_r_W"}, + {"encoder_r_Wx", "encoder_bi_r_Wx"}, + {"encoder_r_b", "encoder_bi_r_b"}, + {"encoder_r_bx", "encoder_bi_r_bx"}, + {"ff_state_ln_s", "decoder_ff_state_ln_s"}, + {"ff_state_ln_b", "decoder_ff_state_ln_b"}, + {"ff_logit_prev_ln_s", "decoder_ff_logit_l1_ln_s0"}, + {"ff_logit_lstm_ln_s", "decoder_ff_logit_l1_ln_s1"}, + {"ff_logit_ctx_ln_s", "decoder_ff_logit_l1_ln_s2"}, + {"ff_logit_prev_ln_b", "decoder_ff_logit_l1_ln_b0"}, + {"ff_logit_lstm_ln_b", "decoder_ff_logit_l1_ln_b1"}, + {"ff_logit_ctx_ln_b", "decoder_ff_logit_l1_ln_b2"}}; + + // add mapping for deep encoder cells + std::vector suffixes = {"_U", "_Ux", "_b", "_bx"}; + for(int i = 1; i < options->get("enc-cell-depth"); ++i) { + std::string num1 = std::to_string(i); + std::string num2 = std::to_string(i + 1); + for(auto suf : suffixes) { + nameMap.insert({"encoder" + suf + "_drt_" + num1, "encoder_bi_cell" + num2 + suf}); + nameMap.insert({"encoder_r" + suf + "_drt_" + num1, "encoder_bi_r_cell" + 
num2 + suf}); + } + } + // add mapping for deep decoder cells + for(int i = 3; i <= options->get("dec-cell-base-depth"); ++i) { + std::string num1 = std::to_string(i - 2); + std::string num2 = std::to_string(i); + for(auto suf : suffixes) + nameMap.insert({"decoder" + suf + "_nl_drt_" + num1, "decoder_cell" + num2 + suf}); + } + // add mapping for normalization layers + std::map nameMapCopy(nameMap); + for(auto& kv : nameMapCopy) { + std::string prefix = kv.first.substr(0, 7); + + if(prefix == "encoder" || prefix == "decoder") { + nameMap.insert({kv.first + "_lns", kv.second + "_lns"}); + nameMap.insert({kv.first + "_lnb", kv.second + "_lnb"}); + } + } - void createAmunConfig(const std::string& name) { - Config::YamlNode amun; - // Amun has only CPU decoder for deep Nematus models - amun["cpu-threads"] = 16; - amun["gpu-threads"] = 0; - amun["maxi-batch"] = 1; - amun["mini-batch"] = 1; - - auto vocabs = options_->get>("vocabs"); - amun["source-vocab"] = vocabs[0]; - amun["target-vocab"] = vocabs[1]; - amun["devices"] = options_->get>("devices"); - amun["normalize"] = true; - amun["beam-size"] = 5; - amun["relative-paths"] = false; - - amun["scorers"]["F0"]["path"] = name; - amun["scorers"]["F0"]["type"] = "nematus2"; - amun["weights"]["F0"] = 1.0f; - - io::OutputFileStream out(name + ".amun.yml"); - out << amun; - } -}; + return nameMap; + } + + void createAmunConfig(const std::string& name) { + Config::YamlNode amun; + // Amun has only CPU decoder for deep Nematus models + amun["cpu-threads"] = 16; + amun["gpu-threads"] = 0; + amun["maxi-batch"] = 1; + amun["mini-batch"] = 1; + + auto vocabs = options_->get>("vocabs"); + amun["source-vocab"] = vocabs[0]; + amun["target-vocab"] = vocabs[1]; + amun["devices"] = options_->get>("devices"); + amun["normalize"] = true; + amun["beam-size"] = 5; + amun["relative-paths"] = false; + + amun["scorers"]["F0"]["path"] = name; + amun["scorers"]["F0"]["type"] = "nematus2"; + amun["weights"]["F0"] = 1.0f; + + io::OutputFileStream 
out(name + ".amun.yml"); + out << amun; + } + }; } // namespace marian diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 56d3dd990..144d532b5 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -1,15 +1,16 @@ -#include "marian.h" #include "translator/swappable.h" +#include +#include "common/io.h" #include "common/logging.h" +#include "common/timer.h" #include "data/corpus.h" #include "data/text_input.h" +#include "marian.h" +#include "models/amun.h" +#include "models/nematus.h" +#include "tensors/gpu/swap.h" #include "translator/beam_search.h" #include "translator/translator.h" -#include "common/io.h" -#include "common/timer.h" -#include -#include "tensors/gpu/swap.h" -#include "models/amun.h" namespace marian { @@ -228,11 +229,12 @@ Histories GPULoadedModel::Translate(const Ptr batch) { CPULoadedModel::CPULoadedModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath) : parameters_(io::loadItems(parameters)) { - // Load parameters. 
//Remap the parameter names if the model uses an older naming convention if (options->get("type") == "amun") { bool tied = options->get("tied-embeddings-src") || options->get("tied-embeddings-all"); Amun::remapIoItems(parameters_, tied); + } else if (options->get("type") == "nematus") { + Nematus::remapIoItems(parameters_, options); } // Find the special element and remove it: From 3f9c088eeb35a5f983694cace68737e504b41dfe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 24 Sep 2021 15:41:11 +0300 Subject: [PATCH 080/135] Work around a crash in amun model loading --- src/translator/swappable.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 144d532b5..5a4555955 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -198,7 +198,9 @@ void GPULoadedModel::Load(const CPULoadedModel &from) { for(size_t i = 0; i < parameters_.size(); ++i) { // Sanity check - if (names_[i] != fromParams[i].name || parameters_[i]->size() != fromParams[i].size()) + // Not sure if that's ok, but we don't check for size equality because for + // some reason the target memory location sometimes can be bigger + if (names_[i] != fromParams[i].name || parameters_[i]->size() < fromParams[i].size()) printParamsAndExit(); swapper::copyCpuToGpu(reinterpret_cast(parameters_[i]->data()), From d4ba1fa53a2751ebe4f025988c88286c179019d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 26 Oct 2021 15:33:52 +0300 Subject: [PATCH 081/135] Don't crash when training sets not provided Happens in server mode for self-adaptive translation --- src/training/scheduler.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 9d2500f92..12bb7b69d 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -510,9 +510,14 @@ class Scheduler : public 
TrainingObserver { void actAfterEpoch(TrainingState& state) override { // stop if data streaming from STDIN is stopped for a TSV input - std::string firstPath = options_->get>("train-sets")[0]; - if(options_->get("tsv", false) && (firstPath == "stdin" || firstPath == "-")) - endOfStdin_ = true; + if (options_->has("training-sets")) { + auto trainingSets = options_->get>("train-sets"); + if (trainingSets.size() > 0) { + std::string firstPath = options_->get>("train-sets")[0]; + if(options_->get("tsv", false) && (firstPath == "stdin" || firstPath == "-")) + endOfStdin_ = true; + } + } float factor = options_->get("lr-decay"); From 24e8fc3ead8f738046e9e6aa754163f6cee1bd2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 26 Oct 2021 15:57:30 +0300 Subject: [PATCH 082/135] Copy over the self-adaptive server example script from an older commit It was left out during the reimplementation of the self-adaptive translation stuff to use the new "swappable" approach. --- scripts/self-adaptive/client_example.py | 51 +++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 scripts/self-adaptive/client_example.py diff --git a/scripts/self-adaptive/client_example.py b/scripts/self-adaptive/client_example.py new file mode 100644 index 000000000..6ef7757a7 --- /dev/null +++ b/scripts/self-adaptive/client_example.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +from __future__ import print_function, unicode_literals, division + +import sys +import time +import argparse +import json + +from websocket import create_connection + + +def translate(batch, port=8080): + ws = create_connection("ws://localhost:{}/translate".format(port)) + ws.send(batch) + result = ws.recv() + ws.close() + return result.rstrip() + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("-p", "--port", type=int, default=8080) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + + # List of input sentences 
separated by a new line character + inputs = "this is an example\nthe second sentence\nno context provided" + # For each input sentence a list of parallel sentences can be provided as a + # list of source and target sentences. + contexts = [ + # Source-side context for the first input sentence + ["this is a test\nthese are examples", + # Target-side context for the first input sentence + "das ist ein test\ndies sind Beispiele"], + # Only one example is given as a context for the second input sentence + ["the next sentence", + "der nächste Satz"], + # No context for the third input sentence + [] + ] + + input_data = {'input': inputs, 'context': contexts} + input_json = json.dumps(input_data) + + output_json = translate(input_json, port=args.port) + output_data = json.loads(output_json) + print(output_data['output']) From d68fd733ef3e37fa8ce7ceaf9aa11e21bc603825 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 26 Oct 2021 16:01:53 +0300 Subject: [PATCH 083/135] Clean up logging --- src/translator/self_adaptive.h | 4 ++-- src/translator/swappable.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index f129777f1..a6c23f533 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -150,14 +150,14 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { ++trainBegin; if(!trainSet.empty()) { - LOG(info, "# NEW TEST BATCH"); + LOG(info, "Got {} context sentences", trainSet.size()); trainSlot_->SetModel(cpuModel_); trainSlot_->Train(trainSet); translateSlot_->PointToParams(*trainSlot_); translate(testBatch, collector, printer); needsSwitching_ = true; } else { - LOG(info, "# EMPTY TEST BATCH"); + LOG(info, "No context"); if(needsSwitching_) { translateSlot_->Load(*cpuModel_); needsSwitching_ = false; diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 5a4555955..2fda0e6c6 100644 --- 
a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -91,12 +91,12 @@ void SwappableModelTrainer::Train(const std::vector &input) { while(scheduler->keepGoing()) { batchGenerator.prepare(); - LOG(info, "## NEW BATCHES"); + // LOG(info, "## NEW BATCHES"); for(auto&& batch : batchGenerator) { if(!scheduler->keepGoing()) break; - LOG(info, "### NEW BATCH"); + // LOG(info, "### NEW BATCH"); // Make an update step on the copy of the model auto lossNode = engine_->builder_->build(engine_->graph_, batch); engine_->graph_->forward(); From 324f69a1e14a5a6211d1e7d145398cf5d74573d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 26 Oct 2021 16:41:37 +0300 Subject: [PATCH 084/135] Remove a config option for swappable stuff that isn't used any more --- src/common/config_parser.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 26a4d6601..7d9163eff 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -727,8 +727,6 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { cli.add>("--output-approx-knn", "Use approximate knn search in output layer (currently only in transformer)") ->implicit_val("100 1024"); - cli.add("--swap-model", - "Path to model to swap to."); #if 0 // @TODO: Ask Hany if there are any decoding-time options // add ULR settings if(mode_ != cli::mode::selfadaptive) From 7f430741dafee1d5ba62aa03796e146d58d5ffb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 27 Oct 2021 11:49:41 +0300 Subject: [PATCH 085/135] Disable early stopping for self-adaptive training Fixes a crash due to the early-stopping-on option being required after the merge --- src/common/config_parser.cpp | 10 ---------- src/common/config_parser.h | 1 - src/translator/self_adaptive.h | 3 +++ 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp 
index 3de2caf95..937a86cca 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -98,7 +98,6 @@ ConfigParser::ConfigParser(cli::mode mode) addOptionsTraining(cli_); addOptionsTranslation(cli_); addOptionsServer(cli_); - addOptionsStupid(cli_); break; default: ABORT("wrong CLI mode"); @@ -109,15 +108,6 @@ ConfigParser::ConfigParser(cli::mode mode) // clang-format on } -void ConfigParser::addOptionsStupid(cli::CLIWrapper & cli) { - auto previous_group = cli.switchGroup("Server options"); - cli.add( - "--early-stopping", - "Stop if the first validation metric does not improve for arg consecutive validation steps", - 10); - cli.switchGroup(previous_group); -} - void ConfigParser::addOptionsGeneral(cli::CLIWrapper & cli) { int defaultWorkspace = (mode_ == cli::mode::translation) ? 512 : 2048; diff --git a/src/common/config_parser.h b/src/common/config_parser.h index 744656458..b6b825d7d 100644 --- a/src/common/config_parser.h +++ b/src/common/config_parser.h @@ -130,7 +130,6 @@ class ConfigParser { void addOptionsTranslation(cli::CLIWrapper&); void addOptionsScoring(cli::CLIWrapper&); void addOptionsEmbedding(cli::CLIWrapper&); - void addOptionsStupid(cli::CLIWrapper&); void addAliases(cli::CLIWrapper&); diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index a6c23f533..c26e3a8b5 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -26,6 +26,9 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { public: TrainSelfAdaptive(Ptr options) : options_(options) { options_->set("shuffle", "none"); + // Disable early stopping because typically training would happen for only a few iterations and + // and also it doesn't make much sense to run the validation metrics on the validation dataset here + options_->set("early-stopping", 0); // Set up translator options optionsTrans_ = New(options_->clone()); // We will only ever translate a single sentence at a time because dynamic From 
d7676bd6ef6cc7286083529074edc096b3957e28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 27 Oct 2021 15:16:37 +0300 Subject: [PATCH 086/135] Forgot to remove a file that was used for debugging --- src/CMakeLists.txt | 5 -- src/command/bug_repro.cpp | 120 -------------------------------------- 2 files changed, 125 deletions(-) delete mode 100644 src/command/bug_repro.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f5a6b2ee8..282d87be0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -270,11 +270,6 @@ if (NOT COMPILE_LIBRARY_ONLY) add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz") add_custom_target(philly DEPENDS marian_tgz marian_zip) - add_executable(bug_repro command/bug_repro.cpp) - set_target_properties(bug_repro PROPERTIES OUTPUT_NAME bug_repro) - target_compile_options(bug_repro PRIVATE ${ALL_WARNINGS} -Wno-suggest-override) - set(EXECUTABLES ${EXECUTABLES} bug_repro) - if(COMPILE_SERVER) add_executable(marian_server command/marian_server.cpp) set_target_properties(marian_server PROPERTIES OUTPUT_NAME marian-server) diff --git a/src/command/bug_repro.cpp b/src/command/bug_repro.cpp deleted file mode 100644 index 86464ff77..000000000 --- a/src/command/bug_repro.cpp +++ /dev/null @@ -1,120 +0,0 @@ -#include "../common/config_parser.h" -#include "../common/options.h" -#include "../data/text_input.h" -#include "../models/model_factory.h" -#include "../models/model_task.h" -#include "../training/scheduler.h" -#include "marian.h" - -namespace marian { - -class ReproTask : public marian::ModelTask { -public: - ReproTask() { - } - void run() override { - auto parser = ConfigParser(cli::mode::training); - // i'm prob leaking memory at the end of run() but i don't care - const char* argseasy[] - = {"marian", - "-c", - "/home/rihards/exp/marian-adaptive-crash-repro/models/model.npz.repro.yml", - "-t", "dummy-value", "-t", "dummy-value", - "--after-batches", "20", - "--after-epochs", "4", - 
"--learn-rate", "0.1", - "--shuffle", "none", - "--mini-batch", "1"}; - int argc = sizeof(argseasy) / sizeof(char*); - // this is as close as i could get to initializing a char** in a sane manner - char** args = new char*[argc]; - for (int i = 0; i < argc; i++) { - args[i] = strdup(argseasy[i]); - } - auto options = parser.parseOptions(argc, args, false); - - // auto builder = models::createCriterionFunctionFromOptions(options, models::usage::training); - auto optimizer = Optimizer(New("optimizer", "adam", "learn-rate", 0.01)); - - std::vector vocabPaths - = {"/home/rihards/exp/marian-adaptive-crash-repro/models/train.1-to-1.bpe.en-lv.yml", - "/home/rihards/exp/marian-adaptive-crash-repro/models/train.1-to-1.bpe.en-lv.yml"}; - std::vector maxVocabs = {500, 500}; - - std::vector> vocabs; - for(size_t i = 0; i < vocabPaths.size(); i++) { - Ptr vocab = New(options, i); - vocab->load(vocabPaths[i], maxVocabs[i]); - vocabs.emplace_back(vocab); - } - std::string sources = "del@@ e@@ tions affecting 13 q 14 are also the most frequent structural genetic ab@@ " - "err@@ ations in chronic lym@@ pho@@ cy@@ tic leu@@ ka@@ emia ( C@@ ll ) 6,@@ 7 , 8 " - ".\nthis region is found to be heter@@ oz@@ y@@ g@@ ously deleted in 30 ¬ 60 % and hom@@ " - "oz@@ y@@ g@@ ously deleted in 10 ¬ 20 % of C@@ ll patien@@ ts@@ 9 ."; - std::string targets - = "del@@ ē@@ cijas , kas ietekmē 13 q 14 , arī ir visbiežāk sastopa@@ mās strukturālās " - "ģenē@@ tiskās ab@@ er@@ ācijas hron@@ iskā lim@@ foc@@ ī@@ tiskajā leik@@ ēm@@ ijā ( " - "H@@ LL ) 6,@@ 7 , 8 .\n30 –@@ 60 % H@@ LL pacientu ir konstatēta šī reģiona heter@@ " - "oz@@ ig@@ ota del@@ ē@@ cija , savukārt 10 –@@ 20 % H@@ LL pacientu ir konstatēta šī " - "reģiona hom@@ oz@@ ig@@ ota del@@ ē@@ c@@ ij@@ a@@ 9 ."; - // auto inputs = New(std::vector({sources, targets}), vocabs, options); - // auto batches = New>(inputs, options); - - for(size_t i = 0; i < 10; i++) { - LOG(info, "# NEW OUTER ITER"); - auto builder = 
models::createCriterionFunctionFromOptions(options, models::usage::training); - auto state = New(options->get("learn-rate")); - auto scheduler = New(options, state); - scheduler->registerTrainingObserver(scheduler); - scheduler->registerTrainingObserver(optimizer); - - Ptr graph; - - bool first = true; - scheduler->started(); - - graph = New(); - graph->setDevice({0, DeviceType::cpu}); - graph->reserveWorkspaceMB(128); - while(scheduler->keepGoing()) { - LOG(info, "## NEW INNER ITER"); - // if inputs aren't initialized for each epoch, their internal istringstreams get exhausted - auto inputs - = New(std::vector({sources, targets}), vocabs, options); - auto batches = New>(inputs, options); - // auto batches = New>(inputs, options); - batches->prepare(); - - for(auto batch : *batches) { - LOG(info, "### NEW BATCH"); - if(!scheduler->keepGoing()) { - break; - } - - auto lossNode = builder->build(graph, batch); - if (first) { - graph->graphviz("graph-" + std::to_string(i) + ".gv"); - first = false; - } - graph->forward(); - StaticLoss loss = *lossNode; - graph->backward(); - - optimizer->update(graph, 1); - scheduler->update(loss, batch); - } - - if(scheduler->keepGoing()) - scheduler->increaseEpoch(); - } - scheduler->finished(); - } - } -}; -} - -int main(int argc, char **argv) { - auto task = marian::ReproTask(); - task.run(); - return 0; -} From e48e737aafa3c2cfbc8f77961327de833b013b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 27 Oct 2021 15:34:23 +0300 Subject: [PATCH 087/135] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e0b853144..915ef2560 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added +- Adds a `marian-adaptive` executable to enable self-adaptive translation (a.k.a, runtime domain adaptation). 
- Adds option --add-lsh to marian-conv which allows the LSH to be memory-mapped. - Early stopping based on first, all, or any validation metrics via `--early-stopping-on` - Compute 8.6 support if using CUDA>=11.1 From 017b6c1a90d7ced1d3b9dd9f245466fb72674115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 28 Oct 2021 14:50:39 +0300 Subject: [PATCH 088/135] Fix CPU-only compilation --- src/translator/swappable.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/translator/swappable.cpp b/src/translator/swappable.cpp index 2fda0e6c6..35c3cb3f1 100644 --- a/src/translator/swappable.cpp +++ b/src/translator/swappable.cpp @@ -25,7 +25,9 @@ namespace { // For debugging memory void get(std::vector &out, MemoryPiece::PtrType mem, Ptr backend) { out.resize(mem->size()); +#ifdef CUDA_FOUND gpu::copy(backend, mem->data(), mem->data() + mem->size(), out.data()); +#endif } GPUEngineTrain::GPUEngineTrain(Ptr options, size_t deviceIdx) From 1257a4540c078d5cc9f5829383681b45ab6a1da9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 28 Oct 2021 18:17:50 +0300 Subject: [PATCH 089/135] Add a virtual destructor to CollectorBase To fix a compilation error on MacOS --- src/translator/output_collector.cpp | 2 ++ src/translator/output_collector.h | 1 + 2 files changed, 3 insertions(+) diff --git a/src/translator/output_collector.cpp b/src/translator/output_collector.cpp index b74a5a54c..eec63ff8a 100644 --- a/src/translator/output_collector.cpp +++ b/src/translator/output_collector.cpp @@ -6,6 +6,8 @@ namespace marian { +CollectorBase::~CollectorBase(){}; + OutputCollector::OutputCollector() : nextId_(0), printing_(new DefaultPrinting()) {} diff --git a/src/translator/output_collector.h b/src/translator/output_collector.h index 4b0c48f13..106ecbf26 100644 --- a/src/translator/output_collector.h +++ b/src/translator/output_collector.h @@ -45,6 +45,7 @@ class GeometricPrinting : public PrintingStrategy { }; struct 
CollectorBase { + virtual ~CollectorBase() = 0; virtual void Write(long sourceId, const std::string& best1, const std::string& bestn, bool nbest) = 0; }; From 96115c8e658ba332b1b5bafe65ef1cc75824cfba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 12:51:25 +0200 Subject: [PATCH 090/135] Fix casing in the `COMPILE_ADAPTIVE` cmake option's description Co-authored-by: Roman Grundkiewicz --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f42b3d0b2..a7665874a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ option(COMPILE_CPU "Compile CPU version" ON) option(COMPILE_CUDA "Compile GPU version" ON) option(COMPILE_EXAMPLES "Compile examples" OFF) option(COMPILE_SERVER "Compile marian-server" OFF) -option(COMPILE_ADAPTIVE "Compile marian-ADAPTIVE" OFF) +option(COMPILE_ADAPTIVE "Compile marian-adaptive" OFF) option(COMPILE_TESTS "Compile tests" OFF) option(USE_APPLE_ACCELERATE "Compile with Apple Accelerate" OFF) option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) From ba61acd758723cf8f4730389f32928435eef2a9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 14:40:09 +0200 Subject: [PATCH 091/135] Split out marian-adaptive server mode into a separate executable --- CMakeLists.txt | 9 +++--- src/CMakeLists.txt | 6 ++++ src/command/marian_adaptive.cpp | 53 ++------------------------------- src/common/config_parser.cpp | 15 ++++------ src/common/config_parser.h | 2 +- 5 files changed, 20 insertions(+), 65 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a7665874a..6e53f6d12 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ option(COMPILE_CPU "Compile CPU version" ON) option(COMPILE_CUDA "Compile GPU version" ON) option(COMPILE_EXAMPLES "Compile examples" OFF) option(COMPILE_SERVER "Compile marian-server" OFF) -option(COMPILE_ADAPTIVE "Compile 
marian-adaptive" OFF) +option(COMPILE_ADAPTIVE "Compile marian-adaptive. Set COMPILE_SERVER=ON to enable the server mode." OFF) option(COMPILE_TESTS "Compile tests" OFF) option(USE_APPLE_ACCELERATE "Compile with Apple Accelerate" OFF) option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) @@ -541,7 +541,7 @@ endif(COMPILE_CPU) ############################################################################### # Find OpenSSL set(BOOST_COMPONENTS "") -if(COMPILE_SERVER OR COMPILE_ADAPTIVE) +if(COMPILE_SERVER) find_package(OpenSSL) if(OpenSSL_FOUND) message(STATUS "Found OpenSSL") @@ -556,11 +556,10 @@ if(COMPILE_SERVER OR COMPILE_ADAPTIVE) endif() set(BOOST_COMPONENTS ${BOOST_COMPONENTS} system) else(OpenSSL_FOUND) - message(WARNING "Cannot find OpenSSL library. Not compiling server or marian-adaptive.") + message(WARNING "Cannot find OpenSSL library. Not compiling server.") set(COMPILE_SERVER "off") - set(COMPILE_ADAPTIVE "off") endif(OpenSSL_FOUND) -endif(COMPILE_SERVER OR COMPILE_ADAPTIVE) +endif(COMPILE_SERVER) ############################################################################### # Undo static lib search and put non-static searches here: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 282d87be0..8a4bac9f1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -287,6 +287,12 @@ if (NOT COMPILE_LIBRARY_ONLY) add_executable(marian_adaptive command/marian_adaptive.cpp) set_target_properties(marian_adaptive PROPERTIES OUTPUT_NAME marian-adaptive) set(EXECUTABLES ${EXECUTABLES} marian_adaptive) + + if(COMPILE_SERVER) + add_executable(marian_adaptive_server command/marian_adaptive_server.cpp) + set_target_properties(marian_adaptive_server PROPERTIES OUTPUT_NAME marian-adaptive-server) + set(EXECUTABLES ${EXECUTABLES} marian_adaptive_server) + endif(COMPILE_SERVER) endif(COMPILE_ADAPTIVE) foreach(exec ${EXECUTABLES}) diff --git a/src/command/marian_adaptive.cpp b/src/command/marian_adaptive.cpp index 0f64a84ca..a21d04a7d 100644 --- 
a/src/command/marian_adaptive.cpp +++ b/src/command/marian_adaptive.cpp @@ -1,7 +1,5 @@ #include "marian.h" -#include "3rd_party/simple-websocket-server/server_ws.hpp" -#include "common/file_stream.h" #include "common/timer.h" #include "common/utils.h" #include "training/training.h" @@ -9,58 +7,13 @@ using namespace marian; -typedef SimpleWeb::SocketServer WSServer; - int main(int argc, char **argv) { auto options = parseOptions(argc, argv, cli::mode::selfadaptive); auto task = New(options); - if(options->has("port") && options->get("port") != 0) { - // Initialize web server - WSServer server; - server.config.port = options->get("port", 8080); - - auto &translate = server.endpoint["^/translate/?$"]; - - translate.on_message = [&task](Ptr connection, - Ptr message) { - auto sendStream = std::make_shared(); - - // Get input text - auto inputText = message->string(); - - // Translate - timer::Timer timer; - auto outputText = task->run(inputText); - LOG(info, "Best translation: {}", outputText); - *sendStream << outputText << std::endl; - LOG(info, "Translation took: {:.5f}s", timer.elapsed()); - - // Send translation back - connection->send(sendStream, [](const SimpleWeb::error_code &ec) { - if(ec) - LOG(error, "Error sending message: ({}) {}", ec.value(), ec.message()); - }); - }; - - // Error Codes for error code meanings - // http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html - translate.on_error = [](Ptr connection, const SimpleWeb::error_code &ec) { - LOG(error, "Connection error: ({}) {}", ec.value(), ec.message()); - }; - - // Start server thread - std::thread serverThread([&server]() { - LOG(info, "Server is listening on port {}", server.config.port); - server.start(); - }); - - serverThread.join(); - } else { - timer::Timer timer; - task->run(); - LOG(info, "Total time: {:.5f}s", timer.elapsed()); - } + timer::Timer timer; + task->run(); + LOG(info, "Total time: {:.5f}s", timer.elapsed()); return 0; } diff --git 
a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 937a86cca..e736c20f0 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -70,12 +70,13 @@ std::string const& ConfigParser::cmdLine() const { } ConfigParser::ConfigParser(cli::mode mode) - : cli_(config_,"Marian: Fast Neural Machine Translation in C++", - "General options", "", 40), - mode_(mode == cli::mode::server ? cli::mode::translation : mode) { + : cli_(config_, "Marian: Fast Neural Machine Translation in C++", "General options", "", 40), + mode_(mode == cli::mode::server + ? cli::mode::translation + : (mode == cli::mode::selfadaptiveServer ? cli::mode::selfadaptive : mode)) { addOptionsGeneral(cli_); - if (mode == cli::mode::server) + if (mode == cli::mode::server || mode == cli::mode::selfadaptiveServer) addOptionsServer(cli_); addOptionsModel(cli_); @@ -97,7 +98,6 @@ ConfigParser::ConfigParser(cli::mode mode) case cli::mode::selfadaptive: addOptionsTraining(cli_); addOptionsTranslation(cli_); - addOptionsServer(cli_); break; default: ABORT("wrong CLI mode"); @@ -165,10 +165,7 @@ void ConfigParser::addOptionsServer(cli::CLIWrapper& cli) { // clang-format off auto previous_group = cli.switchGroup("Server options"); // TODO why is this needed? - size_t defaultPort = mode_ == cli::mode::selfadaptive ? 
0 : 8080; - cli.add("--port,-p", - "Port number for web socket server", - defaultPort); + cli.add("--port,-p", "Port number for web socket server", 8080); cli.switchGroup(previous_group); // clang-format on } diff --git a/src/common/config_parser.h b/src/common/config_parser.h index b6b825d7d..5429f3d2c 100644 --- a/src/common/config_parser.h +++ b/src/common/config_parser.h @@ -14,7 +14,7 @@ namespace marian { namespace cli { -enum struct mode { training, translation, scoring, server, embedding, selfadaptive }; + enum struct mode { training, translation, scoring, server, embedding, selfadaptive, selfadaptiveServer }; } // namespace cli /** From 2e7e78f0c208a6660d545533ab3e0442578c5f92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 14:43:46 +0200 Subject: [PATCH 092/135] Remove marian-adaptive from the .zip and .tgz targets It was an oversight to include them there in a previous commit. As the comment suggests, the targets are for some MS internal needs. 
--- src/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8a4bac9f1..85747787e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -253,7 +253,6 @@ if (NOT COMPILE_LIBRARY_ONLY) "${CMAKE_BINARY_DIR}/marian-scorer" "${CMAKE_BINARY_DIR}/marian-vocab" "${CMAKE_BINARY_DIR}/marian-conv" - "${CMAKE_BINARY_DIR}/marian-adaptive" DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv) add_custom_target(marian_zip DEPENDS "${CMAKE_BINARY_DIR}/marian.zip") @@ -265,7 +264,6 @@ if (NOT COMPILE_LIBRARY_ONLY) "marian-scorer" "marian-vocab" "marian-conv" - "marian-adaptive" DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv) add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz") add_custom_target(philly DEPENDS marian_tgz marian_zip) From 0084a3ad00a8f9bb4081bc4ff3916ae6d81f1d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 14:47:40 +0200 Subject: [PATCH 093/135] Remove a comment that was made obsolete by the grandparent commit (ba61acd7) --- src/common/config_parser.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index e736c20f0..fbcaf6ed9 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -164,7 +164,6 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper & cli) { void ConfigParser::addOptionsServer(cli::CLIWrapper& cli) { // clang-format off auto previous_group = cli.switchGroup("Server options"); - // TODO why is this needed?
cli.add("--port,-p", "Port number for web socket server", 8080); cli.switchGroup(previous_group); // clang-format on From 30c040042eda425c1148c7847c58e6d34b5fc1f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 14:54:52 +0200 Subject: [PATCH 094/135] Change the defaultDispFreq option to use an unsigned value At least I think that's what this does Co-authored-by: Roman Grundkiewicz --- src/common/config_parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index fbcaf6ed9..a732de3f1 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -382,7 +382,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { // In self-adaptive mode users would typically want less updates to happen than in regular training size_t defaultAfterEpochs = (mode_ == cli::mode::selfadaptive) ? 2 : 0; - std::string defaultDispFreq = (mode_ == cli::mode::selfadaptive) ? "1" : "1000u"; + std::string defaultDispFreq = (mode_ == cli::mode::selfadaptive) ? "1u" : "1000u"; // @TODO: these should be re-defined as aliases for `--after` but the current frame work matches on value, so not doable.
cli.add("--after-epochs,-e", From 2fbb6ec57aab90bd5f24cc32988249bad9162e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 14:58:36 +0200 Subject: [PATCH 095/135] Fix indentation Co-authored-by: Roman Grundkiewicz --- src/common/config_parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index a732de3f1..6a30c7016 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -742,7 +742,7 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { cli.switchGroup(previous_group); // clang-format on - } +} void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { auto previous_group = cli.switchGroup("Scorer options"); From d09c021d627d3790250d9429134f4e295bd32ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 15:08:58 +0200 Subject: [PATCH 096/135] Fix indentation --- src/tensors/gpu/swap.cu | 14 ++++++++------ src/tensors/gpu/swap.h | 15 +++++++++------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/src/tensors/gpu/swap.cu b/src/tensors/gpu/swap.cu index c16a71614..1528f3860 100644 --- a/src/tensors/gpu/swap.cu +++ b/src/tensors/gpu/swap.cu @@ -4,10 +4,12 @@ void copyCpuToGpu(const char * in, char * gpuOut); void copyGpuToGpu(const char * in, char * gpuOut); namespace marian { - namespace swapper { - void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { - CUDA_CHECK(cudaSetDevice(deviceId.no)); - CUDA_CHECK(cudaMemcpy(gpuOut, in, count, cudaMemcpyHostToDevice)); - } - } +namespace swapper { + +void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { + CUDA_CHECK(cudaSetDevice(deviceId.no)); + CUDA_CHECK(cudaMemcpy(gpuOut, in, count, cudaMemcpyHostToDevice)); +} + +} } diff --git a/src/tensors/gpu/swap.h b/src/tensors/gpu/swap.h index a020c8827..9de46e9e9 100644 
--- a/src/tensors/gpu/swap.h +++ b/src/tensors/gpu/swap.h @@ -2,14 +2,17 @@ #include #include "common/definitions.h" #include "common/logging.h" + namespace marian { - namespace swapper { +namespace swapper { + #ifdef CUDA_FOUND - void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId); +void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId); #else - inline void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { - ABORT("Copy from CPU to GPU memory is only available with CUDA."); - } +inline void copyCpuToGpu(char * gpuOut, const char * in, size_t count, const marian::DeviceId& deviceId) { + ABORT("Copy from CPU to GPU memory is only available with CUDA."); +} #endif - } + +} } From 10d5bffdfb9d5a9ea45190f03456a3ceb55ae124 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 15:11:27 +0200 Subject: [PATCH 097/135] Remove @brief from doc comments Co-authored-by: Roman Grundkiewicz --- src/translator/self_adaptive.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index c26e3a8b5..29b06a88b 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -15,8 +15,7 @@ namespace marian { using namespace data; /** - * @breif Implementation of the self-adaptive translation mode. - * + * Implementation of the self-adaptive translation mode. 
* Self-adaptive translation means optionally using a set of context sentences * (e.g., provided by a translation memory), that are similar to the * translatable sentence, to train the model for a few iterations to fine-tune From d41d81bb7cea58620c2c4bb318a79a3c2a54576f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 15:12:52 +0200 Subject: [PATCH 098/135] Remove commented out debugging code Co-authored-by: Roman Grundkiewicz --- src/translator/self_adaptive.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 29b06a88b..bf5a0fb00 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -64,8 +64,6 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { * @return JSON-encoded translations */ std::string run(const std::string& json) override { - //LOG(warn, "REMOVEME Received Json:\n{}", json); - // Check if input is in JSON YAML::Node yaml = YAML::Load(json); if(!yaml["input"]) { From fde22269358fb939518689dff79d0ac8c8fbc035 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 15:13:29 +0200 Subject: [PATCH 099/135] Don't split the line here Co-authored-by: Roman Grundkiewicz --- src/translator/self_adaptive.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index bf5a0fb00..a05a29142 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -136,8 +136,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { template void adaptAndTranslate( - Ptr> - testBatches, + Ptr> testBatches, Iterator trainBegin, Iterator trainEnd, Ptr collector) { From e40758766096fd4a7d2fad9ca149d90406c46db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 15:13:48 +0200 Subject: [PATCH 100/135] Fix indentation Co-authored-by: Roman 
Grundkiewicz --- src/translator/self_adaptive.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index a05a29142..fe8fbe186 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -176,9 +176,9 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { std::stringstream bestn; printer->print(history, best1, bestn); collector->Write(history->getLineNum(), - best1.str(), - bestn.str(), - options_->get("n-best")); + best1.str(), + bestn.str(), + options_->get("n-best")); } } }; From b869f683b9e9d4a6c2dd92d9801b0b036ad4da8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 29 Nov 2021 15:47:07 +0200 Subject: [PATCH 101/135] Make it clear that validation options are disabled --- src/translator/self_adaptive.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index fe8fbe186..fbde213c3 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -25,8 +25,12 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { public: TrainSelfAdaptive(Ptr options) : options_(options) { options_->set("shuffle", "none"); - // Disable early stopping because typically training would happen for only a few iterations and - // and also it doesn't make much sense to run the validation metrics on the validation dataset here + // Validation options are disabled for self-adaptive marian because + // typically training would happen for only a few iterations and it seems to + // not make much sense to run validation metrics on the validation dataset + // then (especially if you care about translation performance). However, we + // have to manually set the early-stopping option as disabled because the + // scheduler crashes if it's not present. 
options_->set("early-stopping", 0); // Set up translator options optionsTrans_ = New(options_->clone()); From e04b82938274c8c55e54b3a9103b33d2c2c1556e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 30 Nov 2021 11:31:59 +0200 Subject: [PATCH 102/135] Delete the pad_model_vocabulary.py script It came with the swappable code but isn't really needed for self-adaptive marian --- scripts/contrib/pad_model_vocabulary.py | 52 ------------------------- 1 file changed, 52 deletions(-) delete mode 100755 scripts/contrib/pad_model_vocabulary.py diff --git a/scripts/contrib/pad_model_vocabulary.py b/scripts/contrib/pad_model_vocabulary.py deleted file mode 100755 index eca73e34a..000000000 --- a/scripts/contrib/pad_model_vocabulary.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -# Pads a Marian model's vocabulary to have greater size. The added tokens have -# zero probability. -# ./pad_model_vocabulary.py input.npz output.npz desired_vocab_size -# -# You'll also need to separately pad your vocabulary file like so: -# old=$(wc -l input.vocab |cut -d " " -f 1) -# (cat input.vocab; seq -f "" $((desired_vocab_size-old))) >output.vocab -# -# Warning: probably only works with shared vocabulary models. -import math -import numpy as np -import sys -import yaml - -# Amend the vocab size in a raw ["special:model.yml"] data from a Marian npz. -# Returns the raw data to use for ["special:model.yml"] -def substitute_vocab_config(raw, new_size): - print("Old yml: ", raw.tostring()) - raw_yaml = raw.tostring().decode("utf-8") - #Python yaml doesn't like null bytes. 
- if raw_yaml.endswith("\x00"): - raw_yaml = raw_yaml[:-1] - config = yaml.load(raw_yaml) - config['dim-vocabs'] = [new_size] * len(config['dim-vocabs']) - raw_yaml = yaml.dump(config) - if raw_yaml.endswith("\n"): - raw_yaml = raw_yaml[:-1] - raw_yaml += "\x00" - return np.array(bytearray(raw_yaml, 'utf-8')) - -if len(sys.argv) != 4: - print("Usage: " + sys.argv[0] + " input.npz output.npz desired_vocab_size") - sys.exit(1) - -resized_path = sys.argv[2] -new_size = int(sys.argv[3]) -old_model = np.load(sys.argv[1]) - -new_model = dict(old_model) -old_size = len(old_model["Wemb"]) -if old_size > new_size: - sys.stderr.write("New size is smaller than original. Cowardly refusing to clip vocab.\n") - sys.exit(2) -print("Before: ", new_model["decoder_ff_logit_out_b"].shape, new_model["Wemb"].shape) -bias = new_model["decoder_ff_logit_out_b"] -new_model["decoder_ff_logit_out_b"] = np.pad(bias, [(0,0),(0,new_size - bias.shape[1])], mode='constant', constant_values = -math.inf) -new_model["Wemb"] = np.pad(new_model["Wemb"], [(0,new_size - bias.shape[1]), (0,0)], mode='constant', constant_values = 0) -print("After: ", new_model["decoder_ff_logit_out_b"].shape, new_model["Wemb"].shape) -new_model["special:model.yml"] = substitute_vocab_config(new_model["special:model.yml"], new_size) -print("New yml: ", new_model["special:model.yml"].tostring()) -np.savez(resized_path, **new_model) From 939384bda8199cb6239ff2589052a90e93ee4c23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 30 Nov 2021 14:15:57 +0200 Subject: [PATCH 103/135] Comment on why data management options are disabled for self-adaptive marian --- src/common/config_parser.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 6a30c7016..c91edf268 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -415,6 +415,13 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { 
addSuboptionsTSV(cli); // data management options + // + // These options are disabled for self-adaptive translation because they seem + // to not make much sense in that context, except for --shuffle, because they + // deal with the storage of training data but in self-adaptive translation + // training data sets are small and they typically change for each input + // sentence. --shuffle isn't currently supported because we use `TextInput` + // for training data and shuffle is a no-op in that class. if (mode_ != cli::mode::selfadaptive) { cli.add("--shuffle", "How to shuffle input data (data: shuffles data and sorted batches; batches: " From 92aaeeadb2815823d85d7339746679eac5d35ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 30 Nov 2021 15:58:32 +0200 Subject: [PATCH 104/135] Explain the max-length-translate option; fix the default for max-lengt The default was wrong for self-adaptive translation --- src/common/config_parser.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index c91edf268..be18149ec 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -699,15 +699,13 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "Keep the output segmented into SentencePiece subwords"); #endif + // For self-adaptive translation these options are already added in + // `addOptionsTraining` if(mode_ != cli::mode::selfadaptive) { addSuboptionsInputLength(cli); addSuboptionsTSV(cli); addSuboptionsDevices(cli); addSuboptionsBatching(cli); - } else { - cli.add("--max-length-translate", - "Maximum input sentence length for translation", - 1000); } // for self-adaptive mode vocabs are already added via the training options @@ -937,13 +935,25 @@ void ConfigParser::addSuboptionsBatching(cli::CLIWrapper& cli) { } void ConfigParser::addSuboptionsInputLength(cli::CLIWrapper& cli) { - size_t defaultMaxLength = 
(mode_ == cli::mode::training) ? 50 : 1000; + size_t defaultMaxLength = + (mode_ == cli::mode::training || mode_ == cli::mode::selfadaptive) + ? 50 + : 1000; // clang-format off cli.add("--max-length", "Maximum length of a sentence in a training sentence pair", defaultMaxLength); cli.add("--max-length-crop", "Crop a sentence to max-length instead of omitting it if longer than max-length"); + // In self-adaptive translation, the user might want to be able to set + // different max lengths for training and translation. In that case, + // --max-length is assumed to be meant for training (as per the help message) + // and we add a --max-lenght-translate parameter for translation. + if (mode_ == cli::mode::selfadaptive) { + cli.add("--max-length-translate", + "Maximum input sentence length for translation", + 1000); + } // clang-format on } From 99553d59b7307a206d2a36356943fe6ebe3922ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 30 Nov 2021 16:01:54 +0200 Subject: [PATCH 105/135] Remove an obsolete comment --- src/common/config_parser.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index be18149ec..0da03706b 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -708,7 +708,6 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { addSuboptionsBatching(cli); } - // for self-adaptive mode vocabs are already added via the training options if(mode_ != cli::mode::selfadaptive) { cli.add("--fp16", "Shortcut for mixed precision inference with float16, corresponds to: --precision float16"); From 8aec3caa38b30d28c1bd34fa9cb8216639e4d362 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 30 Nov 2021 16:04:24 +0200 Subject: [PATCH 106/135] Remove excessive empty lines --- src/data/adaptive_context.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/data/adaptive_context.cpp 
b/src/data/adaptive_context.cpp index 9ac680e83..4e626ae88 100644 --- a/src/data/adaptive_context.cpp +++ b/src/data/adaptive_context.cpp @@ -3,7 +3,6 @@ namespace marian { namespace data { - AdaptiveContextIterator::AdaptiveContextIterator(AdaptiveContextReader* trainSetReader) : trainSetReader_(trainSetReader) { if(trainSetReader) { @@ -11,17 +10,14 @@ AdaptiveContextIterator::AdaptiveContextIterator(AdaptiveContextReader* trainSet } } - bool AdaptiveContextIterator::equal(const AdaptiveContextIterator& other) const { return other.trainSetReader_ == trainSetReader_; } - const std::vector& AdaptiveContextIterator::dereference() const { return currentSamples_; } - void AdaptiveContextIterator::increment() { // If the previous increment has exhausted the file, we must indicate that the we've reached // the iterator's end @@ -37,29 +33,23 @@ void AdaptiveContextIterator::increment() { } - - AdaptiveContextReader::AdaptiveContextReader(std::vector paths) { for(auto& path : paths) files_.emplace_back(new io::InputFileStream(path)); } - AdaptiveContextIterator AdaptiveContextReader::begin() { return AdaptiveContextIterator(this); } - AdaptiveContextIterator AdaptiveContextReader::end() { return AdaptiveContextIterator(nullptr); } - bool AdaptiveContextReader::eof() { return eof_; } - std::vector AdaptiveContextReader::getSamples() { // extracted lines for source and target corpora std::vector samples; @@ -102,7 +92,5 @@ std::vector AdaptiveContextReader::getSamples() { return samples; } - - } // namespace data } // namespace marian From 971e1dc8eb4ffc6859426a1dda7c49935a98f7ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 30 Nov 2021 16:11:00 +0200 Subject: [PATCH 107/135] Split some long lines --- src/translator/swappable.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/translator/swappable.h b/src/translator/swappable.h index af3cffa4c..ce4c28b60 100644 --- a/src/translator/swappable.h +++ 
b/src/translator/swappable.h @@ -51,8 +51,10 @@ class GPUEngineTrain { public: /** * @param options The marian options object - * @param deviceNum The index of the device you want to use for this slot. Note that this is not the deviceID but the index of the device in the - * array of supplied devices. Eg if you provide -d 0 3 5 and you want the Slot to run on GPU 3, you provide deviceNum=1. + * @param deviceNum The index of the device you want to use for this slot. + * Note that this is not the deviceID but the index of the device in the + * array of supplied devices. Eg if you provide -d 0 3 5 and you want the + * Slot to run on GPU 3, you provide deviceNum=1. */ explicit GPUEngineTrain(Ptr options, size_t deviceNum); @@ -124,8 +126,10 @@ class GPUEngineTranslate { public: /** * @param options The marian options object - * @param deviceNum The index of the device you want to use for this slot. Note that this is not the deviceID but the index of the device in the - * array of supplied devices. Eg if you provide -d 0 3 5 and you want the Slot to run on GPU 3, you provide deviceNum=1. + * @param deviceNum The index of the device you want to use for this slot. + * Note that this is not the deviceID but the index of the device in the + * array of supplied devices. Eg if you provide -d 0 3 5 and you want the + * Slot to run on GPU 3, you provide deviceNum=1. */ explicit GPUEngineTranslate(Ptr options, size_t deviceNum); @@ -181,7 +185,10 @@ class CPULoadedModel { public: // The parts of Options that relate to model and vocab are ignored. The // files provided will be loaded. 
- CPULoadedModel(Ptr options, const std::string ¶meters, const std::vector &sourceVocabPaths, const std::string &targetVocabPath); + CPULoadedModel(Ptr options, + const std::string ¶meters, + const std::vector &sourceVocabPaths, + const std::string &targetVocabPath); const std::vector &Parameters() const { return parameters_; } From f3a085c6ccd0c9646130989b871b0334901b9146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 1 Dec 2021 12:20:26 +0200 Subject: [PATCH 108/135] Forgot to add the marian_adaptive_server.cpp file to git --- src/command/marian_adaptive_server.cpp | 60 ++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 src/command/marian_adaptive_server.cpp diff --git a/src/command/marian_adaptive_server.cpp b/src/command/marian_adaptive_server.cpp new file mode 100644 index 000000000..26d6dee10 --- /dev/null +++ b/src/command/marian_adaptive_server.cpp @@ -0,0 +1,60 @@ +#include "marian.h" + +#include "3rd_party/simple-websocket-server/server_ws.hpp" +#include "common/file_stream.h" +#include "common/timer.h" +#include "common/utils.h" +#include "training/training.h" +#include "translator/self_adaptive.h" + +using namespace marian; + +typedef SimpleWeb::SocketServer WSServer; + +int main(int argc, char **argv) { + auto options = parseOptions(argc, argv, cli::mode::selfadaptiveServer); + auto task = New(options); + + // Initialize web server + WSServer server; + server.config.port = options->get("port", 8080); + + auto &translate = server.endpoint["^/translate/?$"]; + + translate.on_message = [&task](Ptr connection, + Ptr message) { + auto sendStream = std::make_shared(); + + // Get input text + auto inputText = message->string(); + + // Translate + timer::Timer timer; + auto outputText = task->run(inputText); + LOG(info, "Best translation: {}", outputText); + *sendStream << outputText << std::endl; + LOG(info, "Translation took: {:.5f}s", timer.elapsed()); + + // Send translation back + 
connection->send(sendStream, [](const SimpleWeb::error_code &ec) { + if(ec) + LOG(error, "Error sending message: ({}) {}", ec.value(), ec.message()); + }); + }; + + // Error Codes for error code meanings + // http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html + translate.on_error = [](Ptr connection, const SimpleWeb::error_code &ec) { + LOG(error, "Connection error: ({}) {}", ec.value(), ec.message()); + }; + + // Start server thread + std::thread serverThread([&server]() { + LOG(info, "Server is listening on port {}", server.config.port); + server.start(); + }); + + serverThread.join(); + + return 0; +} From bcbeb2d8f2fa306ed0a43aa87e7aa86cc93b062c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 2 Dec 2021 14:00:12 +0200 Subject: [PATCH 109/135] Document the toMemoryPieces method --- src/graph/parameters.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/graph/parameters.h b/src/graph/parameters.h index 8dd579af1..8aed11460 100644 --- a/src/graph/parameters.h +++ b/src/graph/parameters.h @@ -45,6 +45,11 @@ class Parameters { LOG(debug, "Destroyed parameter object of type {}", acceptedElementType_); } + /** + * @brief Retrieves the memory corresponding to the parameter values + * + * @return A vector of memorypieces each corresponding to a single parameter + */ std::vector toMemoryPieces() { std::vector res; res.reserve(params_.size()); From 2667ea90b5351fe0657799beb6d3c4ed4c492899 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 2 Dec 2021 17:13:39 +0200 Subject: [PATCH 110/135] Delete some more @briefs --- src/data/adaptive_context.h | 10 +++++----- src/graph/parameters.h | 4 ++-- src/translator/self_adaptive.h | 4 ++-- src/translator/swappable.h | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/data/adaptive_context.h b/src/data/adaptive_context.h index 80d2213da..dc7ebee5a 100644 --- a/src/data/adaptive_context.h +++ 
b/src/data/adaptive_context.h @@ -11,7 +11,7 @@ class AdaptiveContextReader; /** - * @brief An iterator for easier access of the context sentences produced by + * An iterator for easier access of the context sentences produced by * `AdaptiveContextReader::getSamples()` */ class AdaptiveContextIterator @@ -34,7 +34,7 @@ class AdaptiveContextIterator /** - * @brief Reads the context sentences, that are used for on-the-fly training in + * Reads the context sentences, that are used for on-the-fly training in * the self-adaptive translation mode, from files. */ class AdaptiveContextReader { @@ -45,7 +45,7 @@ class AdaptiveContextReader { public: /** - * @brief Initializes a new reader by supplying paths to the files with + * Initializes a new reader by supplying paths to the files with * context sentences * * @param paths paths to the input files. The input files contain @@ -58,7 +58,7 @@ class AdaptiveContextReader { AdaptiveContextReader(std::vector paths); /** - * @brief Returns an iterator over the sets of context sentences produced by + * Returns an iterator over the sets of context sentences produced by * `getSamples()` * * @return the beginning of the iterator. @@ -70,7 +70,7 @@ class AdaptiveContextReader { bool eof(); /** - * @brief Reads the next set of samples -- the context sentences -- for + * Reads the next set of samples -- the context sentences -- for * on-the-fly training in the self-adaptive translation mode. * * @details The input files contain newline-separated parallel sentence pairs diff --git a/src/graph/parameters.h b/src/graph/parameters.h index 8aed11460..e7f2efa19 100644 --- a/src/graph/parameters.h +++ b/src/graph/parameters.h @@ -21,7 +21,7 @@ class Parameters { protected: Type acceptedElementType_; // this parameter object only takes parameters of this type - /** @brief List of all parameter nodes of this expression graph. */ + /** List of all parameter nodes of this expression graph. 
*/ std::vector params_; std::unordered_map named_; @@ -46,7 +46,7 @@ class Parameters { } /** - * @brief Retrieves the memory corresponding to the parameter values + *01234 Retrieves the memory corresponding to the parameter values * * @return A vector of memorypieces each corresponding to a single parameter */ diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index fbde213c3..793c19c00 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -58,7 +58,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { } /** - * @brief Implementation for self-adaptive translation where data come from a + * Implementation for self-adaptive translation where data come from a * web request. * * @param json Input data in JSON. An "input" array of strings is expected to @@ -102,7 +102,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { } /** - * @brief Implementation for self-adaptive translation where inputs and + * Implementation for self-adaptive translation where inputs and * outputs are specified in CLI options. */ void run() override { diff --git a/src/translator/swappable.h b/src/translator/swappable.h index ce4c28b60..0ebda4389 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -62,7 +62,7 @@ class GPUEngineTrain { }; /** - * @brief Wraps a `GPUEngineTrain` and a `CPULoadedModel` and performs model + * Wraps a `GPUEngineTrain` and a `CPULoadedModel` and performs model * training. * * This class is created with self-adaptive translation in mind. Each invocation @@ -94,7 +94,7 @@ class SwappableModelTrainer { std::vector Parameters() const; /** - * @brief resets the training graph, reloads the model parameters and trains + * Resets the training graph, reloads the model parameters and trains * the model on the provided inputs. 
* * Intended to be used in the self-adaptive translation mode -- training is @@ -160,7 +160,7 @@ class GPULoadedModel { /// Overwrite this model with parameters from a different one. void Load(const CPULoadedModel &from); /** - * @brief Set the internal shared pointers to model parameters and + * Set the internal shared pointers to model parameters and * vocabularies to different ones * * The effect is similar to `Load()` but nothing is copied in the process. From 5b28786429fa709727e9d9bfe4b0b74c4be2e149 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 2 Dec 2021 17:26:58 +0200 Subject: [PATCH 111/135] Comment on a possibly missing "training-sets" option --- src/training/scheduler.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 6d1890116..5c91c477a 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -534,8 +534,11 @@ class Scheduler : public TrainingObserver { } void actAfterEpoch(TrainingState& state) override { - // stop if data streaming from STDIN is stopped for a TSV input + // When running self-adaptive marian in server mode the "training-sets" + // option isn't present because the training sentences are passed in via the + // request body if (options_->has("training-sets")) { + // Stop if data streaming from STDIN is stopped for a TSV input. 
auto trainingSets = options_->get>("train-sets"); if (trainingSets.size() > 0) { std::string firstPath = options_->get>("train-sets")[0]; From 097effa9a89572becd1f5c8e55357dba66d085ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 2 Dec 2021 17:38:15 +0200 Subject: [PATCH 112/135] Remove unneeded member variables and describe member var usage --- src/translator/self_adaptive.h | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 793c19c00..0a93d8061 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -51,10 +51,10 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { auto vocabPaths = options_->get>("vocabs"); std::vector srcVocabPaths(vocabPaths.begin(), vocabPaths.end() - 1); cpuModel_ = New(options_, modelFilename, srcVocabPaths, vocabPaths.back()); - translateEngine_ = New(optionsTrans_, 0); - translateSlot_ = New(translateEngine_); - trainEngine_ = New(options_, 0); - trainSlot_ = New(trainEngine_); + auto translateEngine = New(optionsTrans_, 0); + translateSlot_ = New(translateEngine); + auto trainEngine = New(options_, 0); + trainSlot_ = New(trainEngine); } /** @@ -129,14 +129,12 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { } private: - Ptr options_; // Options for training - Ptr optionsTrans_; // Options for translator - Ptr cpuModel_; - Ptr trainSlot_; - Ptr translateSlot_; - Ptr trainEngine_; - Ptr translateEngine_; - bool needsSwitching_ = true; + Ptr options_; // Options for training + Ptr optionsTrans_; // Options for translator + Ptr cpuModel_; // Holds model parameters and vocabularies + Ptr trainSlot_; // Performs model training + Ptr translateSlot_; // Performs translation with the model + bool needsSwitching_ = true; // Tracks whether translate slot's model needs to be reset template void adaptAndTranslate( From 
507f8ebd5f16c4bd15f06388245cf2bb3cc61d7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 3 Dec 2021 12:03:56 +0200 Subject: [PATCH 113/135] Document some methods --- src/graph/expression_graph.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 5c375da01..abec87b09 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -184,6 +184,10 @@ class ExpressionGraph : public std::enable_shared_from_this { kvParams.second->clear(); } + + /** + * Call `clear()` on each of the parameters in the graph + */ void clearParams() { for(auto kvParams : paramsByElementType_) kvParams.second->clear(); @@ -235,6 +239,10 @@ class ExpressionGraph : public std::enable_shared_from_this { namespace_ = newNamespace; } + /** + * Extract graph parameters into a named map. + * @return A map with parameter names are keys and corresponding graph elements as values + */ const std::unordered_map & getParamsNamedMap() const { if (paramsByElementType_.size() != 1) { ABORT("Expected exactly one parameter datatype, got", paramsByElementType_.size()); From d797c906e05adb4840228e3841f449a51cbb4db1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 3 Dec 2021 12:11:42 +0200 Subject: [PATCH 114/135] Don't suggest looking at commits because they'll get squashed --- src/translator/swappable.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/translator/swappable.h b/src/translator/swappable.h index 0ebda4389..e6db24280 100644 --- a/src/translator/swappable.h +++ b/src/translator/swappable.h @@ -12,8 +12,9 @@ * Originally this code was intended to allow multiple models to share a single * GPU for translation and be swapped into GPU memory only when needed. However, * parts of it, that weren't needed for self-adaptive translation, have been - * trimmed down since then. 
Look into the commit history if you want to revive - * this functionality. + * trimmed down since then. Look here + * https://github.com/kpu/marian-dev/blob/90e161fa9fcb3e3ba1467c76a10b1fc7f9390b6d/src/translator/swappable.h + * if you want to revive this functionality. */ #include "common/io.h" #include "data/vocab.h" From babf93d2904b1fe489dbccb994a4498ea75c06ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 3 Dec 2021 14:29:18 +0200 Subject: [PATCH 115/135] Add a comment on stdin handling in CorpusBase --- src/data/corpus_base.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp index f3b41f422..58a83a451 100644 --- a/src/data/corpus_base.cpp +++ b/src/data/corpus_base.cpp @@ -30,9 +30,9 @@ const SentenceTuple& CorpusIterator::dereference() const { return tup_; } -// These types of corpus constructors are used in in-training validators -// (only?), so do not load additional files for guided alignment or data -// weighting. +// These types of corpus constructors are used in in-training validators (only? +// (also in self-adaptive translation)), so do not load additional files for +// guided alignment or data weighting. 
CorpusBase::CorpusBase(const std::vector& paths, const std::vector>& vocabs, Ptr options, @@ -54,6 +54,8 @@ CorpusBase::CorpusBase(const std::vector& paths, } for(auto path : paths_) { + // This constructor is also used in self-adaptive translation and it needs + // support for reading translation inputs from stdin if(path == "stdin" || path == "-") files_.emplace_back(new std::istream(std::cin.rdbuf())); else { From 2d1ff231c5fb4b77f33c2a4100d080ba86fc1230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 3 Dec 2021 14:31:40 +0200 Subject: [PATCH 116/135] Fix a typo --- src/graph/expression_graph.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index abec87b09..1a54d6a89 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -241,7 +241,7 @@ class ExpressionGraph : public std::enable_shared_from_this { /** * Extract graph parameters into a named map. - * @return A map with parameter names are keys and corresponding graph elements as values + * @return A map with parameter names as keys and the corresponding graph elements as values */ const std::unordered_map & getParamsNamedMap() const { if (paramsByElementType_.size() != 1) { From 6955a9ae989d4bb16399086683222a1b212f3ef7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 3 Dec 2021 15:11:59 +0200 Subject: [PATCH 117/135] Document the `dropF0prefix` flag --- src/graph/expression_graph.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 1a54d6a89..9aeb18d2d 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -763,7 +763,15 @@ class ExpressionGraph : public std::enable_shared_from_this { bool getThrowNaN() { return throwNaN_; } public: - /** Load model (mainly parameter objects) from array of io::Items */ + /** + * Load model (mainly 
parameter objects) from array of io::Items + * + * @param dropF0prefix modify the `io::Item` names upon loading by removing + * "F0::" prefixes. "F*::" prefixes are used to distinguish parameters from + * different scorers in the translation graph. This option is used by + * self-adaptive translation to support loading these `io::Item`s for + * training. + */ void load(const std::vector& ioItems, bool markReloaded = true, bool dropF0prefix = false) { setReloaded(false); for(auto& item : ioItems) { From 20cde2077be001e8af20813bc2b79887727f6abe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 3 Dec 2021 15:43:41 +0200 Subject: [PATCH 118/135] Enable option validation for adaptive marian --- src/common/config_validator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index 916ae14d7..7ab07d7e6 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -38,9 +38,9 @@ void ConfigValidator::validateOptions(cli::mode mode) const { validateOptionsTraining(); break; case cli::mode::selfadaptive: - // validateOptionsTranslation(); - // validateOptionsParallelData(); - // validateOptionsTraining(); + validateOptionsTranslation(); + validateOptionsParallelData(); + validateOptionsTraining(); break; default: ABORT("wrong CLI mode"); From bbe5196f70f400014957a740ba5234a37f93b34d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 6 Dec 2021 14:19:55 +0200 Subject: [PATCH 119/135] Add usage instructions to the adaptive/client_example.py script --- scripts/self-adaptive/client_example.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/scripts/self-adaptive/client_example.py b/scripts/self-adaptive/client_example.py index 6ef7757a7..72473c315 100644 --- a/scripts/self-adaptive/client_example.py +++ b/scripts/self-adaptive/client_example.py @@ -1,5 +1,16 @@ #!/usr/bin/env python +# This is an 
example for using self-adaptive translation in server mode. # # To run: # 1. Start self-adaptive Marian in server mode, e.g.: # ./build/marian-adaptive-server -p 8080 -m model.npz -v vocab.yaml vocab.yaml \ # --after-batches 10 --after-epochs 10 --learn-rate 0.1 --mini-batch 15 # other options # 2. In a new shell, run this script: # python3 ./scripts/self-adaptive/client_example.py -p 8080 # # For a more extensive example, see https://github.com/marian-cef/marian-examples/tree/master/adaptive + from __future__ import print_function, unicode_literals, division import sys From 85d831f2555e0453b8d7fbcbafae08ad9c3a630b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 6 Dec 2021 14:37:37 +0200 Subject: [PATCH 120/135] Mention the tutorial repo as well --- scripts/self-adaptive/client_example.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/self-adaptive/client_example.py b/scripts/self-adaptive/client_example.py index 72473c315..e1fa52d37 100644 --- a/scripts/self-adaptive/client_example.py +++ b/scripts/self-adaptive/client_example.py @@ -10,6 +10,7 @@ # python3 ./scripts/self-adaptive/client_example.py -p 8080 # # For a more extensive example, see https://github.com/marian-cef/marian-examples/tree/master/adaptive +# or https://github.com/tilde-nlp/runtime-domain-adaptation-tutorial from __future__ import print_function, unicode_literals, division From 7bb887afb864191a6b341e6986610f9ecbd2aa77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Mon, 6 Dec 2021 14:42:13 +0200 Subject: [PATCH 121/135] Add punctuation for clarity --- src/common/config_parser.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 0da03706b..b845bbceb 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -418,10 +418,11 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { // // These options are 
disabled for self-adaptive translation because they seem // to not make much sense in that context, except for --shuffle, because they - // deal with the storage of training data but in self-adaptive translation + // deal with the storage of training data, but, in self-adaptive translation, // training data sets are small and they typically change for each input // sentence. --shuffle isn't currently supported because we use `TextInput` - // for training data and shuffle is a no-op in that class. + // for training data and shuffle is a no-op in that class. This might get + // implement the future. if (mode_ != cli::mode::selfadaptive) { cli.add("--shuffle", "How to shuffle input data (data: shuffles data and sorted batches; batches: " From 9f0307083ba446c1043429a6eb1eab1d01b56c50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 9 Dec 2021 11:37:03 +0200 Subject: [PATCH 122/135] Fix a typo in a comment Co-authored-by: Roman Grundkiewicz --- src/common/config_parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index b845bbceb..39b788511 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -422,7 +422,7 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { // training data sets are small and they typically change for each input // sentence. --shuffle isn't currently supported because we use `TextInput` // for training data and shuffle is a no-op in that class. This might get - // implement the future. + // implemented in the future. 
if (mode_ != cli::mode::selfadaptive) { cli.add("--shuffle", "How to shuffle input data (data: shuffles data and sorted batches; batches: " From 96615e7a4162c1a5696db69f0d757725b38ec864 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 9 Dec 2021 11:37:27 +0200 Subject: [PATCH 123/135] Fix a typo in a comment Co-authored-by: Roman Grundkiewicz --- src/common/config_parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 39b788511..41880bea8 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -948,7 +948,7 @@ void ConfigParser::addSuboptionsInputLength(cli::CLIWrapper& cli) { // In self-adaptive translation, the user might want to be able to set // different max lengths for training and translation. In that case, // --max-length is assumed to be meant for training (as per the help message) - // and we add a --max-lenght-translate parameter for translation. + // and we add a --max-length-translate parameter for translation. if (mode_ == cli::mode::selfadaptive) { cli.add("--max-length-translate", "Maximum input sentence length for translation", From d4a77bae17463a8cdd23ea2ecfbde5ee7251510e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 9 Dec 2021 11:38:16 +0200 Subject: [PATCH 124/135] Fix a typo in a comment Co-authored-by: Roman Grundkiewicz --- src/graph/parameters.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/graph/parameters.h b/src/graph/parameters.h index e7f2efa19..1d7808c92 100644 --- a/src/graph/parameters.h +++ b/src/graph/parameters.h @@ -46,8 +46,7 @@ class Parameters { } /** - *01234 Retrieves the memory corresponding to the parameter values - * + * Retrieves the memory corresponding to the parameter values. 
* @return A vector of memorypieces each corresponding to a single parameter */ std::vector toMemoryPieces() { From 379418b180ae677034ce33741d6bd5726ac7c989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 9 Dec 2021 11:51:50 +0200 Subject: [PATCH 125/135] Revert an added space Wasn't intentional --- src/common/config_parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 41880bea8..3bd16b8fa 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -108,7 +108,7 @@ ConfigParser::ConfigParser(cli::mode mode) // clang-format on } -void ConfigParser::addOptionsGeneral(cli::CLIWrapper & cli) { +void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { int defaultWorkspace = (mode_ == cli::mode::translation) ? 512 : 2048; cli.switchGroup("General options"); From 4bb6f5c6a9aca2020544f04ea8e8db3d90ecf8c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 9 Dec 2021 12:12:08 +0200 Subject: [PATCH 126/135] Clarify the server mode handling in ConfigParser --- src/common/config_parser.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 3bd16b8fa..26db51e45 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -69,12 +69,27 @@ std::string const& ConfigParser::cmdLine() const { return cmdLine_; } +/** + * Convert some special modes (currently, server-like modes) to their non-special counterparts. 
+ */ +cli::mode convertSpecialModes(cli::mode mode) { + switch(mode) { + case cli::mode::server: + return cli::mode::translation; + case cli::mode::selfadaptiveServer: + return cli::mode::selfadaptive; + default: + return mode; + } +} + ConfigParser::ConfigParser(cli::mode mode) : cli_(config_, "Marian: Fast Neural Machine Translation in C++", "General options", "", 40), - mode_(mode == cli::mode::server - ? cli::mode::translation - : (mode == cli::mode::selfadaptiveServer ? cli::mode::selfadaptive : mode)) { - + // Server-like modes should mostly act like their non-server counterparts + // when parsing options. We keep all special handling in the constructor + // but in the rest of the parsing code we just pretend that we have a + // non-server mode. + mode_(convertSpecialModes(mode)) { addOptionsGeneral(cli_); if (mode == cli::mode::server || mode == cli::mode::selfadaptiveServer) addOptionsServer(cli_); From c41a56b9ec01eba1ca4a7924312375698566bd34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 9 Dec 2021 18:11:04 +0200 Subject: [PATCH 127/135] Remove TSV options from self-adaptive translation --- src/common/config_parser.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 26db51e45..edfbb2140 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -427,7 +427,11 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { {"1e", "0"}); addSuboptionsInputLength(cli); - addSuboptionsTSV(cli); + // TSV inputs aren't currently supported for self-adaptive translation because + // self-adaptive translation uses a custom training data reader + // (`AdaptiveContextReader`) which doesn't yet support TSV. 
+ if (mode_ != cli::mode::selfadaptive) + addSuboptionsTSV(cli); // data management options // From 6c97f825017aa79f6492c171929b373f3a264009 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Fri, 10 Dec 2021 13:39:37 +0200 Subject: [PATCH 128/135] Share code between marian-server and marian-adaptive-server --- src/command/marian_adaptive_server.cpp | 57 ++---------------------- src/command/marian_server.cpp | 55 +---------------------- src/translator/server_common.h | 60 ++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 106 deletions(-) create mode 100644 src/translator/server_common.h diff --git a/src/command/marian_adaptive_server.cpp b/src/command/marian_adaptive_server.cpp index 26d6dee10..e2f03d999 100644 --- a/src/command/marian_adaptive_server.cpp +++ b/src/command/marian_adaptive_server.cpp @@ -1,60 +1,11 @@ -#include "marian.h" - -#include "3rd_party/simple-websocket-server/server_ws.hpp" -#include "common/file_stream.h" -#include "common/timer.h" -#include "common/utils.h" -#include "training/training.h" #include "translator/self_adaptive.h" - -using namespace marian; - -typedef SimpleWeb::SocketServer WSServer; +#include "translator/server_common.h" int main(int argc, char **argv) { + using namespace marian; + auto options = parseOptions(argc, argv, cli::mode::selfadaptiveServer); auto task = New(options); - // Initialize web server - WSServer server; - server.config.port = options->get("port", 8080); - - auto &translate = server.endpoint["^/translate/?$"]; - - translate.on_message = [&task](Ptr connection, - Ptr message) { - auto sendStream = std::make_shared(); - - // Get input text - auto inputText = message->string(); - - // Translate - timer::Timer timer; - auto outputText = task->run(inputText); - LOG(info, "Best translation: {}", outputText); - *sendStream << outputText << std::endl; - LOG(info, "Translation took: {:.5f}s", timer.elapsed()); - - // Send translation back - connection->send(sendStream, [](const 
SimpleWeb::error_code &ec) { - if(ec) - LOG(error, "Error sending message: ({}) {}", ec.value(), ec.message()); - }); - }; - - // Error Codes for error code meanings - // http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html - translate.on_error = [](Ptr connection, const SimpleWeb::error_code &ec) { - LOG(error, "Connection error: ({}) {}", ec.value(), ec.message()); - }; - - // Start server thread - std::thread serverThread([&server]() { - LOG(info, "Server is listening on port {}", server.config.port); - server.start(); - }); - - serverThread.join(); - - return 0; + return runServer(task, options); } diff --git a/src/command/marian_server.cpp b/src/command/marian_server.cpp index d712e8389..ef62320b8 100644 --- a/src/command/marian_server.cpp +++ b/src/command/marian_server.cpp @@ -1,62 +1,11 @@ -#include "marian.h" -#include "translator/beam_search.h" +#include "translator/server_common.h" #include "translator/translator.h" -#include "common/timer.h" -#include "common/utils.h" - -#include "3rd_party/simple-websocket-server/server_ws.hpp" - -typedef SimpleWeb::SocketServer WSServer; int main(int argc, char **argv) { using namespace marian; - // Initialize translation task auto options = parseOptions(argc, argv, cli::mode::server, true); auto task = New>(options); - auto quiet = options->get("quiet-translation"); - - // Initialize web server - WSServer server; - server.config.port = (short)options->get("port", 8080); - - auto &translate = server.endpoint["^/translate/?$"]; - - translate.on_message = [&task, quiet](Ptr connection, - Ptr message) { - // Get input text - auto inputText = message->string(); - auto sendStream = std::make_shared(); - - // Translate - timer::Timer timer; - auto outputText = task->run(inputText); - *sendStream << outputText << std::endl; - if(!quiet) - LOG(info, "Translation took: {:.5f}s", timer.elapsed()); - - // Send translation back - connection->send(sendStream, [](const SimpleWeb::error_code &ec) { - if(ec) - 
LOG(error, "Error sending message: ({}) {}", ec.value(), ec.message()); - }); - }; - - // Error Codes for error code meanings - // http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html - translate.on_error = [](Ptr /*connection*/, - const SimpleWeb::error_code &ec) { - LOG(error, "Connection error: ({}) {}", ec.value(), ec.message()); - }; - - // Start server thread - std::thread serverThread([&server]() { - server.start([](unsigned short port) { - LOG(info, "Server is listening on port {}", port); - }); - }); - - serverThread.join(); - return 0; + return runServer(task, options); } diff --git a/src/translator/server_common.h b/src/translator/server_common.h new file mode 100644 index 000000000..94cec33f4 --- /dev/null +++ b/src/translator/server_common.h @@ -0,0 +1,60 @@ +#include "marian.h" +#include "translator/beam_search.h" +#include "translator/translator.h" +#include "common/timer.h" +#include "common/utils.h" + +#include "3rd_party/simple-websocket-server/server_ws.hpp" + +typedef SimpleWeb::SocketServer WSServer; + +namespace marian { + +int runServer(Ptr task, Ptr options) { + auto quiet = options->get("quiet-translation"); + + // Initialize web server + WSServer server; + server.config.port = (short)options->get("port", 8080); + + auto &translate = server.endpoint["^/translate/?$"]; + + translate.on_message = [&task, quiet](Ptr connection, + Ptr message) { + // Get input text + auto inputText = message->string(); + auto sendStream = std::make_shared(); + + // Translate + timer::Timer timer; + auto outputText = task->run(inputText); + *sendStream << outputText << std::endl; + if(!quiet) + LOG(info, "Translation took: {:.5f}s", timer.elapsed()); + + // Send translation back + connection->send(sendStream, [](const SimpleWeb::error_code &ec) { + if(ec) + LOG(error, "Error sending message: ({}) {}", ec.value(), ec.message()); + }); + }; + + // Error Codes for error code meanings + // 
http://www.boost.org/doc/libs/1_55_0/doc/html/boost_asio/reference.html + translate.on_error = [](Ptr /*connection*/, + const SimpleWeb::error_code &ec) { + LOG(error, "Connection error: ({}) {}", ec.value(), ec.message()); + }; + + // Start server thread + std::thread serverThread([&server]() { + server.start([](unsigned short port) { + LOG(info, "Server is listening on port {}", port); + }); + }); + + serverThread.join(); + + return 0; +} +} // namespace marian From 88308a7e2026598f5e1fa75f6126ebdd28e4d265 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 15 Dec 2021 13:27:13 +0200 Subject: [PATCH 129/135] Don't require a "models" option for self-adaptive translation --- src/common/config_validator.cpp | 8 +++++++- src/common/config_validator.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index 7ab07d7e6..e8c192acd 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -38,7 +38,7 @@ void ConfigValidator::validateOptions(cli::mode mode) const { validateOptionsTraining(); break; case cli::mode::selfadaptive: - validateOptionsTranslation(); + validateOptionsVocabularies(); validateOptionsParallelData(); validateOptionsTraining(); break; @@ -64,6 +64,12 @@ void ConfigValidator::validateOptionsTranslation() const { ABORT_IF(!filesystem::exists(modelPath), "Model file does not exist: " + modelFile); } + validateOptionsVocabularies(); +} + +// Other validation methods already do vocabulary validation but we need this +// functionality separately for self-adaptive translation option validation +void ConfigValidator::validateOptionsVocabularies() const { auto vocabs = get>("vocabs"); ABORT_IF(vocabs.empty(), "Translating, but vocabularies are not given"); diff --git a/src/common/config_validator.h b/src/common/config_validator.h index 0e73a9e39..c16a62726 100644 --- a/src/common/config_validator.h +++ 
b/src/common/config_validator.h @@ -20,6 +20,7 @@ class ConfigValidator { bool dumpConfigOnly_{false}; void validateOptionsTranslation() const; + void validateOptionsVocabularies() const; void validateOptionsParallelData() const; void validateOptionsScoring() const; void validateOptionsTraining() const; From 08d20d5264bf9625f4606275fd0d7140bdb64e0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 16 Dec 2021 12:16:03 +0200 Subject: [PATCH 130/135] Fix crashes introduced by removing some options from self-adaptive marian --- src/common/config.cpp | 2 +- src/common/config_parser.cpp | 2 +- src/common/config_parser.h | 10 ++++++++++ src/common/config_validator.cpp | 22 +++++++++++++++------- src/common/config_validator.h | 9 +++++++++ 5 files changed, 36 insertions(+), 9 deletions(-) diff --git a/src/common/config.cpp b/src/common/config.cpp index 9878c70b0..3e03f8a6d 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -73,7 +73,7 @@ void Config::initialize(ConfigParser const& cp) { } // guess --tsv-fields, i.e. the number of fields in a TSV input, if not set - if(get("tsv") && get("tsv-fields") == 0) { + if(get("tsv", false) && get("tsv-fields") == 0) { size_t tsvFields = 0; // use the length of --input-types if given diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index edfbb2140..0e10eb2c9 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -1118,7 +1118,7 @@ Ptr ConfigParser::parseOptions(int argc, char** argv, bool doValidate) // (or --data-weighting and 'weight'). // // Note: this may modify the config, so it is safer to do it after --dump-config. 
- if(mode_ == cli::mode::training || get("tsv")) { + if(mode_ == cli::mode::training || get("tsv", false)) { auto inputTypes = get>("input-types"); if(!inputTypes.empty()) { bool seenAligns = false; diff --git a/src/common/config_parser.h b/src/common/config_parser.h index 5429f3d2c..b0b4f9386 100644 --- a/src/common/config_parser.h +++ b/src/common/config_parser.h @@ -122,6 +122,16 @@ class ConfigParser { return config_[key].as(); } + // Return value for given option key cast to given type. Return the supplied + // default value if option is not set. + template + T get(const std::string& key, T defaultValue) const { + if(has(key)) + return config_[key].as(); + else + return defaultValue; + } + void addOptionsGeneral(cli::CLIWrapper&); void addOptionsServer(cli::CLIWrapper&); void addOptionsModel(cli::CLIWrapper&); diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index e8c192acd..cc14bcb13 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -88,11 +88,14 @@ void ConfigValidator::validateOptionsParallelData() const { ABORT_IF(trainSets.empty(), "No train sets given in config file or on command line"); auto numVocabs = get>("vocabs").size(); - ABORT_IF(!get("tsv") && numVocabs > 0 && numVocabs != trainSets.size(), + // The "tsv" option isn't present in self-adaptive translation options so we + // have to explicitly default to false for the option + auto tsv = get("tsv", false); + ABORT_IF(!tsv && numVocabs > 0 && numVocabs != trainSets.size(), "There should be as many vocabularies as training files"); // disallow, for example --tsv --train-sets file1.tsv file2.tsv - ABORT_IF(get("tsv") && trainSets.size() != 1, + ABORT_IF(tsv && trainSets.size() != 1, "A single file must be provided with --train-sets (or stdin) for a tab-separated input"); // disallow, for example --train-sets stdin stdin or --train-sets stdin file.tsv @@ -134,7 +137,9 @@ void ConfigValidator::validateOptionsTraining() const { "Model 
directory does not exist"); std::string errorMsg = "There should be as many validation files as training files"; - if(get("tsv")) + // The "tsv" option isn't present in self-adaptive translation options so we + // have to explicitly default to false for the option + if(get("tsv", false)) errorMsg += ". If the training set is in the TSV format, validation sets have to also be a single TSV file"; ABORT_IF(has("valid-sets") @@ -142,10 +147,13 @@ void ConfigValidator::validateOptionsTraining() const { && !get>("valid-sets").empty(), errorMsg); - // check if --early-stopping-on has proper value - std::set supportedStops = {"first", "all", "any"}; - ABORT_IF(supportedStops.find(get("early-stopping-on")) == supportedStops.end(), - "Supported options for --early-stopping-on are: first, all, any"); + // "early-stopping" also isn't present for self-adaptive translation + if (has("early-stopping")) { + // check if --early-stopping-on has proper value + std::set supportedStops = {"first", "all", "any"}; + ABORT_IF(supportedStops.find(get("early-stopping-on")) == supportedStops.end(), + "Supported options for --early-stopping-on are: first, all, any"); + } // validations for learning rate decaying ABORT_IF(get("lr-decay") > 1.f, "Learning rate decay factor greater than 1.0 is unusual"); diff --git a/src/common/config_validator.h b/src/common/config_validator.h index c16a62726..e31188532 100644 --- a/src/common/config_validator.h +++ b/src/common/config_validator.h @@ -14,6 +14,15 @@ class ConfigValidator { T get(const std::string& key) const { return config_[key].as(); } + // Return value for given option key cast to given type. Return the supplied + // default value if option is not set. + template + T get(const std::string& key, T defaultValue) const { + if(has(key)) + return config_[key].as(); + else + return defaultValue; + } // The option --dump-config is used, so alleviate some constraints, e.g. 
we don't want to require // --train-sets or --vocabs From 1326bb1094c471d8ce23083606a95ee50db5a8c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Thu, 16 Dec 2021 16:03:34 +0200 Subject: [PATCH 131/135] Disable parallel data validation for self-adaptive server mode --- src/common/config_validator.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/common/config_validator.cpp b/src/common/config_validator.cpp index cc14bcb13..119525cb8 100644 --- a/src/common/config_validator.cpp +++ b/src/common/config_validator.cpp @@ -39,7 +39,10 @@ void ConfigValidator::validateOptions(cli::mode mode) const { break; case cli::mode::selfadaptive: validateOptionsVocabularies(); - validateOptionsParallelData(); + // Check that we're not running in server mode. In server mode, training + // data are passed in via the request not CLI options + if (!has("port")) + validateOptionsParallelData(); validateOptionsTraining(); break; default: From 56cfb374ca0310e61a29c4c26d3067b62cea38ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 28 Dec 2021 14:06:46 +0200 Subject: [PATCH 132/135] Introduce a separate workspace size option for the translation graph --- src/common/config_parser.cpp | 9 +++++++++ src/translator/self_adaptive.h | 2 ++ 2 files changed, 11 insertions(+) diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 0e10eb2c9..a20d69c77 100644 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -141,6 +141,15 @@ void ConfigParser::addOptionsGeneral(cli::CLIWrapper& cli) { cli.add("--workspace,-w", "Preallocate arg MB of work space", defaultWorkspace); + // Self-adaptive translation uses a training graph and a translation graph. We + // want to be able to prealocate different amounts of memory for both (because + // translation usually needs less) so we add a dedicated opiton for + // translation if self-adaptive translation is used. 
+ if (mode_ == cli::mode::selfadaptive) { + cli.add("--workspace-translate", + "Preallocate arg MB of work space for translation", + 512); + } cli.add("--log", "Log training process information to file given by arg"); cli.add("--log-level", diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 0a93d8061..45c66139a 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -40,6 +40,8 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { optionsTrans_->set("maxi-batch", 1); auto maxTranslationInput = options_->get("max-length-translate"); optionsTrans_->set("max-length", maxTranslationInput); + auto translationWorkspace = options_->get("workspace-translate"); + optionsTrans_->set("workspace", translationWorkspace); optionsTrans_->set("shuffle", "none"); auto modelFilename = options_->get("model"); From d9cddf41b8d434a8a98cbf7a0850a748d6dde142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Wed, 29 Dec 2021 14:47:24 +0200 Subject: [PATCH 133/135] Fix alignment printing during translation --- src/translator/self_adaptive.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/translator/self_adaptive.h b/src/translator/self_adaptive.h index 45c66139a..85b4f3041 100644 --- a/src/translator/self_adaptive.h +++ b/src/translator/self_adaptive.h @@ -50,6 +50,19 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { // have to adapt translation options manually. optionsTrans_->set>("models", {modelFilename}); + // We mask the alignment option for training so that the alignment loss + // nodes (self-attention heads) don't get added to the graph (for + // transformers). Adding the alignment loss nodes and not supplying guided + // alignments during training results in a crash with "There are more (n) + // than one top most nodes for the backward pass". 
In self-adaptive + // translation we don't support training the alignments because they are + // likely to remain good enough after the few self-adaptive updates. + // + // TODO: regarding the above, make the alignment heads non-trainable; afaik, + // they are treated like regular attantion heads currently which might + // decrease alignment precision. + options_->set("alignment", ""); + auto vocabPaths = options_->get>("vocabs"); std::vector srcVocabPaths(vocabPaths.begin(), vocabPaths.end() - 1); cpuModel_ = New(options_, modelFilename, srcVocabPaths, vocabPaths.back()); @@ -144,7 +157,7 @@ class TrainSelfAdaptive : public ModelTask, public ModelServiceTask { Iterator trainBegin, Iterator trainEnd, Ptr collector) { - auto printer = New(options_, cpuModel_->TrgVocab()); + auto printer = New(optionsTrans_, cpuModel_->TrgVocab()); for(auto testBatch : *testBatches) { ABORT_IF(trainBegin == trainEnd, "Context batches ran out before test batches"); From 3359bb7a831a583a8821051e85c6d8966fe6f4e2 Mon Sep 17 00:00:00 2001 From: Roman Grundkiewicz Date: Mon, 31 Jan 2022 17:13:36 +0000 Subject: [PATCH 134/135] Change "training-sets" to "train-sets" --- src/training/scheduler.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/training/scheduler.h b/src/training/scheduler.h index 5c91c477a..96dd31467 100644 --- a/src/training/scheduler.h +++ b/src/training/scheduler.h @@ -534,10 +534,10 @@ class Scheduler : public TrainingObserver { } void actAfterEpoch(TrainingState& state) override { - // When running self-adaptive marian in server mode the "training-sets" + // When running self-adaptive marian in server mode the "train-sets" // option isn't present because the training sentences are passed in via the // request body - if (options_->has("training-sets")) { + if (options_->has("train-sets")) { // Stop if data streaming from STDIN is stopped for a TSV input. 
auto trainingSets = options_->get>("train-sets"); if (trainingSets.size() > 0) { From a274dfbe0f356294ee092315ebd9a9df4dd16c5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rihards=20Kri=C5=A1lauks?= Date: Tue, 22 Feb 2022 13:31:36 +0200 Subject: [PATCH 135/135] Mention marian-adaptive-server in the changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b4927abbd..e343fd828 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] ### Added -- Adds a `marian-adaptive` executable to enable self-adaptive translation (a.k.a, runtime domain adaptation). +- Adds `marian-adaptive` and `marian-adaptive-server` executables to enable self-adaptive translation (a.k.a, runtime domain adaptation). ### Fixed - Scripts using PyYAML now use `safe_load`; see https://msg.pyyaml.org/load