From 30323c8ee7585f829d1a1779b15c19be9b8d44d6 Mon Sep 17 00:00:00 2001
From: yirongjie <yirj0809@gmail.com>
Date: Thu, 14 Mar 2024 15:12:52 +0800
Subject: [PATCH 1/6] fix: merge TENSOR_STATIC_SHAPED+TENSOR_STATIC_ALLOCED
 ->TENSOR_STATIC_READY

---
 include/Types.hpp                      |   3 +-
 src/Layer.hpp                          | 243 ++++++++++---------------
 src/Module.hpp                         |  24 +--
 src/Tensor.cpp                         |  75 +++-----
 src/Tensor.hpp                         |  15 +-
 src/backends/cpu/CPUTensorFunction.hpp |  19 ++
 6 files changed, 157 insertions(+), 222 deletions(-)
diff --git a/include/Types.hpp b/include/Types.hpp
index 1545d850..5e8602af 100644
--- a/include/Types.hpp
+++ b/include/Types.hpp
@@ -27,8 +27,7 @@ typedef enum {
 enum TensorStatus {
     TENSOR_DYNAMIC,
     TENSOR_STATIC_INIT ,
-    TENSOR_STATIC_SHAPED ,
-    TENSOR_STATIC_ALLOCED ,
+    TENSOR_STATIC_READY ,
 };
 
 enum ErrorCode {
diff --git a/src/Layer.hpp b/src/Layer.hpp
index 4c934853..d8fb9ccb 100644
--- a/src/Layer.hpp
+++ b/src/Layer.hpp
@@ -17,7 +17,6 @@
 #include <regex>
 #include <string>
 
-
 namespace mllm {
 
 class Layer {
@@ -45,20 +44,21 @@ class Layer {
     }
 
     static int cpu_thread;
+
 private:
-    std::string name_num_to_X(const std::string& input_string) {
-        std::regex pattern(R"(\.\d{1,3}\.)");  // Matches any number between 1 and 100 between two dots
-        std::string replacement = ".X.";  // The string to replace the matched pattern with
+    std::string name_num_to_X(const std::string &input_string) {
+        std::regex pattern(R"(\.\d{1,3}\.)"); // Matches a run of 1-3 digits enclosed by two literal dots, e.g. ".7." or ".123."
+        std::string replacement = ".X.";      // Placeholder substituted for every matched ".<digits>." segment
         std::string output_string = std::regex_replace(input_string, pattern, replacement);
         return output_string;
     }
-    std::string name_X_to_num(const std::string& input_string, int in_idx) {
-        std::regex pattern(".X.");  // Matches any number between 1 and 100 between two dots
-        std::string replacement = "."+std::to_string(in_idx)+".";  // The string to replace the matched pattern with
+    std::string name_X_to_num(const std::string &input_string, int in_idx) {
+        std::regex pattern(".X.");                                    // Targets the ".X." placeholder. NOTE(review): the dots are unescaped, so each matches any character - consider R"(\.X\.)"
+        std::string replacement = "." + std::to_string(in_idx) + "."; // Every match is replaced by ".<in_idx>."
         std::string output_string = std::regex_replace(input_string, pattern, replacement);
         return output_string;
     }
-    void reset_KVCache(string input_name, string layer_next_name) {
+    void reset_KVCache(string input_name) {
         vector<string> renameX_names;
         renameX_names.push_back(input_name);
         const vector<string> suffixs = {"-view", ".split-0", ".split-1", ".split-2"};
@@ -73,10 +73,15 @@ class Layer {
             auto name = name_X_to_num(x_name, saved_list_idx);
             vector<int> shape = {Tensor::gph_[x_name].batch(), Tensor::gph_[x_name].head(), Tensor::gph_[x_name].sequence(), Tensor::gph_[x_name].dimension()};
             layername_2_tensorname[name] = name;
-            if (Tensor::gph_.find(name) == Tensor::gph_.end()) {
-                Tensor::gph_[name] = Tensor(backend_);
-                Tensor::gph_[name].setName(name);
+            Tensor::gph_[name] = Tensor(backend_);
+            Tensor::gph_[name].initFrom(Tensor::gph_[x_name]);
+            Tensor::gph_[name].setName(name);
+            vector<Tensor *> new_chd_tensors = {};
+            for (auto child : Tensor::gph_[x_name].childTensors()) {
+                new_chd_tensors.push_back(&Tensor::gph_[name_X_to_num(child->name(), saved_list_idx)]);
             }
+            Tensor::gph_[name].childTensors().clear();
+            Tensor::gph_[name].childTensors() = new_chd_tensors;
             if (Tensor::gph_[x_name].aggregated() == true) {
                 vector<shared_ptr<Tensor>> new_aggregated_tensors = {};
                 for (const auto &aggregated_tensor : Tensor::gph_[x_name].aggregated_tensors()) {
@@ -85,7 +90,6 @@ class Layer {
                 }
                 Tensor::gph_[name].addTensors(new_aggregated_tensors, Tensor::gph_[x_name].aggregated_dim());
             }
-            Tensor::gph_[name].reshape(shape[0], shape[1], shape[2], shape[3]);
         }
     }
 
@@ -106,14 +110,16 @@ class Layer {
             if (Tensor::gph_.find(input.name()) == Tensor::gph_.end()) {
                 Tensor::gph_[input.name()] = input;
                 Tensor::gph_[input.name()].setName(input.name());
-            }else if(input.count() !=  Tensor::gph_[input.name()].count()) {
+            } else if (input.count() != Tensor::gph_[input.name()].count()) {
                 Tensor::gph_[input.name()] = input;
                 Tensor::gph_[input.name()].setName(input.name());
             }
-            if(layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
-                if(param_["type"] == KVCACHE) {
+            auto in_name = input.name();
+            if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
+                if (param_["type"] == KVCACHE) {
                     layername_2_tensorname[layer_next_name] = layer_next_name;
-                    reset_KVCache(input.name(), layer_next_name);
+                    reset_KVCache(input.name());
+                    in_name = name_X_to_num(in_name, saved_list_idx);
                 } else {
                     layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
                 }
@@ -123,32 +129,24 @@ class Layer {
                 Tensor::gph_[next_name] = Tensor(backend_);
                 Tensor::gph_[next_name].setName(next_name);
             }
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor*){})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){})};
+            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[in_name], [](Tensor *) {})};
+            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
             op_->reshape(shared_inputs, shared_outputs);
-            break;
-        }
-        case TENSOR_STATIC_SHAPED: {
-            auto next_name = layername_2_tensorname[layer_next_name];
-            assert(Tensor::gph_[input.name()].hostPtr<float>() != nullptr);
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor*){})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){})};
             op_->setUp(shared_inputs, shared_outputs);
-            if(Tensor::gph_[next_name].aggregated() == false) {
+            if (Tensor::gph_[next_name].aggregated() == false) {
                 assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
             }
             break;
         }
-        case TENSOR_STATIC_ALLOCED: {
+        case TENSOR_STATIC_READY: {
             auto next_name = layername_2_tensorname[layer_next_name];
             assert(Tensor::gph_[input.name()].hostPtr<float>() != nullptr);
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor*){})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){})};
+            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor *) {})};
+            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
             op_->execute(shared_inputs, shared_outputs);
-            if(Tensor::gph_[next_name].aggregated() == false) {
+            if (Tensor::gph_[next_name].aggregated() == false) {
                 assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
             }
-            // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
             break;
         }
         default: {
@@ -157,6 +155,7 @@ class Layer {
         }
         auto next_name = layername_2_tensorname[layer_next_name];
         Tensor::gph_[next_name].status() = Tensor::gph_[input.name()].status();
+        // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
         return Tensor::gph_[next_name];
     }
     Tensor &_2I1O_OP(Tensor &input0, Tensor &input1) {
@@ -174,8 +173,7 @@ class Layer {
         if (Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
             Tensor::gph_[input1.name()].status() = input0.status();
         }
-        if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) &&
-            Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
+        if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) && Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
             assert(input0.status() == input1.status());
         }
         switch (input0.status()) {
@@ -188,7 +186,7 @@ class Layer {
                 Tensor::gph_[input1.name()] = input1;
                 Tensor::gph_[input1.name()].setName(input1.name());
             }
-            if(layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
+            if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
                 layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
             }
             auto next_name = layername_2_tensorname[layer_next_name];
@@ -197,33 +195,22 @@ class Layer {
                 Tensor::gph_[next_name].setName(next_name);
             }
             vector<shared_ptr<Tensor>> shared_inputs{
-                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor*){}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor*){})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){})};
+                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
+                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {})};
+            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
             op_->reshape(shared_inputs, shared_outputs);
-            break;
-        }
-        case TENSOR_STATIC_SHAPED: {
-            auto next_name = layername_2_tensorname[layer_next_name];
-            vector<shared_ptr<Tensor>> shared_inputs{
-                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor*){}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor*){})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){})};
             op_->setUp(shared_inputs, shared_outputs);
             assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
             break;
         }
-        case TENSOR_STATIC_ALLOCED: {
+        case TENSOR_STATIC_READY: {
             auto next_name = layername_2_tensorname[layer_next_name];
             vector<shared_ptr<Tensor>> shared_inputs{
-                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor*){}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor*){})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){})};
+                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
+                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {})};
+            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
             op_->execute(shared_inputs, shared_outputs);
             assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
-            // Tensor::gph_[input0.name()].saveNData<float>(input0.name());
-            // Tensor::gph_[input1.name()].saveNData<float>(input1.name());
-            // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
             break;
         }
         default: {
@@ -232,6 +219,9 @@ class Layer {
         }
         auto next_name = layername_2_tensorname[layer_next_name];
         Tensor::gph_[next_name].status() = Tensor::gph_[input0.name()].status();
+        // Tensor::gph_[input0.name()].saveNData<float>(input0.name());
+        // Tensor::gph_[input1.name()].saveNData<float>(input1.name());
+        // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
         return Tensor::gph_[next_name];
     }
     Tensor &_3I1O_OP(Tensor &input0, Tensor &input1, Tensor &input2) {
@@ -251,14 +241,12 @@ class Layer {
         if (Tensor::gph_.find(input2.name()) != Tensor::gph_.end()) {
             Tensor::gph_[input2.name()].status() = input0.status();
         }
-        if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) &&
-            Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
+        if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) && Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
             assert(input0.status() == input1.status());
-            }
-        if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) &&
-            Tensor::gph_.find(input2.name()) != Tensor::gph_.end()) {
+        }
+        if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) && Tensor::gph_.find(input2.name()) != Tensor::gph_.end()) {
             assert(input0.status() == input2.status());
-            }
+        }
         switch (input0.status()) {
         case TENSOR_STATIC_INIT: {
             if (Tensor::gph_.find(input0.name()) == Tensor::gph_.end()) {
@@ -273,7 +261,7 @@ class Layer {
                 Tensor::gph_[input2.name()] = input2;
                 Tensor::gph_[input2.name()].setName(input2.name());
             }
-            if(layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
+            if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
                 layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
             }
             auto next_name = layername_2_tensorname[layer_next_name];
@@ -287,20 +275,11 @@ class Layer {
                 std::shared_ptr<Tensor>(&Tensor::gph_[input2.name()], [](Tensor *) {})};
             vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
             op_->reshape(shared_inputs, shared_outputs);
-            break;
-        }
-        case TENSOR_STATIC_SHAPED: {
-            auto next_name = layername_2_tensorname[layer_next_name];
-            vector<shared_ptr<Tensor>> shared_inputs{
-                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input2.name()], [](Tensor *) {})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
             op_->setUp(shared_inputs, shared_outputs);
             assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
             break;
         }
-        case TENSOR_STATIC_ALLOCED: {
+        case TENSOR_STATIC_READY: {
             auto next_name = layername_2_tensorname[layer_next_name];
             vector<shared_ptr<Tensor>> shared_inputs{
                 std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
@@ -309,7 +288,6 @@ class Layer {
             vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
             op_->execute(shared_inputs, shared_outputs);
             assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
-            // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
             break;
         }
         default: {
@@ -318,6 +296,7 @@ class Layer {
         }
         auto next_name = layername_2_tensorname[layer_next_name];
         Tensor::gph_[next_name].status() = Tensor::gph_[input0.name()].status();
+        // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
         return Tensor::gph_[next_name];
     }
     Tensor &_0I1O_OP() {
@@ -329,8 +308,8 @@ class Layer {
         string layer_next_name = "param-" + op_->name();
         switch (Module::tensor_status) {
         case TENSOR_STATIC_INIT: {
-            if(layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
-                layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);                
+            if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
+                layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
             }
             auto next_name = layername_2_tensorname[layer_next_name];
             if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) {
@@ -338,29 +317,22 @@ class Layer {
                 Tensor::gph_[next_name].setName(next_name);
             }
             vector<shared_ptr<Tensor>> shared_inputs{};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){})};
+            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
             op_->reshape(shared_inputs, shared_outputs);
-            break;
-        }
-        case TENSOR_STATIC_SHAPED: {
-            auto next_name = layername_2_tensorname[layer_next_name];
-            vector<shared_ptr<Tensor>> shared_inputs{};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){})};
             op_->setUp(shared_inputs, shared_outputs);
-            if(Tensor::gph_[next_name].aggregated() == false) {
+            if (Tensor::gph_[next_name].aggregated() == false) {
                 assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
             }
             break;
         }
-        case TENSOR_STATIC_ALLOCED: {
+        case TENSOR_STATIC_READY: {
             auto next_name = layername_2_tensorname[layer_next_name];
             vector<shared_ptr<Tensor>> shared_inputs{};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){})};
+            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
             op_->execute(shared_inputs, shared_outputs);
-            if(Tensor::gph_[next_name].aggregated() == false) {
+            if (Tensor::gph_[next_name].aggregated() == false) {
                 assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
             }
-            // Tensor::gph_[next_name].saveData<float>();
             break;
         }
         default: {
@@ -369,6 +341,7 @@ class Layer {
         }
         auto next_name = layername_2_tensorname[layer_next_name];
         Tensor::gph_[next_name].status() = Module::tensor_status;
+        // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
         return Tensor::gph_[next_name];
     }
     vector<Tensor> _1INO_OP(Tensor &input, int N) {
@@ -390,14 +363,14 @@ class Layer {
             if (Tensor::gph_.find(input.name()) == Tensor::gph_.end()) {
                 Tensor::gph_[input.name()] = input;
                 Tensor::gph_[input.name()].setName(input.name());
-            }else if(input.count() !=  Tensor::gph_[input.name()].count()) {
+            } else if (input.count() != Tensor::gph_[input.name()].count()) {
                 Tensor::gph_[input.name()] = input;
                 Tensor::gph_[input.name()].setName(input.name());
             }
             vector<shared_ptr<Tensor>> shared_outputs = {};
             vector<string> next_names = {};
-            for (const auto& layer_next_name : layer_next_names) {
-                if(layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
+            for (const auto &layer_next_name : layer_next_names) {
+                if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
                     layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
                 }
                 auto next_name = layername_2_tensorname[layer_next_name];
@@ -406,44 +379,28 @@ class Layer {
                     Tensor::gph_[next_name].setName(next_name);
                 }
                 next_names.push_back(next_name);
-                shared_outputs.push_back(std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){}));
+                shared_outputs.push_back(std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {}));
             }
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor*){})};
+            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor *) {})};
             op_->reshape(shared_inputs, shared_outputs);
-            break;
-        }
-        case TENSOR_STATIC_SHAPED: {
-            // auto next_name = layername_2_tensorname[layer_next_name];
-            vector<shared_ptr<Tensor>> shared_outputs = {};
-            vector<string> next_names = {};
-            for (const auto& layer_next_name : layer_next_names) {
-                auto next_name = layername_2_tensorname[layer_next_name];
-                next_names.push_back(next_name);
-                shared_outputs.push_back(std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){}));
-            }
-            if(Tensor::gph_[input.name()].aggregated() == false) {
-                assert(Tensor::gph_[input.name()].hostPtr<float>() != nullptr);
-            }
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor*){})};
             op_->setUp(shared_inputs, shared_outputs);
             break;
         }
-        case TENSOR_STATIC_ALLOCED: {
+        case TENSOR_STATIC_READY: {
             vector<shared_ptr<Tensor>> shared_outputs = {};
             vector<string> next_names = {};
-            for (const auto& layer_next_name : layer_next_names) {
+            for (const auto &layer_next_name : layer_next_names) {
                 auto next_name = layername_2_tensorname[layer_next_name];
                 next_names.push_back(next_name);
-                shared_outputs.push_back(std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor*){}));
+                shared_outputs.push_back(std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {}));
             }
-            if(Tensor::gph_[input.name()].aggregated() == false) {
+            if (Tensor::gph_[input.name()].aggregated() == false) {
                 assert(Tensor::gph_[input.name()].hostPtr<float>() != nullptr);
             }
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor*){})};
+            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor *) {})};
             op_->execute(shared_inputs, shared_outputs);
             for (int i = 0; i < shared_outputs.size(); ++i) {
                 assert(Tensor::gph_[next_names[i]].hostPtr<float>() != nullptr);
-                //Tensor::gph_[next_names[i]].saveNData<float>(layer_next_names[i]);
             }
             break;
         }
@@ -452,9 +409,10 @@ class Layer {
         }
         }
         vector<Tensor> output_result = {};
-        for (const auto& layer_next_name : layer_next_names) {
+        for (const auto &layer_next_name : layer_next_names) {
             auto next_name = layername_2_tensorname[layer_next_name];
             Tensor::gph_[next_name].status() = Tensor::gph_[input.name()].status();
+            // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
             output_result.push_back(Tensor::gph_[next_name]);
         }
         return output_result;
@@ -466,7 +424,6 @@ class Layer {
     OpParam param_;
     bool init_ = false;
     int saved_list_idx;
-    
 };
 
 class Linear final : public Layer {
@@ -537,13 +494,13 @@ class QuickGELU final : public Layer {
     }
 };
 
-using ActFnConstructor = std::function<Layer( const std::string&)>;
+using ActFnConstructor = std::function<Layer(const std::string &)>;
 inline std::map<std::string, ActFnConstructor> ACT_FN = {
-    {"SiLU", []( const std::string& name) { return SiLU( name); }},
-    {"ReLU", []( const std::string& name) { return ReLU( name); }},
-    {"ReLU2", [](const std::string& name) { return ReLUSquaredActivation( name); }},
-    {"GELU", [](const std::string& name) { return GELU( name); }},
-    {"QuickGELU", []( const std::string& name) { return QuickGELU( name); }},
+    {"SiLU", [](const std::string &name) { return SiLU(name); }},
+    {"ReLU", [](const std::string &name) { return ReLU(name); }},
+    {"ReLU2", [](const std::string &name) { return ReLUSquaredActivation(name); }},
+    {"GELU", [](const std::string &name) { return GELU(name); }},
+    {"QuickGELU", [](const std::string &name) { return QuickGELU(name); }},
 };
 
 class Softmax final : public Layer {
@@ -609,7 +566,7 @@ class KVCache final : public Layer {
 
 class LayerNorm final : public Layer {
 public:
-    explicit LayerNorm(int norm_size, bool bias, float epsilon,std::string name) {
+    explicit LayerNorm(int norm_size, bool bias, float epsilon, std::string name) {
         param_["norm_size"] = norm_size;
         param_["epsilon"] = epsilon;
         param_["bias"] = (float)bias;
@@ -632,10 +589,9 @@ class RMSNorm final : public Layer {
     }
 };
 
-
 class Matmul final : public Layer {
 public:
-    explicit Matmul(bool transpose0, bool transpose1,  std::string name) {
+    explicit Matmul(bool transpose0, bool transpose1, std::string name) {
         param_["transpose0"] = transpose0;
         param_["transpose1"] = transpose1;
         init(std::move(name), OpType::MATMUL);
@@ -645,14 +601,13 @@ class Matmul final : public Layer {
     }
 };
 
-
 class Split final : public Layer {
 public:
     Split() = default;
     explicit Split(int split_num, Chl split_dim, int split_dim_size, std::string name) {
-        param_["split_num"] =(float) split_num;
-        param_["split_dim"] =(float) split_dim;
-        param_["split_dim_size"] =(float) split_dim_size;
+        param_["split_num"] = (float)split_num;
+        param_["split_dim"] = (float)split_dim;
+        param_["split_dim_size"] = (float)split_dim_size;
         init(std::move(name), OpType::SPLIT);
     }
     vector<Tensor> operator()(Tensor &input) {
@@ -663,14 +618,14 @@ class Split final : public Layer {
 class Convolution2D final : public Layer {
 public:
     explicit Convolution2D(int in_channel, int out_channel, vector<int> kernal, vector<int> stride, PaddingType padding, bool bias, std::string name) {
-        param_["in_channel"] =(float) in_channel;
-        param_["out_channel"] =(float) out_channel;
-        param_["kernal_h"] =(float) kernal[0];
-        param_["kernal_w"] =(float) kernal[1];
-        param_["stride_h"] =(float) stride[0];
-        param_["stride_w"] =(float) stride[1];
-        param_["padding"] =(float) padding;
-        param_["bias"] =(float) bias;
+        param_["in_channel"] = (float)in_channel;
+        param_["out_channel"] = (float)out_channel;
+        param_["kernal_h"] = (float)kernal[0];
+        param_["kernal_w"] = (float)kernal[1];
+        param_["stride_h"] = (float)stride[0];
+        param_["stride_w"] = (float)stride[1];
+        param_["padding"] = (float)padding;
+        param_["bias"] = (float)bias;
         init(std::move(name), OpType::CONVOLUTION2D);
     }
     Tensor &operator()(Tensor &input) {
@@ -681,16 +636,16 @@ class Convolution2D final : public Layer {
 class Convolution3D final : public Layer {
 public:
     explicit Convolution3D(int in_channel, int out_channel, vector<int> kernal, vector<int> stride, PaddingType padding, bool bias, std::string name) {
-        param_["in_channel"] =(float) in_channel;
-        param_["out_channel"] =(float) out_channel;
-        param_["kernal_t"] =(float) kernal[0];
-        param_["kernal_h"] =(float) kernal[1];
-        param_["kernal_w"] =(float) kernal[2];
-        param_["stride_t"] =(float) stride[0];
-        param_["stride_h"] =(float) stride[1];
-        param_["stride_w"] =(float) stride[2];
-        param_["padding"] =(float) padding;
-        param_["bias"] =(float) bias;
+        param_["in_channel"] = (float)in_channel;
+        param_["out_channel"] = (float)out_channel;
+        param_["kernal_t"] = (float)kernal[0];
+        param_["kernal_h"] = (float)kernal[1];
+        param_["kernal_w"] = (float)kernal[2];
+        param_["stride_t"] = (float)stride[0];
+        param_["stride_h"] = (float)stride[1];
+        param_["stride_w"] = (float)stride[2];
+        param_["padding"] = (float)padding;
+        param_["bias"] = (float)bias;
         init(std::move(name), OpType::CONVOLUTION3D);
     }
     Tensor &operator()(Tensor &input) {
@@ -701,7 +656,7 @@ class Convolution3D final : public Layer {
 class Concat final : public Layer {
 public:
     explicit Concat(Chl axis, std::string name) {
-        param_["axis"] =(float)axis;
+        param_["axis"] = (float)axis;
         init(std::move(name), OpType::CAT);
     }
     Tensor &operator()(Tensor &input0, Tensor &input1) {
diff --git a/src/Module.hpp b/src/Module.hpp
index fa17b2b6..8b06a230 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -68,15 +68,9 @@ class Module {
 
             Forward(inputs, anyArgs);
             for (auto &input : inputs) {
-                input.status() = TENSOR_STATIC_SHAPED;
+                input.status() = TENSOR_STATIC_READY;
             }
-            tensor_status = TENSOR_STATIC_SHAPED;
-
-            Forward(inputs, anyArgs);
-            for (auto &input : inputs) {
-                input.status() = TENSOR_STATIC_ALLOCED;
-            }
-            tensor_status = TENSOR_STATIC_ALLOCED;
+            tensor_status = TENSOR_STATIC_READY;
 
             return Forward(inputs, anyArgs);
         } else {
@@ -84,20 +78,6 @@ class Module {
         }
     }
 
-    // vector<Tensor> call(vector<Tensor> inputs, vector<std::any> args) {
-    //     return operator()(inputs, args);
-    // }
-
-    // template <typename T>
-    // static vector<T *> List(int n) {
-    //     static_assert(std::is_base_of<Module, T>::value, "T must be a subclass of Module");
-    //
-    //     vector<T *> modules;
-    //     for (int i = 0; i < n; i++) {
-    //         modules.push_back(new T());
-    //     }
-    //     return modules;
-    // }
     static int listIdx;
     static int runlistIdx;
 
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 81ee3033..527d367a 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -148,13 +148,11 @@ Tensor &Tensor::binaryCompute(Func operation, string append_s, float data) {
             gph_[next_name].setName(next_name);
         }
         CPUbinaryFunction::reshape(gph_[name_], gph_[next_name]);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
+
         CPUbinaryFunction::setup(gph_[name_], gph_[next_name]);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUbinaryFunction::execute(gph_[name_], gph_[next_name], operation, data);
         break;
     }
@@ -207,13 +205,11 @@ Tensor &Tensor::binaryTwoCompute(Func operation, string append_s, Tensor& other)
             gph_[next_name].setName(next_name);
         }
         CPUbinaryTwoFunction::reshape(gph_[name_], gph_[other.name_], gph_[next_name]);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
+
         CPUbinaryTwoFunction::setup(gph_[name_], gph_[other.name_], gph_[next_name]);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUbinaryTwoFunction::execute(gph_[name_], gph_[other.name_], gph_[next_name], operation);
         break;
     }
@@ -253,13 +249,11 @@ Tensor& Tensor::mean(Chl axis) {
             gph_[next_name].setName(next_name);
         }
         CPUmeanFunction::reshape(gph_[name_], gph_[next_name], axis);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
+
         CPUmeanFunction::setup(gph_[name_], gph_[next_name], axis);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUmeanFunction::execute(gph_[name_], gph_[next_name], axis);
         break;
     }
@@ -287,13 +281,11 @@ Tensor& Tensor::view(int b, int h, int s, int d) {
             gph_[next_name].setName(next_name);
         }
         CPUviewFunction::reshape(gph_[name_], gph_[next_name], b, h, s, d);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
+
         CPUviewFunction::setup(gph_[name_], gph_[next_name], b, h, s, d);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUviewFunction::execute(gph_[name_], gph_[next_name]);
         break;
     }
@@ -321,13 +313,11 @@ Tensor& Tensor::flatten(Chl axis_start, Chl axis_end) {
             gph_[next_name].setName(next_name);
         }
         CPUflattenFunction::reshape(gph_[name_], gph_[next_name], axis_start, axis_end);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
+
         CPUflattenFunction::setup(gph_[name_], gph_[next_name], axis_start, axis_end);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUflattenFunction::execute(gph_[name_], gph_[next_name]);
         break;
     }
@@ -376,9 +366,9 @@ Tensor &Tensor::transpose(vector<std::pair<Chl, Chl>> axiss) {
                 gph_[next_name].changeCtype(gph_[name_].shape().size());
                 gph_[next_name].undiffusion_ = true;
             }
-            break;
-        }
-        case TENSOR_STATIC_SHAPED: {
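+            // The former TENSOR_STATIC_SHAPED case is folded into this one:
+            // the master-tensor handling below now runs in the same pass
+            // instead of waiting for a separate status.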
             if(gph_[name_].masterTensor() != nullptr) {
                 if (gph_[next_name].master_tensor_ == nullptr) {
                     gph_[next_name].setDtype(gph_[name_].dtype());
@@ -396,7 +386,7 @@ Tensor &Tensor::transpose(vector<std::pair<Chl, Chl>> axiss) {
             }
             break;
         }
-        case TENSOR_STATIC_ALLOCED: {
+        case TENSOR_STATIC_READY: {
             break;
         }
         default: {
@@ -424,13 +414,11 @@ Tensor &Tensor::clip(vector<int> b, vector<int> h, vector<int> s, vector<int> d)
             gph_[next_name].setName(next_name);
         }
         CPUclipFunction::reshape(gph_[name_], gph_[next_name], b, h, s, d);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
+
         CPUclipFunction::setup(gph_[name_], gph_[next_name], b, h, s, d);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUclipFunction::execute(gph_[name_], gph_[next_name], b, h, s, d);
         break;
     }
@@ -459,13 +447,11 @@ Tensor &Tensor::clip(Chl keep_axis, vector<int> b, vector<int> h, vector<int> s,
             gph_[next_name].setName(next_name);
         }
         CPUclipaxisFunction::reshape(gph_[name_], gph_[next_name], keep_axis, b, h, s, d);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
+
         CPUclipaxisFunction::setup(gph_[name_], gph_[next_name],  keep_axis, b, h, s, d);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUclipaxisFunction::execute(gph_[name_], gph_[next_name],  keep_axis, b, h, s, d);
         break;
     }
@@ -502,13 +488,10 @@ Tensor &Tensor::cat(vector<Tensor> input_tensors, Chl axis) {
             gph_[next_name].setName(next_name);
         }
         CPUcatFunction::reshape(inputs, gph_[next_name], axis, expd_batch_, expd_batch_input_idx);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
         CPUcatFunction::setup(inputs, gph_[next_name], axis, expd_batch_, expd_batch_input_idx);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUcatFunction::execute(inputs, gph_[next_name], axis, expd_batch_, expd_batch_input_idx);
         break;
     }
@@ -536,13 +519,10 @@ Tensor &Tensor::mm(Tensor& input0, Tensor& input1) {
         } else {
             CPUmmFunction::reshape(gph_[input0.name()], gph_[input1.name()], gph_[next_name]);
         }
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
         CPUmmFunction::setup(gph_[input0.name()], gph_[input1.name()], gph_[next_name]);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUmmFunction::execute(gph_[input0.name()], gph_[input1.name()], gph_[next_name]);
         break;
     }
@@ -571,13 +551,10 @@ Tensor& Tensor::norm(int L_n) {
             gph_[next_name].setName(next_name);
         }
         CPUnormFunction::reshape(gph_[name_], gph_[next_name], L_n);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
         CPUnormFunction::setup(gph_[name_], gph_[next_name], L_n);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUnormFunction::execute(gph_[name_], gph_[next_name], L_n);
         break;
     }
@@ -604,13 +581,10 @@ Tensor& Tensor::where(float value, Chl axis) {
             gph_[next_name].setName(next_name);
         }
         CPUwhereFunction::reshape(gph_[name_], gph_[next_name], value, axis);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
         CPUwhereFunction::setup(gph_[name_], gph_[next_name], value, axis);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPUwhereFunction::execute(gph_[name_], gph_[next_name], value, axis);
         break;
     }
@@ -635,13 +609,10 @@ Tensor& Tensor::range(int start, int end) {
             gph_[next_name].setName(next_name);
         }
         CPURangeFunction::reshape(gph_[next_name], start, end);
-        break;
-    }
-    case TENSOR_STATIC_SHAPED: {
         CPURangeFunction::setup(gph_[next_name], start, end);
         break;
     }
-    case TENSOR_STATIC_ALLOCED: {
+    case TENSOR_STATIC_READY: {
         CPURangeFunction::execute(gph_[next_name], start, end);
         range_name_idx++;
         break;
diff --git a/src/Tensor.hpp b/src/Tensor.hpp
index 3e791217..4dc270e9 100644
--- a/src/Tensor.hpp
+++ b/src/Tensor.hpp
@@ -646,6 +646,16 @@ class Tensor {
         assert(source.count() == count());
         memcpy(host_ptr_, source.host_ptr_, cntSize());
     }
+    void initFrom(const Tensor &source) {
+        dtype_ = source.dtype();
+        chls_ = source.chls_;
+        ctype_ = source.ctype_;
+        shape_ = source.shape_;
+        count_ = source.count_;
+        if(source.host_ptr_!= nullptr) {
+            alloc();
+        }
+    }
     void copyFrom(const shared_ptr<Tensor> &source) {
         assert(masterTensor() == nullptr);
         assert(source->dtype() == dtype());
@@ -866,7 +876,7 @@ class Tensor {
         master_tensor_ = master_tensor;
     }
 
-    vector<Tensor *> childTensors() {
+    vector<Tensor *> &childTensors() {
         return child_tensors_;
     }
     void addChildTensor(Tensor *child) {
@@ -1201,7 +1211,8 @@ class Tensor {
 
     template <typename Dtype>
     void saveNData(string new_name = "", string ex = "") {
-        if (status() == TENSOR_STATIC_ALLOCED || (TENSOR_STATIC_SHAPED == status()&& shape().size()>0)) {
+        // Save only when the tensor is READY and has a non-empty shape (replaces the old ALLOCED / SHAPED-with-shape check).
+        if (status() == TENSOR_STATIC_READY && shape().size()>0) {
             if (ctype() == BTHWC || ctype() == BCTHW) {
                 save5Data<Dtype>(ex);
                 return;
diff --git a/src/backends/cpu/CPUTensorFunction.hpp b/src/backends/cpu/CPUTensorFunction.hpp
index 301f115a..e6a4f077 100644
--- a/src/backends/cpu/CPUTensorFunction.hpp
+++ b/src/backends/cpu/CPUTensorFunction.hpp
@@ -29,6 +29,25 @@ class CPUmmFunction {
         input.reshape(b, h, s, d);
         input.transed() = true;
         input.undiffusion() = false;
+        // The TENSOR_STATIC_SHAPED pass no longer exists, so sync this layout to the master tensor and its children here.
+        if (input.masterTensor() != nullptr) {
+            auto b = input.masterTensor()->batch();
+            auto h = input.masterTensor()->head();
+            auto d = input.masterTensor()->dimension();
+            auto s = input.masterTensor()->sequence();
+            input.masterTensor()->chls_ = input.chls_;
+            input.masterTensor()->changeCtype();
+            input.masterTensor()->reshape(b, h, s, d);
+            for (auto child : input.masterTensor()->childTensors()) {
+                auto b = child->batch();
+                auto h = child->head();
+                auto d = child->dimension();
+                auto s = child->sequence();
+                child->chls_ = input.chls_;
+                child->changeCtype();
+                child->reshape(b, h, s, d);
+            }
+        }
     }
 public:
     static void reshape(Tensor &input0, Tensor &input1, Tensor &output) {

From 32cceab829bc894b2c1d2f2ab8557bf15c95cdfe Mon Sep 17 00:00:00 2001
From: yirongjie <yirj0809@gmail.com>
Date: Thu, 14 Mar 2024 19:46:16 +0800
Subject: [PATCH 2/6] fix: static load

---
 src/Layer.hpp                          | 556 +++++++++++++------------
 src/Module.cpp                         |   2 +-
 src/Module.hpp                         |  14 +
 src/Tensor.cpp                         |  49 ++-
 src/Tensor.hpp                         | 139 ++++---
 src/backends/cpu/CPUTensorFunction.hpp |  20 +-
 6 files changed, 405 insertions(+), 375 deletions(-)

diff --git a/src/Layer.hpp b/src/Layer.hpp
index d8fb9ccb..ba6b7fe6 100644
--- a/src/Layer.hpp
+++ b/src/Layer.hpp
@@ -29,10 +29,6 @@ class Layer {
         backend_ = Module::backends[MLLM_CPU];
         saved_list_idx = Module::listIdx;
         init_ = true;
-        // std::cout<<name_<<std::endl;
-        // constexpr int threadCount = 4;
-        // op_ = backend_->opCreate(param_, std::move(name), threadCount);
-        // op_->load(*Module::loader);
     }
     bool ready() {
         return init_;
@@ -94,282 +90,225 @@ class Layer {
     }
 
 protected:
-    Tensor &_1I1O_OP(Tensor &input) {
-        Module::runlistIdx = saved_list_idx;
+    bool INIT_OP() {
         if (op_ == nullptr) {
             op_ = backend_->opCreate(param_, name_, cpu_thread);
-            op_->load(*Module::loader);
         }
-
-        string layer_next_name = "out-" + op_->name();
-        if (Tensor::gph_.find(input.name()) != Tensor::gph_.end()) {
-            Tensor::gph_[input.name()].status() = input.status();
+        if (Module::doLoad) {
+            op_->load(*Module::loader);
         }
-        switch (input.status()) {
-        case TENSOR_STATIC_INIT: {
-            if (Tensor::gph_.find(input.name()) == Tensor::gph_.end()) {
-                Tensor::gph_[input.name()] = input;
-                Tensor::gph_[input.name()].setName(input.name());
-            } else if (input.count() != Tensor::gph_[input.name()].count()) {
-                Tensor::gph_[input.name()] = input;
-                Tensor::gph_[input.name()].setName(input.name());
-            }
-            auto in_name = input.name();
-            if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
-                if (param_["type"] == KVCACHE) {
-                    layername_2_tensorname[layer_next_name] = layer_next_name;
-                    reset_KVCache(input.name());
-                    in_name = name_X_to_num(in_name, saved_list_idx);
-                } else {
-                    layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
+        return Module::doLoad;
+    }
+    Tensor &_1I1O_OP(Tensor &input) {
+        Module::runlistIdx = saved_list_idx;
+        if (INIT_OP()) {
+            return input;
+        } else {
+            string layer_next_name = "out-" + op_->name();
+            if (Tensor::gph_.find(input.name()) != Tensor::gph_.end()) {
+                Tensor::gph_[input.name()].status() = input.status();
+            }
+            switch (input.status()) {
+            case TENSOR_STATIC_INIT: {
+                if (Tensor::gph_.find(input.name()) == Tensor::gph_.end()) {
+                    Tensor::gph_[input.name()] = input;
+                    Tensor::gph_[input.name()].setName(input.name());
+                } else if (input.count() != Tensor::gph_[input.name()].count()) {
+                    Tensor::gph_[input.name()] = input;
+                    Tensor::gph_[input.name()].setName(input.name());
                 }
+                auto in_name = input.name();
+                if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
+                    if (param_["type"] == KVCACHE) {
+                        layername_2_tensorname[layer_next_name] = layer_next_name;
+                        reset_KVCache(input.name());
+                        in_name = name_X_to_num(in_name, saved_list_idx);
+                    } else {
+                        layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
+                    }
+                }
+                auto next_name = layername_2_tensorname[layer_next_name];
+                if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) {
+                    Tensor::gph_[next_name] = Tensor(backend_);
+                    Tensor::gph_[next_name].setName(next_name);
+                }
+                vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[in_name], [](Tensor *) {})};
+                vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
+                op_->reshape(shared_inputs, shared_outputs);
+                op_->setUp(shared_inputs, shared_outputs);
+                if (Tensor::gph_[next_name].aggregated() == false) {
+                    assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+                }
+                break;
             }
-            auto next_name = layername_2_tensorname[layer_next_name];
-            if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) {
-                Tensor::gph_[next_name] = Tensor(backend_);
-                Tensor::gph_[next_name].setName(next_name);
-            }
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[in_name], [](Tensor *) {})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
-            op_->reshape(shared_inputs, shared_outputs);
-            op_->setUp(shared_inputs, shared_outputs);
-            if (Tensor::gph_[next_name].aggregated() == false) {
-                assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+            case TENSOR_STATIC_READY: {
+                auto next_name = layername_2_tensorname[layer_next_name];
+                assert(Tensor::gph_[input.name()].hostPtr<float>() != nullptr);
+                vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor *) {})};
+                vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
+                op_->execute(shared_inputs, shared_outputs);
+                if (Tensor::gph_[next_name].aggregated() == false) {
+                    assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+                }
+                break;
             }
-            break;
-        }
-        case TENSOR_STATIC_READY: {
-            auto next_name = layername_2_tensorname[layer_next_name];
-            assert(Tensor::gph_[input.name()].hostPtr<float>() != nullptr);
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor *) {})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
-            op_->execute(shared_inputs, shared_outputs);
-            if (Tensor::gph_[next_name].aggregated() == false) {
-                assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+            default: {
+                break;
             }
-            break;
-        }
-        default: {
-            break;
-        }
+            }
+            auto next_name = layername_2_tensorname[layer_next_name];
+            Tensor::gph_[next_name].status() = Tensor::gph_[input.name()].status();
+            // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
+            return Tensor::gph_[next_name];
         }
-        auto next_name = layername_2_tensorname[layer_next_name];
-        Tensor::gph_[next_name].status() = Tensor::gph_[input.name()].status();
-        // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
-        return Tensor::gph_[next_name];
     }
     Tensor &_2I1O_OP(Tensor &input0, Tensor &input1) {
         Module::runlistIdx = saved_list_idx;
-        if (op_ == nullptr) {
-            op_ = backend_->opCreate(param_, name_, cpu_thread);
-            op_->load(*Module::loader);
-        }
-
-        string layer_next_name = "out-" + op_->name();
-        if (Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) {
-            Tensor::gph_[input0.name()].status() = input0.status();
-        }
+        if (INIT_OP()) {
+            return input0;
+        } else {
+            string layer_next_name = "out-" + op_->name();
+            if (Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) {
+                Tensor::gph_[input0.name()].status() = input0.status();
+            }
 
-        if (Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
-            Tensor::gph_[input1.name()].status() = input0.status();
-        }
-        if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) && Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
-            assert(input0.status() == input1.status());
-        }
-        switch (input0.status()) {
-        case TENSOR_STATIC_INIT: {
-            if (Tensor::gph_.find(input0.name()) == Tensor::gph_.end()) {
-                Tensor::gph_[input0.name()] = input0;
-                Tensor::gph_[input0.name()].setName(input0.name());
+            if (Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
+                Tensor::gph_[input1.name()].status() = input0.status();
             }
-            if (Tensor::gph_.find(input1.name()) == Tensor::gph_.end()) {
-                Tensor::gph_[input1.name()] = input1;
-                Tensor::gph_[input1.name()].setName(input1.name());
+            if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) && Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
+                assert(input0.status() == input1.status());
+            }
+            switch (input0.status()) {
+            case TENSOR_STATIC_INIT: {
+                if (Tensor::gph_.find(input0.name()) == Tensor::gph_.end()) {
+                    Tensor::gph_[input0.name()] = input0;
+                    Tensor::gph_[input0.name()].setName(input0.name());
+                }
+                if (Tensor::gph_.find(input1.name()) == Tensor::gph_.end()) {
+                    Tensor::gph_[input1.name()] = input1;
+                    Tensor::gph_[input1.name()].setName(input1.name());
+                }
+                if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
+                    layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
+                }
+                auto next_name = layername_2_tensorname[layer_next_name];
+                if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) {
+                    Tensor::gph_[next_name] = Tensor(backend_);
+                    Tensor::gph_[next_name].setName(next_name);
+                }
+                vector<shared_ptr<Tensor>> shared_inputs{
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {})};
+                vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
+                op_->reshape(shared_inputs, shared_outputs);
+                op_->setUp(shared_inputs, shared_outputs);
+                assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+                break;
+            }
+            case TENSOR_STATIC_READY: {
+                auto next_name = layername_2_tensorname[layer_next_name];
+                vector<shared_ptr<Tensor>> shared_inputs{
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {})};
+                vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
+                op_->execute(shared_inputs, shared_outputs);
+                assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+                break;
+            }
+            default: {
+                break;
             }
-            if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
-                layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
             }
             auto next_name = layername_2_tensorname[layer_next_name];
-            if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) {
-                Tensor::gph_[next_name] = Tensor(backend_);
-                Tensor::gph_[next_name].setName(next_name);
-            }
-            vector<shared_ptr<Tensor>> shared_inputs{
-                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
-            op_->reshape(shared_inputs, shared_outputs);
-            op_->setUp(shared_inputs, shared_outputs);
-            assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
-            break;
-        }
-        case TENSOR_STATIC_READY: {
-            auto next_name = layername_2_tensorname[layer_next_name];
-            vector<shared_ptr<Tensor>> shared_inputs{
-                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
-            op_->execute(shared_inputs, shared_outputs);
-            assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
-            break;
-        }
-        default: {
-            break;
-        }
+            Tensor::gph_[next_name].status() = Tensor::gph_[input0.name()].status();
+            // Tensor::gph_[input0.name()].saveNData<float>(input0.name());
+            // Tensor::gph_[input1.name()].saveNData<float>(input1.name());
+            // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
+            return Tensor::gph_[next_name];
         }
-        auto next_name = layername_2_tensorname[layer_next_name];
-        Tensor::gph_[next_name].status() = Tensor::gph_[input0.name()].status();
-        // Tensor::gph_[input0.name()].saveNData<float>(input0.name());
-        // Tensor::gph_[input1.name()].saveNData<float>(input1.name());
-        // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
-        return Tensor::gph_[next_name];
     }
     Tensor &_3I1O_OP(Tensor &input0, Tensor &input1, Tensor &input2) {
         Module::runlistIdx = saved_list_idx;
-        if (op_ == nullptr) {
-            op_ = backend_->opCreate(param_, name_, cpu_thread);
-            op_->load(*Module::loader);
-        }
-
-        string layer_next_name = "out-" + op_->name();
-        if (Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) {
-            Tensor::gph_[input0.name()].status() = input0.status();
-        }
-        if (Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
-            Tensor::gph_[input1.name()].status() = input0.status();
-        }
-        if (Tensor::gph_.find(input2.name()) != Tensor::gph_.end()) {
-            Tensor::gph_[input2.name()].status() = input0.status();
-        }
-        if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) && Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
-            assert(input0.status() == input1.status());
-        }
-        if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) && Tensor::gph_.find(input2.name()) != Tensor::gph_.end()) {
-            assert(input0.status() == input2.status());
-        }
-        switch (input0.status()) {
-        case TENSOR_STATIC_INIT: {
-            if (Tensor::gph_.find(input0.name()) == Tensor::gph_.end()) {
-                Tensor::gph_[input0.name()] = input0;
-                Tensor::gph_[input0.name()].setName(input0.name());
+        if (INIT_OP()) {
+            return input0;
+        } else {
+            string layer_next_name = "out-" + op_->name();
+            if (Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) {
+                Tensor::gph_[input0.name()].status() = input0.status();
             }
-            if (Tensor::gph_.find(input1.name()) == Tensor::gph_.end()) {
-                Tensor::gph_[input1.name()] = input1;
-                Tensor::gph_[input1.name()].setName(input1.name());
+            if (Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
+                Tensor::gph_[input1.name()].status() = input0.status();
             }
-            if (Tensor::gph_.find(input2.name()) == Tensor::gph_.end()) {
-                Tensor::gph_[input2.name()] = input2;
-                Tensor::gph_[input2.name()].setName(input2.name());
+            if (Tensor::gph_.find(input2.name()) != Tensor::gph_.end()) {
+                Tensor::gph_[input2.name()].status() = input0.status();
             }
-            if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
-                layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
+            if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) && Tensor::gph_.find(input1.name()) != Tensor::gph_.end()) {
+                assert(input0.status() == input1.status());
             }
-            auto next_name = layername_2_tensorname[layer_next_name];
-            if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) {
-                Tensor::gph_[next_name] = Tensor(backend_);
-                Tensor::gph_[next_name].setName(next_name);
-            }
-            vector<shared_ptr<Tensor>> shared_inputs{
-                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input2.name()], [](Tensor *) {})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
-            op_->reshape(shared_inputs, shared_outputs);
-            op_->setUp(shared_inputs, shared_outputs);
-            assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
-            break;
-        }
-        case TENSOR_STATIC_READY: {
-            auto next_name = layername_2_tensorname[layer_next_name];
-            vector<shared_ptr<Tensor>> shared_inputs{
-                std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {}),
-                std::shared_ptr<Tensor>(&Tensor::gph_[input2.name()], [](Tensor *) {})};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
-            op_->execute(shared_inputs, shared_outputs);
-            assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
-            break;
-        }
-        default: {
-            break;
-        }
-        }
-        auto next_name = layername_2_tensorname[layer_next_name];
-        Tensor::gph_[next_name].status() = Tensor::gph_[input0.name()].status();
-        // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
-        return Tensor::gph_[next_name];
-    }
-    Tensor &_0I1O_OP() {
-        Module::runlistIdx = saved_list_idx;
-        if (op_ == nullptr) {
-            op_ = backend_->opCreate(param_, name_, cpu_thread);
-            op_->load(*Module::loader);
-        }
-        string layer_next_name = "param-" + op_->name();
-        switch (Module::tensor_status) {
-        case TENSOR_STATIC_INIT: {
-            if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
-                layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
+            if ((Tensor::gph_.find(input0.name()) != Tensor::gph_.end()) && Tensor::gph_.find(input2.name()) != Tensor::gph_.end()) {
+                assert(input0.status() == input2.status());
             }
-            auto next_name = layername_2_tensorname[layer_next_name];
-            if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) {
-                Tensor::gph_[next_name] = Tensor(backend_);
-                Tensor::gph_[next_name].setName(next_name);
-            }
-            vector<shared_ptr<Tensor>> shared_inputs{};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
-            op_->reshape(shared_inputs, shared_outputs);
-            op_->setUp(shared_inputs, shared_outputs);
-            if (Tensor::gph_[next_name].aggregated() == false) {
+            switch (input0.status()) {
+            case TENSOR_STATIC_INIT: {
+                if (Tensor::gph_.find(input0.name()) == Tensor::gph_.end()) {
+                    Tensor::gph_[input0.name()] = input0;
+                    Tensor::gph_[input0.name()].setName(input0.name());
+                }
+                if (Tensor::gph_.find(input1.name()) == Tensor::gph_.end()) {
+                    Tensor::gph_[input1.name()] = input1;
+                    Tensor::gph_[input1.name()].setName(input1.name());
+                }
+                if (Tensor::gph_.find(input2.name()) == Tensor::gph_.end()) {
+                    Tensor::gph_[input2.name()] = input2;
+                    Tensor::gph_[input2.name()].setName(input2.name());
+                }
+                if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
+                    layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
+                }
+                auto next_name = layername_2_tensorname[layer_next_name];
+                if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) {
+                    Tensor::gph_[next_name] = Tensor(backend_);
+                    Tensor::gph_[next_name].setName(next_name);
+                }
+                vector<shared_ptr<Tensor>> shared_inputs{
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {}),
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input2.name()], [](Tensor *) {})};
+                vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
+                op_->reshape(shared_inputs, shared_outputs);
+                op_->setUp(shared_inputs, shared_outputs);
                 assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+                break;
             }
-            break;
-        }
-        case TENSOR_STATIC_READY: {
-            auto next_name = layername_2_tensorname[layer_next_name];
-            vector<shared_ptr<Tensor>> shared_inputs{};
-            vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
-            op_->execute(shared_inputs, shared_outputs);
-            if (Tensor::gph_[next_name].aggregated() == false) {
+            case TENSOR_STATIC_READY: {
+                auto next_name = layername_2_tensorname[layer_next_name];
+                vector<shared_ptr<Tensor>> shared_inputs{
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input0.name()], [](Tensor *) {}),
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input1.name()], [](Tensor *) {}),
+                    std::shared_ptr<Tensor>(&Tensor::gph_[input2.name()], [](Tensor *) {})};
+                vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
+                op_->execute(shared_inputs, shared_outputs);
                 assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+                break;
             }
-            break;
-        }
-        default: {
-            break;
-        }
+            default: {
+                break;
+            }
+            }
+            auto next_name = layername_2_tensorname[layer_next_name];
+            Tensor::gph_[next_name].status() = Tensor::gph_[input0.name()].status();
+            // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
+            return Tensor::gph_[next_name];
         }
-        auto next_name = layername_2_tensorname[layer_next_name];
-        Tensor::gph_[next_name].status() = Module::tensor_status;
-        // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
-        return Tensor::gph_[next_name];
     }
-    vector<Tensor> _1INO_OP(Tensor &input, int N) {
+    Tensor &_0I1O_OP() {
         Module::runlistIdx = saved_list_idx;
-        if (op_ == nullptr) {
-            op_ = backend_->opCreate(param_, name_, cpu_thread);
-            op_->load(*Module::loader);
-        }
-        if (Tensor::gph_.find(input.name()) != Tensor::gph_.end()) {
-            Tensor::gph_[input.name()].status() = input.status();
-        }
-
-        vector<string> layer_next_names = {};
-        for (int i = 0; i < N; ++i) {
-            layer_next_names.push_back("out-" + op_->name() + "-" + std::to_string(i));
-        }
-        switch (input.status()) {
-        case TENSOR_STATIC_INIT: {
-            if (Tensor::gph_.find(input.name()) == Tensor::gph_.end()) {
-                Tensor::gph_[input.name()] = input;
-                Tensor::gph_[input.name()].setName(input.name());
-            } else if (input.count() != Tensor::gph_[input.name()].count()) {
-                Tensor::gph_[input.name()] = input;
-                Tensor::gph_[input.name()].setName(input.name());
-            }
-            vector<shared_ptr<Tensor>> shared_outputs = {};
-            vector<string> next_names = {};
-            for (const auto &layer_next_name : layer_next_names) {
+        if (INIT_OP()) {
+            return Tensor::gph_["0"];
+        } else {
+            string layer_next_name = "param-" + op_->name();
+            switch (Module::tensor_status) {
+            case TENSOR_STATIC_INIT: {
                 if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) {
                     layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
                 }
@@ -378,44 +317,107 @@ class Layer {
                     Tensor::gph_[next_name] = Tensor(backend_);
                     Tensor::gph_[next_name].setName(next_name);
                 }
-                next_names.push_back(next_name);
-                shared_outputs.push_back(std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {}));
+                vector<shared_ptr<Tensor>> shared_inputs{};
+                vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
+                op_->reshape(shared_inputs, shared_outputs);
+                op_->setUp(shared_inputs, shared_outputs);
+                if (Tensor::gph_[next_name].aggregated() == false) {
+                    assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+                }
+                break;
             }
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor *) {})};
-            op_->reshape(shared_inputs, shared_outputs);
-            op_->setUp(shared_inputs, shared_outputs);
-            break;
-        }
-        case TENSOR_STATIC_READY: {
-            vector<shared_ptr<Tensor>> shared_outputs = {};
-            vector<string> next_names = {};
-            for (const auto &layer_next_name : layer_next_names) {
+            case TENSOR_STATIC_READY: {
                 auto next_name = layername_2_tensorname[layer_next_name];
-                next_names.push_back(next_name);
-                shared_outputs.push_back(std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {}));
+                vector<shared_ptr<Tensor>> shared_inputs{};
+                vector<shared_ptr<Tensor>> shared_outputs{std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})};
+                op_->execute(shared_inputs, shared_outputs);
+                if (Tensor::gph_[next_name].aggregated() == false) {
+                    assert(Tensor::gph_[next_name].hostPtr<float>() != nullptr);
+                }
+                break;
             }
-            if (Tensor::gph_[input.name()].aggregated() == false) {
-                assert(Tensor::gph_[input.name()].hostPtr<float>() != nullptr);
+            default: {
+                break;
             }
-            vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor *) {})};
-            op_->execute(shared_inputs, shared_outputs);
-            for (int i = 0; i < shared_outputs.size(); ++i) {
-                assert(Tensor::gph_[next_names[i]].hostPtr<float>() != nullptr);
             }
-            break;
-        }
-        default: {
-            break;
-        }
-        }
-        vector<Tensor> output_result = {};
-        for (const auto &layer_next_name : layer_next_names) {
             auto next_name = layername_2_tensorname[layer_next_name];
-            Tensor::gph_[next_name].status() = Tensor::gph_[input.name()].status();
+            Tensor::gph_[next_name].status() = Module::tensor_status;
             // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
-            output_result.push_back(Tensor::gph_[next_name]);
+            return Tensor::gph_[next_name];
+        }
+    }
+    vector<Tensor> _1INO_OP(Tensor &input, int N) {
+        Module::runlistIdx = saved_list_idx;
+        if (INIT_OP()) {
+            return {input};
+        } else {
+            if (Tensor::gph_.find(input.name()) != Tensor::gph_.end()) {
+                Tensor::gph_[input.name()].status() = input.status();
+            }
+
+            vector<string> layer_next_names = {}; // one entry per requested output, filled by the loop below
+            for (int i = 0; i < N; ++i) {
+                layer_next_names.push_back("out-" + op_->name() + "-" + std::to_string(i)); // "out-<op name>-<i>"
+            }
+            switch (input.status()) { // the input's status picks the phase: INIT -> reshape + setUp, READY -> execute
+            case TENSOR_STATIC_INIT: {
+                if (Tensor::gph_.find(input.name()) == Tensor::gph_.end()) {
+                    Tensor::gph_[input.name()] = input;
+                    Tensor::gph_[input.name()].setName(input.name());
+                } else if (input.count() != Tensor::gph_[input.name()].count()) { // already stored, but count() differs: overwrite the stored copy
+                    Tensor::gph_[input.name()] = input;
+                    Tensor::gph_[input.name()].setName(input.name());
+                }
+                vector<shared_ptr<Tensor>> shared_outputs = {};
+                vector<string> next_names = {};
+                for (const auto &layer_next_name : layer_next_names) {
+                    if (layername_2_tensorname.find(layer_next_name) == layername_2_tensorname.end()) { // first sighting: record the tensor name produced by name_num_to_X()
+                        layername_2_tensorname[layer_next_name] = name_num_to_X(layer_next_name);
+                    }
+                    auto next_name = layername_2_tensorname[layer_next_name];
+                    if (Tensor::gph_.find(next_name) == Tensor::gph_.end()) { // create the output tensor in gph_ on first use
+                        Tensor::gph_[next_name] = Tensor(backend_);
+                        Tensor::gph_[next_name].setName(next_name);
+                    }
+                    next_names.push_back(next_name);
+                    shared_outputs.push_back(std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {})); // no-op deleter: the shared_ptr never deletes the gph_ entry
+                }
+                vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor *) {})};
+                op_->reshape(shared_inputs, shared_outputs);
+                op_->setUp(shared_inputs, shared_outputs);
+                break;
+            }
+            case TENSOR_STATIC_READY: {
+                vector<shared_ptr<Tensor>> shared_outputs = {};
+                vector<string> next_names = {};
+                for (const auto &layer_next_name : layer_next_names) {
+                    auto next_name = layername_2_tensorname[layer_next_name]; // plain operator[] lookup, without the find() guard used in the INIT case
+                    next_names.push_back(next_name);
+                    shared_outputs.push_back(std::shared_ptr<Tensor>(&Tensor::gph_[next_name], [](Tensor *) {}));
+                }
+                if (Tensor::gph_[input.name()].aggregated() == false) { // the host-pointer check is skipped for aggregated tensors
+                    assert(Tensor::gph_[input.name()].hostPtr<float>() != nullptr);
+                }
+                vector<shared_ptr<Tensor>> shared_inputs{std::shared_ptr<Tensor>(&Tensor::gph_[input.name()], [](Tensor *) {})};
+                op_->execute(shared_inputs, shared_outputs);
+                for (int i = 0; i < shared_outputs.size(); ++i) { // NOTE(review): int compared against the unsigned size()
+                    assert(Tensor::gph_[next_names[i]].hostPtr<float>() != nullptr);
+                }
+                break;
+            }
+            default: {
+                break;
+            }
+            }
+            vector<Tensor> output_result = {};
+            for (const auto &layer_next_name : layer_next_names) {
+                auto next_name = layername_2_tensorname[layer_next_name];
+                Tensor::gph_[next_name].status() = Tensor::gph_[input.name()].status(); // each output takes the status stored in gph_ for the input
+                // Tensor::gph_[next_name].saveNData<float>(layer_next_name);
+                output_result.push_back(Tensor::gph_[next_name]); // the returned vector holds copies of the gph_ entries
+            }
+            return output_result;
         }
-        return output_result;
     }
 
     std::string name_;
diff --git a/src/Module.cpp b/src/Module.cpp
index c902f630..f6d489b8 100644
--- a/src/Module.cpp
+++ b/src/Module.cpp
@@ -11,5 +11,5 @@ ParamLoader *Module::loader;
 int Module::listIdx;
 int Module::runlistIdx;
 TensorStatus Module::tensor_status;
-
+bool Module::doLoad = false; // starts off; Module::load() sets it to true around its dry run and back to false afterwards
 } // namespace mllm
\ No newline at end of file
diff --git a/src/Module.hpp b/src/Module.hpp
index 8b06a230..e85f7624 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -21,6 +21,7 @@ class Module {
     static map<BackendType, Backend *> backends;
     static ParamLoader *loader;
     static TensorStatus tensor_status;
+    static bool doLoad;
 
     Module() = default;
     virtual ~Module() = default;
@@ -48,6 +49,16 @@ class Module {
 
     void load(string path) {
         initLoader(path);
+        Module::doLoad = true; // while set, operator() returns Forward(inputs, {}) directly and the guarded Tensor functions return early
+        vector<Tensor> tmps;
+        int max_in_size = 5; // NOTE(review): hard-coded number of placeholder inputs — presumably an upper bound on what Forward() reads; confirm
+        for (int i = 0; i < max_in_size; ++i) {
+            Tensor::gph_[std::to_string(i)] = Tensor(); // default-constructed placeholder stored under the key "0", "1", ...
+            tmps.push_back(Tensor::gph_[std::to_string(i)]); // a copy of the placeholder becomes the i-th dummy input
+        }
+        operator()(tmps, 0); // dry run; with doLoad set, the trailing 0 is not forwarded to Forward()
+        Module::doLoad = false; // NOTE(review): not reached if the dry run throws, which would leave doLoad set
+        Tensor::gph_.clear(); // empties the whole map: the placeholders and anything registered during the dry run
     }
 
     virtual vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) = 0;
@@ -58,6 +69,9 @@ class Module {
     }
     template <typename... Args>
     vector<Tensor> operator()(vector<Tensor> inputs, Args... args) {
+        if(doLoad) {
+            return Forward(inputs, {});
+        }
         vector<std::any> anyArgs = convertArgsToAnyVector(args...);
         if (inputs[0].ttype() == TensorType::INPUT_TENSOR) {
             for (auto &input : inputs) {
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index 527d367a..fdbbcb72 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -27,10 +27,10 @@ Tensor::Tensor(const vector<int> &shape) :
 
 bool Tensor::reshape(const int batch, const int head, const int sequence, const int dimension) {
     vector<int> shape(4);
-    shape[chls_[BATCH]] = batch;
-    shape[chls_[HEAD]] = head;
-    shape[chls_[SEQUENCE]] = sequence;
-    shape[chls_[DIMENSION]] = dimension;
+    shape[chls()[BATCH]] = batch;
+    shape[chls()[HEAD]] = head;
+    shape[chls()[SEQUENCE]] = sequence;
+    shape[chls()[DIMENSION]] = dimension;
 
     // shape[0] = batch;
     // switch (ctype_) {
@@ -54,10 +54,10 @@ bool Tensor::reshape(const int batch, const int head, const int sequence, const
     // }
 
     // vector<int> shape1(4);
-    // shape1[chls_[BATCH]] = batch;
-    // shape1[chls_[HEAD]] = head;
-    // shape1[chls_[SEQUENCE]] = sequence;
-    // shape1[chls_[DIMENSION]] = dimension;
+    // shape1[chls()[BATCH]] = batch;
+    // shape1[chls()[HEAD]] = head;
+    // shape1[chls()[SEQUENCE]] = sequence;
+    // shape1[chls()[DIMENSION]] = dimension;
     // bool isSame = std::equal(shape.begin(), shape.end(), shape1.begin());
     // if(!isSame) {
     //     std::cout<<"";
@@ -92,11 +92,11 @@ bool Tensor::reshape(const int batch, const int channel, const int time, const i
         ctype_ = BCTHW;
     }
     vector<int> shape(5);
-    shape[chls_[BATCH]] = batch;
-    shape[chls_[CHANNLE]] = channel;
-    shape[chls_[TIME]] = time;
-    shape[chls_[HEIGHT]] = height;
-    shape[chls_[WIDTH]] = width;
+    shape[chls()[BATCH]] = batch;
+    shape[chls()[CHANNLE]] = channel;
+    shape[chls()[TIME]] = time;
+    shape[chls()[HEIGHT]] = height;
+    shape[chls()[WIDTH]] = width;
     return reshape(shape);
     // if (ctype_ != BTHWC) {
     //     ctype_ = BCTHW;
@@ -122,6 +122,7 @@ map<string, Tensor> Tensor::gph_;
 
 template <typename Func>
 Tensor &Tensor::binaryCompute(Func operation, string append_s, float data) {
+    if(Module::doLoad){return *this;}
     const std::string next_name = name_ + append_s;
     switch (status_) {
     case TENSOR_DYNAMIC: {
@@ -179,6 +180,7 @@ Tensor &Tensor::operator/(double data) {
 }
 template <typename Func>
 Tensor &Tensor::binaryTwoCompute(Func operation, string append_s, Tensor& other) {
+    if(Module::doLoad){return *this;}
     const std::string next_name = name_ + append_s;
     switch (status_) {
     case TENSOR_DYNAMIC: {
@@ -233,6 +235,7 @@ Tensor& Tensor::operator/(Tensor& other){
 }
 
 Tensor& Tensor::mean(Chl axis) {
+    if(Module::doLoad){return *this;}
     const std::string next_name = name_ + "-mean";
     switch (status_) {
     case TENSOR_DYNAMIC: {
@@ -265,6 +268,7 @@ Tensor& Tensor::mean(Chl axis) {
 }
 
 Tensor& Tensor::view(int b, int h, int s, int d) {
+    if(Module::doLoad){return *this;}
     const std::string next_name = name_ + "-view";
     switch (status_) {
     case TENSOR_DYNAMIC: {
@@ -297,6 +301,7 @@ Tensor& Tensor::view(int b, int h, int s, int d) {
 }
 
 Tensor& Tensor::flatten(Chl axis_start, Chl axis_end) {
+    if(Module::doLoad){return *this;}
     const std::string next_name = name_ + "-flatten";
     switch (status_) {
     case TENSOR_DYNAMIC: {
@@ -332,6 +337,7 @@ Tensor &Tensor::transpose(Chl axis0, Chl axis1) {
     return transpose({{axis0, axis1}});
 }
 Tensor &Tensor::transpose(vector<std::pair<Chl, Chl>> axiss) {
+    if(Module::doLoad){return *this;}
     const std::string next_name = name_ + "-transpose";
     if (next_name.find(".X.") != std::string::npos && Module::runlistIdx > 0) {}
     else {
@@ -358,10 +364,10 @@ Tensor &Tensor::transpose(vector<std::pair<Chl, Chl>> axiss) {
                 for (auto axis : axiss) {
                     auto axis0 = axis.first;
                     auto axis1 = axis.second;
-                    auto ori_0_idx = gph_[next_name].chls_[axis0];
-                    auto ori_1_idx = gph_[next_name].chls_[axis1];
-                    gph_[next_name].chls_[axis0] = ori_1_idx;
-                    gph_[next_name].chls_[axis1] = ori_0_idx;
+                    auto ori_0_idx = gph_[next_name].chls()[axis0];
+                    auto ori_1_idx = gph_[next_name].chls()[axis1];
+                    gph_[next_name].chls()[axis0] = ori_1_idx;
+                    gph_[next_name].chls()[axis1] = ori_0_idx;
                 }
                 gph_[next_name].changeCtype(gph_[name_].shape().size());
                 gph_[next_name].undiffusion_ = true;
@@ -398,6 +404,7 @@ Tensor &Tensor::transpose(vector<std::pair<Chl, Chl>> axiss) {
 }
 
 Tensor &Tensor::clip(vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
+    if(Module::doLoad){return *this;}
     const std::string next_name = name_ + "-clip";
     switch (status_) {
     case TENSOR_DYNAMIC: {
@@ -429,8 +436,8 @@ Tensor &Tensor::clip(vector<int> b, vector<int> h, vector<int> s, vector<int> d)
     return gph_[next_name];
 }
 
-
 Tensor &Tensor::clip(Chl keep_axis, vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
+    if(Module::doLoad){return *this;}
     const std::string next_name = name_ + "-clip";
     switch (status_) {
     case TENSOR_DYNAMIC: {
@@ -463,6 +470,7 @@ Tensor &Tensor::clip(Chl keep_axis, vector<int> b, vector<int> h, vector<int> s,
 }
 
 Tensor &Tensor::cat(vector<Tensor> input_tensors, Chl axis) {
+    if(Module::doLoad){return Tensor::gph_["0"];}
     const std::string next_name = input_tensors[0].name() + "-cat";
     int expd_batch_ = input_tensors[0].batch();
     int expd_batch_input_idx = 0;
@@ -503,6 +511,7 @@ Tensor &Tensor::cat(vector<Tensor> input_tensors, Chl axis) {
 }
 
 Tensor &Tensor::mm(Tensor& input0, Tensor& input1) {
+    if(Module::doLoad){return Tensor::gph_["0"];}
     const std::string next_name = input0.name() + "-mm-" + input1.name();
     switch (input0.status()) {
     case TENSOR_DYNAMIC: {
@@ -534,6 +543,7 @@ Tensor &Tensor::mm(Tensor& input0, Tensor& input1) {
 }
 
 Tensor& Tensor::norm(int L_n) {
+    if(Module::doLoad){return *this;}
     assert(L_n ==1 || L_n ==2);
     const std::string next_name = name_ + "-norm";
     switch (status_) {
@@ -564,7 +574,9 @@ Tensor& Tensor::norm(int L_n) {
     gph_[next_name].status() = status_;
     return gph_[next_name];
 }
+
 Tensor& Tensor::where(float value, Chl axis) {
+    if(Module::doLoad){return *this;}
     const std::string next_name = name_ + "-where";
     switch (status_) {
     case TENSOR_DYNAMIC: {
@@ -596,6 +608,7 @@ Tensor& Tensor::where(float value, Chl axis) {
 }
 
 Tensor& Tensor::range(int start, int end) {
+    if(Module::doLoad){return Tensor::gph_["0"];}
     static int range_name_idx = 0;
     const std::string next_name = "range" + std::to_string(range_name_idx);
     switch (Module::tensor_status) {
diff --git a/src/Tensor.hpp b/src/Tensor.hpp
index 4dc270e9..c2b2b75b 100644
--- a/src/Tensor.hpp
+++ b/src/Tensor.hpp
@@ -68,11 +68,12 @@ class Tensor {
         }
     }
     static map<string, Tensor> gph_;
-
-    std::map<Chl, int> chls_ = {{BATCH, 0}, {SEQUENCE, 1}, {HEAD, 2}, {DIMENSION, 3},
-                                {CHANNLE, 1}, {TIME, 2}, {HEIGHT, 3}, {WIDTH, 4}};
-
+    std::map<Chl, int>& chls() { // returns a mutable reference to chls_, the Chl -> axis-index map that is now private
+        return chls_;
+    }
 private:
+    std::map<Chl, int> chls_={{BATCH, 0}, {SEQUENCE, 1}, {HEAD, 2}, {DIMENSION, 3},
+                                {CHANNLE, 1}, {TIME, 2}, {HEIGHT, 3}, {WIDTH, 4}};
     string name_;
     DataType dtype_;
     ChlType ctype_ = BSHD;
@@ -220,16 +221,16 @@ class Tensor {
      */
 
     int batch() {
-        return legacyShape(chls_[BATCH]);
+        return legacyShape(chls()[BATCH]); // passes the axis index currently mapped to BATCH to legacyShape()
     }
     }
     int head() {
-        return legacyShape(chls_[HEAD]);
+        return legacyShape(chls()[HEAD]); // passes the axis index currently mapped to HEAD to legacyShape()
     }
     }
     int sequence() {
-        return legacyShape(chls_[SEQUENCE]);
+        return legacyShape(chls()[SEQUENCE]); // passes the axis index currently mapped to SEQUENCE to legacyShape()
     }
     }
     int dimension() {
-        return legacyShape(chls_[DIMENSION]);
+        return legacyShape(chls()[DIMENSION]); // passes the axis index currently mapped to DIMENSION to legacyShape()
     }
     }
 
     /**
@@ -499,36 +500,36 @@ class Tensor {
         ctype_ = type;
         switch (ctype_) {
         case BSHD:
-            chls_[BATCH] = 0;
-            chls_[SEQUENCE] = 1;
-            chls_[HEAD] = 2;
-            chls_[DIMENSION] = 3;
+            chls()[BATCH] = 0;
+            chls()[SEQUENCE] = 1;
+            chls()[HEAD] = 2;
+            chls()[DIMENSION] = 3;
             break;
         case BHDS:
-            chls_[BATCH] = 0;
-            chls_[HEAD] = 1;
-            chls_[DIMENSION] = 2;
-            chls_[SEQUENCE] = 3;
+            chls()[BATCH] = 0;
+            chls()[HEAD] = 1;
+            chls()[DIMENSION] = 2;
+            chls()[SEQUENCE] = 3;
             break;
         case SBHD:
-            chls_[SEQUENCE] = 0;
-            chls_[BATCH] = 1;
-            chls_[HEAD] = 2;
-            chls_[DIMENSION] = 3;
+            chls()[SEQUENCE] = 0;
+            chls()[BATCH] = 1;
+            chls()[HEAD] = 2;
+            chls()[DIMENSION] = 3;
             break;
         case BTHWC:
-            chls_[BATCH] = 0;
-            chls_[TIME] = 1;
-            chls_[HEIGHT] = 2;
-            chls_[WIDTH] = 3;
-            chls_[CHANNLE] = 3;
+            chls()[BATCH] = 0;
+            chls()[TIME] = 1;
+            chls()[HEIGHT] = 2;
+            chls()[WIDTH] = 3;
+            chls()[CHANNLE] = 3; // NOTE(review): same index as WIDTH on the line above; a five-axis B-T-H-W-C order would put CHANNLE at 4 — confirm
             break;
         case BCTHW:
-            chls_[BATCH] = 0;
-            chls_[CHANNLE] = 1;
-            chls_[TIME] = 2;
-            chls_[HEIGHT] = 3;
-            chls_[WIDTH] = 3;
+            chls()[BATCH] = 0;
+            chls()[CHANNLE] = 1;
+            chls()[TIME] = 2;
+            chls()[HEIGHT] = 3;
+            chls()[WIDTH] = 3; // NOTE(review): same index as HEIGHT on the line above; the chls_ default initializer maps WIDTH to 4 — confirm
             break;
         default:
             break;
@@ -577,12 +578,12 @@ class Tensor {
             auto d = dimension();
             auto s = sequence();
             ctype_ = BHDS;
-            auto ori_seq_idx = chls_[SEQUENCE];
-            auto ori_head_idx = chls_[HEAD];
-            auto ori_dim_idx = chls_[DIMENSION];
-            chls_[HEAD] = ori_seq_idx;
-            chls_[DIMENSION] = ori_head_idx;
-            chls_[SEQUENCE] = ori_dim_idx;
+            auto ori_seq_idx = chls()[SEQUENCE];
+            auto ori_head_idx = chls()[HEAD];
+            auto ori_dim_idx = chls()[DIMENSION];
+            chls()[HEAD] = ori_seq_idx;
+            chls()[DIMENSION] = ori_head_idx;
+            chls()[SEQUENCE] = ori_dim_idx;
             reshape(b, h, s, d);
             transed_ = true;
             undiffusion_ = undiffusion;
@@ -592,12 +593,12 @@ class Tensor {
             auto d = dimension();
             auto s = sequence();
             ctype_ = BSHD;
-            auto ori_seq_idx = chls_[SEQUENCE];
-            auto ori_head_idx = chls_[HEAD];
-            auto ori_dim_idx = chls_[DIMENSION];
-            chls_[SEQUENCE] = ori_head_idx;
-            chls_[HEAD] = ori_dim_idx;
-            chls_[DIMENSION] = ori_seq_idx;
+            auto ori_seq_idx = chls()[SEQUENCE];
+            auto ori_head_idx = chls()[HEAD];
+            auto ori_dim_idx = chls()[DIMENSION];
+            chls()[SEQUENCE] = ori_head_idx;
+            chls()[HEAD] = ori_dim_idx;
+            chls()[DIMENSION] = ori_seq_idx;
             reshape(b, h, s, d);
             transed_ = false;
             undiffusion_ = undiffusion;
@@ -608,14 +609,14 @@ class Tensor {
             auto h = height();
             auto w = width();
             ctype_ = BTHWC;
-            auto ori_chl_idx = chls_[CHANNLE];
-            auto ori_time_idx = chls_[TIME];
-            auto ori_height_idx = chls_[HEIGHT];
-            auto ori_width_idx = chls_[WIDTH];
-            chls_[TIME] = ori_chl_idx;
-            chls_[HEIGHT] = ori_time_idx;
-            chls_[WIDTH] = ori_height_idx;
-            chls_[CHANNLE] = ori_width_idx;
+            auto ori_chl_idx = chls()[CHANNLE];
+            auto ori_time_idx = chls()[TIME];
+            auto ori_height_idx = chls()[HEIGHT];
+            auto ori_width_idx = chls()[WIDTH];
+            chls()[TIME] = ori_chl_idx;
+            chls()[HEIGHT] = ori_time_idx;
+            chls()[WIDTH] = ori_height_idx;
+            chls()[CHANNLE] = ori_width_idx;
             reshape(b, c, t, h, w);
             transed_ = true;
             undiffusion_ = undiffusion;
@@ -625,10 +626,10 @@ class Tensor {
             auto d = dimension();
             auto s = sequence();
             ctype_ = SBHD;
-            auto ori_batch_idx = chls_[BATCH];
-            auto ori_seq_idx = chls_[SEQUENCE];
-            chls_[SEQUENCE] = ori_batch_idx;
-            chls_[BATCH] = ori_seq_idx;
+            auto ori_batch_idx = chls()[BATCH];
+            auto ori_seq_idx = chls()[SEQUENCE];
+            chls()[SEQUENCE] = ori_batch_idx;
+            chls()[BATCH] = ori_seq_idx;
             reshape(b, h, s, d);
             transed_ = true;
             undiffusion_ = undiffusion;
@@ -675,10 +676,10 @@ class Tensor {
             size = shape().size();
         }
         if(size == 4) {
-            vector<int> a = {chls_[BATCH] , chls_[HEAD] , chls_[SEQUENCE] , chls_[DIMENSION]};
+            vector<int> a = {chls()[BATCH] , chls()[HEAD] , chls()[SEQUENCE] , chls()[DIMENSION]};
             ctype_ = Chls2Type[a];
         }else {
-            vector<int> a = {chls_[BATCH] , chls_[TIME] , chls_[HEIGHT] , chls_[WIDTH] , chls_[CHANNLE]};
+            vector<int> a = {chls()[BATCH] , chls()[TIME] , chls()[HEIGHT] , chls()[WIDTH] , chls()[CHANNLE]};
             ctype_ = Chls2Type[a];
         }
     }
@@ -782,10 +783,10 @@ class Tensor {
                 auto tf = trans_from_[i];
                 auto axis0 = tf.first;
                 auto axis1 = tf.second;
-                auto ori_0_idx = child_tensors_[0]->chls_[axis0];
-                auto ori_1_idx = child_tensors_[0]->chls_[axis1];
-                child_tensors_[0]->chls_[axis0] = ori_1_idx;
-                child_tensors_[0]->chls_[axis1] = ori_0_idx;
+                auto ori_0_idx = child_tensors_[0]->chls()[axis0];
+                auto ori_1_idx = child_tensors_[0]->chls()[axis1];
+                child_tensors_[0]->chls()[axis0] = ori_1_idx;
+                child_tensors_[0]->chls()[axis1] = ori_0_idx;
             }
             changeCtype();
             child_tensors_[0]->changeCtype();
@@ -808,10 +809,10 @@ class Tensor {
                 auto tf = trans_from_[i];
                 auto axis0 = tf.first;
                 auto axis1 = tf.second;
-                auto ori_0_idx = child_tensors_[0]->chls_[axis0];
-                auto ori_1_idx = child_tensors_[0]->chls_[axis1];
-                child_tensors_[0]->chls_[axis0] = ori_1_idx;
-                child_tensors_[0]->chls_[axis1] = ori_0_idx;
+                auto ori_0_idx = child_tensors_[0]->chls()[axis0];
+                auto ori_1_idx = child_tensors_[0]->chls()[axis1];
+                child_tensors_[0]->chls()[axis0] = ori_1_idx;
+                child_tensors_[0]->chls()[axis1] = ori_0_idx;
             }
             // chls_ ={{BATCH, 0}, {CHANNLE, 1}, {TIME, 2}, {HEIGHT, 3}, {WIDTH, 4}};
             // child_tensors_[0]->chls_ = {{BATCH, 0}, {CHANNLE, 4}, {TIME, 1}, {HEIGHT, 2}, {WIDTH, 3}};
@@ -833,7 +834,7 @@ class Tensor {
             shape_offset_ = shape_offset;
             shape_master_ = {source->batch(), source->head(), source->sequence(), source->dimension()};
             if (!std::equal(source->chls_.begin(), source->chls_.end(), chls_.begin())) {
-                if(chls_[SEQUENCE] == source->chls_[DIMENSION] && source->chls_[SEQUENCE] == chls_[DIMENSION]) {
+                if(chls()[SEQUENCE] == source->chls()[DIMENSION] && source->chls()[SEQUENCE] == chls()[DIMENSION]) {
                     shape_master_ = {source->batch(), source->head(), source->dimension(), source->sequence()};
                     shape_offset_ = {shape_offset[0], shape_offset[1], shape_offset[3], shape_offset[2]};
                 } else {
@@ -984,7 +985,7 @@ class Tensor {
      */
     int channel() {
         assert(shape().size() == 5);
-        return legacyShape(chls_[CHANNLE]);
+        return legacyShape(chls()[CHANNLE]);
         // switch (ctype_) {
         // case BCTHW:
         //     return legacyShape(1);
@@ -995,7 +996,7 @@ class Tensor {
     }
     int time() {
         assert(shape().size() == 5);
-        return legacyShape(chls_[TIME]);
+        return legacyShape(chls()[TIME]); // NOTE(review): unconditional return — the switch that follows can never run
         switch (ctype_) {
         case BCTHW:
             return legacyShape(2);
@@ -1006,7 +1007,7 @@ class Tensor {
     }
     int height()  {
         assert(shape().size() == 5);
-        return legacyShape(chls_[HEIGHT]);
+        return legacyShape(chls()[HEIGHT]);
         // switch (ctype_) {
         // case BCTHW:
         //     return legacyShape(3);
@@ -1017,7 +1018,7 @@ class Tensor {
     }
     int width()  {
         assert(shape().size() == 5);
-        return legacyShape(chls_[WIDTH]);
+        return legacyShape(chls()[WIDTH]);
         // switch (ctype_) {
         // case BCTHW:
         //     return legacyShape(4);
diff --git a/src/backends/cpu/CPUTensorFunction.hpp b/src/backends/cpu/CPUTensorFunction.hpp
index e6a4f077..caf74d97 100644
--- a/src/backends/cpu/CPUTensorFunction.hpp
+++ b/src/backends/cpu/CPUTensorFunction.hpp
@@ -19,12 +19,12 @@ class CPUmmFunction {
         auto h = input.head();
         auto d = input.dimension();
         auto s = input.sequence();
-        auto ori_seq_idx = input.chls_[SEQUENCE];
-        auto ori_head_idx = input.chls_[HEAD];
-        auto ori_dim_idx = input.chls_[DIMENSION];
-        input.chls_[HEAD] = ori_seq_idx;
-        input.chls_[DIMENSION] = ori_head_idx;
-        input.chls_[SEQUENCE] = ori_dim_idx;
+        auto ori_seq_idx = input.chls()[SEQUENCE];
+        auto ori_head_idx = input.chls()[HEAD];
+        auto ori_dim_idx = input.chls()[DIMENSION];
+        input.chls()[HEAD] = ori_seq_idx;
+        input.chls()[DIMENSION] = ori_head_idx;
+        input.chls()[SEQUENCE] = ori_dim_idx;
         input.changeCtype();
         input.reshape(b, h, s, d);
         input.transed() = true;
@@ -35,7 +35,7 @@ class CPUmmFunction {
             auto h = input.masterTensor()->head();
             auto d = input.masterTensor()->dimension();
             auto s = input.masterTensor()->sequence();
-            input.masterTensor()->chls_ = input.chls_;
+            input.masterTensor()->chls() = input.chls();
             input.masterTensor()->changeCtype();
             input.masterTensor()->reshape(b, h, s, d);
             for (auto child : input.masterTensor()->childTensors()) {
@@ -43,7 +43,7 @@ class CPUmmFunction {
                 auto h = child->head();
                 auto d = child->dimension();
                 auto s = child->sequence();
-                child->chls_ = input.chls_;
+                child->chls() = input.chls();
                 child->changeCtype();
                 child->reshape(b, h, s, d);
             }
@@ -51,7 +51,7 @@ class CPUmmFunction {
     }
 public:
     static void reshape(Tensor &input0, Tensor &input1, Tensor &output) {
-        if(input1.chls_[SEQUENCE] != 3) {
+        if(input1.chls()[SEQUENCE] != 3) {
             tranTensorChl(input1);
         }
         assert(input0.dimension() == input1.sequence());
@@ -64,7 +64,7 @@ class CPUmmFunction {
         output.alloc();
     }
     static void execute(Tensor &input0, Tensor &input1, Tensor &output) {
-        bool isSame = std::equal(input0.chls_.begin(), input0.chls_.end(), input1.chls_.begin());
+        bool isSame = std::equal(input0.chls().begin(), input0.chls().end(), input1.chls().begin());
         assert(input0.dtype() == MLLM_TYPE_F32);
         switch (input1.dtype()) {
         case MLLM_TYPE_F32: {

From 9157339b080387ed16fb8f1e170b527fba24871e Mon Sep 17 00:00:00 2001
From: yirongjie <yirj0809@gmail.com>
Date: Fri, 15 Mar 2024 01:46:22 +0000
Subject: [PATCH 3/6] fix: load inputs

---
 src/Layer.hpp  | 6 +++++-
 src/Module.hpp | 7 ++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/Layer.hpp b/src/Layer.hpp
index ba6b7fe6..7a1e03dd 100644
--- a/src/Layer.hpp
+++ b/src/Layer.hpp
@@ -349,7 +349,11 @@ class Layer {
     vector<Tensor> _1INO_OP(Tensor &input, int N) {
         Module::runlistIdx = saved_list_idx;
         if (INIT_OP()) {
-            return {input};
+            vector<Tensor> out;
+            for (int i = 0; i < N; ++i) {
+                out.push_back(input);
+            }
+            return out;
         } else {
             if (Tensor::gph_.find(input.name()) != Tensor::gph_.end()) {
                 Tensor::gph_[input.name()].status() = input.status();
diff --git a/src/Module.hpp b/src/Module.hpp
index e85f7624..16581295 100644
--- a/src/Module.hpp
+++ b/src/Module.hpp
@@ -56,7 +56,8 @@ class Module {
             Tensor::gph_[std::to_string(i)] = Tensor();
             tmps.push_back(Tensor::gph_[std::to_string(i)]);
         }
-        operator()(tmps, 0);
+        vector<int> tmpt = {0, 0};
+        operator()(tmps, tmpt);
         Module::doLoad = false;
         Tensor::gph_.clear();
     }
@@ -69,10 +70,10 @@ class Module {
     }
     template <typename... Args>
     vector<Tensor> operator()(vector<Tensor> inputs, Args... args) {
+        vector<std::any> anyArgs = convertArgsToAnyVector(args...);
         if(doLoad) {
-            return Forward(inputs, {});
+            return Forward(inputs, anyArgs);
         }
-        vector<std::any> anyArgs = convertArgsToAnyVector(args...);
         if (inputs[0].ttype() == TensorType::INPUT_TENSOR) {
             for (auto &input : inputs) {
                 input.setTtype(TensorType::NORMAL_TENSOR);

From 9ce646a2e86e09d39bbb4970c1017d9ab6046d93 Mon Sep 17 00:00:00 2001
From: yirongjie <yirj0809@gmail.com>
Date: Fri, 15 Mar 2024 02:23:49 +0000
Subject: [PATCH 4/6] fix: chls() error in testloader

---
 .gitignore            | 1 +
 examples/demo_vit.cpp | 1 -
 test/TestLoader.cpp   | 4 ++--
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 655f4a28..97d630c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ Makefile
 models/*
 /.devcontainer/
 /.vscode/
+models
diff --git a/examples/demo_vit.cpp b/examples/demo_vit.cpp
index dba88d3e..361e4006 100644
--- a/examples/demo_vit.cpp
+++ b/examples/demo_vit.cpp
@@ -1,5 +1,4 @@
 #include <iostream>
-#include <utility>
 #include "cmdline.h"
 #include "models/vit/modeling_vit.hpp"
 #include "models/vit/labels_vit.hpp"
diff --git a/test/TestLoader.cpp b/test/TestLoader.cpp
index 4ad9bafa..4d0cfd05 100644
--- a/test/TestLoader.cpp
+++ b/test/TestLoader.cpp
@@ -71,13 +71,13 @@ bool TestLoader::load(Tensor *tensor, bool strict) {
         }
     }
     if(index->dims.size() == 5) {
-        tensor->chls_ = {{BATCH, 0},{CHANNLE, 1}, {TIME, 2}, {HEIGHT, 3}, {WIDTH, 4}};
+        tensor->chls() = {{BATCH, 0},{CHANNLE, 1}, {TIME, 2}, {HEIGHT, 3}, {WIDTH, 4}};
         tensor->setCtype(BCTHW);
     }
     if (tensor->shape().empty()) {
         // Get shape from TensorIndex
         if(index->dims.size() == 5) {
-        tensor->chls_ = {{BATCH, 0},{CHANNLE, 1}, {TIME, 2}, {HEIGHT, 3}, {WIDTH, 4}};
+        tensor->chls() = {{BATCH, 0},{CHANNLE, 1}, {TIME, 2}, {HEIGHT, 3}, {WIDTH, 4}};
             tensor->reshape(index->dims[0], index->dims[1], index->dims[2], index->dims[3], index->dims[4]);
         }else {
             tensor->reshape(index->dims[0], index->dims[1], index->dims[2], index->dims[3]);

From 6048dae88bfc1c3e5db0c5d872dd46145992804b Mon Sep 17 00:00:00 2001
From: yirongjie <yirj0809@gmail.com>
Date: Fri, 15 Mar 2024 10:04:39 +0000
Subject: [PATCH 5/6] fix: merge Tensor CPU Functions

---
 src/Tensor.cpp                                | 546 +++---------------
 src/Tensor.hpp                                |  21 +-
 src/backends/cpu/CPUTensorFunction.hpp        | 276 ++++-----
 .../transformer/configuration_transformer.hpp |   2 +
 4 files changed, 201 insertions(+), 644 deletions(-)

diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index fdbbcb72..f1926a50 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -4,6 +4,7 @@
 #include "backends/cpu/CPUTensorFunction.hpp"
 
 #include <Module.hpp>
+#include <vector>
 
 namespace mllm {
 
@@ -31,37 +32,6 @@ bool Tensor::reshape(const int batch, const int head, const int sequence, const
     shape[chls()[HEAD]] = head;
     shape[chls()[SEQUENCE]] = sequence;
     shape[chls()[DIMENSION]] = dimension;
-
-    // shape[0] = batch;
-    // switch (ctype_) {
-    // case BSHD:
-    //     shape[1] = sequence;
-    //     shape[2] = head;
-    //     shape[3] = dimension;
-    //     break;
-    // case BHDS:
-    //     shape[1] = head;
-    //     shape[2] = dimension;
-    //     shape[3] = sequence;
-    //     break;
-    // case SBHD:
-    //     shape[0] = sequence;
-    //     shape[1] = batch;
-    //     shape[2] = head;
-    //     shape[3] = dimension;
-    // default:
-    //     break;
-    // }
-
-    // vector<int> shape1(4);
-    // shape1[chls()[BATCH]] = batch;
-    // shape1[chls()[HEAD]] = head;
-    // shape1[chls()[SEQUENCE]] = sequence;
-    // shape1[chls()[DIMENSION]] = dimension;
-    // bool isSame = std::equal(shape.begin(), shape.end(), shape1.begin());
-    // if(!isSame) {
-    //     std::cout<<"";
-    // }
     return reshape(shape);
 }
 
@@ -87,7 +57,6 @@ void Tensor::alloc() {
 }
 
 bool Tensor::reshape(const int batch, const int channel, const int time, const int height, const int width) {
-
     if (ctype_ != BTHWC) {
         ctype_ = BCTHW;
     }
@@ -98,47 +67,15 @@ bool Tensor::reshape(const int batch, const int channel, const int time, const i
     shape[chls()[HEIGHT]] = height;
     shape[chls()[WIDTH]] = width;
     return reshape(shape);
-    // if (ctype_ != BTHWC) {
-    //     ctype_ = BCTHW;
-    //     vector<int> shape(5);
-    //     shape[0] = batch;
-    //     shape[1] = channel;
-    //     shape[2] = time;
-    //     shape[3] = height;
-    //     shape[4] = width;
-    //     return reshape(shape);
-    // } else {
-    //     vector<int> shape(5);
-    //     shape[0] = batch;
-    //     shape[1] = time;
-    //     shape[2] = height;
-    //     shape[3] = width;
-    //     shape[4] = channel;
-    //     return reshape(shape);
-    // }
 }
 
 map<string, Tensor> Tensor::gph_;
 
-template <typename Func>
-Tensor &Tensor::binaryCompute(Func operation, string append_s, float data) {
-    if(Module::doLoad){return *this;}
-    const std::string next_name = name_ + append_s;
+template <typename Func, typename... Args>
+Tensor &Tensor::applyFunc(const std::string &suffix, Func func, Args... args) {
+    if (Module::doLoad) { return *this; }
+    const std::string next_name = name_ + "-" + suffix;
     switch (status_) {
-    case TENSOR_DYNAMIC: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUbinaryFunction::reshape(gph_[name_], gph_[next_name]);
-        CPUbinaryFunction::setup(gph_[name_], gph_[next_name]);
-        CPUbinaryFunction::execute(gph_[name_], gph_[next_name], operation, data);
-        break;
-    }
     case TENSOR_STATIC_INIT: {
         if (gph_.find(name_) == gph_.end()) {
             gph_[name_] = *this;
@@ -148,13 +85,11 @@ Tensor &Tensor::binaryCompute(Func operation, string append_s, float data) {
             gph_[next_name] = Tensor(backend_);
             gph_[next_name].setName(next_name);
         }
-        CPUbinaryFunction::reshape(gph_[name_], gph_[next_name]);
-
-        CPUbinaryFunction::setup(gph_[name_], gph_[next_name]);
+        func.setup(gph_[name_], gph_[next_name], args...);
         break;
     }
     case TENSOR_STATIC_READY: {
-        CPUbinaryFunction::execute(gph_[name_], gph_[next_name], operation, data);
+        func.execute(gph_[name_], gph_[next_name], args...);
         break;
     }
     default: {
@@ -163,477 +98,126 @@ Tensor &Tensor::binaryCompute(Func operation, string append_s, float data) {
     gph_[next_name].status() = status_;
     return gph_[next_name];
 }
+
+template <typename Func>
+Tensor &Tensor::binaryCompute(Func operation, string append_s, float data) {
+    return applyFunc(append_s, CPUbinaryFunction(), operation, data);
+}
+
 Tensor &Tensor::operator+(float data) {
-    return binaryCompute(std::plus<float>(), "-TDadd",  data);
+    return binaryCompute(std::plus<float>(), "-TDadd", data);
 }
 Tensor &Tensor::operator-(float data) {
-    return binaryCompute(std::minus<float>(), "-TDsub",  data);
+    return binaryCompute(std::minus<float>(), "-TDsub", data);
 }
 Tensor &Tensor::operator*(float data) {
-    return binaryCompute(std::multiplies<float>(), "-TDmul",  data);
+    return binaryCompute(std::multiplies<float>(), "-TDmul", data);
 }
 Tensor &Tensor::operator/(float data) {
-    return binaryCompute(std::divides<float>(), "-TDdiv",  data);
+    return binaryCompute(std::divides<float>(), "-TDdiv", data);
 }
 Tensor &Tensor::operator/(double data) {
-    return binaryCompute(std::divides<float>(), "-TDdiv",  static_cast<float>(data));
+    return binaryCompute(std::divides<float>(), "-TDdiv", static_cast<float>(data));
 }
-template <typename Func>
-Tensor &Tensor::binaryTwoCompute(Func operation, string append_s, Tensor& other) {
-    if(Module::doLoad){return *this;}
-    const std::string next_name = name_ + append_s;
-    switch (status_) {
-    case TENSOR_DYNAMIC: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUbinaryTwoFunction::reshape(gph_[name_], gph_[other.name_], gph_[next_name]);
-        CPUbinaryTwoFunction::setup(gph_[name_], gph_[other.name_], gph_[next_name]);
-        CPUbinaryTwoFunction::execute(gph_[name_], gph_[other.name_], gph_[next_name], operation);
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUbinaryTwoFunction::reshape(gph_[name_], gph_[other.name_], gph_[next_name]);
 
-        CPUbinaryTwoFunction::setup(gph_[name_], gph_[other.name_], gph_[next_name]);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPUbinaryTwoFunction::execute(gph_[name_], gph_[other.name_], gph_[next_name], operation);
-        break;
-    }
-    default: {
-    }
-    }
-    gph_[next_name].status() = status_;
-    return gph_[next_name];
+template <typename Func>
+Tensor &Tensor::binaryTwoCompute(Func operation, string append_s, Tensor &other) {
+    return applyFunc(append_s, CPUbinaryTwoFunction(), other, operation);
 }
-Tensor& Tensor::operator+(Tensor& other) {
+
+Tensor &Tensor::operator+(Tensor &other) {
     return binaryTwoCompute(std::plus<float>(), "-TTadd", other);
 }
-Tensor& Tensor::operator-(Tensor& other){
+Tensor &Tensor::operator-(Tensor &other) {
     return binaryTwoCompute(std::minus<float>(), "-TTsub", other);
 }
-Tensor& Tensor::operator*(Tensor& other){
+Tensor &Tensor::operator*(Tensor &other) {
     return binaryTwoCompute(std::multiplies<float>(), "-TTmul", other);
 }
-Tensor& Tensor::operator/(Tensor& other){
+Tensor &Tensor::operator/(Tensor &other) {
     return binaryTwoCompute(std::divides<float>(), "-TTdiv", other);
 }
 
-Tensor& Tensor::mean(Chl axis) {
-    if(Module::doLoad){return *this;}
-    const std::string next_name = name_ + "-mean";
-    switch (status_) {
-    case TENSOR_DYNAMIC: {
-        std::cout<<"[TODO] not support dynamic tensor view"<<std::endl;
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor( backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUmeanFunction::reshape(gph_[name_], gph_[next_name], axis);
-
-        CPUmeanFunction::setup(gph_[name_], gph_[next_name], axis);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPUmeanFunction::execute(gph_[name_], gph_[next_name], axis);
-        break;
-    }
-    default: {
-    }
-    }
-    gph_[next_name].status() = status_;
-    return gph_[next_name];
+Tensor &Tensor::mean(Chl axis) {
+    return applyFunc("mean", CPUmeanFunction(), axis);
 }
 
-Tensor& Tensor::view(int b, int h, int s, int d) {
-    if(Module::doLoad){return *this;}
-    const std::string next_name = name_ + "-view";
-    switch (status_) {
-    case TENSOR_DYNAMIC: {
-        std::cout<<"[TODO] not support dynamic tensor view"<<std::endl;
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor( backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUviewFunction::reshape(gph_[name_], gph_[next_name], b, h, s, d);
-
-        CPUviewFunction::setup(gph_[name_], gph_[next_name], b, h, s, d);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPUviewFunction::execute(gph_[name_], gph_[next_name]);
-        break;
-    }
-    default: {
-    }
-    }
-    gph_[next_name].status() = status_;
-    return gph_[next_name];
+Tensor &Tensor::view(int b, int h, int s, int d) {
+    return applyFunc("view", CPUviewFunction(), b, h, s, d);
 }
 
-Tensor& Tensor::flatten(Chl axis_start, Chl axis_end) {
-    if(Module::doLoad){return *this;}
-    const std::string next_name = name_ + "-flatten";
-    switch (status_) {
-    case TENSOR_DYNAMIC: {
-        std::cout << "[TODO] not support dynamic tensor view" << std::endl;
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUflattenFunction::reshape(gph_[name_], gph_[next_name], axis_start, axis_end);
-
-        CPUflattenFunction::setup(gph_[name_], gph_[next_name], axis_start, axis_end);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPUflattenFunction::execute(gph_[name_], gph_[next_name]);
-        break;
-    }
-    default: {
-    }
-    }
-    gph_[next_name].status() = status_;
-    return gph_[next_name];
+Tensor &Tensor::flatten(Chl axis_start, Chl axis_end) {
+    return applyFunc("flatten", CPUflattenFunction(), axis_start, axis_end);
 }
 
-Tensor &Tensor::transpose(Chl axis0, Chl axis1) {
-    return transpose({{axis0, axis1}});
-}
 Tensor &Tensor::transpose(vector<std::pair<Chl, Chl>> axiss) {
-    if(Module::doLoad){return *this;}
-    const std::string next_name = name_ + "-transpose";
-    if (next_name.find(".X.") != std::string::npos && Module::runlistIdx > 0) {}
-    else {
-        switch (status_) {
-        case TENSOR_DYNAMIC: {
-            std::cout << "[TODO] not support dynamic tensor view" << std::endl;
-            break;
-        }
-        case TENSOR_STATIC_INIT: {
-            if (gph_.find(name_) == gph_.end()) {
-                gph_[name_] = *this;
-                gph_[name_].status() = status_;
-            }
-            // reshape
-            if (gph_.find(next_name) == gph_.end()) {
-                gph_[next_name] = Tensor(backend_);
-                gph_[next_name].setName(next_name);
-            }
-            gph_[next_name].trans_copy_shape(gph_[name_].shape());
-            std::map<Chl, int> origin_chls = {{BATCH, 0}, {SEQUENCE, 1}, {HEAD, 2}, {DIMENSION, 3},
-                                {CHANNLE, 1}, {TIME, 2}, {HEIGHT, 3}, {WIDTH, 4}};
-            if(std::equal(gph_[next_name].chls_.begin(), gph_[next_name].chls_.end(), origin_chls.begin())) {
-                gph_[next_name].chls_ = gph_[name_].chls_;
-                for (auto axis : axiss) {
-                    auto axis0 = axis.first;
-                    auto axis1 = axis.second;
-                    auto ori_0_idx = gph_[next_name].chls()[axis0];
-                    auto ori_1_idx = gph_[next_name].chls()[axis1];
-                    gph_[next_name].chls()[axis0] = ori_1_idx;
-                    gph_[next_name].chls()[axis1] = ori_0_idx;
-                }
-                gph_[next_name].changeCtype(gph_[name_].shape().size());
-                gph_[next_name].undiffusion_ = true;
-            }
-        //     break;
-        // }
-        // case TENSOR_STATIC_SHAPED: {
-            if(gph_[name_].masterTensor() != nullptr) {
-                if (gph_[next_name].master_tensor_ == nullptr) {
-                    gph_[next_name].setDtype(gph_[name_].dtype());
-                    gph_[next_name].deepCopyFrom(gph_[name_], false);
-                }
-            }else {
-                if(gph_[name_].masterTensor() == nullptr) {
-                    gph_[name_].free();
-                }
-                gph_[next_name].setDtype(gph_[name_].dtype());
-                gph_[next_name].alloc();
-                gph_[name_].undiffusion_ = true;
-                gph_[name_].deepCopyFrom(gph_[next_name], false);
-                gph_[next_name].trans_from_ = axiss;
-            }
-            break;
-        }
-        case TENSOR_STATIC_READY: {
-            break;
-        }
-        default: {
-        }
-        }
-    }
-    gph_[next_name].status() = status_;
-    return gph_[next_name];
+    return applyFunc("transpose", CPUtransposeFunction(), axiss);
 }
 
 Tensor &Tensor::clip(vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
-    if(Module::doLoad){return *this;}
-    const std::string next_name = name_ + "-clip";
-    switch (status_) {
-    case TENSOR_DYNAMIC: {
-        std::cout << "[TODO] not support dynamic tensor view" << std::endl;
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUclipFunction::reshape(gph_[name_], gph_[next_name], b, h, s, d);
-
-        CPUclipFunction::setup(gph_[name_], gph_[next_name], b, h, s, d);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPUclipFunction::execute(gph_[name_], gph_[next_name], b, h, s, d);
-        break;
-    }
-    default: {
-    }
-    }
-    gph_[next_name].status() = status_;
-    return gph_[next_name];
+    return applyFunc("clip", CPUclipFunction(), b, h, s, d);
 }
 
 Tensor &Tensor::clip(Chl keep_axis, vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
-    if(Module::doLoad){return *this;}
-    const std::string next_name = name_ + "-clip";
-    switch (status_) {
-    case TENSOR_DYNAMIC: {
-        std::cout << "[TODO] not support dynamic tensor view" << std::endl;
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUclipaxisFunction::reshape(gph_[name_], gph_[next_name], keep_axis, b, h, s, d);
+    return applyFunc("clip", CPUclipaxisFunction(), keep_axis, b, h, s, d);
+}
 
-        CPUclipaxisFunction::setup(gph_[name_], gph_[next_name],  keep_axis, b, h, s, d);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPUclipaxisFunction::execute(gph_[name_], gph_[next_name],  keep_axis, b, h, s, d);
-        break;
-    }
-    default: {
-    }
-    }
-    gph_[next_name].status() = status_;
-    return gph_[next_name];
+Tensor &Tensor::norm(int L_n) {
+    return applyFunc("norm", CPUnormFunction(), L_n);
 }
 
-Tensor &Tensor::cat(vector<Tensor> input_tensors, Chl axis) {
-    if(Module::doLoad){return Tensor::gph_["0"];}
-    const std::string next_name = input_tensors[0].name() + "-cat";
-    int expd_batch_ = input_tensors[0].batch();
-    int expd_batch_input_idx = 0;
-    for (int ii = 0; ii < input_tensors.size(); ++ii) {
-        auto input = input_tensors[ii];
-        if (input.batch() > expd_batch_) {
-            expd_batch_ = input.batch();
-            expd_batch_input_idx = ii;
-        }
-    }
-    vector<Tensor*> inputs = {};
-    for (const auto& input_tensor : input_tensors) {
-        inputs.push_back(&gph_[input_tensor.name()]);
-    }
-    switch (input_tensors[0].status()) {
-    case TENSOR_DYNAMIC: {
-        std::cout << "[TODO] not support dynamic tensor view" << std::endl;
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(input_tensors[0].backend());
-            gph_[next_name].setName(next_name);
-        }
-        CPUcatFunction::reshape(inputs, gph_[next_name], axis, expd_batch_, expd_batch_input_idx);
-        CPUcatFunction::setup(inputs, gph_[next_name], axis, expd_batch_, expd_batch_input_idx);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPUcatFunction::execute(inputs, gph_[next_name], axis, expd_batch_, expd_batch_input_idx);
-        break;
-    }
-    default: {
-    }
-    }
-    gph_[next_name].status() = input_tensors[0].status();
-    return gph_[next_name];
+Tensor &Tensor::where(float value, Chl axis) {
+    return applyFunc("where", CPUwhereFunction(), value, axis);
 }
+/**
+ * static function
+ */
 
-Tensor &Tensor::mm(Tensor& input0, Tensor& input1) {
-    if(Module::doLoad){return Tensor::gph_["0"];}
-    const std::string next_name = input0.name() + "-mm-" + input1.name();
-    switch (input0.status()) {
+template <typename Func, typename... Args>
+Tensor &Tensor::applyStaticFunc(const std::string &suffix, Func func, Args... args) {
+    if (Module::doLoad) { return Tensor::gph_["0"]; }
+    const std::string next_name = suffix;
+    switch (Module::tensor_status) {
     case TENSOR_DYNAMIC: {
         std::cout << "[TODO] not support dynamic tensor view" << std::endl;
         break;
     }
     case TENSOR_STATIC_INIT: {
         if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(input0.backend());
+            gph_[next_name] = Tensor(Module::backends[MLLM_CPU]);
             gph_[next_name].setName(next_name);
         }
-        if (input0.name().find(".X.") != std::string::npos && input1.name().find(".X.") != std::string::npos && next_name.find(".X.") != std::string::npos
-            && Module::runlistIdx > 0) {
-        } else {
-            CPUmmFunction::reshape(gph_[input0.name()], gph_[input1.name()], gph_[next_name]);
-        }
-        CPUmmFunction::setup(gph_[input0.name()], gph_[input1.name()], gph_[next_name]);
+        func.setup(gph_[next_name], args...);
         break;
     }
     case TENSOR_STATIC_READY: {
-        CPUmmFunction::execute(gph_[input0.name()], gph_[input1.name()], gph_[next_name]);
+        func.execute(gph_[next_name], args...);
         break;
     }
     default: {
     }
     }
-    gph_[next_name].status() = input0.status();
+    gph_[next_name].status() = Module::tensor_status;
     return gph_[next_name];
 }
 
-Tensor& Tensor::norm(int L_n) {
-    if(Module::doLoad){return *this;}
-    assert(L_n ==1 || L_n ==2);
-    const std::string next_name = name_ + "-norm";
-    switch (status_) {
-    case TENSOR_DYNAMIC: {
-        std::cout << "[TODO] not support dynamic tensor view" << std::endl;
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUnormFunction::reshape(gph_[name_], gph_[next_name], L_n);
-        CPUnormFunction::setup(gph_[name_], gph_[next_name], L_n);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPUnormFunction::execute(gph_[name_], gph_[next_name], L_n);
-        break;
-    }
-    default: {
-    }
+Tensor &Tensor::cat(vector<Tensor> input_tensors, Chl axis) {
+    vector<Tensor *> inputs = {};
+    for (const auto &input_tensor : input_tensors) {
+        inputs.push_back(&gph_[input_tensor.name()]);
     }
-    gph_[next_name].status() = status_;
-    return gph_[next_name];
+    const std::string next_name = input_tensors[0].name() + "-cat";
+    return applyStaticFunc(next_name, CPUcatFunction(), inputs, axis);
 }
 
-Tensor& Tensor::where(float value, Chl axis) {
-    if(Module::doLoad){return *this;}
-    const std::string next_name = name_ + "-where";
-    switch (status_) {
-    case TENSOR_DYNAMIC: {
-        std::cout << "[TODO] not support dynamic tensor view" << std::endl;
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(name_) == gph_.end()) {
-            gph_[name_] = *this;
-            gph_[name_].status() = status_;
-        }
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor(backend_);
-            gph_[next_name].setName(next_name);
-        }
-        CPUwhereFunction::reshape(gph_[name_], gph_[next_name], value, axis);
-        CPUwhereFunction::setup(gph_[name_], gph_[next_name], value, axis);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPUwhereFunction::execute(gph_[name_], gph_[next_name], value, axis);
-        break;
-    }
-    default: {
-    }
-    }
-    gph_[next_name].status() = status_;
-    return gph_[next_name];
+Tensor &Tensor::mm(Tensor &input0, Tensor &input1) {
+    const std::string next_name = input0.name() + "-mm-" + input1.name();
+    return applyStaticFunc(next_name, CPUmmFunction(), gph_[input0.name()], gph_[input1.name()]);
 }
 
-Tensor& Tensor::range(int start, int end) {
-    if(Module::doLoad){return Tensor::gph_["0"];}
-    static int range_name_idx = 0;
-    const std::string next_name = "range" + std::to_string(range_name_idx);
-    switch (Module::tensor_status) {
-    case TENSOR_DYNAMIC: {
-        std::cout<<"[TODO] not support dynamic tensor view"<<std::endl;
-        break;
-    }
-    case TENSOR_STATIC_INIT: {
-        if (gph_.find(next_name) == gph_.end()) {
-            gph_[next_name] = Tensor( Module::backends[MLLM_CPU]);
-            gph_[next_name].setName(next_name);
-        }
-        CPURangeFunction::reshape(gph_[next_name], start, end);
-        CPURangeFunction::setup(gph_[next_name], start, end);
-        break;
-    }
-    case TENSOR_STATIC_READY: {
-        CPURangeFunction::execute(gph_[next_name], start, end);
-        range_name_idx++;
-        break;
-    }
-    default: {
-    }
-    }
-    gph_[next_name].status() = Module::tensor_status;
-    return gph_[next_name];
+Tensor &Tensor::range(int start, int end) {
+    const std::string next_name = "range-" + std::to_string(start) + "-" + std::to_string(end);
+    return applyStaticFunc(next_name, CPURangeFunction(), start, end);
 }
+
 } // namespace mllm
\ No newline at end of file
diff --git a/src/Tensor.hpp b/src/Tensor.hpp
index c2b2b75b..e0e61ec7 100644
--- a/src/Tensor.hpp
+++ b/src/Tensor.hpp
@@ -691,6 +691,10 @@ class Tensor {
         return undiffusion_ ;
     }
 
+    vector<std::pair<Chl, Chl>>& transFrom() {
+        return trans_from_;
+    }
+
     /**
      * \brief Overload the operators.
      * \param data binary data
@@ -702,6 +706,8 @@ class Tensor {
     Tensor& operator/(float data);
     Tensor& operator/(double data);
 
+
+
     /**
      * \brief Overload the operators.
      * \param other The Other Tensor
@@ -711,16 +717,16 @@ class Tensor {
     Tensor& operator-(Tensor& other);
     Tensor& operator*(Tensor& other);
     Tensor& operator/(Tensor& other);
+
     Tensor& mean(Chl axis);
 
 
     Tensor& view(int b, int h, int s, int d);
     Tensor& flatten(Chl axis_start, Chl axis_end);
-    Tensor& transpose(Chl axis0, Chl axis1);
+    Tensor& transpose(Chl axis0, Chl axis1){
+        return transpose({{axis0, axis1}});
+    }
     Tensor& transpose(vector<std::pair<Chl, Chl>> axiss);
-    // Tensor& transpose(vector<Chl> dims);
-    // Tensor& transpose_(Chl axis0, Chl axis1);
-    // Tensor& transpose(vector<Chl> axis);
     Tensor& clip(vector<int> b, vector<int> h, vector<int> s, vector<int> d);
     Tensor &clip(Chl keep_axis, vector<int> b, vector<int> h, vector<int> s, vector<int> d);
     static Tensor& cat(vector<Tensor> input_tensors, Chl dims);;
@@ -1512,6 +1518,13 @@ class Tensor {
     template <typename Func>
     Tensor& binaryTwoCompute(Func operation, string append_s, Tensor& other) ;
 
+
+    template<typename Func, typename... Args>
+    Tensor& applyFunc(const std::string& suffix, Func func, Args... args);  
+
+    template<typename Func, typename... Args>
+    static Tensor& applyStaticFunc(const std::string& suffix, Func func, Args... args);
+
 };
 } // namespace mllm
 #endif // MLLM_TENSOR_H
\ No newline at end of file
diff --git a/src/backends/cpu/CPUTensorFunction.hpp b/src/backends/cpu/CPUTensorFunction.hpp
index caf74d97..7e579d7d 100644
--- a/src/backends/cpu/CPUTensorFunction.hpp
+++ b/src/backends/cpu/CPUTensorFunction.hpp
@@ -49,21 +49,18 @@ class CPUmmFunction {
             }
         }
     }
+
 public:
-    static void reshape(Tensor &input0, Tensor &input1, Tensor &output) {
-        if(input1.chls()[SEQUENCE] != 3) {
+    static void setup(Tensor &output, Tensor &input0, Tensor &input1) {
+        if (input1.chls()[SEQUENCE] != 3) {
             tranTensorChl(input1);
         }
         assert(input0.dimension() == input1.sequence());
-        if (input0.dimension() == input1.sequence()) {
-            output.reshape(input0.batch(), input0.head(), input0.sequence(), input1.dimension());
-        }
-    }
-    static void setup(Tensor &input0, Tensor &input1, Tensor &output) {
+        output.reshape(input0.batch(), input0.head(), input0.sequence(), input1.dimension());
         output.setDtype(input0.dtype());
         output.alloc();
     }
-    static void execute(Tensor &input0, Tensor &input1, Tensor &output) {
+    static void execute(Tensor &output, Tensor &input0, Tensor &input1) {
         bool isSame = std::equal(input0.chls().begin(), input0.chls().end(), input1.chls().begin());
         assert(input0.dtype() == MLLM_TYPE_F32);
         switch (input1.dtype()) {
@@ -83,52 +80,46 @@ class CPUmmFunction {
 
 class CPUnormFunction {
 public:
-    static void reshape(Tensor &input,  Tensor &output, int L_n) {
+    static void setup(Tensor &input, Tensor &output, int L_n) { // output takes input's shape and dtype, then is allocated; L_n is not read here
         output.reshape(input.batch(), input.head(), input.sequence(), input.dimension());
-    }
-    static void setup(Tensor &input,  Tensor &output, int L_n) {
         output.setDtype(input.dtype());
         output.alloc();
     }
-    static void execute(Tensor &input,  Tensor &output, int L_n) {
+    static void execute(Tensor &input, Tensor &output, int L_n) { // per (n, h, s): reduce over d, then write the scalar to every d slot; L_n == 2 -> sqrt of sum of squares, any other value -> sum of absolute values
         for (int h = 0; h < input.head(); h++) {
             for (int n = 0; n < input.batch(); n++) {
                 for (int s = 0; s < input.sequence(); s++) {
                     if (L_n == 2) {
                         float sum_of_squares = 0.0f;
                         for (int d = 0; d < input.dimension(); ++d) {
-                            sum_of_squares += input.dataAt<float>(n, h, s,d) * input.dataAt<float>(n, h, s,d);
+                            sum_of_squares += input.dataAt<float>(n, h, s, d) * input.dataAt<float>(n, h, s, d);
                         }
                         float l2_norm = std::sqrt(sum_of_squares);
 #pragma omp parallel for num_threads(Layer::cpu_thread)
                         for (int d = 0; d < input.dimension(); d++) {
-                            output.setDataAt<float>(n, h, s,d, l2_norm);
+                            output.setDataAt<float>(n, h, s, d, l2_norm); // same scalar for every d: the norm is broadcast, not divided into the input
                         }
                     } else {
                         float sum_of_abs_values = 0.0f;
                         for (int d = 0; d < input.dimension(); ++d) {
-                            sum_of_abs_values += std::abs(input.dataAt<float>(n, h, s,d));
+                            sum_of_abs_values += std::abs(input.dataAt<float>(n, h, s, d));
                         }
 #pragma omp parallel for num_threads(Layer::cpu_thread)
                         for (int d = 0; d < input.dimension(); d++) {
-                            output.setDataAt<float>(n, h, s,d, sum_of_abs_values);
+                            output.setDataAt<float>(n, h, s, d, sum_of_abs_values); // same scalar for every d, as in the L_n == 2 branch
                         }
-
                     }
                 }
             }
         }
     }
-    
 };
 
-
 class CPUbinaryFunction {
 public:
-    static void reshape(Tensor &input, Tensor &output) {
+    template <typename Func>
+    static void setup(Tensor &input, Tensor &output, Func operation, float data) { // output mirrors input's shape and dtype; operation and data are accepted but unused here (presumably to share one argument list with execute - confirm against applyFunc)
         output.reshape(input.batch(), input.head(), input.sequence(), input.dimension());
-    }
-    static void setup(Tensor &input, Tensor &output) {
         output.setDtype(input.dtype());
         output.alloc();
     }
@@ -158,15 +149,14 @@ class CPUbinaryFunction {
 
 class CPUbinaryTwoFunction {
 public:
-    static void reshape(Tensor &input0, Tensor &input1, Tensor &output) {
+    template <typename Func>
+    static void setup(Tensor &input0, Tensor &output, Tensor &input1, Func operation) { // output batch is the larger of the two batches; head/sequence/dimension and dtype follow input0; operation is unused here
         output.reshape(std::max(input0.batch(), input1.batch()), input0.head(), input0.sequence(), input0.dimension());
-    }
-    static void setup(Tensor &input0, Tensor &input1, Tensor &output) {
         output.setDtype(input0.dtype());
         output.alloc();
     }
     template <typename Func>
-    static void execute(Tensor &input0, Tensor &input1, Tensor &output, Func operation) {
+    static void execute(Tensor &input0, Tensor &output, Tensor &input1, Func operation) {
         int batch_ = std::max(input0.batch(), input1.batch());
         if (input0.masterTensor() == nullptr && output.masterTensor() == nullptr && input0.ctype() == output.ctype()) {
             for (int n = 0; n < batch_; ++n) {
@@ -199,7 +189,7 @@ class CPUbinaryTwoFunction {
 };
 class CPUmeanFunction {
 public:
-    static void reshape(Tensor &input, Tensor &output, Chl axis) {
+    static void setup(Tensor &input, Tensor &output, Chl axis) {
         int batch = input.batch();
         int head = input.head();
         int sequence = input.sequence();
@@ -221,8 +211,6 @@ class CPUmeanFunction {
             break;
         }
         output.reshape(batch, head, sequence, dimension);
-    }
-    static void setup(Tensor &input, Tensor &output, Chl axis) {
         output.setDtype(input.dtype());
         output.alloc();
     }
@@ -296,7 +284,7 @@ class CPUmeanFunction {
 
 class CPUviewFunction {
 public:
-    static void reshape(Tensor &input, Tensor &output, int b, int h, int s, int d) {
+    static void setup(Tensor &input, Tensor &output, int b, int h, int s, int d) {
         int dim_b = input.batch();
         int dim_h = input.head();
         int dim_s = input.sequence();
@@ -347,8 +335,6 @@ class CPUviewFunction {
             std::cout << "[TODO]Tensor.View not support!!!!" << std::endl;
         }
         output.reshape(dim_b, dim_h, dim_s, dim_d);
-    }
-    static void setup(Tensor &input, Tensor &output, int b, int h, int s, int d) {
         if ((b == -1 && s == -1 && input.ctype() != BCTHW)   // head & dimension
             || (b == -1 && d == -1 && input.ctype() == BSHD) // head & sequence
             || (h == -1 && d == -1 && input.ctype() == BSHD) // batch & sequence
@@ -363,96 +349,33 @@ class CPUviewFunction {
             std::cout << "[TODO]Tensor.View not support!!!!" << std::endl;
         }
     }
-    static void execute(Tensor &input, Tensor &output) {
+    static void execute(Tensor &input, Tensor &output, int b, int h, int s, int d) {
     }
 };
 
 class CPUflattenFunction {
 public:
-    static void reshape(Tensor &input, Tensor &output, Chl axis_start, Chl axis_end) {
+    static void setup(Tensor &input, Tensor &output, Chl axis_start, Chl axis_end) {
         int dim_b = input.batch();
         int dim_h = 0;
         int dim_s = 0;
         int dim_d = 0;
-        /*
-        if (input.ctype() == BSHD) {
+        if (input.shape().size() == 4) {
             dim_h = input.head();
             dim_s = input.sequence();
             dim_d = input.dimension();
             if (axis_start == BATCH & axis_end == SEQUENCE) {
-                // data_dims = {-1, HEAD, BATCH + SEQUENCE, DIMENSION};
                 dim_b = 1;
                 dim_s = input.sequence() * input.batch();
             } else if (axis_start == HEAD & axis_end == SEQUENCE) {
-                // data_dims = {BATCH, -1, HEAD + SEQUENCE, DIMENSION};
                 dim_h = 1;
                 dim_s = input.sequence() * input.head();
             } else if (axis_start == HEAD & axis_end == DIMENSION) {
-                // data_dims = {BATCH, HEAD, -1, SEQUENCE + DIMENSION};
                 dim_h = 1;
                 dim_d = input.dimension() * input.head();
             } else {
                 std::cout << "ERROR:  flatten  " << axis_start << "&" << axis_end << std::endl;
             }
-        } else if (input.ctype() == BHDS) {
-            dim_h = input.head();
-            dim_s = input.dimension();
-            dim_d = input.sequence();
-            if (axis_start == BATCH & axis_end == SEQUENCE) {
-                // data_dims = {-1, HEAD, BATCH + SEQUENCE, DIMENSION};
-                dim_b = 1;
-                dim_s = dim_s * input.batch();
-            } else if (axis_start == HEAD & axis_end == SEQUENCE) {
-                // data_dims = {BATCH, -1, HEAD + SEQUENCE, DIMENSION};
-                dim_h = 1;
-                dim_s = dim_s * input.head();
-            } else if (axis_start == HEAD & axis_end == DIMENSION) {
-                // data_dims = {BATCH, HEAD, -1, SEQUENCE + DIMENSION};
-                dim_h = 1;
-                dim_d = dim_d * input.head();
-            } else {
-                std::cout << "ERROR:  flatten  " << axis_start << "&" << axis_end << std::endl;
-            }
-        } else if (input.ctype() == BDHS) {
-            dim_h = input.head();
-            dim_s = input.sequence();
-            dim_d = input.dimension();
-            if (axis_start == HEAD & axis_end == SEQUENCE) {
-                dim_h = 1;
-                dim_s = input.sequence() * input.head();
-            }
-        }else {
-            if (axis_start == TIME & axis_end == CHANNLE) {
-                // data_dims = {BATCH, -1, TIME + HEIGHT + WIDTH, CHANNLE};
-                if (input.ctype() == BTHWC) {
-                    dim_h = 1;
-                    dim_s = input.time() * input.height() * input.width();
-                    dim_d = input.channel();
-                } else if (input.ctype() == BCTHW) {
-                    dim_h = 1;
-                    dim_s = input.time() * input.height() * input.channel();
-                    dim_d = input.width();
-                } else {
-                    std::cout << "ERROR: flatten  " << axis_start << "&" << axis_end << std::endl;
-                }
-            }
-        }*/
-        if(input.shape().size() == 4) {
-            dim_h = input.head();
-            dim_s = input.sequence();
-            dim_d = input.dimension();
-            if (axis_start == BATCH & axis_end == SEQUENCE) {
-                dim_b = 1;
-                dim_s = input.sequence() * input.batch();
-            } else if (axis_start == HEAD & axis_end == SEQUENCE) {
-                dim_h = 1;
-                dim_s = input.sequence() * input.head();
-            } else if (axis_start == HEAD & axis_end == DIMENSION) {
-                dim_h = 1;
-                dim_d = input.dimension() * input.head();
-            }else {
-                std::cout << "ERROR:  flatten  " << axis_start << "&" << axis_end << std::endl;
-            }
         } else if (input.shape().size() == 5) {
             if (axis_start == CHANNLE & axis_end == HEIGHT) {
                 dim_h = 1;
@@ -464,45 +387,81 @@ class CPUflattenFunction {
                 dim_d = input.time();
             }
         }
-        assert(dim_d+dim_s+dim_h > 0);
+        assert(dim_d + dim_s + dim_h > 0);
         output.reshape(dim_b, dim_h, dim_s, dim_d);
-    }
-    static void setup(Tensor &input, Tensor &output, Chl axis_start, Chl axis_end) {
-        if (   (axis_start == TIME & axis_end == WIDTH && input.ctype()==BCTHW)
-            || (axis_start == CHANNLE & axis_end == HEIGHT && input.ctype()==BWCTH)
-            || (axis_start == HEIGHT & axis_end == CHANNLE && input.ctype()==BTHWC)
-            || (axis_start == BATCH & axis_end == SEQUENCE && input.ctype()!=BCTHW)
-            || (axis_start == HEAD & axis_end == SEQUENCE && input.ctype()==BSHD)
-            || (axis_start == HEAD & axis_end == SEQUENCE && input.ctype()==BHDS)
-            || (axis_start == HEAD & axis_end == SEQUENCE && input.ctype()==BDHS)
-            || (axis_start == HEAD & axis_end == DIMENSION && input.ctype()==BSHD)
-            || (axis_start == HEAD & axis_end == DIMENSION && input.ctype()==BHDS)
-            || (axis_start == HEAD & axis_end == SEQUENCE && input.ctype()==BDSH)
-        ){
-            if(input.masterTensor() == nullptr) {
+        if ((axis_start == TIME & axis_end == WIDTH && input.ctype() == BCTHW)
+            || (axis_start == CHANNLE & axis_end == HEIGHT && input.ctype() == BWCTH)
+            || (axis_start == HEIGHT & axis_end == CHANNLE && input.ctype() == BTHWC)
+            || (axis_start == BATCH & axis_end == SEQUENCE && input.ctype() != BCTHW)
+            || (axis_start == HEAD & axis_end == SEQUENCE && input.ctype() == BSHD)
+            || (axis_start == HEAD & axis_end == SEQUENCE && input.ctype() == BHDS)
+            || (axis_start == HEAD & axis_end == SEQUENCE && input.ctype() == BDHS)
+            || (axis_start == HEAD & axis_end == DIMENSION && input.ctype() == BSHD)
+            || (axis_start == HEAD & axis_end == DIMENSION && input.ctype() == BHDS)
+            || (axis_start == HEAD & axis_end == SEQUENCE && input.ctype() == BDSH)) {
+            if (input.masterTensor() == nullptr) {
                 input.free();
             }
             output.setDtype(input.dtype());
             output.alloc();
             input.deepCopyFrom(output, false);
-        }else {
-            std::cout<<"[TODO]Tensor.Flatten not support!!!!"<<std::endl;
+        } else {
+            std::cout << "[TODO]Tensor.Flatten not support!!!!" << std::endl;
         }
     }
-    static void execute(Tensor &input, Tensor &output) {
+    static void execute(Tensor &input, Tensor &output, Chl axis_start, Chl axis_end) {
+    }
+};
+// Transpose done entirely in setup(): the requested axis pairs are swapped in output.chls()
+// and the two tensors are tied together through deepCopyFrom(); execute() has an empty body.
+class CPUtransposeFunction {
+public:
+    static void setup(Tensor &input, Tensor &output, vector<std::pair<Chl, Chl>> axiss) {
+        // Nothing is done when output is non-empty and already has input's shape.
+        if (output.count() <= 0 || output.shape() != input.shape()) {
+            output.trans_copy_shape(input.shape());
+            std::map<Chl, int> origin_chls = {{BATCH, 0}, {SEQUENCE, 1}, {HEAD, 2}, {DIMENSION, 3}, {CHANNLE, 1}, {TIME, 2}, {HEIGHT, 3}, {WIDTH, 4}};
+            // Remap channels only while output still matches the origin_chls layout.
+            if (std::equal(output.chls().begin(), output.chls().end(), origin_chls.begin())) {
+                output.chls() = input.chls();
+                for (auto axis : axiss) {
+                    std::swap(output.chls()[axis.first], output.chls()[axis.second]);
+                }
+                output.changeCtype(input.shape().size());
+                output.undiffusion() = true;
+            }
+            if (input.masterTensor() != nullptr) {
+                // input already has a master: attach output to input unless output has one too.
+                if (output.masterTensor() == nullptr) {
+                    output.setDtype(input.dtype());
+                    output.deepCopyFrom(input, false);
+                }
+            } else {
+                // input has no master in this branch, so it is freed and re-attached to output.
+                input.free();
+                output.setDtype(input.dtype());
+                output.alloc();
+                input.undiffusion() = true;
+                input.deepCopyFrom(output, false);
+                output.transFrom() = axiss;
+            }
+        }
+    }
+    // Empty body: all the work happens in setup().
+    static void execute(Tensor &input, Tensor &output, vector<std::pair<Chl, Chl>> axiss) {
+    }
+};
 
 class CPUclipFunction {
 public:
-    static void reshape(Tensor &input, Tensor &output, vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
+    static void setup(Tensor &input, Tensor &output, vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
         // reshape
         int dim_b = input.batch();
         int dim_h = input.head();
         int dim_s = input.sequence();
         int dim_d = input.dimension();
-        std::vector<std::pair<std::vector<int>, int*>> data = {{b, &dim_b}, {h, &dim_h}, {s, &dim_s}, {d, &dim_d}};
-        for (auto& pair : data) {
+        std::vector<std::pair<std::vector<int>, int *>> data = {{b, &dim_b}, {h, &dim_h}, {s, &dim_s}, {d, &dim_d}};
+        for (auto &pair : data) {
             if (pair.first.size() == 2) {
                 *pair.second = pair.first[1] - pair.first[0];
             } else if (pair.first.size() == 1) {
@@ -510,8 +469,6 @@ class CPUclipFunction {
             }
         }
         output.reshape(dim_b, dim_h, dim_s, dim_d);
-    }
-    static void setup(Tensor &input, Tensor &output, vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
         output.setDtype(input.dtype());
         output.alloc();
     }
@@ -532,33 +489,24 @@ class CPUclipFunction {
                        input.hostPtr<float>() + input.offset(b, 0, seq_idx, 0),
                        input.head() * 1 * input.dimension() * sizeof(float));
             }
-        }else {
-            std::cout<<"[TODO]Tensor.CLip not support!!!!"<<std::endl;
+        } else {
+            std::cout << "[TODO]Tensor.CLip not support!!!!" << std::endl;
         }
     }
 };
 
 class CPUclipaxisFunction {
 public:
-    static void reshape(Tensor &input, Tensor &output,Chl axis, vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
+    static void setup(Tensor &input, Tensor &output, Chl axis, vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
         // reshape
         int dim_b = input.batch();
         int dim_h = input.head();
         int dim_s = input.sequence();
         int dim_d = input.dimension();
-
-        /*
-        std::vector<std::pair<std::vector<int>, int*>> data = {{b, &dim_b}, {h, &dim_h}, {s, &dim_s}, {d, &dim_d}};
-        for (auto& pair : data) {
-            if (pair.first.size() > 0) {
-                *pair.second = 1;
-            }
-        }
-        */
         switch (axis) {
         case BATCH: {
-            std::vector<std::pair<std::vector<int>, int*>> data = {{h, &dim_h}, {s, &dim_s}, {d, &dim_d}};
-            for (auto& pair : data) {
+            std::vector<std::pair<std::vector<int>, int *>> data = {{h, &dim_h}, {s, &dim_s}, {d, &dim_d}};
+            for (auto &pair : data) {
                 if (pair.first.size() > 0) {
                     *pair.second = 1;
                 }
@@ -566,8 +514,8 @@ class CPUclipaxisFunction {
             break;
         }
         case HEAD: {
-            std::vector<std::pair<std::vector<int>, int*>> data = {{b, &dim_b}, {s, &dim_s}, {d, &dim_d}};
-            for (auto& pair : data) {
+            std::vector<std::pair<std::vector<int>, int *>> data = {{b, &dim_b}, {s, &dim_s}, {d, &dim_d}};
+            for (auto &pair : data) {
                 if (pair.first.size() > 0) {
                     *pair.second = 1;
                 }
@@ -575,8 +523,8 @@ class CPUclipaxisFunction {
             break;
         }
         case SEQUENCE: {
-            std::vector<std::pair<std::vector<int>, int*>> data = {{b, &dim_b}, {h, &dim_h}, {d, &dim_d}};
-            for (auto& pair : data) {
+            std::vector<std::pair<std::vector<int>, int *>> data = {{b, &dim_b}, {h, &dim_h}, {d, &dim_d}};
+            for (auto &pair : data) {
                 if (pair.first.size() > 0) {
                     *pair.second = 1;
                 }
@@ -584,8 +532,8 @@ class CPUclipaxisFunction {
             break;
         }
         case DIMENSION: {
-            std::vector<std::pair<std::vector<int>, int*>> data = {{b, &dim_b}, {h, &dim_h}, {s, &dim_s}};
-            for (auto& pair : data) {
+            std::vector<std::pair<std::vector<int>, int *>> data = {{b, &dim_b}, {h, &dim_h}, {s, &dim_s}};
+            for (auto &pair : data) {
                 if (pair.first.size() > 0) {
                     *pair.second = 1;
                 }
@@ -596,14 +544,12 @@ class CPUclipaxisFunction {
             break;
         }
         output.reshape(dim_b, dim_h, dim_s, dim_d);
-    }
-    static void setup(Tensor &input, Tensor &output, Chl axis, vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
         output.setDtype(input.dtype());
         output.alloc();
     }
     static void execute(Tensor &input, Tensor &output, Chl axis, vector<int> b, vector<int> h, vector<int> s, vector<int> d) {
         if (axis == BATCH) {
-            if(s.size()>0) {
+            if (s.size() > 0) {
                 for (int i = 0; i < s.size(); ++i) {
                     auto seq_idx = s[i];
                     memcpy(output.hostPtr<float>() + output.offset(i, 0, 0, 0),
@@ -612,33 +558,40 @@ class CPUclipaxisFunction {
                 }
             }
         } else {
-            std::cout<<"[TODO]Tensor.CLip not support!!!!"<<std::endl;
+            std::cout << "[TODO]Tensor.CLip not support!!!!" << std::endl;
         }
     }
 };
 
 class CPUcatFunction {
 public:
-    static void reshape(vector<Tensor *>inputs, Tensor &output, Chl axis, int expd_batch_, int expd_batch_input_idx) {
+    static void setup(Tensor &output, vector<Tensor *> inputs, Chl axis) {
+        int expd_batch_ = inputs[0]->batch();
+        for (int ii = 0; ii < inputs.size(); ++ii) {
+            auto input = inputs[ii];
+            if (input->batch() > expd_batch_) {
+                expd_batch_ = input->batch();
+            }
+        }
         int dim_b = expd_batch_;
         int dim_h = inputs[0]->head();
         int dim_s = inputs[0]->sequence();
         int dim_d = inputs[0]->dimension();
         int sizes[] = {0, 0, 0, 0};
         Chl axes[] = {BATCH, HEAD, SEQUENCE, DIMENSION};
-        int* dims[] = {&dim_b, &dim_h, &dim_s, &dim_d};
+        int *dims[] = {&dim_b, &dim_h, &dim_s, &dim_d};
         for (int i = 0; i < 4; i++) {
             if (axis == axes[i]) {
                 for (auto input : inputs) {
-                    sizes[i] += (i == 0) ? input->batch() : (i == 1) ? input->head() : (i == 2) ? input->sequence() : input->dimension();
+                    sizes[i] += (i == 0) ? input->batch() : (i == 1) ? input->head() :
+                                                        (i == 2)     ? input->sequence() :
+                                                                       input->dimension();
                 }
                 *dims[i] = sizes[i];
                 break;
             }
         }
         output.reshape(dim_b, dim_h, dim_s, dim_d);
-    }
-    static void setup(vector<Tensor *>inputs, Tensor &output, Chl axis, int expd_batch_, int expd_batch_input_idx) {
         output.setDtype(inputs[0]->dtype());
         output.alloc();
         if (axis == SEQUENCE && inputs[0]->head() != 1) {
@@ -651,13 +604,22 @@ class CPUcatFunction {
                     inputs[idx]->free();
                 }
                 if (idx > 0) {
-                    cseq += inputs[idx-1]->sequence();
+                    cseq += inputs[idx - 1]->sequence();
                 }
                 inputs[idx]->deepCopyFrom(output, false, {cbatch, chead, cseq, cdim}); // b,h,s,d
             }
         }
     }
-    static void execute(vector<Tensor *>inputs, Tensor &output, Chl axis, int expd_batch_, int expd_batch_input_idx) {
+    static void execute(Tensor &output, vector<Tensor *> inputs, Chl axis) {
+        int expd_batch_ = inputs[0]->batch();
+        int expd_batch_input_idx = 0;
+        for (int ii = 0; ii < inputs.size(); ++ii) {
+            auto input = inputs[ii];
+            if (input->batch() > expd_batch_) {
+                expd_batch_ = input->batch();
+                expd_batch_input_idx = ii;
+            }
+        }
         if (axis == BATCH) {
             for (int n = 0; n < inputs.size(); ++n) {
                 auto copysize = inputs[0]->batch() * inputs[0]->head() * inputs[0]->sequence() * inputs[0]->dimension();
@@ -705,8 +667,6 @@ class CPUcatFunction {
 
 class CPUwhereFunction {
 public:
-    static void reshape(Tensor &input, Tensor &output, float value, Chl axis) {
-    }
     static void setup(Tensor &input, Tensor &output, float value, Chl axis) {
     }
     static void execute(Tensor &input, Tensor &output, float value, Chl axis) {
@@ -780,16 +740,14 @@ class CPUwhereFunction {
 
 class CPURangeFunction {
 public:
-    static void reshape(Tensor &output, int start, int end) {
-        output.reshape(1, 1,  end - start, 1);
-    }
     static void setup(Tensor &output, int start, int end) {
+        output.reshape(1, 1, end - start, 1); // one float slot per integer in [start, end), laid out along the sequence axis
         output.setDtype(MLLM_TYPE_F32);
         output.alloc();
     }
     static void execute(Tensor &output, int start, int end) {
-        for (int i = 0; i < end-start; ++i) {
-            output.setDataAt<float>(0, 0, i+start,0, (float)i);
+        for (int i = 0; i < end - start; ++i) { // i indexes the allocated slots, 0 .. end-start-1
+            output.setDataAt<float>(0, 0, i, 0, (float)(start + i)); // slot i holds start + i; indexing by i + start would run past the end - start slots whenever start > 0
         }
     }
 };
diff --git a/src/models/transformer/configuration_transformer.hpp b/src/models/transformer/configuration_transformer.hpp
index 063b4e56..f38b4aff 100644
--- a/src/models/transformer/configuration_transformer.hpp
+++ b/src/models/transformer/configuration_transformer.hpp
@@ -5,6 +5,8 @@
 #ifndef CONFIGURATION_TRANSFORMER_HPP
 #define CONFIGURATION_TRANSFORMER_HPP
 
+#include "Layer.hpp"
+
 using namespace mllm;
 using namespace std;
 

From c5324bb331e17849763fbc61b733cac2d2e53984 Mon Sep 17 00:00:00 2001
From: Rongjie Yi <41737961+yirongjie@users.noreply.github.com>
Date: Tue, 19 Mar 2024 08:18:32 +0800
Subject: [PATCH 6/6] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 63b77b7a..32f09887 100644
--- a/README.md
+++ b/README.md
@@ -96,7 +96,7 @@ Building mllm requires following tools:
 
 #### Run Fuyu-8B
 
-Download the model from [here](https://huggingface.co/mllmTeam), or using the following instructions
+Download the model from [here](https://huggingface.co/mllmTeam/fuyu-8b-mllm/tree/main/), or use the following instructions
 
 ```bash
 mkdir ../models && cd ../models
@@ -125,7 +125,7 @@ Result are as followed:
 
 #### Run LLaMA-2-7B
 
-Download model
+Download the model from [here](https://huggingface.co/mllmTeam/llama-2-7b-mllm/tree/main/), or use the following instructions
 
 ```bash
 mkdir ../models && cd ../models
@@ -165,7 +165,7 @@ BUPT offers a wide range of undergraduate and graduate programs in fields such a
 
 #### Run ImageBind
 
-Download model
+Download the model from [here](https://huggingface.co/mllmTeam/imagebind_huge-mllm/tree/main), or use the following instructions
 
 ```bash
 mkdir ../models && cd ../models