UbiquitousLearning · lx200916 · Nov 27, 2023 · Nov 26, 2023 · Nov 27, 2023
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -74,6 +74,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
     add_compile_options(-mavx2)
 elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
     message(STATUS "ARM detected")
+    add_definitions(-DARM)
 endif()
 
 if(QUANT)

diff --git a/include/OpDefined.hpp b/include/OpDefined.hpp
@@ -23,6 +23,9 @@ enum OpType {
     MUL,
     VIEW,
     KVCACHE,
+    RELU,
+    RELU2,
+    LAYERNORM,
     OP_NUM
 };
 
@@ -42,6 +45,9 @@ static const vector<string> OpNames = {
     "Mul",
     "VIEW",
     "KVCACHE",
+    "ReLU",
+    "ReLUSquaredActivation",
+    "LayerNorm",
     "OP_NUM"};
 } // namespace mllm
 #endif
diff --git a/src/backends/cpu/CPUBackend.cpp b/src/backends/cpu/CPUBackend.cpp
@@ -13,6 +13,7 @@
 #include "CPUEmbedding.hpp"
 #include "CPUMul.hpp"
 #include "CPUKVCache.hpp"
+#include "CPUReLU.hpp"
 #include <math.h>
 namespace mllm {
 CPUBackend::CPUBackend(shared_ptr<MemoryManager>& mm) :
@@ -72,6 +73,8 @@ void CPUBackend::registerOps() {
     addCreator(MUL, (CPUBackend::Creator *)(new CPUMulCreator()));
     addCreator(VIEW, (CPUBackend::Creator *)(new CPUViewCreator()));
     addCreator(KVCACHE, (CPUBackend::Creator *)(new CPUKVCacheCreator()));
+    addCreator(RELU, (CPUBackend::Creator *)(new CPUReLUCreator()));
+    addCreator(RELU2, (CPUBackend::Creator *)(new CPUReLUCreator()));
 }
 
 } // namespace mllm
diff --git a/src/backends/cpu/CPULayerNorm.cpp b/src/backends/cpu/CPULayerNorm.cpp
@@ -0,0 +1,70 @@
+//
+// Created by 咸的鱼 on 2023/11/26.
+//
+
+#include "CPULayerNorm.hpp"
+
+namespace mllm {
+CPULayerNorm::CPULayerNorm(Backend *bn, string opName, bool multiThread, float epsilon) :
+    support_multi_thread_(multiThread), Op(bn, std::move(opName)), epsilon_(epsilon) {
+}
+ErrorCode CPULayerNorm::load(AbstructLoader &loader) {
+    weight_.setName(name() + ".weight");
+    weight_.reshape(1, 1, 1, normSize_); //
+    weight_.setDtype(loader.getDataType(weight_.name()));
+    weight_.alloc();
+    loader.load(&weight_);
+    bias_.setName(name() + ".bias");
+    bias_.reshape(1, 1, 1, normSize_); //
+    bias_.setDtype(loader.getDataType(bias_.name()));
+    bias_.alloc();
+    loader.load(&bias_);
+    return Op::load(loader);
+}
+ErrorCode CPULayerNorm::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    normSize_ = inputs[0]->dimension();
+    outputs[0]->reshape(inputs[0]->batch(), inputs[0]->shape(1), inputs[0]->shape(2), inputs[0]->shape(3));
+    return Op::reshape(inputs, outputs);
+}
+
+ErrorCode CPULayerNorm::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    auto input = inputs[0];
+    auto output = outputs[0];
+    int batch = input->batch();
+    int dim = input->dimension();
+    int seq = input->sequence();
+    int head = input->head();
+    for (int h = 0; h < head; h++) {
+        for (int n = 0; n < batch; n++) {
+            for (int s = 0; s < seq; s++) {
+                float sum_squares = 0.0F;
+                float sum = 0.0F;
+// sum
+#pragma omp parallel for reduction(+ : sum_squares) reduction(+ : sum) num_threads(4)
+                for (int d = 0; d < dim; d++) {
+                    float value = input->dataAt<float>(n, h, s, d);
+                    sum += value;
+                }
+                float mean = sum / dim;
+#pragma omp parallel for reduction(+ : sum_squares) num_threads(4)
+                for (int d = 0; d < dim; d++) {
+                    float value = input->dataAt<float>(n, h, s, d);
+                    sum_squares += (value - mean) * (value - mean);
+                    output->setDataAt(n, h, s, d, value - mean);
+                }
+                float rms = std::sqrt(sum_squares / dim + epsilon_);
+#pragma omp parallel for num_threads(4)
+                for (int d = 0; d < dim; d++) {
+                    float value = output->dataAt<float>(n, h, s, d);
+                    output->setDataAt<float>(n, h, s, d, weight_.dataAt<float>(0, 0, 0, d) * value / rms + bias_.dataAt<float>(0, 0, 0, d));
+                }
+            }
+        }
+    }
+
+    return Op::execute(inputs, outputs);
+}
+ErrorCode CPULayerNorm::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    return Op::free(inputs, outputs);
+}
+} // namespace mllm
diff --git a/src/backends/cpu/CPULayerNorm.hpp b/src/backends/cpu/CPULayerNorm.hpp
@@ -0,0 +1,38 @@
+//
+// Created by 咸的鱼 on 2023/11/26.
+//
+
+#ifndef MLLM_CPULAYERNORM_HPP
+#define MLLM_CPULAYERNORM_HPP
+
+#include "CPUBackend.hpp"
+namespace mllm {
+
+class CPULayerNorm:public Op {
+public:
+    CPULayerNorm(Backend *bn, string opName, bool multiThread, float epsilon = 1e-5);
+    virtual ~CPULayerNorm() = default;
+    virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
+    virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
+    virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
+    ErrorCode load(AbstructLoader &loader) override;
+
+private:
+    bool support_multi_thread_ = false;
+    float epsilon_ = 1e-5;
+    int normSize_=0;
+    Tensor weight_;
+    Tensor bias_;
+};
+class CPULayerNormCreator : public CPUBackend::Creator {
+public:
+    virtual Op *create(OpParam op_param, Backend *bn, string name) const {
+
+        return new CPULayerNorm(bn, name, false);
+    }
+};
+
+
+} // namespace mllm
+
+#endif // MLLM_CPULAYERNORM_HPP
diff --git a/src/backends/cpu/CPUReLU.cpp b/src/backends/cpu/CPUReLU.cpp
@@ -0,0 +1,42 @@
+//
+// Created by 咸的鱼 on 2023/11/26.
+//
+
+#include "CPUReLU.hpp"
+
+#include <utility>
+
+namespace mllm {
+CPUReLU::CPUReLU(Backend *bn, string opName, bool multiThread):support_multi_thread_(multiThread), Op(bn, std::move(opName))  {
+}
+ErrorCode CPUReLU::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    CHECK_EQ(inputs.size(), 1);
+    CHECK_EQ(outputs.size(), 1);
+    outputs[0]->reshape(inputs[0]->batch(), inputs[0]->shape(1), inputs[0]->shape(2), inputs[0]->shape(3));
+    return Op::reshape(inputs, outputs);
+}
+
+ErrorCode CPUReLU::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    auto input = inputs[0];
+    auto output = outputs[0];
+    int batch = input->batch();
+    int head = input->head();
+    int seq = input->sequence();
+    int dim = input->dimension();
+#pragma omp parallel for collapse(4)
+    for (int b = 0; b <batch ; ++b) {
+        for (int h = 0; h < head; ++h) {
+            for (int s = 0; s < seq; ++s) {
+                for (int d = 0; d < dim; ++d) {
+                    float value = input->dataAt<float>(b, h, s, d);
+                    output->setDataAt<float>(b, h, s, d, value > 0 ? value : 0);
+                }
+            }
+        }
+    }
+    return Op::execute(inputs, outputs);
+}
+ErrorCode CPUReLU::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    return Op::free(inputs, outputs);
+}
+} // namespace mllm
diff --git a/src/backends/cpu/CPUReLU.hpp b/src/backends/cpu/CPUReLU.hpp
@@ -0,0 +1,32 @@
+//
+// Created by 咸的鱼 on 2023/11/26.
+//
+
+#ifndef MLLM_CPURELU_HPP
+#define MLLM_CPURELU_HPP
+
+#include "Op.hpp"
+#include "CPUBackend.hpp"
+namespace mllm {
+class CPUReLU final : public Op {
+public:
+    CPUReLU(Backend *bn, string opName, bool multiThread);
+    virtual ~CPUReLU() = default;
+    virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
+    virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
+    virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
+
+
+private:
+    bool support_multi_thread_ = false;
+};
+
+class CPUReLUCreator : public CPUBackend::Creator {
+public:
+    virtual Op *create(OpParam op_param, Backend *bn, string name) const {
+        return new CPUReLU(bn, name, false);
+    }
+};
+} // namespace mllm
+
+#endif // MLLM_CPURELU_HPP
diff --git a/src/backends/cpu/CPUReLU2.cpp b/src/backends/cpu/CPUReLU2.cpp
@@ -0,0 +1,45 @@
+//
+// Created by 咸的鱼 on 2023/11/26.
+//
+
+#include "CPUReLU2.hpp"
+
+namespace mllm {
+
+CPUReLU2::CPUReLU2(Backend *bn, string opName, bool multiThread):support_multi_thread_(multiThread), Op(bn, std::move(opName)) {
+}
+ErrorCode CPUReLU2::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    CHECK_EQ(inputs.size(), 1);
+    CHECK_EQ(outputs.size(), 1);
+    outputs[0]->reshape(inputs[0]->batch(), inputs[0]->shape(1), inputs[0]->shape(2), inputs[0]->shape(3));
+    return Op::reshape(inputs, outputs);
+}
+ErrorCode CPUReLU2::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    auto input = inputs[0];
+    auto output = outputs[0];
+    int batch = input->batch();
+    int head = input->head();
+    int seq = input->sequence();
+    int dim = input->dimension();
+#pragma omp parallel for collapse(4)
+    for (int b = 0; b <batch ; ++b) {
+        for (int h = 0; h < head; ++h) {
+            for (int s = 0; s < seq; ++s) {
+                for (int d = 0; d < dim; ++d) {
+                    float value = input->dataAt<float>(b, h, s, d);
+                    if (value < 0) {
+                        value = 0;
+                    }
+                    //Square
+                    value = std::pow(value, 2);
+                    output->setDataAt<float>(b, h, s, d, value);
+                }
+            }
+        }
+    }
+    return Op::execute(inputs, outputs);
+}
+ErrorCode CPUReLU2::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
+    return Op::free(inputs, outputs);
+}
+} // namespace mllm
diff --git a/src/backends/cpu/CPUReLU2.hpp b/src/backends/cpu/CPUReLU2.hpp
@@ -0,0 +1,32 @@
+//
+// Created by 咸的鱼 on 2023/11/26.
+//
+
+#ifndef MLLM_CPURELU2_HPP
+#define MLLM_CPURELU2_HPP
+
+#include "Op.hpp"
+#include "CPUBackend.hpp"
+namespace mllm {
+class CPUReLU2 final : public Op {
+public:
+    CPUReLU2(Backend *bn, string opName, bool multiThread);
+    virtual ~CPUReLU2() = default;
+    virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
+    virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
+    virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
+
+
+private:
+    bool support_multi_thread_ = false;
+};
+
+class CPUReLU2Creator : public CPUBackend::Creator {
+public:
+    virtual Op *create(OpParam op_param, Backend *bn, string name) const {
+        return new CPUReLU2(bn, name, false);
+    }
+};
+} // namespace mllm
+
+#endif // MLLM_CPURELU2_HPP
diff --git a/test/TestLoader.cpp b/test/TestLoader.cpp
@@ -201,6 +201,7 @@ string DimDesc(vector<int> dim) {
 TestIO::TestIO(string filename, bool read_mode) :
     read_mode_(read_mode) {
     filename = "test_" + filename + ".mllm";
+
     if (read_mode) {
         fp_ = fopen(filename.c_str(), "rb");
     } else {

diff --git a/test/cpu/CPUReLU.cpp b/test/cpu/CPUReLU.cpp
@@ -0,0 +1,22 @@
+//
+// Created by 咸的鱼 on 2023/11/27.
+//
+
+#include "CPUTest.hpp"
+#include "backends/cpu/CPUReLU.hpp"
+TEST_F(CPUTest, CPUReLU1) {
+    SETUP_OP(CPUReLU, false);
+    TENSOR(input0);
+    TENSOR(output);
+    TENSOR(c_output);
+    TEST_LOAD(input0);
+    TEST_LOAD(output);
+
+    TEST_RESHAPE({input0}, {c_output});
+    TEST_SETUP({input0}, {c_output});
+    // TEST_LOAD(&op->weight(), false);
+    TEST_WEIGHTS_LOAD(loader);
+    //    op->weight().printData<float>();
+    TEST_EXCUTE({input0}, {c_output});
+    COMPARE_TENSOR(c_output.get(), output.get(), true);
+}
diff --git a/test/cpu/CPUReLU2.cpp b/test/cpu/CPUReLU2.cpp
@@ -0,0 +1,21 @@
+//
+// Created by 咸的鱼 on 2023/11/27.
+//
+#include "CPUTest.hpp"
+#include "backends/cpu/CPUReLU2.hpp"
+TEST_F(CPUTest, CPUReLU21) {
+    SETUP_OP(CPUReLU2, false);
+    TENSOR(input0);
+    TENSOR(output);
+    TENSOR(c_output);
+    TEST_LOAD(input0);
+    TEST_LOAD(output);
+
+    TEST_RESHAPE({input0}, {c_output});
+    TEST_SETUP({input0}, {c_output});
+    // TEST_LOAD(&op->weight(), false);
+    TEST_WEIGHTS_LOAD(loader);
+    //    op->weight().printData<float>();
+    TEST_EXCUTE({input0}, {c_output});
+    COMPARE_TENSOR(c_output.get(), output.get(), true);
+}
diff --git a/test/cpu/CPUReLU2Test.py b/test/cpu/CPUReLU2Test.py
@@ -0,0 +1,29 @@
+import torch
+from torch import nn
+
+
+class ReLUSquaredActivation(nn.Module):
+    """
+    Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
+    """
+
+    def forward(self, input):
+        relu_applied = nn.functional.relu(input)
+        squared = torch.square(relu_applied)
+        return squared
+
+
+from TestUtils import TestBase
+
+
+class CPURelu21(TestBase):
+    def test(self):
+        seed = 1234
+        torch.manual_seed(seed)
+        torch.set_printoptions(precision=7)
+        bs, seq_len, embedding_dim = 1, 10, 32000
+        input0 = torch.randn(bs, seq_len, embedding_dim).float()
+        relu = ReLUSquaredActivation()
+        output = relu(input0)
+        print(output)
+        self.test_done(True)