Prepare For Fuyu
RELU, ReLUSquaredActivation & LayerNorm
lx200916 authored Nov 27, 2023
2 parents e7e7711 + 8584b64 commit a0aeb52
Showing 15 changed files with 373 additions and 3 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -74,6 +74,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
    add_compile_options(-mavx2)
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
    message(STATUS "ARM detected")
    add_definitions(-DARM)
endif()

if(QUANT)
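The new `add_definitions(-DARM)` line exposes an `ARM` preprocessor symbol to every translation unit on arm/aarch64 builds. A minimal sketch of how such a symbol is typically consumed — the helper below is hypothetical, not part of this commit — to gate NEON-specific code paths:

#ifdef ARM
#include <arm_neon.h>
// Hypothetical helper: element-wise max(v, 0) over four floats via NEON intrinsics.
static inline float32x4_t relu_f32x4(float32x4_t v) {
    return vmaxq_f32(v, vdupq_n_f32(0.0f));
}
#endif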
6 changes: 6 additions & 0 deletions include/OpDefined.hpp
@@ -23,6 +23,9 @@ enum OpType {
    MUL,
    VIEW,
    KVCACHE,
    RELU,
    RELU2,
    LAYERNORM,
    OP_NUM
};

@@ -42,6 +45,9 @@ static const vector<string> OpNames = {
"Mul",
"VIEW",
"KVCACHE",
"ReLU",
"ReLUSquaredActivation",
"LayerNorm",
"OP_NUM"};
} // namespace mllm
#endif
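The enum and the name table are index-aligned, so each new OpType enumerator must be inserted at the same position in OpNames. A quick sanity check — illustrative only, not part of the commit:

#include <cassert>
#include "OpDefined.hpp"

// Illustrative only: each OpType enumerator must index its own
// name in OpNames, keeping the two lists in lockstep.
static void checkOpNamesAligned() {
    using namespace mllm;
    assert(OpNames[RELU] == "ReLU");
    assert(OpNames[RELU2] == "ReLUSquaredActivation");
    assert(OpNames[LAYERNORM] == "LayerNorm");
}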
3 changes: 3 additions & 0 deletions src/backends/cpu/CPUBackend.cpp
@@ -13,6 +13,7 @@
#include "CPUEmbedding.hpp"
#include "CPUMul.hpp"
#include "CPUKVCache.hpp"
#include "CPUReLU.hpp"
#include <math.h>
namespace mllm {
CPUBackend::CPUBackend(shared_ptr<MemoryManager>& mm) :
@@ -72,6 +73,8 @@ void CPUBackend::registerOps() {
    addCreator(MUL, (CPUBackend::Creator *)(new CPUMulCreator()));
    addCreator(VIEW, (CPUBackend::Creator *)(new CPUViewCreator()));
    addCreator(KVCACHE, (CPUBackend::Creator *)(new CPUKVCacheCreator()));
    addCreator(RELU, (CPUBackend::Creator *)(new CPUReLUCreator()));
    addCreator(RELU2, (CPUBackend::Creator *)(new CPUReLU2Creator()));
}

} // namespace mllm
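For context, `addCreator` populates a type-to-factory registry, so op instantiation becomes a table lookup. The sketch below is an assumption about the dispatch shape, not mllm's actual API:

#include <map>
#include <string>

// Hypothetical sketch (mllm's real dispatch may differ): addCreator() fills a
// map from OpType to Creator, and constructing an op is a lookup plus create().
Op *createOp(const std::map<OpType, CPUBackend::Creator *> &creators,
             OpType type, OpParam param, Backend *bn, const std::string &name) {
    auto it = creators.find(type);
    if (it == creators.end()) {
        return nullptr; // op type was never registered
    }
    return it->second->create(param, bn, name);
}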
70 changes: 70 additions & 0 deletions src/backends/cpu/CPULayerNorm.cpp
@@ -0,0 +1,70 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#include "CPULayerNorm.hpp"

namespace mllm {
CPULayerNorm::CPULayerNorm(Backend *bn, string opName, bool multiThread, float epsilon) :
support_multi_thread_(multiThread), Op(bn, std::move(opName)), epsilon_(epsilon) {
}
ErrorCode CPULayerNorm::load(AbstructLoader &loader) {
weight_.setName(name() + ".weight");
weight_.reshape(1, 1, 1, normSize_); //
weight_.setDtype(loader.getDataType(weight_.name()));
weight_.alloc();
loader.load(&weight_);
bias_.setName(name() + ".bias");
bias_.reshape(1, 1, 1, normSize_); //
bias_.setDtype(loader.getDataType(bias_.name()));
bias_.alloc();
loader.load(&bias_);
return Op::load(loader);
}
ErrorCode CPULayerNorm::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
normSize_ = inputs[0]->dimension();
outputs[0]->reshape(inputs[0]->batch(), inputs[0]->shape(1), inputs[0]->shape(2), inputs[0]->shape(3));
return Op::reshape(inputs, outputs);
}

ErrorCode CPULayerNorm::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
auto input = inputs[0];
auto output = outputs[0];
int batch = input->batch();
int dim = input->dimension();
int seq = input->sequence();
int head = input->head();
for (int h = 0; h < head; h++) {
for (int n = 0; n < batch; n++) {
for (int s = 0; s < seq; s++) {
float sum_squares = 0.0F;
float sum = 0.0F;
// sum
#pragma omp parallel for reduction(+ : sum_squares) reduction(+ : sum) num_threads(4)
for (int d = 0; d < dim; d++) {
float value = input->dataAt<float>(n, h, s, d);
sum += value;
}
float mean = sum / dim;
#pragma omp parallel for reduction(+ : sum_squares) num_threads(4)
for (int d = 0; d < dim; d++) {
float value = input->dataAt<float>(n, h, s, d);
sum_squares += (value - mean) * (value - mean);
output->setDataAt(n, h, s, d, value - mean);
}
float rms = std::sqrt(sum_squares / dim + epsilon_);
#pragma omp parallel for num_threads(4)
for (int d = 0; d < dim; d++) {
float value = output->dataAt<float>(n, h, s, d);
output->setDataAt<float>(n, h, s, d, weight_.dataAt<float>(0, 0, 0, d) * value / rms + bias_.dataAt<float>(0, 0, 0, d));
}
}
}
}

return Op::execute(inputs, outputs);
}
ErrorCode CPULayerNorm::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
return Op::free(inputs, outputs);
}
} // namespace mllm
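For reference, execute() implements standard LayerNorm over the last dimension, with learned per-channel scale \(\gamma\) (weight_) and shift \(\beta\) (bias_):

\[
\mu = \frac{1}{D}\sum_{d=1}^{D} x_d, \qquad
\sigma^2 = \frac{1}{D}\sum_{d=1}^{D} (x_d - \mu)^2, \qquad
y_d = \gamma_d \, \frac{x_d - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta_d
\]

so the std_dev variable above holds \(\sqrt{\sigma^2 + \epsilon}\), the \(\epsilon\)-stabilized standard deviation.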
38 changes: 38 additions & 0 deletions src/backends/cpu/CPULayerNorm.hpp
@@ -0,0 +1,38 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#ifndef MLLM_CPULAYERNORM_HPP
#define MLLM_CPULAYERNORM_HPP

#include "CPUBackend.hpp"
namespace mllm {

class CPULayerNorm : public Op {
public:
    CPULayerNorm(Backend *bn, string opName, bool multiThread, float epsilon = 1e-5f);
    virtual ~CPULayerNorm() = default;
    virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
    virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
    virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
    ErrorCode load(AbstructLoader &loader) override;

private:
    bool support_multi_thread_ = false;
    float epsilon_ = 1e-5f;
    int normSize_ = 0;
    Tensor weight_;
    Tensor bias_;
};
class CPULayerNormCreator : public CPUBackend::Creator {
public:
    virtual Op *create(OpParam op_param, Backend *bn, string name) const {
        return new CPULayerNorm(bn, name, false);
    }
};

} // namespace mllm

#endif // MLLM_CPULAYERNORM_HPP
42 changes: 42 additions & 0 deletions src/backends/cpu/CPUReLU.cpp
@@ -0,0 +1,42 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#include "CPUReLU.hpp"

#include <utility>

namespace mllm {
CPUReLU::CPUReLU(Backend *bn, string opName, bool multiThread):support_multi_thread_(multiThread), Op(bn, std::move(opName)) {
}
ErrorCode CPUReLU::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
CHECK_EQ(inputs.size(), 1);
CHECK_EQ(outputs.size(), 1);
outputs[0]->reshape(inputs[0]->batch(), inputs[0]->shape(1), inputs[0]->shape(2), inputs[0]->shape(3));
return Op::reshape(inputs, outputs);
}

ErrorCode CPUReLU::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
auto input = inputs[0];
auto output = outputs[0];
int batch = input->batch();
int head = input->head();
int seq = input->sequence();
int dim = input->dimension();
#pragma omp parallel for collapse(4)
for (int b = 0; b <batch ; ++b) {
for (int h = 0; h < head; ++h) {
for (int s = 0; s < seq; ++s) {
for (int d = 0; d < dim; ++d) {
float value = input->dataAt<float>(b, h, s, d);
output->setDataAt<float>(b, h, s, d, value > 0 ? value : 0);
}
}
}
}
return Op::execute(inputs, outputs);
}
ErrorCode CPUReLU::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
return Op::free(inputs, outputs);
}
} // namespace mllm
32 changes: 32 additions & 0 deletions src/backends/cpu/CPUReLU.hpp
@@ -0,0 +1,32 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#ifndef MLLM_CPURELU_HPP
#define MLLM_CPURELU_HPP

#include "Op.hpp"
#include "CPUBackend.hpp"
namespace mllm {
class CPUReLU final : public Op {
public:
    CPUReLU(Backend *bn, string opName, bool multiThread);
    virtual ~CPUReLU() = default;
    virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
    virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
    virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;

private:
    bool support_multi_thread_ = false;
};

class CPUReLUCreator : public CPUBackend::Creator {
public:
    virtual Op *create(OpParam op_param, Backend *bn, string name) const {
        return new CPUReLU(bn, name, false);
    }
};
} // namespace mllm

#endif // MLLM_CPURELU_HPP
45 changes: 45 additions & 0 deletions src/backends/cpu/CPUReLU2.cpp
@@ -0,0 +1,45 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#include "CPUReLU2.hpp"

namespace mllm {

CPUReLU2::CPUReLU2(Backend *bn, string opName, bool multiThread):support_multi_thread_(multiThread), Op(bn, std::move(opName)) {
}
ErrorCode CPUReLU2::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
CHECK_EQ(inputs.size(), 1);
CHECK_EQ(outputs.size(), 1);
outputs[0]->reshape(inputs[0]->batch(), inputs[0]->shape(1), inputs[0]->shape(2), inputs[0]->shape(3));
return Op::reshape(inputs, outputs);
}
ErrorCode CPUReLU2::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
auto input = inputs[0];
auto output = outputs[0];
int batch = input->batch();
int head = input->head();
int seq = input->sequence();
int dim = input->dimension();
#pragma omp parallel for collapse(4)
for (int b = 0; b <batch ; ++b) {
for (int h = 0; h < head; ++h) {
for (int s = 0; s < seq; ++s) {
for (int d = 0; d < dim; ++d) {
float value = input->dataAt<float>(b, h, s, d);
if (value < 0) {
value = 0;
}
//Square
value = std::pow(value, 2);
output->setDataAt<float>(b, h, s, d, value);
}
}
}
}
return Op::execute(inputs, outputs);
}
ErrorCode CPUReLU2::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
return Op::free(inputs, outputs);
}
} // namespace mllm
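This op computes the relu² activation referenced by the Python test below (arXiv:2109.08668):

\[
\mathrm{ReLU}^2(x) = \max(0, x)^2
\]

For example, inputs \((-1,\ 0.5,\ 2)\) map to \((0,\ 0.25,\ 4)\).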
32 changes: 32 additions & 0 deletions src/backends/cpu/CPUReLU2.hpp
@@ -0,0 +1,32 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#ifndef MLLM_CPURELU2_HPP
#define MLLM_CPURELU2_HPP

#include "Op.hpp"
#include "CPUBackend.hpp"
namespace mllm {
class CPUReLU2 final : public Op {
public:
    CPUReLU2(Backend *bn, string opName, bool multiThread);
    virtual ~CPUReLU2() = default;
    virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
    virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
    virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;

private:
    bool support_multi_thread_ = false;
};

class CPUReLU2Creator : public CPUBackend::Creator {
public:
    virtual Op *create(OpParam op_param, Backend *bn, string name) const {
        return new CPUReLU2(bn, name, false);
    }
};
} // namespace mllm

#endif // MLLM_CPURELU2_HPP
1 change: 1 addition & 0 deletions test/TestLoader.cpp
@@ -201,6 +201,7 @@ string DimDesc(vector<int> dim) {
TestIO::TestIO(string filename, bool read_mode) :
    read_mode_(read_mode) {
    filename = "test_" + filename + ".mllm";

    if (read_mode) {
        fp_ = fopen(filename.c_str(), "rb");
    } else {
22 changes: 22 additions & 0 deletions test/cpu/CPUReLU.cpp
@@ -0,0 +1,22 @@
//
// Created by 咸的鱼 on 2023/11/27.
//

#include "CPUTest.hpp"
#include "backends/cpu/CPUReLU.hpp"
TEST_F(CPUTest, CPUReLU1) {
SETUP_OP(CPUReLU, false);
TENSOR(input0);
TENSOR(output);
TENSOR(c_output);
TEST_LOAD(input0);
TEST_LOAD(output);

TEST_RESHAPE({input0}, {c_output});
TEST_SETUP({input0}, {c_output});
// TEST_LOAD(&op->weight(), false);
TEST_WEIGHTS_LOAD(loader);
// op->weight().printData<float>();
TEST_EXCUTE({input0}, {c_output});
COMPARE_TENSOR(c_output.get(), output.get(), true);
}
21 changes: 21 additions & 0 deletions test/cpu/CPUReLU2.cpp
@@ -0,0 +1,21 @@
//
// Created by 咸的鱼 on 2023/11/27.
//
#include "CPUTest.hpp"
#include "backends/cpu/CPUReLU2.hpp"
TEST_F(CPUTest, CPUReLU21) {
SETUP_OP(CPUReLU2, false);
TENSOR(input0);
TENSOR(output);
TENSOR(c_output);
TEST_LOAD(input0);
TEST_LOAD(output);

TEST_RESHAPE({input0}, {c_output});
TEST_SETUP({input0}, {c_output});
// TEST_LOAD(&op->weight(), false);
TEST_WEIGHTS_LOAD(loader);
// op->weight().printData<float>();
TEST_EXCUTE({input0}, {c_output});
COMPARE_TENSOR(c_output.get(), output.get(), true);
}
29 changes: 29 additions & 0 deletions test/cpu/CPUReLU2Test.py
@@ -0,0 +1,29 @@
import torch
from torch import nn

from TestUtils import TestBase


class ReLUSquaredActivation(nn.Module):
    """
    Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
    """

    def forward(self, input):
        relu_applied = nn.functional.relu(input)
        squared = torch.square(relu_applied)
        return squared


class CPURelu21(TestBase):
    def test(self):
        seed = 1234
        torch.manual_seed(seed)
        torch.set_printoptions(precision=7)
        bs, seq_len, embedding_dim = 1, 10, 32000
        input0 = torch.randn(bs, seq_len, embedding_dim).float()
        relu = ReLUSquaredActivation()
        output = relu(input0)
        print(output)
        self.test_done(True)
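Judging by TestIO's "test_" + filename + ".mllm" naming in TestLoader.cpp above, this script presumably serializes input0 and output as the reference tensors that the C++ CPUReLU21 test loads via TEST_LOAD and checks with COMPARE_TENSOR; the exact serialization is assumed to live in TestBase/test_done.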