Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RELU,ReLUSquaredActivation & LayerNorm #32

Merged
merged 2 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
add_compile_options(-mavx2)
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
add_definitions(-DARM)
endif()

if(QUANT)
Expand Down
6 changes: 6 additions & 0 deletions include/OpDefined.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ enum OpType {
MUL,
VIEW,
KVCACHE,
RELU,
RELU2,
LAYERNORM,
OP_NUM
};

Expand All @@ -42,6 +45,9 @@ static const vector<string> OpNames = {
"Mul",
"VIEW",
"KVCACHE",
"ReLU",
"ReLUSquaredActivation",
"LayerNorm",
"OP_NUM"};
} // namespace mllm
#endif
3 changes: 3 additions & 0 deletions src/backends/cpu/CPUBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "CPUEmbedding.hpp"
#include "CPUMul.hpp"
#include "CPUKVCache.hpp"
#include "CPUReLU.hpp"
#include <math.h>
namespace mllm {
CPUBackend::CPUBackend(shared_ptr<MemoryManager>& mm) :
Expand Down Expand Up @@ -72,6 +73,8 @@ void CPUBackend::registerOps() {
addCreator(MUL, (CPUBackend::Creator *)(new CPUMulCreator()));
addCreator(VIEW, (CPUBackend::Creator *)(new CPUViewCreator()));
addCreator(KVCACHE, (CPUBackend::Creator *)(new CPUKVCacheCreator()));
addCreator(RELU, (CPUBackend::Creator *)(new CPUReLUCreator()));
addCreator(RELU2, (CPUBackend::Creator *)(new CPUReLUCreator()));
}

} // namespace mllm
70 changes: 70 additions & 0 deletions src/backends/cpu/CPULayerNorm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#include "CPULayerNorm.hpp"

namespace mllm {
CPULayerNorm::CPULayerNorm(Backend *bn, string opName, bool multiThread, float epsilon) :
support_multi_thread_(multiThread), Op(bn, std::move(opName)), epsilon_(epsilon) {
}
ErrorCode CPULayerNorm::load(AbstructLoader &loader) {
weight_.setName(name() + ".weight");
weight_.reshape(1, 1, 1, normSize_); //
weight_.setDtype(loader.getDataType(weight_.name()));
weight_.alloc();
loader.load(&weight_);
bias_.setName(name() + ".bias");
bias_.reshape(1, 1, 1, normSize_); //
bias_.setDtype(loader.getDataType(bias_.name()));
bias_.alloc();
loader.load(&bias_);
return Op::load(loader);
}
ErrorCode CPULayerNorm::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
normSize_ = inputs[0]->dimension();
outputs[0]->reshape(inputs[0]->batch(), inputs[0]->shape(1), inputs[0]->shape(2), inputs[0]->shape(3));
return Op::reshape(inputs, outputs);
}

ErrorCode CPULayerNorm::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
auto input = inputs[0];
auto output = outputs[0];
int batch = input->batch();
int dim = input->dimension();
int seq = input->sequence();
int head = input->head();
for (int h = 0; h < head; h++) {
for (int n = 0; n < batch; n++) {
for (int s = 0; s < seq; s++) {
float sum_squares = 0.0F;
float sum = 0.0F;
// sum
#pragma omp parallel for reduction(+ : sum_squares) reduction(+ : sum) num_threads(4)
for (int d = 0; d < dim; d++) {
float value = input->dataAt<float>(n, h, s, d);
sum += value;
}
float mean = sum / dim;
#pragma omp parallel for reduction(+ : sum_squares) num_threads(4)
for (int d = 0; d < dim; d++) {
float value = input->dataAt<float>(n, h, s, d);
sum_squares += (value - mean) * (value - mean);
output->setDataAt(n, h, s, d, value - mean);
}
float rms = std::sqrt(sum_squares / dim + epsilon_);
#pragma omp parallel for num_threads(4)
for (int d = 0; d < dim; d++) {
float value = output->dataAt<float>(n, h, s, d);
output->setDataAt<float>(n, h, s, d, weight_.dataAt<float>(0, 0, 0, d) * value / rms + bias_.dataAt<float>(0, 0, 0, d));
}
}
}
}

return Op::execute(inputs, outputs);
}
ErrorCode CPULayerNorm::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
return Op::free(inputs, outputs);
}
} // namespace mllm
38 changes: 38 additions & 0 deletions src/backends/cpu/CPULayerNorm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#ifndef MLLM_CPULAYERNORM_HPP
#define MLLM_CPULAYERNORM_HPP

#include "CPUBackend.hpp"
namespace mllm {

class CPULayerNorm:public Op {
public:
CPULayerNorm(Backend *bn, string opName, bool multiThread, float epsilon = 1e-5);
virtual ~CPULayerNorm() = default;
virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
ErrorCode load(AbstructLoader &loader) override;

private:
bool support_multi_thread_ = false;
float epsilon_ = 1e-5;
int normSize_=0;
Tensor weight_;
Tensor bias_;
};
class CPULayerNormCreator : public CPUBackend::Creator {
public:
virtual Op *create(OpParam op_param, Backend *bn, string name) const {

return new CPULayerNorm(bn, name, false);
}
};


} // namespace mllm

#endif // MLLM_CPULAYERNORM_HPP
42 changes: 42 additions & 0 deletions src/backends/cpu/CPUReLU.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#include "CPUReLU.hpp"

#include <utility>

namespace mllm {
CPUReLU::CPUReLU(Backend *bn, string opName, bool multiThread):support_multi_thread_(multiThread), Op(bn, std::move(opName)) {
}
ErrorCode CPUReLU::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
CHECK_EQ(inputs.size(), 1);
CHECK_EQ(outputs.size(), 1);
outputs[0]->reshape(inputs[0]->batch(), inputs[0]->shape(1), inputs[0]->shape(2), inputs[0]->shape(3));
return Op::reshape(inputs, outputs);
}

ErrorCode CPUReLU::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
auto input = inputs[0];
auto output = outputs[0];
int batch = input->batch();
int head = input->head();
int seq = input->sequence();
int dim = input->dimension();
#pragma omp parallel for collapse(4)
for (int b = 0; b <batch ; ++b) {
for (int h = 0; h < head; ++h) {
for (int s = 0; s < seq; ++s) {
for (int d = 0; d < dim; ++d) {
float value = input->dataAt<float>(b, h, s, d);
output->setDataAt<float>(b, h, s, d, value > 0 ? value : 0);
}
}
}
}
return Op::execute(inputs, outputs);
}
ErrorCode CPUReLU::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
return Op::free(inputs, outputs);
}
} // namespace mllm
32 changes: 32 additions & 0 deletions src/backends/cpu/CPUReLU.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#ifndef MLLM_CPURELU_HPP
#define MLLM_CPURELU_HPP

#include "Op.hpp"
#include "CPUBackend.hpp"
namespace mllm {
class CPUReLU final : public Op {
public:
CPUReLU(Backend *bn, string opName, bool multiThread);
virtual ~CPUReLU() = default;
virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;


private:
bool support_multi_thread_ = false;
};

class CPUReLUCreator : public CPUBackend::Creator {
public:
virtual Op *create(OpParam op_param, Backend *bn, string name) const {
return new CPUReLU(bn, name, false);
}
};
} // namespace mllm

#endif // MLLM_CPURELU_HPP
45 changes: 45 additions & 0 deletions src/backends/cpu/CPUReLU2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#include "CPUReLU2.hpp"

namespace mllm {

CPUReLU2::CPUReLU2(Backend *bn, string opName, bool multiThread):support_multi_thread_(multiThread), Op(bn, std::move(opName)) {
}
ErrorCode CPUReLU2::reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
CHECK_EQ(inputs.size(), 1);
CHECK_EQ(outputs.size(), 1);
outputs[0]->reshape(inputs[0]->batch(), inputs[0]->shape(1), inputs[0]->shape(2), inputs[0]->shape(3));
return Op::reshape(inputs, outputs);
}
ErrorCode CPUReLU2::execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
auto input = inputs[0];
auto output = outputs[0];
int batch = input->batch();
int head = input->head();
int seq = input->sequence();
int dim = input->dimension();
#pragma omp parallel for collapse(4)
for (int b = 0; b <batch ; ++b) {
for (int h = 0; h < head; ++h) {
for (int s = 0; s < seq; ++s) {
for (int d = 0; d < dim; ++d) {
float value = input->dataAt<float>(b, h, s, d);
if (value < 0) {
value = 0;
}
//Square
value = std::pow(value, 2);
output->setDataAt<float>(b, h, s, d, value);
}
}
}
}
return Op::execute(inputs, outputs);
}
ErrorCode CPUReLU2::free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) {
return Op::free(inputs, outputs);
}
} // namespace mllm
32 changes: 32 additions & 0 deletions src/backends/cpu/CPUReLU2.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
//
// Created by 咸的鱼 on 2023/11/26.
//

#ifndef MLLM_CPURELU2_HPP
#define MLLM_CPURELU2_HPP

#include "Op.hpp"
#include "CPUBackend.hpp"
namespace mllm {
class CPUReLU2 final : public Op {
public:
CPUReLU2(Backend *bn, string opName, bool multiThread);
virtual ~CPUReLU2() = default;
virtual ErrorCode reshape(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode execute(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;
virtual ErrorCode free(vector<shared_ptr<Tensor>> inputs, vector<shared_ptr<Tensor>> outputs) override;


private:
bool support_multi_thread_ = false;
};

class CPUReLU2Creator : public CPUBackend::Creator {
public:
virtual Op *create(OpParam op_param, Backend *bn, string name) const {
return new CPUReLU2(bn, name, false);
}
};
} // namespace mllm

#endif // MLLM_CPURELU2_HPP
1 change: 1 addition & 0 deletions test/TestLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ string DimDesc(vector<int> dim) {
TestIO::TestIO(string filename, bool read_mode) :
read_mode_(read_mode) {
filename = "test_" + filename + ".mllm";

if (read_mode) {
fp_ = fopen(filename.c_str(), "rb");
} else {
Expand Down
22 changes: 22 additions & 0 deletions test/cpu/CPUReLU.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//
// Created by 咸的鱼 on 2023/11/27.
//

#include "CPUTest.hpp"
#include "backends/cpu/CPUReLU.hpp"
TEST_F(CPUTest, CPUReLU1) {
SETUP_OP(CPUReLU, false);
TENSOR(input0);
TENSOR(output);
TENSOR(c_output);
TEST_LOAD(input0);
TEST_LOAD(output);

TEST_RESHAPE({input0}, {c_output});
TEST_SETUP({input0}, {c_output});
// TEST_LOAD(&op->weight(), false);
TEST_WEIGHTS_LOAD(loader);
// op->weight().printData<float>();
TEST_EXCUTE({input0}, {c_output});
COMPARE_TENSOR(c_output.get(), output.get(), true);
}
21 changes: 21 additions & 0 deletions test/cpu/CPUReLU2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//
// Created by 咸的鱼 on 2023/11/27.
//
#include "CPUTest.hpp"
#include "backends/cpu/CPUReLU2.hpp"
TEST_F(CPUTest, CPUReLU21) {
SETUP_OP(CPUReLU2, false);
TENSOR(input0);
TENSOR(output);
TENSOR(c_output);
TEST_LOAD(input0);
TEST_LOAD(output);

TEST_RESHAPE({input0}, {c_output});
TEST_SETUP({input0}, {c_output});
// TEST_LOAD(&op->weight(), false);
TEST_WEIGHTS_LOAD(loader);
// op->weight().printData<float>();
TEST_EXCUTE({input0}, {c_output});
COMPARE_TENSOR(c_output.get(), output.get(), true);
}
29 changes: 29 additions & 0 deletions test/cpu/CPUReLU2Test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import torch
from torch import nn


class ReLUSquaredActivation(nn.Module):
"""
Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
"""

def forward(self, input):
relu_applied = nn.functional.relu(input)
squared = torch.square(relu_applied)
return squared


from TestUtils import TestBase


class CPURelu21(TestBase):
def test(self):
seed = 1234
torch.manual_seed(seed)
torch.set_printoptions(precision=7)
bs, seq_len, embedding_dim = 1, 10, 32000
input0 = torch.randn(bs, seq_len, embedding_dim).float()
relu = ReLUSquaredActivation()
output = relu(input0)
print(output)
self.test_done(True)
Loading
Loading