Skip to content

Commit

Permalink
bf16 saturation during model compilation
Browse files Browse the repository at this point in the history
  • Loading branch information
liubo-intel committed Jan 11, 2025
1 parent 6b5c0a5 commit ac50644
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 61 deletions.
29 changes: 18 additions & 11 deletions src/plugins/intel_cpu/src/cpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,26 @@ BlockedMemoryDescPtr IMemory::getDescWithType<BlockedMemoryDesc, 0, 0>() const {
}

namespace {
/**
 * @brief Post-processes an f32 buffer in place: optionally flushes subnormals to zero and
 *        optionally saturates finite values to the range representable in bf16.
 *
 * @param data            Pointer to the first float of the buffer (modified in place).
 * @param size            Number of float elements in the buffer.
 * @param ftz             When true, every element whose exponent bits are all zero
 *                        (subnormals and signed zeros) is overwritten with +0.0f.
 * @param bf16saturation  When true, finite elements outside [-bf16_max, bf16_max] are
 *                        clamped to the nearest bound.
 *
 * Infinities and NaNs (exponent bits all ones) are never modified: they are not bf16
 * overflows, so clamping them would silently turn an infinity into a finite value.
 */
inline void setSubnormalsToZeroAndbf16Saturation(float* data, size_t size, bool ftz, bool bf16saturation) {
    constexpr uint32_t exponentMask = 0xFFu << 23;
    // Largest finite magnitude representable in bf16.
    constexpr float bf16Max = 3.3895313899137927e38f;

    auto* bits = reinterpret_cast<uint32_t*>(data);
    for (size_t i = 0; i < size; ++i) {
        const uint32_t exponent = bits[i] & exponentMask;
        if (ftz && exponent == 0) {
            bits[i] = 0;
        } else if (bf16saturation && exponent != exponentMask) {
            // exponent != exponentMask filters out +/-inf and NaN, which must pass through untouched.
            if (data[i] < -bf16Max) {
                data[i] = -bf16Max;
            } else if (data[i] > bf16Max) {
                data[i] = bf16Max;
            }
        }
    }
}

void transferData(const IMemory& src, const IMemory& dst, bool ftz) {
void transferData(const IMemory& src, const IMemory& dst, bool ftz, bool bf16saturation) {
node::Reorder::reorderData(src, dst);

if (!ftz) {
if (!ftz && !bf16saturation) {
return;
}
if (src.getDesc().getPrecision() != ov::element::f32 || dst.getDesc().getPrecision() != ov::element::f32) {
Expand All @@ -62,7 +69,7 @@ void transferData(const IMemory& src, const IMemory& dst, bool ftz) {
// actual FTZ
auto* memData = static_cast<float*>(dst.getData());
memData += offset;
setSubnormalsToZero(memData, dst.getSize() / sizeof(float));
setSubnormalsToZeroAndbf16Saturation(memData, dst.getSize() / sizeof(float), ftz, bf16saturation);
}

} // namespace
Expand Down Expand Up @@ -125,11 +132,11 @@ void Memory::create(MemoryDescPtr desc, const void* data, bool pads_zeroing) {
}
}

// Loads the contents of `src` into this memory object.
// A source with string precision is rejected with an exception; any other source is
// handed to transferData() together with the ftz / bf16saturation flags.
void Memory::load(const IMemory& src, bool ftz) const {
void Memory::load(const IMemory& src, bool ftz, bool bf16saturation) const {
if (src.getDesc().getPrecision() == element::string) {
OPENVINO_THROW("[CPU] Memory object cannot load string data.");
}
transferData(src, *this, ftz);
transferData(src, *this, ftz, bf16saturation);
}

void Memory::nullify() {
Expand Down Expand Up @@ -271,12 +278,12 @@ StringMemory::StringMemory(const dnnl::engine& engine, const MemoryDescPtr& desc
}
}

// Loads string data from `src` into this string memory object.
// A source whose precision is not string is rejected with an exception.
// The ftz / bf16saturation arguments are ignored here: transferData() is always
// invoked with both post-processing flags disabled.
void StringMemory::load(const IMemory& src, bool ftz) const {
void StringMemory::load(const IMemory& src, bool ftz, bool bf16saturation) const {
if (src.getDesc().getPrecision() != element::string) {
OPENVINO_THROW("[CPU] String memory cannot load a non-string object.");
}

transferData(src, *this, false);
transferData(src, *this, false, false);
}

void* StringMemory::getData() const {
Expand Down Expand Up @@ -470,11 +477,11 @@ void StaticMemory::redefineDesc(MemoryDescPtr desc) {
OPENVINO_THROW("Unexpected: Memory descriptor may not be modified in StaticMemory object");
}

// Loads the contents of `src` into this static memory object.
// A source with string precision is rejected with an exception; any other source is
// handed to transferData() together with the ftz / bf16saturation flags.
void StaticMemory::load(const IMemory& src, bool ftz) const {
void StaticMemory::load(const IMemory& src, bool ftz, bool bf16saturation) const {
if (src.getDesc().getPrecision() == element::string) {
OPENVINO_THROW("[CPU] StaticMemory cannot load string data.");
}
transferData(src, *this, ftz);
transferData(src, *this, ftz, bf16saturation);
}

MemoryBlockPtr StaticMemory::getMemoryBlock() const {
Expand Down
8 changes: 4 additions & 4 deletions src/plugins/intel_cpu/src/cpu_memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ class IMemory {
// Caution!!! This action invalidates the previous data layout. The old data may become unreachable.
virtual void redefineDesc(MemoryDescPtr desc) = 0;

virtual void load(const IMemory& src, bool ftz = true) const = 0;
virtual void load(const IMemory& src, bool ftz = true, bool bf16saturation = false) const = 0;

virtual MemoryBlockPtr getMemoryBlock() const = 0;

Expand Down Expand Up @@ -259,7 +259,7 @@ class StaticMemory final : public IMemory {
// Always throws since a static memory descriptor should not be modified
void redefineDesc(MemoryDescPtr desc) override;

void load(const IMemory& src, bool ftz = true) const override;
void load(const IMemory& src, bool ftz = true, bool bf16saturation = false) const override;

MemoryBlockPtr getMemoryBlock() const override;

Expand Down Expand Up @@ -314,7 +314,7 @@ class Memory : public IMemory {

void redefineDesc(MemoryDescPtr desc) override;

void load(const IMemory& src, bool ftz = true) const override;
void load(const IMemory& src, bool ftz = true, bool bf16saturation = false) const override;
void nullify() override;

dnnl::engine getEngine() const {
Expand Down Expand Up @@ -420,7 +420,7 @@ class StringMemory : public IMemory {

void redefineDesc(MemoryDescPtr desc) override;

void load(const IMemory& src, bool ftz = false) const override;
void load(const IMemory& src, bool ftz = false, bool bf16saturation = false) const override;

MemoryBlockPtr getMemoryBlock() const override;

Expand Down
115 changes: 71 additions & 44 deletions src/plugins/intel_cpu/src/nodes/input.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,71 @@ void Input::cloneBlobIfRequired() {
needFlushDenormalsToZero = false;
}

// Ideally the presence of subnormals (and of bf16 overflows) would already be determined at IR read time.
// Scans the f32 constant and reports, through the two output flags, whether it contains
// subnormal values and whether it contains finite values outside the bf16 range.
// Constants of any other precision, and empty constants, leave both flags untouched.
auto checkSubnormalsAndBF16Overflows = [&](bool& has_subnormals, bool& has_bf16_overflows) {
    if (prec != ov::element::f32) {
        return;
    }

    const auto* rawBits = m_constOp->get_data_ptr<uint32_t>();
    const auto* values = m_constOp->get_data_ptr<float>();

    if (size == 0) {
        return;
    }

    const float bf16_max = 3.3895313899137927e38f;

    // NaN and +/-inf are not counted as overflows; only finite values beyond the bf16 range are.
    const auto overflowsBf16 = [bf16_max](float value) {
        return !(std::isnan(value) || std::isinf(value)) && (value < -bf16_max || value > bf16_max);
    };

#if defined(OPENVINO_ARCH_X86_64)
    if (auto jitCheck = jit_has_subnormals_function()) {
        static const size_t chunk = 2048;
        const size_t chunkCount = size / chunk + 1;

        volatile bool subnormalSeen = false;

        // Each task inspects one chunk; the last chunk may be shorter (or empty).
        parallel_for(chunkCount, [&](int idx) {
            const uint32_t* chunkBegin = rawBits + idx * chunk;
            const size_t remaining = static_cast<size_t>(rawBits + size - chunkBegin);
            const jit_has_subnormals_base::args_t args = {reinterpret_cast<float const*>(chunkBegin),
                                                          std::min(chunk, remaining),
                                                          false};

            jitCheck(&args);

            if (args.hasSubnormals) {
                subnormalSeen = true;
            }
        });

        has_subnormals = subnormalSeen;
        // TODO: opt with jit
        if (std::any_of(values, values + size, overflowsBf16)) {
            has_bf16_overflows = true;
        }
        return;
    }
#endif

    constexpr uint32_t exponentBits = 0x7f800000;
    constexpr uint32_t mantissaBits = 0x007fffff;
    for (size_t idx = 0; idx < size; ++idx) {
        const uint32_t word = rawBits[idx];
        // Subnormal: zero exponent with a non-zero mantissa.
        if ((word & exponentBits) == 0 && (word & mantissaBits) != 0) {
            has_subnormals = true;
        }
        if (overflowsBf16(values[idx])) {
            has_bf16_overflows = true;
        }
        // Nothing more to learn once both conditions have been observed.
        if (has_subnormals && has_bf16_overflows) {
            break;
        }
    }
};

bool has_subnormals = false;
bool has_bf16_overflows = false;

checkSubnormalsAndBF16Overflows(has_subnormals, has_bf16_overflows);

auto cloneBlob = [&, this]() {
MemoryPtr memory;

Expand Down Expand Up @@ -294,7 +359,7 @@ void Input::cloneBlobIfRequired() {
} else {
ptr = std::make_shared<StaticMemory>(getEngine(), memDesc);
}
ptr->load(*memory.get(), needFlushDenormalsToZero);
ptr->load(*memory.get(), needFlushDenormalsToZero, has_bf16_overflows);

return ptr;
};
Expand All @@ -311,60 +376,22 @@ void Input::cloneBlobIfRequired() {
#endif
};

// Ideally the presence of subnormals would already be determined at IR read time.
// Returns true when the f32 constant contains at least one subnormal value;
// returns false for any other precision and for empty constants.
auto hasSubnormals = [&]() {
    if (prec != ov::element::f32) {
        return false;
    }

    const auto* rawBits = m_constOp->get_data_ptr<uint32_t>();

    if (size == 0) {
        return false;
    }

#if defined(OPENVINO_ARCH_X86_64)
    if (auto jitCheck = jit_has_subnormals_function()) {
        static const size_t chunk = 2048;
        const size_t chunkCount = size / chunk + 1;

        volatile bool subnormalSeen = false;

        // Each task inspects one chunk; the last chunk may be shorter (or empty).
        parallel_for(chunkCount, [&](int idx) {
            const uint32_t* chunkBegin = rawBits + idx * chunk;
            const size_t remaining = static_cast<size_t>(rawBits + size - chunkBegin);
            const jit_has_subnormals_base::args_t args = {reinterpret_cast<float const*>(chunkBegin),
                                                          std::min(chunk, remaining),
                                                          false};

            jitCheck(&args);

            if (args.hasSubnormals) {
                subnormalSeen = true;
            }
        });

        return subnormalSeen;
    }
#endif

    constexpr uint32_t exponentBits = 0x7f800000;
    constexpr uint32_t mantissaBits = 0x007fffff;
    // Subnormal: zero exponent with a non-zero mantissa.
    return std::any_of(rawBits, rawBits + size, [](uint32_t word) {
        return (word & exponentBits) == 0 && (word & mantissaBits) != 0;
    });
};

// Builds the key for this constant's blob:
// "<node name>_<size in bytes>_<address of the constant data>".
auto blobKey = [&]() {
    char addressText[32];
    snprintf(addressText, sizeof(addressText), "%p", m_constOp->get_data_ptr());
    const auto byteSize = size * prec.size();
    return getName() + "_" + std::to_string(byteSize) + "_" + addressText;
};

// NOTE(review): temporary debug output ("my test") written to stdout whenever a bf16
// overflow was detected — looks like a local-testing leftover; confirm and remove before merging.
if (has_bf16_overflows) {
std::cout << "my test: has_bf16_overflows" << std::endl;
}
const auto weightCache = context->getWeightsCache();
const bool clone_is_not_needed =
prec != element::string &&
// IRs already have all subnormals flushed to zero, but in
// read_model scenario with directly loaded original model still can have subnormals
isBlobAligned(m_constOp) && (!needFlushDenormalsToZero || !hasSubnormals()) &&
isBlobAligned(m_constOp) && (!needFlushDenormalsToZero || !has_subnormals) && !has_bf16_overflows &&
// Blob should be cloned in cache only if original weights are stored on other numa node.
// This is possible only in multistream case on multisocket machine.
// TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/nodes/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class MemoryStub : public IMemory {
m_pMemDesc = desc;
}

// MemoryStub does not support loading data: every call throws, regardless of the arguments.
void load(const IMemory& src, bool ftz = true) const override {
void load(const IMemory& src, bool ftz = true, bool bf16saturation = false) const override {
OPENVINO_THROW("Unexpected call MemoryStub::load()");
}
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/tests/unit/cpu_tensor_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class MockIMemory : public IMemory {
MOCK_METHOD(const VectorDims&, getStaticDims, (), (const, override));

MOCK_METHOD(void, redefineDesc, (MemoryDescPtr), (override));
MOCK_METHOD(void, load, (const IMemory&, bool), (const, override));
MOCK_METHOD(void, load, (const IMemory&, bool, bool), (const, override));
MOCK_METHOD(MemoryBlockPtr, getMemoryBlock, (), (const, override));

MOCK_METHOD(dnnl::memory, getPrimitive, (), (const, override));
Expand Down

0 comments on commit ac50644

Please sign in to comment.