Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SINGA-236 memory pool #255

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions include/singa/core/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@


#ifdef USE_OPENCL
#include "singa/utils/opencl_utils.h"
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#define CL_HPP_TARGET_OPENCL_VERSION 120
#include <CL/cl2.hpp>
#include <unordered_map>
#endif // USE_OPENCL

using std::atomic;
Expand All @@ -62,6 +65,9 @@ class Block {
// Disabled as it is not used currently.
// Block(void* ptr, size_t size, size_t offset, std::shared_ptr<atomic<int>>
// ref) : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {}

// Replaces the stored data pointer with `ptr`.
// TODO(wangwei) check whether this setter is correct, and add a lock if a
// shared structure is allowed.
void set_data(void* ptr) { data_ = ptr; }
void* mutable_data() {
initialized_ = true;
return static_cast<char*>(data_) + offset_;
Expand Down Expand Up @@ -107,8 +113,9 @@ typedef struct _Context {
#endif // USE_CUDA

#ifdef USE_OPENCL
// This stores the context ID of the OpenCL context controlled by ViennaCL.
long vcl_ctx_id;
std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
cl::CommandQueue ocl_cmdq;
cl::Context ocl_ctx;
#endif

} Context;
Expand Down
32 changes: 32 additions & 0 deletions include/singa/core/memory.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <atomic>
#include "singa/proto/core.pb.h"
#include "singa/singa_config.h"
#include "singa/core/common.h"

#ifdef USE_CUDA
#include "cnmem.h"
Expand Down Expand Up @@ -50,6 +51,37 @@ class DeviceMemPool {
// size_t init_size_ = 0, max_size_ = 0;
};

// A memory pool that hands out Block objects and keeps freed memory units on
// per-size-class free lists so that later requests can reuse them.
class CppMemPool {
 public:
  CppMemPool();

  // The pool holds its memory units and list tables through raw pointers that
  // the destructor releases. A member-wise copy would make two pools release
  // the same memory, so copying is disabled.
  CppMemPool(const CppMemPool&) = delete;
  CppMemPool& operator=(const CppMemPool&) = delete;

  // Returns a Block backed by pool memory for a request of `size` bytes.
  Block* Malloc(const size_t size);
  // Hands a Block previously returned by Malloc back to the pool.
  void Free(Block* ptr);

  // Returns (free size, total size) of the memory pool, both in bytes.
  std::pair<size_t, size_t> GetMemUsage() const {
    return std::make_pair(freeSize, memPoolSize);
  }

  ~CppMemPool();

 private:
  // Each structure describes one memory unit of the pool. Units are chained
  // into doubly linked lists through pPrev/pNext; pBlk is the Block that
  // exposes the unit to callers.
  struct _Uint {
    struct _Uint *pPrev, *pNext;
    Block* pBlk;
  };

  // total size held by the memory pool (in bytes)
  size_t memPoolSize;
  // total free size held by the memory pool (in bytes)
  size_t freeSize;

  // each pointer in this array is the head of the list of allocated memory
  // units of one size class (power of 2)
  struct _Uint** ppAllocUints;
  // each pointer in this array is the head of the list of free memory units
  // of one size class (power of 2)
  struct _Uint** ppFreeUints;
};

#ifdef USE_CUDA
class CnMemPool : public DeviceMemPool {
public:
Expand Down
118 changes: 116 additions & 2 deletions src/core/memory/memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,123 @@
#include "singa/utils/logging.h"
#include "singa/proto/core.pb.h"
#include <iostream>
/*
int get_pos(size_t size) {
int result = 0;
while(size > 1) {
result++;
size = size/2;
}
return result;
}
*/
namespace singa {

// Creates an empty pool: zero bytes held, and one empty allocated list and
// one empty free list per size class.
CppMemPool::CppMemPool() {
  // Number of list heads in each table; the rest of the pool indexes these
  // tables with values in [0, 64).
  const int kNumSizeClasses = 64;

  memPoolSize = 0;
  freeSize = 0;
  ppAllocUints = static_cast<struct _Uint**>(
      malloc(kNumSizeClasses * sizeof(struct _Uint*)));
  ppFreeUints = static_cast<struct _Uint**>(
      malloc(kNumSizeClasses * sizeof(struct _Uint*)));
  // malloc may fail; stop here instead of writing through a null table below.
  CHECK(ppAllocUints != NULL && ppFreeUints != NULL)
      << "CppMemPool failed to allocate its list tables";

  for (int i = 0; i < kNumSizeClasses; i++) {
    ppAllocUints[i] = NULL;
    ppFreeUints[i] = NULL;
  }
}


// Returns the index p of the smallest power-of-two size class able to hold
// `size` bytes, i.e. the smallest p with 2^p >= size. `size` must be > 0
// (__builtin_clzll is undefined for 0).
// Malloc and Free must both use this helper: every unit stored in list p then
// has exactly 2^p payload bytes, so any unit taken from free list p is large
// enough for any request mapped to p.
static int SizeClassOf(const size_t size) {
  int pos = 63 - __builtin_clzll(static_cast<unsigned long long>(size));
  if ((size & (size - 1)) != 0) {
    pos++;  // not a power of two: round up to the next class
  }
  return pos;
}

// Allocates a Block able to hold `size` bytes (size must be > 0).
//
// Each memory unit is laid out as [struct _Uint header][payload]. The header
// is the pool's own metadata, so the pool requests
// payload capacity + sizeof(struct _Uint) bytes from malloc, while the Block
// handed to the caller points at the payload and reports the requested `size`
// (this answers the review question "size of the allocated block +
// sizeof(struct _Uint) == size?").
//
// A unit from the free list of the matching size class is reused when one is
// available; otherwise a new unit is obtained with malloc. The unit is then
// pushed to the front of the allocated list of its size class.
Block* CppMemPool::Malloc(const size_t size) {
  CHECK(size > 0);
  const int pos = SizeClassOf(size);
  // Requests above 2^63 bytes would need class 64, which the 64-entry list
  // tables cannot index.
  CHECK(pos < 64) << "requested size is too large for CppMemPool";

  struct _Uint*& pAllocUint = ppAllocUints[pos];
  struct _Uint*& pFreeUint = ppFreeUints[pos];
  // bytes taken from malloc for one unit of this class: payload + header
  const size_t blkSize = (static_cast<size_t>(1) << pos) + sizeof(struct _Uint);

  struct _Uint* pCurUint = NULL;
  Block* pAllocBlk = NULL;

  if (pFreeUint == NULL) {  // no free unit of this class: grow the pool
    pCurUint = static_cast<struct _Uint*>(malloc(blkSize));
    CHECK(pCurUint != NULL) << "CppMemPool failed to allocate " << blkSize
                            << " bytes";
    memPoolSize += blkSize;
    pAllocBlk = new Block(
        reinterpret_cast<char*>(pCurUint) + sizeof(struct _Uint), size);
    pCurUint->pBlk = pAllocBlk;
  } else {  // reuse the unit at the head of the free list
    pCurUint = pFreeUint;
    pFreeUint = pCurUint->pNext;
    if (pFreeUint != NULL) {
      pFreeUint->pPrev = NULL;
    }
    freeSize -= blkSize;

    pAllocBlk = pCurUint->pBlk;
    // Raised in review: the Block attached to a recycled unit still reports
    // the size of the earlier request, which may differ from `size`. Replace
    // it with a Block describing the current request so size() is accurate.
    // NOTE(review): this assumes the pool is the sole owner of the Block
    // objects it creates - confirm against Block's definition.
    if (pAllocBlk->size() != size) {
      delete pAllocBlk;
      pAllocBlk = new Block(
          reinterpret_cast<char*>(pCurUint) + sizeof(struct _Uint), size);
      pCurUint->pBlk = pAllocBlk;
    }
  }

  // push the unit to the front of the allocated list
  pCurUint->pPrev = NULL;
  pCurUint->pNext = pAllocUint;
  if (pAllocUint != NULL) {
    pAllocUint->pPrev = pCurUint;
  }
  pAllocUint = pCurUint;

  return pAllocBlk;
}

// Returns a Block obtained from Malloc to the pool: the unit is unlinked from
// the allocated list of its size class and pushed to the front of the matching
// free list. The memory is kept for reuse, not handed back to the system.
// A null `ptr` is ignored.
void CppMemPool::Free(Block* ptr) {
  if (ptr == NULL) {
    return;
  }
  CHECK(ptr->size() > 0);

  // The unit header sits immediately before the payload the Block points to.
  void* pData = ptr->mutable_data();
  struct _Uint* pCurUint = reinterpret_cast<struct _Uint*>(
      static_cast<char*>(pData) - sizeof(struct _Uint));

  const int pos = SizeClassOf(ptr->size());
  CHECK(pos < 64);
  struct _Uint*& pAllocUint = ppAllocUints[pos];
  struct _Uint*& pFreeUint = ppFreeUints[pos];
  const size_t blkSize = (static_cast<size_t>(1) << pos) + sizeof(struct _Uint);

  // unlink the unit from the allocated list
  if (pCurUint == pAllocUint) {
    pAllocUint = pCurUint->pNext;
    if (pAllocUint != NULL) {
      pAllocUint->pPrev = NULL;
    }
  } else {
    struct _Uint* pCurPrevUint = pCurUint->pPrev;
    // A unit that is not the list head must have a predecessor; a null one
    // means the Block is not currently allocated from this pool.
    CHECK(pCurPrevUint != NULL)
        << "CppMemPool::Free called on a block that is not allocated";
    pCurPrevUint->pNext = pCurUint->pNext;
    if (pCurUint->pNext != NULL) {
      pCurUint->pNext->pPrev = pCurPrevUint;
    }
  }
  freeSize += blkSize;

  // push the unit to the front of the free list
  pCurUint->pPrev = NULL;
  pCurUint->pNext = pFreeUint;
  if (pFreeUint != NULL) {
    pFreeUint->pPrev = pCurUint;
  }
  pFreeUint = pCurUint;
}


// Releases everything the pool holds: every memory unit on the allocated and
// free lists of each size class, the Block attached to each unit, and the two
// list tables. Blocks that callers have not returned with Free are released
// as well.
CppMemPool::~CppMemPool() {
  for (int pos = 0; pos < 64; pos++) {
    struct _Uint* heads[2] = {ppAllocUints[pos], ppFreeUints[pos]};
    for (int i = 0; i < 2; i++) {
      struct _Uint* pCurUint = heads[i];
      while (pCurUint != NULL) {
        // read the successor before the unit is released
        struct _Uint* pNextUint = pCurUint->pNext;
        // The Block was created with `new` in Malloc, so it must be released
        // with `delete`; the unit itself came from malloc and goes to free.
        delete pCurUint->pBlk;
        free(pCurUint);
        pCurUint = pNextUint;
      }
    }
  }
  free(ppAllocUints);
  free(ppFreeUints);
}


#ifdef USE_CUDA
namespace singa {
std::atomic<int> CnMemPool::pool_count(0);
std::pair<size_t, size_t> CnMemPool::GetMemUsage() {
size_t free, total;
Expand Down Expand Up @@ -107,5 +221,5 @@ void CudaMemPool::Free(void *ptr) {
cudaError_t status = cudaFree(ptr);
CHECK_EQ(status, cudaError_t::cudaSuccess);
}
}
#endif
}
158 changes: 158 additions & 0 deletions test/singa/test_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,164 @@
#include "singa/singa_config.h"
#include "singa/utils/timer.h"
#include "singa/utils/cuda_utils.h"
#include <stdlib.h>

// Compares the wall time of repeated malloc/free rounds against the same
// number of CppMemPool::Malloc/Free rounds and expects the pool to be faster.
// NOTE(review): the assertion depends on timing, so it may be sensitive to
// machine load and to the allocator in use.
TEST(CppMemPool, Compare) {
  singa::CppMemPool pool;
  const int numOfOuterLoops = 1000;
  const int numOfInnerLoops = 100;
  const size_t allocSize = 1024 * 1024;
  int** pp = new int*[numOfInnerLoops];
  singa::Block** ppBlk = new singa::Block*[numOfInnerLoops];

  // Per-phase CPU time; accumulated but not currently read by any assertion.
  double alloc_time = 0;
  double free_time = 0;
  clock_t start, end;  // clock() returns clock_t, not time_t

  // Phase 1: plain malloc/free.
  singa::Timer t;
  for (int i = 0; i < numOfOuterLoops; i++) {
    start = clock();
    for (int j = 0; j < numOfInnerLoops; j++) {
      pp[j] = static_cast<int*>(malloc(allocSize));
    }
    end = clock();
    alloc_time += end - start;

    start = clock();
    for (int j = 0; j < numOfInnerLoops; j++) {
      free(pp[j]);
    }
    end = clock();
    free_time += end - start;
  }
  int kernel_time = t.Elapsed<singa::Timer::Milliseconds>();

  // Phase 2: the memory pool.
  t.Tick();
  alloc_time = free_time = 0;
  for (int i = 0; i < numOfOuterLoops; i++) {
    start = clock();
    for (int j = 0; j < numOfInnerLoops; j++) {
      ppBlk[j] = pool.Malloc(allocSize);
    }
    end = clock();
    alloc_time += end - start;

    start = clock();
    for (int j = 0; j < numOfInnerLoops; j++) {
      pool.Free(ppBlk[j]);
    }
    end = clock();
    free_time += end - start;
  }
  int mempool_time = t.Elapsed<singa::Timer::Milliseconds>();

  EXPECT_GT(kernel_time, mempool_time);

  // Both arrays were created with new[], so they must be released with
  // delete[].
  delete[] pp;
  delete[] ppBlk;
}

// This test allocates a number of memory blocks from the memory pool,
// alternating between two sizes, and checks that the payload of every block
// can be written and read back.
TEST(CppMemPool, Malloc) {
  singa::CppMemPool pool;
  const int numOfTests = 1024;
  const size_t dataSizeSmall = 1000;
  const size_t dataSizeLarge = 2000;
  singa::Block** pptr = new singa::Block*[numOfTests];

  for (int i = 0; i < numOfTests; i++) {
    const size_t dataSize = (i % 2) ? dataSizeSmall : dataSizeLarge;
    // number of ints written to the block
    const int count = static_cast<int>(dataSize / 4);

    pptr[i] = pool.Malloc(dataSize);
    int* data = static_cast<int*>(pptr[i]->mutable_data());
    for (int idx = 0; idx < count; idx++) {
      data[idx] = i;
    }

    data = static_cast<int*>(pptr[i]->mutable_data());
    int sum = 0;
    for (int idx = 0; idx < count; idx++) {
      sum += data[idx];
    }
    // Use the gtest assertion so that a mismatch is reported as a test
    // failure instead of aborting the whole test binary; both operands are
    // int to avoid a signed/unsigned comparison.
    EXPECT_EQ(sum, i * count);
  }

  for (int i = 0; i < numOfTests; i++) {
    pool.Free(pptr[i]);
  }
  delete[] pptr;
}


// We allocate 1024 memory blocks and fill each block i with the value i.
// Subsequently, we free a randomly chosen half of the blocks, check that the
// payload of every block (freed or not) is still intact at the time it is
// inspected, and finally allocate the freed half again from the pool.
TEST(CppMemPool, RandomFree) {
  singa::CppMemPool pool;
  const int numOfTests = 1024;
  const size_t dataSizeSmall = 1000;
  const size_t dataSizeLarge = 2000;
  singa::Block** pptr = new singa::Block*[numOfTests];

  // requested size in bytes of block i
  auto sizeOf = [&](int i) -> size_t {
    return (i % 2) ? dataSizeSmall : dataSizeLarge;
  };
  // number of ints stored in block i
  auto countOf = [&](int i) -> int { return static_cast<int>(sizeOf(i) / 4); };
  // sum of the ints currently stored in block i (reads only, never writes)
  auto sumOf = [&](int i) -> int {
    const int* data = static_cast<const int*>(pptr[i]->mutable_data());
    int sum = 0;
    for (int idx = 0; idx < countOf(i); idx++) {
      sum += data[idx];
    }
    return sum;
  };

  for (int i = 0; i < numOfTests; i++) {
    pptr[i] = pool.Malloc(sizeOf(i));
    int* data = static_cast<int*>(pptr[i]->mutable_data());
    for (int idx = 0; idx < countOf(i); idx++) {
      data[idx] = i;
    }
    EXPECT_EQ(sumOf(i), i * countOf(i));
  }

  // randomized free pointers
  int* randomPool = new int[numOfTests];
  for (int i = 0; i < numOfTests; i++) {
    randomPool[i] = i;
  }
  int iter = 0;
  while (iter != numOfTests / 2) {  // randomly free half of the memory blocks
    // Pick among the not-yet-freed entries and move the pick to the tail, so
    // the freed indices end up in the second half of randomPool.
    int pos = std::rand() % (numOfTests - iter);
    int i = randomPool[pos];
    std::swap(randomPool[pos], randomPool[numOfTests - 1 - iter]);

    // Check the value before deletion. The payload is deliberately not
    // rewritten first, so corruption caused by earlier frees is detected.
    EXPECT_EQ(sumOf(i), i * countOf(i));

    pool.Free(pptr[i]);
    iter++;
  }

  // The blocks that were not freed must still hold the values written at
  // allocation time.
  for (int pos = 0; pos < numOfTests / 2; pos++) {
    int i = randomPool[pos];
    EXPECT_EQ(sumOf(i), i * countOf(i));
  }

  // allocate the freed half again; the pool releases these blocks when it is
  // destroyed
  for (int pos = numOfTests / 2; pos < numOfTests; pos++) {
    int i = randomPool[pos];
    pptr[i] = pool.Malloc(sizeOf(i));
  }

  delete[] randomPool;
  delete[] pptr;
}

#ifdef USE_CUDA
/*
Expand Down