diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 2c6d1d8668..66e8f5a839 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -36,7 +36,10 @@
 
 
 #ifdef USE_OPENCL
-#include "singa/utils/opencl_utils.h"
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#include <CL/cl2.hpp>
+#include <unordered_map>
 #endif // USE_OPENCL
 
 using std::atomic;
@@ -62,6 +65,10 @@ class Block {
   // Disabled as it is not used currently.
   // Block(void* ptr, size_t size, size_t offset, std::shared_ptr<atomic<int>>
   // ref) : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {}
+
+  // TODO(wangwei) check if the set is correct and add lock if shared structure
+  // is allowed
+  void set_size(size_t size) { size_ = size; }
   void* mutable_data() {
     initialized_ = true;
     return static_cast<char*>(data_) + offset_;
@@ -107,8 +114,9 @@ typedef struct _Context {
 #endif // USE_CUDA
 
 #ifdef USE_OPENCL
-  // This stores the context ID of the OpenCL context controlled by ViennaCL.
-  long vcl_ctx_id;
+  std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
+  cl::CommandQueue ocl_cmdq;
+  cl::Context ocl_ctx;
 #endif
 
 } Context;
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index f664f95ced..d0a8afa807 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -23,6 +23,7 @@
 #include <atomic>
 #include "singa/proto/core.pb.h"
 #include "singa/singa_config.h"
+#include "singa/core/common.h"
 
 #ifdef USE_CUDA
 #include "cnmem.h"
@@ -50,6 +51,64 @@ class DeviceMemPool {
   // size_t init_size_ = 0, max_size_ = 0;
 };
 
+// A host memory pool that recycles blocks through power-of-two size classes.
+//
+// Every request is rounded up to the next power of two. Freed units are kept
+// on the free list of their size class and are handed out again by later
+// requests of that class; the memory goes back to the system only when the
+// pool is destroyed. The pool does no locking of its own.
+class CppMemPool {
+ public:
+  CppMemPool();
+  ~CppMemPool();
+
+  // The pool owns raw malloc'ed memory, so a copy would free it twice.
+  CppMemPool(const CppMemPool&) = delete;
+  CppMemPool& operator=(const CppMemPool&) = delete;
+
+  // Returns a block with room for at least `size` bytes; `size` must be > 0.
+  // The block stays owned by the pool: give it back with Free(), never delete
+  // it.
+  Block* Malloc(const size_t size);
+
+  // Moves a block obtained from Malloc() of this pool to the free list of its
+  // size class and resets its size to 0.
+  void Free(Block* ptr);
+
+  // get the free and total size of the memory pool (in terms of bytes); both
+  // numbers include the bookkeeping header stored in front of every block
+  std::pair<size_t, size_t> GetMemUsage() {
+    return std::make_pair(freeSize, memPoolSize);
+  }
+
+ private:
+  // number of size classes: one for each power of two a 64-bit size can take
+  enum { kNumBuckets = 64 };
+
+  // each structure defines a memory unit in the memory pool; it is stored
+  // right in front of the payload and links the units of one size class into
+  // a doubly linked list
+  struct _Uint {
+    struct _Uint *pPrev, *pNext;
+    Block* pBlk;
+  };
+
+  // index of the size class that serves a request of `size` bytes
+  static int BucketIndex(size_t size);
+
+  // total size held by the memory pool (in terms of bytes)
+  size_t memPoolSize;
+  // total free size held by the memory pool (in terms of bytes)
+  size_t freeSize;
+
+  // ppAllocUints[i] is the head of the list of allocated memory units whose
+  // payload capacity is 2^i bytes
+  struct _Uint **ppAllocUints;
+  // ppFreeUints[i] is the head of the list of free memory units whose payload
+  // capacity is 2^i bytes
+  struct _Uint **ppFreeUints;
+};
+
 #ifdef USE_CUDA
 class CnMemPool : public DeviceMemPool {
  public:
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index cb33a48cdd..50feed4da4 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -20,9 +20,126 @@
 #include "singa/utils/logging.h"
 #include "singa/proto/core.pb.h"
 #include <iostream>
+#include <cstdlib>
+
+namespace singa {
+
+// Maps a request of `size` bytes (size > 0) to its size class, i.e. to the
+// smallest pos with 2^pos >= size. Rounding up keeps every unit of a bucket
+// at the same capacity, so a recycled unit always fits the request it is
+// handed to (rounding down would mix e.g. 512- and 1024-byte units).
+int CppMemPool::BucketIndex(size_t size) {
+  int pos = 63 - __builtin_clzll(size);  // floor(log2(size))
+  if ((size & (size - 1)) != 0) {
+    pos++;  // not a power of two: use the next larger size class
+  }
+  return pos;
+}
+
+CppMemPool::CppMemPool() : memPoolSize(0), freeSize(0) {
+  ppAllocUints = static_cast<_Uint**>(malloc(kNumBuckets * sizeof(_Uint*)));
+  ppFreeUints = static_cast<_Uint**>(malloc(kNumBuckets * sizeof(_Uint*)));
+  CHECK(ppAllocUints != NULL && ppFreeUints != NULL);
+  for (int i = 0; i < kNumBuckets; i++) {
+    ppAllocUints[i] = NULL;
+    ppFreeUints[i] = NULL;
+  }
+}
+
+Block* CppMemPool::Malloc(const size_t size) {
+  CHECK(size > 0);
+  const int pos = BucketIndex(size);
+  CHECK(pos < kNumBuckets);
+  // payload capacity of the size class plus the header in front of it
+  const size_t blkSize = (static_cast<size_t>(1) << pos) + sizeof(_Uint);
+
+  _Uint*& pAllocUint = ppAllocUints[pos];
+  _Uint*& pFreeUint = ppFreeUints[pos];
+  _Uint* pCurUint = NULL;
+  if (pFreeUint == NULL) {  // no free unit of this class: get a new one
+    pCurUint = static_cast<_Uint*>(malloc(blkSize));
+    CHECK(pCurUint != NULL);
+    memPoolSize += blkSize;
+    pCurUint->pBlk =
+        new Block(reinterpret_cast<char*>(pCurUint) + sizeof(_Uint), size);
+  } else {  // recycle the head of the free list
+    freeSize -= blkSize;
+    pCurUint = pFreeUint;
+    pFreeUint = pCurUint->pNext;
+    if (pFreeUint != NULL) {
+      pFreeUint->pPrev = NULL;
+    }
+    pCurUint->pBlk->set_size(size);
+  }
+
+  // push the unit onto the front of the allocated list
+  pCurUint->pPrev = NULL;
+  pCurUint->pNext = pAllocUint;
+  if (pAllocUint != NULL) {
+    pAllocUint->pPrev = pCurUint;
+  }
+  pAllocUint = pCurUint;
+  return pCurUint->pBlk;
+}
+
+void CppMemPool::Free(Block* ptr) {
+  CHECK(ptr != NULL);
+  // Free() leaves the size at 0, so this also catches a double free
+  CHECK(ptr->size() > 0);
+  const int pos = BucketIndex(ptr->size());
+  CHECK(pos < kNumBuckets);
+  const size_t blkSize = (static_cast<size_t>(1) << pos) + sizeof(_Uint);
+
+  // the unit header sits right in front of the payload
+  char* pData = static_cast<char*>(ptr->mutable_data());
+  _Uint* pCurUint = reinterpret_cast<_Uint*>(pData - sizeof(_Uint));
+  _Uint*& pAllocUint = ppAllocUints[pos];
+  _Uint*& pFreeUint = ppFreeUints[pos];
+  freeSize += blkSize;
+
+  // unlink the unit from the allocated list
+  if (pCurUint == pAllocUint) {
+    pAllocUint = pCurUint->pNext;
+    if (pAllocUint != NULL) {
+      pAllocUint->pPrev = NULL;
+    }
+  } else {
+    _Uint* pCurPrevUint = pCurUint->pPrev;
+    pCurPrevUint->pNext = pCurUint->pNext;
+    if (pCurUint->pNext != NULL) {
+      pCurUint->pNext->pPrev = pCurPrevUint;
+    }
+  }
+
+  // push it onto the front of the free list
+  pCurUint->pPrev = NULL;
+  pCurUint->pNext = pFreeUint;
+  if (pFreeUint != NULL) {
+    pFreeUint->pPrev = pCurUint;
+  }
+  pFreeUint = pCurUint;
+  ptr->set_size(0);
+}
+
+CppMemPool::~CppMemPool() {
+  // traverse all lists to release the units and their Block descriptors
+  for (int pos = 0; pos < kNumBuckets; pos++) {
+    for (int i = 0; i < 2; i++) {
+      _Uint* pCurUint = (i == 0) ? ppAllocUints[pos] : ppFreeUints[pos];
+      while (pCurUint != NULL) {
+        _Uint* pNextUint = pCurUint->pNext;
+        delete pCurUint->pBlk;  // created with new, so not free()
+        free(pCurUint);
+        pCurUint = pNextUint;
+      }
+    }
+  }
+  free(ppAllocUints);
+  free(ppFreeUints);
+}
+
 
 #ifdef USE_CUDA
-namespace singa {
 std::atomic<int> CnMemPool::pool_count(0);
 std::pair<size_t, size_t> CnMemPool::GetMemUsage() {
   size_t free, total;
@@ -107,5 +224,5 @@ void CudaMemPool::Free(void *ptr) {
   cudaError_t status = cudaFree(ptr);
   CHECK_EQ(status, cudaError_t::cudaSuccess);
 }
-}
 #endif
+}  // namespace singa
diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc
index 33a374724e..1ba24e2aa8 100644
--- a/test/singa/test_memory.cc
+++ b/test/singa/test_memory.cc
@@ -25,6 +25,188 @@
 #include "singa/singa_config.h"
 #include "singa/utils/timer.h"
 #include "singa/utils/cuda_utils.h"
+#include <cstdlib>
+#include <ctime>
+#include <utility>
+
+// compares the speed of plain malloc/free with that of the memory pool
+TEST(CppMemPool, Compare) {
+  singa::CppMemPool pool;
+  const int numOfOuterLoops = 1000;
+  const int numOfInnerLoops = 100;
+  const size_t allocSize = 1024 * 1024;
+  int** pp = new int*[numOfInnerLoops];
+  singa::Block** ppBlk = new singa::Block*[numOfInnerLoops];
+
+  double alloc_time = 0;
+  double free_time = 0;
+  clock_t start, end;  // clock() returns clock_t, not time_t
+  singa::Timer t;
+  for (int i = 0; i < numOfOuterLoops; i++) {
+    start = clock();
+    for (int j = 0; j < numOfInnerLoops; j++) {
+      pp[j] = (int*)malloc(allocSize);
+    }
+    end = clock();
+    alloc_time += end - start;
+    start = clock();
+    for (int j = 0; j < numOfInnerLoops; j++) {
+      free(pp[j]);
+    }
+    end = clock();
+    free_time += end - start;
+  }
+  int kernel_time = t.Elapsed();
+
+  t.Tick();
+  alloc_time = free_time = 0;
+  for (int i = 0; i < numOfOuterLoops; i++) {
+    start = clock();
+    for (int j = 0; j < numOfInnerLoops; j++) {
+      ppBlk[j] = pool.Malloc(allocSize);
+    }
+    end = clock();
+    alloc_time += end - start;
+    start = clock();
+    for (int j = 0; j < numOfInnerLoops; j++) {
+      pool.Free(ppBlk[j]);
+    }
+    end = clock();
+    free_time += end - start;
+  }
+  int mempool_time = t.Elapsed();
+  EXPECT_GT(kernel_time, mempool_time);
+  delete[] pp;  // arrays created with new[] must be released with delete[]
+  delete[] ppBlk;
+}
+
+// a freed unit may only be recycled for a request of the same size class
+TEST(CppMemPool, SizeClass) {
+  singa::CppMemPool pool;
+  singa::Block* blk512 = pool.Malloc(512);
+  pool.Free(blk512);
+  const std::pair<size_t, size_t> before = pool.GetMemUsage();
+
+  // 1000 bytes need a 1024-byte unit, so the free 512-byte unit stays free
+  singa::Block* blk1000 = pool.Malloc(1000);
+  const std::pair<size_t, size_t> after = pool.GetMemUsage();
+  EXPECT_EQ(before.first, after.first);
+  EXPECT_GT(after.second, before.second);
+
+  // a request of the matching class gets the freed unit back
+  EXPECT_EQ(blk512, pool.Malloc(512));
+  EXPECT_EQ(0u, pool.GetMemUsage().first);
+  pool.Free(blk1000);
+}
+
+// this test allocates a number of memory blocks in the memory pool
+TEST(CppMemPool, Malloc) {
+  singa::CppMemPool pool;
+  const int numOfTests = 1024;
+  const size_t dataSizeSmall = 1000;
+  const size_t dataSizeLarge = 2000;
+  singa::Block** pptr = new singa::Block*[numOfTests];
+
+  for (int i = 0; i < numOfTests; i++) {
+    const size_t dataSize = (i % 2) ? dataSizeSmall : dataSizeLarge;
+    pptr[i] = pool.Malloc(dataSize);
+    int* data = static_cast<int*>(pptr[i]->mutable_data());
+    for (int idx = 0; idx < (int)dataSize / 4; idx++) {
+      data[idx] = i;
+    }
+    data = static_cast<int*>(pptr[i]->mutable_data());
+    int sum = 0;
+    for (int idx = 0; idx < (int)dataSize / 4; idx++) {
+      sum += data[idx];
+    }
+    CHECK_EQ(sum, i * dataSize / 4);
+  }
+  for (int i = 0; i < numOfTests; i++) {
+    pool.Free(pptr[i]);
+  }
+  delete[] pptr;
+}
+
+
+// we allocate 1024 memory blocks
+// subsequently, we randomly free 512 blocks and after that allocate them back
+// to the pool
+TEST(CppMemPool, RandomFree) {
+  singa::CppMemPool pool;
+  const int numOfTests = 1024;
+  const size_t dataSizeSmall = 1000;
+  const size_t dataSizeLarge = 2000;
+  singa::Block** pptr = new singa::Block*[numOfTests];
+
+  for (int i = 0; i < numOfTests; i++) {
+    const size_t dataSize = (i % 2) ? dataSizeSmall : dataSizeLarge;
+    pptr[i] = pool.Malloc(dataSize);
+    int* data = static_cast<int*>(pptr[i]->mutable_data());
+    for (int idx = 0; idx < (int)dataSize / 4; idx++) {
+      data[idx] = i;
+    }
+    data = static_cast<int*>(pptr[i]->mutable_data());
+    int sum = 0;
+    for (int idx = 0; idx < (int)dataSize / 4; idx++) {
+      sum += data[idx];
+    }
+    CHECK_EQ(sum, i * dataSize / 4);
+  }
+
+  // randomized free pointers
+  int* randomPool = new int[numOfTests];
+  for (int i = 0; i < numOfTests; i++) {
+    randomPool[i] = i;
+  }
+  int iter = 0;
+  while (iter != numOfTests / 2) {  // random free half of the memory blocks
+    int pos = std::rand() % (numOfTests - iter);
+    int i = randomPool[pos];
+    std::swap(randomPool[pos], randomPool[numOfTests - 1 - iter]);
+
+    // check value before deletion
+    const size_t dataSize = (i % 2) ? dataSizeSmall : dataSizeLarge;
+    int* data = static_cast<int*>(pptr[i]->mutable_data());
+    for (int idx = 0; idx < (int)dataSize / 4; idx++) {
+      data[idx] = i;
+    }
+    data = static_cast<int*>(pptr[i]->mutable_data());
+    int sum = 0;
+    for (int idx = 0; idx < (int)dataSize / 4; idx++) {
+      sum += data[idx];
+    }
+    CHECK_EQ(sum, i * dataSize / 4);
+
+    pool.Free(pptr[i]);
+    iter++;
+  }
+
+  // test the unfreed memory block value
+  for (int pos = 0; pos < numOfTests / 2; pos++) {
+    int i = randomPool[pos];
+    const size_t dataSize = (i % 2) ? dataSizeSmall : dataSizeLarge;
+    int* data = static_cast<int*>(pptr[i]->mutable_data());
+    for (int idx = 0; idx < (int)dataSize / 4; idx++) {
+      data[idx] = i;
+    }
+    data = static_cast<int*>(pptr[i]->mutable_data());
+    int sum = 0;
+    for (int idx = 0; idx < (int)dataSize / 4; idx++) {
+      sum += data[idx];
+    }
+    CHECK_EQ(sum, i * dataSize / 4);
+  }
+
+  // allocate the freed blocks back from the pool
+  for (int pos = numOfTests / 2; pos < numOfTests; pos++) {
+    int i = randomPool[pos];
+    const size_t dataSize = (i % 2) ? dataSizeSmall : dataSizeLarge;
+    pptr[i] = pool.Malloc(dataSize);
+  }
+
+  delete[] randomPool;
+  delete[] pptr;
+}
 
 #ifdef USE_CUDA
 /*