diff --git a/include/singa/core/common.h b/include/singa/core/common.h
index 2c6d1d8668..66e8f5a839 100644
--- a/include/singa/core/common.h
+++ b/include/singa/core/common.h
@@ -36,7 +36,10 @@
 
 
 #ifdef USE_OPENCL
-#include "singa/utils/opencl_utils.h"
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#include <CL/cl2.hpp>
+#include <unordered_map>
 #endif  // USE_OPENCL
 
 using std::atomic;
@@ -62,6 +65,9 @@ class Block {
   // Disabled as it is not used currently.
   // Block(void* ptr, size_t size, size_t offset, std::shared_ptr<atomic<int>>
   //  ref) : data_(ptr), size_(size), offset_(offset), ref_count_(ref) {}
+
+	// Overwrites size_ only; data_ is not touched. TODO(wangwei) check if the set is correct and add a lock if shared structure is allowed
+	void set_size(size_t size) { size_=size; }
   void* mutable_data() {
     initialized_ = true;
     return static_cast<char*>(data_) + offset_;
@@ -107,8 +113,9 @@ typedef struct _Context {
 #endif // USE_CUDA
 
 #ifdef USE_OPENCL
-  // This stores the context ID of the OpenCL context controlled by ViennaCL.
-  long vcl_ctx_id;
+  std::shared_ptr<std::unordered_map<std::string, cl::Kernel>> kernels;
+  cl::CommandQueue ocl_cmdq;
+  cl::Context ocl_ctx;
 #endif
 
 } Context;
diff --git a/include/singa/core/memory.h b/include/singa/core/memory.h
index f664f95ced..d0a8afa807 100644
--- a/include/singa/core/memory.h
+++ b/include/singa/core/memory.h
@@ -23,6 +23,7 @@
 #include <atomic>
 #include "singa/proto/core.pb.h"
 #include "singa/singa_config.h"
+#include "singa/core/common.h"
 
 #ifdef USE_CUDA
 #include "cnmem.h"
@@ -50,6 +51,37 @@ class DeviceMemPool {
 //  size_t init_size_ = 0, max_size_ = 0;
 };
 
+class CppMemPool {
+	public:
+		CppMemPool();
+		~CppMemPool();
+		// the pool owns raw memory that is released in the destructor, so it must not be copied
+		CppMemPool(const CppMemPool&) = delete;
+		CppMemPool& operator=(const CppMemPool&) = delete;
+
+		// Malloc returns a block (owned by the pool) of `size` > 0 bytes; Free gives it back for reuse
+		Block* Malloc(const size_t size);
+		void Free(Block* ptr);
+
+		// get the free and total size of the memory pool (in terms of bytes)
+		std::pair<size_t, size_t> GetMemUsage() const { return std::make_pair(freeSize, memPoolSize); }
+
+	private:
+		// each structure defines a memory unit in the memory pool; it is a node of a doubly linked list
+		struct _Uint {
+			struct _Uint *pPrev, *pNext;
+			Block* pBlk;
+		};
+		// total size held by the memory pool (in terms of bytes)
+		size_t memPoolSize;
+		// total free size by the memory pool (in terms of bytes)
+		size_t freeSize;
+		// each pointer in this array keeps a head of the allocated memory units of different size (power of 2)
+		struct _Uint **ppAllocUints;
+		// each pointer in this array keeps a head of the free memory units of different size (power of 2)
+		struct _Uint **ppFreeUints;
+};
+
 #ifdef USE_CUDA
 class CnMemPool : public DeviceMemPool {
  public:
diff --git a/src/core/memory/memory.cc b/src/core/memory/memory.cc
index cb33a48cdd..50feed4da4 100644
--- a/src/core/memory/memory.cc
+++ b/src/core/memory/memory.cc
@@ -20,9 +20,125 @@
 #include "singa/utils/logging.h"
 #include "singa/proto/core.pb.h"
 #include <iostream>
+/*
+int get_pos(size_t size) {
+	int result = 0;
+	while(size > 1) {
+		result++;
+		size = size/2;
+	}
+	return result;
+}
+*/
+namespace singa {
+
+CppMemPool::CppMemPool() {
+	// 64 list heads per array: one allocated list and one free list for every size class
+	const size_t numBytes = 64*sizeof(struct _Uint*);
+	ppAllocUints = (struct _Uint**)malloc(numBytes);
+	ppFreeUints = (struct _Uint**)malloc(numBytes);
+	for(int pos = 63; pos >= 0; pos--) {
+		ppAllocUints[pos] = ppFreeUints[pos] = NULL;
+	}
+	memPoolSize = freeSize = 0;
+}
+
+
+// Size class of a request: the exponent k of the smallest power of two with 2^k >= size (size > 0).
+static int GetSizeClass(const size_t size) {
+	const int pos = 63 - __builtin_clzll(size);
+	return ((size_t(1) << pos) == size) ? pos : pos + 1;
+}
+
+// Returns a pool-owned Block of `size` bytes; its memory unit has the capacity 2^class, so that
+// every free unit of a class fits every request of that class. Free units are reused first.
+Block* CppMemPool::Malloc(const size_t size) {
+	CHECK(size > 0);
+	const int pos = GetSizeClass(size);
+	CHECK(pos < 64);
+	struct _Uint*& pAllocUint = ppAllocUints[pos];
+	struct _Uint*& pFreeUint = ppFreeUints[pos];
+	struct _Uint* pCurUint = NULL;
+	// bytes taken by one unit of this class: the list header followed by the data area
+	const size_t blkSize = (size_t(1) << pos) + sizeof(struct _Uint);
+	if(pFreeUint == NULL) { // no free unit of this class: get a new one from the system
+		pCurUint = (struct _Uint*)malloc(blkSize);
+		CHECK(pCurUint != NULL);
+		memPoolSize += blkSize;
+		pCurUint->pBlk = new Block((char*)(pCurUint) + sizeof(struct _Uint), size);
+	} else { // detach the head of the free list and reuse it
+		freeSize -= blkSize;
+		pCurUint = pFreeUint;
+		pFreeUint = pCurUint->pNext;
+		if(pFreeUint != NULL) {
+			pFreeUint->pPrev = NULL;
+		}
+		pCurUint->pBlk->set_size(size);
+	}
+	// push the unit to the front of the allocated list of its class
+	pCurUint->pPrev = NULL;
+	pCurUint->pNext = pAllocUint;
+	if(pAllocUint != NULL) {
+		pAllocUint->pPrev = pCurUint;
+	}
+	pAllocUint = pCurUint;
+	return pCurUint->pBlk;
+}
+
+// Moves the memory unit of `ptr` from the allocated list to the free list of its size class.
+// `ptr` must be a Block returned by Malloc() of this pool that has not been freed yet.
+void CppMemPool::Free(Block* ptr) {
+	CHECK(ptr != NULL);
+	CHECK(ptr->size() > 0); // the size of a freed block is 0
+	// the list header is stored right before the data area (see Malloc)
+	void* pData = ptr->mutable_data();
+	struct _Uint *pCurUint = (struct _Uint*)((char*)pData-sizeof(struct _Uint));
+	const int pos = GetSizeClass(ptr->size());
+	struct _Uint*& pAllocUint = ppAllocUints[pos];
+	struct _Uint*& pFreeUint = ppFreeUints[pos];
+	freeSize += (size_t(1) << pos) + sizeof(struct _Uint);
+	if(pCurUint == pAllocUint) { // unlink the head of the allocated list
+		pAllocUint = pCurUint->pNext;
+		if(pAllocUint != NULL) {
+			pAllocUint->pPrev = NULL;
+		}
+	} else { // unlink from the middle or the tail of the allocated list
+		struct _Uint *pCurPrevUint = pCurUint->pPrev;
+		pCurUint->pPrev = NULL;
+		pCurPrevUint->pNext = pCurUint->pNext;
+		if(pCurUint->pNext != NULL) {
+			pCurUint->pNext->pPrev = pCurPrevUint;
+		}
+	}
+	// push the unit to the front of the free list
+	pCurUint->pNext = pFreeUint;
+	if(pFreeUint != NULL) {
+		pFreeUint->pPrev = pCurUint;
+	}
+	pFreeUint = pCurUint;
+	ptr->set_size(0);
+}
+
+
+// Releases every memory unit (allocated or free) together with its Block, then the list heads.
+CppMemPool::~CppMemPool() {
+	for(int pos = 0; pos < 64; pos++) {
+		for(int i = 0; i < 2; i++) {
+			struct _Uint *pCurUint = i == 0 ? ppAllocUints[pos] : ppFreeUints[pos];
+			while(pCurUint != NULL) {
+				struct _Uint *pNextUint = pCurUint->pNext;
+				delete pCurUint->pBlk; // created with new in Malloc, so free() must not be used
+				free(pCurUint);
+				pCurUint = pNextUint;
+			}
+		}
+	}
+	free(ppAllocUints);
+	free(ppFreeUints);
+}
+
 
 #ifdef USE_CUDA
-namespace singa {
 std::atomic<int> CnMemPool::pool_count(0);
 std::pair<size_t, size_t> CnMemPool::GetMemUsage() {
   size_t free, total;
@@ -107,5 +223,5 @@ void CudaMemPool::Free(void *ptr) {
   cudaError_t status = cudaFree(ptr);
   CHECK_EQ(status, cudaError_t::cudaSuccess);
 }
-}
 #endif
+}
diff --git a/test/singa/test_memory.cc b/test/singa/test_memory.cc
index 33a374724e..1ba24e2aa8 100644
--- a/test/singa/test_memory.cc
+++ b/test/singa/test_memory.cc
@@ -25,6 +25,164 @@
 #include "singa/singa_config.h"
 #include "singa/utils/timer.h"
 #include "singa/utils/cuda_utils.h"
+#include <stdlib.h>
+
+// expects 1000 rounds of 100 allocations and frees of 1MB to take less time with the pool than with malloc/free
+TEST(CppMemPool, Compare) {
+	singa::CppMemPool pool;
+	const int numOfOuterLoops =1000;
+	const int numOfInnerLoops = 100;
+	const size_t allocSize = 1024*1024;
+	int** pp = new int*[numOfInnerLoops];
+	singa::Block** ppBlk = new singa::Block*[numOfInnerLoops];
+
+	double alloc_time = 0, free_time = 0;
+	clock_t start,end; // clock() returns clock_t, not time_t
+	singa::Timer t;
+	for (int i = 0; i < numOfOuterLoops; i++) {
+		start = clock();
+		for(int j = 0; j < numOfInnerLoops; j++) {
+			pp[j] = (int*)malloc(allocSize);
+		}
+		end = clock();
+		alloc_time += end-start;
+		start = clock();
+		for(int j = 0; j < numOfInnerLoops; j++) {
+			free(pp[j]);
+		}
+		end = clock();
+		free_time += end-start;
+	}
+	int kernel_time = t.Elapsed<singa::Timer::Milliseconds>();
+
+	t.Tick();
+	alloc_time = free_time = 0;
+	for (int i = 0; i < numOfOuterLoops; i++) {
+		start = clock();
+		for(int j = 0; j < numOfInnerLoops; j++) {
+			ppBlk[j] = pool.Malloc(allocSize);
+		}
+		end = clock();
+		alloc_time += end-start;
+		start = clock();
+		for(int j = 0; j < numOfInnerLoops; j++) {
+			pool.Free(ppBlk[j]);
+		}
+		end = clock();
+		free_time += end-start;
+	}
+	int mempool_time = t.Elapsed<singa::Timer::Milliseconds>();
+	EXPECT_GT(kernel_time,mempool_time);
+	delete[] pp; // both arrays come from new[], so they need delete[]
+	delete[] ppBlk;
+}
+
+// allocates 1024 blocks of alternating size (2000 and 1000 bytes) and checks that each one can be written and read
+TEST(CppMemPool, Malloc) {
+	const int kNumBlocks = 1024;
+	const size_t kSmallBytes = 1000;
+	const size_t kLargeBytes = 2000;
+	singa::CppMemPool pool;
+	singa::Block** blocks = new singa::Block*[kNumBlocks];
+	for(int n = 0; n < kNumBlocks; n++) {
+		const size_t numBytes = (n%2 == 0) ? kLargeBytes : kSmallBytes;
+		const int numInts = (int)numBytes/4;
+		blocks[n] = pool.Malloc(numBytes);
+		int* wr = static_cast<int*>(blocks[n]->mutable_data());
+		for(int k = 0; k < numInts; k++) {
+			wr[k] = n;
+		}
+		const int* rd = static_cast<int*>(blocks[n]->mutable_data());
+		int total = 0;
+		for(int k = 0; k < numInts; k++) {
+			total += rd[k];
+		}
+		CHECK_EQ(total,n*numBytes/4);
+	}
+	for(int n = 0; n < kNumBlocks; n++) {
+		pool.Free(blocks[n]);
+	}
+	delete[] blocks;
+}
+
+
+// we allocate 1024 memory blocks and fill block i with the value i
+// subsequently, we randomly free 512 blocks and after that allocate them back to the pool
+// the content of a block is never rewritten right before it is verified, so the checks fail if
+// the pool hands out overlapping memory or corrupts a block that is still allocated
+TEST(CppMemPool, RandomFree) {
+	singa::CppMemPool pool;
+	const int numOfTests = 1024;
+	const size_t dataSizeSmall = 1000;
+	const size_t dataSizeLarge = 2000;
+	singa::Block** pptr = new singa::Block*[numOfTests];
+
+	// fills block i with the value i
+	auto fill = [&](int i) {
+		const size_t dataSize = (i%2) ? dataSizeSmall : dataSizeLarge;
+		int* data = static_cast<int*>(pptr[i]->mutable_data());
+		for(int idx = 0; idx < (int)dataSize/4; idx++) {
+			data[idx] = i;
+		}
+	};
+	// checks that the elements of block i still sum up to the value written by fill
+	auto verify = [&](int i) {
+		const size_t dataSize = (i%2) ? dataSizeSmall : dataSizeLarge;
+		const int* data = static_cast<int*>(pptr[i]->mutable_data());
+		int sum = 0;
+		for(int idx = 0; idx < (int)dataSize/4; idx++) {
+			sum += data[idx];
+		}
+		CHECK_EQ(sum,i*dataSize/4);
+	};
+
+	for(int i = 0; i < numOfTests; i++) {
+		const size_t dataSize = (i%2) ? dataSizeSmall : dataSizeLarge;
+		pptr[i] = pool.Malloc(dataSize);
+		fill(i);
+		verify(i);
+	}
+
+	// randomized free pointers
+	int* randomPool = new int[numOfTests];
+	for(int i = 0; i < numOfTests; i++) {
+		randomPool[i] = i;
+	}
+	int iter = 0;
+	while(iter != numOfTests/2) { // random free half of the memory blocks
+		int pos = std::rand() % (numOfTests-iter);
+		int i = randomPool[pos];
+		std::swap(randomPool[pos],randomPool[numOfTests-1-iter]);
+
+		// check value before deletion
+		verify(i);
+		pool.Free(pptr[i]);
+		iter++;
+	}
+
+	// test the unfreed memory block value
+	for(int pos = 0; pos < numOfTests/2; pos++) {
+		verify(randomPool[pos]);
+	}
+
+	// allocate the freed blocks back and fill them again
+	for(int pos = numOfTests/2; pos < numOfTests; pos++) {
+		int i = randomPool[pos];
+		const size_t dataSize = (i%2) ? dataSizeSmall : dataSizeLarge;
+		pptr[i] = pool.Malloc(dataSize);
+		fill(i);
+	}
+
+	// after the re-allocation every block must hold its own value,
+	// i.e. filling the re-allocated blocks must not have touched the unfreed ones
+	for(int i = 0; i < numOfTests; i++) {
+		verify(i);
+	}
+
+	delete[] randomPool;
+	delete[] pptr;
+}
 
 #ifdef USE_CUDA
 /*