diff --git a/README.md b/README.md
index 8fd40b4a..f3d7bf04 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 ### How to Install:
 
 ```sh
-python3 -m pip install https://github.com/microsoft/antares/releases/download/v0.3.0/antares-0.3.0-py3-none-linux_x86_64.whl
+python3 -m pip install antares
 ```
 
 ### Quick Test:
@@ -20,6 +20,9 @@ BACKEND=c-scpu antares
 # Quickly generate a multi-threaded CPU code:
 BACKEND=c-mcpu antares
 
+# Search for an efficient multi-threaded CPU code:
+STEP=100 BACKEND=c-mcpu antares
+
 # Quickly generate a SHADER code for Windows 10/11's DirectX12:
 BACKEND=c-hlsl_win64 antares
 
diff --git a/antares/antares_compiler.py b/antares/antares_compiler.py
index 20c49c72..839f07f0 100644
--- a/antares/antares_compiler.py
+++ b/antares/antares_compiler.py
@@ -5,7 +5,6 @@
 import random
 import hashlib
 import traceback
-import numpy as np
 import math
 import re
 import json
@@ -324,14 +323,14 @@ def compute_mem_ratio(tpr):
   global_arg_props = get_global_arg_props()
   access_bytes = 0
   for buf in global_arg_props['_in']:
-    access_bytes += np.product(buf['shape']) * get_type_size(buf['dtype'])
+    access_bytes += product(buf['shape']) * get_type_size(buf['dtype'])
   for buf in global_arg_props['_out']:
-    access_bytes += np.product(buf['shape']) * get_type_size(buf['dtype'])
+    access_bytes += product(buf['shape']) * get_type_size(buf['dtype'])
 
   access_bytes = int(access_bytes)
   if access_bytes <= 0:
     return -1
-  ratio = np.ceil(access_bytes * 1e-7 / tpr / device_properties().mem_bandwith)
+  ratio = math.ceil(access_bytes * 1e-7 / tpr / device_properties().mem_bandwith)
   return min(int(ratio), 100)
 
 def run_config_entity(target_source, config_str, dir_sid, expected_timecost='inf', dev_id=0):
diff --git a/antares/common.py b/antares/common.py
index 0d58f7c5..caddfdde 100644
--- a/antares/common.py
+++ b/antares/common.py
@@ -4,7 +4,7 @@
 import os
 import subprocess
 import math
-import numpy as np
+from functools import reduce
 
 class Mock(object):
   pass
@@ -12,6 +12,9 @@ class Mock(object):
 backend = os.environ['BACKEND']
 AntaresGlobal = Mock()
 
+def product(arrlist):  # Multiply all items of arrlist together (numpy-free replacement for np.product).
+  return reduce((lambda x, y: x * y), arrlist, 1)  # Initial value 1: an empty shape (scalar) yields 1 instead of raising TypeError.
+
 def wait_for(func, timeout=None, args=[]):
   if not timeout:
     return func(*args)
diff --git a/backends/c-mcpu/schedule/standard/default.py b/backends/c-mcpu/schedule/standard/default.py
index 40dfef69..e90f984b 100644
--- a/backends/c-mcpu/schedule/standard/default.py
+++ b/backends/c-mcpu/schedule/standard/default.py
@@ -3,7 +3,6 @@
 
 from tvm import te
 import numpy as np
-import psutil
 
 def schedule(attrs):
   cfg, s = attrs.auto_config, attrs.scheduler
diff --git a/docker/Dockerfile.c-base b/docker/Dockerfile.c-base
index da425714..a48c3fb6 100644
--- a/docker/Dockerfile.c-base
+++ b/docker/Dockerfile.c-base
@@ -21,8 +21,8 @@ RUN bash -c 'rm -rf ~/.local/antares/3rdparty/tvm/build/{CMake*,Makefile,cmake_i
 RUN bash -c 'rm -rf ~/.local/antares/3rdparty/tvm/{src,include,golang,tests,3rdparty,device-stub,apps,.??*}'
 RUN echo '' > ~/.local/antares/3rdparty/tvm/python/tvm/relay/__init__.py
 
-ENV ANTARES_VERSION 0.3.0_0
+ENV ANTARES_VERSION 0.3.1
 
 RUN cd ~ && git clone https://github.com/microsoft/antares --single-branch --depth 1 antares_core && mv ~/.local/antares/3rdparty antares_core
 RUN cd ~ && sed -i "s/@VERSION@/${ANTARES_VERSION}/g" /antares/engine/dist-info/METADATA && cp -r /antares/engine/dist-info ~/antares-${ANTARES_VERSION}.dist-info
-RUN cd ~ && rm -rf antares_core/.??* && zip -r /antares-${ANTARES_VERSION}-py3-none-linux_x86_64.whl antares* >/dev/null
+RUN cd ~ && rm -rf antares_core/.??* && zip -r /antares-${ANTARES_VERSION}-py3-none-manylinux1_x86_64.whl antares* >/dev/null
diff --git a/engine/device-stub/tvm_extra.patch b/engine/device-stub/tvm_extra.patch
new file mode 100644
index 00000000..b4c94f62
--- /dev/null
+++ b/engine/device-stub/tvm_extra.patch
@@ -0,0 +1,123 @@
+diff --git a/src/runtime/dso_library.cc b/src/runtime/dso_library.cc
+index 81eb30ee1..785fb48ac 100644
+--- a/src/runtime/dso_library.cc
++++ b/src/runtime/dso_library.cc
+@@ -115,15 +115,16 @@ void DSOLibrary::Unload() {
+ #else
+
+ void DSOLibrary::Load(const std::string& name) {
++  abort(); /*
+   lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL);
+   ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " "
+-                                 << dlerror();
++                                 << dlerror(); */
+ }
+
+-void* DSOLibrary::GetSymbol_(const char* name) { return dlsym(lib_handle_, name); }
++void* DSOLibrary::GetSymbol_(const char* name) { abort(); /* return dlsym(lib_handle_, name); */ }
+
+ void DSOLibrary::Unload() {
+-  dlclose(lib_handle_);
++  abort(); // dlclose(lib_handle_);
+   lib_handle_ = nullptr;
+ }
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 7293abb60..d741ce2a2 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -1,6 +1,10 @@
+ cmake_minimum_required(VERSION 3.2)
+ project(tvm C CXX)
+ 
++set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -static-libgcc -static-libstdc++")  # Quoted so the flags stay one space-separated string, not a ;-list.
++set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libgcc -static-libstdc++")
++set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -static-libgcc -static-libstdc++")  # Append rather than overwrite existing linker flags.
++
+ # Utility functions
+ include(cmake/utils/Utils.cmake)
+ include(cmake/utils/FindCUDA.cmake)
+@@ -50,7 +54,7 @@ tvm_option(USE_FALLBACK_STL_MAP "Use TVM's POD compatible Map" OFF)
+ tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF)
+ tvm_option(USE_CMSISNN "Build with Arm CMSIS-NN" OFF)
+ tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON)
+-tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO)
++# tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" AUTO)
+ tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF)
+ tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to read performance counters" OFF)
+ tvm_option(USE_GTEST "Use GoogleTest for C++ sanity tests" AUTO)
+@@ -497,7 +501,7 @@ target_compile_definitions(tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logg
+ target_compile_definitions(tvm_runtime PUBLIC DMLC_USE_LOGGING_LIBRARY=<tvm/runtime/logging.h>)
+ 
+ # logging option for libbacktrace
+-include(cmake/modules/Logging.cmake)
++# include(cmake/modules/Logging.cmake)
+ 
+ include(cmake/modules/contrib/PAPI.cmake)
+ 
+diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
+index 5b3093ac8..ce0d33fa1 100644
+--- a/src/runtime/threading_backend.cc
++++ b/src/runtime/threading_backend.cc
+@@ -127,7 +127,7 @@ class ThreadGroup::Impl {
+ #if defined(__ANDROID__)
+       sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
+ #else
+-      pthread_setaffinity_np(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
++      abort(); // pthread_setaffinity_np(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
+ #endif
+     }
+     if (exclude_worker0) {  // main thread run task
+@@ -167,7 +167,7 @@ class ThreadGroup::Impl {
+ #if defined(__ANDROID__)
+     sched_setaffinity(pthread_self(), sizeof(cpu_set_t), &cpuset);
+ #else
+-    pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
++    abort(); // pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+ #endif
+ #endif
+   }
+@@ -227,7 +227,7 @@ class ThreadGroup::Impl {
+ 
+ ThreadGroup::ThreadGroup(int num_workers, std::function<void(int)> worker_callback,
+                          bool exclude_worker0)
+-    : impl_(new ThreadGroup::Impl(num_workers, worker_callback, exclude_worker0)) {}
++    : impl_((abort(), nullptr) /* new ThreadGroup::Impl(num_workers, worker_callback, exclude_worker0) */) {}
+ ThreadGroup::~ThreadGroup() { delete impl_; }
+ void ThreadGroup::Join() { impl_->Join(); }
+ 
+diff --git a/src/support/parallel_for.cc b/src/support/parallel_for.cc
+index e90967562..e55ed2b25 100644
+--- a/src/support/parallel_for.cc
++++ b/src/support/parallel_for.cc
+@@ -49,6 +49,8 @@ std::vector<std::vector<int>> rr_partitioner(int begin, int end, int step, int n
+ 
+ void parallel_for(int begin, int end, const std::function<void(int)>& f, int step,
+                   const PartitionerFuncType partitioner) {
++  abort();
++#if 0
+   static bool GLOBAL_PARALLEL_FOR_FLAG{false};
+   static std::mutex M_GLOBAL_PARALLEL_FOR_FLAG;
+   {
+@@ -91,10 +93,13 @@ void parallel_for(int begin, int end, const std::function<void(int)>& f, int ste
+   } catch (const std::exception& e) {
+     LOG(FATAL) << "Parallel_for error with " << e.what();
+   }
++#endif
+ }
+ 
+ void parallel_for_dynamic(int begin, int end, int num_threads,
+                           const std::function<void(int thread_id, int task_id)>& f) {
++  abort();
++#if 0
+   // Step 1. Sanity checks
+   if (begin == end) {
+     return;
+@@ -138,6 +143,7 @@ void parallel_for_dynamic(int begin, int end, int num_threads,
+   } catch (const std::exception& e) {
+     LOG(FATAL) << "RuntimeError: parallel_for_dynamic error with " << e.what();
+   }
++#endif
+ }
+ 
+ }  // namespace support
diff --git a/engine/dist-info/METADATA b/engine/dist-info/METADATA
index fabfee2c..7d665c45 100644
--- a/engine/dist-info/METADATA
+++ b/engine/dist-info/METADATA
@@ -8,13 +8,8 @@ Keywords: antares dnn
 Platform: UNKNOWN
 Requires-Dist: wheel (>=0.26) ; python_version >= "3"
 Requires-Dist: tornado ; python_version >= "3"
-Requires-Dist: psutil ; python_version >= "3"
 Requires-Dist: numpy ; python_version >= "3"
 Requires-Dist: decorator ; python_version >= "3"
-Requires-Dist: attrs ; python_version >= "3"
-Requires-Dist: pytest ; python_version >= "3"
-Requires-Dist: typed_ast ; python_version >= "3"
-Requires-Dist: cloudpickle ; python_version >= "3"
 
 Antares is an engine to automatically generate optimized kernels for multi-platform
 
diff --git a/graph_evaluator/client.py b/graph_evaluator/client.py
index 49dce9eb..ff324004 100644
--- a/graph_evaluator/client.py
+++ b/graph_evaluator/client.py
@@ -67,7 +67,10 @@ def eval(kernel_path, **kwargs):
       return eval_client.eval(kernel_path, **kwargs)
 
     is_wsl = 1 if (os.environ.get('IS_WSL', '0') == '1') else 0
-    if is_wsl == os.system(f'file {evaluator_path} | grep "MS Windows" >/dev/null 2>&1'):
+    with open(evaluator_path, 'rb') as fp:
+      exec_magic = fp.read(2)
+
+    if is_wsl == 0 and exec_magic == b'MZ':
       print(f"Antares should run under WSL-1/2 for this backend({backend}), otherwise, evaluation would be skipped.")
       exit(1)
 
diff --git a/lang/generic.py b/lang/generic.py
index cb2e3131..7c34d47b 100644
--- a/lang/generic.py
+++ b/lang/generic.py
@@ -150,7 +150,8 @@ def select_plan(plan_name):
   try:
     return select_plan(plan)
   except ModuleNotFoundError:
-    setattr(AntaresGlobal, 'mode', 'antares')
+    traceback.print_exc()
+    # setattr(AntaresGlobal, 'mode', 'antares')
     return None