diff --git a/benchmarks/benchmarks/multigpu/bench_multigpu_rgcn.py b/benchmarks/benchmarks/multigpu/bench_multigpu_rgcn.py
index 3dbc9e4c02c2..ec66a2d4f114 100644
--- a/benchmarks/benchmarks/multigpu/bench_multigpu_rgcn.py
+++ b/benchmarks/benchmarks/multigpu/bench_multigpu_rgcn.py
@@ -48,7 +48,6 @@ def __init__(
         num_hidden_layers=1,
         dropout=0,
         use_self_loop=False,
-        low_mem=True,
         layer_norm=False,
     ):
         super(EntityClassify, self).__init__()
@@ -61,7 +60,6 @@ def __init__(
         self.num_hidden_layers = num_hidden_layers
         self.dropout = dropout
         self.use_self_loop = use_self_loop
-        self.low_mem = low_mem
         self.layer_norm = layer_norm
 
         self.layers = nn.ModuleList()
@@ -75,7 +73,6 @@ def __init__(
                 self.num_bases,
                 activation=F.relu,
                 self_loop=self.use_self_loop,
-                low_mem=self.low_mem,
                 dropout=self.dropout,
                 layer_norm=layer_norm,
             )
@@ -91,7 +88,6 @@ def __init__(
                     self.num_bases,
                     activation=F.relu,
                     self_loop=self.use_self_loop,
-                    low_mem=self.low_mem,
                     dropout=self.dropout,
                     layer_norm=layer_norm,
                 )
@@ -106,7 +102,6 @@ def __init__(
                 self.num_bases,
                 activation=None,
                 self_loop=self.use_self_loop,
-                low_mem=self.low_mem,
                 layer_norm=layer_norm,
             )
         )
@@ -236,7 +231,6 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
         num_hidden_layers=args.n_layers - 2,
         dropout=args.dropout,
         use_self_loop=args.use_self_loop,
-        low_mem=args.low_mem,
         layer_norm=args.layer_norm,
     )
 
@@ -373,14 +367,12 @@ def run(proc_id, n_gpus, n_cpus, args, devices, dataset, split, queue=None):
 @utils.skip_if_not_4gpu()
 @utils.benchmark("time", timeout=600)
 @utils.parametrize("data", ["am", "ogbn-mag"])
-@utils.parametrize("low_mem", [True, False])
 @utils.parametrize("dgl_sparse", [True, False])
-def track_time(data, low_mem, dgl_sparse):
+def track_time(data, dgl_sparse):
     # load graph data
     dataset = utils.process_data(data)
     args = config()
     devices = [0, 1, 2, 3]
-    args.low_mem = low_mem
     args.dgl_sparse = dgl_sparse
     args.dataset = dataset
     ogb_dataset = False
@@ -572,49 +564,8 @@ def config():
         node_feats=False,
         num_workers=0,
         dgl_sparse=False,
-        low_mem=False,
     )
-    # parser.add_argument("--dropout", type=float, default=0,
-    #         help="dropout probability")
-    # parser.add_argument("--n-hidden", type=int, default=16,
-    #         help="number of hidden units")
-    # parser.add_argument("--gpu", type=str, default='0',
-    #         help="gpu")
-    # parser.add_argument("--lr", type=float, default=1e-2,
-    #         help="learning rate")
-    # parser.add_argument("--sparse-lr", type=float, default=2e-2,
-    #         help="sparse embedding learning rate")
-    # parser.add_argument("--n-bases", type=int, default=-1,
-    #         help="number of filter weight matrices, default: -1 [use all]")
-    # parser.add_argument("--n-layers", type=int, default=2,
-    #         help="number of propagation rounds")
-    # parser.add_argument("-e", "--n-epochs", type=int, default=50,
-    #         help="number of training epochs")
-    # parser.add_argument("-d", "--dataset", type=str, required=True,
-    #         help="dataset to use")
-    # parser.add_argument("--l2norm", type=float, default=0,
-    #         help="l2 norm coef")
-    # parser.add_argument("--fanout", type=str, default="4, 4",
-    #         help="Fan-out of neighbor sampling.")
-    # parser.add_argument("--use-self-loop", default=False, action='store_true',
-    #         help="include self feature as a special relation")
-    # fp = parser.add_mutually_exclusive_group(required=False)
-    # parser.add_argument("--batch-size", type=int, default=100,
-    #         help="Mini-batch size. ")
-    # parser.add_argument("--eval-batch-size", type=int, default=32,
-    #         help="Mini-batch size. ")
-    # parser.add_argument("--num-workers", type=int, default=0,
-    #         help="Number of workers for dataloader.")
-    # parser.add_argument("--low-mem", default=False, action='store_true',
-    #         help="Whether use low mem RelGraphCov")
-    # parser.add_argument("--dgl-sparse", default=False, action='store_true',
-    #         help='Use sparse embedding for node embeddings.')
-    # parser.add_argument('--node-feats', default=False, action='store_true',
-    #         help='Whether use node features')
-    # parser.add_argument('--layer-norm', default=False, action='store_true',
-    #         help='Use layer norm')
-    # parser.set_defaults(validation=True)
-    # args = parser.parse_args()
+
     return args
 
 
diff --git a/benchmarks/benchmarks/utils.py b/benchmarks/benchmarks/utils.py
index 7e20608869e7..b4e8159db155 100644
--- a/benchmarks/benchmarks/utils.py
+++ b/benchmarks/benchmarks/utils.py
@@ -534,7 +534,7 @@ def skip_if_not_4gpu():
-    """skip if DGL_BENCH_DEVICE is gpu"""
+    """Skip the decorated benchmark when GPU_COUNT is less than 4."""
 
     def _wrapper(func):
-        if GPU_COUNT != 4:
+        if GPU_COUNT < 4:
             # skip if not enabled
             print("Skip {}".format(func.__name__))
             func.benchmark_name = "skip_" + func.__name__
diff --git a/benchmarks/scripts/build_dgl_asv.sh b/benchmarks/scripts/build_dgl_asv.sh
index 47162dcf0fb1..28fb008c2725 100644
--- a/benchmarks/scripts/build_dgl_asv.sh
+++ b/benchmarks/scripts/build_dgl_asv.sh
@@ -10,7 +10,7 @@ pip install -r /asv/torch_gpu_pip.txt
 # build
 CMAKE_VARS="-DUSE_OPENMP=ON -DBUILD_TORCH=ON -DBUILD_SPARSE=ON -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda"
 if [[ $DEVICE == "gpu" ]]; then
-    CMAKE_VARS="-DUSE_CUDA=ON $CMAKE_VARS"
+    CMAKE_VARS="-DUSE_CUDA=ON -DUSE_NCCL=ON $CMAKE_VARS"
 fi
 arch=`uname -m`
 if [[ $arch == *"x86"* ]]; then
diff --git a/benchmarks/scripts/publish.sh b/benchmarks/scripts/publish.sh
index 833df473b4ea..484208a9c8cb 100644
--- a/benchmarks/scripts/publish.sh
+++ b/benchmarks/scripts/publish.sh
@@ -26,7 +26,7 @@ else
 fi
 
 WS_ROOT=/asv/dgl
-docker pull public.ecr.aws/s1o7b3d9/benchmark_test:cu116
+docker pull public.ecr.aws/s1o7b3d9/benchmark_test:cu116_v230110
 if [ -z "$DGL_REG_CONF" ]; then
     DOCKER_ENV_OPT="$DOCKER_ENV_OPT"
 else
@@ -56,14 +56,14 @@ if [[ $DEVICE == "cpu" ]]; then
         $DOCKER_MOUNT_OPT \
         $DOCKER_ENV_OPT \
         --shm-size="16g" \
-        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
+        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116_v230110 /bin/bash
 else
     docker run --name dgl-reg \
         --rm --gpus all \
         $DOCKER_MOUNT_OPT \
         $DOCKER_ENV_OPT \
         --shm-size="16g" \
-        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116 /bin/bash
+        --hostname=$MACHINE -dit public.ecr.aws/s1o7b3d9/benchmark_test:cu116_v230110 /bin/bash
 fi
 
 pwd
diff --git a/docker/Dockerfile.ci_benchmark b/docker/Dockerfile.ci_benchmark
index af4af27a080e..1b36808f844d 100644
--- a/docker/Dockerfile.ci_benchmark
+++ b/docker/Dockerfile.ci_benchmark
@@ -27,5 +27,4 @@ ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH}
 ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH}
 ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LIBRARY_PATH}
 ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
-ENV CUDA_VISIBLE_DEVICES=0
 ENV TF_FORCE_GPU_ALLOW_GROWTH=true
\ No newline at end of file