diff --git a/LICENSE b/LICENSE
index 38a240c2f6..c5f50ba9c5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -297,6 +297,7 @@ license and copyright terms herein.
 =====================================================================
 SINGA bundles the following under MIT license:
 cmake/ThirdParty/FindOpenCL.cmake
+include/half.hpp
 
 Copyright (c) 2010-2016 Institute for Microelectronics, Institute
 for Analysis and Scientific Computing, TU Wien.
diff --git a/examples/cnn/train_cnn.py b/examples/cnn/train_cnn.py
index 5b9bba8861..d750d90418 100644
--- a/examples/cnn/train_cnn.py
+++ b/examples/cnn/train_cnn.py
@@ -121,8 +121,6 @@ def run(global_rank,
         from data import mnist
         train_x, train_y, val_x, val_y = mnist.load()
 
-    train_x = train_x.astype(np_dtype[precision])
-    val_x = val_x.astype(np_dtype[precision])
 
     num_channels = train_x.shape[1]
     image_size = train_x.shape[2]
@@ -216,6 +214,7 @@ def run(global_rank,
                 x = augmentation(x, batch_size)
                 if (image_size != model.input_size):
                     x = resize_dataset(x, model.input_size)
+                x = x.astype(np_dtype[precision])
             y = train_y[idx[b * batch_size:(b + 1) * batch_size]]
 
             # Copy the patch data into input tensors
@@ -246,6 +245,7 @@ def run(global_rank,
             if model.dimension == 4:
                 if (image_size != model.input_size):
                     x = resize_dataset(x, model.input_size)
+                x = x.astype(np_dtype[precision])
             y = val_y[b * batch_size:(b + 1) * batch_size]
             tx.copy_from_numpy(x)
             ty.copy_from_numpy(y)
diff --git a/examples/mlp/module.py b/examples/mlp/module.py
index ae89596b63..06dcc834fb 100644
--- a/examples/mlp/module.py
+++ b/examples/mlp/module.py
@@ -124,7 +124,7 @@ def create_model(pretrained=False, **kwargs):
    data = np.array([[a, b] for (a, b) in zip(x, y)], dtype=np_precision)
 
    dev = device.create_cuda_gpu_on(0)
-    sgd = opt.SGD(0.1, 0.9, 1e-5)
+    sgd = opt.SGD(0.1, 0.9, 1e-5, dtype=singa_dtype[args.precision])
    tx = tensor.Tensor((400, 2), dev, precision)
    ty = tensor.Tensor((400,), dev, tensor.int32)
    model = MLP(data_size=2, perceptron_size=3, num_classes=2)
diff --git a/python/singa/layer.py b/python/singa/layer.py
index 950a1fd294..390128eb9c 100644
--- a/python/singa/layer.py
+++ b/python/singa/layer.py
@@ -856,7 +856,7 @@ def forward(self, x):
         self.device_check(x, self.scale, self.bias, self.running_mean,
                           self.running_var)
 
-        self.type_check(x, self.scale, self.bias, self.running_mean,
+        self.dtype_check(x, self.scale, self.bias, self.running_mean,
                         self.running_var)
 
         y = autograd.batchnorm_2d(
diff --git a/src/model/operation/convolution.cc b/src/model/operation/convolution.cc
index 32e20e68b4..96313d20b1 100644
--- a/src/model/operation/convolution.cc
+++ b/src/model/operation/convolution.cc
@@ -495,7 +495,7 @@ CudnnConvHandle::CudnnConvHandle(
                                          channels / groups, kernel_h,
                                          kernel_w));
   if (prefer == "tensor_ops") {
-    std::cout<<"using tensor op\n";
+    // std::cout<<"using tensor op\n";
     CUDNN_CHECK(cudnnSetConvolutionMathType(conv_desc, CUDNN_TENSOR_OP_MATH));
     fp_alg = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
     bp_filter_alg = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;