From 13936020d4cc3ccb2d4192adccaa282cef509193 Mon Sep 17 00:00:00 2001
From: kpuatamazon <56725192+kpuatamazon@users.noreply.github.com>
Date: Mon, 31 Aug 2020 18:10:47 +0100
Subject: [PATCH] [MXNET-1446] Quantization: intgemm matrix multiply wrappers (#17559)

This pull request adds wrappers to the intgemm matrix multiplication library: https://github.com/kpu/intgemm . A performance comparison with DNNL aka MKL-DNN is at kpu/intgemm#59 .

The library targets the thin matrix sizes seen in neural machine translation inference and was part of the top submission to the 2018 Workshop on Neural Generation and Translation efficiency task: https://neural.mt/papers/edinburgh/wnmt_marian_paper.pdf . The purpose is to add similar functionality to Sockeye: awslabs/sockeye#771 . Quantized Sockeye runs 2.95x as fast. One problem with the current MXQuantizeSymbol approach is that Sockeye does not have a static graph for everything.

intgemm uses a custom memory layout for the weight matrix to make more memory accesses consecutive, so there are operators to convert weights to that format. The idea is that weights are typically loaded once for inference.

On architectures without VNNI, intgemm uses saturating 16-bit accumulation. This avoids an expensive madd_epi16 instruction on every multiply by exploiting the fact that most neural network parameters are near 0.

Because x86 only offers an unsigned * signed instruction and most people want signed * signed, there are two strategies one can take:

1. Add 128 to the data so it is unsigned. This biases the output. DNNL calculates this bias on the fly by summing the weights, then subtracts it out during the GEMM. intgemm calculates this bias in advance, so it can be subtracted from the bias term with no overhead at runtime. A drawback of this strategy is that it makes the accumulator bigger, requiring more upcasting with an expensive madd_epi16 instruction.

2. Emulate signed * signed by normalizing the sign bit into the second argument. This requires extra instructions in the hot loop but keeps the accumulator small, so it is less necessary to accumulate into 32-bit integers and madd_epi16 can be avoided.

Both intgemm and DNNL implement strategy 1; intgemm also implements strategy 2.

Similar to DNNL, intgemm has runtime CPUID selection among backends for SSSE3, AVX2, AVX512BW, and AVX512VNNI.
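For reference, a minimal sketch of the intended inference workflow with the new operators. It follows the unit tests added in this patch; the shapes and the zero bias are illustrative, and the scaling arithmetic assumes out_type='float32' with int8 data:

    import mxnet as mx

    # Offline: quantize the weight and rearrange it into intgemm's CPU-dependent layout.
    weight = mx.nd.random_uniform(low=-0.5, high=0.5, shape=(8, 64))  # (num_hidden, inner); inner % 64 == 0
    weight_max = mx.nd.contrib.intgemm_maxabsolute(weight)
    weight_prepared = mx.nd.contrib.intgemm_prepare_weight(weight, weight_max)

    # At inference time: quantize the activations, then multiply.
    data = mx.nd.random_uniform(low=-0.5, high=0.5, shape=(4, 64))
    data_max = mx.nd.contrib.intgemm_maxabsolute(data)
    data_prepared = mx.nd.contrib.intgemm_prepare_data(data, data_max)

    # Undo both quantization scales to get a float32 result; bias is added after scaling.
    scaling = data_max / 127.0 * weight_max / 127.0
    bias = mx.nd.zeros((8,))
    out = mx.nd.contrib.intgemm_fully_connected(data_prepared, weight_prepared, scaling, bias,
                                                no_bias=False, flatten=False,
                                                out_type='float32', num_hidden=8)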
--- CMakeLists.txt | 26 ++ LICENSE | 2 + include/mxnet/base.h | 2 +- .../intgemm/intgemm_fully_connected_op.cc | 328 ++++++++++++++++++ .../contrib/intgemm/max_absolute_op.cc | 119 +++++++ .../contrib/intgemm/prepare_data_op.cc | 134 +++++++ .../contrib/intgemm/prepare_weight_op.cc | 180 ++++++++++ .../contrib/intgemm/take_weight_op.cc | 146 ++++++++ src/storage/cpu_device_storage.h | 2 +- src/storage/storage_manager_helpers.h | 2 +- tests/python/unittest/test_contrib_intgemm.py | 219 ++++++++++++ 11 files changed, 1157 insertions(+), 3 deletions(-) create mode 100644 src/operator/contrib/intgemm/intgemm_fully_connected_op.cc create mode 100644 src/operator/contrib/intgemm/max_absolute_op.cc create mode 100644 src/operator/contrib/intgemm/prepare_data_op.cc create mode 100644 src/operator/contrib/intgemm/prepare_weight_op.cc create mode 100644 src/operator/contrib/intgemm/take_weight_op.cc create mode 100644 tests/python/unittest/test_contrib_intgemm.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 814c8c99f65e..a48a63032098 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,6 +64,7 @@ if(USE_MKL_IF_AVAILABLE AND (NOT APPLE) AND (NOT MSVC) AND (CMAKE_HOST_SYSTEM_PR else() option(USE_MKLDNN "Build with MKL-DNN support" OFF) endif() +cmake_dependent_option(USE_INTGEMM "Build with x86_64 intgemm library for low-precision multiplication" ON "CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64" OFF) if(NOT MSVC) option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON) else() @@ -278,6 +279,22 @@ if(USE_MKLDNN) set_target_properties(dnnl PROPERTIES CXX_CLANG_TIDY "") # don't lint 3rdparty dependency endif() +if(USE_INTGEMM) + message(STATUS "Using intgemm") + include(FetchContent) + FetchContent_Declare( + intgemm + GIT_REPOSITORY https://github.com/kpu/intgemm.git + GIT_TAG 02f671cf537fdbc818cf8111d1d9e557a8650d7a + ) + FetchContent_GetProperties(intgemm) + if(NOT intgemm_POPULATED) + FetchContent_Populate(intgemm) + endif() + add_subdirectory(${intgemm_SOURCE_DIR} ${intgemm_BINARY_DIR} EXCLUDE_FROM_ALL) + add_definitions(-DMXNET_USE_INTGEMM=1) +endif() + # Allow Cuda compiles outside of src tree to find things in 'src' and 'include' include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) @@ -474,6 +491,11 @@ endif() FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") +if(NOT USE_INTGEMM) + FILE(GLOB_RECURSE INTGEMM_OPERATOR_SOURCE "src/operator/contrib/intgemm/*.cc" "src/operator/contrib/intgemm/*.h") + list(REMOVE_ITEM SOURCE ${INTGEMM_OPERATOR_SOURCE}) +endif() + # add nnvm to source FILE(GLOB_RECURSE NNVMSOURCE 3rdparty/tvm/nnvm/src/c_api/*.cc @@ -750,6 +772,10 @@ if(USE_MKLDNN) ${CMAKE_BINARY_DIR}/3rdparty/mkldnn/include/dnnl_version.h ${CMAKE_SOURCE_DIR}/include/mkldnn/) endif() +if(USE_INTGEMM) + target_link_libraries(mxnet PRIVATE intgemm) +endif() + function(BuildTVMOP) # scope the variables in BuildTVM.cmake to avoid conflict include(cmake/BuildTVM.cmake) diff --git a/LICENSE b/LICENSE index 9aa20d166394..4a8f8dd5e6e8 100644 --- a/LICENSE +++ b/LICENSE @@ -309,6 +309,8 @@ Licensed MIT © Zeno Rocha 11. mx-theme - For details, see docs/python_docs/themes/mx-theme/LICENSE Copyright (c) 2016 myyasuda + 12. 
intgemm - Refer to 3rdparty/intgemm/LICENSE + Copyright (c) 2017--2019 University of Edinburgh, Nikolay Bogoychev, Mateusz Chudyk, Kenneth Heafield, and Microsoft Corporation ======================================================================================= diff --git a/include/mxnet/base.h b/include/mxnet/base.h index aa0021d543a0..addd7665f5be 100644 --- a/include/mxnet/base.h +++ b/include/mxnet/base.h @@ -539,7 +539,7 @@ inline std::ostream& operator<<(std::ostream &out, const Context &ctx) { #define ADD_FILELINE "\n\nDefined in " __FILE__ ":L" STRINGIZE(__LINE__) -#if MXNET_USE_MKLDNN == 1 +#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1 constexpr size_t kMKLDNNAlign = 64; #endif diff --git a/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc new file mode 100644 index 000000000000..216f5ce47ecc --- /dev/null +++ b/src/operator/contrib/intgemm/intgemm_fully_connected_op.cc @@ -0,0 +1,328 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file intgemm_fully_connected_op.cc + * \brief Operator wrapping intgemm's Multiply routine + */ + +#include +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +struct IntgemmFullyConnectedParam : public dmlc::Parameter { + int out_type; + int num_hidden; + bool no_bias; + bool flatten; + DMLC_DECLARE_PARAMETER(IntgemmFullyConnectedParam) { + // This part os a copy of the FullyConnected parameters. + DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1) + .describe("Number of hidden nodes of the output."); + DMLC_DECLARE_FIELD(no_bias).set_default(false) + .describe("Whether to disable bias parameter."); + DMLC_DECLARE_FIELD(flatten).set_default(true) + .describe("Whether to collapse all but the first axis of the input data tensor."); + + DMLC_DECLARE_FIELD(out_type) + .add_enum("float32", mshadow::kFloat32) + .add_enum("int32", mshadow::kInt32) + .set_default(mshadow::kFloat32) + .describe("Output data type."); + } +}; +DMLC_REGISTER_PARAMETER(IntgemmFullyConnectedParam); + +namespace { +// Parse the above fields into indices for parameters. +// The order is: data weight [scaling] [bias]. +struct ParameterIndices { + explicit ParameterIndices(const IntgemmFullyConnectedParam& param) : + data(0), + weight(1), + scaling(param.out_type == mshadow::kFloat32 ? 2 : kInvalid), + bias(param.no_bias ? kInvalid : (HaveScaling() ? 
3 : 2)), + count(2U + HaveScaling() + HaveBias()) {} + bool HaveScaling() const { return scaling != kInvalid; } + bool HaveBias() const { return bias != kInvalid; } + const unsigned int data; + const unsigned int weight; + const unsigned int scaling; + const unsigned int bias; + const unsigned int count; + static const unsigned int kInvalid = std::numeric_limits::max(); +}; +template ParameterIndices Sanity(const nnvm::NodeAttrs& attrs, + T* in, + T* out) { + // 3-4 parameters: A, B, scaling, and optional bias + ParameterIndices ret(nnvm::get(attrs.parsed)); + CHECK_EQ(in->size(), ret.count); + CHECK_EQ(out->size(), 1U); + return ret; +} +} // namespace + +inline bool IntgemmFullyConnectedOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + const ParameterIndices indices(Sanity(attrs, in_shape, out_shape)); + const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); + // This follows FullyConnectedShape except for scaling. + using namespace mshadow; + mxnet::TShape dshape = (*in_shape)[indices.data]; + mxnet::TShape oshape = (*out_shape)[0]; + // require data to be known + if (!mxnet::ndim_is_known(dshape)) return false; + + index_t num_input; + if (!param.flatten) { + num_input = dshape[dshape.ndim()-1]; + } else { + num_input = dshape.ProdShape(1, dshape.ndim()); + } + SHAPE_ASSIGN_CHECK(*in_shape, indices.weight, Shape2(param.num_hidden, num_input)); + if (indices.HaveScaling()) { + SHAPE_ASSIGN_CHECK(*in_shape, indices.scaling, mxnet::TShape(1, 1)); + } + if (indices.HaveBias()) { + if (!shape_assign(&(*in_shape)[indices.bias], Shape1(param.num_hidden)) && + !shape_assign(&(*in_shape)[indices.bias], Shape2(param.num_hidden, 1))) { + LOG(FATAL) << "Unexpected shape for bias " << (*in_shape)[indices.bias]; + } + } + + if (!param.flatten) { + mxnet::TShape result_shape(dshape); + result_shape[dshape.ndim()-1] = param.num_hidden; + SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape); + } else { + SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden)); + } + if (oshape.ndim() > 0) { + dshape[0] = oshape[0]; + SHAPE_ASSIGN_CHECK(*in_shape, indices.data, dshape); + } + return true; +} + +bool IntgemmFullyConnectedOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + const ParameterIndices indices(Sanity(attrs, in_attrs, out_attrs)); + const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); + + // Match the configuration for output. + TYPE_ASSIGN_CHECK(*out_attrs, 0, param.out_type); + if (indices.HaveBias()) { + // Bias has same type as output. + TYPE_ASSIGN_CHECK(*in_attrs, indices.bias, (*out_attrs)[0]); + TYPE_ASSIGN_CHECK(*out_attrs, 0, (*in_attrs)[indices.bias]); + } + // Scaling is float32. + if (indices.HaveScaling()) { + TYPE_ASSIGN_CHECK(*in_attrs, indices.scaling, mshadow::kFloat32); + } + // Users have to prepare B. It wasn't intended to be efficient. + TYPE_ASSIGN_CHECK(*in_attrs, indices.weight, mshadow::kInt8); + // A can be a float (in which case it is automatically quantized) or int8. 
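+  // If the data type is not known yet, defer inference; the final return below only
+  // accepts int8 (already prepared) or float32 (quantized on the fly) for data.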
+ if (type_is_none((*in_attrs)[indices.data])) { + return false; + } + return ((*in_attrs)[indices.data] == mshadow::kInt8 || + (*in_attrs)[indices.data] == mshadow::kFloat32); +} + +void IntgemmFullyConnectedOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ParameterIndices indices(Sanity(attrs, &inputs, &outputs)); + const IntgemmFullyConnectedParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "TODO: doing more than overwriting for intgemm."; + + const TBlob &A = inputs[indices.data], &B = inputs[indices.weight], &C = outputs[0]; + + CHECK(A.type_flag_ == mshadow::kInt8 || A.type_flag_ == mshadow::kFloat32); + CHECK_EQ(B.type_flag_, mshadow::kInt8); + CHECK(C.type_flag_ == mshadow::kInt32 || C.type_flag_ == mshadow::kFloat32); + CHECK(A.CheckContiguous()); + CHECK(B.CheckContiguous()); + CHECK(C.CheckContiguous()); + CHECK_GE(A.shape_.ndim(), 1); + CHECK_GE(B.shape_.ndim(), 2); + size_t A_rows = A.shape_.ProdShape(0, A.shape_.ndim() - 1); + size_t inner = A.shape_[A.shape_.ndim() - 1]; + CHECK_EQ(B.shape_[B.shape_.ndim() - 1], inner); + size_t B_cols = B.shape_.ProdShape(0, B.shape_.ndim() - 1); + + CHECK_EQ(C.shape_.Size(), A_rows * B_cols); + + bool bias = !param.no_bias; + if (bias) { + CHECK_EQ(inputs[indices.bias].type_flag_, C.type_flag_); + CHECK_EQ(inputs[indices.bias].shape_.Size(), param.num_hidden); + } + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << + "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << + "intgemm requires B have a multiple of " << ::intgemm::Int8::tile_info.b_cols << + " columns in the equation C = AB."; + + float out_float_multiplier; + if (indices.HaveScaling()) { + out_float_multiplier = *inputs[indices.scaling].dptr(); + } else { + out_float_multiplier = 0.0; // Unused; stop compiler from complaining. + } + + int8_t *A_quant; + mshadow::Tensor A_quant_store; + if (A.type_flag_ == mshadow::kFloat32) { + const float *A_raw = A.dptr(); + // Quantize A for the user. + // Future: allow scale to be passed in? Should the induced scale be an output? 
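+  // The induced scale maps maxabs(A) to 127, so A_quant * B is larger than A * B by that
+  // factor; divide the user-provided multiplier by it before the callback unquantizes
+  // the output.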
+ float scale = 127.0 / ::intgemm::MaxAbsolute(A_raw, A_raw + A.shape_.Size()); + out_float_multiplier /= scale; + A_quant_store = ctx.requested[0].get_space_typed( + mshadow::Shape1(A.shape_.Size()), + ctx.get_stream()); + A_quant = A_quant_store.dptr_; + ::intgemm::Int8::PrepareA(A_raw, A_quant, scale, A_rows, inner); + } else { + CHECK_EQ(A.type_flag_, mshadow::kInt8); + A_quant = A.dptr(); + } + const int8_t *B_quant = B.dptr(); + CHECK_EQ(reinterpret_cast(A_quant) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + CHECK_EQ(reinterpret_cast(B_quant) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + if (C.type_flag_ == mshadow::kFloat32) { + CHECK_EQ(reinterpret_cast(C.dptr()) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + } else { + CHECK_EQ(reinterpret_cast(C.dptr()) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + } + + if (bias) { + if (C.type_flag_ == mshadow::kFloat32) { + CHECK_EQ(reinterpret_cast(inputs[indices.bias].dptr()) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite cb( + out_float_multiplier, + inputs[indices.bias].dptr(), + C.dptr()); + ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb); + } else { + // int32 + CHECK_EQ(reinterpret_cast(inputs[indices.bias].dptr()) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + ::intgemm::callbacks::AddBiasAndWrite cb( + inputs[indices.bias].dptr(), + C.dptr()); + ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb); + } + } else { + if (C.type_flag_ == mshadow::kFloat32) { + ::intgemm::callbacks::UnquantizeAndWrite cb(out_float_multiplier, C.dptr()); + ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb); + } else { + // int32 + ::intgemm::callbacks::Write cb(C.dptr()); + ::intgemm::Int8::Multiply(A_quant, B_quant, A_rows, inner, B_cols, cb); + } + } +} + +NNVM_REGISTER_OP(_contrib_intgemm_fully_connected) +.add_alias("_npx_intgemm_fully_connected") +.describe(R"code(Multiply matrices using 8-bit integers. data * weight. + +Input tensor arguments are: data weight [scaling] [bias] + +data: either float32 or prepared using intgemm_prepare_data (in which case it is int8). + +weight: must be prepared using intgemm_prepare_weight. + +scaling: present if and only if out_type is float32. If so this is multiplied by the result before adding bias. Typically: +scaling = (max passed to intgemm_prepare_weight)/127.0 if data is in float32 +scaling = (max_passed to intgemm_prepare_data)/127.0 * (max passed to intgemm_prepare_weight)/127.0 if data is in int8 + +bias: present if and only if !no_bias. This is added to the output after scaling and has the same number of columns as the output. + +out_type: type of the output. 
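+
+Example (illustrative; weight_prepared comes from intgemm_prepare_weight and the last
+dimension of data and weight must be a multiple of 64)::
+
+  out = mx.nd.contrib.intgemm_fully_connected(data, weight_prepared, scaling, bias,
+                                              no_bias=False, flatten=False,
+                                              out_type='float32', num_hidden=num_hidden)
+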
+)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs([](const NodeAttrs& attrs) { + return ParameterIndices(nnvm::get(attrs.parsed)).count; +}) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + std::vector ret{"data", "weight"}; + ParameterIndices indices(nnvm::get(attrs.parsed)); + if (indices.HaveScaling()) { + ret.emplace_back("scaling"); + } + if (indices.HaveBias()) { + ret.emplace_back("bias"); + } + return ret; + }) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FInferShape", IntgemmFullyConnectedOpShape) +.set_attr("FInferType", IntgemmFullyConnectedOpType) +.set_attr("FCompute", IntgemmFullyConnectedOpForwardCPU) +.add_argument( + "data", + "NDArray-or-Symbol", + "First argument to multiplication. Tensor of float32 (quantized on the fly) or int8 from " + "intgemm_prepare_data. If you use a different quantizer, be sure to ban -128. The last " + "dimension must be a multiple of 64.") +.add_argument( + "weight", + "NDArray-or-Symbol", + "Second argument to multiplication. Tensor of int8 from intgemm_prepare_weight. The last " + "dimension must be a multiple of 64. The product of non-last dimensions must be a multiple " + "of 8.") +.add_argument("scaling", "NDArray-or-Symbol", "Scaling factor to apply if output type is float32.") +.add_argument("bias", "NDArray-or-Symbol", "Bias term.") +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. +.set_attr("FGradient", MakeZeroGradNodes) +.add_arguments(IntgemmFullyConnectedParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/intgemm/max_absolute_op.cc b/src/operator/contrib/intgemm/max_absolute_op.cc new file mode 100644 index 000000000000..01e10b0f9908 --- /dev/null +++ b/src/operator/contrib/intgemm/max_absolute_op.cc @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file max_absolute_op.cc + * \brief Computes maximum absolute value of a tensor using intgemm + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +inline bool MaxAbsoluteOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // One in, one out. 
+ CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, mxnet::TShape(1, 1)); + return shape_is_known(in_attrs->at(0)); +} + +inline bool MaxAbsoluteOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + return true; +} + +inline bool MaxAbsoluteOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + *dispatch_mode = DispatchMode::kFCompute; + CHECK_EQ(in_attrs->size(), 1U); + CHECK_EQ(out_attrs->size(), 1U); + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +void MaxAbsoluteOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const TBlob &in = inputs.front(), &out = outputs.front(); + CHECK_EQ(in.type_flag_, mshadow::kFloat32); + CHECK_EQ(out.type_flag_, mshadow::kFloat32); + CHECK(in.CheckContiguous()); + CHECK(out.CheckContiguous()); + + const std::size_t size = in.shape_.Size(); + + const float *data = in.dptr(); + // To maintain alignment, be a multiple of AVX512 register size. + const std::size_t kMultiple = 512 / 8; + CHECK_EQ(reinterpret_cast(data) % kMultiple, 0) + << "Data must be aligned to " << kMultiple << " bytes."; + + float result = ::intgemm::MaxAbsolute(data, data + size); + KERNEL_ASSIGN(*out.dptr(), req[0], result); +} + +NNVM_REGISTER_OP(_contrib_intgemm_maxabsolute) +.add_alias("_npx_intgemm_maxabsolute") +.describe(R"code(Compute the maximum absolute value in a tensor of float32 fast on a CPU. The tensor's total size must be a multiple of 16 and aligned to a multiple of 64 bytes. +mxnet.nd.contrib.intgemm_maxabsolute(arr) == arr.abs().max() +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; + }) +.set_attr("FInferShape", MaxAbsoluteOpShape) +.set_attr("FInferType", MaxAbsoluteOpType) +.set_attr("FInferStorageType", MaxAbsoluteOpStorageType) +.set_attr("FCompute", MaxAbsoluteOpForwardCPU) +.set_attr("FInplaceOption", + [](const NodeAttrs& attrs) { + return std::vector >{{0, 0}}; + }) +.add_argument("data", "NDArray-or-Symbol", "Tensor to compute maximum absolute value of"); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/intgemm/prepare_data_op.cc b/src/operator/contrib/intgemm/prepare_data_op.cc new file mode 100644 index 000000000000..1d5719de36d2 --- /dev/null +++ b/src/operator/contrib/intgemm/prepare_data_op.cc @@ -0,0 +1,134 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file prepare_data_op.cc + * \brief Converts data aka A matrices (typically activations) to intgemm's + * representation for A in C=AB. This just quantizes to int8 and bans -128. + * The only difference from Quantize/QuantizeV2 is that it bans -128. + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +bool PrepareDataOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // data and maximum + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + + SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1)); + + return shape_is_known(out_attrs->at(0)); +} + +bool PrepareDataOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + + // This routine converts from float to int8 with a scaling factor + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + return true; +} + +bool PrepareDataOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage); + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage); + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage); + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); + return true; +} + +void PrepareDataOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites"; + const TBlob &in = inputs[0], &out = outputs[0]; + + CHECK_EQ(in.type_flag_, mshadow::kFloat32); + CHECK_EQ(out.type_flag_, mshadow::kInt8); + CHECK(in.CheckContiguous()); + CHECK(out.CheckContiguous()); + + const float *A = in.dptr(); + int8_t *quantA = out.dptr(); + CHECK_EQ(reinterpret_cast(A) % 64, 0); + CHECK_EQ(reinterpret_cast(quantA) % 64, 0); + const float multiplier = 127.0 / *inputs[1].dptr(); + ::intgemm::Int8::Quantize(A, quantA, multiplier, in.shape_.Size()); +} + +NNVM_REGISTER_OP(_contrib_intgemm_prepare_data) +.add_alias("_npx_intgemm_prepare_data") +.describe(R"code(This operator converts quantizes float32 to int8 while also banning -128. + +It it suitable for preparing an data matrix for use by intgemm's C=data * weights operation. + +The float32 values are scaled such that maxabs maps to 127. Typically maxabs = maxabsolute(A). 
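+
+Example (illustrative)::
+
+  maxabs = mx.nd.contrib.intgemm_maxabsolute(data)
+  data_int8 = mx.nd.contrib.intgemm_prepare_data(data, maxabs)
+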
+)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "maxabs"}; + }) +.set_attr("FInferShape", PrepareDataOpShape) +.set_attr("FInferType", PrepareDataOpType) +.set_attr("FInferStorageType", PrepareDataOpStorageType) +.set_attr("FCompute", PrepareDataOpForwardCPU) +.add_argument("data", "NDArray-or-Symbol", "Activation matrix to be prepared for multiplication.") +.add_argument( + "maxabs", + "NDArray-or-Symbol", + "Maximum absolute value to be used for scaling. (The values will be multiplied by 127.0 / " + "maxabs.") +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. +.set_attr("FGradient", MakeZeroGradNodes); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/intgemm/prepare_weight_op.cc b/src/operator/contrib/intgemm/prepare_weight_op.cc new file mode 100644 index 000000000000..ad106ebca00b --- /dev/null +++ b/src/operator/contrib/intgemm/prepare_weight_op.cc @@ -0,0 +1,180 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file prepare_weight_op.cc + * \brief Converts weight matrices to intgemm's representation. + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +struct PrepareWeightParam : public dmlc::Parameter { + bool already_quantized; + DMLC_DECLARE_PARAMETER(PrepareWeightParam) { + DMLC_DECLARE_FIELD(already_quantized).set_default(false) + .describe("Is the weight matrix already quantized?"); + } +}; +DMLC_REGISTER_PARAMETER(PrepareWeightParam); + +bool PrepareWeightOpShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector* in_attrs, + mxnet::ShapeVector* out_attrs) { + // Optimal maximum parameter. 
+ CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize."; + CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling."; + CHECK_EQ(out_attrs->size(), 1U); + + SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0)); + SHAPE_ASSIGN_CHECK(*in_attrs, 0, out_attrs->at(0)); + + if (in_attrs->size() == 2U) { + SHAPE_ASSIGN_CHECK(*in_attrs, 1, mxnet::TShape(1, 1)); + } + return shape_is_known(out_attrs->at(0)); +} + +bool PrepareWeightOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize."; + CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling."; + if (in_attrs->size() == 1U) { + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8); + } else if (in_attrs->size() == 2U) { + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32); + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32); + } + return true; +} + +bool PrepareWeightOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_GE(in_attrs->size(), 1U) << "Need at least weight to quantize."; + CHECK_LE(in_attrs->size(), 2U) << "weight and maximum for scaling."; + CHECK_EQ(out_attrs->size(), 1U); + STORAGE_TYPE_ASSIGN_CHECK(*out_attrs, 0, kDefaultStorage); + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 0, kDefaultStorage); + if (in_attrs->size() == 2U) { + STORAGE_TYPE_ASSIGN_CHECK(*in_attrs, 1, kDefaultStorage); + } + DISPATCH_MODE_ASSIGN_CHECK(dispatch_mode, 0, DispatchMode::kFComputeEx); + return true; +} + +void PrepareWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const PrepareWeightParam& params = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), params.already_quantized ? 
1U : 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "intgemm only overwrites"; + + const TBlob &in = inputs.front(); + const TBlob &out = outputs.front(); + CHECK_EQ(out.type_flag_, mshadow::kInt8); + CHECK(in.CheckContiguous()); + CHECK(out.CheckContiguous()); + size_t B_cols = in.shape_.ProdShape(0, in.shape_.ndim() - 1); + size_t inner = in.shape_[in.shape_.ndim() - 1]; + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << + "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << + "intgemm requires the output dimension (the product of all but the last dimension of the " + "weight matrix) to be a multiple of " << ::intgemm::Int8::tile_info.b_cols << "."; + + int8_t *quantB = out.dptr(); + CHECK_EQ(reinterpret_cast(quantB) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + CHECK(in.type_flag_ == mshadow::kFloat32 || in.type_flag_ == mshadow::kInt8) << + "Expected either 32-bit values to be quantized or 8-bit values to rearrange."; + if (in.type_flag_ == mshadow::kInt8) { + const int8_t *B = in.dptr(); + CHECK_EQ(reinterpret_cast(B) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + ::intgemm::Int8::PrepareBQuantizedTransposed(B, quantB, inner, B_cols); + } else if (in.type_flag_ == mshadow::kFloat32) { + const float *B = in.dptr(); + CHECK_EQ(reinterpret_cast(B) % 64, 0) << + "Pointers should be aligned to a multiple of 64."; + ::intgemm::Int8::PrepareBTransposed( + B, + quantB, + 127.0 / *inputs[1].dptr(), + inner, + B_cols); + } +} + +NNVM_REGISTER_OP(_contrib_intgemm_prepare_weight) +.add_alias("_npx_intgemm_prepare_weight") +.describe(R"code(This operator converts a weight matrix in column-major format to intgemm's internal fast representation of weight matrices. MXNet customarily stores weight matrices in column-major (transposed) format. This operator is not meant to be fast; it is meant to be run offline to quantize a model. + +In other words, it prepares weight for the operation C = data * weight^T. + +If the provided weight matrix is float32, it will be quantized first. The quantization function is (int8_t)(127.0 / max * weight) where multiplier is provided as argument 1 (the weight matrix is argument 0). Then the matrix will be rearranged into the CPU-dependent format. + +If the provided weight matrix is already int8, the matrix will only be rearranged into the CPU-dependent format. This way one can quantize with intgemm_prepare_data (which just quantizes), store to disk in a consistent format, then at load time convert to CPU-dependent format with intgemm_prepare_weight. + +The internal representation depends on register length. So AVX512, AVX2, and SSSE3 have different formats. AVX512BW and AVX512VNNI have the same representation. +)code" ADD_FILELINE) +.set_attr_parser(ParamParser) +.set_num_inputs([](const NodeAttrs& attrs) { + const PrepareWeightParam& params = nnvm::get(attrs.parsed); + return params.already_quantized ? 1 : 2; +}) +.set_num_outputs(1) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + const PrepareWeightParam& params = nnvm::get(attrs.parsed); + return params.already_quantized ? 
+ std::vector{"weight"} : std::vector{"weight", "maxabs"}; +}) +.set_attr("FInferShape", PrepareWeightOpShape) +.set_attr("FInferType", PrepareWeightOpType) +.set_attr("FInferStorageType", PrepareWeightOpStorageType) +.set_attr("FCompute", PrepareWeightOpForwardCPU) +.add_argument("weight", "NDArray-or-Symbol", "Parameter matrix to be prepared for multiplication.") +.add_argument( + "maxabs", + "NDArray-or-Symbol", + "Maximum absolute value for scaling. The weights will be multipled by 127.0 / maxabs.") +// TODO(Xinyu): a temp solution to enable GluonCV INT8 flow, +// will be reverted after the improvement of CachedOP is done. +.set_attr("FGradient", MakeZeroGradNodes) +.add_arguments(PrepareWeightParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/contrib/intgemm/take_weight_op.cc b/src/operator/contrib/intgemm/take_weight_op.cc new file mode 100644 index 000000000000..09e320e47327 --- /dev/null +++ b/src/operator/contrib/intgemm/take_weight_op.cc @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file select_weight_op.cc + * \brief Takes from the all-but-last dimension of a tensor stored in + * intgemm's weight format. This is particularly useful for output matrices where + * some outputs are excluded. + */ + +#include +#include +#include "../../mshadow_op.h" +#include "../../mxnet_op.h" +#include "../../operator_common.h" +#include "../../tensor/init_op.h" + +#include "intgemm/intgemm.h" + +namespace mxnet { +namespace op { + +inline bool TakeWeightOpShape(const nnvm::NodeAttrs& shape, + mxnet::ShapeVector* in_shape, + mxnet::ShapeVector* out_shape) { + // 0 is weight, 1 is indices. + CHECK_EQ(in_shape->size(), 2U); + CHECK_EQ(out_shape->size(), 1U); + + mxnet::TShape &weight = (*in_shape)[0]; + mxnet::TShape &indices = (*in_shape)[1]; + mxnet::TShape &out = (*out_shape)[0]; + + // weight matrices should be 2-dimensional by now. + SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape(2, -1)); + SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape(2, -1)); + // indices are 1-dimensional. 
+ SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape(1, -1)); + + SHAPE_ASSIGN_CHECK(*out_shape, 0, mxnet::TShape({indices[0], weight[1]})); + SHAPE_ASSIGN_CHECK(*in_shape, 0, mxnet::TShape({-1, out[1]})); + SHAPE_ASSIGN_CHECK(*in_shape, 1, mxnet::TShape({out[0]})); + + return shape_is_known(weight) && shape_is_known(indices) && shape_is_known(out); +} + +inline bool TakeWeightOpType(const nnvm::NodeAttrs& attrs, + std::vector* in_attrs, + std::vector* out_attrs) { + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + + TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kInt8); + TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kInt32); + return true; +} + +inline bool TakeWeightOpStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector* in_attrs, + std::vector* out_attrs) { + *dispatch_mode = DispatchMode::kFCompute; + CHECK_EQ(in_attrs->size(), 2U); + CHECK_EQ(out_attrs->size(), 1U); + (*out_attrs)[0] = kDefaultStorage; + return true; +} + +void TakeWeightOpForwardCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(req[0], kWriteTo) << "TODO request types other than write"; + const TBlob &weight = inputs.front(), &indices = inputs[1], &out = outputs.front(); + CHECK_EQ(weight.type_flag_, mshadow::kInt8); + CHECK_EQ(indices.type_flag_, mshadow::kInt32); + CHECK_EQ(out.type_flag_, mshadow::kInt8); + CHECK(weight.CheckContiguous()); + CHECK(indices.CheckContiguous()); + CHECK(out.CheckContiguous()); + size_t B_cols = indices.shape_[0]; + size_t inner = weight.shape_[weight.shape_.ndim() - 1]; + CHECK_EQ(inner % ::intgemm::Int8::tile_info.b_rows, 0) << + "intgemm requires the inner dimension be a multiple of " << ::intgemm::Int8::tile_info.b_rows; + CHECK_EQ(B_cols % ::intgemm::Int8::tile_info.b_cols, 0) << + "For efficiency, intgemm requires there to be a multiple of " << + ::intgemm::Int8::tile_info.b_cols << " indices."; + // mxnet doesn't have a uint32_t type so we'll just pointer cast. But check the sizes are the + // same. Ideally this should be static. + assert(sizeof(int32_t) == sizeof(::intgemm::Index)); + const ::intgemm::Index *index = + reinterpret_cast(indices.dptr()); + + ::intgemm::Int8::SelectColumnsB( + weight.dptr(), + out.dptr(), + inner, + index, + index + B_cols); +} + +NNVM_REGISTER_OP(_contrib_intgemm_take_weight) +.add_alias("_npx_intgemm_take_weight") +.describe(R"code(Index a weight matrix stored in intgemm's weight format. +The indices select the outputs of matrix multiplication, not the inner dot product dimension. 
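+
+Example (illustrative; indices are int32 and their count should be a multiple of the CPU's
+column tile width, 8 in the unit tests)::
+
+  weight_prepared = mx.nd.contrib.intgemm_prepare_weight(weight_int8, already_quantized=True)
+  subset = mx.nd.contrib.intgemm_take_weight(weight_prepared,
+                                             mx.nd.array([0, 1, 4, 5, 8, 9, 12, 13], dtype='int32'))
+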
+)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"weight", "indices"}; + }) +.set_attr("FInferShape", TakeWeightOpShape) +.set_attr("FInferType", TakeWeightOpType) +.set_attr("FInferStorageType", TakeWeightOpStorageType) +.set_attr("FCompute", TakeWeightOpForwardCPU) +.add_argument( + "weight", + "NDArray-or-Symbol", + "Tensor already in intgemm weight format to select from") +.add_argument("indices", "NDArray-or-Symbol", "indices to select on the 0th dimension of weight"); + +} // namespace op +} // namespace mxnet diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h index eca7eaa057fc..1987fa1c0095 100644 --- a/src/storage/cpu_device_storage.h +++ b/src/storage/cpu_device_storage.h @@ -50,7 +50,7 @@ class CPUDeviceStorage { /*! * \brief Alignment of allocation. */ -#if MXNET_USE_MKLDNN == 1 +#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1 // MKLDNN requires special alignment. 64 is used by the MKLDNN library in // memory allocation. static constexpr size_t alignment_ = kMKLDNNAlign; diff --git a/src/storage/storage_manager_helpers.h b/src/storage/storage_manager_helpers.h index 14f9ea7727fc..dd5ff16f3c53 100644 --- a/src/storage/storage_manager_helpers.h +++ b/src/storage/storage_manager_helpers.h @@ -120,7 +120,7 @@ class ContextHelperCPU : public ContextHelper { } private: -#if MXNET_USE_MKLDNN == 1 +#if MXNET_USE_MKLDNN == 1 || MXNET_USE_INTGEMM == 1 // MKLDNN requires special alignment. 64 is used by the MKLDNN library in // memory allocation. static constexpr size_t alignment_ = kMKLDNNAlign; diff --git a/tests/python/unittest/test_contrib_intgemm.py b/tests/python/unittest/test_contrib_intgemm.py new file mode 100644 index 000000000000..a6c28fdc7448 --- /dev/null +++ b/tests/python/unittest/test_contrib_intgemm.py @@ -0,0 +1,219 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import mxnet as mx +from mxnet import np, npx +from mxnet.test_utils import same, use_np, assert_almost_equal +from common import with_seed +import random +import pytest + +@use_np +@with_seed() +@pytest.mark.parametrize('shape', + [(3, 2), (9,17), (2, 7, 1, 8)] + [(i,) for i in range(1,65)]) +def test_contrib_intgemm_maxabsolute(shape): + if "intgemm_maxabsolute" not in dir(mx.nd.contrib): + return + # mx.nd API + m = mx.nd.random_uniform(low=-100.0, high=100.0, shape=shape) + fast = mx.nd.contrib.intgemm_maxabsolute(m) + slow = mx.nd.max(mx.nd.abs(m)) + assert same(fast, slow) + # np API + m = np.random.uniform(low=-100.0, high=100.0, size=shape) + fast = npx.intgemm_maxabsolute(m).reshape(()) + slow = np.max(np.abs(m)) + assert same(fast, slow) + +@use_np +@with_seed() +@pytest.mark.parametrize('shape', [(i,) for i in range(1, 67)] + [(2,3), (130, 12)]) +@pytest.mark.parametrize('max_quant', [2.0]) +def test_contrib_intgemm_prepare_data(shape, max_quant): + if "intgemm_prepare_data" not in dir(mx.nd.contrib): + return + m = mx.nd.random_uniform(low=-3.0, high=3.0, shape=shape) + scaled = m * 127.0 / max_quant + # Rounding 0.5 can go up or down. Move values away from 0.5. + too_close = mx.nd.abs(mx.nd.round(scaled) - scaled) > 0.45 + m += max_quant / 127.0 * 0.05 * too_close + + # Reference: scale and round + ref = mx.nd.round(m * 127.0 / max_quant) + # Clip to [-127, 127]. Because otherwise e.g. -129 casts to +127. + ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127.0])) + ref = mx.nd.broadcast_minimum(ref, mx.nd.array([127.0])) + # Reference: cast to int8 + ref = mx.nd.cast(ref, dtype='int8') + # Reference: ban -128 + ref = mx.nd.broadcast_maximum(ref, mx.nd.array([-127], dtype = 'int8')) + + test = mx.nd.contrib.intgemm_prepare_data(m, mx.nd.array([max_quant])) + assert same(test, ref) + test = npx.intgemm_prepare_data(m.as_np_ndarray(), np.array([max_quant])) + assert same(test, ref.as_np_ndarray()) + +@use_np +@with_seed() +@pytest.mark.parametrize('shape', [(8, 64), (16, 64), (8, 128), (16, 128), (2, 4, 64)]) +@pytest.mark.parametrize('max_quant', [0.2, 3.0]) +@pytest.mark.parametrize('api', [(mx.nd.contrib, mx.nd), (npx, np)]) +def test_contrib_intgemm_weight_consistent(shape, max_quant, api): + # The weight format is actually CPU-dependent so we don't directly test the + # output, but indirectly test that it works. + if "intgemm_prepare_weight" not in dir(mx.nd.contrib): + return + contrib, top = api + max_array = top.array([max_quant]) + if top == mx.nd: + m = top.random_uniform(low=-3.0, high=3.0, shape=shape) + else: + m = np.random.uniform(size=shape) + direct = contrib.intgemm_prepare_weight(m, max_array) + quant = contrib.intgemm_prepare_data(m, max_array) + indirect = contrib.intgemm_prepare_weight(quant, already_quantized=True) + # Should get the same data from direct call and already_quantized version. 
+ assert same(direct, indirect) + +@use_np +@with_seed() +@pytest.mark.parametrize('indices', [ + [0,1,2,3,4,5,6,7], + [1,2,1,2,1,2,1,2], + [7,6,5,4,3,2,1,0], + [3,1,4,1,5,9,2,6], + # Since random_uniform doesn't support int8, use python + [random.randint(0,15) for i in range(8)], + [random.randint(0,15) for i in range(16)], + [random.randint(0,15) for i in range(24)] + ]) +@pytest.mark.parametrize('api', [(mx.nd.contrib, mx.nd), (npx, np)]) +def test_contrib_intgemm_take_weight(indices, api): + if "intgemm_take_weight" not in dir(mx.nd.contrib): + return + contrib, top = api + m = top.array([random.randint(-127,127) for i in range(16 * 64)], dtype='int8') + m = m.reshape((16, 64)) + indices = top.array(indices, dtype='int32') + # Prepare weight then take. + test = contrib.intgemm_prepare_weight(m, already_quantized=True) + test = contrib.intgemm_take_weight(test, indices) + # Take then prepare. + ref = m.take(indices, axis=0) + ref = contrib.intgemm_prepare_weight(ref, already_quantized=True) + assert same(test, ref) + +@use_np +@pytest.mark.parametrize('data_rows', range(1, 5)) +@pytest.mark.parametrize('inner', range(64, 256, 64)) +@pytest.mark.parametrize('weight_cols', range(8, 24, 8)) +@pytest.mark.parametrize('api', [ + (mx.nd.contrib, mx.nd, mx.nd.FullyConnected, mx.nd.cast), + (npx, np, npx.fully_connected, npx.cast)]) +def test_contrib_intgemm_multiply(data_rows, inner, weight_cols, api): + if "intgemm_fully_connected" not in dir(mx.nd.contrib): + return + contrib, top, fully_connected, cast = api + #The multiplication routine has approximations so everything is tested + #deterministically to ensure bounds are met. + random.seed(1) + + # Don't use full range (-127, 127) to avoid saturation. + data = [random.randint(-64, 64) for i in range(data_rows * inner)] + data = top.array(data, dtype='int8').reshape((data_rows, inner)) + weight = [random.randint(-64, 64) for i in range(inner * weight_cols)] + weight = top.array(weight, dtype='int8').reshape((weight_cols, inner)) + weight_prepared = contrib.intgemm_prepare_weight(weight, already_quantized=True) + + # int32 output, no bias + test = contrib.intgemm_fully_connected(data, + weight_prepared, + no_bias=True, + flatten=False, + out_type='int32', + num_hidden=weight_cols) + ref = fully_connected(cast(data, dtype='float32'), + cast(weight, dtype='float32'), + no_bias=True, + flatten=False, + num_hidden=weight_cols) + assert_almost_equal(cast(test, dtype='float32'), ref, rtol=0.01, atol=0.01) + + # float32 output, no bias + scale = 3.0 + test = contrib.intgemm_fully_connected(data, + weight_prepared, + top.array([scale]), + no_bias=True, + flatten=False, + out_type='float32', + num_hidden=weight_cols) + assert_almost_equal(test, ref * scale, rtol=0.01, atol=0.01) + + # int32 output, bias + bias = top.array([random.randint(-60000, 60000) for i in range(weight_cols)], dtype = 'int32') + test = contrib.intgemm_fully_connected(data, + weight_prepared, + bias, + no_bias=False, + flatten=False, + out_type='int32', + num_hidden=weight_cols) + ref = fully_connected(cast(data, dtype='float32'), + cast(weight, dtype='float32'), + cast(bias, dtype='float32'), + no_bias=False, + flatten=False, + num_hidden=weight_cols) + assert_almost_equal(cast(test, dtype='float32'), ref, rtol=0.01, atol=0.01) + + # float32 output, bias + # Scaling is applied before bias (and bias is not scaled). So to make the + # reference comparison easy, just scale the bias beforehand. 
+ test = contrib.intgemm_fully_connected(data, + weight_prepared, + top.array([scale]), + cast(bias, dtype='float32') * scale, + no_bias=False, + flatten=False, + out_type='float32', + num_hidden=weight_cols) + assert_almost_equal(test, ref * scale, rtol=0.01, atol=0.01) + + # float32 input should work the same as manually prepared int8 input. + data_float = top.array([random.uniform(-3.14, 3.14) for i in range(data_rows * inner)]) + data_float = data_float.reshape(data_rows, inner) + direct = contrib.intgemm_fully_connected(data_float, + weight_prepared, + top.array([scale]), + cast(bias, dtype='float32'), + no_bias=False, + flatten=False, + out_type='float32', + num_hidden=weight_cols) + maxabs = contrib.intgemm_maxabsolute(data_float) + data_prepared = contrib.intgemm_prepare_data(data_float, maxabs) + cooked = contrib.intgemm_fully_connected(data_prepared, + weight_prepared, + top.array(scale * maxabs / 127.0), + cast(bias, dtype='float32'), + no_bias=False, + flatten=False, + out_type='float32', + num_hidden=weight_cols) + assert_almost_equal(direct, cooked, rtol=0.01, atol=0.01)