Support per-axis quantization (#2390)

Reworked the simplify_qdq pass to support: (1) per-axis quantization (i.e., allow 1D scales and zero points); (2) broadcast and transpose ops between dq and quant_op.
ROCm · Nov 15, 2023 · 0039b11 · 0039b11
1 parent b2a40ea
commit 0039b11
Show file tree

Hide file tree

Showing 4 changed files with 555 additions and 175 deletions.
diff --git a/src/include/migraphx/matcher.hpp b/src/include/migraphx/matcher.hpp
@@ -591,6 +591,19 @@ MIGRAPHX_PRED_MATCHER(same_input_shapes, instruction_ref ins)
         ins->inputs().begin(), ins->inputs().end(), [&](auto x) { return x->get_shape() == s; });
 }
 
+MIGRAPHX_PRED_MATCHER(has_same_value, instruction_ref ins) // matches a literal whose elements all equal its first
+{
+    if(ins->name() != "@literal") // anything that is not a literal never matches
+        return false;
+    bool all_same = false;
+    ins->get_literal().visit([&](auto s) { // s: the literal's elements as handed over by visit()
+        all_same = std::all_of(s.begin() + 1, s.end(), [&](const auto& scale) { // elements 1..end
+            return float_equal(scale, s.front()); // compared against the first element
+        });
+    });
+    return all_same; // NOTE(review): begin() + 1 / front() presume a non-empty literal -- confirm
+}
+
 MIGRAPHX_BASIC_MATCHER(output, const matcher_context&, instruction_ref ins)
 {
     if(ins->outputs().size() == 1)
@@ -844,6 +857,12 @@ auto skip_broadcasts_converts(Ms... ms)
     return skip(name("broadcast", "multibroadcast", "contiguous", "convert"))(ms...);
 }
 
+template <class... Ms>
+auto skip_broadcasts_transposes_contiguous(Ms... ms) // wraps ms... in skip() over the op names below
+{
+    return skip(name("broadcast", "multibroadcast", "contiguous", "transpose"))(ms...); // cf. skip_broadcasts_converts: "transpose" here instead of "convert"
+}
+
 template <class T>
 inline auto has_value(T x, float tolerance = 1e-6)
 {

diff --git a/src/simplify_qdq.cpp b/src/simplify_qdq.cpp
@@ -45,77 +45,145 @@ std::unordered_set<std::string> get_quantizable_op_names()
     return s;
 }
 
-MIGRAPHX_PRED_MATCHER(has_same_value, instruction_ref ins)
+struct match_find_quantizable_ops
 {
-    if(ins->name() != "@literal")
-        return false;
-    bool all_same = false;
-    ins->get_literal().visit([&](auto s) {
-        all_same = std::all_of(s.begin() + 1, s.end(), [&](const auto& scale) {
-            return float_equal(scale, s.front());
+    static bool // true when scale is scalar or has exactly lens[axis] elements
+    is_valid_scale(instruction_ref scale, std::vector<std::size_t> lens, std::size_t axis)
+    {
+        return scale->get_shape().scalar() or scale->get_shape().elements() == lens.at(axis); // at() throws if axis is out of range for lens
+    }
+
+    static bool is_valid_zero_point(instruction_ref zp) // true only if zp can be evaluated and every element is 0
+    {
+        if(not zp->can_eval()) // values cannot be inspected, so reject
+            return false;
+
+        bool all_zeros = false;
+        zp->eval().visit([&](auto z) {
+            all_zeros =
+                std::all_of(z.begin(), z.end(), [&](auto val) { return float_equal(val, 0); }); // an empty z yields true
         });
-    });
-    return all_same;
-}
+        return all_zeros;
+    }
 
-struct match_find_quantizable_ops
-{
+    static auto // picks the op used to expand scale to the dims in lens
+    scale_broadcast_op(instruction_ref scale, std::vector<std::size_t> lens, std::size_t axis)
+    {
+        if(scale->get_shape().scalar())
+        {
+            return migraphx::make_op("multibroadcast", {{"out_lens", lens}}); // scalar scale: axis is not used
+        }
+        else
+        {
+            return migraphx::make_op("broadcast", {{"out_lens", lens}, {"axis", axis}}); // non-scalar scale: axis is passed to the op
+        }
+    }
 
-    static auto dequantizelinear_op(const std::string& name, const std::string& scale)
+    // Re-applies, on the input of dqins, the operator of every instruction walked from dqins up to
+    // (not including) qop, inserting each at qop; returns the last one (or that input if none).
+    static auto
+    propagate_quantized_ins(module& m, const instruction_ref dqins, const instruction_ref qop)
+    {
+        auto qinp     = dqins->inputs().front(); // first input of the dequantizelinear
+        auto next_ins = dqins;
+
+        while(next_ins != qop)
+        {
+            if(next_ins->name() != "dequantizelinear") // the dq itself is dropped from the rebuilt chain
+            {
+                qinp = m.insert_instruction(qop, next_ins->get_operator(), qinp);
+            }
+            next_ins = next_ins->outputs().front(); // NOTE(review): only the first consumer is followed; assumes it leads to qop -- confirm
+        }
+        return qinp;
+    }
+
+    static auto dequantizelinear_op(const std::string& scale, const std::string& zp) // scale, zp: names under which args 1 and 2 are bound
     {
         return match::name("dequantizelinear")(
-            match::arg(0)(match::skip(match::name("quantizelinear"))(match::any().bind(name))),
-            match::arg(1)(match::skip_broadcasts(has_same_value().bind(scale))),
-            match::arg(2)(match::skip_broadcasts(match::all_of(match::has_value(0)))));
+            match::arg(0)(match::skip(match::name("quantizelinear"))(match::any())), // arg 0 is no longer bound
+            match::arg(1)(match::skip_broadcasts(match::is_constant().bind(scale))), // values are checked later in apply()
+            match::arg(2)(match::skip_broadcasts(match::is_constant().bind(zp)))); // values are checked later in apply()
     }
 
     auto matcher() const
     {
         return match::name(get_quantizable_op_names())(
-            match::arg(0)(dequantizelinear_op("x1", "scale1")),
-            match::arg(1)(dequantizelinear_op("x2", "scale2")));
+            match::arg(0)(match::skip_broadcasts_transposes_contiguous(
+                dequantizelinear_op("scale1", "zp1").bind("dq1"))), // binds "dq1", "scale1", "zp1" for apply()
+            match::arg(1)(match::skip_broadcasts_transposes_contiguous(
+                dequantizelinear_op("scale2", "zp2").bind("dq2")))); // binds "dq2", "scale2", "zp2" for apply()
     }
 
     void apply(module& m, const match::matcher_result& r) const
     {
         auto qop    = r.result;
-        auto q1     = r.instructions["x1"];
-        auto q2     = r.instructions["x2"];
+        auto dq1    = r.instructions["dq1"]; // dequantizelinear feeding arg 0 (bound in matcher())
+        auto dq2    = r.instructions["dq2"]; // dequantizelinear feeding arg 1 (bound in matcher())
         auto scale1 = r.instructions["scale1"];
         auto scale2 = r.instructions["scale2"];
+        auto zp1    = r.instructions["zp1"];
+        auto zp2    = r.instructions["zp2"];
 
         // Only INT8 type currently supported
-        if(q1->get_shape().type() != migraphx::shape::int8_type or
-           q2->get_shape().type() != migraphx::shape::int8_type)
+        if(dq1->inputs().front()->get_shape().type() != migraphx::shape::int8_type or
+           dq2->inputs().front()->get_shape().type() != migraphx::shape::int8_type) // type of each dq's first input
             return;
 
-        double scale;
-        visit_all(scale1->get_literal(), scale2->get_literal())(
-            [&](const auto s1, const auto s2) { scale = s1.front() * s2.front(); });
+        // Only symmetric quantization is supported (i.e. every zero point must evaluate to all zeros)
+        if(not(is_valid_zero_point(zp1) and is_valid_zero_point(zp2)))
+            return;
 
+        // Only scales whose shape has exactly one dimension pass (presumably this includes scalars)
+        if(scale1->get_shape().lens().size() != 1 or scale2->get_shape().lens().size() != 1)
+            return;
+
+        // Rebuild, on the inputs of dq1 and dq2, any ops found between each dq and qop
         auto qop_args  = qop->inputs();
-        qop_args.at(0) = q1;
-        qop_args.at(1) = q2;
+        qop_args.at(0) = propagate_quantized_ins(m, dq1, qop);
+        qop_args.at(1) = propagate_quantized_ins(m, dq2, qop);
         instruction_ref dq;
-        instruction_ref dq_scale;
+        instruction_ref out_scale; // assigned only in the convolution and dot branches below
         instruction_ref zero_point;
         if(qop->name() == "convolution")
         {
             auto conv_val = qop->get_operator().to_value();
             dq            = m.insert_instruction(
                 qop, migraphx::make_op("quant_convolution", conv_val), qop_args);
+            auto out_lens = dq->get_shape().lens();
+
+            // Input scale is expected to be scalar; weight scale may be scalar or 1D with
+            // out_lens[1] elements. NOTE(review): the check below applies one rule to both.
+            if(not(is_valid_scale(scale1, out_lens, 1) and is_valid_scale(scale2, out_lens, 1)))
+                return; // NOTE(review): dq and any propagated ops inserted above are not removed on this path
+
+            auto s1_bcast =
+                m.insert_instruction(qop, scale_broadcast_op(scale1, out_lens, 1), scale1);
+            auto s2_bcast =
+                m.insert_instruction(qop, scale_broadcast_op(scale2, out_lens, 1), scale2);
+
+            out_scale = m.insert_instruction(qop, migraphx::make_op("mul"), s1_bcast, s2_bcast); // elementwise product of the two broadcast scales
         }
         else if(qop->name() == "dot")
         {
-            dq = m.insert_instruction(qop, migraphx::make_op("quant_dot"), qop_args);
+            dq            = m.insert_instruction(qop, migraphx::make_op("quant_dot"), qop_args);
+            auto out_lens = dq->get_shape().lens();
+
+            // For (..., M, N) x (..., N, K) dot, only support cases where quantization axis is M
+            // for input1 and K for input 2
+            if(not(is_valid_scale(scale1, out_lens, out_lens.size() - 2) and
+                   is_valid_scale(scale2, out_lens, out_lens.size() - 1)))
+                return; // NOTE(review): dq and any propagated ops inserted above are not removed on this path
+
+            auto s1_bcast = m.insert_instruction(
+                qop, scale_broadcast_op(scale1, out_lens, out_lens.size() - 2), scale1);
+            auto s2_bcast = m.insert_instruction(
+                qop, scale_broadcast_op(scale2, out_lens, out_lens.size() - 1), scale2);
+
+            out_scale = m.insert_instruction(qop, migraphx::make_op("mul"), s1_bcast, s2_bcast); // elementwise product of the two broadcast scales
         }
-        auto ins_type = qop->get_shape().type();
-        dq_scale      = m.add_literal(literal({ins_type}, {scale}));
 
-        auto lens = dq->get_shape().lens();
-        auto scale_mb =
-            m.insert_instruction(qop, make_op("multibroadcast", {{"out_lens", lens}}), dq_scale);
-        dq = m.insert_instruction(qop, make_op("dequantizelinear"), dq, scale_mb);
+        dq = m.insert_instruction(qop, make_op("dequantizelinear"), dq, out_scale); // NOTE(review): assumes qop is "convolution" or "dot"; otherwise dq/out_scale are unset
         m.replace_instruction(qop, dq);
     }
 };