Commit a7e8a77 ("fixes")
1 parent ec336a9

10 files changed: +16 -24 lines

sharktank/sharktank/examples/export_paged_llm_v1.py (+1, -3)

@@ -60,21 +60,19 @@ def main():
     dataset_type = cli.get_input_data_files(args)
     dataset_type = "irpa" if "irpa" in dataset_type else "gguf"
     dataset = cli.get_input_dataset(args)
-
-    kv_cache_dtype = getattr(torch, args.kv_cache_dtype)
     hp = configs.LlamaHParams.from_gguf_props(dataset.properties)
     tensor_parallelism_size = (
         dataset.properties["tensor_parallelism_size"]
         if "tensor_parallelism_size" in dataset.properties
         else 1
     )
+
     llama_config = LlamaModelConfig(
         hp,
         tensor_parallelism_size=tensor_parallelism_size,
         use_hf=False,
         static_tables=False,  # Rely on the compiler for hoisting tables.
         kv_cache_type="direct" if args.bs == [1] else "paged",
-        kv_cache_dtype=kv_cache_dtype,
         attention_kernel=args.attention_kernel,
     )
     llama_config.fake_quant = args.fake_quant

sharktank/sharktank/examples/paged_llm_v1.py (-4)

@@ -253,15 +253,12 @@ def main():
     cli.add_quantization_options(parser)
     cli.add_model_options(parser)
     args = cli.parse(parser)
-
     device = torch.device(args.device) if args.device else None
     activation_dtype = getattr(torch, args.activation_dtype)
     assert isinstance(activation_dtype, torch.dtype)
-    kv_cache_dtype = getattr(torch, args.kv_cache_dtype)
     dataset = cli.get_input_dataset(args)
     tokenizer = cli.get_tokenizer(args)
     prompts = args.prompt
-
     config = LlamaModelConfig(
         hp=configs.LlamaHParams.from_gguf_props(dataset.properties),
         block_seq_stride=16,
@@ -270,7 +267,6 @@ def main():
         activation_dtype=activation_dtype,
         attention_dtype=activation_dtype,
         attention_kernel=args.attention_kernel,
-        kv_cache_dtype=kv_cache_dtype,
         use_hf=args.use_hf,
         tensor_parallelism_size=args.tensor_parallelism_size,
         fake_quant=args.fake_quant,

sharktank/sharktank/layers/configs/llm_configs.py (+1, -3)

@@ -153,8 +153,6 @@ class LlamaModelConfig:
     # Dtype to use for general FP activations not otherwise configured.
     activation_dtype: torch.dtype = torch.float16
 
-    kv_cache_dtype: torch.dtype = torch.float16
-
     # Dtype to use for attention.
     attention_dtype: torch.dtype = torch.float16
 
@@ -180,4 +178,4 @@ class LlamaModelConfig:
     # the compiler to transform it to an initialization time step. This can
     # be the difference of many gigabytes of static data being embedded in
     # the program and not.
-    static_tables: bool = True
+    static_tables: bool = True

sharktank/sharktank/layers/kv_cache.py (+2, -3)

@@ -191,8 +191,8 @@ def write_timestep(
         update_count = len(cache_partitions)
 
         for b in range(bs):
-            row_index = torch.tensor(b, dtype=torch.int64)
-            row_start_pos = seq_positions[row_index]
+            row_index = torch.tensor([b], dtype=torch.int64)
+            row_start_pos = seq_positions[row_index].unsqueeze(0)
 
             for i, update in enumerate(cache_partitions):
                 cache = state[transformer_block_index * update_count + i]
@@ -477,7 +477,6 @@ def write(
 
         page_ids = page_ids.flatten(0, 1)
         part_block_view = part_block_view.flatten(0, 1)
-        part_block_view = ops.to(part_block_view, page_table.dtype)
 
         page_table.index_put_(
             (
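A note on the write_timestep change above: indexing seq_positions with a 1-D [b] tensor (and unsqueezing the result) keeps explicit batch/time dimensions instead of collapsing them to 0-d scalars, which appears to be what the subsequent indexed cache write expects. A minimal shape sketch (illustrative values only, not the sharktank cache layout):

import torch

seq_positions = torch.tensor([5, 9])  # one start position per batch row
b = 0
scalar_index = torch.tensor(b, dtype=torch.int64)    # previous style: 0-d index
vector_index = torch.tensor([b], dtype=torch.int64)  # new style: 1-d index

print(seq_positions[scalar_index].shape)               # torch.Size([])  (0-d)
print(seq_positions[vector_index].unsqueeze(0).shape)  # torch.Size([1, 1])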

sharktank/sharktank/layers/linear.py (+9, -5)

@@ -30,6 +30,10 @@ class LinearLayer(ThetaLayer):
     if premul_input is not None:
       x = x * premul_input
     matmul(x, weight.T) + bias
+
+    fake_quant exists to allow export without adding dequant ops.
+    when fake_quant is True, the op will in quant dequant fashion.
+    When false, it will keep quantized types.
     ```
     """
 
@@ -70,21 +74,21 @@ def forward(self, x):
             x = q_input.quantize(x)
             if self.fake_quant:
                 x = x.unpack().dequant()
+        elif qdq_input is not None and self.fake_quant:
+            x = qdq_input.quantize(x).unpack().dequant()
 
         y = ops.linear(x, weight, bias)
 
         # Unconditionally dequantize.
-        # TODO: Support a q_output specifier that signals the layer to let
-        # the QuantizedTensor escape.
         if isinstance(y, QuantizedTensor) and not self.fake_quant:
             y = y.unpack().dequant()
         # Note that f8_e4m3fnuz types on AMD GPUs accumulate to fp32.
         # We can truncate to fp16 in iree, so we do a cast here
-        # to account for this in the IR.
+        # to account for this in the IR. This is may not be the right
+        # level to do this, but for now its here.
         if not self.fake_quant and y.dtype == torch.float8_e4m3fnuz:
             y = ops.to(y, torch.float16)
             return y
-        if qdq_output is not None:
-            # TODO: same as above.
+        if qdq_output is not None and self.fake_quant:
             y = qdq_output.quantize(y).unpack().dequant()
         return y
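To make the new docstring concrete, here is a minimal, self-contained sketch of the two fake_quant modes. This is illustrative only: it is not the sharktank LinearLayer, and the int8 quantize/dequantize helpers and per-tensor scales are invented for the example.

import torch

def quantize(t, scale):
    # Symmetric int8 quantization with a fixed per-tensor scale.
    return torch.clamp((t / scale).round(), -128, 127).to(torch.int8)

def dequantize(q, scale):
    return q.to(torch.float32) * scale

def linear_sketch(x, weight, bias, x_scale, w_scale, fake_quant):
    if fake_quant:
        # fake_quant=True: quant/dequant round trip stays in float, so no
        # dequant ops survive into the exported program; only the rounding
        # error of quantization is baked in.
        x = dequantize(quantize(x, x_scale), x_scale)
        w = dequantize(quantize(weight, w_scale), w_scale)
        return x @ w.T + bias
    # fake_quant=False: run the matmul on the quantized values (with a wide
    # accumulator) and dequantize once at the end.
    xq = quantize(x, x_scale).to(torch.int64)
    wq = quantize(weight, w_scale).to(torch.int64)
    acc = xq @ wq.T
    return dequantize(acc, x_scale * w_scale) + bias

# Both paths approximate the same float matmul; they differ only in where the
# dequantization happens, which is what the fake_quant flag controls.
x, w, b = torch.randn(4, 8), torch.randn(16, 8), torch.randn(16)
y_fake = linear_sketch(x, w, b, 0.05, 0.05, fake_quant=True)
y_quant = linear_sketch(x, w, b, 0.05, 0.05, fake_quant=False)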

sharktank/sharktank/layers/paged_llama_attention_block.py (-3)

@@ -151,9 +151,6 @@ def forward(
             xk = self.cache_quantizer.quantize(xk).unpack().qs
             xv = self.cache_quantizer.quantize(xv).unpack().qs
 
-            print(xk.dtype)
-            print(xv.dtype)
-            print(self.cache.dtype)
         xk, xv = self.transact_cache(
             xk_cache_update=xk,
             xv_cache_update=xv,

sharktank/sharktank/ops/qlinear_impls.py (+2)

@@ -55,6 +55,8 @@ def qlinear_tensor_scaled(
             return matmul(x_layout.qs, weight_layout.qs, transpose_rhs=True).to(
                 torch.float16
             )
+        else:
+            return NotImplemented
 
     # Bias.
     quantized_bias_accum = False
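The added return NotImplemented presumably follows the usual override convention: an implementation that cannot handle the given operands signals the dispatcher to try the next registered one instead of raising. A generic sketch of that pattern (hypothetical functions, not sharktank's dispatcher):

def fast_path(x, w):
    # Specialized implementation: only handles integer operands.
    if isinstance(x, int) and isinstance(w, int):
        return x * w
    return NotImplemented  # defer to the next registered implementation

def generic_path(x, w):
    return float(x) * float(w)

def dispatch(x, w, impls=(fast_path, generic_path)):
    for impl in impls:
        result = impl(x, w)
        if result is not NotImplemented:
            return result
    raise TypeError("no implementation matched the given operands")

print(dispatch(3, 4))      # handled by fast_path
print(dispatch(2.5, 4.0))  # fast_path defers, generic_path handles it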

sharktank/sharktank/types/quantizers.py (-1)

@@ -83,7 +83,6 @@ def _quantize_raw_tensor(self, t: torch.Tensor, *, name: str) -> QuantizedTensor
         ...
 
 
-
 @register_inference_tensor
 class StaticScaledQuantizer(QuantizerTensor):
     """Quantizes to a `TensorScaledLayout` (per-tensor) or (TBD) for per-axis.

sharktank/tests/layers/linear_test.py (+1, -1)

@@ -84,7 +84,7 @@ def testNativeQuant_SymPerTensor_AsymPerAxis0_Dynamic(self):
                 bias_quant,
             ]
         )
-        linear = LinearLayer(theta)
+        linear = LinearLayer(theta, fake_quant=False)
 
         output = linear(lhs)
         output_ref = torch.matmul(lhs, rhs.T) + bias

sharktank/tests/layers/paged_llama_attention_block_test.py (-1)

@@ -115,7 +115,6 @@ def forward(self, h, seq_block_ids, cache_state):
         output = aot.export(ep)
         output.verify()
         asm = str(output.mlir_module)
-        output.save_mlir("temp.mlir")
         self.assertNotIn("scaled_dot_product_attention", asm)
 
     def testExportNondecomposed(self):
