From f8956474e1e4d9bf612f630f4f157d9aa8ddb010 Mon Sep 17 00:00:00 2001
From: Jim Blandy <jimb@red-bean.com>
Date: Wed, 30 Apr 2025 00:43:56 -0700
Subject: [PATCH] Fix return types of dot4add_i8packed, dot4add_u8packed, and
 dot2add.

Change the definition of the HLSL `dot4add_i8packed`,
`dot4add_u8packed`, and `dot2add` intrinsics in
`utils/hct/gen_intrin_main.txt` to simply spell out the return types,
rather than saying that their return type is determined by their third
argument.

This prevents DXC from trying to give those functions declarations like

    declare i64 @"\01?dot4add_u8packed@hlsl@@YA_JII_J@Z"(i32, i32, i64 signext) #1

which seems to expect a 64-bit third argument and return value.

`HLSLExternalSource::MatchArguments` assumes that functions whose
return type depends on their arguments' types will get cleaned up by
`TryEvalIntrinsic`. Unfortunately, the `dot4add` variants cannot be
constant expressions, so this cleanup does not happen for them. But
these functions are not generic, and they have only one overload, so
there is no need to use interesting `uComponentTypeId` values to get
the right effects in the first place.

Fixes #7400.
---
 tools/clang/lib/Sema/SemaHLSL.cpp             |  4 +--
 .../test/DXC/dot4add_i8_u8_packed-types.hlsl  | 34 +++++++++++++++++++
 utils/hct/gen_intrin_main.txt                 |  6 ++--
 3 files changed, 39 insertions(+), 5 deletions(-)
 create mode 100644 tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl
diff --git a/tools/clang/lib/Sema/SemaHLSL.cpp b/tools/clang/lib/Sema/SemaHLSL.cpp
index ba0801dd52..ed0291b909 100644
--- a/tools/clang/lib/Sema/SemaHLSL.cpp
+++ b/tools/clang/lib/Sema/SemaHLSL.cpp
@@ -6216,8 +6216,8 @@ bool HLSLExternalSource::MatchArguments(
           (iArg != retArgIdx && retTypeIdx == pIntrinsicArg->uComponentTypeId);
       // For literal arg which don't affect return type, find concrete type.
       // For literal arg affect return type,
-      //   TryEvalIntrinsic in CGHLSLMS.cpp will take care of cases
-      //     where all argumentss are literal.
+      //   TryEvalIntrinsic in CGHLSLMSFinishCodeGen.cpp will take care of
+      //     cases where all arguments are literal.
       //   CombineBasicTypes will cover the rest cases.
       if (!affectRetType) {
         TypeInfoEltKind =
diff --git a/tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl b/tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl
new file mode 100644
index 0000000000..53c87bb9c1
--- /dev/null
+++ b/tools/clang/test/DXC/dot4add_i8_u8_packed-types.hlsl
@@ -0,0 +1,34 @@
+// RUN: %dxc /enable-16bit-types /T cs_6_8 %s | FileCheck %s
+
+// Compiling this HLSL would fail this assertion in TranslateDot4AddPacked:
+//
+//     DXASSERT(
+//         !accTy->isVectorTy() && accTy->isIntegerTy(32),
+//         "otherwise, unexpected vector support in high level intrinsic template");
+//
+// Bug was fixed by changing the declarations of dot4add_i8packed,
+// dot4add_u8packed, and dot2add in utils/hct/gen_intrin_main.txt to
+// spell out their return types, rather than deriving them from the
+// third argument via the $type3 reference syntax.
+
+// CHECK: call i32 @dx.op.dot4AddPacked.i32{{.*}}Dot4AddI8Packed(acc,a,b)
+// CHECK: call i32 @dx.op.dot4AddPacked.i32{{.*}}Dot4AddU8Packed(acc,a,b)
+// CHECK: call float @dx.op.dot2AddHalf.f32{{.*}}Dot2AddHalf(acc,ax,ay,bx,by)
+
+RWByteAddressBuffer buf; // Destination for the results stored by main().
+
+[numthreads(1, 1, 1)]
+void main() // Calls each intrinsic twice: literal accumulator, then a variable.
+{
+    int a = dot4add_i8packed(0, 0, 0); // Every argument is a literal.
+    int b = dot4add_i8packed(0, 0, a); // Accumulator is an int variable.
+    buf.Store<int>(0, b);
+
+    uint c = dot4add_u8packed(0, 0, 0); // Every argument is a literal.
+    uint d = dot4add_u8packed(0, 0, c); // Accumulator is a uint variable.
+    buf.Store<uint>(4, d);
+
+    float e = dot2add(half2(0,0), half2(0,0), 1.0); // Literal accumulator.
+    float f = dot2add(half2(0,0), half2(0,0), e); // Accumulator is a float variable.
+    buf.Store<float>(8, f);
+}
diff --git a/utils/hct/gen_intrin_main.txt b/utils/hct/gen_intrin_main.txt
index 7f7637b230..404ac6eb5a 100644
--- a/utils/hct/gen_intrin_main.txt
+++ b/utils/hct/gen_intrin_main.txt
@@ -336,9 +336,9 @@ float<4,3> [[rn]] ObjectToWorld4x3();
 float<4,3> [[rn]] WorldToObject4x3();
 
 // Packed dot products with accumulate:
-$type3 [[rn]] dot4add_u8packed(in uint a, in $type1 b, in uint c);
-$type3 [[rn]] dot4add_i8packed(in uint a, in $type1 b, in int c);
-$type3 [[rn]] dot2add(in float16_t<2> a, in $type1 b, in float c);
+uint [[rn]] dot4add_u8packed(in uint a, in $type1 b, in uint c);
+int [[rn]] dot4add_i8packed(in uint a, in $type1 b, in int c);
+float [[rn]] dot2add(in float16_t<2> a, in $type1 b, in float c);
 
 // Unpacking intrinsics
 int16_t<4> [[rn]] unpack_s8s16(in p32i8 pk);