Commit

[CIR][CIRGen] Generate CIR for vset_lane and vsetq_lane intrinsics (llvm#882)

As the title says.
Note that for these intrinsics, just like the original CodeGen (OG), we do not
lower to LLVM intrinsics; instead, we emit a vector insert.
The test cases are partially adapted from the OG test
[aarch64-neon-vget.c](https://github.com/llvm/clangir/blob/85bc6407f559221afebe08a60ed2b50bf1edf7fa/clang/test/CodeGen/aarch64-neon-vget.c).
However, I did not write tests for both signed and unsigned integers, because
signed and unsigned types of the same width use the same intrinsic ID and thus
take exactly the same code path as far as this PR is concerned.
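
For illustration, a minimal sketch of what this lowering does for one of the
supported intrinsics (the function name here is hypothetical); the CIR and LLVM
output noted in the comments mirrors the checks in the test added below.

#include <arm_neon.h>

// Hypothetical example: insert scalar `a` into lane 7 of vector `b`.
uint8x8_t set_lane_example(uint8_t a, uint8x8_t b) {
  // CIR:  cir.vec.insert into !cir.vector<!s8i x 8> at constant index 7
  // LLVM: insertelement <8 x i8> %b, i8 %a, i32 7 -- no llvm.aarch64.neon.* call
  return vset_lane_u8(a, b, 7);
}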

---------

Co-authored-by: Guojin He <[email protected]>
2 people authored and smeenai committed Oct 9, 2024
1 parent 8da22e3 commit b18d38d
Showing 2 changed files with 246 additions and 2 deletions.
10 changes: 8 additions & 2 deletions clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -2162,14 +2162,20 @@ CIRGenFunction::buildAArch64BuiltinExpr(unsigned BuiltinID, const CallExpr *E,
   case NEON::BI__builtin_neon_vset_lane_i16:
   case NEON::BI__builtin_neon_vset_lane_i32:
   case NEON::BI__builtin_neon_vset_lane_i64:
-  case NEON::BI__builtin_neon_vset_lane_bf16:
   case NEON::BI__builtin_neon_vset_lane_f32:
   case NEON::BI__builtin_neon_vsetq_lane_i8:
   case NEON::BI__builtin_neon_vsetq_lane_i16:
   case NEON::BI__builtin_neon_vsetq_lane_i32:
   case NEON::BI__builtin_neon_vsetq_lane_i64:
-  case NEON::BI__builtin_neon_vsetq_lane_bf16:
   case NEON::BI__builtin_neon_vsetq_lane_f32:
+    Ops.push_back(buildScalarExpr(E->getArg(2)));
+    return builder.create<mlir::cir::VecInsertOp>(getLoc(E->getExprLoc()),
+                                                  Ops[1], Ops[0], Ops[2]);
+  case NEON::BI__builtin_neon_vset_lane_bf16:
+  case NEON::BI__builtin_neon_vsetq_lane_bf16:
+    // No support for now, as there is no real/test case for them at the
+    // moment; the implementation should be the same as the vset_lane and
+    // vsetq_lane handling above.
     llvm_unreachable("NYI");
   case NEON::BI__builtin_neon_vset_lane_f64:
     // The vector type needs a cast for the v1f64 variant.
238 changes: 238 additions & 0 deletions clang/test/CIR/CodeGen/aarch64-neon-vset.c
@@ -0,0 +1,238 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -emit-cir -target-feature +neon %s -o %t.cir
// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
// RUN: %clang_cc1 -triple aarch64-none-linux-android24 -fclangir \
// RUN: -emit-llvm -target-feature +neon %s -o %t.ll
// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s

// This test file is similar to, but not the same as,
// clang/test/CodeGen/aarch64-neon-vget.c.
// The difference is that this file only tests the vset intrinsics; we feel it
// is proper to test the vget intrinsics in a separate file named
// aarch64-neon-vget.c.
// Also, for each integer type, we only test either signed or unsigned, not
// both, because integer types of the same size use the same intrinsic.

// REQUIRES: aarch64-registered-target || arm-registered-target
#include <arm_neon.h>

uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
return vset_lane_u8(a, b, 7);
}

// CIR-LABEL: test_vset_lane_u8
// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i loc(#loc7)
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s8i x 8>

// LLVM: define dso_local <8 x i8> @test_vset_lane_u8(i8 [[A:%.*]], <8 x i8> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i8, i64 1, align 1
// LLVM: [[B_ADR:%.*]] = alloca <8 x i8>, i64 1, align 8
// LLVM: store i8 [[A]], ptr [[A_ADR]], align 1
// LLVM: store <8 x i8> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load i8, ptr [[A_ADR]], align 1
// LLVM: store i8 [[TMP_A0]], ptr [[S0:%.*]], align 1
// LLVM: [[TMP_B0:%.*]] = load <8 x i8>, ptr [[B_ADR]], align 8
// LLVM: store <8 x i8> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load i8, ptr [[S0]], align 1
// LLVM: [[INTRN_ARG1:%.*]] = load <8 x i8>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <8 x i8> [[INTRN_ARG1]], i8 [[INTRN_ARG0]], i32 7
// LLVM: ret <8 x i8> {{%.*}}

uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
return vset_lane_u16(a, b, 3);
}

// CIR-LABEL: test_vset_lane_u16
// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s16i x 4>

// LLVM: define dso_local <4 x i16> @test_vset_lane_u16(i16 [[A:%.*]], <4 x i16> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i16, i64 1, align 2
// LLVM: [[B_ADR:%.*]] = alloca <4 x i16>, i64 1, align 8
// LLVM: store i16 [[A]], ptr [[A_ADR]], align 2
// LLVM: store <4 x i16> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load i16, ptr [[A_ADR]], align 2
// LLVM: store i16 [[TMP_A0]], ptr [[S0:%.*]], align 2
// LLVM: [[TMP_B0:%.*]] = load <4 x i16>, ptr [[B_ADR]], align 8
// LLVM: store <4 x i16> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load i16, ptr [[S0]], align 2
// LLVM: [[INTRN_ARG1:%.*]] = load <4 x i16>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x i16> [[INTRN_ARG1]], i16 [[INTRN_ARG0]], i32 3
// LLVM: ret <4 x i16> {{%.*}}

uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
return vset_lane_u32(a, b, 1);
}

// CIR-LABEL: test_vset_lane_u32
// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s32i x 2>

// LLVM: define dso_local <2 x i32> @test_vset_lane_u32(i32 [[A:%.*]], <2 x i32> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i32, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <2 x i32>, i64 1, align 8
// LLVM: store i32 [[A]], ptr [[A_ADR]], align 4
// LLVM: store <2 x i32> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load i32, ptr [[A_ADR]], align 4
// LLVM: store i32 [[TMP_A0]], ptr [[S0:%.*]], align 4
// LLVM: [[TMP_B0:%.*]] = load <2 x i32>, ptr [[B_ADR]], align 8
// LLVM: store <2 x i32> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load i32, ptr [[S0]], align 4
// LLVM: [[INTRN_ARG1:%.*]] = load <2 x i32>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x i32> [[INTRN_ARG1]], i32 [[INTRN_ARG0]], i32 1
// LLVM: ret <2 x i32> {{%.*}}


int64x1_t test_vset_lane_u64(int64_t a, int64x1_t b) {
return vset_lane_u64(a, b, 0);
}

// CIR-LABEL: test_vset_lane_u64
// CIR: [[IDX:%.*]] = cir.const #cir.int<0> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s64i x 1>

// LLVM: define dso_local <1 x i64> @test_vset_lane_u64(i64 [[A:%.*]], <1 x i64> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i64, i64 1, align 8
// LLVM: [[B_ADR:%.*]] = alloca <1 x i64>, i64 1, align 8
// LLVM: store i64 [[A]], ptr [[A_ADR]], align 8
// LLVM: store <1 x i64> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load i64, ptr [[A_ADR]], align 8
// LLVM: store i64 [[TMP_A0]], ptr [[S0:%.*]], align 8
// LLVM: [[TMP_B0:%.*]] = load <1 x i64>, ptr [[B_ADR]], align 8
// LLVM: store <1 x i64> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load i64, ptr [[S0]], align 8
// LLVM: [[INTRN_ARG1:%.*]] = load <1 x i64>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <1 x i64> [[INTRN_ARG1]], i64 [[INTRN_ARG0]], i32 0
// LLVM: ret <1 x i64> {{%.*}}

float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
return vset_lane_f32(a, b, 1);
}

// CIR-LABEL: test_vset_lane_f32
// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.float x 2>

// LLVM: define dso_local <2 x float> @test_vset_lane_f32(float [[A:%.*]], <2 x float> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca float, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <2 x float>, i64 1, align 8
// LLVM: store float [[A]], ptr [[A_ADR]], align 4
// LLVM: store <2 x float> [[B]], ptr [[B_ADR]], align 8
// LLVM: [[TMP_A0:%.*]] = load float, ptr [[A_ADR]], align 4
// LLVM: store float [[TMP_A0]], ptr [[S0:%.*]], align 4
// LLVM: [[TMP_B0:%.*]] = load <2 x float>, ptr [[B_ADR]], align 8
// LLVM: store <2 x float> [[TMP_B0]], ptr [[S1:%.*]], align 8
// LLVM: [[INTRN_ARG0:%.*]] = load float, ptr [[S0]], align 4
// LLVM: [[INTRN_ARG1:%.*]] = load <2 x float>, ptr [[S1]], align 8
// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x float> [[INTRN_ARG1]], float [[INTRN_ARG0]], i32 1
// LLVM: ret <2 x float> {{%.*}}

uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
return vsetq_lane_u8(a, b, 15);
}

// CIR-LABEL: test_vsetq_lane_u8
// CIR: [[IDX:%.*]] = cir.const #cir.int<15> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s8i x 16>

// LLVM: define dso_local <16 x i8> @test_vsetq_lane_u8(i8 [[A:%.*]], <16 x i8> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i8, i64 1, align 1
// LLVM: [[B_ADR:%.*]] = alloca <16 x i8>, i64 1, align 16
// LLVM: store i8 [[A]], ptr [[A_ADR]], align 1
// LLVM: store <16 x i8> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load i8, ptr [[A_ADR]], align 1
// LLVM: store i8 [[TMP_A0]], ptr [[S0:%.*]], align 1
// LLVM: [[TMP_B0:%.*]] = load <16 x i8>, ptr [[B_ADR]], align 16
// LLVM: store <16 x i8> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load i8, ptr [[S0]], align 1
// LLVM: [[INTRN_ARG1:%.*]] = load <16 x i8>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <16 x i8> [[INTRN_ARG1]], i8 [[INTRN_ARG0]], i32 15
// LLVM: ret <16 x i8> {{%.*}}

uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
return vsetq_lane_u16(a, b, 7);
}

// CIR-LABEL: test_vsetq_lane_u16
// CIR: [[IDX:%.*]] = cir.const #cir.int<7> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s16i x 8>

// LLVM: define dso_local <8 x i16> @test_vsetq_lane_u16(i16 [[A:%.*]], <8 x i16> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i16, i64 1, align 2
// LLVM: [[B_ADR:%.*]] = alloca <8 x i16>, i64 1, align 16
// LLVM: store i16 [[A]], ptr [[A_ADR]], align 2
// LLVM: store <8 x i16> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load i16, ptr [[A_ADR]], align 2
// LLVM: store i16 [[TMP_A0]], ptr [[S0:%.*]], align 2
// LLVM: [[TMP_B0:%.*]] = load <8 x i16>, ptr [[B_ADR]], align 16
// LLVM: store <8 x i16> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load i16, ptr [[S0]], align 2
// LLVM: [[INTRN_ARG1:%.*]] = load <8 x i16>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <8 x i16> [[INTRN_ARG1]], i16 [[INTRN_ARG0]], i32 7
// LLVM: ret <8 x i16> {{%.*}}

uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
return vsetq_lane_u32(a, b, 3);
}

// CIR-LABEL: test_vsetq_lane_u32
// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s32i x 4>

// LLVM: define dso_local <4 x i32> @test_vsetq_lane_u32(i32 [[A:%.*]], <4 x i32> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i32, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <4 x i32>, i64 1, align 16
// LLVM: store i32 [[A]], ptr [[A_ADR]], align 4
// LLVM: store <4 x i32> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load i32, ptr [[A_ADR]], align 4
// LLVM: store i32 [[TMP_A0]], ptr [[S0:%.*]], align 4
// LLVM: [[TMP_B0:%.*]] = load <4 x i32>, ptr [[B_ADR]], align 16
// LLVM: store <4 x i32> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load i32, ptr [[S0]], align 4
// LLVM: [[INTRN_ARG1:%.*]] = load <4 x i32>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x i32> [[INTRN_ARG1]], i32 [[INTRN_ARG0]], i32 3
// LLVM: ret <4 x i32> {{%.*}}

int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
return vsetq_lane_s64(a, b, 1);
}

// CIR-LABEL: test_vsetq_lane_s64
// CIR: [[IDX:%.*]] = cir.const #cir.int<1> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!s64i x 2>

// LLVM: define dso_local <2 x i64> @test_vsetq_lane_s64(i64 [[A:%.*]], <2 x i64> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca i64, i64 1, align 8
// LLVM: [[B_ADR:%.*]] = alloca <2 x i64>, i64 1, align 16
// LLVM: store i64 [[A]], ptr [[A_ADR]], align 8
// LLVM: store <2 x i64> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load i64, ptr [[A_ADR]], align 8
// LLVM: store i64 [[TMP_A0]], ptr [[S0:%.*]], align 8
// LLVM: [[TMP_B0:%.*]] = load <2 x i64>, ptr [[B_ADR]], align 16
// LLVM: store <2 x i64> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load i64, ptr [[S0]], align 8
// LLVM: [[INTRN_ARG1:%.*]] = load <2 x i64>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <2 x i64> [[INTRN_ARG1]], i64 [[INTRN_ARG0]], i32 1
// LLVM: ret <2 x i64> {{%.*}}

float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
return vsetq_lane_f32(a, b, 3);
}

// CIR-LABEL: test_vsetq_lane_f32
// CIR: [[IDX:%.*]] = cir.const #cir.int<3> : !s32i
// CIR: {{%.*}} = cir.vec.insert {{%.*}}, {{%.*}}[[[IDX]] : !s32i] : !cir.vector<!cir.float x 4>

// LLVM: define dso_local <4 x float> @test_vsetq_lane_f32(float [[A:%.*]], <4 x float> [[B:%.*]])
// LLVM: [[A_ADR:%.*]] = alloca float, i64 1, align 4
// LLVM: [[B_ADR:%.*]] = alloca <4 x float>, i64 1, align 16
// LLVM: store float [[A]], ptr [[A_ADR]], align 4
// LLVM: store <4 x float> [[B]], ptr [[B_ADR]], align 16
// LLVM: [[TMP_A0:%.*]] = load float, ptr [[A_ADR]], align 4
// LLVM: store float [[TMP_A0]], ptr [[S0:%.*]], align 4
// LLVM: [[TMP_B0:%.*]] = load <4 x float>, ptr [[B_ADR]], align 16
// LLVM: store <4 x float> [[TMP_B0]], ptr [[S1:%.*]], align 16
// LLVM: [[INTRN_ARG0:%.*]] = load float, ptr [[S0]], align 4
// LLVM: [[INTRN_ARG1:%.*]] = load <4 x float>, ptr [[S1]], align 16
// LLVM: [[INTRN_RES:%.*]] = insertelement <4 x float> [[INTRN_ARG1]], float [[INTRN_ARG0]], i32 3
// LLVM: ret <4 x float> {{%.*}}
