diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 09489e24984530..9bbd6590d099eb 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -179,8 +179,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) { // Match insert into fixed vector of scalar value. // TODO: Handle non-zero insert index. Value *Scalar; - if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) || - !Scalar->hasOneUse()) + if (!match(&I, + m_InsertElt(m_Poison(), m_OneUse(m_Value(Scalar)), m_ZeroInt()))) return false; // Optionally match an extract from another vector. diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll index 94b8c98a80df5c..83a0490b42a79a 100644 --- a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll +++ b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll @@ -12,8 +12,8 @@ define protected amdgpu_kernel void @load_from_other_as(ptr nocapture nonnull %r ; CHECK-NEXT: bb: ; CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr -; CHECK-NEXT: [[TMP1:%.*]] = load <1 x float>, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> +; CHECK-NEXT: [[D:%.*]] = load float, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[E:%.*]] = insertelement <4 x float> undef, float [[D]], i32 0 ; CHECK-NEXT: store <4 x float> [[E]], ptr [[RESULTPTR:%.*]], align 16 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll index bdd05a1a37c70f..b12104c5c673ea 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load.ll @@ -158,8 +158,8 @@ define double @larger_fp_scalar_256bit_vec(ptr align 32 dereferenceable(32) %p) define <4 x float> @load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, ptr %p, align 4 @@ -169,8 +169,8 @@ define <4 x float> @load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) n define <4 x float> @casted_load_f32_insert_v4f32(ptr align 4 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @casted_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, ptr %p, align 4 @@ -182,8 +182,8 @@ define <4 x float> @casted_load_f32_insert_v4f32(ptr align 4 dereferenceable(16) define <4 x i32> @load_i32_insert_v4i32(ptr align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %s = load i32, ptr %p, align 4 @@ -195,8 +195,8 @@ define <4 x i32> @load_i32_insert_v4i32(ptr align 16 dereferenceable(16) %p) nof define <4 x i32> @casted_load_i32_insert_v4i32(ptr align 4 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @casted_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %s = load i32, ptr %p, align 4 @@ -208,8 +208,8 @@ define <4 x i32> @casted_load_i32_insert_v4i32(ptr align 4 dereferenceable(16) % define <4 x float> @gep00_load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, ptr %p, align 16 @@ -221,8 +221,8 @@ define <4 x float> @gep00_load_f32_insert_v4f32(ptr align 16 dereferenceable(16) define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(ptr addrspace(44) align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(44) [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, ptr addrspace(44) [[P:%.*]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, ptr addrspace(44) %p, align 16 @@ -235,8 +235,8 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(ptr addrspace(44) alig define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync { ; CHECK-LABEL: @gep01_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 2 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1 @@ -248,16 +248,11 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) % ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(17) %p) nofree nosync { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 2 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 16 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 2 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1 %s = load i16, ptr %gep, align 2 @@ -268,16 +263,11 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable ; Verify that alignment of the new load is not over-specified. define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 dereferenceable(16) %p) nofree nosync { -; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 -; SSE2-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 +; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 8 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1 %s = load i16, ptr %gep, align 8 @@ -304,8 +294,9 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceabl define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync { ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 12 +; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0 ; CHECK-NEXT: ret <4 x i32> [[R]] ; %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 12 @@ -336,8 +327,8 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync { ; CHECK-LABEL: @gep10_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 16 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0 @@ -439,8 +430,8 @@ define <4 x float> @load_f32_insert_v4f32_volatile(ptr align 16 dereferenceable( define <4 x float> @load_f32_insert_v4f32_align(ptr align 1 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v4f32_align( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %s = load float, ptr %p, align 4 @@ -463,8 +454,8 @@ define <4 x float> @load_f32_insert_v4f32_deref(ptr align 4 dereferenceable(15) define <8 x i32> @load_i32_insert_v8i32(ptr align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0 ; CHECK-NEXT: ret <8 x i32> [[R]] ; %s = load i32, ptr %p, align 4 @@ -474,8 +465,8 @@ define <8 x i32> @load_i32_insert_v8i32(ptr align 16 dereferenceable(16) %p) nof define <8 x i32> @casted_load_i32_insert_v8i32(ptr align 4 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @casted_load_i32_insert_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0 ; CHECK-NEXT: ret <8 x i32> [[R]] ; %s = load i32, ptr %p, align 4 @@ -485,8 +476,8 @@ define <8 x i32> @casted_load_i32_insert_v8i32(ptr align 4 dereferenceable(16) % define <16 x float> @load_f32_insert_v16f32(ptr align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v16f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <16 x float> [[R]] ; %s = load float, ptr %p, align 4 @@ -496,8 +487,8 @@ define <16 x float> @load_f32_insert_v16f32(ptr align 16 dereferenceable(16) %p) define <2 x float> @load_f32_insert_v2f32(ptr align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_f32_insert_v2f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <2 x float> [[R]] ; %s = load float, ptr %p, align 4 @@ -549,8 +540,9 @@ define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr noc define <4 x float> @load_v2f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync { ; CHECK-LABEL: @load_v2f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[S:%.*]] = extractelement <2 x float> [[L]], i32 0 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 ; CHECK-NEXT: ret <4 x float> [[R]] ; %l = load <2 x float>, ptr %p, align 4 @@ -560,10 +552,17 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(ptr align 16 dereferenceable } define <4 x float> @load_v8f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync { -; CHECK-LABEL: @load_v8f32_extract_insert_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16 -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: ret <4 x float> [[R]] +; SSE2-LABEL: @load_v8f32_extract_insert_v4f32( +; SSE2-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x float>, ptr [[P:%.*]], i32 0, i32 0 +; SSE2-NEXT: [[S:%.*]] = load float, ptr [[TMP1]], align 4 +; SSE2-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 +; SSE2-NEXT: ret <4 x float> [[R]] +; +; AVX2-LABEL: @load_v8f32_extract_insert_v4f32( +; AVX2-NEXT: [[L:%.*]] = load <8 x float>, ptr [[P:%.*]], align 4 +; AVX2-NEXT: [[S:%.*]] = extractelement <8 x float> [[L]], i32 0 +; AVX2-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0 +; AVX2-NEXT: ret <4 x float> [[R]] ; %l = load <8 x float>, ptr %p, align 4 %s = extractelement <8 x float> %l, i32 0 @@ -589,17 +588,12 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer ; Can't safely load the offset vector, but can load+shuffle if it is profitable. define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(ptr align 1 dereferenceable(16) %p) nofree nosync { -; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, ptr [[P:%.*]], i64 1 -; SSE2-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, ptr [[GEP]], i32 0, i32 0 -; SSE2-NEXT: [[S:%.*]] = load i16, ptr [[TMP1]], align 8 -; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 -; SSE2-NEXT: ret <8 x i16> [[R]] -; -; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 4 -; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> -; AVX2-NEXT: ret <8 x i16> [[R]] +; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16( +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, ptr [[GEP]], i32 0, i32 0 +; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0 +; CHECK-NEXT: ret <8 x i16> [[R]] ; %gep = getelementptr inbounds <2 x i16>, ptr %p, i64 1 %l = load <2 x i16>, ptr %gep, align 8