Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LLVM SIMD code generation bug - "Both operands to ICmp instruction are not of the same type!" #4602

Open
ChuuniMage opened this issue Dec 20, 2024 · 0 comments

Comments

@ChuuniMage
Copy link
Contributor

ChuuniMage commented Dec 20, 2024

Context

Please provide any relevant information about your setup. This is important in case the issue is not reproducible except for under certain conditions.

  • Operating System & Odin Version: Windows 10, version dev-2024-12:597fba7c3
  • Please paste odin report output:
    Odin: dev-2024-12:597fba7c3
    OS: Windows 10 Professional (version: 20H2), build 19042.1706
    CPU: Intel(R) Core(TM) i7-10700KF CPU @ 3.80GHz
    RAM: 32637 MiB
    Backend: LLVM 18.1.8

Expected Behavior

LLVM compiles the proc GET_ANIM_IDX_SIMD correctly

Current Behavior

Following llvm error is generated with "odin run .":

`
LLVM CODE GEN FAILED FOR PROCEDURE: main.GET_ANIM_IDX_SIMD
define i32 @main.GET_ANIM_IDX_SIMD(ptr %0, i32 %1, ptr noalias nocapture nonnull %__.context_ptr) {
decls:
%2 = alloca <4 x i32>, align 16
%3 = alloca i32, align 4
%4 = alloca <4 x i32>, align 16
%INDICES = alloca <4 x i32>, align 16
%GT = alloca <4 x i32>, align 16
%result = alloca i32, align 4
br label %entry

entry: ; preds = %decls
call void @llvm.memcpy.inline.p0.p0.i64(ptr %2, ptr %0, i64 16, i1 false)
store i32 %1, ptr %3, align 4
store <4 x i32> zeroinitializer, ptr %4, align 16
%5 = insertelement <4 x i32> zeroinitializer, i32 %1, i32 0
%6 = insertelement <4 x i32> %5, i32 %1, i32 1
%7 = insertelement <4 x i32> %6, i32 %1, i32 2
%8 = insertelement <4 x i32> %7, i32 %1, i32 3
store <4 x i32> %8, ptr %4, align 16
%9 = load <4 x i32>, ptr %4, align 16
store <4 x i32> %9, ptr %INDICES, align 16
%10 = load <4 x i32>, ptr %2, align 16
%11 = icmp sgt <4 x i32> %10, i32 %1
%12 = sext <4 x i1> %11 to <4 x i32>
store <4 x i32> %12, ptr %GT, align 16
%13 = load <4 x i32>, ptr %GT, align 16
%14 = sub <4 x i32> zeroinitializer, %13
store <4 x i32> %14, ptr %GT, align 16
%15 = load <4 x i32>, ptr %GT, align 16
%16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %15)
store i32 %16, ptr %result, align 4
%17 = load i32, ptr %result, align 4
ret i32 %17
}

Both operands to ICmp instruction are not of the same type!
%11 = icmp sgt <4 x i32> %10, i32 %1

Steps to Reproduce

Run the provided code

package main;

import "core:fmt";

Universal_Player_States :: enum {
_5,
_2,
_Transition_5_to_2,
_Transition_2_to_5,
_WALK_F,
_WALK_B,
_DASH_F,
_DASH_B,
_5G,
_2G,
_Transition_5G_to_2G,
_Transition_2G_to_5G,
_Blockstun_5ing,
_Blockstun_2ing,
_Hitstun_Stand_Highreel,
_Hitstun_Stand_Midgut,
_Hitstun_Crouch_Reel,
_ThrowBreaking,
_Dead,
_Ringout,
}

// Optimise this to make this a flat array of 5 elements? perhaps 8 for cache alignment? or does just the whole thing need to be cache aligned?
// curious optimisation cases

HBOX_DURATIONS_UNIVERSAL :: [Universal_Player_States][]i32{
._5 = {66, },
._2 = {255,},
._Transition_5_to_2 = {3,},
._Transition_2_to_5 = {3, },
._WALK_F = {32,},
._WALK_B = {28,},
._DASH_F = {16,},
._DASH_B = {14,},
._5G = {255,},
._2G = {255,},
._Transition_5G_to_2G = {5,},
._Transition_2G_to_5G = {4,},
._Blockstun_5ing = {255,},
._Blockstun_2ing = {255,},
._Hitstun_Stand_Highreel = {255,},
._Hitstun_Stand_Midgut = {255,},
._Hitstun_Crouch_Reel = {255,},
._ThrowBreaking = {38,},
._Dead = {35, 155},
._Ringout = {255,},
}

HBOX_DURATIONS_CLASSIC :: [Classic_Player_State][]i32{
._5S = {11, 14, 18, 27,},
._2S = {11, 13, 17, 33,},
._6S = {13, 15, 26, 32, 36,},
._3S = {14, 16, 20, 42,},
._8S = {7, 19, 21, 27, 42,},
._5Throw = {9, 14, 33,},
._5Throw_Hitting = {12, 14, 36},
._4Throw = {9, 14, 33,},
._4Throw_Hitting = {24, 34, 45, 52,},
}

Character :: enum {
Classic,
Wrassler,
Striker,
CounterMan,
}

Classic_Player_State :: enum {
_5S,
_2S,
_6S,
_3S,
_8S,
_5Throw,
_5Throw_Hitting,
_4Throw,
_4Throw_Hitting,
}

Wrassler_Player_State :: enum {
_5S,
_2S,
_6S,
_3S,
_4S,
_5Throw,
_5Throw_Hitting,
_4Throw,
_4Throw_Hitting,
_6Throw,
_6Throw_Hitting,
}

Striker_Player_State :: enum {
_5S,
_2S,
_6S,
_6Sstr_S,
_3S,
_1S,
_5Throw,
_5Throw_Hitting,
_6Throw,
_6Throw_Hitting,
}

CounterMan_Player_State :: enum {
_5S,
_2S,
_6S,
_8S,
_3S,
_1S,
_5Throw,
_5Throw_Hitting,
_6Throw,
_6Throw_Hitting,
_4Throw,
_4Throw_Hitting,
}

Which_Player_State :: enum {
_Universal,
_CharSpecific,
}

// Problem: Need to use an enum for which character they are using anyway
// so which should just be 1 bit 61 thingo anyway

_Player_State_1 :: bit_field u64 {
which: Which_Player_State | 1,
p_state: Universal_Player_States | 63,
}

_Player_State_2 :: bit_field u64 {
which: Which_Player_State | 1,
c_state: Classic_Player_State | 63,
}

_Player_State_3 :: bit_field u64 {
which: Which_Player_State | 1,
c_state: Wrassler_Player_State | 63,
}

_Player_State_4 :: bit_field u64 {
which: Which_Player_State | 1,
c_state: Striker_Player_State | 63,
}

_Player_State_5 :: bit_field u64 {
which: Which_Player_State | 1,
c_state: CounterMan_Player_State | 63,
}

Player_State :: struct #raw_union {
using _: _Player_State_1,
using _: _Player_State_2,
using _: _Player_State_3,
using _: _Player_State_4,
using _: _Player_State_5,
}

Cursed_1 :: bit_field u64 {
uh_oh: i32 | 32,
}

Cursed_2 :: bit_field u64 {
uh_oh: u8 | 8,
}

Cursed :: struct #raw_union {
using _:Cursed_1
}

Player_Data :: struct {
state_counter: i32,
character: Character,
current_state: Player_State,
}

GET_ANIM_IDX_ORIGINAL :: proc (dur:[]i32, state_counter:i32) -> i32 {
anim_idx := i32{}
for el, idx in dur {
if state_counter > el do continue;
anim_idx = cast(i32)idx
break;
}
return anim_idx
}

_5S_SIMD :: #simd[4]i32{11, 14, 18, 27,}

import "core:simd"

GET_ANIM_IDX_SIMD :: proc (dur:#simd[4]i32, state_counter:i32) -> i32 {

INDICES : #simd[4]i32 = {state_counter, state_counter, state_counter, state_counter}

GT := simd.lanes_gt(dur, state_counter)
GT = simd.neg(GT)
result := simd.reduce_add_ordered(GT)
return transmute(i32)result
}

// import "shared:prof"

expected_5S_index := [28]i32 {
0 = 0,
1 = 0,
2 = 0,
3 = 0,
4 = 0,
5 = 0,
6 = 0,
7 = 0,
8 = 0,
9 = 0,
10 = 0,
11 = 0,
12 = 1,
13 = 1,
14 = 1,
15 = 2,
16 = 2,
17 = 2,
18 = 2,
19 = 3,
20 = 3,
21 = 3,
22 = 3,
23 = 3,
24 = 3,
25 = 3,
26 = 3,
27 = 3,
}

main :: proc(){
fmt.printf("Hello, world! Your Odin project is set up.\n")
honk:Player_State
honk.which = ._Universal
honk.p_state = ._5
honk.c_state = ._5S
for i in cast(i32)0..=27 {
result := GET_ANIM_IDX_ORIGINAL(HBOX_DURATIONS_CLASSIC[._5S], i)
assert(result == expected_5S_index[i])
}
fmt.printf("Original test passed!")
for i in cast(i32)0..=27 {
result := GET_ANIM_IDX_SIMD(_5S_SIMD, i)
assert(result == expected_5S_index[i])
}
fmt.printf("SIMD test passed!")
result:i32
// prof.TIME_TYPE = .micros

fmt.printf("Original algorithm: ")
for x in 0..<1_000_000 {
for i in cast(i32)0..=27 {
// prof.profile_start(.micros)
result = GET_ANIM_IDX_ORIGINAL(HBOX_DURATIONS_CLASSIC[._5S], i)
// prof.profile_end()
assert(result != -1)
}
}
// prof.average_profiling_data()

fmt.printf("SIMD algorithm: ")
for x in 0..<1_000_000 {
for i in cast(i32)0..=27 {
// prof.profile_start(.micros)
result = GET_ANIM_IDX_SIMD(_5S_SIMD, i)
// prof.profile_end()
assert(result != -1)
}
}
// prof.average_profiling_data()

fmt.printf("cursed %v \n", honk.which)
};

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant