Add support for intrinsics for NTuple{VecElement} #55118

Draft
wants to merge 12 commits into base: master
24 changes: 23 additions & 1 deletion base/compiler/tfuncs.jl
@@ -52,7 +52,7 @@ end

const INT_INF = typemax(Int) # integer infinity

const N_IFUNC = reinterpret(Int32, have_fma) + 1
const N_IFUNC = reinterpret(Int32, preferred_vector_width) + 1
const T_IFUNC = Vector{Tuple{Int, Int, Any}}(undef, N_IFUNC)
const T_IFUNC_COST = Vector{Int}(undef, N_IFUNC)
const T_FFUNC_KEY = Vector{Any}()
@@ -318,6 +318,28 @@ add_tfunc(Core.Intrinsics.cglobal, 1, 2, cglobal_tfunc, 5)

add_tfunc(Core.Intrinsics.have_fma, 1, 1, @nospecs((𝕃::AbstractLattice, x)->Bool), 1)

@nospecs function preferred_vector_width_tfunc(𝕃::AbstractLattice, t)
return preferred_vector_width_tfunc(widenlattice(𝕃), t)
end

@nospecs function preferred_vector_width_tfunc(𝕃::ConstsLattice, t)
# Ideally this would return a union of the possible Const widths.
# Hardcoded for 256-bit (AVX2-sized) vector registers for now.
sz = sizeof(widenconst(t))
if sz === 1
return Const(32)
elseif sz === 2
return Const(16)
elseif sz === 4
return Const(8)
elseif sz === 8
return Const(4)
elseif sz === 16
return Const(2)
end
return Union{Nothing, Int}
end
add_tfunc(Core.Intrinsics.preferred_vector_width, 1, 1, preferred_vector_width_tfunc, 1)
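
For context, a small sketch of the intended effect (hedged: the concrete numbers assume the hardcoded 256-bit table above and a backend that reports matching vector widths; the helper below is illustrative and not part of this diff):

# `vecwidth` is a hypothetical helper, not part of the PR.
vecwidth(::Type{T}) where {T} = Core.Intrinsics.preferred_vector_width(T)
vecwidth(Float64)   # expected to infer as Const(4) via the tfunc above
vecwidth(Int8)      # expected to infer as Const(32)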

# builtin functions
# =================

2 changes: 2 additions & 0 deletions base/experimental.jl
@@ -471,4 +471,6 @@ function entrypoint(@nospecialize(argt::Type))
nothing
end

include("simd.jl")

end
147 changes: 147 additions & 0 deletions base/simd.jl
@@ -0,0 +1,147 @@
module SIMD

import Base: VecElement, Memory, MemoryRef, IEEEFloat
import Base: @propagate_inbounds, @_propagate_inbounds_meta, @_boundscheck, @_noub_if_noinbounds_meta
import Base: memoryrefget, memoryrefnew, memoryrefset!

import Core.Intrinsics: preferred_vector_width

export Vec
export vload, vstore!, preferred_vector, width, select

# TODO: See C# and Co Vec type

# TODO: Hardware portable vector types...
Contributor: Would this include vscale? 👀


# TODO: tfunc support for preferred_vector_width does allow for "constant prop",
# but the intrinsic is not yet removed during JIT; we should only need
# it for AOT or on a machine with scalable vector types...


struct Vec{N, T}
data::NTuple{N, VecElement{T}}
end

width(::Type{<:Vec{N}}) where N = N
width(::Vec{N}) where N = N

function preferred_vector(::Type{T}) where T
width = preferred_vector_width(T)
if width === nothing
error("$T has no preferred_vector_width")
end
return Vec{width, T}
end
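
A usage sketch (assumptions: the module becomes reachable as Base.Experimental.SIMD via the include added to base/experimental.jl, and widths follow the hardcoded table in the tfunc):

using Base.Experimental.SIMD
V = preferred_vector(Float32)   # expected Vec{8, Float32} on a machine with 256-bit vectors
width(V)                        # 8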

# Constructors
@inline Vec(v::NTuple{N, T}) where {N, T} = Vec(VecElement.(v))
@inline Vec(v::Vararg{T, N}) where {N, T} = Vec(v)
@inline Vec(v::Vec) = v

# Numbers defines this and it is needed in power_by_squaring...
Base.copy(v::Vec) = v

function Base.show(io::IO, v::Vec{N, T}) where {N, T}
io = IOContext(io, :typeinfo => eltype(v))
print(io, "<$N x $T>[")
join(io, [sprint(show, x.value; context=io) for x in v.data], ", ")
print(io, "]")
end

# TODO: llvm.vp expects a mask of i1
const Mask{N} = Vec{N, Bool}

function Vec{N}(val) where N
Vec(ntuple(_ -> VecElement(val), Val(N)))
end

# select(m::Mask{N}, a::Vec{N, T}, b::Vec{N,T}) where {N,T} = Core.ifelse(m.data, a.data, b.data)
# ERROR: TypeError: non-boolean (NTuple{4, VecElement{Bool}}) used in boolean context
# Mocked select, relying on SLP
function select(m::Mask{N}, a::Vec{N, T}, b::Vec{N,T}) where {N,T}
data = ntuple(Val(N)) do j
VecElement(Core.ifelse(m.data[j].value, a.data[j].value, b.data[j].value))
end
return Vec(data)
end
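
For illustration, a small sketch of blending two vectors with a Bool mask (relies on SLP to vectorize the scalar ifelse loop above; expected values shown as comments):

a = Vec((1.0, 2.0, 3.0, 4.0))
b = Vec((10.0, 20.0, 30.0, 40.0))
m = Vec((true, false, true, false))   # a Mask{4}
select(m, a, b)                       # expected <4 x Float64>[1.0, 20.0, 3.0, 40.0]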

# Mocked vload/vstore! relying on SLP

@inline function vload(::Type{Vec{N, T}}, A::Array{T}, i::Int) where {N, T}
@_noub_if_noinbounds_meta
# TODO: Alignment...; may need an intrinsic for vectorized loads.
# Writing our own bounds check since `@inbounds` doesn't propagate through `ntuple`.
@boundscheck checkbounds(A, i:(i + N - 1))
mem = A.ref
data = ntuple(Val(N)) do j
# why does `@inbounds ref = memoryrefnew(mem, i + j - 1, @_boundscheck)` not work?
ref = memoryrefnew(mem, i + j - 1, false)
VecElement{T}(memoryrefget(ref, :not_atomic, false))
end
return Vec(data)
end

@inline function vstore!(A::Array{T}, v::Vec{N, T}, i::Int) where {N, T}
@_noub_if_noinbounds_meta
# TODO: Alignment...; may need an intrinsic for vectorized stores.
# Writing our own bounds check since `@inbounds` doesn't propagate through `ntuple`.
@boundscheck checkbounds(A, i:(i + N - 1))
mem = A.ref
data = v.data
ntuple(Val(N)) do j
# why does `@inbounds ref = memoryrefnew(mem, i + j - 1, @_boundscheck)` not work?
ref = memoryrefnew(mem, i + j - 1, false)
memoryrefset!(ref, data[j].value, :not_atomic, false)
return nothing
end
return nothing
end
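
A quick round-trip sketch of the mocked load/store pair (bounds are checked over i:(i + N - 1); the values shown are what the definitions above imply):

A = collect(1.0:8.0)
v = vload(Vec{4, Float64}, A, 2)   # gathers A[2:5] into a Vec{4, Float64}
vstore!(A, v + v, 2)               # A[2:5] becomes 4.0, 6.0, 8.0, 10.0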

import Base: +, -, *, /, muladd, promote_rule, widen
import Core.Intrinsics: add_float, sub_float, mul_float, div_float, muladd_float, neg_float

## floating point promotions ##
promote_rule(::Type{Vec{N, Float32}}, ::Type{Vec{N, Float16}}) where N = Vec{N, Float32}
Member: It is not obvious to me that these should be defined. When you are doing low-level SIMD work you probably don't want to accidentally promote things, and in cases where you really want to work with different types, an explicit convert might be better for clarity?

promote_rule(::Type{Vec{N, Float64}}, ::Type{Vec{N, Float16}}) where N = Vec{N, Float64}
promote_rule(::Type{Vec{N, Float64}}, ::Type{Vec{N, Float32}}) where N = Vec{N, Float64}

widen(::Type{Vec{N, Float16}}) where N = Vec{N, Float32}
widen(::Type{Vec{N, Float32}}) where N = Vec{N, Float64}

## floating point arithmetic ##
-(x::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(neg_float(x.data))

+(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(add_float(x.data, y.data))
-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(sub_float(x.data, y.data))
*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(mul_float(x.data, y.data))
/(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(div_float(x.data, y.data))

muladd(x::Vec{N,T}, y::Vec{N,T}, z::Vec{N,T}) where {N, T<:IEEEFloat} =
Vec(muladd_float(x.data, y.data, z.data))

## integer arithmetic ##
import Base: ÷, BitInteger, BitSigned, BitUnsigned
import Core.Intrinsics: add_int, sub_int, mul_int, sdiv_int, udiv_int, neg_int

+(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(add_int(x.data, y.data))
-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(sub_int(x.data, y.data))
*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(mul_int(x.data, y.data))
# TODO ought we implement div by zero?
÷(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitSigned} = Vec(sdiv_int(x.data, y.data))
÷(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitUnsigned} = Vec(udiv_int(x.data, y.data))

## logical ops
import Base: xor, |, &
import Core.Intrinsics: xor_int, and_int, or_int
xor(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(xor_int(x.data, y.data))
(|)(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(or_int(x.data, y.data))
(&)(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(and_int(x.data, y.data))

## integer shifts
# unsigned shift counts always shift in the same direction
import Base: >>, <<, >>>
import Core.Intrinsics: ashr_int, lshr_int, shl_int
>>(x::Vec{N, <:BitSigned}, y::Vec{N, <:BitUnsigned}) where N = Vec(ashr_int(x.data, y.data))
>>(x::Vec{N, <:BitUnsigned}, y::Vec{N, <:BitUnsigned}) where N = Vec(lshr_int(x.data, y.data))
<<(x::Vec{N, <:BitInteger}, y::Vec{N, <:BitUnsigned}) where N = Vec(shl_int(x.data, y.data))
>>>(x::Vec{N, <:BitInteger}, y::Vec{N, <:BitUnsigned}) where N = Vec(lshr_int(x.data, y.data))

end # module
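
To show how the pieces compose, a hedged end-to-end sketch (assumptions: the module is reachable as Base.Experimental.SIMD, Float64 has a nonzero preferred width on the target, and SLP vectorizes the mocked vload/vstore!):

using Base.Experimental.SIMD

# Scale a vector in place, processing width(V) elements per iteration.
function scale!(A::Vector{Float64}, s::Float64)
    V = preferred_vector(Float64)   # e.g. Vec{4, Float64} on a 256-bit machine
    N = width(V)
    sv = Vec{N}(s)                  # splat the scalar into a vector
    i = 1
    while i + N - 1 <= length(A)
        vstore!(A, vload(V, A, i) * sv, i)
        i += N
    end
    while i <= length(A)            # scalar remainder
        A[i] *= s
        i += 1
    end
    return A
end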

79 changes: 74 additions & 5 deletions src/intrinsics.cpp
@@ -37,6 +37,7 @@ STATISTIC(Emitted_fptrunc, "Number of fptrunc calls emitted");
STATISTIC(Emitted_fpext, "Number of fpext calls emitted");
STATISTIC(Emitted_not_int, "Number of not_int calls emitted");
STATISTIC(Emitted_have_fma, "Number of have_fma calls emitted");
STATISTIC(Emitted_preferred_vector_width, "Number of preferred_vector_width calls emitted");
STATISTIC(EmittedUntypedIntrinsics, "Number of untyped intrinsics emitted");

using namespace JL_I;
@@ -144,6 +145,13 @@ static Type *FLOATT(Type *t)
{
if (t->isFloatingPointTy())
return t;
if (auto *tv = dyn_cast<VectorType>(t))
{
Type *st = FLOATT(tv->getElementType());
if (!st)
return NULL;
return VectorType::get(st, tv->getElementCount());
}
unsigned nb = (t->isPointerTy() ? sizeof(void*) * 8 : t->getPrimitiveSizeInBits());
auto &ctxt = t->getContext();
if (nb == 64)
@@ -165,6 +173,13 @@ static Type *INTT(Type *t, const DataLayout &DL)
return t;
if (t->isPointerTy())
return DL.getIntPtrType(t);
if (auto *tv = dyn_cast<VectorType>(t))
{
Type *st = INTT(tv->getElementType(), DL);
if (!st)
return NULL;
return VectorType::get(st, tv->getElementCount());
}
if (t == getDoubleTy(ctxt))
return getInt64Ty(ctxt);
if (t == getFloatTy(ctxt))
@@ -1287,7 +1302,6 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar
return emit_llvmcall(ctx, args, nargs);
if (f == cglobal_auto || f == cglobal)
return emit_cglobal(ctx, args, nargs);

SmallVector<jl_cgval_t, 0> argv(nargs);
for (size_t i = 0; i < nargs; ++i) {
jl_cgval_t arg = emit_expr(ctx, args[i + 1]);
@@ -1406,20 +1420,75 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar
return mark_julia_type(ctx, ret, false, jl_bool_type);
}

case preferred_vector_width: {
++Emitted_preferred_vector_width;
assert(nargs == 1);
const jl_cgval_t &x = argv[0];
if (!x.constant || !jl_is_datatype(x.constant))
return emit_runtime_call(ctx, f, argv, nargs);
jl_datatype_t *dt = (jl_datatype_t*) x.constant;

// select the appropriate overloaded intrinsic
std::string intr_name = "julia.cpu.preferred_vector_width.";
switch (jl_datatype_size(dt)) {
case 1: {
intr_name += "b1";
break;
}
case 2: {
intr_name += "b2";
break;
}
case 4: {
intr_name += "b4";
break;
}
case 8: {
intr_name += "b8";
break;
}
case 16: {
intr_name += "b16";
break;
}
default:
return emit_runtime_call(ctx, f, argv, nargs);
}

#ifdef _P64
FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, getInt64Ty(ctx.builder.getContext()));
auto ret = ctx.builder.CreateCall(intr);
return mark_julia_type(ctx, ret, false, jl_int64_type);
#else
FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, getInt32Ty(ctx.builder.getContext()));
auto ret = ctx.builder.CreateCall(intr);
return mark_julia_type(ctx, ret, false, jl_int32_type);
#endif
}

default: {
assert(nargs >= 1 && "invalid nargs for intrinsic call");
const jl_cgval_t &xinfo = argv[0];

// verify argument types
if (!jl_is_primitivetype(xinfo.typ))
if (jl_is_primitivetype(xinfo.typ)){}
else if (is_ntuple_type(xinfo.typ) && jl_nparams(xinfo.typ) > 0)
{
jl_value_t *et = jl_tparam0(xinfo.typ);
if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0)))
et = jl_tparam0(et);
else
return emit_runtime_call(ctx, f, argv, nargs);
}
else
return emit_runtime_call(ctx, f, argv, nargs);
Type *xtyp = bitstype_to_llvm(xinfo.typ, ctx.builder.getContext(), true);
bool isboxed = true;
Type *xtyp = julia_type_to_llvm(ctx, xinfo.typ, &isboxed);
if (float_func()[f])
xtyp = FLOATT(xtyp);
else
xtyp = INTT(xtyp, DL);
if (!xtyp)
return emit_runtime_call(ctx, f, argv, nargs);
////Bool are required to be in the range [0,1]
////so while they are represented as i8,
////the operations need to be done in mod 1
1 change: 1 addition & 0 deletions src/intrinsics.h
@@ -102,6 +102,7 @@
ALIAS(llvmcall, llvmcall) \
/* cpu feature tests */ \
ADD_I(have_fma, 1) \
ADD_I(preferred_vector_width, 1) \
/* hidden intrinsics */ \
ADD_HIDDEN(cglobal_auto, 1)

18 changes: 18 additions & 0 deletions src/julia.h
@@ -1717,6 +1717,24 @@ STATIC_INLINE int jl_is_tuple_type(void *t) JL_NOTSAFEPOINT
((jl_datatype_t*)(t))->name == jl_tuple_typename);
}

STATIC_INLINE int is_ntuple_type(jl_value_t *tt)
{
if (!jl_is_tuple_type(tt))
{
return 0;
}
size_t i, nfields = jl_nparams(tt);
if (!nfields)
return 1;
jl_value_t *t1 = jl_tparam0(tt);
for (i = 1; i < nfields; i++) {
if (jl_tparam(tt, i) != t1) {
return 0;
}
}
return 1;
}

STATIC_INLINE int jl_is_namedtuple_type(void *t) JL_NOTSAFEPOINT
{
return (jl_is_datatype(t) &&
1 change: 1 addition & 0 deletions src/julia_internal.h
@@ -1614,6 +1614,7 @@ JL_DLLEXPORT jl_value_t *jl_copysign_float(jl_value_t *a, jl_value_t *b);
JL_DLLEXPORT jl_value_t *jl_flipsign_int(jl_value_t *a, jl_value_t *b);

JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *a);
JL_DLLEXPORT jl_value_t *jl_preferred_vector_width(jl_value_t *a);
JL_DLLEXPORT int jl_stored_inline(jl_value_t *el_type);
JL_DLLEXPORT jl_value_t *(jl_array_data_owner)(jl_array_t *a);
JL_DLLEXPORT jl_array_t *jl_array_copy(jl_array_t *ary);