Add support for intrinsics for NTuple{VecElement} #55118

Draft
wants to merge 12 commits into base: master
24 changes: 23 additions & 1 deletion base/compiler/tfuncs.jl
@@ -52,7 +52,7 @@ end

const INT_INF = typemax(Int) # integer infinity

const N_IFUNC = reinterpret(Int32, have_fma) + 1
const N_IFUNC = reinterpret(Int32, preferred_vector_width) + 1
const T_IFUNC = Vector{Tuple{Int, Int, Any}}(undef, N_IFUNC)
const T_IFUNC_COST = Vector{Int}(undef, N_IFUNC)
const T_FFUNC_KEY = Vector{Any}()
@@ -318,6 +318,28 @@ add_tfunc(Core.Intrinsics.cglobal, 1, 2, cglobal_tfunc, 5)

add_tfunc(Core.Intrinsics.have_fma, 1, 1, @nospecs((𝕃::AbstractLattice, x)->Bool), 1)

@nospecs function preferred_vector_width_tfunc(𝕃::AbstractLattice, t)
return preferred_vector_width_tfunc(widenlattice(𝕃), t)
end

@nospecs function preferred_vector_width_tfunc(𝕃::ConstsLattice, t)
# Ideally this would return a union of the possible Const widths.
# Hardcoded for 256-bit (AVX2-sized) vector registers for now.
sz = sizeof(widenconst(t))
if sz === 1
return Const(32)
elseif sz === 2
return Const(16)
elseif sz === 4
return Const(8)
elseif sz === 8
return Const(4)
elseif sz === 16
return Const(2)
end
return Union{Nothing, Int}
end
add_tfunc(Core.Intrinsics.preferred_vector_width, 1, 1, preferred_vector_width_tfunc, 1)
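
For context, a small sketch of the intended effect (hedged: the concrete numbers assume the hardcoded 256-bit table above and a backend that reports matching vector widths; the helper below is illustrative and not part of this diff):

# `vecwidth` is a hypothetical helper, not part of the PR.
vecwidth(::Type{T}) where {T} = Core.Intrinsics.preferred_vector_width(T)
vecwidth(Float64)   # expected to infer as Const(4) via the tfunc above
vecwidth(Int8)      # expected to infer as Const(32)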

# builtin functions
# =================

2 changes: 2 additions & 0 deletions base/experimental.jl
@@ -471,4 +471,6 @@ function entrypoint(@nospecialize(argt::Type))
nothing
end

include("simd.jl")

end
147 changes: 147 additions & 0 deletions base/simd.jl
@@ -0,0 +1,147 @@
module SIMD

import Base: VecElement, Memory, MemoryRef, IEEEFloat
import Base: @propagate_inbounds, @_propagate_inbounds_meta, @_boundscheck, @_noub_if_noinbounds_meta
import Base: memoryrefget, memoryrefnew, memoryrefset!

import Core.Intrinsics: preferred_vector_width

export Vec
export vload, vstore!, preferred_vector, width, select

# TODO: See C# and Co Vec type

# TODO: Hardware portable vector types...
Contributor: Would this include vscale? 👀


# TODO: tfunc support for preferred_vector_width does allow for "constant prop",
# but the intrinsic is not yet removed during JIT; we should only need
# it for AOT or on a machine with scalable vector types...


struct Vec{N, T}
data::NTuple{N, VecElement{T}}
end

width(::Type{<:Vec{N}}) where N = N
width(::Vec{N}) where N = N

function preferred_vector(::Type{T}) where T
width = preferred_vector_width(T)
if width === nothing
error("$T has no preferred_vector_width")
end
return Vec{width, T}
end
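
A usage sketch (assumptions: the module becomes reachable as Base.Experimental.SIMD via the include added to base/experimental.jl, and widths follow the hardcoded table in the tfunc):

using Base.Experimental.SIMD
V = preferred_vector(Float32)   # expected Vec{8, Float32} on a machine with 256-bit vectors
width(V)                        # 8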

# Constructors
@inline Vec(v::NTuple{N, T}) where {N, T} = Vec(VecElement.(v))
@inline Vec(v::Vararg{T, N}) where {N, T} = Vec(v)
@inline Vec(v::Vec) = v

# Numbers defines this and it is needed in power_by_squaring...
Base.copy(v::Vec) = v

function Base.show(io::IO, v::Vec{N, T}) where {N, T}
io = IOContext(io, :typeinfo => eltype(v))
print(io, "<$N x $T>[")
join(io, [sprint(show, x.value; context=io) for x in v.data], ", ")
print(io, "]")
end

# TODO: llvm.vp expects a mask of i1
const Mask{N} = Vec{N, Bool}

function Vec{N}(val) where N
Vec(ntuple(_ -> VecElement(val), Val(N)))
end

# select(m::Mask{N}, a::Vec{N, T}, b::Vec{N,T}) where {N,T} = Core.ifelse(m.data, a.data, b.data)
# ERROR: TypeError: non-boolean (NTuple{4, VecElement{Bool}}) used in boolean context
# Mocked select, relying on SLP
function select(m::Mask{N}, a::Vec{N, T}, b::Vec{N,T}) where {N,T}
data = ntuple(Val(N)) do j
VecElement(Core.ifelse(m.data[j].value, a.data[j].value, b.data[j].value))
end
return Vec(data)
end
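
For illustration, a small sketch of blending two vectors with a Bool mask (relies on SLP to vectorize the scalar ifelse loop above; expected values shown as comments):

a = Vec((1.0, 2.0, 3.0, 4.0))
b = Vec((10.0, 20.0, 30.0, 40.0))
m = Vec((true, false, true, false))   # a Mask{4}
select(m, a, b)                       # expected <4 x Float64>[1.0, 20.0, 3.0, 40.0]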

# Mocked vload/vstore! relying on SLP

@inline function vload(::Type{Vec{N, T}}, A::Array{T}, i::Int) where {N, T}
@_noub_if_noinbounds_meta
# TODO: Alignment...; may need an intrinsic for vectorized loads.
# Writing our own bounds check since `@inbounds` doesn't propagate through `ntuple`.
@boundscheck checkbounds(A, i:(i + N - 1))
mem = A.ref
data = ntuple(Val(N)) do j
# why does `@inbounds ref = memoryrefnew(mem, i + j - 1, @_boundscheck)` not work?
ref = memoryrefnew(mem, i + j - 1, false)
VecElement{T}(memoryrefget(ref, :not_atomic, false))
end
return Vec(data)
end

@inline function vstore!(A::Array{T}, v::Vec{N, T}, i::Int) where {N, T}
@_noub_if_noinbounds_meta
# TODO: Alignment...; may need an intrinsic for vectorized stores.
# Writing our own bounds check since `@inbounds` doesn't propagate through `ntuple`.
@boundscheck checkbounds(A, i:(i + N - 1))
mem = A.ref
data = v.data
ntuple(Val(N)) do j
# why does `@inbounds ref = memoryrefnew(mem, i + j - 1, @_boundscheck)` not work?
ref = memoryrefnew(mem, i + j - 1, false)
memoryrefset!(ref, data[j].value, :not_atomic, false)
return nothing
end
return nothing
end
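
A quick round-trip sketch of the mocked load/store pair (bounds are checked over i:(i + N - 1); the values shown are what the definitions above imply):

A = collect(1.0:8.0)
v = vload(Vec{4, Float64}, A, 2)   # gathers A[2:5] into a Vec{4, Float64}
vstore!(A, v + v, 2)               # A[2:5] becomes 4.0, 6.0, 8.0, 10.0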

import Base: +, -, *, /, muladd, promote_rule, widen
import Core.Intrinsics: add_float, sub_float, mul_float, div_float, muladd_float, neg_float

## floating point promotions ##
promote_rule(::Type{Vec{N, Float32}}, ::Type{Vec{N, Float16}}) where N = Vec{N, Float32}
Member: It is not obvious to me that these should be defined. When you are doing low-level SIMD work you probably don't want to accidentally promote things, and in cases where you really want to work with different types, an explicit convert might be better for clarity?

promote_rule(::Type{Vec{N, Float64}}, ::Type{Vec{N, Float16}}) where N = Vec{N, Float64}
promote_rule(::Type{Vec{N, Float64}}, ::Type{Vec{N, Float32}}) where N = Vec{N, Float64}

widen(::Type{Vec{N, Float16}}) where N = Vec{N, Float32}
widen(::Type{Vec{N, Float32}}) where N = Vec{N, Float64}

## floating point arithmetic ##
-(x::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(neg_float(x.data))

+(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(add_float(x.data, y.data))
-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(sub_float(x.data, y.data))
*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(mul_float(x.data, y.data))
/(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:IEEEFloat} = Vec(div_float(x.data, y.data))

muladd(x::Vec{N,T}, y::Vec{N,T}, z::Vec{N,T}) where {N, T<:IEEEFloat} =
Vec(muladd_float(x.data, y.data, z.data))

## integer arithmetic ##
import Base: ÷, BitInteger, BitSigned, BitUnsigned
import Core.Intrinsics: add_int, sub_int, mul_int, sdiv_int, udiv_int, neg_int

+(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(add_int(x.data, y.data))
-(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(sub_int(x.data, y.data))
*(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(mul_int(x.data, y.data))
# TODO ought we implement div by zero?
÷(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitSigned} = Vec(sdiv_int(x.data, y.data))
÷(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitUnsigned} = Vec(udiv_int(x.data, y.data))

## logical ops
import Base: xor, |, &
import Core.Intrinsics: xor_int, and_int, or_int
xor(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(xor_int(x.data, y.data))
(|)(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(or_int(x.data, y.data))
(&)(x::Vec{N,T}, y::Vec{N,T}) where {N,T<:BitInteger} = Vec(and_int(x.data, y.data))

## integer shifts
# unsigned shift counts always shift in the same direction
import Base: >>, <<, >>>
import Core.Intrinsics: ashr_int, lshr_int, shl_int
>>(x::Vec{N, <:BitSigned}, y::Vec{N, <:BitUnsigned}) where N = Vec(ashr_int(x.data, y.data))
>>(x::Vec{N, <:BitUnsigned}, y::Vec{N, <:BitUnsigned}) where N = Vec(lshr_int(x.data, y.data))
<<(x::Vec{N, <:BitInteger}, y::Vec{N, <:BitUnsigned}) where N = Vec(shl_int(x.data, y.data))
>>>(x::Vec{N, <:BitInteger}, y::Vec{N, <:BitUnsigned}) where N = Vec(lshr_int(x.data, y.data))

end # module
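
To show how the pieces compose, a hedged end-to-end sketch (assumptions: the module is reachable as Base.Experimental.SIMD, Float64 has a nonzero preferred width on the target, and SLP vectorizes the mocked vload/vstore!):

using Base.Experimental.SIMD

# Scale a vector in place, processing width(V) elements per iteration.
function scale!(A::Vector{Float64}, s::Float64)
    V = preferred_vector(Float64)   # e.g. Vec{4, Float64} on a 256-bit machine
    N = width(V)
    sv = Vec{N}(s)                  # splat the scalar into a vector
    i = 1
    while i + N - 1 <= length(A)
        vstore!(A, vload(V, A, i) * sv, i)
        i += N
    end
    while i <= length(A)            # scalar remainder
        A[i] *= s
        i += 1
    end
    return A
end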

79 changes: 74 additions & 5 deletions src/intrinsics.cpp
@@ -37,6 +37,7 @@ STATISTIC(Emitted_fptrunc, "Number of fptrunc calls emitted");
STATISTIC(Emitted_fpext, "Number of fpext calls emitted");
STATISTIC(Emitted_not_int, "Number of not_int calls emitted");
STATISTIC(Emitted_have_fma, "Number of have_fma calls emitted");
STATISTIC(Emitted_preferred_vector_width, "Number of preferred_vector_width calls emitted");
STATISTIC(EmittedUntypedIntrinsics, "Number of untyped intrinsics emitted");

using namespace JL_I;
@@ -144,6 +145,13 @@ static Type *FLOATT(Type *t)
{
if (t->isFloatingPointTy())
return t;
if (auto *tv = dyn_cast<VectorType>(t))
{
Type *st = FLOATT(tv->getElementType());
if (!st)
return NULL;
return VectorType::get(st, tv->getElementCount());
}
unsigned nb = (t->isPointerTy() ? sizeof(void*) * 8 : t->getPrimitiveSizeInBits());
auto &ctxt = t->getContext();
if (nb == 64)
@@ -165,6 +173,13 @@ static Type *INTT(Type *t, const DataLayout &DL)
return t;
if (t->isPointerTy())
return DL.getIntPtrType(t);
if (auto *tv = dyn_cast<VectorType>(t))
{
Type *st = INTT(tv->getElementType(), DL);
if (!st)
return NULL;
return VectorType::get(st, tv->getElementCount());
}
if (t == getDoubleTy(ctxt))
return getInt64Ty(ctxt);
if (t == getFloatTy(ctxt))
@@ -1287,7 +1302,6 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar
return emit_llvmcall(ctx, args, nargs);
if (f == cglobal_auto || f == cglobal)
return emit_cglobal(ctx, args, nargs);

SmallVector<jl_cgval_t, 0> argv(nargs);
for (size_t i = 0; i < nargs; ++i) {
jl_cgval_t arg = emit_expr(ctx, args[i + 1]);
@@ -1406,20 +1420,75 @@ static jl_cgval_t emit_intrinsic(jl_codectx_t &ctx, intrinsic f, jl_value_t **ar
return mark_julia_type(ctx, ret, false, jl_bool_type);
}

case preferred_vector_width: {
++Emitted_preferred_vector_width;
assert(nargs == 1);
const jl_cgval_t &x = argv[0];
if (!x.constant || !jl_is_datatype(x.constant))
return emit_runtime_call(ctx, f, argv, nargs);
jl_datatype_t *dt = (jl_datatype_t*) x.constant;

// select the appropriate overloaded intrinsic
std::string intr_name = "julia.cpu.preferred_vector_width.";
switch (jl_datatype_size(dt)) {
case 1: {
intr_name += "b1";
break;
}
case 2: {
intr_name += "b2";
break;
}
case 4: {
intr_name += "b4";
break;
}
case 8: {
intr_name += "b8";
break;
}
case 16: {
intr_name += "b16";
break;
}
default:
return emit_runtime_call(ctx, f, argv, nargs);
}

#ifdef _P64
FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, getInt64Ty(ctx.builder.getContext()));
auto ret = ctx.builder.CreateCall(intr);
return mark_julia_type(ctx, ret, false, jl_int64_type);
#else
FunctionCallee intr = jl_Module->getOrInsertFunction(intr_name, getInt32Ty(ctx.builder.getContext()));
auto ret = ctx.builder.CreateCall(intr);
return mark_julia_type(ctx, ret, false, jl_int32_type);
#endif
}

default: {
assert(nargs >= 1 && "invalid nargs for intrinsic call");
const jl_cgval_t &xinfo = argv[0];

// verify argument types
if (!jl_is_primitivetype(xinfo.typ))
if (jl_is_primitivetype(xinfo.typ)){}
else if (is_ntuple_type(xinfo.typ) && jl_nparams(xinfo.typ) > 0)
{
jl_value_t *et = jl_tparam0(xinfo.typ);
if (((jl_datatype_t*)et)->name == jl_vecelement_typename && jl_is_primitivetype(jl_tparam(et, 0)))
et = jl_tparam0(et);
else
return emit_runtime_call(ctx, f, argv, nargs);
}
else
return emit_runtime_call(ctx, f, argv, nargs);
Type *xtyp = bitstype_to_llvm(xinfo.typ, ctx.builder.getContext(), true);
bool isboxed = true;
Type *xtyp = julia_type_to_llvm(ctx, xinfo.typ, &isboxed);
if (float_func()[f])
xtyp = FLOATT(xtyp);
else
xtyp = INTT(xtyp, DL);
if (!xtyp)
return emit_runtime_call(ctx, f, argv, nargs);
////Bool are required to be in the range [0,1]
////so while they are represented as i8,
////the operations need to be done in mod 1
1 change: 1 addition & 0 deletions src/intrinsics.h
@@ -102,6 +102,7 @@
ALIAS(llvmcall, llvmcall) \
/* cpu feature tests */ \
ADD_I(have_fma, 1) \
ADD_I(preferred_vector_width, 1) \
/* hidden intrinsics */ \
ADD_HIDDEN(cglobal_auto, 1)

18 changes: 18 additions & 0 deletions src/julia.h
@@ -1717,6 +1717,24 @@ STATIC_INLINE int jl_is_tuple_type(void *t) JL_NOTSAFEPOINT
((jl_datatype_t*)(t))->name == jl_tuple_typename);
}

STATIC_INLINE int is_ntuple_type(jl_value_t *tt)
{
if (!jl_is_tuple_type(tt))
{
return 0;
}
size_t i, nfields = jl_nparams(tt);
if (!nfields)
return 1;
jl_value_t *t1 = jl_tparam0(tt);
for (i = 1; i < nfields; i++) {
if (jl_tparam(tt, i) != t1) {
return 0;
}
}
return 1;
}

STATIC_INLINE int jl_is_namedtuple_type(void *t) JL_NOTSAFEPOINT
{
return (jl_is_datatype(t) &&
1 change: 1 addition & 0 deletions src/julia_internal.h
@@ -1614,6 +1614,7 @@ JL_DLLEXPORT jl_value_t *jl_copysign_float(jl_value_t *a, jl_value_t *b);
JL_DLLEXPORT jl_value_t *jl_flipsign_int(jl_value_t *a, jl_value_t *b);

JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *a);
JL_DLLEXPORT jl_value_t *jl_preferred_vector_width(jl_value_t *a);
JL_DLLEXPORT int jl_stored_inline(jl_value_t *el_type);
JL_DLLEXPORT jl_value_t *(jl_array_data_owner)(jl_array_t *a);
JL_DLLEXPORT jl_array_t *jl_array_copy(jl_array_t *ary);