diff --git a/doc/src/manual/calling-c-and-fortran-code.md b/doc/src/manual/calling-c-and-fortran-code.md index d198c796a2e0b..aa317468b0f75 100644 --- a/doc/src/manual/calling-c-and-fortran-code.md +++ b/doc/src/manual/calling-c-and-fortran-code.md @@ -547,15 +547,14 @@ is not valid, since the type layout of `T` is not known statically. ### SIMD Values -Note: This feature is currently implemented on 64-bit x86 and AArch64 platforms only. - If a C/C++ routine has an argument or return value that is a native SIMD type, the corresponding Julia type is a homogeneous tuple of `VecElement` that naturally maps to the SIMD type. Specifically: -> * The tuple must be the same size as the SIMD type. For example, a tuple representing an `__m128` -> on x86 must have a size of 16 bytes. -> * The element type of the tuple must be an instance of `VecElement{T}` where `T` is a primitive type that -> is 1, 2, 4 or 8 bytes. +> * The tuple must be the same size and elements as the SIMD type. For example, a tuple +> representing an `__m128` on x86 must have a size of 16 bytes and Float32 elements. +> * The element type of the tuple must be an instance of `VecElement{T}` where `T` is a +> primitive type with a power-of-two number of bytes (e.g. 1, 2, 4, 8, 16, etc) such as +> Int8 or Float64. For instance, consider this C routine that uses AVX intrinsics: @@ -628,6 +627,10 @@ For translating a C argument list to Julia: * `T`, where `T` is a concrete Julia type * argument value will be copied (passed by value) + * `vector T` (or `__attribute__ vector_size`, or a typedef such as `__m128`) + + * `NTuple{N, VecElement{T}}`, where `T` is a primitive Julia type of the correct size + and N is the number of elements in the vector (equal to `vector_size / sizeof T`). * `void*` * depends on how this parameter is used, first translate this to the intended pointer type, then @@ -674,13 +677,16 @@ For translating a C return type to Julia: * `T`, where `T` is one of the primitive types: `char`, `int`, `long`, `short`, `float`, `double`, `complex`, `enum` or any of their `typedef` equivalents - * `T`, where `T` is an equivalent Julia Bits Type (per the table above) - * if `T` is an `enum`, the argument type should be equivalent to `Cint` or `Cuint` + * same as C argument list * argument value will be copied (returned by-value) * `struct T` (including typedef to a struct) - * `T`, where `T` is a concrete Julia Type + * same as C argument list * argument value will be copied (returned by-value) + + * `vector T` + + * same as C argument list * `void*` * depends on how this parameter is used, first translate this to the intended pointer type, then diff --git a/src/cgutils.cpp b/src/cgutils.cpp index 8c40e12539598..fb4c9aa8cb415 100644 --- a/src/cgutils.cpp +++ b/src/cgutils.cpp @@ -3103,11 +3103,7 @@ static jl_cgval_t emit_getfield_knownidx(jl_codectx_t &ctx, const jl_cgval_t &st else if (strct.ispointer()) { auto tbaa = best_field_tbaa(ctx, strct, jt, idx, byte_offset); Value *staddr = data_pointer(ctx, strct); - Value *addr; - if (jl_is_vecelement_type((jl_value_t*)jt) || byte_offset == 0) - addr = staddr; // VecElement types are unwrapped in LLVM. - else - addr = emit_ptrgep(ctx, staddr, byte_offset); + Value *addr = (byte_offset == 0 ? staddr : emit_ptrgep(ctx, staddr, byte_offset)); if (addr != staddr) setNameWithField(ctx.emission_context, addr, get_objname, jt, idx, Twine("_ptr")); if (jl_field_isptr(jt, idx)) { @@ -3571,7 +3567,7 @@ static void union_alloca_type(jl_uniontype_t *ut, [&](unsigned idx, jl_datatype_t *jt) { if (!jl_is_datatype_singleton(jt)) { size_t nb1 = jl_datatype_size(jt); - size_t align1 = jl_datatype_align(jt); + size_t align1 = julia_alignment((jl_value_t*)jt); if (nb1 > nbytes) nbytes = nb1; if (align1 > align) @@ -4133,10 +4129,11 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg // choose whether we should perform the initialization with the struct as a IR value // or instead initialize the stack buffer with stores (the later is nearly always better) + // although we do the former if it is a vector or could be a vector element auto tracked = split_value_size(sty); assert(CountTrackedPointers(lt).count == tracked.second); bool init_as_value = false; - if (lt->isVectorTy() || jl_is_vecelement_type(ty)) { // maybe also check the size ? + if (lt->isVectorTy() || jl_special_vector_alignment(1, ty) != 0) { init_as_value = true; } @@ -4343,7 +4340,7 @@ static jl_cgval_t emit_new_struct(jl_codectx_t &ctx, jl_value_t *ty, size_t narg if (strct) { jl_aliasinfo_t ai = jl_aliasinfo_t::fromTBAA(ctx, ctx.tbaa().tbaa_stack); promotion_point = ai.decorateInst(ctx.builder.CreateMemSet(strct, ConstantInt::get(getInt8Ty(ctx.builder.getContext()), 0), - jl_datatype_size(ty), MaybeAlign(jl_datatype_align(ty)))); + jl_datatype_size(ty), Align(julia_alignment(ty)))); } ctx.builder.restoreIP(savedIP); } diff --git a/src/datatype.c b/src/datatype.c index 240b1ace2295f..f2f633adc3623 100644 --- a/src/datatype.c +++ b/src/datatype.c @@ -300,9 +300,10 @@ static jl_datatype_layout_t *jl_get_layout(uint32_t sz, } // Determine if homogeneous tuple with fields of type t will have -// a special alignment beyond normal Julia rules. +// a special alignment and vector-ABI beyond normal rules for aggregates. // Return special alignment if one exists, 0 if normal alignment rules hold. // A non-zero result *must* match the LLVM rules for a vector type . +// Matching the compiler's `__attribute__ vector_size` behavior. // For sake of Ahead-Of-Time (AOT) compilation, this routine has to work // without LLVM being available. unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t) @@ -317,8 +318,12 @@ unsigned jl_special_vector_alignment(size_t nfields, jl_value_t *t) // motivating use case comes up for Julia, we reject pointers. return 0; size_t elsz = jl_datatype_size(ty); - if (elsz != 1 && elsz != 2 && elsz != 4 && elsz != 8) - // Only handle power-of-two-sized elements (for now) + if (next_power_of_two(elsz) != elsz) + // Only handle power-of-two-sized elements (for now), since other + // lengths may be packed into very complicated arrangements (llvm pads + // extra bits on most platforms when computing alignment but not when + // computing type size, but adds no extra bytes for each element, so + // their effect on offsets are never what you may naturally expect). return 0; size_t size = nfields * elsz; // Use natural alignment for this vector: this matches LLVM and clang. @@ -723,9 +728,9 @@ void jl_compute_field_offsets(jl_datatype_t *st) } else { fsz = sizeof(void*); - if (fsz > MAX_ALIGN) - fsz = MAX_ALIGN; al = fsz; + if (al > MAX_ALIGN) + al = MAX_ALIGN; desc[i].isptr = 1; zeroinit = 1; npointers++; @@ -769,8 +774,6 @@ void jl_compute_field_offsets(jl_datatype_t *st) if (al > alignm) alignm = al; } - if (alignm > MAX_ALIGN) - alignm = MAX_ALIGN; // We cannot guarantee alignments over 16 bytes because that's what our heap is aligned as if (LLT_ALIGN(sz, alignm) > sz) { haspadding = 1; sz = LLT_ALIGN(sz, alignm); @@ -939,6 +942,14 @@ JL_DLLEXPORT jl_datatype_t *jl_new_primitivetype(jl_value_t *name, jl_module_t * uint32_t nbytes = (nbits + 7) / 8; uint32_t alignm = next_power_of_two(nbytes); # if defined(_CPU_X86_) && !defined(_OS_WINDOWS_) + // datalayout strings are often weird: on 64-bit they usually follow fairly simple rules, + // but on x86 32 bit platforms, sometimes 5 to 8 byte types are + // 32-bit aligned even though the MAX_ALIGN (for types 9+ bytes) is 16 + // (except for f80 which is align 4 on Mingw, Linux, and BSDs--but align 16 on MSVC and Darwin) + // https://llvm.org/doxygen/ARMTargetMachine_8cpp.html#adb29b487708f0dc2a940345b68649270 + // https://llvm.org/doxygen/AArch64TargetMachine_8cpp.html#a003a58caf135efbf7273c5ed84e700d7 + // https://llvm.org/doxygen/X86TargetMachine_8cpp.html#aefdbcd6131ef195da070cef7fdaf0532 + // 32-bit alignment is weird if (alignm == 8) alignm = 4; # endif diff --git a/test/core.jl b/test/core.jl index fd607cc86f1d5..b27a648f543d1 100644 --- a/test/core.jl +++ b/test/core.jl @@ -5773,6 +5773,13 @@ let ni128 = sizeof(FP128test) รท sizeof(Int), @test reinterpret(UInt128, arr[2].fp) == expected end +# make sure VecElement Tuple has the C alignment and ABI for supported types +primitive type Int24 24 end +@test Base.datatype_alignment(NTuple{10,VecElement{Int16}}) == 32 +@test Base.datatype_alignment(NTuple{10,VecElement{Int24}}) == 4 +@test Base.datatype_alignment(NTuple{10,VecElement{Int64}}) == 128 +@test Base.datatype_alignment(NTuple{10,VecElement{Int128}}) == 256 + # issue #21516 struct T21516 x::Vector{Float64}