This repository was archived by the owner on May 27, 2021. It is now read-only.

Commit 6f3d8a4

Merge branch 'master' into complex-ops
2 parents 345956c + a4a56bd commit 6f3d8a4

24 files changed: +324 -194 lines

.gitattributes

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+CITATION.bib linguist-detectable=false
+test/perf/* linguist-detectable=false

Project.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 name = "CUDAnative"
 uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
-version = "2.2.1"
+version = "2.3.1"

 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"

README.md

Lines changed: 4 additions & 1 deletion

@@ -3,7 +3,7 @@ CUDAnative.jl

 *Support for compiling and executing native Julia kernels on CUDA hardware.*

-[![][docs-latest-img]][docs-latest-url] [![][codecov-img]][codecov-url] [![][doi-img]][doi-url]
+[![][docs-latest-img]][docs-latest-url] [![][discourse-img]][discourse-url] [![][codecov-img]][codecov-url] [![][doi-img]][doi-url]

 [codecov-img]: https://codecov.io/gh/JuliaGPU/CUDAnative.jl/branch/master/graph/badge.svg
 [codecov-url]: https://codecov.io/gh/JuliaGPU/CUDAnative.jl
@@ -14,6 +14,9 @@ CUDAnative.jl
 [doi-img]: https://zenodo.org/badge/DOI/10.1109/TPDS.2018.2872064.svg
 [doi-url]: https://doi.org/10.1109/TPDS.2018.2872064

+[discourse-img]: https://img.shields.io/badge/discourse-julia%20%23gpu-red
+[discourse-url]: https://discourse.julialang.org/c/domain/gpu
+


 Installation

codecov.yml

Lines changed: 1 addition & 2 deletions

@@ -1,7 +1,6 @@
 coverage:
   ignore:
-    - "deps/*"
-    - "src/device/*"
+    - "src/device"
   status:
     patch: false
     project: false

examples/pairwise.jl

Lines changed: 6 additions & 4 deletions

@@ -1,7 +1,9 @@
 # calculate pairwise distance between every point in a vector

 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl


 function haversine_cpu(lat1::Float32, lon1::Float32, lat2::Float32, lon2::Float32, radius::Float32)
@@ -78,12 +80,12 @@ end

 function pairwise_dist_gpu(lat::Vector{Float32}, lon::Vector{Float32})
     # upload
-    lat_gpu = CuTestArray(lat)
-    lon_gpu = CuTestArray(lon)
+    lat_gpu = CuArray(lat)
+    lon_gpu = CuArray(lon)

     # allocate
     n = length(lat)
-    rowresult_gpu = CuTestArray(zeros(Float32, n, n))
+    rowresult_gpu = CuArray(zeros(Float32, n, n))

     # calculate launch configuration
     function get_config(kernel)
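
The change above, repeated in the example diffs that follow, swaps direct use of the test helper type `CuTestArray` for a `const CuArray = CuTestArray` alias, so everything below the include reads exactly as it would in an application built on CuArrays.jl. A minimal sketch of the idea, with a hypothetical `FakeArray` standing in for the helper defined in `test/array.jl`:

    # Hypothetical stand-in for the CuTestArray helper from test/array.jl.
    struct FakeArray{T,N}
        data::Array{T,N}
    end

    # One const binding is enough: all code written afterwards can use the
    # CuArrays.jl name `CuArray` and later be copied into a real application.
    const CuArray = FakeArray

    lat_gpu = CuArray(rand(Float32, 8))   # actually constructs a FakeArray{Float32,1}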

examples/peakflops.jl

Lines changed: 7 additions & 5 deletions

@@ -1,5 +1,7 @@
 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl

 using Test

@@ -30,10 +32,10 @@ function peakflops(n::Integer=5000, dev::CuDevice=CuDevice(0))
     c = round.(rand(Float32, dims) * 100)
     out = similar(a)

-    d_a = CuTestArray(a)
-    d_b = CuTestArray(b)
-    d_c = CuTestArray(c)
-    d_out = CuTestArray(out)
+    d_a = CuArray(a)
+    d_b = CuArray(b)
+    d_c = CuArray(c)
+    d_out = CuArray(out)

     len = prod(dims)
     threads = min(len, 1024)

examples/reduce/benchmark.jl

Lines changed: 2 additions & 2 deletions

@@ -40,8 +40,8 @@ benchmark_gpu = @benchmarkable begin
     val = Array(gpu_output)[1]
 end setup=(
     val = nothing;
-    gpu_input = CuTestArray($input);
-    gpu_output = CuTestArray($output)
+    gpu_input = CuArray($input);
+    gpu_output = CuArray($output)
 ) teardown=(
     gpu_input = nothing;
     gpu_output = nothing

examples/reduce/reduce.jl

Lines changed: 4 additions & 2 deletions

@@ -8,7 +8,9 @@
 # Based on devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/

 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl


 #
@@ -84,7 +86,7 @@ Reduce a large array.

 Kepler-specific implementation, ie. you need sm_30 or higher to run this code.
 """
-function gpu_reduce(op::Function, input::CuTestArray{T}, output::CuTestArray{T}) where {T}
+function gpu_reduce(op::Function, input::CuArray{T}, output::CuArray{T}) where {T}
     len = length(input)

     function get_config(kernel)

examples/reduce/verify.jl

Lines changed: 2 additions & 2 deletions

@@ -16,8 +16,8 @@ cpu_val = reduce(+, input)

 # CUDAnative
 let
-    gpu_input = CuTestArray(input)
-    gpu_output = CuTestArray(output)
+    gpu_input = CuArray(input)
+    gpu_output = CuArray(output)
     gpu_reduce(+, gpu_input, gpu_output)
     gpu_val = Array(gpu_output)[1]
     @assert cpu_val == gpu_val

examples/scan.jl

Lines changed: 4 additions & 2 deletions

@@ -4,7 +4,9 @@
 # Based on https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html

 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl

 function cpu_accumulate!(op::Function, data::Matrix{T}) where {T}
     cols = size(data,2)
@@ -65,7 +67,7 @@ a = rand(Int, rows, cols)
 cpu_a = copy(a)
 cpu_accumulate!(+, cpu_a)

-gpu_a = CuTestArray(a)
+gpu_a = CuArray(a)
 @cuda blocks=cols threads=rows shmem=2*rows*sizeof(eltype(a)) gpu_accumulate!(+, gpu_a)

 using Test

examples/vadd.jl

Lines changed: 6 additions & 4 deletions

@@ -1,7 +1,9 @@
 using Test

 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl

 function vadd(a, b, c)
     i = (blockIdx().x-1) * blockDim().x + threadIdx().x
@@ -14,9 +16,9 @@ a = round.(rand(Float32, dims) * 100)
 b = round.(rand(Float32, dims) * 100)
 c = similar(a)

-d_a = CuTestArray(a)
-d_b = CuTestArray(b)
-d_c = CuTestArray(c)
+d_a = CuArray(a)
+d_b = CuArray(b)
+d_c = CuArray(c)

 len = prod(dims)
 @cuda threads=len vadd(d_a, d_b, d_c)
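
With the alias in place, the body of vadd.jl is untouched and matches what the same program looks like against the real array package. A condensed, self-contained sketch of that flavour (the CuArrays.jl dependency and the final check are illustrative, not part of this commit):

    using CUDAdrv, CUDAnative, CuArrays   # CuArrays.jl provides the real CuArray

    function vadd(a, b, c)
        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
        c[i] = a[i] + b[i]
        return
    end

    a = round.(rand(Float32, 16) * 100)
    b = round.(rand(Float32, 16) * 100)
    d_a, d_b, d_c = CuArray(a), CuArray(b), CuArray(similar(a))

    @cuda threads=length(a) vadd(d_a, d_b, d_c)
    @assert Array(d_c) ≈ a .+ b            # download and compare against the CPU result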

src/compiler/irgen.jl

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ function backtrace(job::CompilerJob, call_stack::Vector{Core.MethodInstance})
         # calls to overdub directly, the backtrace therefore is collapsed and we have to
         # lookup the overdubbed function, but only if we likely are using the generated variant.
         actual_sig = Tuple{method_instance.specTypes.parameters[3:end]...}
-        m = ccall(:jl_gf_invoke_lookup, Any, (Any, UInt), tt, typemax(UInt))
+        m = ccall(:jl_gf_invoke_lookup, Any, (Any, UInt), actual_sig, typemax(UInt))
         method = m.func::Method
     end
     frame = StackTraces.StackFrame(method.name, method.file, method.line)
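
The one-line fix passes the `actual_sig` tuple type built on the previous line to the lookup, instead of `tt`, so the method resolved for the backtrace frame matches the reconstructed signature of the overdubbed call. For intuition, the public-API counterpart of the `jl_gf_invoke_lookup` ccall is `which` applied to a signature type (values below are only illustrative):

    # Given a call signature as a Tuple type, ask which Method dispatch would select;
    # the Method carries the name/file/line used to build a StackFrame above.
    sig = Tuple{typeof(+), Int, Int}
    m = which(sig)
    println(m.name, " at ", m.file, ":", m.line)   # prints something like: + at int.jl:<line>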

src/compiler/rtlib.jl

Lines changed: 10 additions & 5 deletions

@@ -84,13 +84,18 @@ end
 # NOTE: maybe we should use XDG_CACHE_PATH/%LOCALAPPDATA%, but other Julia cache files
 # are put in .julia anyway so let's just follow suit for now.
 function cachedir()
-    # this mimicks Base.compilecache. we can't just call the function, or we micht actually
+    # this mimicks Base.compilecache. we can't just call the function, or we might actually
     # _generate_ a cache file, e.g., when running with `--compiled-modules=no`.
-    cachefile = abspath(DEPOT_PATH[1], Base.cache_file_entry(Base.PkgId(CUDAnative)))
+    if VERSION >= v"1.3.0-alpha.146"
+        entrypath, entryfile = Base.cache_file_entry(Base.PkgId(CUDAnative))
+        abspath(DEPOT_PATH[1], entrypath, entryfile)
+    else
+        cachefile = abspath(DEPOT_PATH[1], Base.cache_file_entry(Base.PkgId(CUDAnative)))

-    # the cachefile consists of `/depot/compiled/vXXX/CUDAnative/$slug.ji`
-    # transform that into `/depot/compiled/vXXX/CUDAnative/$slug/`
-    splitext(cachefile)[1]
+        # the cachefile consists of `/depot/compiled/vXXX/CUDAnative/$slug.ji`
+        # transform that into `/depot/compiled/vXXX/CUDAnative/$slug/`
+        splitext(cachefile)[1]
+    end
 end

 runtimedir() = joinpath(cachedir(), "runtime")
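
The new branch reflects a change in the internal `Base.cache_file_entry` helper: from `v"1.3.0-alpha.146"` onwards it returns a path/file pair, while older releases returned a single `.ji` path whose extension had to be stripped. Gating on `VERSION` like this is the usual way to keep both code paths alive; a stand-alone sketch of the same idiom, with a hypothetical `pkg_cache_dir` helper and the branch selected once at load time:

    # Illustrative only: choose an implementation when the file is loaded,
    # based on the running Julia version (VERSION is a constant).
    if VERSION >= v"1.3.0-alpha.146"
        # newer Julia: cache_file_entry returns a (path, file) tuple
        pkg_cache_dir(pkg::Base.PkgId) = abspath(DEPOT_PATH[1], Base.cache_file_entry(pkg)...)
    else
        # older Julia: a single `/depot/compiled/vXXX/Pkg/$slug.ji` path; drop the extension
        pkg_cache_dir(pkg::Base.PkgId) = splitext(abspath(DEPOT_PATH[1], Base.cache_file_entry(pkg)))[1]
    end

    # pkg_cache_dir(Base.PkgId(CUDAnative))  -- with the package loaded, a directory under ~/.julia/compiled/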

src/compiler/validation.jl

Lines changed: 5 additions & 2 deletions

@@ -264,10 +264,13 @@ function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst)
         frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0)
         if length(frames) >= 1
             @compiler_assert length(frames) == 1 job frames=frames
-            fn, file, line, linfo, fromC, inlined, ip = last(frames)
+            if VERSION >= v"1.4.0-DEV.123"
+                fn, file, line, linfo, fromC, inlined = last(frames)
+            else
+                fn, file, line, linfo, fromC, inlined, ip = last(frames)
+            end
             push!(errors, (POINTER_FUNCTION, bt, fn))
         else
-            fn, file, line, linfo, fromC, inlined, ip = last(frames)
             push!(errors, (POINTER_FUNCTION, bt, nothing))
         end
     end

src/device/cuda/math.jl

Lines changed: 4 additions & 4 deletions

@@ -156,11 +156,11 @@
 @inline isfinite(x::Float32) = (@wrap __nv_finitef(x::float)::i32) != 0
 @inline isfinite(x::Float64) = (@wrap __nv_isfinited(x::double)::i32) != 0

-@inline isinf(x::Float32) = (@wrap __nv_isinfd(x::double)::i32) != 0
-@inline isinf(x::Float64) = (@wrap __nv_isinff(x::float)::i32) != 0
+@inline isinf(x::Float64) = (@wrap __nv_isinfd(x::double)::i32) != 0
+@inline isinf(x::Float32) = (@wrap __nv_isinff(x::float)::i32) != 0

-@inline isnan(x::Float32) = (@wrap __nv_isnand(x::double)::i32) != 0
-@inline isnan(x::Float64) = (@wrap __nv_isnanf(x::float)::i32) != 0
+@inline isnan(x::Float64) = (@wrap __nv_isnand(x::double)::i32) != 0
+@inline isnan(x::Float32) = (@wrap __nv_isnanf(x::float)::i32) != 0

 @inline nearbyint(x::Float64) = @wrap __nv_nearbyint(x::double)::double
 @inline nearbyint(x::Float32) = @wrap __nv_nearbyintf(x::float)::float
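
The previous definitions had the two precisions crossed: the `Float32` methods forwarded to the double-precision libdevice intrinsics (`__nv_isinfd`, `__nv_isnand`) and vice versa. The fix pairs each method with the intrinsic of matching width, following libdevice's `f` suffix for single precision. The host-side behaviour these device methods are meant to mirror is simply that of Base, per precision:

    # Base semantics, per floating-point width, that the corrected device
    # definitions of isinf/isnan/isfinite are expected to reproduce.
    @assert isinf(Inf32) && isinf(Inf)          # Float32 and Float64 infinities
    @assert isnan(NaN32) && isnan(NaN)          # Float32 and Float64 NaNs
    @assert isfinite(1.0f0) && isfinite(1.0)    # finite values of both widths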

src/device/cuda/output.jl

Lines changed: 3 additions & 4 deletions

@@ -206,8 +206,7 @@ end

 @doc (@doc @cuprint) ->
 macro cuprintln(parts...)
-    parts = map(part -> isa(part, Expr) || isa(part, Symbol) ? esc(part) : part, parts)
-    quote
-        @cuprint($(parts...), "\n")
-    end
+    esc(quote
+        CUDAnative.@cuprint($(parts...), "\n")
+    end)
 end
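
The rewritten macro escapes the whole expansion and fully qualifies the inner call as `CUDAnative.@cuprint`, so `@cuprintln` no longer depends on `@cuprint` being visible unqualified in the caller's module. A stripped-down illustration of the same hygiene pattern, using hypothetical host-side macros in place of the device printing ones:

    module Printing

    macro myprint(parts...)
        # stand-in for the real device-side printing; here it just prints on the host
        :(print($(map(esc, parts)...)))
    end

    # Forwarding macro: escape the whole expansion and spell out the module-qualified
    # inner macro, so it resolves no matter what the caller has imported.
    macro myprintln(parts...)
        esc(quote
            Printing.@myprint($(parts...), "\n")
        end)
    end

    end # module

    Printing.@myprintln("answer = ", 42)   # prints "answer = 42" followed by a newline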
