This repository was archived by the owner on May 27, 2021. It is now read-only.

Commit 6f3d8a4

Merge branch 'master' into complex-ops
2 parents 345956c + a4a56bd commit 6f3d8a4

24 files changed: +324 -194 lines

.gitattributes

Lines changed: 2 additions & 0 deletions

@@ -0,0 +1,2 @@
+CITATION.bib linguist-detectable=false
+test/perf/* linguist-detectable=false

Project.toml

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 name = "CUDAnative"
 uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
-version = "2.2.1"
+version = "2.3.1"

 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"

README.md

Lines changed: 4 additions & 1 deletion

@@ -3,7 +3,7 @@ CUDAnative.jl

 *Support for compiling and executing native Julia kernels on CUDA hardware.*

-[![][docs-latest-img]][docs-latest-url] [![][codecov-img]][codecov-url] [![][doi-img]][doi-url]
+[![][docs-latest-img]][docs-latest-url] [![][discourse-img]][discourse-url] [![][codecov-img]][codecov-url] [![][doi-img]][doi-url]

 [codecov-img]: https://codecov.io/gh/JuliaGPU/CUDAnative.jl/branch/master/graph/badge.svg
 [codecov-url]: https://codecov.io/gh/JuliaGPU/CUDAnative.jl
@@ -14,6 +14,9 @@ CUDAnative.jl
 [doi-img]: https://zenodo.org/badge/DOI/10.1109/TPDS.2018.2872064.svg
 [doi-url]: https://doi.org/10.1109/TPDS.2018.2872064

+[discourse-img]: https://img.shields.io/badge/discourse-julia%20%23gpu-red
+[discourse-url]: https://discourse.julialang.org/c/domain/gpu
+


 Installation

codecov.yml

Lines changed: 1 addition & 2 deletions

@@ -1,7 +1,6 @@
 coverage:
   ignore:
-    - "deps/*"
-    - "src/device/*"
+    - "src/device"
   status:
     patch: false
     project: false

examples/pairwise.jl

Lines changed: 6 additions & 4 deletions

@@ -1,7 +1,9 @@
 # calculate pairwise distance between every point in a vector

 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl


 function haversine_cpu(lat1::Float32, lon1::Float32, lat2::Float32, lon2::Float32, radius::Float32)
@@ -78,12 +80,12 @@ end

 function pairwise_dist_gpu(lat::Vector{Float32}, lon::Vector{Float32})
     # upload
-    lat_gpu = CuTestArray(lat)
-    lon_gpu = CuTestArray(lon)
+    lat_gpu = CuArray(lat)
+    lon_gpu = CuArray(lon)

     # allocate
     n = length(lat)
-    rowresult_gpu = CuTestArray(zeros(Float32, n, n))
+    rowresult_gpu = CuArray(zeros(Float32, n, n))

     # calculate launch configuration
     function get_config(kernel)
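
The change above, repeated in the example diffs that follow, swaps direct use of the test helper type `CuTestArray` for a `const CuArray = CuTestArray` alias, so everything below the include reads exactly as it would in an application built on CuArrays.jl. A minimal sketch of the idea, with a hypothetical `FakeArray` standing in for the helper defined in `test/array.jl`:

    # Hypothetical stand-in for the CuTestArray helper from test/array.jl.
    struct FakeArray{T,N}
        data::Array{T,N}
    end

    # One const binding is enough: all code written afterwards can use the
    # CuArrays.jl name `CuArray` and later be copied into a real application.
    const CuArray = FakeArray

    lat_gpu = CuArray(rand(Float32, 8))   # actually constructs a FakeArray{Float32,1}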

examples/peakflops.jl

Lines changed: 7 additions & 5 deletions

@@ -1,5 +1,7 @@
 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl

 using Test

@@ -30,10 +32,10 @@ function peakflops(n::Integer=5000, dev::CuDevice=CuDevice(0))
     c = round.(rand(Float32, dims) * 100)
     out = similar(a)

-    d_a = CuTestArray(a)
-    d_b = CuTestArray(b)
-    d_c = CuTestArray(c)
-    d_out = CuTestArray(out)
+    d_a = CuArray(a)
+    d_b = CuArray(b)
+    d_c = CuArray(c)
+    d_out = CuArray(out)

     len = prod(dims)
     threads = min(len, 1024)

examples/reduce/benchmark.jl

Lines changed: 2 additions & 2 deletions

@@ -40,8 +40,8 @@ benchmark_gpu = @benchmarkable begin
     val = Array(gpu_output)[1]
 end setup=(
     val = nothing;
-    gpu_input = CuTestArray($input);
-    gpu_output = CuTestArray($output)
+    gpu_input = CuArray($input);
+    gpu_output = CuArray($output)
 ) teardown=(
     gpu_input = nothing;
     gpu_output = nothing

examples/reduce/reduce.jl

Lines changed: 4 additions & 2 deletions

@@ -8,7 +8,9 @@
 # Based on devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/

 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl


 #
@@ -84,7 +86,7 @@ Reduce a large array.

 Kepler-specific implementation, ie. you need sm_30 or higher to run this code.
 """
-function gpu_reduce(op::Function, input::CuTestArray{T}, output::CuTestArray{T}) where {T}
+function gpu_reduce(op::Function, input::CuArray{T}, output::CuArray{T}) where {T}
     len = length(input)

     function get_config(kernel)

examples/reduce/verify.jl

Lines changed: 2 additions & 2 deletions

@@ -16,8 +16,8 @@ cpu_val = reduce(+, input)

 # CUDAnative
 let
-    gpu_input = CuTestArray(input)
-    gpu_output = CuTestArray(output)
+    gpu_input = CuArray(input)
+    gpu_output = CuArray(output)
     gpu_reduce(+, gpu_input, gpu_output)
     gpu_val = Array(gpu_output)[1]
     @assert cpu_val == gpu_val

examples/scan.jl

Lines changed: 4 additions & 2 deletions

@@ -4,7 +4,9 @@
 # Based on https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html

 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl

 function cpu_accumulate!(op::Function, data::Matrix{T}) where {T}
     cols = size(data,2)
@@ -65,7 +67,7 @@ a = rand(Int, rows, cols)
 cpu_a = copy(a)
 cpu_accumulate!(+, cpu_a)

-gpu_a = CuTestArray(a)
+gpu_a = CuArray(a)
 @cuda blocks=cols threads=rows shmem=2*rows*sizeof(eltype(a)) gpu_accumulate!(+, gpu_a)

 using Test

examples/vadd.jl

Lines changed: 6 additions & 4 deletions

@@ -1,7 +1,9 @@
 using Test

 using CUDAdrv, CUDAnative
-include(joinpath(@__DIR__, "..", "test", "array.jl")) # real applications: use CuArrays.jl
+
+include(joinpath(@__DIR__, "..", "test", "array.jl"))
+const CuArray = CuTestArray # real applications: use CuArrays.jl

 function vadd(a, b, c)
     i = (blockIdx().x-1) * blockDim().x + threadIdx().x
@@ -14,9 +16,9 @@ a = round.(rand(Float32, dims) * 100)
 b = round.(rand(Float32, dims) * 100)
 c = similar(a)

-d_a = CuTestArray(a)
-d_b = CuTestArray(b)
-d_c = CuTestArray(c)
+d_a = CuArray(a)
+d_b = CuArray(b)
+d_c = CuArray(c)

 len = prod(dims)
 @cuda threads=len vadd(d_a, d_b, d_c)
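
With the alias in place, the body of vadd.jl is untouched and matches what the same program looks like against the real array package. A condensed, self-contained sketch of that flavour (the CuArrays.jl dependency and the final check are illustrative, not part of this commit):

    using CUDAdrv, CUDAnative, CuArrays   # CuArrays.jl provides the real CuArray

    function vadd(a, b, c)
        i = (blockIdx().x-1) * blockDim().x + threadIdx().x
        c[i] = a[i] + b[i]
        return
    end

    a = round.(rand(Float32, 16) * 100)
    b = round.(rand(Float32, 16) * 100)
    d_a, d_b, d_c = CuArray(a), CuArray(b), CuArray(similar(a))

    @cuda threads=length(a) vadd(d_a, d_b, d_c)
    @assert Array(d_c) ≈ a .+ b            # download and compare against the CPU result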

src/compiler/irgen.jl

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ function backtrace(job::CompilerJob, call_stack::Vector{Core.MethodInstance})
         # calls to overdub directly, the backtrace therefore is collapsed and we have to
         # lookup the overdubbed function, but only if we likely are using the generated variant.
         actual_sig = Tuple{method_instance.specTypes.parameters[3:end]...}
-        m = ccall(:jl_gf_invoke_lookup, Any, (Any, UInt), tt, typemax(UInt))
+        m = ccall(:jl_gf_invoke_lookup, Any, (Any, UInt), actual_sig, typemax(UInt))
         method = m.func::Method
     end
     frame = StackTraces.StackFrame(method.name, method.file, method.line)
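
The one-line fix passes the `actual_sig` tuple type built on the previous line to the lookup, instead of `tt`, so the method resolved for the backtrace frame matches the reconstructed signature of the overdubbed call. For intuition, the public-API counterpart of the `jl_gf_invoke_lookup` ccall is `which` applied to a signature type (values below are only illustrative):

    # Given a call signature as a Tuple type, ask which Method dispatch would select;
    # the Method carries the name/file/line used to build a StackFrame above.
    sig = Tuple{typeof(+), Int, Int}
    m = which(sig)
    println(m.name, " at ", m.file, ":", m.line)   # prints something like: + at int.jl:<line>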

src/compiler/rtlib.jl

Lines changed: 10 additions & 5 deletions

@@ -84,13 +84,18 @@ end
 # NOTE: maybe we should use XDG_CACHE_PATH/%LOCALAPPDATA%, but other Julia cache files
 # are put in .julia anyway so let's just follow suit for now.
 function cachedir()
-    # this mimicks Base.compilecache. we can't just call the function, or we micht actually
+    # this mimicks Base.compilecache. we can't just call the function, or we might actually
     # _generate_ a cache file, e.g., when running with `--compiled-modules=no`.
-    cachefile = abspath(DEPOT_PATH[1], Base.cache_file_entry(Base.PkgId(CUDAnative)))
+    if VERSION >= v"1.3.0-alpha.146"
+        entrypath, entryfile = Base.cache_file_entry(Base.PkgId(CUDAnative))
+        abspath(DEPOT_PATH[1], entrypath, entryfile)
+    else
+        cachefile = abspath(DEPOT_PATH[1], Base.cache_file_entry(Base.PkgId(CUDAnative)))

-    # the cachefile consists of `/depot/compiled/vXXX/CUDAnative/$slug.ji`
-    # transform that into `/depot/compiled/vXXX/CUDAnative/$slug/`
-    splitext(cachefile)[1]
+        # the cachefile consists of `/depot/compiled/vXXX/CUDAnative/$slug.ji`
+        # transform that into `/depot/compiled/vXXX/CUDAnative/$slug/`
+        splitext(cachefile)[1]
+    end
 end

 runtimedir() = joinpath(cachedir(), "runtime")
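
The new branch reflects a change in the internal `Base.cache_file_entry` helper: from `v"1.3.0-alpha.146"` onwards it returns a path/file pair, while older releases returned a single `.ji` path whose extension had to be stripped. Gating on `VERSION` like this is the usual way to keep both code paths alive; a stand-alone sketch of the same idiom, with a hypothetical `pkg_cache_dir` helper and the branch selected once at load time:

    # Illustrative only: choose an implementation when the file is loaded,
    # based on the running Julia version (VERSION is a constant).
    if VERSION >= v"1.3.0-alpha.146"
        # newer Julia: cache_file_entry returns a (path, file) tuple
        pkg_cache_dir(pkg::Base.PkgId) = abspath(DEPOT_PATH[1], Base.cache_file_entry(pkg)...)
    else
        # older Julia: a single `/depot/compiled/vXXX/Pkg/$slug.ji` path; drop the extension
        pkg_cache_dir(pkg::Base.PkgId) = splitext(abspath(DEPOT_PATH[1], Base.cache_file_entry(pkg)))[1]
    end

    # pkg_cache_dir(Base.PkgId(CUDAnative))  -- with the package loaded, a directory under ~/.julia/compiled/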

src/compiler/validation.jl

Lines changed: 5 additions & 2 deletions

@@ -264,10 +264,13 @@ function check_ir!(job, errors::Vector{IRError}, inst::LLVM.CallInst)
         frames = ccall(:jl_lookup_code_address, Any, (Ptr{Cvoid}, Cint,), ptr, 0)
         if length(frames) >= 1
             @compiler_assert length(frames) == 1 job frames=frames
-            fn, file, line, linfo, fromC, inlined, ip = last(frames)
+            if VERSION >= v"1.4.0-DEV.123"
+                fn, file, line, linfo, fromC, inlined = last(frames)
+            else
+                fn, file, line, linfo, fromC, inlined, ip = last(frames)
+            end
             push!(errors, (POINTER_FUNCTION, bt, fn))
         else
-            fn, file, line, linfo, fromC, inlined, ip = last(frames)
             push!(errors, (POINTER_FUNCTION, bt, nothing))
         end
     end

src/device/cuda/math.jl

Lines changed: 4 additions & 4 deletions

@@ -156,11 +156,11 @@
 @inline isfinite(x::Float32) = (@wrap __nv_finitef(x::float)::i32) != 0
 @inline isfinite(x::Float64) = (@wrap __nv_isfinited(x::double)::i32) != 0

-@inline isinf(x::Float32) = (@wrap __nv_isinfd(x::double)::i32) != 0
-@inline isinf(x::Float64) = (@wrap __nv_isinff(x::float)::i32) != 0
+@inline isinf(x::Float64) = (@wrap __nv_isinfd(x::double)::i32) != 0
+@inline isinf(x::Float32) = (@wrap __nv_isinff(x::float)::i32) != 0

-@inline isnan(x::Float32) = (@wrap __nv_isnand(x::double)::i32) != 0
-@inline isnan(x::Float64) = (@wrap __nv_isnanf(x::float)::i32) != 0
+@inline isnan(x::Float64) = (@wrap __nv_isnand(x::double)::i32) != 0
+@inline isnan(x::Float32) = (@wrap __nv_isnanf(x::float)::i32) != 0

 @inline nearbyint(x::Float64) = @wrap __nv_nearbyint(x::double)::double
 @inline nearbyint(x::Float32) = @wrap __nv_nearbyintf(x::float)::float
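
The previous definitions had the two precisions crossed: the `Float32` methods forwarded to the double-precision libdevice intrinsics (`__nv_isinfd`, `__nv_isnand`) and vice versa. The fix pairs each method with the intrinsic of matching width, following libdevice's `f` suffix for single precision. The host-side behaviour these device methods are meant to mirror is simply that of Base, per precision:

    # Base semantics, per floating-point width, that the corrected device
    # definitions of isinf/isnan/isfinite are expected to reproduce.
    @assert isinf(Inf32) && isinf(Inf)          # Float32 and Float64 infinities
    @assert isnan(NaN32) && isnan(NaN)          # Float32 and Float64 NaNs
    @assert isfinite(1.0f0) && isfinite(1.0)    # finite values of both widths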

src/device/cuda/output.jl

Lines changed: 3 additions & 4 deletions

@@ -206,8 +206,7 @@ end

 @doc (@doc @cuprint) ->
 macro cuprintln(parts...)
-    parts = map(part -> isa(part, Expr) || isa(part, Symbol) ? esc(part) : part, parts)
-    quote
-        @cuprint($(parts...), "\n")
-    end
+    esc(quote
+        CUDAnative.@cuprint($(parts...), "\n")
+    end)
 end
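
The rewritten macro escapes the whole expansion and fully qualifies the inner call as `CUDAnative.@cuprint`, so `@cuprintln` no longer depends on `@cuprint` being visible unqualified in the caller's module. A stripped-down illustration of the same hygiene pattern, using hypothetical host-side macros in place of the device printing ones:

    module Printing

    macro myprint(parts...)
        # stand-in for the real device-side printing; here it just prints on the host
        :(print($(map(esc, parts)...)))
    end

    # Forwarding macro: escape the whole expansion and spell out the module-qualified
    # inner macro, so it resolves no matter what the caller has imported.
    macro myprintln(parts...)
        esc(quote
            Printing.@myprint($(parts...), "\n")
        end)
    end

    end # module

    Printing.@myprintln("answer = ", 42)   # prints "answer = 42" followed by a newline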
