Get the performance of ArrayPartition up to that of Vector #122


Closed
ChrisRackauckas opened this issue Aug 7, 2017 · 2 comments

ChrisRackauckas commented Aug 7, 2017

Some quick tests show that, when using broadcast, ArrayPartitions do quite well:

using RecursiveArrayTools
u0 = zeros(100)
v0 = ones(100)
t1 = ArrayPartition((u0,v0))
t2 = [u0;v0]
t12 = ArrayPartition((similar(u0),similar(v0)))
t22 = similar(t2)

using BenchmarkTools
function f(t12,t1)
    for i in eachindex(t12.x)
        t12.x[i] .= t1.x[i] .+ t1.x[i]
    end
end
@benchmark f(t12,t1)

#=
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     70.867 ns (0.00% GC)
  median time:      74.457 ns (0.00% GC)
  mean time:        81.489 ns (0.00% GC)
  maximum time:     249.982 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     979
 =#

function g(t22,t2)
    t22 .= t2 .+ t2
end
@benchmark g(t22,t2)

#=
BenchmarkTools.Trial:
  memory estimate:  64 bytes
  allocs estimate:  3
  --------------
  minimum time:     247.830 ns (0.00% GC)
  median time:      264.466 ns (0.00% GC)
  mean time:        294.141 ns (1.59% GC)
  maximum time:     6.066 μs (91.20% GC)
  --------------
  samples:          10000
  evals/sample:     352
=#

# Julia 0.6-style generated broadcast! that recurses into each partition,
# rewriting only ArrayPartition arguments to index into their `.x[i]` field:
add_idxs(x,expr) = expr
add_idxs{T<:ArrayPartition}(::Type{T},expr) = :($(expr).x[i])
@generated function Base.broadcast!(f,A::ArrayPartition,B...)
  exs = ((add_idxs(B[i],:(B[$i])) for i in eachindex(B))...)
  :(for i in eachindex(A.x)
    broadcast!(f,A.x[i],$(exs...))
  end)
end
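For readers on modern Julia, where the 0.6 `add_idxs{T<:ArrayPartition}` syntax above no longer parses, the same idea can be sketched without a generated function. This is a minimal, self-contained stand-in (the hypothetical `Partitioned` type and `bcast2!` helper are illustrations, not the RecursiveArrayTools implementation):

```julia
# Hypothetical minimal stand-in for ArrayPartition: a wrapper around a tuple of arrays.
struct Partitioned{T<:Tuple}
    x::T
end

# In-place binary broadcast applied partition-by-partition, so each inner
# broadcast! specializes on that partition's concrete array type.
function bcast2!(f, dest::Partitioned, a::Partitioned, b::Partitioned)
    for i in eachindex(dest.x)
        broadcast!(f, dest.x[i], a.x[i], b.x[i])
    end
    return dest
end

u = Partitioned((zeros(3), ones(3)))
d = Partitioned((similar(u.x[1]), similar(u.x[2])))
bcast2!(+, d, u, u)  # d now holds (zeros(3), fill(2.0, 3))
```

The per-partition loop is what keeps the inner broadcasts allocation-free even when the partitions have heterogeneous element types.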

function h(t12,t1)
    t12 .= t1 .+ t1
end
@benchmark h(t12,t1)

#=
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     60.261 ns (0.00% GC)
  median time:      62.329 ns (0.00% GC)
  mean time:        68.939 ns (0.00% GC)
  maximum time:     225.685 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     991
=#

u0 = zeros(2)
v0 = ones(2)
t1 = ArrayPartition((u0,v0))
t2 = [u0;v0]
t12 = ArrayPartition((similar(u0),similar(v0)))
t22 = similar(t2)

@benchmark f(t12,t1)

#=
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     36.886 ns (0.00% GC)
  median time:      39.520 ns (0.00% GC)
  mean time:        43.894 ns (0.00% GC)
  maximum time:     173.009 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000
=#

@benchmark g(t22,t2)

#=
BenchmarkTools.Trial:
  memory estimate:  64 bytes
  allocs estimate:  3
  --------------
  minimum time:     211.612 ns (0.00% GC)
  median time:      226.932 ns (0.00% GC)
  mean time:        253.315 ns (1.97% GC)
  maximum time:     4.982 μs (92.19% GC)
  --------------
  samples:          10000
  evals/sample:     516
=#

@benchmark h(t12,t1)

#=
BenchmarkTools.Trial:
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     27.224 ns (0.00% GC)
  median time:      28.689 ns (0.00% GC)
  mean time:        32.268 ns (0.00% GC)
  maximum time:     168.033 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1000
=#

This is great because it allows for heterogeneously typed arrays and forms the basis of the partitioned methods. But how does it do inside the full ODE solver?

using DiffEqBase, OrdinaryDiffEq, Base.Test, RecursiveArrayTools, DiffEqDevTools
using BenchmarkTools

u0 = zeros(100)
v0 = ones(100)
f1 = function (t,u,v,du)
  du .= v
end
f2 = function (t,u,v,dv)
  dv .= -u
end
function (::typeof(f2))(::Type{Val{:analytic}}, x, y0)
  u0, v0 = y0
  ArrayPartition(u0*cos(x) + v0*sin(x), -u0*sin(x) + v0*cos(x))
end

prob = DynamicalODEProblem(f1,f2,u0,v0,(0.0,5.0))

@benchmark solve(prob,RK4(),dt=1/2,save_everystep=false)

#=
BenchmarkTools.Trial: 
  memory estimate:  82.59 KiB
  allocs estimate:  565
  --------------
  minimum time:     126.171 μs (0.00% GC)
  median time:      137.295 μs (0.00% GC)
  mean time:        159.308 μs (4.95% GC)
  maximum time:     3.158 ms (88.39% GC)
  --------------
  samples:          10000
  evals/sample:     1
=#
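As an aside (not part of the issue), the analytic pair defined above can be sanity-checked with a central finite difference: it should satisfy u' = v and v' = -u. The concrete initial values below are assumed for illustration:

```julia
# Analytic solution u(x) = u0*cos(x) + v0*sin(x), v(x) = -u0*sin(x) + v0*cos(x),
# evaluated here with assumed scalars u0 = 0.0, v0 = 1.0:
u(x) = 0.0*cos(x) + 1.0*sin(x)
v(x) = -0.0*sin(x) + 1.0*cos(x)

h = 1e-6
du = (u(1.0 + h) - u(1.0 - h)) / (2h)  # central difference ≈ u'(1)
dv = (v(1.0 + h) - v(1.0 - h)) / (2h)  # central difference ≈ v'(1)
```

Both differences match `v(1.0)` and `-u(1.0)` respectively to well within the O(h²) truncation error.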

function f(t,u,du)
    uu = @view u[1:100]
    v = @view u[101:end]
    du[1:100] .= uu
    du[101:200] .= v
end
u02 = [zeros(100);ones(100)]
tspan=(0.0,5.0)
prob = ODEProblem(f,u02,tspan)
@benchmark sol = solve(prob,RK4(),dt=1/2,save_everystep=false)

#=
BenchmarkTools.Trial: 
  memory estimate:  43.98 KiB
  allocs estimate:  408
  --------------
  minimum time:     65.574 μs (0.00% GC)
  median time:      71.722 μs (0.00% GC)
  mean time:        83.817 μs (6.20% GC)
  maximum time:     3.776 ms (93.39% GC)
  --------------
  samples:          10000
  evals/sample:     1
=#

This matters now because the dynamical ODEs generate a problem like this, which can now be used with the general ODE solvers as a fallback. It would be nice to reduce the overhead of that fallback, especially since there doesn't seem to be any fundamental reason for the type's elementwise operations to be slower.

That timing is with a modification to RK4 to make it use broadcast.

ChrisRackauckas (Member, Author) commented:

These timings make use of SciML/DiffEqBase.jl@9276249, which helped a lot.

ChrisRackauckas (Member, Author) commented:

New timings:

using DiffEqBase, OrdinaryDiffEq, Base.Test, RecursiveArrayTools, DiffEqDevTools
using BenchmarkTools

u0 = zeros(100)
v0 = ones(100)
f1 = function (t,u,v,du)
  du .= v
end
f2 = function (t,u,v,dv)
  dv .= -u
end
function (::typeof(f2))(::Type{Val{:analytic}}, x, y0)
  u0, v0 = y0
  ArrayPartition(u0*cos(x) + v0*sin(x), -u0*sin(x) + v0*cos(x))
end

prob = DynamicalODEProblem(f1,f2,u0,v0,(0.0,5.0))

@benchmark solve(prob,RK4(),dt=1/2,adaptive=false,save_everystep=false)

gives

BenchmarkTools.Trial: 
  memory estimate:  90.83 KiB
  allocs estimate:  637
  --------------
  minimum time:     137.588 μs (0.00% GC)
  median time:      153.982 μs (0.00% GC)
  mean time:        181.241 μs (5.89% GC)
  maximum time:     3.434 ms (92.12% GC)
  --------------
  samples:          10000
  evals/sample:     1

while

function f(t,u,du)
    uu = @view u[1:100]
    v = @view u[101:end]
    du[1:100] .= uu
    du[101:200] .= v
end
u02 = [zeros(100);ones(100)]
tspan=(0.0,5.0)
prob = ODEProblem(f,u02,tspan)
@benchmark sol = solve(prob,RK4(),dt=1/2,save_everystep=false)

gives

BenchmarkTools.Trial: 
  memory estimate:  161.34 KiB
  allocs estimate:  611
  --------------
  minimum time:     152.811 μs (0.00% GC)
  median time:      167.156 μs (0.00% GC)
  mean time:        197.936 μs (8.24% GC)
  maximum time:     3.146 ms (92.47% GC)
  --------------
  samples:          10000
  evals/sample:     1

So now it is faster to use ArrayPartitions than it is to use views. This is probably in large part due to @devmotion's #122.

Additionally, this checks out when just testing the indexing.

using RecursiveArrayTools
u0 = zeros(100)
v0 = ones(100)
t1 = ArrayPartition((u0,v0))
t2 = [u0;v0]
t12 = ArrayPartition((similar(u0),similar(v0)))
t22 = similar(t2)

using BenchmarkTools
function f(t12,t1)
    for i in eachindex(t12.x)
        t12.x[i] .= t1.x[i] .+ t1.x[i]
    end
end
@benchmark f(t12,t1)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     68.444 ns (0.00% GC)
  median time:      72.592 ns (0.00% GC)
  mean time:        78.955 ns (0.00% GC)
  maximum time:     296.889 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     988
function g(t22,t2)
    t22 .= t2 .+ t2
end
@benchmark g(t22,t2)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     68.444 ns (0.00% GC)
  median time:      72.592 ns (0.00% GC)
  mean time:        78.955 ns (0.00% GC)
  maximum time:     296.889 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     988

and

function h(t12,t1)
    t12 .= t1 .+ t1
end
@benchmark h(t12,t1)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     58.370 ns (0.00% GC)
  median time:      64.297 ns (0.00% GC)
  mean time:        70.905 ns (0.00% GC)
  maximum time:     266.074 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     988

So once again the ArrayPartition broadcast is doing better than the straight indexing of the larger array.

How about direct indexing?

using BenchmarkTools
function j(t12,t1)
    for i in eachindex(t12)
        t12[i] .= t1[i] .+ t1[i]
    end
end
@benchmark j(t12,t1)
BenchmarkTools.Trial: 
  memory estimate:  31.27 KiB
  allocs estimate:  1601
  --------------
  minimum time:     67.623 μs (0.00% GC)
  median time:      71.136 μs (0.00% GC)
  mean time:        80.140 μs (2.67% GC)
  maximum time:     1.950 ms (92.93% GC)
  --------------
  samples:          10000
  evals/sample:     1

That's much worse, so for now you should avoid directly indexing an ArrayPartition (#106). Of course, that cannot really be fixed. But other than that, development-wise this is case closed.
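The slowdown from direct indexing is unsurprising: a linear `getindex` on a partitioned container must first search for the partition containing index `i`, and with heterogeneously typed partitions the returned element type is not inferable. A minimal sketch of that lookup (the `getpart` helper is hypothetical, not the package's implementation):

```julia
# Hypothetical per-element lookup across a tuple of partitions: every access
# walks the partitions to find the one containing linear index i.
function getpart(xs::Tuple, i::Int)
    for a in xs
        i <= length(a) && return a[i]
        i -= length(a)
    end
    throw(BoundsError(xs, i))
end

xs = (zeros(100), ones(100))
getpart(xs, 150)  # index 150 falls into the second partition
```

Doing this search for every element, rather than broadcasting partition-by-partition, is what drives the allocations in the benchmark above.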
