TuringLang · Red-Portal · Jun 13, 2024 · Jun 13, 2024 · Jun 13, 2024 · Jun 13, 2024
diff --git a/Project.toml b/Project.toml
@@ -25,13 +25,15 @@ Bijectors = "76274a88-744f-5084-9051-94815aaf08c4"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
+Tapir = "07d77754-e150-4737-8c94-cd238a1fb45b"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [extensions]
 AdvancedVIBijectorsExt = "Bijectors"
 AdvancedVIEnzymeExt = "Enzyme"
 AdvancedVIForwardDiffExt = "ForwardDiff"
 AdvancedVIReverseDiffExt = "ReverseDiff"
+AdvancedVITapirExt = "Tapir"
 AdvancedVIZygoteExt = "Zygote"
 
 [compat]
@@ -55,6 +57,7 @@ Requires = "1.0"
 ReverseDiff = "1.15.1"
 SimpleUnPack = "1.1.0"
 StatsBase = "0.32, 0.33, 0.34"
+Tapir = "0.2.23"
 Zygote = "0.6.63"
 julia = "1.6"
 

diff --git a/ext/AdvancedVIBijectorsExt.jl b/ext/AdvancedVIBijectorsExt.jl
@@ -42,9 +42,9 @@ function AdvancedVI.reparam_with_entropy(
     n_samples::Int,
     ent_est  ::AdvancedVI.AbstractEntropyEstimator
 )
-    transform       = q.transform
-    q_unconst       = q.dist
-    q_unconst_stop  = q_stop.dist
+    transform      = q.transform
+    q_unconst      = q.dist
+    q_unconst_stop = q_stop.dist
 
     # Draw samples and compute entropy of the uncontrained distribution
     unconstr_samples, unconst_entropy = AdvancedVI.reparam_with_entropy(

diff --git a/ext/AdvancedVIForwardDiffExt.jl b/ext/AdvancedVIForwardDiffExt.jl
@@ -14,16 +14,31 @@ end
 getchunksize(::ADTypes.AutoForwardDiff{chunksize}) where {chunksize} = chunksize
 
 function AdvancedVI.value_and_gradient!(
-    ad::ADTypes.AutoForwardDiff, f, θ::AbstractVector{T}, out::DiffResults.MutableDiffResult
-) where {T<:Real}
+    ad   ::ADTypes.AutoForwardDiff,
+         ::Any,
+    f,
+    x    ::AbstractVector,
+    out  ::DiffResults.MutableDiffResult
+)
     chunk_size = getchunksize(ad)
     config = if isnothing(chunk_size)
-        ForwardDiff.GradientConfig(f, θ)
+        ForwardDiff.GradientConfig(f, x)
     else
-        ForwardDiff.GradientConfig(f, θ, ForwardDiff.Chunk(length(θ), chunk_size))
+        ForwardDiff.GradientConfig(f, x, ForwardDiff.Chunk(length(x), chunk_size))
     end
-    ForwardDiff.gradient!(out, f, θ, config)
+    ForwardDiff.gradient!(out, f, x, config)
     return out
 end
 
+function AdvancedVI.value_and_gradient!(
+    ad    ::ADTypes.AutoForwardDiff,
+    st_ad,
+    f,
+    x     ::AbstractVector,
+    aux, 
+    out   ::DiffResults.MutableDiffResult
+)
+    AdvancedVI.value_and_gradient!(ad, st_ad, x′ -> f(x′, aux), x, out)
+end
+
 end
diff --git a/ext/AdvancedVIReverseDiffExt.jl b/ext/AdvancedVIReverseDiffExt.jl
@@ -13,11 +13,26 @@ end
 
 # ReverseDiff without compiled tape
 function AdvancedVI.value_and_gradient!(
-    ad::ADTypes.AutoReverseDiff, f, θ::AbstractVector{<:Real}, out::DiffResults.MutableDiffResult
+       ::ADTypes.AutoReverseDiff,
+       ::Any,
+    f,
+    x  ::AbstractVector{<:Real},
+    out::DiffResults.MutableDiffResult
 )
-    tp = ReverseDiff.GradientTape(f, θ)
-    ReverseDiff.gradient!(out, tp, θ)
+    tp = ReverseDiff.GradientTape(f, x)
+    ReverseDiff.gradient!(out, tp, x)
     return out
 end
 
+function AdvancedVI.value_and_gradient!(
+    ad    ::ADTypes.AutoReverseDiff,
+    st_ad,
+    f,
+    x     ::AbstractVector{<:Real},
+    aux,
+    out   ::DiffResults.MutableDiffResult
+)
+    AdvancedVI.value_and_gradient!(ad, st_ad, x′ -> f(x′, aux), x, out)
+end
+
 end
diff --git a/ext/AdvancedVIZygoteExt.jl b/ext/AdvancedVIZygoteExt.jl
@@ -4,21 +4,38 @@ module AdvancedVIZygoteExt
 if isdefined(Base, :get_extension)
     using AdvancedVI
     using AdvancedVI: ADTypes, DiffResults
+    using ChainRulesCore
     using Zygote
 else
     using ..AdvancedVI
     using ..AdvancedVI: ADTypes, DiffResults
+    using ..ChainRulesCore
     using ..Zygote
 end
 
 function AdvancedVI.value_and_gradient!(
-    ad::ADTypes.AutoZygote, f, θ::AbstractVector{<:Real}, out::DiffResults.MutableDiffResult
+       ::ADTypes.AutoZygote,
+       ::Any,
+    f,
+    x  ::AbstractVector{<:Real},
+    out::DiffResults.MutableDiffResult
 )
-    y, back = Zygote.pullback(f, θ)
-    ∇θ = back(one(y))
+    y, back = Zygote.pullback(f, x)
+    ∇x = back(one(y))
     DiffResults.value!(out, y)
-    DiffResults.gradient!(out, only(∇θ))
+    DiffResults.gradient!(out, only(∇x))
     return out
 end
 
+function AdvancedVI.value_and_gradient!(
+    ad    ::ADTypes.AutoZygote,
+    st_ad,
+    f,
+    x     ::AbstractVector{<:Real},
+    aux,
+    out   ::DiffResults.MutableDiffResult
+)
+    AdvancedVI.value_and_gradient!(ad, st_ad, x′ -> f(x′, aux), x, out)
+end
+
 end
diff --git a/src/AdvancedVI.jl b/src/AdvancedVI.jl
@@ -25,18 +25,40 @@ using StatsBase
 
 # derivatives
 """
-    value_and_gradient!(ad, f, θ, out)
+    value_and_gradient!(adtype, ad_st, f, x, out)
+    value_and_gradient!(adtype, ad_st, f, x, aux, out)
 
-Evaluate the value and gradient of a function `f` at `θ` using the automatic differentiation backend `ad` and store the result in `out`.
+Evaluate the value and gradient of a function `f` at `x` using the automatic differentiation (AD) backend `adtype` and store the result in `out`.
+`f` may receive auxiliary input as `f(x, aux)`.
 
 # Arguments
-- `ad::ADTypes.AbstractADType`: Automatic differentiation backend. 
+- `adtype::ADTypes.AbstractADType`: AD backend. 
+- `ad_st`: State used by the AD backend. (This will often be pre-compiled tapes/caches.)
 - `f`: Function subject to differentiation.
-- `θ`: The point to evaluate the gradient.
+- `x`: The point to evaluate the gradient.
+- `aux`: Auxiliary input passed to `f`.
 - `out::DiffResults.MutableDiffResult`: Buffer to contain the output gradient and function value.
 """
 function value_and_gradient! end
 
+"""
+    init_adbackend(adtype, f, x)
+    init_adbackend(adtype, f, x, aux)
+
+Initialize the AD backend and set up any state it requires.
+
+# Arguments
+- `adtype::ADTypes.AbstractADType`: Automatic differentiation backend.
+- `f`: Function subject to differentiation.
+- `x`: The point to evaluate the gradient.
+- `aux`: Auxiliary input passed to `f`.
+
+# Returns
+- `ad_st`: State of the AD backend. (This will often be pre-compiled tapes/caches.)
+"""
+init_adbackend(::ADTypes.AbstractADType, ::Any, ::Any)        = nothing
+init_adbackend(::ADTypes.AbstractADType, ::Any, ::Any, ::Any) = nothing
+
 # Update for gradient descent step
 """
     update_variational_params!(family_type, opt_st, params, restructure, grad)
@@ -78,22 +100,27 @@ If the estimator is stateful, it can implement `init` to initialize the state.
 abstract type AbstractVariationalObjective end
 
 """
-    init(rng, obj, λ, restructure)
+    init(rng, obj, adtype, prob, params, restructure)
 
-Initialize a state of the variational objective `obj` given the initial variational parameters `λ`.
+Initialize a state of the variational objective `obj`.
 This function needs to be implemented only if `obj` is stateful.
+The state of the AD backend `adtype` shall also be initialized here.
 
 # Arguments
 - `rng::Random.AbstractRNG`: Random number generator.
 - `obj::AbstractVariationalObjective`: Variational objective.
-- `λ`: Initial variational parameters.
+- `adtype::ADTypes.AbstractADType`: Automatic differentiation backend.
+- `prob`: The target log-joint likelihood implementing the `LogDensityProblem` interface.
+- `params`: Initial variational parameters.
 - `restructure`: Function that reconstructs the variational approximation from `λ`.
 """
 init(
     ::Random.AbstractRNG,
     ::AbstractVariationalObjective,
-    ::AbstractVector,
-    ::Any
+    ::Any,
+    ::Any,
+    ::Any,
+    ::Any,
 ) = nothing
 
 """

diff --git a/src/objectives/elbo/repgradelbo.jl b/src/objectives/elbo/repgradelbo.jl
@@ -56,14 +56,13 @@ function estimate_energy_with_samples(prob, samples)
 end
 
 """
-    reparam_with_entropy(rng, q, q_stop, n_samples, ent_est)
+    reparam_with_entropy(rng, q, q_stop, n_samples, ent_est)
 
 Draw `n_samples` from `q` and compute its entropy.
 
 # Arguments
 - `rng::Random.AbstractRNG`: Random number generator.
 - `q`: Variational approximation.
-- `q_stop`: `q` but with its gradient stopped.
 - `n_samples::Int`: Number of Monte Carlo samples 
 - `ent_est`: The entropy estimation strategy. (See `estimate_entropy`.)
 
@@ -72,7 +71,11 @@ Draw `n_samples` from `q` and compute its entropy.
 - `entropy`: An estimate (or exact value) of the differential entropy of `q`.
 """
 function reparam_with_entropy(
-    rng::Random.AbstractRNG, q, q_stop, n_samples::Int, ent_est::AbstractEntropyEstimator
+    rng      ::Random.AbstractRNG,
+    q,
+    q_stop,
+    n_samples::Int,
+    ent_est  ::AbstractEntropyEstimator
 )
     samples = rand(rng, q, n_samples)
     entropy = estimate_entropy_maybe_stl(ent_est, samples, q, q_stop)
@@ -94,28 +97,46 @@ end
 estimate_objective(obj::RepGradELBO, q, prob; n_samples::Int = obj.n_samples) =
     estimate_objective(Random.default_rng(), obj, q, prob; n_samples)
 
+function estimate_repgradelbo_ad_forward(params′, aux)
+    @unpack rng, obj, problem, restructure, q_stop = aux
+    q = restructure(params′)
+    samples, entropy = reparam_with_entropy(rng, q, q_stop, obj.n_samples, obj.entropy)
+    energy = estimate_energy_with_samples(problem, samples)
+    elbo = energy + entropy
+    -elbo
+end
+
+function init(
+    rng         ::Random.AbstractRNG,
+    obj         ::RepGradELBO,
+    adtype      ::ADTypes.AbstractADType,
+    prob,
+    params,
+    restructure,
+)
+    q_stop = restructure(params)
+    aux = (rng=rng, obj=obj, problem=prob, restructure=restructure, q_stop=q_stop)
+    ad_st = init_adbackend(adtype, estimate_repgradelbo_ad_forward, params, aux)
+    (ad_st=ad_st,)
+end
+
 function estimate_gradient!(
     rng   ::Random.AbstractRNG,
     obj   ::RepGradELBO,
     adtype::ADTypes.AbstractADType,
     out   ::DiffResults.MutableDiffResult,
     prob,
-    λ,
+    params,
     restructure,
     state,
 )
-    q_stop = restructure(λ)
-    function f(λ′)
-        q = restructure(λ′)
-        samples, entropy = reparam_with_entropy(rng, q, q_stop, obj.n_samples, obj.entropy)
-        energy = estimate_energy_with_samples(prob, samples)
-        elbo = energy + entropy
-        -elbo
-    end
-    value_and_gradient!(adtype, f, λ, out)
-
+    q_stop = restructure(params)
+    ad_st  = state.ad_st
+    aux = (rng=rng, obj=obj, problem=prob, restructure=restructure, q_stop=q_stop)
+    value_and_gradient!(
+        adtype, ad_st, estimate_repgradelbo_ad_forward, params, aux, out
+    )
     nelbo = DiffResults.value(out)
     stat  = (elbo=-nelbo,)
-
-    out, nothing, stat
+    out, state, stat
 end
diff --git a/src/optimize.jl b/src/optimize.jl
@@ -66,7 +66,9 @@ function optimize(
 )
     params, restructure = Optimisers.destructure(deepcopy(q_init))
     opt_st   = maybe_init_optimizer(state_init, optimizer, params)
-    obj_st   = maybe_init_objective(state_init, rng, objective, params, restructure)
+    obj_st   = maybe_init_objective(
+        state_init, rng, adtype, objective, problem, params, restructure
+    )
     grad_buf = DiffResults.DiffResult(zero(eltype(params)), similar(params))
     stats    = NamedTuple[]
 

diff --git a/src/utils.jl b/src/utils.jl
@@ -6,19 +6,29 @@ end
 function maybe_init_optimizer(
     state_init::NamedTuple,
     optimizer ::Optimisers.AbstractRule,
-    params    ::AbstractVector
+    params
 )
-    haskey(state_init, :optimizer) ? state_init.optimizer : Optimisers.setup(optimizer, params)
+    if haskey(state_init, :optimizer)
+        state_init.optimizer
+    else
+        Optimisers.setup(optimizer, params)
+    end
 end
 
 function maybe_init_objective(
     state_init::NamedTuple,
     rng       ::Random.AbstractRNG,
+    adtype    ::ADTypes.AbstractADType,
     objective ::AbstractVariationalObjective,
-    params    ::AbstractVector,
+    problem,
+    params,
     restructure
 )
-    haskey(state_init, :objective) ? state_init.objective : init(rng, objective, params, restructure)
+    if haskey(state_init, :objective)
+        state_init.objective
+    else
+        init(rng, objective, adtype, problem, params, restructure)
+    end
 end
 
 eachsample(samples::AbstractMatrix) = eachcol(samples)

diff --git a/test/Project.toml b/test/Project.toml
@@ -1,9 +1,9 @@
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
 Bijectors = "76274a88-744f-5084-9051-94815aaf08c4"
+DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 DistributionsAD = "ced4e74d-a319-5a8a-b0ac-84af2272839c"
-Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
@@ -17,6 +17,7 @@ ReverseDiff = "37e2e3b7-166d-5795-8a7a-e32c996b4267"
 SimpleUnPack = "ce78b400-467f-4804-87d8-8f486da07d0a"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+Tapir = "07d77754-e150-4737-8c94-cd238a1fb45b"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
@@ -26,7 +27,6 @@ ADTypes = "0.2.1, 1"
 Bijectors = "0.13"
 Distributions = "0.25.100"
 DistributionsAD = "0.6.45"
-Enzyme = "0.12"
 FillArrays = "1.6.1"
 ForwardDiff = "0.10.36"
 Functors = "0.4.5"
@@ -39,6 +39,7 @@ ReverseDiff = "1.15.1"
 SimpleUnPack = "1.1.0"
 StableRNGs = "1.0.0"
 Statistics = "1"
+Tapir = "0.2.23"
 Test = "1"
 Tracker = "0.2.20"
 Zygote = "0.6.63"