Compatibility with StateSpaceSets v2 (#392)

* update compat with statespacesets 2
* Depend on AbstractStateSpaceSet, not deprecated AbstractDataset
* sssets v2
* temporary constructor for multiple dataset inputs; the current statespacesets v2.3 constructor is way too slow
* Fix transfer entropy transfer operator estimation
* shuffle order, so utils are loaded first
* horizontal concatenation
* Use correct name, not the deprecated one
* Use local variables
* Explicitly test vector + statespaceset input
* move agreement test to utils
* Add more OCE tests
* Fix marginal construction
* use local variables
* Improve docstrings
* Update changelog
* Set minimum julia version to LTS release
JuliaDynamics · Nov 16, 2024 · 1054b8f · 1054b8f · kahaaga · Nov 16, 2024
1 parent 27a2bb8
commit 1054b8f
Show file tree

Hide file tree

Showing 21 changed files with 157 additions and 51 deletions.
diff --git a/Project.toml b/Project.toml
@@ -2,7 +2,7 @@ name = "Associations"
 uuid = "614afb3a-e278-4863-8805-9959372b9ec2"
 authors = ["Kristian Agasøster Haaga <[email protected]>", "Tor Einar Møller <[email protected]>", "George Datseris <[email protected]>"]
 repo = "https://github.com/kahaaga/Associations.jl.git"
-version = "4.2.0"
+version = "4.3.0"
 
 [deps]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
@@ -32,23 +32,23 @@ TimeseriesSurrogates = "c804724b-8c18-5caa-8579-6025a0767c70"
 [compat]
 Accessors = "^0.1.28"
 Combinatorics = "1"
-ComplexityMeasures = "3.6.5"
+ComplexityMeasures = "3.7.3"
 DSP = "^0.7"
-DelayEmbeddings = "2.7"
+DelayEmbeddings = "2.8"
 Distances = "^0.10"
 Distributions = "^0.25"
 Graphs = "^1.11"
 HypothesisTests = "^0.11"
 Neighborhood = "^0.2.4"
 ProgressMeter = "1.10"
-RecurrenceAnalysis = "2"
+RecurrenceAnalysis = "2.1"
 Reexport = "1"
 Scratch = "1"
 SpecialFunctions = "2"
-StateSpaceSets = "^1.5"
+StateSpaceSets = "2.1"
 StaticArrays = "^1"
 Statistics = "1"
 StatsBase = "^0.34"
 StyledStrings = "1"
-TimeseriesSurrogates = "2.6"
-julia = "^1.10"
+TimeseriesSurrogates = "2.7"
+julia = "^1.10.6"
diff --git a/changelog.md b/changelog.md
@@ -2,6 +2,11 @@
 
 From version v4.0 onwards, this package has been renamed to Associations.jl.
 
+# 4.3
+
+- Compatibility with StateSpaceSets.jl v2.X
+- Improved documentation strings for `GaoOhViswanath` and `GaoKannanOhViswanath`
+
 # 4.2
 
 - New association measure: `AzadkiaChatterjeeCoefficient`.

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -26,4 +26,4 @@ TimeseriesSurrogates = "c804724b-8c18-5caa-8579-6025a0767c70"
 
 [compat]
 DynamicalSystemsBase = "3"
-julia = "^1.6"
+julia = "^1.10.6"
diff --git a/src/Associations.jl b/src/Associations.jl
@@ -10,17 +10,16 @@ module Associations
     using Reexport
 
     using StateSpaceSets
+
     using DelayEmbeddings: embed, genembed
     export embed, genembed
 
     import HypothesisTests: pvalue
     export trajectory
-    @reexport using StateSpaceSets
-    @reexport using ComplexityMeasures
-    @reexport using TimeseriesSurrogates
 
-    include("utils/utils.jl")
+
     include("core.jl")
+    include("utils/utils.jl")
 
     include("methods/information/information.jl")
     include("methods/crossmappings/crossmappings.jl")
@@ -37,6 +36,9 @@ module Associations
 
     include("deprecations/deprecations.jl")
 
+    @reexport using StateSpaceSets
+    @reexport using ComplexityMeasures
+    @reexport using TimeseriesSurrogates
     # Update messages:
     using Scratch
     display_update = true

diff --git a/src/causal_graphs/oce/OCE.jl b/src/causal_graphs/oce/OCE.jl
@@ -89,7 +89,7 @@ function infer_graph(alg::OCE, x; verbose = true)
     return select_parents(alg, x; verbose)
 end
 
-function infer_graph(alg::OCE, x::AbstractDataset; verbose = true)
+function infer_graph(alg::OCE, x::AbstractStateSpaceSet; verbose = true)
     return infer_graph(alg, columns(x); verbose)
 end
 

diff --git a/src/methods/information/definitions/mutual_informations/mutual_informations.jl b/src/methods/information/definitions/mutual_informations/mutual_informations.jl
@@ -32,13 +32,12 @@ end
 
 function marginal_entropies_mi3h_discrete(est::EntropyDecomposition{<:MutualInformation, <:DiscreteInfoEstimator}, x, y)
     # Encode marginals to integers based on the outcome space.
-    eX, eY = codified_marginals(est.discretization, x, y)
-    eXY = StateSpaceSet(eX, eY)
+    eX::StateSpaceSet, eY::StateSpaceSet = StateSpaceSet.(codified_marginals(est.discretization, x, y))
+    eXY::StateSpaceSet = StateSpaceSet(eX, eY)
 
     # The outcome space is no longer relevant from this point on. We're done discretizing, 
     # so now we can just count (i.e. use `UniqueElements` as the outcome space).
     o = UniqueElements()
-
     modified_est = estimator_with_overridden_parameters(est.definition, est.est)
     HX = information(modified_est, est.pest, o, eX) # estimates entropy in the X marginal
     HY = information(modified_est, est.pest, o, eY) # estimates entropy in the Y marginal

diff --git a/src/methods/information/definitions/transferentropy/transferoperator.jl b/src/methods/information/definitions/transferentropy/transferoperator.jl
@@ -1,4 +1,3 @@
-
 import ComplexityMeasures: TransferOperator, invariantmeasure, InvariantMeasure, Probabilities
 using ComplexityMeasures.GroupSlices
 export TransferOperator
@@ -47,19 +46,9 @@ end
 
 function _marginal_encodings(encoder::RectangularBinEncoding, x::VectorOrStateSpaceSet...)
     X = StateSpaceSet(StateSpaceSet.(x)...)
-    bins = [vec(encode_as_tuple(encoder, pt))' for pt in X]
+    bins = [vec(encode_as_tuple(encoder, pt))' for pt in unique(X.data)]
     joint_bins = reduce(vcat, bins)
-    idxs = size.(x, 2) #each input can have different dimensions
-    s = 1
-    encodings = Vector{StateSpaceSet}(undef, length(idxs))
-    for (i, cidx) in enumerate(idxs)
-        variable_subset = s:(s + cidx - 1)
-        s += cidx
-        y = @views joint_bins[:, variable_subset]
-        encodings[i] = StateSpaceSet(y)
-    end
-
-    return encodings
+    return StateSpaceSet(joint_bins)
 end
 
 # Only works for `RelativeAmount`, because probabilities are obtained from the 
@@ -88,9 +77,11 @@ function h4_marginal_probs(
     # marginals, not a single encoding integer for each bin. Otherwise, we can't
     # properly subset marginals here and relate them to the approximated invariant measure.
     encoding = iv.to.encoding
+    # Visited bins (absolute coordinates)
     visited_bins_coordinates = StateSpaceSet(decode.(Ref(encoding), iv.to.bins))
-    unique_visited_bins = _marginal_encodings(iv.to.encoding, visited_bins_coordinates)[1]
 
+    # Visited bins (coordinates encoded to integers using rectangular encoding)
+    unique_visited_bins = _marginal_encodings(iv.to.encoding, visited_bins_coordinates)
     # # The subset of visited bins with nonzero measure
     inds_non0measure = findall(iv.ρ .> 0)
     positive_measure_bins = unique_visited_bins[inds_non0measure]

diff --git a/src/methods/information/estimators/mutual_info_estimators/GaoKannanOhViswanath.jl b/src/methods/information/estimators/mutual_info_estimators/GaoKannanOhViswanath.jl
@@ -4,7 +4,7 @@ export GaoKannanOhViswanath
 
 """
     GaoKannanOhViswanath <: MutualInformationEstimator
-    GaoKannanOhViswanath(; k = 1, w = 0)
+    GaoKannanOhViswanath(definition = MIShannon(); k = 1, w = 0)
 
 The `GaoKannanOhViswanath` (Shannon) estimator is designed for estimating
 Shannon mutual information between variables that may be either discrete, continuous or
@@ -14,6 +14,14 @@ a mixture of both [GaoKannanOhViswanath2017](@cite).
 
 - [`MIShannon`](@ref)
 
+## Keyword arguments
+
+- **`k::Int`**: The number of nearest neighbors to consider. Only information about the
+    `k`-th nearest neighbor is actually used.
+- **`w::Int`**: The Theiler window, which determines if temporal neighbors are excluded
+    during neighbor searches in the joint space. Defaults to `0`, meaning that only the
+    point itself is excluded.
+
 ## Usage
 
 - Use with [`association`](@ref) to compute Shannon mutual information from input data.

diff --git a/src/methods/information/estimators/mutual_info_estimators/GaoOhViswanath.jl b/src/methods/information/estimators/mutual_info_estimators/GaoOhViswanath.jl
@@ -3,6 +3,7 @@ export GaoOhViswanath
 
 """
     GaoOhViswanath <: MutualInformationEstimator
+    GaoOhViswanath(definition = MIShannon(); k = 1, w = 0)
 
 The `GaoOhViswanath` is a mutual information estimator based on nearest neighbors,
 and is also called the bias-improved-KSG estimator, or BI-KSG, by [Gao2018](@cite).
@@ -11,6 +12,14 @@ and is also called the bias-improved-KSG estimator, or BI-KSG, by [Gao2018](@cit
 
 - [`MIShannon`](@ref)
 
+## Keyword arguments
+
+- **`k::Int`**: The number of nearest neighbors to consider. Only information about the
+    `k`-th nearest neighbor is actually used.
+- **`w::Int`**: The Theiler window, which determines if temporal neighbors are excluded
+    during neighbor searches in the joint space. Defaults to `0`, meaning that only the
+    point itself is excluded.
+
 ## Usage
 
 - Use with [`association`](@ref) to compute Shannon mutual information from input data.

diff --git a/src/methods/information/estimators/mutual_info_estimators/KSG1.jl b/src/methods/information/estimators/mutual_info_estimators/KSG1.jl
@@ -68,10 +68,13 @@ const KSG1 = KraskovStögbauerGrassberger1
 
 function association(est::KSG1{<:MIShannon}, x::VectorOrStateSpaceSet...)
     verify_number_of_inputs_vars(est.definition, length(x))
-
     (; definition, k, w, metric_joint, metric_marginals) = est
-    joint = StateSpaceSet(x...)
+
     marginals = map(xᵢ -> StateSpaceSet(xᵢ), x)
+    # Note: this uses a StateSpaceSet constructor that is overloaded from StateSpaceSets.jl, because the native 
+    # one is extremely slow.
+    joint::StateSpaceSet = StateSpaceSet(marginals...)
+
     M = length(x)
     N = length(joint)
 

diff --git a/src/methods/information/estimators/mutual_info_estimators/KSG2.jl b/src/methods/information/estimators/mutual_info_estimators/KSG2.jl
@@ -97,8 +97,10 @@ function association(est::KSG2{<:MIShannon}, x::VectorOrStateSpaceSet...)
         error("Need at leats two input StateSpaceSets to compute mutual information between them.")
 
     (; definition, k, w, metric_joint, metric_marginals) = est
-    joint = StateSpaceSet(x...)
     marginals = map(xᵢ -> StateSpaceSet(xᵢ), x)
+    # Note: this uses a StateSpaceSet constructor that is overloaded from StateSpaceSets.jl, because the native 
+    # one is extremely slow.
+    joint::StateSpaceSet = StateSpaceSet(marginals...)
     M = length(x)
     N = length(joint)
 

diff --git a/src/utils/statespaceset_concat.jl b/src/utils/statespaceset_concat.jl
@@ -0,0 +1,9 @@
+# This is a workaround until we can make faster horizontal concatenation in StateSpaceSets.jl
+import StateSpaceSets: StateSpaceSet
+using StaticArrays: SVector
+export StateSpaceSet
+
+function StateSpaceSet(x::VectorOrStateSpaceSet...; container = SVector)
+    xs = (xᵢ isa AbstractStateSpaceSet ? Matrix(xᵢ) : reshape(xᵢ, length(xᵢ), 1) for xᵢ in x)
+    StateSpaceSet(hcat(xs...); container)
+end
diff --git a/src/utils/utils.jl b/src/utils/utils.jl
@@ -1,2 +1,3 @@
 include("logs.jl")
 include("multidimensional_surrogates.jl")
+include("statespaceset_concat.jl")
diff --git a/test/causal_graphs/oce.jl b/test/causal_graphs/oce.jl
@@ -5,6 +5,45 @@ using StableRNGs
 using Graphs.SimpleGraphs: SimpleEdge
 using DynamicalSystemsBase
 
+
+# ----------------------------------------------------------------
+# Check most possible combinations
+# ----------------------------------------------------------------
+@testset "OCE: SurrogateAssociationTest with MI/CMI"  begin
+    rng = StableRNG(123)
+    sys = system(Logistic4Chain(; rng))
+    X = columns(first(trajectory(sys, 15, Ttr = 10000)))
+
+    mi_ests = [
+        KSG1(MIShannon(), k = 2, w = 1), 
+        KSG2(MIShannon(), k = 2, w = 1), 
+        GaussianMI(),
+        GaoOhViswanath(),
+        ChatterjeeCorrelation(),
+        PearsonCorrelation(),
+        ]
+    cmi_ests = [
+        FPVP(CMIShannon(), k = 2, w = 1), 
+        MesnerShalizi(CMIShannon(), k = 2, w = 1), 
+        Rahimzamani(CMIShannon(); k = 2, w = 1),
+        GaussianCMI(),
+        AzadkiaChatterjeeCoefficient(),
+        PartialCorrelation(),
+    ]
+    for mi_est in mi_ests
+        utest = SurrogateAssociationTest(mi_est; rng, nshuffles = 2)
+        for cmi_est in cmi_ests
+            ctest = LocalPermutationTest(cmi_est; rng, nshuffles = 2)
+            alg = OCE(; utest, ctest, τmax = 1)
+            parents = infer_graph(alg, X; verbose = false)
+            @test parents isa Vector{<:OCESelectedParents}
+        end
+    end
+end
+# ----------------------------------------------------------------
+# A few examples with more data and more iterations
+# ----------------------------------------------------------------
+
 rng = StableRNG(123)
 sys = system(Logistic4Chain(; rng))
 X = columns(first(trajectory(sys, 50, Ttr = 10000)))
@@ -20,7 +59,7 @@ parents = infer_graph(alg, X; verbose = true)
 
 # Convenience method for `StateSpaceSet`s.
 d = first(trajectory(sys, 50, Ttr = 10000))
-parents = infer_graph(alg, d; verbose = true)
+parents = infer_graph(alg, d; verbose = false)
 @test parents isa Vector{<:OCESelectedParents}
 
 rng = StableRNG(123)
@@ -31,7 +70,7 @@ uest = KSG1(MIShannon(); k = 5, w = 1)
 cest = MesnerShalizi(CMIShannon(); k = 5, w = 1)
 utest = SurrogateAssociationTest(uest; rng, nshuffles = 19)
 ctest = LocalPermutationTest(cest; rng, nshuffles = 19)
-parents = infer_graph(OCE(; utest, ctest, τmax = 1), X; verbose = true)
+parents = infer_graph(OCE(; utest, ctest, τmax = 1), X; verbose = false)
 @test parents isa Vector{<:OCESelectedParents}
 g = SimpleDiGraph(parents)
 @test g isa SimpleDiGraph

diff --git a/test/independence/LocalPermutationTest/part_mutual_information.jl b/test/independence/LocalPermutationTest/part_mutual_information.jl
@@ -10,9 +10,9 @@ Y = StateSpaceSet(y)
 Z = StateSpaceSet(z)
 
 nshuffles = 2
-est_ord = JointProbabilities(PMI(), CodifyVariables(OrdinalPatterns()))
-est_vh = JointProbabilities(PMI(), CodifyVariables(ValueHistogram(3)))
-est_dp = JointProbabilities(PMI(), CodifyVariables( Dispersion(m = 2)))
+est_ord = JointProbabilities(PartialMutualInformation(), CodifyVariables(OrdinalPatterns()))
+est_vh = JointProbabilities(PartialMutualInformation(), CodifyVariables(ValueHistogram(3)))
+est_dp = JointProbabilities(PartialMutualInformation(), CodifyVariables( Dispersion(m = 2)))
 
 lptest_sp = LocalPermutationTest(est_ord; nshuffles, rng)
 lptest_vh = LocalPermutationTest(est_vh; nshuffles, rng)

diff --git a/test/methods/correlation/azadkia_chatterjee_coefficient.jl b/test/methods/correlation/azadkia_chatterjee_coefficient.jl
@@ -18,7 +18,10 @@ Tₙs_pairwise = zeros(10)
 
 m = AzadkiaChatterjeeCoefficient()
 for i = 1:10
-    n = 1000; x1, x2 = randn(n), randn(n); y = x1 .^2 .+ x2 .^ 2; z = atan.(x1 ./ x2)
+    n = 1000; 
+    local x1 = randn(n)
+    local x2 = randn(n); y = x1 .^2 .+ x2 .^ 2; 
+    local z = atan.(x1 ./ x2)
     Tₙs_cond[i] = association(m, y, z, x1)
     Tₙs_pairwise[i] = association(m, y, z)
 end

diff --git a/test/methods/correlation/chatterjee_correlation.jl b/test/methods/correlation/chatterjee_correlation.jl
@@ -14,8 +14,8 @@ m = ChatterjeeCorrelation()
 # We should get exactly the same results for preallocated measure 
 # as for non-preallocated measure.
 for i = 1:10
-    x = rand(rng, 15)
-    y = rand(rng, 15)
+    local x = rand(rng, 15)
+    local y = rand(rng, 15)
 
     # We must initialize identical seeds to ensure reproducible results
     rng_seed = rand(rng, 1:100) 

diff --git a/test/methods/information/mutual_informations/mi_shannon.jl b/test/methods/information/mutual_informations/mi_shannon.jl
@@ -70,6 +70,16 @@ def = MIShannon()
 @test association(GaussianMI(def, normalize = false), x, y) isa Real
 @test association(GaussianMI(def; normalize = true), x, y) isa Real
 
+# input is vector + dataset
+x = rand(rng, 30)
+Y = StateSpaceSet(rand(rng, 30))
+@test association(KSG1(def, k = 2), x, Y) isa Real
+@test association(KSG2(def, k = 2), x, Y) isa Real
+@test association(GaoOhViswanath(def, k = 2), x, Y) isa Real
+@test association(GaoKannanOhViswanath(def, k = 2), x, Y) isa Real
+@test association(GaussianMI(def, normalize = false), x, Y) isa Real
+@test association(GaussianMI(def; normalize = true), x, Y) isa Real
+
 # ---------------
 # Pretty printing
 # ---------------