Nessie (#47)

* question * somewhat better * took it out * tout en vrac * back to the formula in relsurv -- results differ IDK why. * coment * up on ratetable branch * Typo * remove old version * dunno what to do with the output * some docs * question * somewhat better * took it out * tout en vrac * back to the formula in relsurv -- results differ IDK why. * coment * up on ratetable branch * Typo * some docs * remove old version * dunno what to do with the output * remove useless comments * upgrade RateTables dependency * error * correct tet * last push * docs --------- Co-authored-by: Oskar Laverny <[email protected]>
JuliaSurv · May 23, 2024 · 9c06966 · 9c06966
1 parent 4e327c8
commit 9c06966
Show file tree

Hide file tree

Showing 9 changed files with 115 additions and 8 deletions.
diff --git a/Project.toml b/Project.toml
@@ -19,7 +19,7 @@ CSV = "0.10"
 DataFrames = "1"
 Distributions = "0.25"
 LinearAlgebra = "1.6"
-RateTables = "0.1"
+RateTables = "0.1.1"
 RCall = "0.14"
 StatsAPI = "1"
 StatsBase = "0.34"

diff --git a/README.md b/README.md
@@ -24,6 +24,7 @@ Some key features in `NetSurvival.jl` are:
 
 - A panel of different non-parametric net survival estimators (Ederer I, Ederer II, Hakulinen, Pohar Perme) with an interface compliant with Julia's standards. 
 - Grafféo's log-rank test to compare net survival curves across groups, including stratified testing.
+- A `nessie` function that outputs the expected sample size at yearly intervals and the average remaining life expectancy for a given group. 
 - A compact, readable and efficient codebase (up to 100x less LOC than `relsurv` for the same functionalities), ensuring long-term maintainability.
 - Significant performance improvements (see below) compared to `relsurv`.
 

diff --git a/docs/src/example.md b/docs/src/example.md
@@ -238,3 +238,23 @@ plot(plot1, plot2, layout = (1, 2))
 ```
 
 Visually, it is almost immediately understood that there are no worthy differences between the two sexes whereas the `age65` variable seems to play a big role.
+
+
+## Estimated sample size and life expectancy 
+
+Given that the age group plays a significant role in the study, we will now estimate the expected sample size at yearly intervals in order to better compare the age groups.
+
+```@example 2
+elt, ess = nessie(@formula(Surv(time,status)~age65), colrec, slopop)
+elt
+```
+
+The expected life time for the younger patients is significantly higher than for older patients (24.78 years > 10.29 years).
+
+```@example 2
+hcat(ess[:,3]...)
+```
+
+Finally, the table above represents yearly expected sample sizes for both age groups under 65 and above, with the second column representing the latter. We can see that the sample size decreases for the older patients in a much more dramatic way than for the younger ages.
+
+Unsurprisingly, we can thus conclude that age plays an important role in the study.
diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md
@@ -115,6 +115,22 @@ Under $H_0$, the statistic $U(T)$ is asymptotically $\chi^2(k-1)$-distributed. W
 GraffeoTest
 ```
 
+## Nessie
+
+The `nessie` function estimates the expected sample size at yearly intervals, as well as the average expected remaining lifespan, for a given group.  
+
+This function is highly dependent on the `Life` function taken from the `RateTables.jl` package which you can find documented [here](https://juliasurv.github.io/RateTables.jl/dev/).
+
+The expected sample size at time $t$ is thus given by the following formula:
+
+$$ESS(t) = \sum_{i=1}^N S_{P_i}(t) = \sum_{i=1}^N \exp(-\Lambda_{P_i}(t))$$
+
+The estimated lifespan, in turn, is taken directly from the `expectation` function. 
+
+```@docs
+nessie
+```
+
 ## References
 
 ```@bibliography

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -18,7 +18,8 @@ Some key features in `NetSurvival.jl` are:
 
 - A panel of different non-parametric net survival estimators (Ederer I [Ederer1961](@cite), Ederer II [Ederer1959](@cite), Hakulinen [Hakulinen1977](@cite), Pohar Perme [PoharPerme2012](@cite)) with an interface compliant with Julia's standards. 
 - Grafféo's log-rank test [Graffeo2016](@cite) to compare net survival curves across groups, including stratified testing.
-- Crude mortality, Expected Sample Size, and other usefull metrics in net survival field.
+- Crude mortality, Expected Sample Size, and other useful metrics in net survival field.
+- A `nessie` function that outputs the expected sample size at yearly intervals and the average remaining life expectancy for a given group. 
 - A compact, readable and efficient codebase (up to 1000x less LOC than `relsurv` for the same functionalities), ensuring long-term maintainability.
 - Significant performance improvements (up to 50x) compared to the R package `relsurv`.
 

diff --git a/src/NPNSEstimator.jl b/src/NPNSEstimator.jl
@@ -40,7 +40,7 @@ function _get_rate_predictors(rt,df)
     return prd
 end
 
-function StatsBase.fit(::Type{E}, formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) where {E<:NPNSEstimator}
+function StatsBase.fit(::Type{E}, formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) where {E<:Union{NPNSEstimator, Nessie}}
     rate_predictors = _get_rate_predictors(rt,df)
     formula_applied = apply_schema(formula,schema(df))
 

diff --git a/src/Nessie.jl b/src/Nessie.jl
@@ -0,0 +1,44 @@
+# Result of a "Nessie" computation for one group of individuals.
+#
+# Fields:
+# - `expected_sample_size`: for each point `t` of `grid`, the sum over individuals
+#   of `ccdf(Lᵢ, t)`, where `Lᵢ` is the individual's `Life` built from the rate table.
+# - `expected_life_time`: the mean over individuals of `expectation(Lᵢ)`, divided
+#   by `RateTables.RT_DAYS_IN_YEAR`.
+# - `grid`: the points `0:RateTables.RT_DAYS_IN_YEAR:maximum(T)`.
+#
+# The constructor accepts `Δ` but does not use it.
+struct Nessie
+    expected_sample_size::Vector{Float64}
+    expected_life_time::Float64
+    grid::Vector{Float64}
+    function Nessie(T, Δ, age, year, rate_preds, ratetable)
+        yearly_times = 0:RateTables.RT_DAYS_IN_YEAR:maximum(T)
+        n_individuals = length(age)
+        ess = zeros(length(yearly_times))
+        total_life_time = 0.0
+        for i in eachindex(age)
+            # Life distribution of individual `i` under the rate table.
+            life = Life(ratetable[rate_preds[i,:]...], age[i], year[i])
+            for (j, t) in pairs(yearly_times)
+                ess[j] += ccdf(life, t)
+            end
+            total_life_time += expectation(life)
+        end
+        mean_life_time = total_life_time / RateTables.RT_DAYS_IN_YEAR / n_individuals
+        # `new` converts the range `yearly_times` to the declared `Vector{Float64}` field.
+        return new(ess, mean_life_time, yearly_times)
+    end
+end
+
+"""
+    nessie(formula, data, ratetable)
+
+Compute the expected sample size on a yearly grid and the expected life time,
+by fitting a `Nessie` object through `fit(Nessie, args...)`.
+
+To call this function, use the formula below:
+
+    nessie(@formula(Surv(time,status)~covariate), data, ratetable)
+
+# Returns
+
+- If the fit yields a single `Nessie` object, that object is returned as is.
+- Otherwise the fit result is treated as a table with an `:estimator` column and a
+  tuple `(lt, ess)` of two tables is returned: `lt` holds the `expected_life_time`
+  of each row, `ess` holds its `grid` and `expected_sample_size`. The remaining
+  columns of the fit result are kept in both tables.
+"""
+function nessie(args...)
+    r = fit(Nessie, args...)
+    r isa Nessie && return r
+
+    # Unpack each fitted estimator into three plain columns, then drop the estimator.
+    transform!(r, :estimator => ByRow(x-> (x.grid, x.expected_life_time, x.expected_sample_size)) => [:grid, :expected_life_time,:expected_sample_size])
+    select!(r, Not(:estimator))
+
+    # Non-mutating `select` builds the life-time table with its own columns,
+    # so no `deepcopy` of the whole table (grids included) is needed.
+    lt = select(r, Not([:expected_sample_size, :grid]))
+
+    select!(r, Not(:expected_life_time))
+    return lt, r
+end
+
+# Field accessors for `Nessie` results. Possibly unnecessary: direct field access
+# already works, and the interface should not be cluttered too much.
+function expected_life_time(x::Nessie)
+    return x.expected_life_time
+end
+function expected_sample_size(x::Nessie)
+    return x.expected_sample_size
+end
diff --git a/src/NetSurvival.jl b/src/NetSurvival.jl
@@ -13,19 +13,18 @@ using RateTables
 
 include("fetch_datasets.jl")
 include("Surv_and_Strata.jl")
-
+include("Nessie.jl")
 include("NPNSEstimator.jl")
 include("PoharPerme.jl")
 include("EdererI.jl")
 include("EdererII.jl")
 include("Hakulinen.jl")
-
 include("CrudeMortality.jl")
-
 include("GraffeoTest.jl")
 
 export PoharPerme, EdererI, EdererII, Hakulinen
 export CrudeMortality
+export Nessie, nessie
 export fit, confint
 export GraffeoTest
 export Surv, Strata

diff --git a/test/sampletest.jl b/test/sampletest.jl
@@ -108,11 +108,11 @@ end
 
     # Compare results with R: 
     compare_with_R(v1, vR)
-    compare_with_R(v1_strat, vR_strat) # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<------------------- This ones fails 
+    compare_with_R(v1_strat, vR_strat)
 
     # Check for equality of the two interfaces: 
     check_equal(v1,v2)
-    check_equal(v1_strat,v2_strat) # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<------------------- This ones fails 
+    check_equal(v1_strat,v2_strat)
 end
 
 
@@ -146,4 +146,30 @@ end
     err_pop =       (r[:population][:est][2:end, :] .- instance.Λₚ[1:end, :]) ./ r[:population][:est][2:end, :]
     @test all(abs.(err_causeSpec) .<= 0.01)
     @test all(abs.(err_pop)       .<= 0.01)
+end
+
+# Compare `nessie` on the `colrec` data, grouped by sex, against `relsurv::nessie` in R.
+@testitem "Assess Nessie" begin
+    using RateTables
+    using RCall
+
+    # Reference values computed in R. `mata` is transposed before being fetched;
+    # presumably each group then becomes a column — confirm against relsurv's output.
+    R"""
+    rez = relsurv::nessie(survival::Surv(time, stat) ~ sex, data = relsurv::colrec, ratetable = relsurv::slopop, rmap = list(age = age, sex = sex, year = diag))
+    mata = t(as.matrix(rez$mata))
+    povp = rez$povp
+    """
+    r_mata = @rget mata
+    r_povp = @rget povp
+    # NOTE(review): assumes column 1 is the male group and column 2 the female group — confirm.
+    r_male, r_female = r_mata[:,1], r_mata[:,2]
+
+    # The result is indexed as a pair: element 1 carries `expected_life_time`,
+    # element 2 carries `expected_sample_size`.
+    instance = nessie(@formula(Surv(time,status)~sex), colrec, slopop)
+    # NOTE(review): this destructuring relies on the groups coming out in the same order as in R.
+    jl_male, jl_female = instance[2].expected_sample_size
+    jl_povp = instance[1].expected_life_time
+
+    # Relative errors with respect to the R values. The last R entry of each group is
+    # dropped; presumably R reports one more time point than the Julia grid — TODO confirm.
+    err_male = (r_male[1:end-1]  .- jl_male) ./ r_male[1:end-1]
+    err_female = (r_female[1:end-1]  .- jl_female) ./ r_female[1:end-1]
+    err_povp = (r_povp  .- jl_povp) ./ r_povp
+
+    # Accept at most 1% relative deviation from the R results.
+    @test all(abs.(err_male)   .<= 0.01)
+    @test all(abs.(err_female) .<= 0.01)
+    @test all(abs.(err_povp)   .<= 0.01)
 end