From ba34ddbb17d882232b7920e739394bdcd9f9cd4c Mon Sep 17 00:00:00 2001 From: rimhajal Date: Tue, 14 May 2024 15:28:03 +0200 Subject: [PATCH 01/28] question --- src/Nessie.jl | 40 ++++++++++++++++++++++++++++++++++++++++ src/NetSurvival.jl | 3 +++ 2 files changed, 43 insertions(+) create mode 100644 src/Nessie.jl diff --git a/src/Nessie.jl b/src/Nessie.jl new file mode 100644 index 0000000..84986c0 --- /dev/null +++ b/src/Nessie.jl @@ -0,0 +1,40 @@ +function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables.AbstractRateTable) + formula_applied = apply_schema(formula,schema(df)) + + nms = StatsModels.termnames(formula_applied.rhs) + if isa(nms, String) + pred_names = [nms] + else + pred_names = nms + end + + times = sort(unique(floor.(df.time ./ 365.241))) + times = unique([0.0; times]) + + times_d = times .* 365.241 + + new_df = groupby(df, pred_names) + k = Matrix(undef, nrow(unique(df[!,pred_names])), length(times)) + povp = zeros(nrow(unique(df[!,pred_names]))) + sit = zeros(length(times)) + num_pop = zeros(length(times)) + + for i in 1:nrow(unique(df[!,pred_names])) + for j in 1:nrow(new_df[i]) + Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) + rtᵢ = rt[rate_preds[j,:]...] 
+ Λₚ = 0.0 + + for m in 1:Tᵢ + λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) + ∂Λₚ = λₚ #* (times_d[m+1]-times_d[m]) + Λₚ += ∂Λₚ + Sₚ = exp(-Λₚ) + num_pop[m] += Sₚ + sit[m] += (1-Sₚ) / λₚ + end + end + povp[i] = mean(sit ./ 365.241) + end + return num_pop, povp +end \ No newline at end of file diff --git a/src/NetSurvival.jl b/src/NetSurvival.jl index abead04..02fabda 100644 --- a/src/NetSurvival.jl +++ b/src/NetSurvival.jl @@ -22,10 +22,13 @@ include("Hakulinen.jl") include("CrudeMortality.jl") +include("Nessie.jl") + include("GraffeoTest.jl") export PoharPerme, EdererI, EdererII, Hakulinen export CrudeMortality +export Nessie export fit, confint export GraffeoTest export Surv, Strata From c7168a7b6c62d1a76d41f2f3f18ced9bc4b7aab4 Mon Sep 17 00:00:00 2001 From: rimhajal Date: Tue, 14 May 2024 15:28:03 +0200 Subject: [PATCH 02/28] somewhat better --- src/Nessie.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index 84986c0..3c949be 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -14,10 +14,9 @@ function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables. times_d = times .* 365.241 new_df = groupby(df, pred_names) - k = Matrix(undef, nrow(unique(df[!,pred_names])), length(times)) povp = zeros(nrow(unique(df[!,pred_names]))) sit = zeros(length(times)) - num_pop = zeros(length(times)) + num_pop = zeros(nrow(unique(df[!,pred_names])), length(times)) for i in 1:nrow(unique(df[!,pred_names])) for j in 1:nrow(new_df[i]) @@ -30,10 +29,10 @@ function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables. 
∂Λₚ = λₚ #* (times_d[m+1]-times_d[m]) Λₚ += ∂Λₚ Sₚ = exp(-Λₚ) - num_pop[m] += Sₚ + num_pop[i,m] += Sₚ sit[m] += (1-Sₚ) / λₚ end - end + end povp[i] = mean(sit ./ 365.241) end return num_pop, povp From d26a29bf9c11e84a6a62bc16c84c44b72d72e0bd Mon Sep 17 00:00:00 2001 From: rimhajal Date: Tue, 14 May 2024 15:28:03 +0200 Subject: [PATCH 03/28] took it out --- src/Nessie.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index 3c949be..3ba5be2 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -1,5 +1,6 @@ -function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables.AbstractRateTable) +function Nessie(formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) formula_applied = apply_schema(formula,schema(df)) + rate_predictors = String.([RateTables.predictors(rt)...]) nms = StatsModels.termnames(formula_applied.rhs) if isa(nms, String) @@ -21,19 +22,20 @@ function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables. for i in 1:nrow(unique(df[!,pred_names])) for j in 1:nrow(new_df[i]) Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) + rate_preds = select(new_df[i],rate_predictors) rtᵢ = rt[rate_preds[j,:]...] 
Λₚ = 0.0 for m in 1:Tᵢ λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) - ∂Λₚ = λₚ #* (times_d[m+1]-times_d[m]) + ∂Λₚ = λₚ Λₚ += ∂Λₚ Sₚ = exp(-Λₚ) num_pop[i,m] += Sₚ sit[m] += (1-Sₚ) / λₚ end - end - povp[i] = mean(sit ./ 365.241) + end + povp[i] = mean(sit ./ 365.241) end return num_pop, povp end \ No newline at end of file From 4f4babf10dc2582506c3c00bec5bd5c57029e254 Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Tue, 14 May 2024 17:07:08 +0200 Subject: [PATCH 04/28] tout en vrac --- src/NPNSEstimator.jl | 2 +- src/Nessie.jl | 122 ++++++++++++++++++++++++++++++------------- src/NetSurvival.jl | 6 ++- 3 files changed, 92 insertions(+), 38 deletions(-) diff --git a/src/NPNSEstimator.jl b/src/NPNSEstimator.jl index b61e96c..7206807 100644 --- a/src/NPNSEstimator.jl +++ b/src/NPNSEstimator.jl @@ -39,7 +39,7 @@ function _get_rate_predictors(rt,df) return prd end -function StatsBase.fit(::Type{E}, formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) where {E<:NPNSEstimator} +function StatsBase.fit(::Type{E}, formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) where {E<:Union{NPNSEstimator, Nessie}} rate_predictors = _get_rate_predictors(rt,df) formula_applied = apply_schema(formula,schema(df)) diff --git a/src/Nessie.jl b/src/Nessie.jl index 3ba5be2..86f917b 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -1,41 +1,93 @@ -function Nessie(formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) - formula_applied = apply_schema(formula,schema(df)) - rate_predictors = String.([RateTables.predictors(rt)...]) - - nms = StatsModels.termnames(formula_applied.rhs) - if isa(nms, String) - pred_names = [nms] - else - pred_names = nms +struct Nessie + expected_sample_size::Vector{Float64} + expected_life_time::Float64 + grid::Vector{Float64} + function Nessie(T, Δ, age, year, rate_preds, ratetable) + grid = mk_grid([1,maximum(T)],1) + # grid = mk_grid(T,1) + expected_sample_size = 
zero(grid) + for i in eachindex(age) + # Tᵢ = searchsortedlast(grid, T[i]) + Λₚ = 0.0 + rtᵢ = ratetable[rate_preds[i,:]...] + for j in 1:(length(grid)-1) + λₚ = daily_hazard(rtᵢ, age[i] + grid[j], year[i] + grid[j]) + ∂Λₚ = λₚ * (grid[j+1]-grid[j]) # λₚ * ∂t + Λₚ += ∂Λₚ + Sₚ = exp(-Λₚ) + expected_sample_size[j] += Sₚ + end + end + expected_life_time = sum(expected_sample_size[1:(end-1)] .* diff(grid)) / length(age) + + annual_indices = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] + return new(expected_sample_size[annual_indices], expected_life_time / 365.241, grid[annual_indices]) end +end - times = sort(unique(floor.(df.time ./ 365.241))) - times = unique([0.0; times]) +""" + nessie(formula, data, ratetable) - times_d = times .* 365.241 +bla bla - new_df = groupby(df, pred_names) - povp = zeros(nrow(unique(df[!,pred_names]))) - sit = zeros(length(times)) - num_pop = zeros(nrow(unique(df[!,pred_names])), length(times)) +""" +function nessie(args...) + r = fit(Nessie,args...) + transform!(r, :estimator => ByRow(x-> (x.grid, x.expected_life_time, x.expected_sample_size)) => [:expected_sample_size,:expected_life_time, :grid]) + select!(r, Not(:estimator)) - for i in 1:nrow(unique(df[!,pred_names])) - for j in 1:nrow(new_df[i]) - Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) - rate_preds = select(new_df[i],rate_predictors) - rtᵢ = rt[rate_preds[j,:]...] 
- Λₚ = 0.0 + lt = deepcopy(r) + select!(lt, Not([:expected_sample_size, :grid])) - for m in 1:Tᵢ - λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) - ∂Λₚ = λₚ - Λₚ += ∂Λₚ - Sₚ = exp(-Λₚ) - num_pop[i,m] += Sₚ - sit[m] += (1-Sₚ) / λₚ - end - end - povp[i] = mean(sit ./ 365.241) - end - return num_pop, povp -end \ No newline at end of file + select!(r, Not(:expected_life_time)) + return lt, r +end + + +expected_life_time(x::Nessie) = x.expected_life_time +expected_sample_size(x::Nessie) = x.expected_sample_size + + + + +# function old_Nessie(formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) +# formula_applied = apply_schema(formula,schema(df)) +# rate_predictors = String.([RateTables.predictors(rt)...]) + +# nms = StatsModels.termnames(formula_applied.rhs) +# if isa(nms, String) +# pred_names = [nms] +# else +# pred_names = nms +# end + +# times = sort(unique(floor.(df.time ./ 365.241))) +# times = unique([0.0; times]) + +# times_d = times .* 365.241 + +# new_df = groupby(df, pred_names) +# povp = zeros(nrow(unique(df[!,pred_names]))) +# sit = zeros(length(times)) +# num_pop = zeros(nrow(unique(df[!,pred_names])), length(times)) + +# for i in 1:nrow(unique(df[!,pred_names])) +# for j in 1:nrow(new_df[i]) +# Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) +# rate_preds = select(new_df[i],rate_predictors) +# rtᵢ = rt[rate_preds[j,:]...] 
+# Λₚ = 0.0 + +# for m in 1:Tᵢ +# λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) +# ∂Λₚ = λₚ * 365.241 +# Λₚ += ∂Λₚ +# Sₚ = exp(-Λₚ) +# num_pop[i,m] += Sₚ +# sit[m] += (1-Sₚ) / λₚ +# end +# end +# povp[i] = mean(sit ./ 365.241) +# end +# return num_pop, povp +# end \ No newline at end of file diff --git a/src/NetSurvival.jl b/src/NetSurvival.jl index 02fabda..5197429 100644 --- a/src/NetSurvival.jl +++ b/src/NetSurvival.jl @@ -14,6 +14,8 @@ using RateTables include("fetch_datasets.jl") include("Surv_and_Strata.jl") +include("Nessie.jl") + include("NPNSEstimator.jl") include("PoharPerme.jl") include("EdererI.jl") @@ -22,13 +24,13 @@ include("Hakulinen.jl") include("CrudeMortality.jl") -include("Nessie.jl") + include("GraffeoTest.jl") export PoharPerme, EdererI, EdererII, Hakulinen export CrudeMortality -export Nessie +export Nessie, nessie export fit, confint export GraffeoTest export Surv, Strata From 79781305c7f4d5e840abf3693b99095e650f8f9b Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Tue, 14 May 2024 17:59:29 +0200 Subject: [PATCH 05/28] back to the formula in relsurv -- results differ IDK why. --- src/Nessie.jl | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index 86f917b..55fa747 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -3,25 +3,26 @@ struct Nessie expected_life_time::Float64 grid::Vector{Float64} function Nessie(T, Δ, age, year, rate_preds, ratetable) - grid = mk_grid([1,maximum(T)],1) - # grid = mk_grid(T,1) - expected_sample_size = zero(grid) + grid = mk_grid([1,maximum(T)],1) # mk_grid(T,1) + exp_spl_size = zeros(length(grid)) + life_time = 0.0 for i in eachindex(age) - # Tᵢ = searchsortedlast(grid, T[i]) Λₚ = 0.0 rtᵢ = ratetable[rate_preds[i,:]...] 
for j in 1:(length(grid)-1) + Sₚ = exp(-Λₚ) λₚ = daily_hazard(rtᵢ, age[i] + grid[j], year[i] + grid[j]) ∂Λₚ = λₚ * (grid[j+1]-grid[j]) # λₚ * ∂t Λₚ += ∂Λₚ - Sₚ = exp(-Λₚ) - expected_sample_size[j] += Sₚ + exp_spl_size[j] += Sₚ + life_time += Sₚ * (1 - exp(-∂Λₚ)) / λₚ end end - expected_life_time = sum(expected_sample_size[1:(end-1)] .* diff(grid)) / length(age) - annual_indices = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] - return new(expected_sample_size[annual_indices], expected_life_time / 365.241, grid[annual_indices]) + exp_life_time = life_time / 365.241 / length(age) + annually = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] + + return new(exp_spl_size[annually], exp_life_time, grid[annually]) end end @@ -33,7 +34,7 @@ bla bla """ function nessie(args...) r = fit(Nessie,args...) - transform!(r, :estimator => ByRow(x-> (x.grid, x.expected_life_time, x.expected_sample_size)) => [:expected_sample_size,:expected_life_time, :grid]) + transform!(r, :estimator => ByRow(x-> (x.grid, x.expected_life_time, x.expected_sample_size)) => [:grid, :expected_life_time,:expected_sample_size]) select!(r, Not(:estimator)) lt = deepcopy(r) From add3414bd81ef4d303af98fc5b65fc22d9f121ea Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Tue, 14 May 2024 18:01:09 +0200 Subject: [PATCH 06/28] coment --- src/Nessie.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index 55fa747..c4ad26f 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -15,13 +15,13 @@ struct Nessie ∂Λₚ = λₚ * (grid[j+1]-grid[j]) # λₚ * ∂t Λₚ += ∂Λₚ exp_spl_size[j] += Sₚ - life_time += Sₚ * (1 - exp(-∂Λₚ)) / λₚ + life_time += Sₚ * (1 - exp(-∂Λₚ)) / λₚ # see relsurv, netwei.c l 200. 
end end exp_life_time = life_time / 365.241 / length(age) annually = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] - + return new(exp_spl_size[annually], exp_life_time, grid[annually]) end end From 6e9846629771370807f6fb9209b9e5dd5a067706 Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Tue, 14 May 2024 21:34:41 +0200 Subject: [PATCH 07/28] up on ratetable branch --- src/Nessie.jl | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index c4ad26f..f6942f3 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -3,26 +3,17 @@ struct Nessie expected_life_time::Float64 grid::Vector{Float64} function Nessie(T, Δ, age, year, rate_preds, ratetable) - grid = mk_grid([1,maximum(T)],1) # mk_grid(T,1) - exp_spl_size = zeros(length(grid)) - life_time = 0.0 + annual_grid = 1:365.241:maximum(T) + exp_spl_size = zeros(length(annual_grid)) + exp_life_time = 0.0 for i in eachindex(age) - Λₚ = 0.0 - rtᵢ = ratetable[rate_preds[i,:]...] - for j in 1:(length(grid)-1) - Sₚ = exp(-Λₚ) - λₚ = daily_hazard(rtᵢ, age[i] + grid[j], year[i] + grid[j]) - ∂Λₚ = λₚ * (grid[j+1]-grid[j]) # λₚ * ∂t - Λₚ += ∂Λₚ - exp_spl_size[j] += Sₚ - life_time += Sₚ * (1 - exp(-∂Λₚ)) / λₚ # see relsurv, netwei.c l 200. + Lᵢ = Life(ratetable[rate_preds[i,:]...], age[i], year[i]) + for j in eachindex(annual_grid) + exp_spl_size[j] += ccdf(Lᵢ, annual_grid[j]) end + exp_life_time += expectation(Lᵢ) end - - exp_life_time = life_time / 365.241 / length(age) - annually = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] - - return new(exp_spl_size[annually], exp_life_time, grid[annually]) + return new(exp_spl_size, exp_life_time / 365.241 / length(age), annual_grid) end end @@ -34,6 +25,9 @@ bla bla """ function nessie(args...) r = fit(Nessie,args...) 
+ if (typeof(r)<:Nessie) + return r + end transform!(r, :estimator => ByRow(x-> (x.grid, x.expected_life_time, x.expected_sample_size)) => [:grid, :expected_life_time,:expected_sample_size]) select!(r, Not(:estimator)) From 03dbafbb889a5d974e79d18892d424bc23dbcbfc Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Wed, 15 May 2024 16:11:20 +0200 Subject: [PATCH 08/28] Typo --- src/Nessie.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index f6942f3..fe3ad28 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -3,7 +3,7 @@ struct Nessie expected_life_time::Float64 grid::Vector{Float64} function Nessie(T, Δ, age, year, rate_preds, ratetable) - annual_grid = 1:365.241:maximum(T) + annual_grid = 0:365.241:maximum(T) exp_spl_size = zeros(length(annual_grid)) exp_life_time = 0.0 for i in eachindex(age) From a315b5e3d287e3c659c68b3eb9b8fb36b188b8b1 Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Thu, 16 May 2024 11:02:23 +0200 Subject: [PATCH 09/28] remove old version --- src/Nessie.jl | 49 ++----------------------------------------------- 1 file changed, 2 insertions(+), 47 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index fe3ad28..cb334d6 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -3,7 +3,7 @@ struct Nessie expected_life_time::Float64 grid::Vector{Float64} function Nessie(T, Δ, age, year, rate_preds, ratetable) - annual_grid = 0:365.241:maximum(T) + annual_grid = 0:RateTables.RT_DAYS_IN_YEAR:maximum(T) exp_spl_size = zeros(length(annual_grid)) exp_life_time = 0.0 for i in eachindex(age) @@ -13,7 +13,7 @@ struct Nessie end exp_life_time += expectation(Lᵢ) end - return new(exp_spl_size, exp_life_time / 365.241 / length(age), annual_grid) + return new(exp_spl_size, exp_life_time / RateTables.RT_DAYS_IN_YEAR / length(age), annual_grid) end end @@ -41,48 +41,3 @@ end expected_life_time(x::Nessie) = x.expected_life_time expected_sample_size(x::Nessie) = x.expected_sample_size - - - - -# function 
old_Nessie(formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) -# formula_applied = apply_schema(formula,schema(df)) -# rate_predictors = String.([RateTables.predictors(rt)...]) - -# nms = StatsModels.termnames(formula_applied.rhs) -# if isa(nms, String) -# pred_names = [nms] -# else -# pred_names = nms -# end - -# times = sort(unique(floor.(df.time ./ 365.241))) -# times = unique([0.0; times]) - -# times_d = times .* 365.241 - -# new_df = groupby(df, pred_names) -# povp = zeros(nrow(unique(df[!,pred_names]))) -# sit = zeros(length(times)) -# num_pop = zeros(nrow(unique(df[!,pred_names])), length(times)) - -# for i in 1:nrow(unique(df[!,pred_names])) -# for j in 1:nrow(new_df[i]) -# Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) -# rate_preds = select(new_df[i],rate_predictors) -# rtᵢ = rt[rate_preds[j,:]...] -# Λₚ = 0.0 - -# for m in 1:Tᵢ -# λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) -# ∂Λₚ = λₚ * 365.241 -# Λₚ += ∂Λₚ -# Sₚ = exp(-Λₚ) -# num_pop[i,m] += Sₚ -# sit[m] += (1-Sₚ) / λₚ -# end -# end -# povp[i] = mean(sit ./ 365.241) -# end -# return num_pop, povp -# end \ No newline at end of file From 31441dcaaf62b4771024cdd25d6298f3bb46b0c3 Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Thu, 16 May 2024 11:02:54 +0200 Subject: [PATCH 10/28] dunno what to do with the output --- src/Nessie.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index cb334d6..e436a26 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -38,6 +38,6 @@ function nessie(args...) return lt, r end - +# Maybe not necessary ? No need to clutter the interface too much.. 
expected_life_time(x::Nessie) = x.expected_life_time expected_sample_size(x::Nessie) = x.expected_sample_size From 0de4cb504c07c728ccdf7e80ba6b149ae026acef Mon Sep 17 00:00:00 2001 From: rimhajal Date: Thu, 16 May 2024 14:58:53 +0200 Subject: [PATCH 11/28] some docs --- README.md | 1 + docs/src/example.md | 12 ++++++++++++ docs/src/getting_started.md | 12 ++++++++++++ docs/src/index.md | 1 + 4 files changed, 26 insertions(+) diff --git a/README.md b/README.md index 9d8e78a..c1c7819 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ Some key features in `NetSurvival.jl` are: - A panel of different non-parametric net survival estimators (Ederer I, Ederer II, Hakulinen, Pohar Perme) with an interface compliant with Julia's standards. - Grafféo's log-rank test to compare net survival curves accross groups, including stratified testing. +- A 'Nessie' function that outputs the estimated sample size by yearly intervals and the average lifespan expectancy left for a given group. - A compact, readable and efficient codebase (up to 100x less LOC than `relsurv` for the same functionalities), ensuring long-term maintenability. - Significant performance improvements (see below) compared `relsurv`. diff --git a/docs/src/example.md b/docs/src/example.md index 937e6ab..905b618 100644 --- a/docs/src/example.md +++ b/docs/src/example.md @@ -238,3 +238,15 @@ plot(plot1, plot2, layout = (1, 2)) ``` Visually, it is almost immediately understood that there are no worthy differences between the two sexes whereas the `age65` variable seems to play a big role. + + +## Estimated sample size and life expectancy + +Given that the age group plays a significant role in the study, we will now define a new variable that groups patients in specific age groups. We will use these groups to estimate the sample size by yearly intervals and compare the groups. 
+ +```@example 1 +breaks = [0; collect(45:5:90); Inf] + +colrec.agegr = cut(colrec.age./365.241, breaks, right=false) +ess = Nessie(@formula(Surv(time,status)~sex+agegr), colrec, slopop) +``` \ No newline at end of file diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index 6bbb17e..7182bb1 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -115,6 +115,18 @@ Under $H_0$, the statistic $U(T)$ is asymptotically $\chi^2(k-1)$-distributed. W GraffeoTest ``` +## Nessie + +The Nessie function estimates the expected sample size by yearly intervals as well as the average estimated remaining lifespan of a given group. + +This function is highly dependent on the `Life` function taken from the `RateTables.jl` package which you can find documented [here](https://juliasurv.github.io/RateTables.jl/dev/). + +The expected sample size at time $t$ is thus given by the following formula: + +$$ ESS(t) = \sum_{i=1}^N S_{P_i}(t) = \sum_{i=1}^N \exp(-\Lambda_{P_i}(t)) $$ + +The estimated lifespan, in turn, is taken directly from the `expectation` function. + ## References ```@bibliography diff --git a/docs/src/index.md b/docs/src/index.md index 6833488..97d9958 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -19,6 +19,7 @@ Some key features in `NetSurvival.jl` are: - A panel of different non-parametric net survival estimators (Ederer I [Ederer1961](@cite), Ederer II [Ederer1959](@cite), Hakulinen [Hakulinen1977](@cite), Pohar Perme [PoharPerme2012](@cite)) with an interface compliant with Julia's standards. - Grafféo's log-rank test [Graffeo2016](@cite) to compare net survival curves accross groups, including stratified testing. +- A 'Nessie' function that outputs the estimated sample size by yearly intervals and the average lifespan expectancy left for a given group. - A compact, readable and efficient codebase (up to 1000x less LOC than `relsurv` for the same functionalities), ensuring long-term maintenability. 
- Significant performance improvements (up to 50x) compared to the R package `relsurv`. From 7f9becd1437a884178468182746e80d26b6f22b7 Mon Sep 17 00:00:00 2001 From: rimhajal Date: Fri, 17 May 2024 11:30:56 +0200 Subject: [PATCH 12/28] question --- src/Nessie.jl | 40 ++++++++++++++++++++++++++++++++++++++++ src/NetSurvival.jl | 3 +++ 2 files changed, 43 insertions(+) create mode 100644 src/Nessie.jl diff --git a/src/Nessie.jl b/src/Nessie.jl new file mode 100644 index 0000000..84986c0 --- /dev/null +++ b/src/Nessie.jl @@ -0,0 +1,40 @@ +function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables.AbstractRateTable) + formula_applied = apply_schema(formula,schema(df)) + + nms = StatsModels.termnames(formula_applied.rhs) + if isa(nms, String) + pred_names = [nms] + else + pred_names = nms + end + + times = sort(unique(floor.(df.time ./ 365.241))) + times = unique([0.0; times]) + + times_d = times .* 365.241 + + new_df = groupby(df, pred_names) + k = Matrix(undef, nrow(unique(df[!,pred_names])), length(times)) + povp = zeros(nrow(unique(df[!,pred_names]))) + sit = zeros(length(times)) + num_pop = zeros(length(times)) + + for i in 1:nrow(unique(df[!,pred_names])) + for j in 1:nrow(new_df[i]) + Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) + rtᵢ = rt[rate_preds[j,:]...] 
+ Λₚ = 0.0 + + for m in 1:Tᵢ + λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) + ∂Λₚ = λₚ #* (times_d[m+1]-times_d[m]) + Λₚ += ∂Λₚ + Sₚ = exp(-Λₚ) + num_pop[m] += Sₚ + sit[m] += (1-Sₚ) / λₚ + end + end + povp[i] = mean(sit ./ 365.241) + end + return num_pop, povp +end \ No newline at end of file diff --git a/src/NetSurvival.jl b/src/NetSurvival.jl index abead04..02fabda 100644 --- a/src/NetSurvival.jl +++ b/src/NetSurvival.jl @@ -22,10 +22,13 @@ include("Hakulinen.jl") include("CrudeMortality.jl") +include("Nessie.jl") + include("GraffeoTest.jl") export PoharPerme, EdererI, EdererII, Hakulinen export CrudeMortality +export Nessie export fit, confint export GraffeoTest export Surv, Strata From c254835eb50394381926e51fbebf61e58a16d09a Mon Sep 17 00:00:00 2001 From: rimhajal Date: Fri, 17 May 2024 11:30:56 +0200 Subject: [PATCH 13/28] somewhat better --- src/Nessie.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index 84986c0..3c949be 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -14,10 +14,9 @@ function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables. times_d = times .* 365.241 new_df = groupby(df, pred_names) - k = Matrix(undef, nrow(unique(df[!,pred_names])), length(times)) povp = zeros(nrow(unique(df[!,pred_names]))) sit = zeros(length(times)) - num_pop = zeros(length(times)) + num_pop = zeros(nrow(unique(df[!,pred_names])), length(times)) for i in 1:nrow(unique(df[!,pred_names])) for j in 1:nrow(new_df[i]) @@ -30,10 +29,10 @@ function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables. 
∂Λₚ = λₚ #* (times_d[m+1]-times_d[m]) Λₚ += ∂Λₚ Sₚ = exp(-Λₚ) - num_pop[m] += Sₚ + num_pop[i,m] += Sₚ sit[m] += (1-Sₚ) / λₚ end - end + end povp[i] = mean(sit ./ 365.241) end return num_pop, povp From e9070b72d50ae89ac39e2937b644320385ef4657 Mon Sep 17 00:00:00 2001 From: rimhajal Date: Fri, 17 May 2024 11:30:56 +0200 Subject: [PATCH 14/28] took it out --- src/Nessie.jl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index 3c949be..3ba5be2 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -1,5 +1,6 @@ -function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables.AbstractRateTable) +function Nessie(formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) formula_applied = apply_schema(formula,schema(df)) + rate_predictors = String.([RateTables.predictors(rt)...]) nms = StatsModels.termnames(formula_applied.rhs) if isa(nms, String) @@ -21,19 +22,20 @@ function Nessie(formula::FormulaTerm, df::DataFrame, rate_preds, rt::RateTables. for i in 1:nrow(unique(df[!,pred_names])) for j in 1:nrow(new_df[i]) Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) + rate_preds = select(new_df[i],rate_predictors) rtᵢ = rt[rate_preds[j,:]...] 
Λₚ = 0.0 for m in 1:Tᵢ λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) - ∂Λₚ = λₚ #* (times_d[m+1]-times_d[m]) + ∂Λₚ = λₚ Λₚ += ∂Λₚ Sₚ = exp(-Λₚ) num_pop[i,m] += Sₚ sit[m] += (1-Sₚ) / λₚ end - end - povp[i] = mean(sit ./ 365.241) + end + povp[i] = mean(sit ./ 365.241) end return num_pop, povp end \ No newline at end of file From 68deffd07ddc5c185a58468425a2731e7ae16378 Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Fri, 17 May 2024 11:30:56 +0200 Subject: [PATCH 15/28] tout en vrac --- src/NPNSEstimator.jl | 2 +- src/Nessie.jl | 122 ++++++++++++++++++++++++++++++------------- src/NetSurvival.jl | 6 ++- 3 files changed, 92 insertions(+), 38 deletions(-) diff --git a/src/NPNSEstimator.jl b/src/NPNSEstimator.jl index 4791466..6cbb7e4 100644 --- a/src/NPNSEstimator.jl +++ b/src/NPNSEstimator.jl @@ -40,7 +40,7 @@ function _get_rate_predictors(rt,df) return prd end -function StatsBase.fit(::Type{E}, formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) where {E<:NPNSEstimator} +function StatsBase.fit(::Type{E}, formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) where {E<:Union{NPNSEstimator, Nessie}} rate_predictors = _get_rate_predictors(rt,df) formula_applied = apply_schema(formula,schema(df)) diff --git a/src/Nessie.jl b/src/Nessie.jl index 3ba5be2..86f917b 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -1,41 +1,93 @@ -function Nessie(formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) - formula_applied = apply_schema(formula,schema(df)) - rate_predictors = String.([RateTables.predictors(rt)...]) - - nms = StatsModels.termnames(formula_applied.rhs) - if isa(nms, String) - pred_names = [nms] - else - pred_names = nms +struct Nessie + expected_sample_size::Vector{Float64} + expected_life_time::Float64 + grid::Vector{Float64} + function Nessie(T, Δ, age, year, rate_preds, ratetable) + grid = mk_grid([1,maximum(T)],1) + # grid = mk_grid(T,1) + expected_sample_size = 
zero(grid) + for i in eachindex(age) + # Tᵢ = searchsortedlast(grid, T[i]) + Λₚ = 0.0 + rtᵢ = ratetable[rate_preds[i,:]...] + for j in 1:(length(grid)-1) + λₚ = daily_hazard(rtᵢ, age[i] + grid[j], year[i] + grid[j]) + ∂Λₚ = λₚ * (grid[j+1]-grid[j]) # λₚ * ∂t + Λₚ += ∂Λₚ + Sₚ = exp(-Λₚ) + expected_sample_size[j] += Sₚ + end + end + expected_life_time = sum(expected_sample_size[1:(end-1)] .* diff(grid)) / length(age) + + annual_indices = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] + return new(expected_sample_size[annual_indices], expected_life_time / 365.241, grid[annual_indices]) end +end - times = sort(unique(floor.(df.time ./ 365.241))) - times = unique([0.0; times]) +""" + nessie(formula, data, ratetable) - times_d = times .* 365.241 +bla bla - new_df = groupby(df, pred_names) - povp = zeros(nrow(unique(df[!,pred_names]))) - sit = zeros(length(times)) - num_pop = zeros(nrow(unique(df[!,pred_names])), length(times)) +""" +function nessie(args...) + r = fit(Nessie,args...) + transform!(r, :estimator => ByRow(x-> (x.grid, x.expected_life_time, x.expected_sample_size)) => [:expected_sample_size,:expected_life_time, :grid]) + select!(r, Not(:estimator)) - for i in 1:nrow(unique(df[!,pred_names])) - for j in 1:nrow(new_df[i]) - Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) - rate_preds = select(new_df[i],rate_predictors) - rtᵢ = rt[rate_preds[j,:]...] 
- Λₚ = 0.0 + lt = deepcopy(r) + select!(lt, Not([:expected_sample_size, :grid])) - for m in 1:Tᵢ - λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) - ∂Λₚ = λₚ - Λₚ += ∂Λₚ - Sₚ = exp(-Λₚ) - num_pop[i,m] += Sₚ - sit[m] += (1-Sₚ) / λₚ - end - end - povp[i] = mean(sit ./ 365.241) - end - return num_pop, povp -end \ No newline at end of file + select!(r, Not(:expected_life_time)) + return lt, r +end + + +expected_life_time(x::Nessie) = x.expected_life_time +expected_sample_size(x::Nessie) = x.expected_sample_size + + + + +# function old_Nessie(formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) +# formula_applied = apply_schema(formula,schema(df)) +# rate_predictors = String.([RateTables.predictors(rt)...]) + +# nms = StatsModels.termnames(formula_applied.rhs) +# if isa(nms, String) +# pred_names = [nms] +# else +# pred_names = nms +# end + +# times = sort(unique(floor.(df.time ./ 365.241))) +# times = unique([0.0; times]) + +# times_d = times .* 365.241 + +# new_df = groupby(df, pred_names) +# povp = zeros(nrow(unique(df[!,pred_names]))) +# sit = zeros(length(times)) +# num_pop = zeros(nrow(unique(df[!,pred_names])), length(times)) + +# for i in 1:nrow(unique(df[!,pred_names])) +# for j in 1:nrow(new_df[i]) +# Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) +# rate_preds = select(new_df[i],rate_predictors) +# rtᵢ = rt[rate_preds[j,:]...] 
+# Λₚ = 0.0 + +# for m in 1:Tᵢ +# λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) +# ∂Λₚ = λₚ * 365.241 +# Λₚ += ∂Λₚ +# Sₚ = exp(-Λₚ) +# num_pop[i,m] += Sₚ +# sit[m] += (1-Sₚ) / λₚ +# end +# end +# povp[i] = mean(sit ./ 365.241) +# end +# return num_pop, povp +# end \ No newline at end of file diff --git a/src/NetSurvival.jl b/src/NetSurvival.jl index 02fabda..5197429 100644 --- a/src/NetSurvival.jl +++ b/src/NetSurvival.jl @@ -14,6 +14,8 @@ using RateTables include("fetch_datasets.jl") include("Surv_and_Strata.jl") +include("Nessie.jl") + include("NPNSEstimator.jl") include("PoharPerme.jl") include("EdererI.jl") @@ -22,13 +24,13 @@ include("Hakulinen.jl") include("CrudeMortality.jl") -include("Nessie.jl") + include("GraffeoTest.jl") export PoharPerme, EdererI, EdererII, Hakulinen export CrudeMortality -export Nessie +export Nessie, nessie export fit, confint export GraffeoTest export Surv, Strata From b497a91999949641c1f743242c19964a8002a9a4 Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Fri, 17 May 2024 11:30:56 +0200 Subject: [PATCH 16/28] back to the formula in relsurv -- results differ IDK why. --- src/Nessie.jl | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index 86f917b..55fa747 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -3,25 +3,26 @@ struct Nessie expected_life_time::Float64 grid::Vector{Float64} function Nessie(T, Δ, age, year, rate_preds, ratetable) - grid = mk_grid([1,maximum(T)],1) - # grid = mk_grid(T,1) - expected_sample_size = zero(grid) + grid = mk_grid([1,maximum(T)],1) # mk_grid(T,1) + exp_spl_size = zeros(length(grid)) + life_time = 0.0 for i in eachindex(age) - # Tᵢ = searchsortedlast(grid, T[i]) Λₚ = 0.0 rtᵢ = ratetable[rate_preds[i,:]...] 
for j in 1:(length(grid)-1) + Sₚ = exp(-Λₚ) λₚ = daily_hazard(rtᵢ, age[i] + grid[j], year[i] + grid[j]) ∂Λₚ = λₚ * (grid[j+1]-grid[j]) # λₚ * ∂t Λₚ += ∂Λₚ - Sₚ = exp(-Λₚ) - expected_sample_size[j] += Sₚ + exp_spl_size[j] += Sₚ + life_time += Sₚ * (1 - exp(-∂Λₚ)) / λₚ end end - expected_life_time = sum(expected_sample_size[1:(end-1)] .* diff(grid)) / length(age) - annual_indices = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] - return new(expected_sample_size[annual_indices], expected_life_time / 365.241, grid[annual_indices]) + exp_life_time = life_time / 365.241 / length(age) + annually = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] + + return new(exp_spl_size[annually], exp_life_time, grid[annually]) end end @@ -33,7 +34,7 @@ bla bla """ function nessie(args...) r = fit(Nessie,args...) - transform!(r, :estimator => ByRow(x-> (x.grid, x.expected_life_time, x.expected_sample_size)) => [:expected_sample_size,:expected_life_time, :grid]) + transform!(r, :estimator => ByRow(x-> (x.grid, x.expected_life_time, x.expected_sample_size)) => [:grid, :expected_life_time,:expected_sample_size]) select!(r, Not(:estimator)) lt = deepcopy(r) From b4d7c7e1f8b37ca0d03577e7271bb1f2afe7b07a Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Fri, 17 May 2024 11:30:56 +0200 Subject: [PATCH 17/28] coment --- src/Nessie.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index 55fa747..c4ad26f 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -15,13 +15,13 @@ struct Nessie ∂Λₚ = λₚ * (grid[j+1]-grid[j]) # λₚ * ∂t Λₚ += ∂Λₚ exp_spl_size[j] += Sₚ - life_time += Sₚ * (1 - exp(-∂Λₚ)) / λₚ + life_time += Sₚ * (1 - exp(-∂Λₚ)) / λₚ # see relsurv, netwei.c l 200. 
end end exp_life_time = life_time / 365.241 / length(age) annually = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] - + return new(exp_spl_size[annually], exp_life_time, grid[annually]) end end From e7dbf0bff166a6a95300a62cf5b2631bb22d7198 Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Fri, 17 May 2024 11:30:57 +0200 Subject: [PATCH 18/28] up on ratetable branch --- src/Nessie.jl | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index c4ad26f..f6942f3 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -3,26 +3,17 @@ struct Nessie expected_life_time::Float64 grid::Vector{Float64} function Nessie(T, Δ, age, year, rate_preds, ratetable) - grid = mk_grid([1,maximum(T)],1) # mk_grid(T,1) - exp_spl_size = zeros(length(grid)) - life_time = 0.0 + annual_grid = 1:365.241:maximum(T) + exp_spl_size = zeros(length(annual_grid)) + exp_life_time = 0.0 for i in eachindex(age) - Λₚ = 0.0 - rtᵢ = ratetable[rate_preds[i,:]...] - for j in 1:(length(grid)-1) - Sₚ = exp(-Λₚ) - λₚ = daily_hazard(rtᵢ, age[i] + grid[j], year[i] + grid[j]) - ∂Λₚ = λₚ * (grid[j+1]-grid[j]) # λₚ * ∂t - Λₚ += ∂Λₚ - exp_spl_size[j] += Sₚ - life_time += Sₚ * (1 - exp(-∂Λₚ)) / λₚ # see relsurv, netwei.c l 200. + Lᵢ = Life(ratetable[rate_preds[i,:]...], age[i], year[i]) + for j in eachindex(annual_grid) + exp_spl_size[j] += ccdf(Lᵢ, annual_grid[j]) end + exp_life_time += expectation(Lᵢ) end - - exp_life_time = life_time / 365.241 / length(age) - annually = [searchsortedlast(grid, i) for i in (365.241 * (0:floor(maximum(T)/365.241))).+1] - - return new(exp_spl_size[annually], exp_life_time, grid[annually]) + return new(exp_spl_size, exp_life_time / 365.241 / length(age), annual_grid) end end @@ -34,6 +25,9 @@ bla bla """ function nessie(args...) r = fit(Nessie,args...) 
+ if (typeof(r)<:Nessie) + return r + end transform!(r, :estimator => ByRow(x-> (x.grid, x.expected_life_time, x.expected_sample_size)) => [:grid, :expected_life_time,:expected_sample_size]) select!(r, Not(:estimator)) From f4554b3347927367fe375abf54f3f3b3cef2efeb Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Fri, 17 May 2024 11:30:57 +0200 Subject: [PATCH 19/28] Typo --- src/Nessie.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index f6942f3..fe3ad28 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -3,7 +3,7 @@ struct Nessie expected_life_time::Float64 grid::Vector{Float64} function Nessie(T, Δ, age, year, rate_preds, ratetable) - annual_grid = 1:365.241:maximum(T) + annual_grid = 0:365.241:maximum(T) exp_spl_size = zeros(length(annual_grid)) exp_life_time = 0.0 for i in eachindex(age) From 39e26fadd4897412c3cc2d738b847331e5ceba88 Mon Sep 17 00:00:00 2001 From: rimhajal Date: Fri, 17 May 2024 11:32:29 +0200 Subject: [PATCH 20/28] some docs --- README.md | 1 + docs/src/example.md | 12 ++++++++++++ docs/src/getting_started.md | 12 ++++++++++++ docs/src/index.md | 3 ++- 4 files changed, 27 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d8e78a..c1c7819 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ Some key features in `NetSurvival.jl` are: - A panel of different non-parametric net survival estimators (Ederer I, Ederer II, Hakulinen, Pohar Perme) with an interface compliant with Julia's standards. - Grafféo's log-rank test to compare net survival curves accross groups, including stratified testing. +- A 'Nessie' function that outputs the estimated sample size by yearly intervals and the average lifespan expectancy left for a given group. - A compact, readable and efficient codebase (up to 100x less LOC than `relsurv` for the same functionalities), ensuring long-term maintenability. - Significant performance improvements (see below) compared `relsurv`. 
diff --git a/docs/src/example.md b/docs/src/example.md index 937e6ab..905b618 100644 --- a/docs/src/example.md +++ b/docs/src/example.md @@ -238,3 +238,15 @@ plot(plot1, plot2, layout = (1, 2)) ``` Visually, it is almost immediately understood that there are no worthy differences between the two sexes whereas the `age65` variable seems to play a big role. + + +## Estimated sample size and life expectancy + +Given that the age group plays a significant role in the study, we will now define a new variable that groups patients in specific age groups. We will use these groups to estimate the sample size by yearly intervals and compare the groups. + +```@example 1 +breaks = [0; collect(45:5:90); Inf] + +colrec.agegr = cut(colrec.age./365.241, breaks, right=false) +ess = Nessie(@formula(Surv(time,status)~sex+agegr), colrec, slopop) +``` \ No newline at end of file diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index 17e0c36..c0e06fb 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -115,6 +115,18 @@ Under $H_0$, the statistic $U(T)$ is asymptotically $\chi^2(k-1)$-distributed. W GraffeoTest ``` +## Nessie + +The Nessie function estimates the sample size by yearly intervals as well as averages an estimated lifespan left for a given group. + +This function is highly dependant on the `Life` function taken from the `RateTables.jl` package which you can find documented [here](https://juliasurv.github.io/RateTables.jl/dev/). + +The sample size is thus taken by the following formula: + +$$ ESS = \sum_i^N S_{P_i} * exp(-\Lambda_p) $$ + +While the estimated lifepsan is directly taken from the `expectation` function. 
+ ## References ```@bibliography diff --git a/docs/src/index.md b/docs/src/index.md index 37d72f6..32985f5 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -18,7 +18,8 @@ Some key features in `NetSurvival.jl` are: - A panel of different non-parametric net survival estimators (Ederer I [Ederer1961](@cite), Ederer II [Ederer1959](@cite), Hakulinen [Hakulinen1977](@cite), Pohar Perme [PoharPerme2012](@cite)) with an interface compliant with Julia's standards. - Grafféo's log-rank test [Graffeo2016](@cite) to compare net survival curves accross groups, including stratified testing. -- Crude mortality, Expected Sample Size, and other usefull metrics in net survival field. +- Crude mortality, Expected Sample Size, and other useful metrics in net survival field. +- A 'Nessie' function that outputs the estimated sample size by yearly intervals and the average lifespan expectancy left for a given group. - A compact, readable and efficient codebase (up to 1000x less LOC than `relsurv` for the same functionalities), ensuring long-term maintenability. - Significant performance improvements (up to 50x) compared to the R package `relsurv`. 
From 520b33e4bd32861f4b55a555c5b2c68a88e5f2ec Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Fri, 17 May 2024 11:32:29 +0200 Subject: [PATCH 21/28] remove old version --- src/Nessie.jl | 49 ++----------------------------------------------- 1 file changed, 2 insertions(+), 47 deletions(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index fe3ad28..cb334d6 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -3,7 +3,7 @@ struct Nessie expected_life_time::Float64 grid::Vector{Float64} function Nessie(T, Δ, age, year, rate_preds, ratetable) - annual_grid = 0:365.241:maximum(T) + annual_grid = 0:RateTables.RT_DAYS_IN_YEAR:maximum(T) exp_spl_size = zeros(length(annual_grid)) exp_life_time = 0.0 for i in eachindex(age) @@ -13,7 +13,7 @@ struct Nessie end exp_life_time += expectation(Lᵢ) end - return new(exp_spl_size, exp_life_time / 365.241 / length(age), annual_grid) + return new(exp_spl_size, exp_life_time / RateTables.RT_DAYS_IN_YEAR / length(age), annual_grid) end end @@ -41,48 +41,3 @@ end expected_life_time(x::Nessie) = x.expected_life_time expected_sample_size(x::Nessie) = x.expected_sample_size - - - - -# function old_Nessie(formula::FormulaTerm, df::DataFrame, rt::RateTables.AbstractRateTable) -# formula_applied = apply_schema(formula,schema(df)) -# rate_predictors = String.([RateTables.predictors(rt)...]) - -# nms = StatsModels.termnames(formula_applied.rhs) -# if isa(nms, String) -# pred_names = [nms] -# else -# pred_names = nms -# end - -# times = sort(unique(floor.(df.time ./ 365.241))) -# times = unique([0.0; times]) - -# times_d = times .* 365.241 - -# new_df = groupby(df, pred_names) -# povp = zeros(nrow(unique(df[!,pred_names]))) -# sit = zeros(length(times)) -# num_pop = zeros(nrow(unique(df[!,pred_names])), length(times)) - -# for i in 1:nrow(unique(df[!,pred_names])) -# for j in 1:nrow(new_df[i]) -# Tᵢ = searchsortedlast(times_d, new_df[i].time[j]) -# rate_preds = select(new_df[i],rate_predictors) -# rtᵢ = rt[rate_preds[j,:]...] 
-# Λₚ = 0.0 - -# for m in 1:Tᵢ -# λₚ = daily_hazard(rtᵢ, new_df[i].age[j] + times_d[m], new_df[i].year[j] + times_d[m]) -# ∂Λₚ = λₚ * 365.241 -# Λₚ += ∂Λₚ -# Sₚ = exp(-Λₚ) -# num_pop[i,m] += Sₚ -# sit[m] += (1-Sₚ) / λₚ -# end -# end -# povp[i] = mean(sit ./ 365.241) -# end -# return num_pop, povp -# end \ No newline at end of file From b8c53c54654f0878c27526671443fa135bffae22 Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Fri, 17 May 2024 11:32:29 +0200 Subject: [PATCH 22/28] dunno what to do with the output --- src/Nessie.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Nessie.jl b/src/Nessie.jl index cb334d6..e436a26 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -38,6 +38,6 @@ function nessie(args...) return lt, r end - +# Maybe not necessary ? No need to clutter the interface too much.. expected_life_time(x::Nessie) = x.expected_life_time expected_sample_size(x::Nessie) = x.expected_sample_size From d51d237367ccf79f01d942f782e801c9bc4f9677 Mon Sep 17 00:00:00 2001 From: rimhajal Date: Fri, 17 May 2024 11:34:41 +0200 Subject: [PATCH 23/28] remove useless comments --- test/sampletest.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sampletest.jl b/test/sampletest.jl index c3d3b06..471b7d4 100644 --- a/test/sampletest.jl +++ b/test/sampletest.jl @@ -108,11 +108,11 @@ end # Coompare results with R: compare_with_R(v1, vR) - compare_with_R(v1_strat, vR_strat) # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<------------------- This ones fails + compare_with_R(v1_strat, vR_strat) # Check for equality of the two interfaces: check_equal(v1,v2) - check_equal(v1_strat,v2_strat) # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<------------------- This ones fails + check_equal(v1_strat,v2_strat) end From 000568d2c5c5d627f7cd060e38f7b8bdd2117923 Mon Sep 17 00:00:00 2001 From: rimhajal Date: Fri, 17 May 2024 11:41:26 +0200 Subject: [PATCH 24/28] upgrade RateTables dependency --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/Project.toml b/Project.toml index 740fbed..ad83249 100644 --- a/Project.toml +++ b/Project.toml @@ -19,7 +19,7 @@ CSV = "0.10" DataFrames = "1" Distributions = "0.25" LinearAlgebra = "1.6" -RateTables = "0.1" +RateTables = "0.1.1" RCall = "0.14" StatsAPI = "1" StatsBase = "0.34" From be48fecb70fee38f423accf926881c768b5f68bf Mon Sep 17 00:00:00 2001 From: rimhajal Date: Fri, 17 May 2024 13:40:33 +0200 Subject: [PATCH 25/28] error --- test/sampletest.jl | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/test/sampletest.jl b/test/sampletest.jl index 471b7d4..0bbfcef 100644 --- a/test/sampletest.jl +++ b/test/sampletest.jl @@ -146,4 +146,29 @@ end err_pop = (r[:population][:est][2:end, :] .- instance.Λₚ[1:end, :]) ./ r[:population][:est][2:end, :] @test all(abs.(err_causeSpec) .<= 0.01) @test all(abs.(err_pop) .<= 0.01) +end + +@testitem "Assess Nessie" begin + using RateTables + using RCall + + R""" + rez = relsurv::nessie(survival::Surv(time, stat) ~ sex, data = relsurv::colrec, ratetable = relsurv::slopop, rmap = list(age = age, sex = sex, year = diag)) + rez_male = rez$mata[1,] + rez_female = rez$mata[2,] + """ + + rESS = @rget rez + rESS_male = @rget rez_male + rESS_female = @rget rez_female + + instance = nessie(@formula(Surv(time,status)~sex), colrec, slopop) + + err_ESS_male = (rESS_male[1:end,:] .- instance[2].expected_sample_size[1]) ./ rESS_male[1:end,:] + err_ESS_female = (rESS_female[1:end,:] .- instance[2].expected_sample_size[2]) ./ rESS_male[1:end,:] + err_ELT = (rESS[:povp][1:end, :] .- instance[1].expected_life_time[1:end, :]) ./ rESS[:povp][1:end, :] + + @test all(err_ESS_male .<= 0.01) + @test abs(eachrow(err_ESS_female)) <= 0.01 + @test all(abs.(err_ELT) .<= 0.01) end \ No newline at end of file From 225a6a7a2614d734dcf216fc97a258133dd81edb Mon Sep 17 00:00:00 2001 From: Oskar Laverny Date: Fri, 17 May 2024 14:20:37 +0200 Subject: [PATCH 26/28] correct tet --- src/NetSurvival.jl | 6 ------ test/sampletest.jl | 
25 +++++++++++++------------ 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/NetSurvival.jl b/src/NetSurvival.jl index 5197429..a0dc006 100644 --- a/src/NetSurvival.jl +++ b/src/NetSurvival.jl @@ -13,19 +13,13 @@ using RateTables include("fetch_datasets.jl") include("Surv_and_Strata.jl") - include("Nessie.jl") - include("NPNSEstimator.jl") include("PoharPerme.jl") include("EdererI.jl") include("EdererII.jl") include("Hakulinen.jl") - include("CrudeMortality.jl") - - - include("GraffeoTest.jl") export PoharPerme, EdererI, EdererII, Hakulinen diff --git a/test/sampletest.jl b/test/sampletest.jl index 0bbfcef..db94d00 100644 --- a/test/sampletest.jl +++ b/test/sampletest.jl @@ -154,21 +154,22 @@ end R""" rez = relsurv::nessie(survival::Surv(time, stat) ~ sex, data = relsurv::colrec, ratetable = relsurv::slopop, rmap = list(age = age, sex = sex, year = diag)) - rez_male = rez$mata[1,] - rez_female = rez$mata[2,] + mata = t(as.matrix(rez$mata)) + povp = rez$povp """ - - rESS = @rget rez - rESS_male = @rget rez_male - rESS_female = @rget rez_female + r_mata = @rget mata + r_povp = @rget povp + r_male, r_female = r_mata[:,1], r_mata[:,2] instance = nessie(@formula(Surv(time,status)~sex), colrec, slopop) + jl_male, jl_female = instance[2].expected_sample_size + jl_povp = instance[1].expected_life_time - err_ESS_male = (rESS_male[1:end,:] .- instance[2].expected_sample_size[1]) ./ rESS_male[1:end,:] - err_ESS_female = (rESS_female[1:end,:] .- instance[2].expected_sample_size[2]) ./ rESS_male[1:end,:] - err_ELT = (rESS[:povp][1:end, :] .- instance[1].expected_life_time[1:end, :]) ./ rESS[:povp][1:end, :] + err_male = (r_male[1:end-1] .- jl_male) ./ r_male[1:end-1] + err_female = (r_female[1:end-1] .- jl_female) ./ r_female[1:end-1] + err_povp = (r_povp .- jl_povp) ./ r_povp - @test all(err_ESS_male .<= 0.01) - @test abs(eachrow(err_ESS_female)) <= 0.01 - @test all(abs.(err_ELT) .<= 0.01) + @test all(abs.(err_male) .<= 0.01) + @test all(abs.(err_female) 
.<= 0.01) + @test all(abs.(err_povp) .<= 0.01) end \ No newline at end of file From 6c32f2ac309e2eeb3b93e91b1d74f01501d228fe Mon Sep 17 00:00:00 2001 From: rimhajal Date: Tue, 21 May 2024 12:34:00 +0200 Subject: [PATCH 27/28] last push --- docs/src/example.md | 9 +++------ src/Nessie.jl | 3 +-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/docs/src/example.md b/docs/src/example.md index 905b618..d763bdc 100644 --- a/docs/src/example.md +++ b/docs/src/example.md @@ -242,11 +242,8 @@ Visually, it is almost immediately understood that there are no worthy differenc ## Estimated sample size and life expectancy -Given that the age group plays a significant role in the study, we will now define a new variable that groups patients in specific age groups. We will use these groups to estimate the sample size by yearly intervals and compare the groups. +Given that the age group plays a significant role in the study, we will now estimate the sample size by yearly intervals in order to better compare the age groups. -```@example 1 -breaks = [0; collect(45:5:90); Inf] - -colrec.agegr = cut(colrec.age./365.241, breaks, right=false) -ess = Nessie(@formula(Surv(time,status)~sex+agegr), colrec, slopop) +```@example 2 +ess = nessie(@formula(Surv(time,status)~age65), colrec, slopop) ``` \ No newline at end of file diff --git a/src/Nessie.jl b/src/Nessie.jl index e436a26..8cfe0f3 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -20,8 +20,7 @@ end """ nessie(formula, data, ratetable) -bla bla - +The Nessie function estimates the sample size by yearly intervals as well as averages an estimated lifespan left for a given group. """ function nessie(args...) r = fit(Nessie,args...) 
From 1409ec82662c9e50a9f4435e8ce0d5b10482c18d Mon Sep 17 00:00:00 2001 From: rimhajal Date: Thu, 23 May 2024 10:49:27 +0200 Subject: [PATCH 28/28] docs --- docs/src/example.md | 15 +++++++++++++-- docs/src/getting_started.md | 6 +++++- src/Nessie.jl | 6 ++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/docs/src/example.md b/docs/src/example.md index d763bdc..7f94a35 100644 --- a/docs/src/example.md +++ b/docs/src/example.md @@ -245,5 +245,16 @@ Visually, it is almost immediately understood that there are no worthy differenc Given that the age group plays a significant role in the study, we will now estimate the sample size by yearly intervals in order to better compare the age groups. ```@example 2 -ess = nessie(@formula(Surv(time,status)~age65), colrec, slopop) -``` \ No newline at end of file +elt, ess = nessie(@formula(Surv(time,status)~age65), colrec, slopop) +elt +``` + +The expected life time for the younger patients is significantly higher than for older patients (24.78 years > 10.29 years). + +```@example 2 +hcat(ess[:,3]...) +``` + +Finally, the table above represents yearly expected sample sizes for both age groups under 65 and above, with the second column representing the latter. We can see that the sample size decreases for the older patients in a much more dramatic way than for the younger ages. + +Unsurprisingly, we can thus conclude that age plays an important role in the study. \ No newline at end of file diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index c0e06fb..ce81892 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -123,10 +123,14 @@ This function is highly dependant on the `Life` function taken from the `RateTab The sample size is thus taken by the following formula: -$$ ESS = \sum_i^N S_{P_i} * exp(-\Lambda_p) $$ +$$ESS(t) = \sum_{i=1}^N S_{P_i}(t) = \sum_{i=1}^N \exp(-\Lambda_{P_i}(t))$$ While the estimated lifepsan is directly taken from the `expectation` function.
+```@docs +nessie +``` + ## References ```@bibliography diff --git a/src/Nessie.jl b/src/Nessie.jl index 8cfe0f3..c63ee1b 100644 --- a/src/Nessie.jl +++ b/src/Nessie.jl @@ -18,9 +18,11 @@ struct Nessie end """ - nessie(formula, data, ratetable) + nessie -The Nessie function estimates the sample size by yearly intervals as well as averages an estimated lifespan left for a given group. +To call this function, use the formula below: + + nessie(@formula(Surv(time,status)~covariate), data, ratetable) """ function nessie(args...) r = fit(Nessie,args...)