add norm. residuals to fit results + cosmetic improvements

LisaSchlueter committed Feb 5, 2024
1 parent 041fd42 commit 2acd7a2
Showing 9 changed files with 103 additions and 44 deletions.
3 changes: 1 addition & 2 deletions Project.toml
@@ -9,6 +9,7 @@ BAT = "c0cd4b16-88b7-57fa-983b-ab80aecada7e"
ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
@@ -53,12 +54,10 @@ Distributions = "0.24, 0.25"
FillArrays = "0.7,0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
Formatting = "0.4"
ForwardDiff = "0.10"
Interpolations = "v0.15.1"
IntervalSets = "0.7"
InverseFunctions = "0.1"
IrrationalConstants = "0.1, 0.2"
LegendDataManagement = "0.2.4"
LegendDataTypes = "v0.1.5"
LinearAlgebra = "1"
LinearRegression = "0.2"
LsqFit = "0.14, 0.15"
1 change: 1 addition & 0 deletions ext/LegendSpecFitsRecipesBaseExt.jl
@@ -374,4 +374,5 @@ end

end


end # module LegendSpecFitsRecipesBaseExt
3 changes: 0 additions & 3 deletions src/LegendSpecFits.jl
@@ -17,12 +17,10 @@ using BAT
using Distributions
using FillArrays
using ForwardDiff
using Interpolations
using IntervalSets
using InverseFunctions
using IrrationalConstants
using LegendDataManagement
using LegendDataTypes: fast_flatten
using LinearRegression
using LsqFit
using Optim
@@ -51,7 +49,6 @@ include("simple_calibration.jl")
include("auto_calibration.jl")
include("aoe_calibration.jl")
include("specfit_combined.jl")
include("specfit_testdata.jl")
include("ctc.jl")
include("qc.jl")
include("gof.jl")
84 changes: 63 additions & 21 deletions src/gof.jl
@@ -5,47 +5,50 @@ several functions to calculate goodness-of-fit (gof) for fits (-> `specfits.jl`)
"""

"""
_prepare_data(h::Histogram{<:Real,1})
aux. function to convert histogram data into bin counts, bin widths and bin centers
"""
function prepare_data(h::Histogram{<:Real,1})
function _prepare_data(h::Histogram{<:Real,1})
# get bin center, width and counts from histogrammed data
bin_edges = first(h.edges)
counts = h.weights
bin_centers = (bin_edges[begin:end-1] .+ bin_edges[begin+1:end]) ./ 2
bin_widths = bin_edges[begin+1:end] .- bin_edges[begin:end-1]
return counts, bin_widths, bin_centers
end
export prepare_data


"""
_get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
aux. function to get modelled peakshape based on histogram binning and best-fit parameters
"""
function get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
function _get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies)
return model_counts
end
export get_model_counts
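For illustration, a minimal standalone sketch of how these two helpers work together; the toy histogram, `f_toy` and its parameters are made up and not part of the package:

```julia
import StatsBase, LegendSpecFits
using Distributions: Normal, pdf

# toy spectrum and a hypothetical Gaussian "peakshape" standing in for the real fit model
h = StatsBase.fit(StatsBase.Histogram, randn(10_000), -4:0.1:4)
f_toy(x, v) = v.n * pdf(Normal(v.μ, v.σ), x)        # expected counts per unit energy
v_ml = (n = 10_000.0, μ = 0.0, σ = 1.0)

counts, bin_widths, bin_centers = LegendSpecFits._prepare_data(h)
model_counts = LegendSpecFits._get_model_counts(f_toy, v_ml, bin_centers, bin_widths)
```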



"""
`p_value(f_fit, h, v_ml)` : calculate p-value based on least-squares
p_value(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
calculate p-value based on least-squares
baseline method to get goodness-of-fit (gof)
input:
* f_fit --> function handle of fit function (peakshape)
* h --> histogram of data
* v_ml --> best-fit parameters
output:
* pval --> p-value of chi2 test
* chi2 --> chi2 value
* dof --> degrees of freedom
# input:
* `f_fit` function handle of fit function (peakshape)
* `h` histogram of data
* `v_ml` best-fit parameters
# returns:
* `pval` p-value of chi2 test
* `chi2` chi2 value
* `dof` degrees of freedom
"""
function p_value(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = prepare_data(h)
counts, bin_widths, bin_centers = _prepare_data(h)

# get peakshape of best-fit
model_counts = get_model_counts(f_fit, v_ml, bin_centers,bin_widths)
model_counts = _get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate chi2
chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
@@ -62,13 +65,16 @@ end
export p_value
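As a standalone illustration (not the package implementation), the least-squares test reduces to comparing Pearson's χ² over the non-empty model bins against a Chisq distribution:

```julia
using Distributions: Chisq, ccdf

# illustrative only: Pearson χ² over bins with non-zero model expectation,
# compared against a χ² distribution with (used bins - free parameters) degrees of freedom
function chi2_pvalue(counts, model_counts, n_params)
    sel  = model_counts .> 0
    chi2 = sum((model_counts[sel] .- counts[sel]).^2 ./ model_counts[sel])
    dof  = count(sel) - n_params
    pval = ccdf(Chisq(dof), chi2)        # P(χ² ≥ observed value)
    return pval, chi2, dof
end
```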


""" alternative p-value via loglikelihood ratio"""
"""
p_value_LogLikeRatio(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
alternative p-value via loglikelihood ratio
"""
function p_value_LogLikeRatio(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = prepare_data(h)
counts, bin_widths, bin_centers = _prepare_data(h)

# get peakshape of best-fit
model_counts =get_model_counts(f_fit, v_ml, bin_centers,bin_widths)
model_counts = _get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate chi2
chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
@@ -86,14 +92,16 @@ return pval, chi2, dof
end
export p_value_LogLikeRatio
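For reference, a sketch of a Poisson likelihood-ratio statistic of this kind (Baker-Cousins form; the exact expression used in the package may differ, but it is again asymptotically χ²-distributed):

```julia
# illustrative only: Baker-Cousins likelihood-ratio χ² for Poisson-distributed bins
function llr_chi2(counts, model_counts)
    sel = model_counts .> 0
    terms = map(counts[sel], model_counts[sel]) do n, ν
        n > 0 ? ν - n + n * log(n / ν) : ν      # the n = 0 term reduces to ν
    end
    return 2 * sum(terms)
end
```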

""" alternative p-value calculation via Monte Carlo sampling. Warning: computational more expensive than p_vaule() and p_value_LogLikeRatio()
"""
p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background)},v_ml::NamedTuple,;n_samples::Int64=1000)
alternative p-value calculation via Monte Carlo sampling. Warning: computationally more expensive than p_value() and p_value_LogLikeRatio()
* Create n_samples randomized histograms. For each bin, samples are drawn from a Poisson distribution with λ = model peak shape (best-fit parameter)
* Each sample histogram is fit using the model function `f_fit`
* For each sample fit, the max. loglikelihood is calculated
* p-value --> comparison of sample max. loglikelihood and max. loglikelihood of best-fit
"""
function p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background)},v_ml::NamedTuple,;n_samples::Int64=1000)
counts, bin_widths, bin_centers = prepare_data(h) # get data
counts, bin_widths, bin_centers = _prepare_data(h) # get data

# get peakshape of best-fit and maximum likelihood value
model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
@@ -123,4 +131,38 @@ function p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{
pval= sum(loglike_bf_mc.<=loglike_bf)./n_samples # preliminary. could be improved e.g. with interpolation
return pval
end
export p_value_MC
export p_value_MC
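A skeleton of the Monte Carlo procedure described in the docstring; the helper `refit_loglike` is hypothetical and stands in for the actual re-fit of each sampled histogram:

```julia
using Distributions: Poisson

# illustrative only: resample each bin from Poisson(model), refit each sample,
# and compare the resulting max. loglikelihoods with the best-fit value of the data
function mc_pvalue(model_counts, loglike_bf, refit_loglike; n_samples::Int = 1000)
    loglike_mc = map(1:n_samples) do _
        counts_mc = rand.(Poisson.(model_counts))   # randomized histogram, bin by bin
        refit_loglike(counts_mc)                    # user-supplied: fit sample, return max. loglikelihood
    end
    return count(loglike_mc .<= loglike_bf) / n_samples
end
```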

"""
get_residuals(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
calculate bin-wise residuals and normalized residuals
calculate bin-wise p-value based on the Poisson distribution for each bin
# input:
* `f_fit` function handle of fit function (peakshape)
* `h` histogram of data
* `v_ml` best-fit parameters
# returns:
* `residuals` difference: model - data (histogram bin count)
* `residuals_norm` normalized residuals: (model - data) / sqrt(model)
* `p_value_binwise` p-value for each bin based on the Poisson distribution
* `bin_centers` centers of the histogram bins the residuals refer to
"""
function get_residuals(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = _prepare_data(h)

# get peakshape of best-fit
model_counts = _get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate bin-wise residuals
residuals = model_counts[model_counts.>0]-counts[model_counts.>0]
sigma = sqrt.(model_counts[model_counts.>0])
residuals_norm = residuals./sigma

# calculate a bin-wise p-value: significance of each residual under the Poisson expectation of its bin
dist = Poisson.(model_counts) # each bin: poisson distributed
cdf_value_low = cdf.(dist, model_counts.-abs.(residuals))
cdf_value_up = 1 .-cdf.(dist, model_counts.+abs.(residuals))
p_value_binwise = cdf_value_low .+ cdf_value_up # significance of residuals -> ~probability that residual (for a given bin) is as large as observed or larger
return residuals, residuals_norm, p_value_binwise, bin_centers
end
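A hedged usage sketch of the new helper, assuming access to the internal peakshape `th228_fit_functions.f_fit`; `h` and `v_ml` stand for the histogram and best-fit parameters of a finished single-peak fit:

```julia
# illustrative only; get_residuals and th228_fit_functions are internal names
res, res_norm, p_binwise, centers =
    LegendSpecFits.get_residuals(LegendSpecFits.th228_fit_functions.f_fit, h, v_ml)

println("worst normalized residual: ", maximum(abs.(res_norm)))
```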

6 changes: 5 additions & 1 deletion src/specfit.jl
@@ -189,9 +189,13 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw

# Extract the parameter uncertainties
v_ml_err = array_to_tuple(sqrt.(abs.(diag(param_covariance))), v_ml)

# calculate p-value
pval, chi2, dof = p_value(th228_fit_functions.f_fit, h, v_ml)

# calculate normalized residuals
residuals, residuals_norm, p_value_binwise, bin_centers = get_residuals(th228_fit_functions.f_fit, h, v_ml)

# get fwhm of peak
fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, param_covariance)

@@ -202,7 +206,7 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw
@debug "p: $pval , chi2 = $(chi2) with $(dof) dof"
@debug "FWHM: $(fwhm) ± $(fwhm_err)"

result = merge(v_ml, (pval = pval, chi2 = chi2, dof = dof, fwhm = fwhm,covmat = param_covariance, covmat_raw = param_covariance_raw,),(err = merge(v_ml_err, (fwhm = fwhm_err,)),))
result = merge(v_ml, (pval = pval, chi2 = chi2, dof = dof, fwhm = fwhm, covmat = param_covariance, covmat_raw = param_covariance_raw, residuals = residuals, residuals_norm = residuals_norm, p_value_binwise = p_value_binwise, bin_centers = bin_centers,), (err = merge(v_ml_err, (fwhm = fwhm_err,)),))
report = (
v = v_ml,
h = h,
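A hedged sketch of how the result fields added in this commit can be read off a single-peak fit; the call signature and the `(result, report)` return pattern are assumed from the surrounding code:

```julia
# illustrative only; field names taken from the merge(...) call above
result, report = LegendSpecFits.fit_single_peak_th228(h, ps)
result.residuals_norm      # normalized residuals, (model - data) / sqrt(model)
result.p_value_binwise     # bin-wise Poisson p-values
result.bin_centers         # bin centers the residuals refer to
```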
15 changes: 10 additions & 5 deletions src/utils.jl
@@ -66,15 +66,19 @@ end

"""
get_mc_value_shapes(v::NamedTuple, v_err::NamedTuple, n::Int64)
Generate `n` random samples of the fit parameters: each field is drawn from a
`Normal` distribution with mean `v` and standard deviation `v_err`.
"""
function get_mc_value_shapes(v::NamedTuple, v_err::NamedTuple, n::Int64)
vs = BAT.distprod(map(Normal, v, v_err))
NamedTuple.(rand(vs, n))
end
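A hedged usage sketch with made-up parameter values and uncertainties:

```julia
# illustrative only: 1000 parameter samples drawn from independent Normals
v     = (μ = 1592.5, σ = 1.2)
v_err = (μ = 0.05,   σ = 0.08)
samples = LegendSpecFits.get_mc_value_shapes(v, v_err, 1000)   # Vector of NamedTuples
```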
function get_mc_value_shapes(v::NamedTuple, v_err::Matrix, n::Int64)

"""
get_mc_value_shapes(v::NamedTuple, v_err::Matrix, n::Union{Int64,Int32})
Generate `n` random samples of fit parameters using their respective best-fit values `v` and covariance matrix `v_err`
"""
function get_mc_value_shapes(v::NamedTuple, v_err::Matrix, n::Union{Int64,Int32})
if !isposdef(v_err)
v_err = nearestSPD(v_err)
@debug "Covariance matrix not positive definite. Using nearestSPD"
@@ -128,10 +132,11 @@ function get_number_of_bins(x::AbstractArray,; method::Symbol=:sqrt)
end

"""
nearestSPD(A) returns the nearest positive definite matrix to A
calculation is based on matrix factorization techniques described in https://www.sciencedirect.com/science/article/pii/0024379588902236
nearestSPD(A::Matrix{<:Real})
Returns the nearest positive definite matrix to A
Calculation is based on matrix factorization techniques described in https://www.sciencedirect.com/science/article/pii/0024379588902236
"""
function nearestSPD(A)
function nearestSPD(A::Matrix{<:Real})
B = (A + A') / 2 # make sure matrix is symmetric
_, s, V = svd(B) # singular value decomposition (SVD), s = singular values (~eigenvalues), V = right singular vector (~eigenvector)
H = V * diagm(0 => max.(s, 0)) * V' # symmetric polar factor of B
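A hedged usage sketch of `nearestSPD`; the example matrix is made up and, due to rounding, has a slightly negative eigenvalue:

```julia
using LinearAlgebra: isposdef

covmat = [1.0 0.999; 0.999 0.998]                # det < 0, so not positive definite
isposdef(covmat)                                  # false
covmat_spd = LegendSpecFits.nearestSPD(covmat)    # nearest symmetric positive (semi-)definite matrix
```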
4 changes: 4 additions & 0 deletions test/Project.toml
@@ -2,6 +2,8 @@
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
# BAT = "c0cd4b16-88b7-57fa-983b-ab80aecada7e"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
LegendDataTypes = "99e09c13-5545-5ee2-bfa2-77f358fb75d8"
LegendHDF5IO = "c9265ca6-b027-5446-b1a4-febfa8dd10b0"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
@@ -10,3 +12,5 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
Documenter = "1"
Interpolations = "0.15"
LegendDataTypes = "0.1"
7 changes: 6 additions & 1 deletion test/test_specfit.jl
@@ -1,9 +1,13 @@
# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT).
using LegendSpecFits
using Test
using LegendDataTypes: fast_flatten
using Interpolations

@testset "specfit" begin
# load data, simple calibration
include("test_utils.jl")

energy_test, th228_lines = generate_mc_spectrum(200000)

# simple calibration fit
@@ -12,4 +16,5 @@ using Test

# fit
result, report = fit_peaks(result_simple.peakhists, result_simple.peakstats, th228_lines,; uncertainty=true);
end
end

24 changes: 13 additions & 11 deletions src/specfit_testdata.jl → test/test_utils.jl
@@ -1,12 +1,15 @@
# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT).
#=
Sample Legend200 calibration data based on "Inverse Transform Sampling" method:
- pdf of th228 calibration calibration peak is estimated from fit model function f_fit from LegendSpecFits
- calculate the cumulative distribution function F(x)
- generate a random number u from a uniform distribution between 0 and 1.
- find the value x such that F(x) = u by solving for x . --> done by interpolation of the inverse cdf
- repeat for many u --> energy samples
=#

"""
generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit_functions.f_fit)
Sample Legend200 calibration data based on the "Inverse Transform Sampling" method
# Method:
* pdf of th228 calibration peaks is estimated from the fit model function f_fit from LegendSpecFits
* calculate the cumulative distribution function F(x)
* generate a random number u from a uniform distribution between 0 and 1.
* find the value x such that F(x) = u by solving for x, done by interpolation of the inverse cdf
* repeat for many u to obtain energy samples
"""
function generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit_functions.f_fit)

th228_lines = [583.191, 727.330, 860.564, 1592.53, 1620.50, 2103.53, 2614.51]
@@ -34,7 +37,7 @@ function generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit
bin_widths = range(bw,bw, length=n_step)

# save as intermediate result
model_counts_all[i] = get_model_counts(f_fit, v[i], bin_centers_all[i], bin_widths)
model_counts_all[i] = LegendSpecFits._get_model_counts(f_fit, v[i], bin_centers_all[i], bin_widths)
PeakMax[i] = maximum(model_counts_all[i])

# create CDF
@@ -50,11 +53,10 @@ function generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit
for i=1:length(th228_lines)
bandwidth = maximum(model_cdf_all[i])-minimum(model_cdf_all[i])
rand_i = minimum(model_cdf_all[i]).+bandwidth.*rand(n_i[i]); # make sure sample is within model range
interp_cdf_inv = LinearInterpolation(model_cdf_all[i],bin_centers_all[i]) # inverse cdf
interp_cdf_inv = linear_interpolation(model_cdf_all[i],bin_centers_all[i]) # inverse cdf
energy_mc_all[i] = interp_cdf_inv.(rand_i)
end

energy_mc = fast_flatten(energy_mc_all)
return energy_mc, th228_lines
end
export generate_mc_spectrum
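For clarity, a standalone sketch of the inverse-transform-sampling step described in the docstring, using a toy Gaussian in place of the fitted peakshape:

```julia
using Interpolations: linear_interpolation
using Distributions: Normal, pdf

x    = range(-5, 5; length = 1001)             # energy grid (toy units)
pdfv = pdf.(Normal(0, 1), x)                   # toy "peakshape"
cdfv = cumsum(pdfv); cdfv ./= cdfv[end]        # numerical cdf, normalized to 1

inv_cdf = linear_interpolation(cdfv, collect(x))   # interpolate x as a function of the cdf
u = minimum(cdfv) .+ (maximum(cdfv) - minimum(cdfv)) .* rand(10_000)
samples = inv_cdf.(u)                          # energies distributed like the toy pdf
```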
