add norm. residuals to fit results + cosmetic improvements

LisaSchlueter committed Feb 5, 2024
1 parent 041fd42 commit 2acd7a2
Showing 9 changed files with 103 additions and 44 deletions.
3 changes: 1 addition & 2 deletions Project.toml
@@ -9,6 +9,7 @@ BAT = "c0cd4b16-88b7-57fa-983b-ab80aecada7e"
ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
@@ -53,12 +54,10 @@ Distributions = "0.24, 0.25"
FillArrays = "0.7,0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
Formatting = "0.4"
ForwardDiff = "0.10"
Interpolations = "v0.15.1"
IntervalSets = "0.7"
InverseFunctions = "0.1"
IrrationalConstants = "0.1, 0.2"
LegendDataManagement = "0.2.4"
LegendDataTypes = "v0.1.5"
LinearAlgebra = "1"
LinearRegression = "0.2"
LsqFit = "0.14, 0.15"
1 change: 1 addition & 0 deletions ext/LegendSpecFitsRecipesBaseExt.jl
@@ -374,4 +374,5 @@ end

end


end # module LegendSpecFitsRecipesBaseExt
3 changes: 0 additions & 3 deletions src/LegendSpecFits.jl
@@ -17,12 +17,10 @@ using BAT
using Distributions
using FillArrays
using ForwardDiff
using Interpolations
using IntervalSets
using InverseFunctions
using IrrationalConstants
using LegendDataManagement
using LegendDataTypes: fast_flatten
using LinearRegression
using LsqFit
using Optim
@@ -51,7 +49,6 @@ include("simple_calibration.jl")
include("auto_calibration.jl")
include("aoe_calibration.jl")
include("specfit_combined.jl")
include("specfit_testdata.jl")
include("ctc.jl")
include("qc.jl")
include("gof.jl")
84 changes: 63 additions & 21 deletions src/gof.jl
@@ -5,47 +5,50 @@ several functions to calculate goodness-of-fit (gof) for fits (-> `specfits.jl`)
"""

"""
_prepare_data(h::Histogram{<:Real,1})
aux. function to convert histogram data into bin counts, bin widths and bin centers
"""
function prepare_data(h::Histogram{<:Real,1})
function _prepare_data(h::Histogram{<:Real,1})
# get bin center, width and counts from histogrammed data
bin_edges = first(h.edges)
counts = h.weights
bin_centers = (bin_edges[begin:end-1] .+ bin_edges[begin+1:end]) ./ 2
bin_widths = bin_edges[begin+1:end] .- bin_edges[begin:end-1]
return counts, bin_widths, bin_centers
end
export prepare_data


"""
_get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
aux. function to get modelled peakshape based on histogram binning and best-fit parameters
"""
function get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
function _get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies)
return model_counts
end
export get_model_counts
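For illustration, a minimal standalone sketch of how these two helpers work together; the toy histogram, `f_toy` and its parameters are made up and not part of the package:

```julia
import StatsBase, LegendSpecFits
using Distributions: Normal, pdf

# toy spectrum and a hypothetical Gaussian "peakshape" standing in for the real fit model
h = StatsBase.fit(StatsBase.Histogram, randn(10_000), -4:0.1:4)
f_toy(x, v) = v.n * pdf(Normal(v.μ, v.σ), x)        # expected counts per unit energy
v_ml = (n = 10_000.0, μ = 0.0, σ = 1.0)

counts, bin_widths, bin_centers = LegendSpecFits._prepare_data(h)
model_counts = LegendSpecFits._get_model_counts(f_toy, v_ml, bin_centers, bin_widths)
```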



"""
`p_value(f_fit, h, v_ml)` : calculate p-value based on least-squares
p_value(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
calculate p-value based on least-squares
baseline method to get goodness-of-fit (gof)
input:
* f_fit --> function handle of fit function (peakshape)
* h --> histogram of data
* v_ml --> best-fit parameters
output:
* pval --> p-value of chi2 test
* chi2 --> chi2 value
* dof --> degrees of freedom
# input:
* `f_fit` function handle of fit function (peakshape)
* `h` histogram of data
* `v_ml` best-fit parameters
# returns:
* `pval` p-value of chi2 test
* `chi2` chi2 value
* `dof` degrees of freedom
"""
function p_value(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = prepare_data(h)
counts, bin_widths, bin_centers = _prepare_data(h)

# get peakshape of best-fit
model_counts = get_model_counts(f_fit, v_ml, bin_centers,bin_widths)
model_counts = _get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate chi2
chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
@@ -62,13 +65,16 @@ end
export p_value
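As a standalone illustration (not the package implementation), the least-squares test reduces to comparing Pearson's χ² over the non-empty model bins against a Chisq distribution:

```julia
using Distributions: Chisq, ccdf

# illustrative only: Pearson χ² over bins with non-zero model expectation,
# compared against a χ² distribution with (used bins - free parameters) degrees of freedom
function chi2_pvalue(counts, model_counts, n_params)
    sel  = model_counts .> 0
    chi2 = sum((model_counts[sel] .- counts[sel]).^2 ./ model_counts[sel])
    dof  = count(sel) - n_params
    pval = ccdf(Chisq(dof), chi2)        # P(χ² ≥ observed value)
    return pval, chi2, dof
end
```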


""" alternative p-value via loglikelihood ratio"""
"""
p_value_LogLikeRatio(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
alternative p-value via loglikelihood ratio
"""
function p_value_LogLikeRatio(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = prepare_data(h)
counts, bin_widths, bin_centers = _prepare_data(h)

# get peakshape of best-fit
model_counts =get_model_counts(f_fit, v_ml, bin_centers,bin_widths)
model_counts = _get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate chi2
chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
@@ -86,14 +92,16 @@ return pval, chi2, dof
end
export p_value_LogLikeRatio
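For reference, a sketch of a Poisson likelihood-ratio statistic of this kind (Baker-Cousins form; the exact expression used in the package may differ, but it is again asymptotically χ²-distributed):

```julia
# illustrative only: Baker-Cousins likelihood-ratio χ² for Poisson-distributed bins
function llr_chi2(counts, model_counts)
    sel = model_counts .> 0
    terms = map(counts[sel], model_counts[sel]) do n, ν
        n > 0 ? ν - n + n * log(n / ν) : ν      # the n = 0 term reduces to ν
    end
    return 2 * sum(terms)
end
```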

""" alternative p-value calculation via Monte Carlo sampling. Warning: computational more expensive than p_vaule() and p_value_LogLikeRatio()
"""
p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background)},v_ml::NamedTuple,;n_samples::Int64=1000)
alternative p-value calculation via Monte Carlo sampling. Warning: computationally more expensive than p_value() and p_value_LogLikeRatio()
* Create n_samples randomized histograms. For each bin, samples are drawn from a Poisson distribution with λ = model peak shape (best-fit parameter)
* Each sample histogram is fit using the model function `f_fit`
* For each sample fit, the max. loglikelihood is calculated
* p-value --> comparison of sample max. loglikelihood and max. loglikelihood of best-fit
"""
function p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background)},v_ml::NamedTuple,;n_samples::Int64=1000)
counts, bin_widths, bin_centers = prepare_data(h) # get data
counts, bin_widths, bin_centers = _prepare_data(h) # get data

# get peakshape of best-fit and maximum likelihood value
model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
@@ -123,4 +131,38 @@ function p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{
pval= sum(loglike_bf_mc.<=loglike_bf)./n_samples # preliminary. could be improved e.g. with interpolation
return pval
end
export p_value_MC
export p_value_MC
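A skeleton of the Monte Carlo procedure described in the docstring; the helper `refit_loglike` is hypothetical and stands in for the actual re-fit of each sampled histogram:

```julia
using Distributions: Poisson

# illustrative only: resample each bin from Poisson(model), refit each sample,
# and compare the resulting max. loglikelihoods with the best-fit value of the data
function mc_pvalue(model_counts, loglike_bf, refit_loglike; n_samples::Int = 1000)
    loglike_mc = map(1:n_samples) do _
        counts_mc = rand.(Poisson.(model_counts))   # randomized histogram, bin by bin
        refit_loglike(counts_mc)                    # user-supplied: fit sample, return max. loglikelihood
    end
    return count(loglike_mc .<= loglike_bf) / n_samples
end
```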

"""
get_residuals(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
calculate bin-wise residuals and normalized residuals
calculate bin-wise p-value based on the Poisson distribution for each bin
# input:
* `f_fit` function handle of fit function (peakshape)
* `h` histogram of data
* `v_ml` best-fit parameters
# returns:
* `residuals` difference: model - data (histogram bin count)
* `residuals_norm` normalized residuals: (model - data) / sqrt(model)
* `p_value_binwise` p-value for each bin based on the Poisson distribution
* `bin_centers` centers of the histogram bins the residuals refer to
"""
function get_residuals(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = _prepare_data(h)

# get peakshape of best-fit
model_counts = _get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate bin-wise residuals
residuals = model_counts[model_counts.>0]-counts[model_counts.>0]
sigma = sqrt.(model_counts[model_counts.>0])
residuals_norm = residuals./sigma

# calculate a bin-wise p-value: significance of each residual under the Poisson expectation of its bin
dist = Poisson.(model_counts) # each bin: poisson distributed
cdf_value_low = cdf.(dist, model_counts.-abs.(residuals))
cdf_value_up = 1 .-cdf.(dist, model_counts.+abs.(residuals))
p_value_binwise = cdf_value_low .+ cdf_value_up # significance of residuals -> ~probability that residual (for a given bin) is as large as observed or larger
return residuals, residuals_norm, p_value_binwise, bin_centers
end
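A hedged usage sketch of the new helper, assuming access to the internal peakshape `th228_fit_functions.f_fit`; `h` and `v_ml` stand for the histogram and best-fit parameters of a finished single-peak fit:

```julia
# illustrative only; get_residuals and th228_fit_functions are internal names
res, res_norm, p_binwise, centers =
    LegendSpecFits.get_residuals(LegendSpecFits.th228_fit_functions.f_fit, h, v_ml)

println("worst normalized residual: ", maximum(abs.(res_norm)))
```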

6 changes: 5 additions & 1 deletion src/specfit.jl
@@ -189,9 +189,13 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw

# Extract the parameter uncertainties
v_ml_err = array_to_tuple(sqrt.(abs.(diag(param_covariance))), v_ml)

# calculate p-value
pval, chi2, dof = p_value(th228_fit_functions.f_fit, h, v_ml)

# calculate normalized residuals
residuals, residuals_norm, p_value_binwise, bin_centers = get_residuals(th228_fit_functions.f_fit, h, v_ml)

# get fwhm of peak
fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, param_covariance)

@@ -202,7 +206,7 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw
@debug "p: $pval , chi2 = $(chi2) with $(dof) dof"
@debug "FWHM: $(fwhm) ± $(fwhm_err)"

result = merge(v_ml, (pval = pval, chi2 = chi2, dof = dof, fwhm = fwhm,covmat = param_covariance, covmat_raw = param_covariance_raw,),(err = merge(v_ml_err, (fwhm = fwhm_err,)),))
result = merge(v_ml, (pval = pval, chi2 = chi2, dof = dof, fwhm = fwhm, covmat = param_covariance, covmat_raw = param_covariance_raw, residuals = residuals, residuals_norm = residuals_norm, p_value_binwise = p_value_binwise, bin_centers = bin_centers,), (err = merge(v_ml_err, (fwhm = fwhm_err,)),))
report = (
v = v_ml,
h = h,
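A hedged sketch of how the result fields added in this commit can be read off a single-peak fit; the call signature and the `(result, report)` return pattern are assumed from the surrounding code:

```julia
# illustrative only; field names taken from the merge(...) call above
result, report = LegendSpecFits.fit_single_peak_th228(h, ps)
result.residuals_norm      # normalized residuals, (model - data) / sqrt(model)
result.p_value_binwise     # bin-wise Poisson p-values
result.bin_centers         # bin centers the residuals refer to
```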
15 changes: 10 additions & 5 deletions src/utils.jl
@@ -66,15 +66,19 @@ end

"""
get_mc_value_shapes(v::NamedTuple, v_err::NamedTuple, n::Int64)
Generate `n` random samples of the fit parameters: each field is drawn from a
`Normal` distribution with mean `v` and standard deviation `v_err`.
"""
function get_mc_value_shapes(v::NamedTuple, v_err::NamedTuple, n::Int64)
vs = BAT.distprod(map(Normal, v, v_err))
NamedTuple.(rand(vs, n))
end
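A hedged usage sketch with made-up parameter values and uncertainties:

```julia
# illustrative only: 1000 parameter samples drawn from independent Normals
v     = (μ = 1592.5, σ = 1.2)
v_err = (μ = 0.05,   σ = 0.08)
samples = LegendSpecFits.get_mc_value_shapes(v, v_err, 1000)   # Vector of NamedTuples
```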
function get_mc_value_shapes(v::NamedTuple, v_err::Matrix, n::Int64)

"""
get_mc_value_shapes(v::NamedTuple, v_err::Matrix, n::Union{Int64,Int32})
Generate `n` random samples of fit parameters using their respective best-fit values `v` and covariance matrix `v_err`
"""
function get_mc_value_shapes(v::NamedTuple, v_err::Matrix, n::Union{Int64,Int32})
if !isposdef(v_err)
v_err = nearestSPD(v_err)
@debug "Covariance matrix not positive definite. Using nearestSPD"
@@ -128,10 +132,11 @@ function get_number_of_bins(x::AbstractArray,; method::Symbol=:sqrt)
end

"""
nearestSPD(A) returns the nearest positive definite matrix to A
calculation is based on matrix factorization techniques described in https://www.sciencedirect.com/science/article/pii/0024379588902236
nearestSPD(A::Matrix{<:Real})
Returns the nearest positive definite matrix to A
Calculation is based on matrix factorization techniques described in https://www.sciencedirect.com/science/article/pii/0024379588902236
"""
function nearestSPD(A)
function nearestSPD(A::Matrix{<:Real})
B = (A + A') / 2 # make sure matrix is symmetric
_, s, V = svd(B) # singular value decomposition (SVD), s = singular values (~eigenvalues), V = right singular vector (~eigenvector)
H = V * diagm(0 => max.(s, 0)) * V' # symmetric polar factor of B
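A hedged usage sketch of `nearestSPD`; the example matrix is made up and, due to rounding, has a slightly negative eigenvalue:

```julia
using LinearAlgebra: isposdef

covmat = [1.0 0.999; 0.999 0.998]                # det < 0, so not positive definite
isposdef(covmat)                                  # false
covmat_spd = LegendSpecFits.nearestSPD(covmat)    # nearest symmetric positive (semi-)definite matrix
```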
4 changes: 4 additions & 0 deletions test/Project.toml
@@ -2,6 +2,8 @@
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
# BAT = "c0cd4b16-88b7-57fa-983b-ab80aecada7e"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
LegendDataTypes = "99e09c13-5545-5ee2-bfa2-77f358fb75d8"
LegendHDF5IO = "c9265ca6-b027-5446-b1a4-febfa8dd10b0"
Optim = "429524aa-4258-5aef-a3af-852621145aeb"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
@@ -10,3 +12,5 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
Documenter = "1"
Interpolations = "0.15"
LegendDataTypes = "0.1"
7 changes: 6 additions & 1 deletion test/test_specfit.jl
@@ -1,9 +1,13 @@
# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT).
using LegendSpecFits
using Test
using LegendDataTypes: fast_flatten
using Interpolations

@testset "specfit" begin
# load data, simple calibration
include("test_utils.jl")

energy_test, th228_lines = generate_mc_spectrum(200000)

# simple calibration fit
@@ -12,4 +16,5 @@ using Test

# fit
result, report = fit_peaks(result_simple.peakhists, result_simple.peakstats, th228_lines,; uncertainty=true);
end
end

24 changes: 13 additions & 11 deletions src/specfit_testdata.jl → test/test_utils.jl
@@ -1,12 +1,15 @@
# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT).
#=
Sample Legend200 calibration data based on "Inverse Transform Sampling" method:
- pdf of th228 calibration calibration peak is estimated from fit model function f_fit from LegendSpecFits
- calculate the cumulative distribution function F(x)
- generate a random number u from a uniform distribution between 0 and 1.
- find the value x such that F(x) = u by solving for x . --> done by interpolation of the inverse cdf
- repeat for many u --> energy samples
=#

"""
generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit_functions.f_fit)
Sample Legend200 calibration data based on the "Inverse Transform Sampling" method
# Method:
* pdf of th228 calibration peaks is estimated from the fit model function f_fit from LegendSpecFits
* calculate the cumulative distribution function F(x)
* generate a random number u from a uniform distribution between 0 and 1.
* find the value x such that F(x) = u by solving for x, done by interpolation of the inverse cdf
* repeat for many u to obtain energy samples
"""
function generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit_functions.f_fit)

th228_lines = [583.191, 727.330, 860.564, 1592.53, 1620.50, 2103.53, 2614.51]
@@ -34,7 +37,7 @@ function generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit
bin_widths = range(bw,bw, length=n_step)

# save as intermediate result
model_counts_all[i] = get_model_counts(f_fit, v[i], bin_centers_all[i], bin_widths)
model_counts_all[i] = LegendSpecFits._get_model_counts(f_fit, v[i], bin_centers_all[i], bin_widths)
PeakMax[i] = maximum(model_counts_all[i])

# create CDF
@@ -50,11 +53,10 @@ function generate_mc_spectrum(n_tot::Int=200000,; f_fit::Base.Callable=th228_fit
for i=1:length(th228_lines)
bandwidth = maximum(model_cdf_all[i])-minimum(model_cdf_all[i])
rand_i = minimum(model_cdf_all[i]).+bandwidth.*rand(n_i[i]); # make sure sample is within model range
interp_cdf_inv = LinearInterpolation(model_cdf_all[i],bin_centers_all[i]) # inverse cdf
interp_cdf_inv = linear_interpolation(model_cdf_all[i],bin_centers_all[i]) # inverse cdf
energy_mc_all[i] = interp_cdf_inv.(rand_i)
end

energy_mc = fast_flatten(energy_mc_all)
return energy_mc, th228_lines
end
export generate_mc_spectrum
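For clarity, a standalone sketch of the inverse-transform-sampling step described in the docstring, using a toy Gaussian in place of the fitted peakshape:

```julia
using Interpolations: linear_interpolation
using Distributions: Normal, pdf

x    = range(-5, 5; length = 1001)             # energy grid (toy units)
pdfv = pdf.(Normal(0, 1), x)                   # toy "peakshape"
cdfv = cumsum(pdfv); cdfv ./= cdfv[end]        # numerical cdf, normalized to 1

inv_cdf = linear_interpolation(cdfv, collect(x))   # interpolate x as a function of the cdf
u = minimum(cdfv) .+ (maximum(cdfv) - minimum(cdfv)) .* rand(10_000)
samples = inv_cdf.(u)                          # energies distributed like the toy pdf
```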
