pvalue/gof, fwhm correlated uncertainty, iterative fit, SPD covmat, test data, norm. residuals #33

Merged
merged 23 commits into from
Feb 6, 2024
Changes from all commits
Commits
23 commits
05acef5
add p-value for fit_single_peak_th228
LisaSchlueter Jan 29, 2024
416c299
add 2 more p-value calculation alternatives
LisaSchlueter Jan 30, 2024
5815b73
implement iterative fit for cali peaks based on covmat
LisaSchlueter Jan 30, 2024
56ee493
add doc to gof
LisaSchlueter Jan 30, 2024
226d511
iterative_fit: add flag, default is false -> no iterative fit
LisaSchlueter Jan 31, 2024
f4fef01
fwhm uncertainty with covmat
LisaSchlueter Feb 2, 2024
4203bde
function to sample test data from model, run test doesn't need to read…
LisaSchlueter Feb 3, 2024
574f4b2
add distribution package
LisaSchlueter Feb 3, 2024
7c85c10
fix pull request issue, forgot a plot
LisaSchlueter Feb 3, 2024
041fd42
add fast flatten
LisaSchlueter Feb 5, 2024
2acd7a2
add norm. residuals to fit results + cosmetic improvements
LisaSchlueter Feb 5, 2024
39249e6
Cleaned docstrings
theHenks Feb 6, 2024
309605b
Changed some formatting
theHenks Feb 6, 2024
8b70401
Apply suggestions from code review
LisaSchlueter Feb 6, 2024
b4976e1
Apply suggestions from code review
LisaSchlueter Feb 6, 2024
3ae2b5c
Fixed compat
theHenks Jan 15, 2024
72630f0
Fix docstring
theHenks Feb 6, 2024
d937058
Increased LegendDataManagement package version
theHenks Feb 6, 2024
44413c4
add fast flatten
LisaSchlueter Feb 5, 2024
b4fba2a
add norm. residuals to fit results + cosmetic improvements
LisaSchlueter Feb 5, 2024
fc1f130
Fixed deps
theHenks Feb 6, 2024
42e2138
Fixed specfit test
theHenks Feb 6, 2024
cf7bcfa
Merge branch 'dev' into dev
theHenks Feb 6, 2024
3 changes: 2 additions & 1 deletion Project.toml
@@ -9,13 +9,15 @@ BAT = "c0cd4b16-88b7-57fa-983b-ab80aecada7e"
ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6"
LegendDataManagement = "9feedd95-f0e0-423f-a8dc-de0970eae6b3"
LegendDataTypes = "99e09c13-5545-5ee2-bfa2-77f358fb75d8"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LinearRegression = "92481ed7-9fb7-40fd-80f2-46fd0f076581"
LsqFit = "2fda8390-95c7-5789-9bda-21331edee243"
@@ -56,7 +58,6 @@ InverseFunctions = "0.1"
IrrationalConstants = "0.1, 0.2"
LegendDataManagement = "0.2.7"
LinearAlgebra = "1"
LinearAlgebra = "1"
LinearRegression = "0.2"
LsqFit = "0.14, 0.15"
Optim = "1"
1 change: 1 addition & 0 deletions ext/LegendSpecFitsRecipesBaseExt.jl
@@ -374,4 +374,5 @@ end

end


end # module LegendSpecFitsRecipesBaseExt
2 changes: 1 addition & 1 deletion src/LegendSpecFits.jl
@@ -51,7 +51,7 @@ include("aoe_calibration.jl")
include("specfit_combined.jl")
include("ctc.jl")
include("qc.jl")

include("gof.jl")
include("precompile.jl")

end # module
176 changes: 176 additions & 0 deletions src/gof.jl
@@ -0,0 +1,176 @@
# This file is a part of LegendSpecFits.jl, licensed under the MIT License (MIT).


"""
_prepare_data(h::Histogram{<:Real,1})
aux. function to convert histogram data into bin counts, bin widths and bin centers
"""
function _prepare_data(h::Histogram{<:Real,1})
# get bin center, width and counts from histogrammed data
bin_edges = first(h.edges)
counts = h.weights
bin_centers = (bin_edges[begin:end-1] .+ bin_edges[begin+1:end]) ./ 2
bin_widths = bin_edges[begin+1:end] .- bin_edges[begin:end-1]
return counts, bin_widths, bin_centers
end


"""
_get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
aux. function to get the modelled peakshape based on the histogram binning and the best-fit parameters
"""
function _get_model_counts(f_fit::Base.Callable,v_ml::NamedTuple,bin_centers::StepRangeLen,bin_widths::StepRangeLen)
model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies)
return model_counts
end



"""
p_value(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
calculate p-value based on least-squares
baseline method to assess goodness-of-fit (gof)
# input:
* `f_fit` function handle of fit function (peakshape)
* `h` histogram of data
* `v_ml` best-fit parameters
# returns:
* `pval` p-value of chi2 test
* `chi2` chi2 value
* `dof` degrees of freedom
"""
function p_value(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = _prepare_data(h)

# get peakshape of best-fit
model_counts = _get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate chi2
chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
npar = length(v_ml)
dof = length(counts[model_counts.>0])-npar
pval = ccdf(Chisq(dof),chi2)
if any(model_counts.<=5)
@warn "WARNING: bin with <=$(round(minimum(model_counts),digits=0)) counts - chi2 test might be not valid"
else
@debug "p-value = $(round(pval,digits=2))"
end
return pval, chi2, dof
end
export p_value
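A minimal usage sketch (editor's addition, not part of the diff), assuming `h` is the histogram of the fitted peak region and `v_ml` is the NamedTuple of best-fit parameters obtained from `fit_single_peak_th228`:

# editor's sketch: goodness-of-fit for an already fitted peak (h and v_ml assumed to exist)
pval, chi2, dof = p_value(th228_fit_functions.f_fit, h, v_ml)
@info "p-value = $(round(pval, digits=3)), chi2/dof = $(round(chi2/dof, digits=2))"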


"""
p_value_LogLikeRatio(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
alternative p-value via loglikelihood ratio
"""
function p_value_LogLikeRatio(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = _prepare_data(h)

# get peakshape of best-fit
model_counts = _get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate chi2
chi2 = sum((model_counts[model_counts.>0]-counts[model_counts.>0]).^2 ./ model_counts[model_counts.>0])
npar = length(v_ml)
dof = length(counts[model_counts.>0])-npar
pval = ccdf(Chisq(dof),chi2)
if any(model_counts.<=5)
@warn "WARNING: bin with <=$(minimum(model_counts)) counts - chi2 test might be not valid"
else
@debug "p-value = $(round(pval,digits=2))"
end
chi2 = 2*sum(model_counts.*log.(model_counts./counts)+model_counts-counts)
pval = ccdf(Chisq(dof),chi2)
return pval, chi2, dof
end
export p_value_LogLikeRatio

"""
p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background)},v_ml::NamedTuple,;n_samples::Int64=1000)
alternative p-value calculation via Monte Carlo sampling. **Warning**: computationally more expensive than p_value() and p_value_LogLikeRatio()
# Input:
* `f_fit` function handle of fit function (peakshape)
* `h` histogram of data
* `ps` best-fit parameters
* `v_ml` best-fit parameters
* `n_samples` number of samples

# Performed Steps:
* Create n_samples randomized histograms. For each bin, samples are drawn from a Poisson distribution with λ = model peak shape (best-fit parameter)
* Each sample histogram is fit using the model function `f_fit`
* For each sample fit, the max. loglikelihood is calculated

# Returns
* `pval` p-value --> comparison of the sample max. loglikelihoods with the max. loglikelihood of the best fit
"""
function p_value_MC(f_fit::Base.Callable, h::Histogram{<:Real,1},ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background)},v_ml::NamedTuple,;n_samples::Int64=1000)
counts, bin_widths, bin_centers = _prepare_data(h) # get data
# get peakshape of best-fit and maximum likelihood value
model_func = Base.Fix2(f_fit, v_ml) # fix the fit parameters to ML best-estimate
model_counts = bin_widths.*map(energy->model_func(energy), bin_centers) # evaluate model at bin center (= binned measured energies)
loglike_bf = -hist_loglike(model_func,h)

# draw sample for each bin
dists = Poisson.(model_counts) # create poisson distribution for each bin
counts_mc_vec = rand.(dists,n_samples) # randomized histogram counts
counts_mc = [ [] for _ in 1:n_samples ] # re-structure counts_mc_vec into an array of arrays; there is probably a better way to do this...
for i = 1:n_samples
counts_mc[i] = map(x -> x[i],counts_mc_vec)
end

# fit every sample histogram and calculate max. loglikelihood
loglike_bf_mc = NaN.*ones(n_samples)
h_mc = h # reuse the data histogram; its weights are overwritten with the MC counts below
for i=1:n_samples
h_mc.weights = counts_mc[i] # overwrite counts with MC values
result_fit_mc, report = fit_single_peak_th228(h_mc, ps ; uncertainty=false) # fit MC histogram
fit_par_mc = result_fit_mc[(:μ, :σ, :n, :step_amplitude, :skew_fraction, :skew_width, :background)]
model_func_sample = Base.Fix2(f_fit, fit_par_mc) # fix the fit parameters to ML best-estimate
loglike_bf_mc[i] = -hist_loglike(model_func_sample,h_mc) # loglikelihood for best-fit
end

# calculate p-value
pval= sum(loglike_bf_mc.<=loglike_bf)./n_samples # preliminary. could be improved e.g. with interpolation
return pval
end
export p_value_MC
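A hedged usage sketch (editor's addition): the Monte Carlo variant additionally needs the peak statistics `ps` used to seed the refits and is considerably slower, so `n_samples` should stay moderate. `h`, `ps` and `v_ml` are assumed to come from a previous `fit_single_peak_th228` call:

# editor's sketch: Monte Carlo p-value with the default number of samples
pval_mc = p_value_MC(th228_fit_functions.f_fit, h, ps, v_ml; n_samples=1000)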

"""
get_residuals(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
Calculate bin-wise residuals and normalized residuals.
Calculate a bin-wise p-value based on the Poisson distribution for each bin.

# Input:
* `f_fit` function handle of fit function (peakshape)
* `h` histogram of data
* `v_ml` best-fit parameters

# Returns:
* `residuals` difference: model - data (histogram bin count)
* `residuals_norm` normalized residuals: (model - data) / sqrt(model)
* `p_value_binwise` p-value for each bin based on the Poisson distribution
"""
function get_residuals(f_fit::Base.Callable, h::Histogram{<:Real,1},v_ml::NamedTuple)
# prepare data
counts, bin_widths, bin_centers = _prepare_data(h)

# get peakshape of best-fit
model_counts = _get_model_counts(f_fit, v_ml, bin_centers,bin_widths)

# calculate bin-wise residuals
residuals = model_counts[model_counts.>0]-counts[model_counts.>0]
sigma = sqrt.(model_counts[model_counts.>0])
residuals_norm = residuals./sigma

# calculate something like a bin-wise p-value (in case that makes sense)
dist = Poisson.(model_counts) # each bin: poisson distributed
cdf_value_low = cdf.(dist, model_counts.-abs.(residuals))
cdf_value_up = 1 .-cdf.(dist, model_counts.+abs.(residuals))
p_value_binwise = cdf_value_low .+ cdf_value_up # significance of residuals -> ~probability that the residual (for a given bin) is as large as observed or larger
return residuals, residuals_norm, p_value_binwise, bin_centers
end
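A short diagnostic sketch (editor's addition) showing how the normalized residuals might be used, again assuming `h` and `v_ml` from a previous fit:

# editor's sketch: count bins deviating from the model by more than ~3 sigma
residuals, residuals_norm, p_value_binwise, bin_centers = get_residuals(th228_fit_functions.f_fit, h, v_ml)
n_outliers = count(abs.(residuals_norm) .> 3)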

66 changes: 44 additions & 22 deletions src/specfit.jl
@@ -8,7 +8,6 @@ th228_fit_functions = (
f_bck = (x, v) -> background_peakshape(x, v.μ, v.σ, v.step_amplitude, v.background),
f_sigWithTail = (x, v) -> signal_peakshape(x, v.μ, v.σ, v.n, v.skew_fraction) + lowEtail_peakshape(x, v.μ, v.σ, v.n, v.skew_fraction, v.skew_width)
)

"""
estimate_single_peak_stats(h::Histogram, calib_type::Symbol=:th228)

@@ -38,7 +37,6 @@ function estimate_single_peak_stats(h::Histogram,; calib_type::Symbol=:th228)
end
export estimate_single_peak_stats


function estimate_single_peak_stats_th228(h::Histogram{T}) where T<:Real
W = h.weights
E = first(h.edges)
@@ -88,16 +86,16 @@ Perform a fit of the peakshape to the data in `peakhists` using the initial valu
* `peak_fit_plots`: array of plots of the peak fits
* `return_vals`: dictionary of the fit results
"""
function fit_peaks(peakhists::Array, peakstats::StructArray, th228_lines::Array,; calib_type::Symbol=:th228, uncertainty::Bool=true, low_e_tail::Bool=true)
function fit_peaks(peakhists::Array, peakstats::StructArray, th228_lines::Array,; calib_type::Symbol=:th228, uncertainty::Bool=true, low_e_tail::Bool=true,iterative_fit::Bool=false)
if calib_type == :th228
return fit_peaks_th228(peakhists, peakstats, th228_lines,; uncertainty=uncertainty, low_e_tail=low_e_tail)
return fit_peaks_th228(peakhists, peakstats, th228_lines,; uncertainty=uncertainty, low_e_tail=low_e_tail,iterative_fit=iterative_fit)
else
error("Calibration type not supported")
end
end
export fit_peaks
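A minimal sketch of the new keyword (editor's addition; `peakhists`, `peakstats` and `th228_lines` as in the docstring above): with `iterative_fit=true`, any peak whose covariance matrix is not positive definite is refit without the low-energy tail.

# editor's sketch: enable the iterative refit for non-positive-definite covariance matrices
result, report = fit_peaks(peakhists, peakstats, th228_lines; uncertainty=true, iterative_fit=true)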

function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::Array{T},; uncertainty::Bool=true, low_e_tail::Bool=true) where T<:Any
function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::Array{T},; uncertainty::Bool=true, low_e_tail::Bool=true, iterative_fit::Bool=false) where T<:Any
# create return and result dicts
result = Dict{T, NamedTuple}()
report = Dict{T, NamedTuple}()
@@ -108,6 +106,17 @@ function fit_peaks_th228(peakhists::Array, peakstats::StructArray, th228_lines::
ps = peakstats[i]
# fit peak
result_peak, report_peak = fit_single_peak_th228(h, ps, ; uncertainty=uncertainty, low_e_tail=low_e_tail)

# check covariance matrix for being positive definite (no negative uncertainties)
if uncertainty
if iterative_fit && !isposdef(result_peak.covmat)
@warn "Covariance matrix not positive definite for peak $peak - repeat fit without low energy tail"
pval_save = result_peak.pval
result_peak, report_peak = fit_single_peak_th228(h, ps, ; uncertainty=uncertainty, low_e_tail=false)
@info "New covariance matrix is positive definite: $(isposdef(result_peak.covmat))"
@info "p-val with low-energy tail p=$(round(pval_save,digits=5)) , without low-energy tail: p=$(round((result_peak.pval),digits=5))"
end
end
# save results
result[peak] = result_peak
report[peak] = report_peak
@@ -124,7 +133,9 @@ Also, FWHM is calculated from the fitted peakshape with MC error propagation. Th
* `result`: NamedTuple of the fit results containing values and errors
* `report`: NamedTuple of the fit report which can be plotted
"""
function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background), NTuple{5, T}}; uncertainty::Bool=true, low_e_tail::Bool=true, fixed_position::Bool=false, pseudo_prior::NamedTupleDist=NamedTupleDist(empty = true)) where T<:Real
function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fwhm, :peak_sigma, :peak_counts, :mean_background), NTuple{5, T}};
uncertainty::Bool=true, low_e_tail::Bool=true, fixed_position::Bool=false, pseudo_prior::NamedTupleDist=NamedTupleDist(empty = true),
fit_fun::Symbol=:f_fit) where T<:Real
# create standard pseudo priors
standard_pseudo_prior = NamedTupleDist(
μ = ifelse(fixed_position, ConstValueDist(ps.peak_pos), Uniform(ps.peak_pos-10, ps.peak_pos+10)),
@@ -152,8 +163,8 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw
# start values for MLE
v_init = mean(pseudo_prior)

# create loglikelihood function
f_loglike = let f_fit=th228_fit_functions.f_fit, h=h
# create loglikelihood function: f_loglike(v) that can be evaluated for any set of fit parameters v
f_loglike = let f_fit=th228_fit_functions[fit_fun], h=h
v -> hist_loglike(Base.Fix2(f_fit, v), h)
end

@@ -163,37 +174,45 @@ function fit_single_peak_th228(h::Histogram, ps::NamedTuple{(:peak_pos, :peak_fw
# best fit results
v_ml = inverse(f_trafo)(Optim.minimizer(opt_r))

f_loglike_array = let f_fit=gamma_peakshape, h=h
v -> - hist_loglike(x -> f_fit(x, v...), h)
f_loglike_array = let f_fit=th228_fit_functions[fit_fun], h=h, v_keys = keys(standard_pseudo_prior) # same loglikelihood function as f_loglike, but takes an array as input instead of a NamedTuple
v -> - hist_loglike( x -> f_fit(x,NamedTuple{v_keys}(v)), h)
end

if uncertainty
# Calculate the Hessian matrix using ForwardDiff
H = ForwardDiff.hessian(f_loglike_array, tuple_to_array(v_ml))

# Calculate the parameter covariance matrix
param_covariance = inv(H)

param_covariance_raw = inv(H)
param_covariance = nearestSPD(param_covariance_raw)

# Extract the parameter uncertainties
v_ml_err = array_to_tuple(sqrt.(abs.(diag(param_covariance))), v_ml)

# calculate p-value
pval, chi2, dof = p_value(th228_fit_functions.f_fit, h, v_ml)

# calculate normalized residuals
residuals, residuals_norm, p_value_binwise, bin_centers = get_residuals(th228_fit_functions.f_fit, h, v_ml)

# get fwhm of peak
fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, v_ml_err)
fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, param_covariance)

@debug "Best Fit values"
@debug "μ: $(v_ml.μ) ± $(v_ml_err.μ)"
@debug "σ: $(v_ml.σ) ± $(v_ml_err.σ)"
@debug "n: $(v_ml.n) ± $(v_ml_err.n)"
@debug "p: $pval , chi2 = $(chi2) with $(dof) dof"
@debug "FWHM: $(fwhm) ± $(fwhm_err)"

result = merge(v_ml, (fwhm = fwhm, err = merge(v_ml_err, (fwhm = fwhm_err,))))
result = merge(v_ml, (pval = pval, chi2 = chi2, dof = dof, fwhm = fwhm,covmat = param_covariance, covmat_raw = param_covariance_raw,residuals = residuals, residuals_norm = residuals_norm, p_value_binwise= p_value_binwise,bin_centers = bin_centers,),(err = merge(v_ml_err, (fwhm = fwhm_err,)),))
report = (
v = v_ml,
h = h,
f_fit = x -> Base.Fix2(th228_fit_functions.f_fit, v_ml)(x),
f_sig = x -> Base.Fix2(th228_fit_functions.f_sig, v_ml)(x),
f_lowEtail = x -> Base.Fix2(th228_fit_functions.f_lowEtail, v_ml)(x),
f_bck = x -> Base.Fix2(th228_fit_functions.f_bck, v_ml)(x)
f_bck = x -> Base.Fix2(th228_fit_functions.f_bck, v_ml)(x),
)
else
# get fwhm of peak
@@ -240,8 +259,6 @@ function estimate_fwhm(v::NamedTuple)
return NaN
end
end


"""
get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::NamedTuple)
Get the FWHM of a peak from the fit parameters while performing an MC error propagation.
@@ -250,22 +267,27 @@ Get the FWHM of a peak from the fit parameters while performing a MC error propa
* `fwhm`: the FWHM of the peak
* `fwhm_err`: the uncertainty of the FWHM of the peak
"""
function get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::NamedTuple, uncertainty::Bool=true)
function get_peak_fwhm_th228(v_ml::NamedTuple, v_ml_err::Union{Matrix,NamedTuple},uncertainty::Bool=true)
# get fwhm for peak fit
fwhm = estimate_fwhm(v_ml)
if !uncertainty
return fwhm, NaN
end

# get MC for FWHM err
v_mc = get_mc_value_shapes(v_ml, v_ml_err, 1000)
if isa(v_ml_err,Matrix)# use correlated fit parameter uncertainties
v_mc = get_mc_value_shapes(v_ml, v_ml_err, 10000)
elseif isa(v_ml_err,NamedTuple) # use uncorrelated fit parameter uncertainties
v_mc = get_mc_value_shapes(v_ml, v_ml_err, 1000)
end
fwhm_mc = estimate_fwhm.(v_mc)
fwhm_err = std(fwhm_mc[isfinite.(fwhm_mc)])
return fwhm, fwhm_err
end

export get_peak_fwhm_th228
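A hedged sketch of the two error-propagation modes (editor's addition; `v_ml`, `v_ml_err` and `param_covariance` are the names used in `fit_single_peak_th228` above):

# editor's sketch: correlated MC error propagation using the (SPD) covariance matrix ...
fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, param_covariance)
# ... or uncorrelated propagation using the per-parameter uncertainties
fwhm, fwhm_err = get_peak_fwhm_th228(v_ml, v_ml_err)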

"""
fitCalibration
fitCalibration(peaks::Array, μ::Array)
Fit the calibration lines to a linear function.
# Returns
* `slope`: the slope of the linear fit