diff --git a/.gitignore b/.gitignore index 2f04c95d8..4bd102c4d 100644 --- a/.gitignore +++ b/.gitignore @@ -433,5 +433,6 @@ Session.vim # auto-generated tag files tags +validphys2/src/validphys/test_utils # End of https://www.gitignore.io/api/c++,latex,cmake,python,jupyternotebook,qtcreator,vim diff --git a/PBSP_logos/PBSP_black.pdf b/PBSP_logos/PBSP_black.pdf new file mode 100644 index 000000000..2b2780dc7 Binary files /dev/null and b/PBSP_logos/PBSP_black.pdf differ diff --git a/PBSP_logos/PBSP_dark.pdf b/PBSP_logos/PBSP_dark.pdf new file mode 100644 index 000000000..40ac2baf2 Binary files /dev/null and b/PBSP_logos/PBSP_dark.pdf differ diff --git a/PBSP_logos/PBSP_light.pdf b/PBSP_logos/PBSP_light.pdf new file mode 100644 index 000000000..619bd630f Binary files /dev/null and b/PBSP_logos/PBSP_light.pdf differ diff --git a/validphys2/src/validphys/commondata_new_to_old.py b/validphys2/src/validphys/commondata_new_to_old.py new file mode 100644 index 000000000..b7ba5e042 --- /dev/null +++ b/validphys2/src/validphys/commondata_new_to_old.py @@ -0,0 +1,109 @@ +""" + Commondata converter script from new to old format: + it must be run in an up to date simunet environment, in the `commondata_converter_new_to_old` branch. +""" + +import os +import sys +import yaml +from validphys.utils import uncertainty_yaml_to_systype, convert_new_data_to_old + +# test whether the runcard is passed +if len(sys.argv) != 2: + raise Exception("No runcard is passed!") +card_name = sys.argv[1] +if not os.path.isfile(card_name): + raise Exception("Runcard does not exist!") +# load runcard +with open(card_name, "rb") as stream: + runcard = yaml.safe_load(stream) +# load datasets to convert +datasets = runcard["dataset_inputs"] + +# create test directory if it does not already exist +test_dir = "test_utils" +if not os.path.isdir(test_dir): + os.mkdir(test_dir) + +# changed by the user +nnpdf_path = "/Users/teto/Software/nnpdf_git/nnpdf" +# new commondata path +new_commondata = f"{nnpdf_path}/nnpdf_data/nnpdf_data/commondata" +# open conversion dictionary +with open(f"{new_commondata}/dataset_names.yml", "rb") as stream: + conversion = yaml.safe_load(stream) + +# old format +old_format_names = list(conversion.keys()) +# new format +new_format_names = [] +for c in conversion: + try: + new_format_names.append(conversion[c]["dataset"]) + except TypeError: + new_format_names.append(conversion[c]) + +# prepare list of the datasets to be converted +conversion_ds = [] +for ds in datasets: + if ds["dataset"] in old_format_names: + d = conversion[ds["dataset"]] + d["name"] = ds["dataset"] + conversion_ds.append(d) + elif ds["dataset"] in new_format_names: + conversion_ds.append({"dataset": ds["dataset"], "variant": "legacy", "name": ds["dataset"]}) + else: + conversion_ds.append({"dataset": ds["dataset"], "variant": None, "name": ds["dataset"]}) + +# separate the dataset & the observable names +for ds in conversion_ds: + s = ds["dataset"] + ds["dataset"] = s[:s.rfind("_")] + ds["obs"] = s[s.rfind("_")+1:] + n = ds["name"] + ds["name"] = n[:n.rfind("_")] + +# convert +for i, ds in enumerate(conversion_ds): + var_int, obs_ind = "variant", "obs" + # load metadata file + path_metadata = new_commondata+"/"+ds["dataset"]+f"/metadata.yaml" + with open(path_metadata, "r") as stream: + metadata = yaml.safe_load(stream) + for o in metadata["implemented_observables"]: + if o["observable_name"] == ds[obs_ind]: + data_file_name, unc_file_name, kin_file_name = o["data_central"], o["data_uncertainties"][0], o["kinematics"]["file"] + # if 
only in the new format + if not ds[var_int]: + path_data_yaml = new_commondata+"/"+ds["dataset"]+f"/{data_file_name}" + path_unc_file = new_commondata+"/"+ds["dataset"]+f"/{unc_file_name}" + path_kin = new_commondata+"/"+ds["dataset"]+f"/{kin_file_name}" + # if also in the old format (legacy variants) + else: + if os.path.isfile(new_commondata+"/"+ds["dataset"]+f"/data_{ds[var_int]}_{ds[obs_ind]}.yaml"): + path_data_yaml = new_commondata+"/"+ds["dataset"]+f"/data_{ds[var_int]}_{ds[obs_ind]}.yaml" + else: + path_data_yaml = new_commondata+"/"+ds["dataset"]+f"/data_legacy_{ds[obs_ind]}.yaml" + path_unc_file = new_commondata+"/"+ds["dataset"]+f"/uncertainties_{ds[var_int]}_{ds[obs_ind]}.yaml" + path_kin = new_commondata+"/"+ds["dataset"]+f"/kinematics_{ds[obs_ind]}.yaml" + # write uncertainty files + + uncertainty_yaml_to_systype(path_unc_file, + name_dataset=ds["name"], + observable=ds["obs"], + path_systype=test_dir) + # write commondata files + convert_new_data_to_old(path_data_yaml, + path_unc_file, + path_kin, + path_metadata, + name_dataset=ds["name"], + observable=ds["obs"], + path_DATA=test_dir) + # output + name = ds["name"]+"_"+ds["obs"] + print(f"{i+1:>2}. {name:>40} converted!") + +# write check runcard +with open("test_utils/check_commondata_new_to_old.yaml", "w") as stream: + yaml.safe_dump(conversion_ds, stream) \ No newline at end of file diff --git a/validphys2/src/validphys/config.py b/validphys2/src/validphys/config.py index 8375b3f2e..bad0bd964 100644 --- a/validphys2/src/validphys/config.py +++ b/validphys2/src/validphys/config.py @@ -38,6 +38,7 @@ MatchedCuts, SimilarCuts, ThCovMatSpec, + PDF, ) from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas from validphys.loader import ( @@ -171,6 +172,10 @@ def parse_pdf(self, name: str): except NotImplementedError as e: raise ConfigError(str(e)) return pdf + + def parse_fakepdf(self, name: str) -> PDF: + """PDF set used to generate the fake data in a closure test.""" + return self.parse_pdf(name) def parse_load_weights_from_fit(self, name: str): """A fit in the results folder, containing at least a valid filter result.""" diff --git a/validphys2/src/validphys/convolution.py b/validphys2/src/validphys/convolution.py index b0dec81fd..0c8c7880e 100644 --- a/validphys2/src/validphys/convolution.py +++ b/validphys2/src/validphys/convolution.py @@ -119,17 +119,17 @@ def _predictions(dataset, pdf, fkfunc): # predictions instead. 
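# Note on the loop below: fixed SM predictions read from
# ``fk.fixed_predictions_path`` are indexed with the dataset cuts and then
# reshaped to mimic the output of ``fkfunc``: a single 'data' column for
# central predictions, or one (tiled and transposed) column per PDF member
# for full replica predictions, so that ``opfunc`` can combine them with the
# ordinary FK-table predictions.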
all_predictions = [] for fk in dataset.fkspecs: - if not fk.use_fixed_predictions: - all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf)) - else: - with open(fk.fixed_predictions_path, 'rb') as f: - fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed']) - # Now need to reshape it according it to the expected number of predictions - if fkfunc == central_fk_predictions: - all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data'])) - elif fkfunc == fk_predictions: - fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1)) - all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())])) + if not fk.use_fixed_predictions: + all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf)) + else: + with open(fk.fixed_predictions_path, 'rb') as f: + fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed'])[cuts] + # Now need to reshape it according it to the expected number of predictions + if fkfunc == central_fk_predictions: + all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data'])) + elif fkfunc == fk_predictions: + fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1)) + all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())])) return opfunc(*all_predictions) diff --git a/validphys2/src/validphys/core.py b/validphys2/src/validphys/core.py index 0772d948a..be329c5d9 100644 --- a/validphys2/src/validphys/core.py +++ b/validphys2/src/validphys/core.py @@ -316,6 +316,18 @@ def load(self)->CommonData: #TODO: Use better path handling in python 3.6 return CommonData.ReadFile(str(self.datafile), str(self.sysfile)) + def load_commondata(self, cuts=None): + """ + Loads a coredata.CommonData object from a core.CommonDataSetSpec object + cuts are applied if provided. + """ + # import here to avoid circular imports + from validphys.commondataparser import load_commondata + cd = load_commondata(self) + if cuts is not None: + cd = cd.with_cuts(cuts) + return cd + @property def plot_kinlabels(self): return get_plot_kinlabels(self) diff --git a/validphys2/src/validphys/coredata.py b/validphys2/src/validphys/coredata.py index 4b8c3cd3d..edd6023e8 100644 --- a/validphys2/src/validphys/coredata.py +++ b/validphys2/src/validphys/coredata.py @@ -263,6 +263,9 @@ def additive_errors(self): add_table.columns = add_systype["name"].to_numpy() return add_table.loc[:, add_table.columns != "SKIP"] + @property + def commondata_table_indices(self): + return self.commondata_table.index - 1 def systematic_errors(self, central_values=None): """Returns all systematic errors as absolute uncertainties, with a diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py index af2e7b657..1f10ab9b3 100644 --- a/validphys2/src/validphys/dataplots.py +++ b/validphys2/src/validphys/dataplots.py @@ -866,7 +866,11 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9): """ info = get_info(dataset) - table = kitable(dataset, info) + try: + table = kitable(dataset, info) + except: + log.warning(f"Problems with kitable loading {dataset.name}") + table = kitable(dataset.commondata, info) figby = sane_groupby_iter(table, info.figure_by) basis = obs_pdf_correlations.basis @@ -880,7 +884,9 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9): plotting_var = info.get_xcol(table) #TODO: vmin vmax should be global or by figure? 
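# When the plotting variable below is categorical (string-valued), the colour
# normalisation falls back to a dummy [0, 1] range and the points are coloured
# with evenly spaced values rather than with the x-column itself.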
- vmin,vmax = min(plotting_var), max(plotting_var) + vmin, vmax = min(plotting_var), max(plotting_var) + if type(vmin) == str or type(vmax) == str: + vmin, vmax = 0, 1 if info.x_scale == 'log': norm = mcolors.LogNorm(vmin, vmax) else: @@ -889,7 +895,7 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9): sm = cm.ScalarMappable(cmap=cm.viridis, norm=norm) for same_vals, fb in figby: - grid = fullgrid[ np.asarray(fb.index),...] + grid = fullgrid[np.arange(len(fb.index)), ...] #Use the maximum absolute correlation for plotting purposes @@ -906,9 +912,13 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9): h*=2.5 fig,axes = plt.subplots(nrows=nf ,sharex=True, figsize=(w,h), sharey=True) fig.suptitle(title) - colors = sm.to_rgba(info.get_xcol(fb)) + if np.vectorize(isinstance)(info.get_xcol(fb), str).any(): + temp = np.linspace(start=0, stop=1, num=len(info.get_xcol(fb))) + colors = sm.to_rgba(temp) + else: + colors = sm.to_rgba(info.get_xcol(fb)) for flindex, (ax, fl) in enumerate(zip(axes, fls)): - for i,color in enumerate(colors): + for i, color in enumerate(colors): ax.plot(x, grid[i,flindex,:].T, color=color) diff --git a/validphys2/src/validphys/make_plotting_files.py b/validphys2/src/validphys/make_plotting_files.py new file mode 100644 index 000000000..d115f9e0f --- /dev/null +++ b/validphys2/src/validphys/make_plotting_files.py @@ -0,0 +1,72 @@ +import os +import sys +import yaml +import shutil +import filecmp + +# simunet environment commondata path +old_commondata = "/Users/teto/miniconda3/envs/simunet_release/share/NNPDF/data/commondata" +# nnpdf commondata path +new_commondata = "/Users/teto/Software/nnpdf_git/nnpdf/nnpdf_data/nnpdf_data/commondata" +# test whether the runcard is passed +if len(sys.argv) != 2: + raise Exception("No runcard is passed!") +card_name = sys.argv[1] +if not os.path.isfile(card_name): + raise Exception("Runcard does not exist!") +# load runcard +with open(card_name, "rb") as stream: + card = yaml.safe_load(stream) +# load conversion dictionary +with open(new_commondata+"/dataset_names.yml", "rb") as stream: + conv = yaml.safe_load(stream) +# load datasets to convert +datasets = card["dataset_inputs"] +# temporary list +temp = [] +# back conversion map +back_conv = {} +# loop over datasets to convert +for ds in datasets: + ds_name = ds["dataset"] + if ds_name in list(conv.keys()) and "-" in ds_name: + # save the datasets to map + temp.append(conv[ds_name]) + # print(f"{ds_name} is in the old format with a new name! 
(Do it manually)") + else: + for cds in conv: + try: + flag = ds_name == conv[cds]["dataset"] + except TypeError: + flag = ds_name == conv[cds] + if flag: + back_conv[ds_name] = cds +# loop over the datasets that we still have to convert +for ds in temp: + ds_name, ds_var = ds["dataset"], ds["variant"] + back_conv[ds_name] = [] + for cds in conv: + try: + flag = (ds_name == conv[cds]["dataset"]) and (ds_var == conv[cds]["variant"] and "-" not in cds) + except TypeError: + flag = ds_name == conv[cds] + if flag: + back_conv[ds_name] = cds +# copy +for i, bc in enumerate(back_conv): + # new file name + filename_new = f"test_utils/PLOTTING_{bc}.yml" + # old file name + if os.path.isfile(old_commondata+f"/PLOTTING_{back_conv[bc]}.yml"): + filename_old = old_commondata+f"/PLOTTING_{back_conv[bc]}.yml" + elif os.path.isfile(old_commondata+f"/PLOTTING_{back_conv[bc]}.yaml"): + filename_old = old_commondata+f"/PLOTTING_{back_conv[bc]}.yaml" + else: + print(f"Missing PLOTTING file for {back_conv[bc]}!") + # copy + shutil.copy(filename_old, filename_new) + # test the copies + if filecmp.cmp(filename_old, filename_new): + print(f"{i+1:>2}. Copied plotting file {back_conv[bc]:>40} -> {bc:>40}!") + else: + print(f"{i+1:>2}. Error during copy of plotting file {back_conv[bc]:>40} -> {bc:>40}!") \ No newline at end of file diff --git a/validphys2/src/validphys/pdfplots.py b/validphys2/src/validphys/pdfplots.py index dd85b51be..4f4493760 100644 --- a/validphys2/src/validphys/pdfplots.py +++ b/validphys2/src/validphys/pdfplots.py @@ -630,7 +630,7 @@ def plot_lumi1d( if isinstance(gv, MCStats) and show_mc_errors: ax.plot(mx, errstddown / norm, linestyle="--", color=color) ax.plot(mx, errstdup / norm, linestyle="--", color=color) - label_add = r"($68%$ c.l.+$1\sigma$)" if legend_stat_labels else "" + label_add = r"($68\%$ c.l.+$1\sigma$)" if legend_stat_labels else "" outer = True else: label_add = r"($68\%$ c.l.)" if legend_stat_labels else "" diff --git a/validphys2/src/validphys/pseudodata.py b/validphys2/src/validphys/pseudodata.py index 9fd5863a9..4c682e2f7 100644 --- a/validphys2/src/validphys/pseudodata.py +++ b/validphys2/src/validphys/pseudodata.py @@ -9,8 +9,13 @@ import numpy as np import pandas as pd +import os +import yaml -from validphys.covmats import INTRA_DATASET_SYS_NAME +from validphys.covmats import INTRA_DATASET_SYS_NAME, dataset_t0_predictions + +from validphys.convolution import central_predictions +from validphys.loader import Loader from reportengine import collect @@ -18,6 +23,8 @@ log = logging.getLogger(__name__) +l = Loader() + DataTrValSpec = namedtuple('DataTrValSpec', ['pseudodata', 'tr_idx', 'val_idx']) context_index = collect("groups_index", ("fitcontext",)) @@ -235,6 +242,239 @@ def indexed_make_replica(groups_index, make_replica): return pd.DataFrame(make_replica, index=groups_index, columns=["data"]) +def level0_commondata_wc( + data, + fakepdf + ): + """ + Given a validphys.core.DataGroupSpec object, load commondata and + generate a new commondata instance with central values replaced + by fakepdf prediction + + Parameters + ---------- + + data : validphys.core.DataGroupSpec + + fakepdf: validphys.core.PDF + + Returns + ------- + list + list of validphys.coredata.CommonData instances corresponding to + all datasets within one experiment. The central value is replaced + by Level 0 fake data. 
+ + Example + ------- + >>> from validphys.api import API + >>> API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], + use_cuts="internal", + theoryid=200, + fakepdf="NNPDF40_nnlo_as_01180") + + [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] + """ + + level0_commondata_instances_wc = [] + + # import IPython; IPython.embed() + + for dataset in data.datasets: + + commondata_wc = dataset.commondata.load_commondata() + if dataset.cuts is not None: + cuts = dataset.cuts.load() + commondata_wc = commondata_wc.with_cuts(cuts=cuts) + + # == Generate a new CommonData instance with central value given by Level 0 data generated with fakepdf ==# + t0_prediction = dataset_t0_predictions(dataset=dataset, + t0set=fakepdf) + # N.B. cuts already applied to th. pred. + level0_commondata_instances_wc.append(commondata_wc.with_central_value(t0_prediction)) + + return level0_commondata_instances_wc + + +def make_level1_data( + level0_commondata_wc, + filterseed, + data_index): + """ + Given a list of Level 0 commondata instances, return the + same list with central values replaced by Level 1 data. + + Level 1 data is generated using validphys.make_replica. + The covariance matrix, from which the stochastic Level 1 + noise is sampled, is built from Level 0 commondata + instances (level0_commondata_wc). This, in particular, + means that the multiplicative systematics are generated + from the Level 0 central values. + + Note that the covariance matrix used to generate Level 2 + pseudodata is consistent with the one used at Level 1 + up to corrections of the order eta * eps, where eta and + eps are defined as shown below: + + Generate L1 data: L1 = L0 + eta, eta ~ N(0,CL0) + Generate L2 data: L2_k = L1 + eps_k, eps_k ~ N(0,CL1) + + where CL0 and CL1 means that the multiplicative entries + have been constructed from Level 0 and Level 1 central + values respectively. + + + Parameters + ---------- + + level0_commondata_wc : list + list of validphys.coredata.CommonData instances corresponding to + all datasets within one experiment. The central value is replaced + by Level 0 fake data. Cuts already applied. + + filterseed : int + random seed used for the generation of Level 1 data + + data_index : pandas.MultiIndex + + Returns + ------- + list + list of validphys.coredata.CommonData instances corresponding to + all datasets within one experiment. The central value is replaced + by Level 1 fake data. 
+ + Example + ------- + + >>> from validphys.api import API + >>> API.make_level1_data(dataset_inputs=[{"dataset": "NMC"}], + use_cuts="internal", + theoryid=200, + fakepdf="NNPDF40_nnlo_as_01180", + filterseed=0, + data_index) + [CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)] + """ + + # ================== generation of Level1 data ======================# + level1_data = make_replica(level0_commondata_wc, + filterseed, + genrep=True, + ) + + indexed_level1_data = indexed_make_replica(data_index, level1_data) + + dataset_order = {cd.setname: i for i, cd in enumerate(level0_commondata_wc)} + + # ===== create commondata instances with central values given by pseudo_data =====# + level1_commondata_dict = {c.setname: c for c in level0_commondata_wc} + level1_commondata_instances_wc = [] + + for xx, grp in indexed_level1_data.groupby('dataset'): + level1_commondata_instances_wc.append( + level1_commondata_dict[xx].with_central_value(grp.values) + ) + # sort back so as to mantain same order as in level0_commondata_wc + level1_commondata_instances_wc.sort(key=lambda x: dataset_order[x.setname]) + + return level1_commondata_instances_wc + + +def make_level1_list_data( + level0_commondata_wc, + filterseed, + n_samples, + data_index, +): + """ + Given a list of validphys.coredata.CommonData instances with central + values replaced with `fakepdf` predictions with cuts applied + generate a list of level 1 data from such instances + + Parameters + ---------- + + level0_commondata:_wc: list of validphys.coredata.CommonData instances + where the central value is replaced by level 0 + `fakepdf` predictions + + filterseed: int starting seed used to make different replicas + + n_samples: int number of replicas + + data_index: pandas.MultiIndex providing information on the experiment, + the dataset, and the cut index + + Returns + ------- + list + list of lists of validphys.coredata.CommonData instances corresponding + to all datasets within one experiment. The central value is replaced + by Level 1 fake data. 
+ + Example + ------- + >>> from validphys.api import API + >>> from validphys.loader import Loader + >>> from validphys.results import data_index + >>> l = Loader() + >>> dataset = l.check_dataset(name="NMC", theoryid=200) + >>> experiment = l.check_experiment(name="data", datasets=[dataset]) + >>> lv0_cd_wc = API.level0_commondata_wc(dataset_inputs=[{"dataset":"NMC"}], + use_cuts="internal", + theoryid=200, + fakepdf="NNPDF40_nnlo_as_01180" + ) + >>> API.make_level1_list_data(level0_commondata_wc=lv0_cd_wc, + filterseed=0, + n_samples=1, + data_index=data_index(experiment) + ) + + [[CommonData(setname='NMC', ndata=204, commondataproc='DIS_NCE', nkin=3, nsys=16)]] + """ + samples = [make_level1_data(level0_commondata_wc=level0_commondata_wc, + filterseed=filterseed+i, + data_index=data_index) for i in range(n_samples)] + + return samples + + +def sm_predictions( + dataset_inputs, + pdf, + theoryid + ): + + """ + Parameters + ---------- + dataset_inputs: NSList of core.DataSetInput objects + + pdf: core.PDF object + + theoryid: TheoryIDSpec + + Returns + ------- + + dict + dictionary of standard model predictions for the + given dataset_input, pdf, and theory + + """ + + sm_dict = {} + + for dataset in dataset_inputs: + data = l.check_dataset(dataset.name, cfac=dataset.cfac, theoryid=theoryid) + + sm_dict[dataset.name] = central_predictions(data, pdf) + + return sm_dict + + _group_recreate_pseudodata = collect('indexed_make_replica', ('group_dataset_inputs_by_experiment',)) _recreate_fit_pseudodata = collect('_group_recreate_pseudodata', ('fitreplicas', 'fitenvironment')) _recreate_pdf_pseudodata = collect('_group_recreate_pseudodata', ('pdfreplicas', 'fitenvironment')) diff --git a/validphys2/src/validphys/results.py b/validphys2/src/validphys/results.py index 950a11af1..48a9f5222 100644 --- a/validphys2/src/validphys/results.py +++ b/validphys2/src/validphys/results.py @@ -37,6 +37,8 @@ predictions, PredictionsRequireCutsError, ) +from validphys.plotoptions.core import get_info + from validphys.n3fit_data_utils import parse_simu_parameters_names_CF @@ -169,6 +171,55 @@ def from_convolution(cls, pdf, posset): procs_data = collect("data", ("group_dataset_inputs_by_process",)) +def data_index(data): + + """ + Parameters + ---------- + + data: core.DataGroupSpec + + Returns + ------- + + pandas.MultiIndex + + Example + ------- + + >>> from validphys.loader import Loader + >>> from validphys.results import data_index + >>> l = Loader() + >>> dataset = l.check_dataset(name="NMC", + theoryid=200 + ) + >>> experiment = l.check_experiment(name="data", + datasets=[dataset] + ) + >>> data_index(experiment) + + MultiIndex([('NMC', 'NMC', 16), + ('NMC', 'NMC', 21), + ('NMC', 'NMC', 22), + ... + ('NMC', 'NMC', 289), + ('NMC', 'NMC', 290), + ('NMC', 'NMC', 291)], + names=['experiment', 'dataset', 'id'], length=204) + + """ + + tuples = [] + + for dataset in data.datasets: + exp = get_info(dataset).experiment + for i in dataset.cuts.load(): + tp = (exp, dataset.name, i) + tuples.append(tp) + + return pd.MultiIndex.from_tuples(tuples, names=('experiment', 'dataset', 'id')) + + def groups_index(groups_data): """Return a pandas.MultiIndex with levels for group, dataset and point respectively, the group is determined by a key in the dataset metadata, and @@ -541,7 +592,7 @@ def dataset_inputs_results( # ``results`` to support this. # TODO: The above comment doesn't make sense after adding T0. 
Deprecate this def pdf_results( - dataset: (DataSetSpec, DataGroupSpec), + dataset: (DataSetSpec, DataGroupSpec), # type: ignore pdfs: Sequence, covariance_matrix, sqrt_covmat, @@ -557,12 +608,12 @@ def pdf_results( @require_one("pdfs", "pdf") @remove_outer("pdfs", "pdf") def one_or_more_results( - dataset: (DataSetSpec, DataGroupSpec), + dataset: (DataSetSpec, DataGroupSpec), # type: ignore covariance_matrix, sqrt_covmat, dataset_bsm_factor, - pdfs: (type(None), Sequence) = None, - pdf: (type(None), PDF) = None, + pdfs: (type(None), Sequence) = None, # type: ignore + pdf: (type(None), PDF) = None, # type: ignore ): """Generate a list of results, where the first element is the data values, and the next is either the prediction for pdf or for each of the pdfs. diff --git a/validphys2/src/validphys/simunet_analysis.py b/validphys2/src/validphys/simunet_analysis.py index 2e7d054e1..cafc97ba7 100644 --- a/validphys2/src/validphys/simunet_analysis.py +++ b/validphys2/src/validphys/simunet_analysis.py @@ -20,6 +20,8 @@ import pandas as pd import seaborn as sns import itertools +import yaml +import os from reportengine.figure import figure, figuregen from reportengine.checks import make_check, CheckError, make_argcheck, check @@ -715,9 +717,9 @@ def plot_bsm_pdf_corr( Q, bsm_names_to_latex, mark_threshold: float = 0.9, - ymin: (float, type(None)) = None, - ymax: (float, type(None)) = None, - dashed_line_flavours: (list, type(None)) = None, + ymin: (float, type(None)) = None, # type: ignore + ymax: (float, type(None)) = None, # type: ignore + dashed_line_flavours: (list, type(None)) = None, # type: ignore ): """ Plot the correlation between BSM factors and a PDF. @@ -1635,7 +1637,7 @@ def dataset_scaled_fit_cfactor(dataset, pdf, read_pdf_cfactors, quad_cfacs): res: np.arrays An ``ndat`` x ``nrep`` array containing the scaled fit cfactors. 
""" - parsed_cfacs = parse_fit_cfac(dataset.fit_cfac, dataset.cuts) + parsed_cfacs = parse_fit_cfac(dataset.fit_cfac, dataset.cuts) # type: ignore if parsed_cfacs is None or not read_pdf_cfactors.values.size: # We want an array of ones that ndata x nrep # where ndata is the number of post cut datapoints @@ -1649,7 +1651,7 @@ def dataset_scaled_fit_cfactor(dataset, pdf, read_pdf_cfactors, quad_cfacs): scaled_replicas = read_pdf_cfactors.values * fit_cfac_df.values[:, np.newaxis] if quad_cfacs: log.debug("Scaling results using quadratic cfactors") - parsed_quads = parse_quad_cfacs(dataset.fit_cfac, dataset.cuts, quad_cfacs) + parsed_quads = parse_quad_cfacs(dataset.fit_cfac, dataset.cuts, quad_cfacs) # type: ignore quad_cfac_df = pd.DataFrame( {k: v.central_value.squeeze() for k, v in parsed_quads.items()} ) @@ -1969,4 +1971,168 @@ def principal_component_vectors(fisher_information_matrix, simu_parameters_names fisher = fisher - fisher.mean(axis=0) _, _, vectors = np.linalg.svd(fisher) vectors = pd.DataFrame(vectors, columns=simu_parameters_names) - return vectors \ No newline at end of file + return vectors + + +def load_datasets_contamination( + contamination_parameters, + theoryid, + dataset_inputs + ): + + """ + Parameters + ---------- + + contamination_parameters: dict with + + theoryid: TheoryIDSpec + + dataset_inputs: NSList of DataSetInput objects + + Returns + ------- + + dict + dictionary of BSM k-factors to apply on certain datasets + + """ + + cont_path = l.datapath / f"theory_{theoryid.id}" / "simu_factors" + + cont_name = contamination_parameters["name"] + cont_value = contamination_parameters["value"] + cont_lin_comb = contamination_parameters["linear_combination"] + + bsm_dict = {} + + for dataset in dataset_inputs: + + bsmfile = cont_path / f"SIMU_{dataset.name}.yaml" + + cont_order = dataset.contamination + + if cont_order == None: + log.warning( + f"{dataset.name} is not contaminated. Is it right?" + ) + bsm_dict[dataset.name] = np.array([1.]) + elif not os.path.exists(bsmfile): + log.error( + f"Could not find a BSM-factor for {dataset.name}. Are you sure they exist in the given theory?" + ) + bsm_dict[dataset.name] = np.array([1.]) + else: + log.info( + f"Loading {dataset.name}" + ) + with open(bsmfile, "r+") as stream: + simu_card = yaml.safe_load(stream) + stream.close() + + k_factors = np.zeros(len(simu_card["SM_fixed"])) + for op in cont_lin_comb: + k_factors += cont_lin_comb[op] * np.array(simu_card[cont_order][op]) + k_factors = 1. 
+ k_factors * cont_value / np.array(simu_card[cont_order]["SM"]) + + bsm_dict[dataset.name] = k_factors + + return bsm_dict + + +def compute_datasets_chi2_dist( + make_level1_list_data, + sm_predictions, + groups_covmat, + load_datasets_contamination + ): + + """ + Parameters + ---------- + + make_level1_list_data + + sm_predictions + + groups_covmat + + load_contamination + + Returns + ------- + + dict + dictionary of lists of chi2 per dataset + + """ + + covmat = groups_covmat + samples = make_level1_list_data + bsm_factors = load_datasets_contamination + + chi2_dict = {dataset.setname: [] for dataset in samples[0]} + + for sample in samples: + + for dataset in sample: + data_name = dataset.setname + bsm_fac = bsm_factors[data_name] + + if bsm_fac.shape[0] == 1: + data_values = dataset.central_values * bsm_fac + else: + indices = dataset.commondata_table_indices + data_values = dataset.central_values * bsm_fac[indices] + + num_data = dataset.ndata + + covmat_dataset = ( + covmat.xs(data_name, level=1, drop_level=False) + .T.xs(data_name, level=1, drop_level=False) + .values + ) + + theory = sm_predictions[data_name].values.squeeze() + + diff = (data_values - theory).squeeze() + + if diff.size == 1: + chi2 = diff**2 / covmat_dataset[0,0] / num_data + else: + chi2 = (diff.T @ np.linalg.inv(covmat_dataset) @ diff) / num_data + + chi2_dict[data_name].append(chi2) + + return chi2_dict + + +def write_datasets_chi2_dist_csv( + pdf, + compute_datasets_chi2_dist, + level0_commondata_wc + ): + + """ + Parameters + ---------- + + pdf: core.PDF + + compute_chi2 + + level0_commondata_wc + + Returns + ------- + + """ + + ndata = {dataset.setname: [dataset.ndata] for dataset in level0_commondata_wc} + + df_ndat = pd.DataFrame(ndata) + df_chi2 = pd.DataFrame(compute_datasets_chi2_dist) + + chi2 = pd.concat([df_ndat, df_chi2], ignore_index=True) + + chi2.to_csv(f"{pdf}_chi2_dist.csv", index=False) diff --git a/validphys2/src/validphys/test_commondata_new_to_old.py b/validphys2/src/validphys/test_commondata_new_to_old.py new file mode 100644 index 000000000..292b073a5 --- /dev/null +++ b/validphys2/src/validphys/test_commondata_new_to_old.py @@ -0,0 +1,55 @@ +""" + Test the commondata converter from new to old format: + it must be run in an up to date nnpdf environment. 
+""" + +import yaml +from numpy import allclose +from validphys.commondataparser import parse_set_metadata, load_commondata_new, load_commondata_old +from validphys.covmats import covmat_from_systematics + +# nnpdf path +nnpdf_path = "/Users/teto/Software/nnpdf_git/nnpdf" +# open the yaml file created by commondata_new_to_old script +with open("test_utils/check_commondata_new_to_old.yaml", "rb") as stream: + datasets = yaml.safe_load(stream) +# silly dictionary to output if the feature is sound or not +ok = {1: "OK :D", 0: "NOT OK :C"} +# fake dataset input for covmat_from_systematics +inp = None +# list to store the implementation errors, useful for IPython debug +cd_errors, cm_errors = [], [] +# loop over the selected datasets +for i, ds in enumerate(datasets): + # dataset name, observable name, and dataset variant + setname, observable, variant = ds["dataset"], ds["obs"], ds["variant"] + # old commondata + cd_old = load_commondata_old(commondatafile=f"test_utils/DATA_{setname}_{observable}.dat", + systypefile=f"test_utils/SYSTYPE_{setname}_{observable}_DEFAULT.dat", + setname=setname) + # load metadata of the new commondata + metadata = parse_set_metadata(metadata_file=f"{nnpdf_path}/nnpdf_data/nnpdf_data/commondata/{setname}/metadata.yaml") + # new commondata + if variant: + cd_new = load_commondata_new(metadata=metadata.select_observable(observable).apply_variant(variant)) + else: + cd_new = load_commondata_new(metadata=metadata.select_observable(observable)) + # load covariance matrices + covmat_old = covmat_from_systematics(loaded_commondata_with_cuts=cd_old, + dataset_input=inp, + use_weights_in_covmat=False) + covmat_new = covmat_from_systematics(loaded_commondata_with_cuts=cd_new, + dataset_input=inp, + use_weights_in_covmat=False) + # test central values + ds["commondata"] = allclose(cd_old.central_values, cd_new.central_values) + if not ds["commondata"]: + cd_errors.append({"old": cd_old, "new": cd_new}) + # test covariance matrix + ds["covmat"] = allclose(covmat_old, covmat_new) + if not ds["covmat"]: + cm_errors.append({"old": covmat_old, "new": covmat_new}) + # output + cd, cm = ds["commondata"], ds["covmat"] + name = f"{setname}_{observable}" + print(f"{i+1:2}. {name:>40} -> commondata is {ok[cd]:>9} & covariance matrix is {ok[cm]:>9}") \ No newline at end of file diff --git a/validphys2/src/validphys/utils.py b/validphys2/src/validphys/utils.py index 0c2956daa..3b99b5af4 100644 --- a/validphys2/src/validphys/utils.py +++ b/validphys2/src/validphys/utils.py @@ -12,6 +12,7 @@ import numpy as np from validobj import parse_input, ValidationError +from reportengine.compat import yaml def parse_yaml_inp(inp, spec, path): @@ -219,3 +220,158 @@ def scale_from_grid(grid): Returns ``'linear'`` if the scale of the grid object is linear, and otherwise ``' log'``.""" return 'linear' if grid.scale == 'linear' else 'log' + + +def uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, observable, path_systype=None, write_to_file=True): + """ + Convert the new style uncertainty yaml file to the old style systype. 
+ Writes + + Parameters + ---------- + path_uncertainty_yaml : str, or Path + Path to the new style uncertainty yaml file to be converted + + path_systype : str, or Path, optional + path to the output systype file + + Returns + ------- + n_sys : int + Number of systematics in the systype file + """ + # open the uncertainty yaml file + with open(path_uncertainty_yaml) as f: + uncertainty = yaml.safe_load(f) + + # get uncertainty definitions + uncertainty_definitions = uncertainty['definitions'] + + # check whether path_systype is provided else save it in the same directory in which the uncertainty yaml file is + if path_systype is None: + if isinstance(path_uncertainty_yaml, str): + path_uncertainty_yaml = pathlib.Path(path_uncertainty_yaml) + path_systype = path_uncertainty_yaml.parent / f"SYSTYPE_{name_dataset}_{observable}_DEFAULT.dat" + else: + path_systype = pathlib.Path(path_systype) / f"SYSTYPE_{name_dataset}_{observable}_DEFAULT.dat" + + # get number of sys (note: stat is not included in the sys) + if 'stat' in uncertainty_definitions.keys(): + n_sys = len(uncertainty_definitions.keys()) - 1 + else: + n_sys = len(uncertainty_definitions.keys()) + + if write_to_file: + # open the systype file for writing + with open(path_systype, 'w') as stream: + + # header: number of sys + stream.write(f"{n_sys}\n") + + # write the systype treatments + + # remove stat from the uncertainty definitions + uncertainty_definitions.pop('stat', None) + + for i, (_, sys_dict) in enumerate(uncertainty_definitions.items()): + # four spaces seems to be the standard format (has to be checked for other datasets than CMS_1JET_8TEV) + stream.write(f"{i+1} {sys_dict['treatment']} {sys_dict['type']}\n") + + return n_sys + + +def convert_new_data_to_old(path_data_yaml, path_uncertainty_yaml, path_kinematics, path_metadata, name_dataset, observable, path_DATA=None): + """ + Convert the new data format into the old data format + """ + + # open the metadata yaml file + with open(path_metadata) as f: + metadata = yaml.safe_load(f) + + # open the data yaml file + with open(path_data_yaml) as f: + data = yaml.safe_load(f) + + # open the uncertainty yaml file + with open(path_uncertainty_yaml) as f: + uncertainty = yaml.safe_load(f) + + # open the kinematics yaml file + with open(path_kinematics) as f: + kinematics = yaml.safe_load(f) + + # get uncertainty definitions and values + uncertainty_definitions = uncertainty['definitions'] + uncertainty_values = uncertainty['bins'] + n_sys = uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, observable, write_to_file=False) + stats = [] + for entr in uncertainty_values: + try: stats.append(entr["stat"]) + except KeyError: stats.append(0.) 
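# Bins that do not declare an explicit 'stat' entry contribute zero statistical
# uncertainty; every other uncertainty is written out further below both as an
# absolute (additive) and as a percentage (multiplicative) systematic.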
+ stats = np.array(stats) + + # get data values + data_values = data['data_central'] + + # check whether path_DATA is provided else save it in the same directory in which the uncertainty yaml file is + if path_DATA is None: + if isinstance(path_uncertainty_yaml, str): + path_uncertainty_yaml = pathlib.Path(path_uncertainty_yaml) + path_DATA = path_uncertainty_yaml.parent / f"DATA_{name_dataset}_{observable}.dat" + else: + path_DATA = pathlib.Path(path_DATA) / f"DATA_{name_dataset}_{observable}.dat" + + kin_names = list(kinematics['bins'][0].keys()) + kin_values = kinematics['bins'] + # open the DATA file for writing + with open(path_DATA, 'w') as stream: + + # write the header: Dataset name, number of sys errors, and number of data points, whitespace separated + stream.write(f"{name_dataset}_{observable} {n_sys} {len(data_values)}\n") + + for i, data_value in enumerate(data_values): + cd_line = f"{i+1:6}\t{metadata['implemented_observables'][0]['process_type']:6}\t" + + for index in [2, 1, 0]: + if kin_values[i][kin_names[index]]['mid'] == None: + kin_values[i][kin_names[index]]['mid'] = (kin_values[i][kin_names[index]]['min'] + kin_values[i][kin_names[index]]['max']) / 2 + if kin_names[index] == "pT": + cd_line += f"{kin_values[i][kin_names[index]]['mid']**2:20.12e}\t" + else: + cd_line += f"{kin_values[i][kin_names[index]]['mid']:20.12e}\t" + + cd_line += f"\t{data_value:20.12e}\t{stats[i]:20.12e}\t" + + # for j, sys in enumerate(uncertainty_values): + sys = uncertainty_values[i] + for j, (sys_name, sys_val) in enumerate(sys.items()): + if sys_name == 'stat': + continue + + add_sys = sys_val + if data_value != 0.0: + mult_sys = add_sys * 100.0 / data_value + else: + mult_sys = 0.0 + + if j == len(sys)-1: + cd_line += f"{add_sys:20.12e}\t {mult_sys:20.12e}\n" + else: + cd_line += f"{add_sys:20.12e}\t {mult_sys:20.12e}\t" + + stream.write(cd_line) + + + +if __name__ == '__main__': + new_commondata = "/Users/teto/Software/nnpdf_git/nnpdf/nnpdf_data/nnpdf_data/new_commondata" + test_dir = "/Users/teto/Software/simunet_git/SIMUnet/validphys2/src/validphys/test_utils" + name_dataset = "ATLAS_1JET_13TEV_DIF" + observable = "PT-Y" + path_unc_file = new_commondata+"/"+name_dataset+"/uncertainties.yaml" + path_data_yaml = new_commondata+"/"+name_dataset+"/data.yaml" + path_kin = new_commondata+"/"+name_dataset+"/kinematics.yaml" + path_metadata = new_commondata+"/"+name_dataset+"/metadata.yaml" + uncertainty_yaml_to_systype(path_unc_file, name_dataset=name_dataset, observable=observable, path_systype=test_dir) + convert_new_data_to_old(path_data_yaml, path_unc_file, path_kin, path_metadata, name_dataset=name_dataset, observable=observable, path_DATA=test_dir) \ No newline at end of file
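The three new scripts added by this diff are meant to be run in sequence: the converter writes old-format DATA and SYSTYPE files plus a check runcard into test_utils/, the plotting helper copies the matching PLOTTING files, and the test script compares central values and covariance matrices between the two formats. The sketch below only illustrates that order; the runcard name and the dataset list are placeholders, the hard-coded commondata paths inside the scripts still have to be edited by hand, and, as noted in their docstrings, the conversion is run in a simunet environment while the check is run in an nnpdf environment, so in practice the steps are not launched from a single interpreter.

# Illustrative driver sketch; "convert.yaml" and the "NMC" entry are
# placeholders, and the current directory is assumed to be
# validphys2/src/validphys so that the scripts and test_utils/ resolve.
import subprocess
import yaml

runcard = {"dataset_inputs": [{"dataset": "NMC"}]}
with open("convert.yaml", "w") as stream:
    yaml.safe_dump(runcard, stream)

# 1. convert the selected datasets to the old format (simunet environment);
#    writes DATA_*/SYSTYPE_* files and check_commondata_new_to_old.yaml into test_utils/
subprocess.run(["python", "commondata_new_to_old.py", "convert.yaml"], check=True)

# 2. copy the corresponding PLOTTING files from the old commondata tree into test_utils/
subprocess.run(["python", "make_plotting_files.py", "convert.yaml"], check=True)

# 3. compare central values and covariance matrices, old vs new (nnpdf environment)
subprocess.run(["python", "test_commondata_new_to_old.py"], check=True)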