diff --git a/.gitignore b/.gitignore
index 2f04c95d8..4bd102c4d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -433,5 +433,6 @@ Session.vim
 
 # auto-generated tag files
 tags
+validphys2/src/validphys/test_utils
 
 # End of https://www.gitignore.io/api/c++,latex,cmake,python,jupyternotebook,qtcreator,vim
diff --git a/validphys2/src/validphys/commondata_new_to_old.py b/validphys2/src/validphys/commondata_new_to_old.py
new file mode 100644
index 000000000..cf606a5c0
--- /dev/null
+++ b/validphys2/src/validphys/commondata_new_to_old.py
@@ -0,0 +1,108 @@
+"""
+    Commondata converter script from new to old format:
+    it must be run in an up-to-date SIMUnet environment, on the `commondata_converter_new_to_old` branch.
+"""
+
+import os
+import sys
+import yaml
+from validphys.utils import uncertainty_yaml_to_systype, convert_new_data_to_old
+
+# test whether the runcard is passed
+if len(sys.argv) != 2:
+    raise Exception("No runcard is passed!")
+card_name = sys.argv[1]
+if not os.path.isfile(card_name):
+    raise Exception("Runcard does not exist!")
+# load runcard
+with open(card_name, "rb") as stream:
+    runcard = yaml.safe_load(stream)
+# load datasets to convert
+datasets = runcard["dataset_inputs"]
+
+# create test directory if it does not already exist
+test_dir = "test_utils"
+if not os.path.isdir(test_dir):
+    os.mkdir(test_dir)
+
+# path to be changed by the user
+nnpdf_path = "/home/ubunteto/Software/nnpdf"
+# new commondata path
+new_commondata = f"{nnpdf_path}/nnpdf_data/nnpdf_data/commondata"
+# open conversion dictionary
+with open(f"{new_commondata}/dataset_names.yml", "rb") as stream:
+    conversion = yaml.safe_load(stream)
+
+# old format
+old_format_names = list(conversion.keys())
+# new format
+new_format_names = []
+for c in conversion:
+    try:
+        new_format_names.append(conversion[c]["dataset"])
+    except TypeError:
+        new_format_names.append(conversion[c])
+
+# prepare list of the datasets to be converted
+conversion_ds = []
+for ds in datasets:
+    if ds["dataset"] in old_format_names:
+        d = conversion[ds["dataset"]]
+        d["name"] = ds["dataset"]
+        conversion_ds.append(d)
+    elif ds["dataset"] in new_format_names:
+        conversion_ds.append({"dataset": ds["dataset"], "variant": "legacy", "name": ds["dataset"]})
+    else:
+        conversion_ds.append({"dataset": ds["dataset"], "variant": None, "name": ds["dataset"]})
+
+# separate the dataset & the observable names
+for ds in conversion_ds:
+    s = ds["dataset"]
+    ds["dataset"] = s[:s.rfind("_")]
+    ds["obs"] = s[s.rfind("_")+1:]
+    n = ds["name"]
+    ds["name"] = n[:n.rfind("_")]
+
+# convert
+for i, ds in enumerate(conversion_ds):
+    var_ind, obs_ind = "variant", "obs"
+    # load metadata file
+    path_metadata = new_commondata+"/"+ds["dataset"]+f"/metadata.yaml"
+    with open(path_metadata, "r") as stream:
+        metadata = yaml.safe_load(stream)
+    for o in metadata["implemented_observables"]:
+        if o["observable_name"] == ds[obs_ind]:
+            data_file_name, unc_file_names, kin_file_name = o["data_central"], o["data_uncertainties"], o["kinematics"]["file"]
+    # if only in the new format
+    if not ds[var_ind]:
+        path_data_yaml = new_commondata+"/"+ds["dataset"]+f"/{data_file_name}"
+        path_kin = new_commondata+"/"+ds["dataset"]+f"/{kin_file_name}"
+        path_unc_files = [new_commondata+"/"+ds["dataset"]+f"/{unc_file_name}" for unc_file_name in unc_file_names]
+    # if also in the old format (legacy variants)
+    else:
+        if os.path.isfile(new_commondata+"/"+ds["dataset"]+f"/data_{ds[var_ind]}_{ds[obs_ind]}.yaml"):
+            path_data_yaml = new_commondata+"/"+ds["dataset"]+f"/data_{ds[var_ind]}_{ds[obs_ind]}.yaml"
new_commondata+"/"+ds["dataset"]+f"/data_{ds[var_ind]}_{ds[obs_ind]}.yaml" + else: + path_data_yaml = new_commondata+"/"+ds["dataset"]+f"/data_legacy_{ds[obs_ind]}.yaml" + path_unc_file = new_commondata+"/"+ds["dataset"]+f"/uncertainties_{ds[var_ind]}_{ds[obs_ind]}.yaml" + path_kin = new_commondata+"/"+ds["dataset"]+f"/kinematics_{ds[obs_ind]}.yaml" + # write uncertainty files + uncertainty_yaml_to_systype(path_unc_files, + name_dataset=ds["name"], + observable=ds["obs"], + path_systype=test_dir) + # write commondata files + convert_new_data_to_old(path_data_yaml, + path_unc_files, + path_kin, + path_metadata, + name_dataset=ds["name"], + observable=ds["obs"], + path_DATA=test_dir) + # output + name = ds["name"]+"_"+ds["obs"] + print(f"{i+1:>2}. {name:>40} converted!") + +# write check runcard +with open("test_utils/check_commondata_new_to_old.yaml", "w") as stream: + yaml.safe_dump(conversion_ds, stream) \ No newline at end of file diff --git a/validphys2/src/validphys/convolution.py b/validphys2/src/validphys/convolution.py index b0dec81fd..0c8c7880e 100644 --- a/validphys2/src/validphys/convolution.py +++ b/validphys2/src/validphys/convolution.py @@ -119,17 +119,17 @@ def _predictions(dataset, pdf, fkfunc): # predictions instead. all_predictions = [] for fk in dataset.fkspecs: - if not fk.use_fixed_predictions: - all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf)) - else: - with open(fk.fixed_predictions_path, 'rb') as f: - fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed']) - # Now need to reshape it according it to the expected number of predictions - if fkfunc == central_fk_predictions: - all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data'])) - elif fkfunc == fk_predictions: - fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1)) - all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())])) + if not fk.use_fixed_predictions: + all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf)) + else: + with open(fk.fixed_predictions_path, 'rb') as f: + fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed'])[cuts] + # Now need to reshape it according it to the expected number of predictions + if fkfunc == central_fk_predictions: + all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data'])) + elif fkfunc == fk_predictions: + fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1)) + all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())])) return opfunc(*all_predictions) diff --git a/validphys2/src/validphys/dataplots.py b/validphys2/src/validphys/dataplots.py index af2e7b657..1f10ab9b3 100644 --- a/validphys2/src/validphys/dataplots.py +++ b/validphys2/src/validphys/dataplots.py @@ -866,7 +866,11 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9): """ info = get_info(dataset) - table = kitable(dataset, info) + try: + table = kitable(dataset, info) + except: + log.warning(f"Problems with kitable loading {dataset.name}") + table = kitable(dataset.commondata, info) figby = sane_groupby_iter(table, info.figure_by) basis = obs_pdf_correlations.basis @@ -880,7 +884,9 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9): plotting_var = info.get_xcol(table) #TODO: vmin vmax should be global or by figure? 
-    vmin,vmax = min(plotting_var), max(plotting_var)
+    vmin, vmax = min(plotting_var), max(plotting_var)
+    if type(vmin) == str or type(vmax) == str:
+        vmin, vmax = 0, 1
     if info.x_scale == 'log':
         norm = mcolors.LogNorm(vmin, vmax)
     else:
@@ -889,7 +895,7 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
     sm = cm.ScalarMappable(cmap=cm.viridis, norm=norm)
 
     for same_vals, fb in figby:
-        grid = fullgrid[ np.asarray(fb.index),...]
+        grid = fullgrid[np.arange(len(fb.index)), ...]
 
         #Use the maximum absolute correlation for plotting purposes
 
@@ -906,9 +912,13 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
             h*=2.5
         fig,axes = plt.subplots(nrows=nf ,sharex=True, figsize=(w,h), sharey=True)
         fig.suptitle(title)
-        colors = sm.to_rgba(info.get_xcol(fb))
+        if np.vectorize(isinstance)(info.get_xcol(fb), str).any():
+            temp = np.linspace(start=0, stop=1, num=len(info.get_xcol(fb)))
+            colors = sm.to_rgba(temp)
+        else:
+            colors = sm.to_rgba(info.get_xcol(fb))
         for flindex, (ax, fl) in enumerate(zip(axes, fls)):
-            for i,color in enumerate(colors):
+            for i, color in enumerate(colors):
                 ax.plot(x, grid[i,flindex,:].T, color=color)
 
diff --git a/validphys2/src/validphys/make_plotting_files.py b/validphys2/src/validphys/make_plotting_files.py
new file mode 100644
index 000000000..d115f9e0f
--- /dev/null
+++ b/validphys2/src/validphys/make_plotting_files.py
@@ -0,0 +1,73 @@
+import os
+import sys
+import yaml
+import shutil
+import filecmp
+
+# simunet environment commondata path
+old_commondata = "/Users/teto/miniconda3/envs/simunet_release/share/NNPDF/data/commondata"
+# nnpdf commondata path
+new_commondata = "/Users/teto/Software/nnpdf_git/nnpdf/nnpdf_data/nnpdf_data/commondata"
+# test whether the runcard is passed
+if len(sys.argv) != 2:
+    raise Exception("No runcard is passed!")
+card_name = sys.argv[1]
+if not os.path.isfile(card_name):
+    raise Exception("Runcard does not exist!")
+# load runcard
+with open(card_name, "rb") as stream:
+    card = yaml.safe_load(stream)
+# load conversion dictionary
+with open(new_commondata+"/dataset_names.yml", "rb") as stream:
+    conv = yaml.safe_load(stream)
+# load datasets to convert
+datasets = card["dataset_inputs"]
+# temporary list
+temp = []
+# back conversion map
+back_conv = {}
+# loop over datasets to convert
+for ds in datasets:
+    ds_name = ds["dataset"]
+    if ds_name in list(conv.keys()) and "-" in ds_name:
+        # save the datasets to map
+        temp.append(conv[ds_name])
+        # print(f"{ds_name} is in the old format with a new name! (Do it manually)")
(Do it manually)") + else: + for cds in conv: + try: + flag = ds_name == conv[cds]["dataset"] + except TypeError: + flag = ds_name == conv[cds] + if flag: + back_conv[ds_name] = cds +# loop over the datasets that we still have to convert +for ds in temp: + ds_name, ds_var = ds["dataset"], ds["variant"] + back_conv[ds_name] = [] + for cds in conv: + try: + flag = (ds_name == conv[cds]["dataset"]) and (ds_var == conv[cds]["variant"] and "-" not in cds) + except TypeError: + flag = ds_name == conv[cds] + if flag: + back_conv[ds_name] = cds +# copy +for i, bc in enumerate(back_conv): + # new file name + filename_new = f"test_utils/PLOTTING_{bc}.yml" + # old file name + if os.path.isfile(old_commondata+f"/PLOTTING_{back_conv[bc]}.yml"): + filename_old = old_commondata+f"/PLOTTING_{back_conv[bc]}.yml" + elif os.path.isfile(old_commondata+f"/PLOTTING_{back_conv[bc]}.yaml"): + filename_old = old_commondata+f"/PLOTTING_{back_conv[bc]}.yaml" + else: + print(f"Missing PLOTTING file for {back_conv[bc]}!") + # copy + shutil.copy(filename_old, filename_new) + # test the copies + if filecmp.cmp(filename_old, filename_new): + print(f"{i+1:>2}. Copied plotting file {back_conv[bc]:>40} -> {bc:>40}!") + else: + print(f"{i+1:>2}. Error during copy of plotting file {back_conv[bc]:>40} -> {bc:>40}!") \ No newline at end of file diff --git a/validphys2/src/validphys/test_commondata_new_to_old.py b/validphys2/src/validphys/test_commondata_new_to_old.py new file mode 100644 index 000000000..e33735d54 --- /dev/null +++ b/validphys2/src/validphys/test_commondata_new_to_old.py @@ -0,0 +1,58 @@ +""" + Test the commondata converter from new to old format: + it must be run in an up to date nnpdf environment. +""" + +import yaml +from numpy import allclose +from validphys.commondataparser import parse_set_metadata, load_commondata_new, load_commondata_old +from validphys.covmats import covmat_from_systematics +from matplotlib.pyplot import matshow, show + +# nnpdf path +nnpdf_path = "/home/ubunteto/Software/nnpdf" +# open the yaml file created by commondata_new_to_old script +with open("test_utils/check_commondata_new_to_old.yaml", "rb") as stream: + datasets = yaml.safe_load(stream) +# silly dictionary to output if the feature is sound or not +ok = {1: "OK :D", 0: "NOT OK :C"} +# fake dataset input for covmat_from_systematics +inp = None +# list to store the implementation errors, useful for IPython debug +cd_errors, cm_errors = [], [] +# loop over the selected datasets +for i, ds in enumerate(datasets): + # dataset name, observable name, and dataset variant + setname, name, observable, variant = ds["dataset"], ds["name"], ds["obs"], ds["variant"] + # old commondata + cd_old = load_commondata_old(commondatafile=f"test_utils/DATA_{name}_{observable}.dat", + systypefile=f"test_utils/SYSTYPE_{name}_{observable}_DEFAULT.dat", + setname=setname) + # load metadata of the new commondata + metadata = parse_set_metadata(metadata_file=f"{nnpdf_path}/nnpdf_data/nnpdf_data/commondata/{setname}/metadata.yaml") + # new commondata + if variant: + cd_new = load_commondata_new(metadata=metadata.select_observable(observable).apply_variant(variant)) + else: + cd_new = load_commondata_new(metadata=metadata.select_observable(observable)) + # load covariance matrices + covmat_old = covmat_from_systematics(loaded_commondata_with_cuts=cd_old, + dataset_input=inp, + use_weights_in_covmat=False) + covmat_new = covmat_from_systematics(loaded_commondata_with_cuts=cd_new, + dataset_input=inp, + use_weights_in_covmat=False) + # matshow(covmat_new - 
+    # show()
+    # test central values
+    ds["commondata"] = allclose(cd_old.central_values, cd_new.central_values)
+    if not ds["commondata"]:
+        cd_errors.append({"old": cd_old, "new": cd_new})
+    # test covariance matrix
+    ds["covmat"] = allclose(covmat_old, covmat_new)
+    if not ds["covmat"]:
+        cm_errors.append({"old": covmat_old, "new": covmat_new})
+    # output
+    cd, cm = ds["commondata"], ds["covmat"]
+    name = f"{setname}_{observable}"
+    print(f"{i+1:2}. {name:>40} -> commondata is {ok[cd]:>9} & covariance matrix is {ok[cm]:>9}")
\ No newline at end of file
diff --git a/validphys2/src/validphys/utils.py b/validphys2/src/validphys/utils.py
index c67c5ee39..3e6437836 100644
--- a/validphys2/src/validphys/utils.py
+++ b/validphys2/src/validphys/utils.py
@@ -224,3 +224,171 @@ def scale_from_grid(grid):
     Returns ``'linear'`` if the scale of the grid object is linear, and otherwise
     ``'log'``."""
     return 'linear' if grid.scale == 'linear' else 'log'
+
+
+def uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, observable, path_systype=None, write_to_file=True):
+    """
+    Convert the new style uncertainty yaml files to the old style systype format.
+    Writes the SYSTYPE file to disk unless ``write_to_file`` is False.
+
+    Parameters
+    ----------
+    path_uncertainty_yaml : list of str, or Path
+        Paths to the new style uncertainty yaml files to be converted
+
+    path_systype : str, or Path, optional
+        path to the output systype file
+
+    Returns
+    -------
+    n_sys : int
+        Number of systematics in the systype file
+    """
+    # open the uncertainty yaml file
+    with open(path_uncertainty_yaml[0]) as f:
+        uncertainty = yaml_safe.load(f)
+    for path in path_uncertainty_yaml[1:]:
+        with open(path) as f:
+            loaded_file = yaml_safe.load(f)
+        for k in loaded_file["definitions"]:
+            uncertainty["definitions"][k] = loaded_file["definitions"][k]
+        assert(len(uncertainty["bins"]) == len(loaded_file["bins"]))
+        for i,k in enumerate(loaded_file["bins"]):
+            uncertainty["bins"][i].update(k)
+    # get uncertainty definitions
+    uncertainty_definitions = uncertainty['definitions']
+
+    # check whether path_systype is provided, else save it in the same directory as the uncertainty yaml file
+    if path_systype is None:
+        path = pathlib.Path(path_uncertainty_yaml[0]).parent
+        path_systype = path / f"SYSTYPE_{name_dataset}_{observable}_DEFAULT.dat"
+    else:
+        path_systype = pathlib.Path(path_systype) / f"SYSTYPE_{name_dataset}_{observable}_DEFAULT.dat"
+
+    # get number of sys (note: stat is not included in the sys)
+    if 'stat' in uncertainty_definitions.keys():
+        n_sys = len(uncertainty_definitions.keys()) - 1
+    else:
+        n_sys = len(uncertainty_definitions.keys())
+
+    if write_to_file:
+        # open the systype file for writing
+        with open(path_systype, 'w') as stream:
+
+            # header: number of sys
+            stream.write(f"{n_sys}\n")
+
+            # write the systype treatments
+
+            # remove stat from the uncertainty definitions
+            uncertainty_definitions.pop('stat', None)
+
+            for i, (_, sys_dict) in enumerate(uncertainty_definitions.items()):
+                # four spaces seems to be the standard format (has to be checked for other datasets than CMS_1JET_8TEV)
+                stream.write(f"{i+1}    {sys_dict['treatment']}    {sys_dict['type']}\n")
+
+    return n_sys
+
+
+def convert_new_data_to_old(path_data_yaml, path_uncertainty_yaml, path_kinematics, path_metadata, name_dataset, observable, path_DATA=None):
+    """
+    Convert the new data format into the old data format
+    """
+
+    # open the metadata yaml file
+    with open(path_metadata) as f:
+        metadata = yaml_safe.load(f)
+
+    # open the data yaml file
+    with open(path_data_yaml) as f:
+        data = yaml_safe.load(f)
+
+    # open the uncertainty yaml file
+    with open(path_uncertainty_yaml[0]) as f:
+        uncertainty = yaml_safe.load(f)
+    for path in path_uncertainty_yaml[1:]:
+        with open(path) as f:
+            loaded_file = yaml_safe.load(f)
+        for k in loaded_file["definitions"]:
+            uncertainty["definitions"][k] = loaded_file["definitions"][k]
+        assert(len(uncertainty["bins"]) == len(loaded_file["bins"]))
+        for i,k in enumerate(loaded_file["bins"]):
+            uncertainty["bins"][i].update(k)
+
+    # open the kinematics yaml file
+    with open(path_kinematics) as f:
+        kinematics = yaml_safe.load(f)
+
+    # get uncertainty definitions and values
+    uncertainty_definitions = uncertainty['definitions']
+    uncertainty_values = uncertainty['bins']
+    n_sys = uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, observable, write_to_file=False)
+    stats = []
+    for entr in uncertainty_values:
+        try: stats.append(entr["stat"])
+        except KeyError: stats.append(0.)
+    stats = np.array(stats)
+
+    # get data values
+    data_values = data['data_central']
+
+    # check whether path_DATA is provided, else save it in the same directory as the uncertainty yaml file
+    if path_DATA is None:
+        path = pathlib.Path(path_uncertainty_yaml[0]).parent
+        path_DATA = path / f"DATA_{name_dataset}_{observable}.dat"
+    else:
+        path_DATA = pathlib.Path(path_DATA) / f"DATA_{name_dataset}_{observable}.dat"
+
+    kin_names = list(kinematics['bins'][0].keys())
+    kin_values = kinematics['bins']
+    # open the DATA file for writing
+    with open(path_DATA, 'w') as stream:
+
+        # write the header: Dataset name, number of sys errors, and number of data points, whitespace separated
+        stream.write(f"{name_dataset}_{observable} {n_sys} {len(data_values)}\n")
+
+        for i, data_value in enumerate(data_values):
+            cd_line = f"{i+1:6}\t{metadata['implemented_observables'][0]['process_type']:6}\t"
+
+            for index in [0, 1, 2]:
+                if kin_values[i][kin_names[index]]['mid'] is None:
+                    kin_values[i][kin_names[index]]['mid'] = (kin_values[i][kin_names[index]]['min'] + kin_values[i][kin_names[index]]['max']) / 2
+                if kin_names[index] == "pT":
+                    cd_line += f"{kin_values[i][kin_names[index]]['mid']**2:20.12e}\t"
+                else:
+                    cd_line += f"{kin_values[i][kin_names[index]]['mid']:20.12e}\t"
+
+            cd_line += f"\t{data_value:20.12e}\t{stats[i]:20.12e}\t"
+
+            # for j, sys in enumerate(uncertainty_values):
+            sys = uncertainty_values[i]
+            for j, (sys_name, sys_val) in enumerate(sys.items()):
+                if sys_name == 'stat':
+                    continue
+
+                add_sys = sys_val
+                if data_value != 0.0:
+                    mult_sys = add_sys * 100.0 / data_value
+                else:
+                    mult_sys = 0.0
+
+                if j == len(sys)-1:
+                    cd_line += f"{add_sys:20.12e}\t {mult_sys:20.12e}\n"
+                else:
+                    cd_line += f"{add_sys:20.12e}\t {mult_sys:20.12e}\t"
+
+            stream.write(cd_line)
+
+
+
+if __name__ == '__main__':
+    new_commondata = "/Users/teto/Software/nnpdf_git/nnpdf/nnpdf_data/nnpdf_data/new_commondata"
+    test_dir = "/Users/teto/Software/simunet_git/SIMUnet/validphys2/src/validphys/test_utils"
+    name_dataset = "ATLAS_1JET_13TEV_DIF"
+    observable = "PT-Y"
+    path_unc_file = [new_commondata+"/"+name_dataset+"/uncertainties.yaml"]
+    path_data_yaml = new_commondata+"/"+name_dataset+"/data.yaml"
+    path_kin = new_commondata+"/"+name_dataset+"/kinematics.yaml"
+    path_metadata = new_commondata+"/"+name_dataset+"/metadata.yaml"
+    uncertainty_yaml_to_systype(path_unc_file, name_dataset=name_dataset, observable=observable, path_systype=test_dir)
+    convert_new_data_to_old(path_data_yaml, path_unc_file, path_kin, path_metadata, name_dataset=name_dataset, observable=observable, path_DATA=test_dir)
\ No newline at end of file
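
Usage sketch (not part of the patch; the runcard name is a placeholder and the working directory is an assumption inferred from the relative `test_utils/` output path and the new .gitignore entry): both converter scripts read the datasets from the `dataset_inputs` list of a runcard passed on the command line, and the hard-coded `nnpdf_path` / commondata paths at the top of each script must be edited first.

    # in the SIMUnet environment, on the commondata_converter_new_to_old branch:
    cd validphys2/src/validphys
    python commondata_new_to_old.py my_runcard.yaml   # writes DATA_* / SYSTYPE_* and check_commondata_new_to_old.yaml into test_utils/
    python make_plotting_files.py my_runcard.yaml     # copies the matching old-format PLOTTING_* files into test_utils/
    # in an up-to-date nnpdf environment:
    python test_commondata_new_to_old.py              # compares old vs new central values and covariance matrices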