New to old data converter #64

Status: Open. Wants to merge 28 commits into base: main.

Commits (28):
759c92d  first draft of new to old data converter (comane, Apr 16, 2024)
db610a8  added convertion from new data to old DATA_... format (comane, Apr 27, 2024)
302437b  kinematics written as numbers (comane, Apr 29, 2024)
ccd2d9c  fix typo (FrancescoMerlotti, May 8, 2024)
08c8894  add .pdf PBSP logos (FrancescoMerlotti, May 13, 2024)
18308d5  added load_commondata to core, level0_commondata_wc and make_level1_d… (FrancescoMerlotti, May 14, 2024)
db2124a  added parse_fakepdf to config.py (FrancescoMerlotti, May 16, 2024)
f5ba638  add chi2 provider functions (FrancescoMerlotti, May 20, 2024)
f3a6dd9  added usage write_chi2 (FrancescoMerlotti, May 20, 2024)
3513ff1  fixed repo (FrancescoMerlotti, May 20, 2024)
dd605f9  moved function in simunet_analysis & changed their name (FrancescoMerlotti, May 26, 2024)
c9cb4cb  changed cuts to commondata_table_indices (FrancescoMerlotti, May 30, 2024)
2caf4cb  changed cuts to commondata_table_indices (FrancescoMerlotti, May 30, 2024)
664a69b  added rules classes, static KIN_LABEL dict, and replaced cpp Export m… (FrancescoMerlotti, Jun 12, 2024)
2deae97  added commondatawriter.py & export method for CommonData python objects (FrancescoMerlotti, Jun 12, 2024)
b34de76  added xq2 map for hadronic MQQ processes ref. [2303.06159] (FrancescoMerlotti, Jun 12, 2024)
98f9c77  Revert "added xq2 map for hadronic MQQ processes ref. [2303.06159]" (FrancescoMerlotti, Jun 18, 2024)
cd67771  Revert "added commondatawriter.py & export method for CommonData pyth… (FrancescoMerlotti, Jun 18, 2024)
9c32bea  Revert "added rules classes, static KIN_LABEL dict, and replaced cpp … (FrancescoMerlotti, Jun 18, 2024)
387a866  debug convert_new_data_to_old (FrancescoMerlotti, Jun 25, 2024)
b65ceb6  added test_utils to .gitignore (FrancescoMerlotti, Jun 25, 2024)
069d827  tested writer (FrancescoMerlotti, Jul 4, 2024)
ce8d085  changes in utils are useful for data converter, changes in dataplots … (FrancescoMerlotti, Jul 11, 2024)
b55a5e2  scripts to convert and test the conversion (FrancescoMerlotti, Oct 24, 2024)
d1b60f4  make plotting files script (works on dis datasets) (FrancescoMerlotti, Oct 25, 2024)
b4eaf59  corrected bug (FrancescoMerlotti, Oct 25, 2024)
3d610ab  added cuts to fixed sm predictions (FrancescoMerlotti, Nov 4, 2024)
ed8fb57  update of commodata converter picking the right files (FrancescoMerlotti, Nov 14, 2024)
1 change: 1 addition & 0 deletions .gitignore
@@ -433,5 +433,6 @@ Session.vim
# auto-generated tag files
tags

validphys2/src/validphys/test_utils

# End of https://www.gitignore.io/api/c++,latex,cmake,python,jupyternotebook,qtcreator,vim
Binary file added PBSP_logos/PBSP_black.pdf
Binary file added PBSP_logos/PBSP_dark.pdf
Binary file added PBSP_logos/PBSP_light.pdf
109 changes: 109 additions & 0 deletions validphys2/src/validphys/commondata_new_to_old.py
@@ -0,0 +1,109 @@
"""
Commondata converter script from new to old format:
it must be run in an up to date simunet environment, in the `commondata_converter_new_to_old` branch.
"""

import os
import sys
import yaml
from validphys.utils import uncertainty_yaml_to_systype, convert_new_data_to_old

# test whether the runcard is passed
if len(sys.argv) != 2:
    raise Exception("No runcard is passed!")
card_name = sys.argv[1]
if not os.path.isfile(card_name):
    raise Exception("Runcard does not exist!")
# load runcard
with open(card_name, "rb") as stream:
    runcard = yaml.safe_load(stream)
# load datasets to convert
datasets = runcard["dataset_inputs"]

# create test directory if it does not already exist
test_dir = "test_utils"
if not os.path.isdir(test_dir):
    os.mkdir(test_dir)

# to be changed by the user: local path to the nnpdf repository
nnpdf_path = "/Users/teto/Software/nnpdf_git/nnpdf"
# new commondata path
new_commondata = f"{nnpdf_path}/nnpdf_data/nnpdf_data/commondata"
# open conversion dictionary
with open(f"{new_commondata}/dataset_names.yml", "rb") as stream:
    conversion = yaml.safe_load(stream)

# old format
old_format_names = list(conversion.keys())
# new format
new_format_names = []
for c in conversion:
    try:
        new_format_names.append(conversion[c]["dataset"])
    except TypeError:
        new_format_names.append(conversion[c])

# prepare list of the datasets to be converted
conversion_ds = []
for ds in datasets:
    if ds["dataset"] in old_format_names:
        d = conversion[ds["dataset"]]
        d["name"] = ds["dataset"]
        conversion_ds.append(d)
    elif ds["dataset"] in new_format_names:
        conversion_ds.append({"dataset": ds["dataset"], "variant": "legacy", "name": ds["dataset"]})
    else:
        conversion_ds.append({"dataset": ds["dataset"], "variant": None, "name": ds["dataset"]})

# separate the dataset & the observable names
for ds in conversion_ds:
    s = ds["dataset"]
    ds["dataset"] = s[:s.rfind("_")]
    ds["obs"] = s[s.rfind("_") + 1:]
    n = ds["name"]
    ds["name"] = n[:n.rfind("_")]

# convert
for i, ds in enumerate(conversion_ds):
    var_int, obs_ind = "variant", "obs"
    # load metadata file
    path_metadata = new_commondata + "/" + ds["dataset"] + "/metadata.yaml"
    with open(path_metadata, "r") as stream:
        metadata = yaml.safe_load(stream)
    for o in metadata["implemented_observables"]:
        if o["observable_name"] == ds[obs_ind]:
            data_file_name, unc_file_name, kin_file_name = o["data_central"], o["data_uncertainties"][0], o["kinematics"]["file"]
    # if only in the new format
    if not ds[var_int]:
        path_data_yaml = new_commondata + "/" + ds["dataset"] + f"/{data_file_name}"
        path_unc_file = new_commondata + "/" + ds["dataset"] + f"/{unc_file_name}"
        path_kin = new_commondata + "/" + ds["dataset"] + f"/{kin_file_name}"
    # if also in the old format (legacy variants)
    else:
        if os.path.isfile(new_commondata + "/" + ds["dataset"] + f"/data_{ds[var_int]}_{ds[obs_ind]}.yaml"):
            path_data_yaml = new_commondata + "/" + ds["dataset"] + f"/data_{ds[var_int]}_{ds[obs_ind]}.yaml"
        else:
            path_data_yaml = new_commondata + "/" + ds["dataset"] + f"/data_legacy_{ds[obs_ind]}.yaml"
        path_unc_file = new_commondata + "/" + ds["dataset"] + f"/uncertainties_{ds[var_int]}_{ds[obs_ind]}.yaml"
        path_kin = new_commondata + "/" + ds["dataset"] + f"/kinematics_{ds[obs_ind]}.yaml"
    # write uncertainty files
    uncertainty_yaml_to_systype(path_unc_file,
                                name_dataset=ds["name"],
                                observable=ds["obs"],
                                path_systype=test_dir)
    # write commondata files
    convert_new_data_to_old(path_data_yaml,
                            path_unc_file,
                            path_kin,
                            path_metadata,
                            name_dataset=ds["name"],
                            observable=ds["obs"],
                            path_DATA=test_dir)
    # output
    name = ds["name"] + "_" + ds["obs"]
    print(f"{i+1:>2}. {name:>40} converted!")

# write check runcard
with open("test_utils/check_commondata_new_to_old.yaml", "w") as stream:
    yaml.safe_dump(conversion_ds, stream)
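
For orientation, a sketch of how the script above is driven: the runcard only needs a `dataset_inputs` list, and the converted SYSTYPE/DATA files (plus the check runcard) land in `test_utils/`. The dataset names below are illustrative, and the script is assumed to be run from its own directory inside the SimuNet environment:

import subprocess
import yaml

# illustrative runcard: one old-format name and one new-format name
runcard = {
    "dataset_inputs": [
        {"dataset": "NMC"},
        {"dataset": "HERA_NC_318GEV_EM-SIGMARED"},
    ]
}
with open("convert.yaml", "w") as stream:
    yaml.safe_dump(runcard, stream)

# equivalent to running: python commondata_new_to_old.py convert.yaml
subprocess.run(["python", "commondata_new_to_old.py", "convert.yaml"], check=True)
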
5 changes: 5 additions & 0 deletions validphys2/src/validphys/config.py
@@ -38,6 +38,7 @@
    MatchedCuts,
    SimilarCuts,
    ThCovMatSpec,
    PDF,
)
from validphys.fitdata import fitted_replica_indexes, num_fitted_replicas
from validphys.loader import (
@@ -171,6 +172,10 @@ def parse_pdf(self, name: str):
        except NotImplementedError as e:
            raise ConfigError(str(e))
        return pdf

    def parse_fakepdf(self, name: str) -> PDF:
        """PDF set used to generate the fake data in a closure test."""
        return self.parse_pdf(name)

    def parse_load_weights_from_fit(self, name: str):
        """A fit in the results folder, containing at least a valid filter result."""
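
Because validphys `parse_<key>` methods consume the runcard key of the same name, this addition lets a closure-test runcard declare the PDF used to generate the fake data. A hedged sketch of such a fragment (the PDF set name is illustrative):

import yaml

# illustrative closure-test runcard fragment; parse_fakepdf receives the value
# of the `fakepdf` key and resolves it exactly like an ordinary `pdf` entry
fragment = yaml.safe_load("fakepdf: NNPDF40_nnlo_as_01180")
print(fragment["fakepdf"])
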
22 changes: 11 additions & 11 deletions validphys2/src/validphys/convolution.py
@@ -119,17 +119,17 @@ def _predictions(dataset, pdf, fkfunc):
    # predictions instead.
    all_predictions = []
    for fk in dataset.fkspecs:
-        if not fk.use_fixed_predictions:
-            all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf))
-        else:
-            with open(fk.fixed_predictions_path, 'rb') as f:
-                fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed'])
-            # Now need to reshape it according it to the expected number of predictions
-            if fkfunc == central_fk_predictions:
-                all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data']))
-            elif fkfunc == fk_predictions:
-                fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1))
-                all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())]))
+        if not fk.use_fixed_predictions:
+            all_predictions.append(fkfunc(load_fktable(fk).with_cuts(cuts), pdf))
+        else:
+            with open(fk.fixed_predictions_path, 'rb') as f:
+                fixed_predictions = np.array(yaml.safe_load(f)['SM_fixed'])[cuts]
+            # Now need to reshape it according it to the expected number of predictions
+            if fkfunc == central_fk_predictions:
+                all_predictions.append(pd.DataFrame(fixed_predictions, columns=['data']))
+            elif fkfunc == fk_predictions:
+                fixed_predictions = np.tile(fixed_predictions, (pdf.get_members(), 1))
+                all_predictions.append(pd.DataFrame(fixed_predictions.T, columns=[i for i in range(pdf.get_members())]))

    return opfunc(*all_predictions)

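
The key change in this hunk is the `[cuts]` applied to the stored `SM_fixed` vector, so fixed SM predictions are masked the same way as genuine FK-table convolutions; the error-member case is then tiled to the expected shape (rows are data points after cuts, columns are PDF members). A standalone sketch of that reshaping, with illustrative sizes:

import numpy as np
import pandas as pd

sm_fixed = np.array([1.0, 2.0, 3.0, 4.0])  # plays the role of yaml['SM_fixed']
cuts = np.array([0, 2, 3])                  # indices of surviving data points
n_members = 5                               # plays the role of pdf.get_members()

fixed = sm_fixed[cuts]                      # apply cuts first, as in the fix
tiled = np.tile(fixed, (n_members, 1))      # shape (members, ndata)
preds = pd.DataFrame(tiled.T, columns=range(n_members))  # (ndata, members)
print(preds.shape)                          # -> (3, 5)
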
12 changes: 12 additions & 0 deletions validphys2/src/validphys/core.py
@@ -316,6 +316,18 @@ def load(self)->CommonData:
        #TODO: Use better path handling in python 3.6
        return CommonData.ReadFile(str(self.datafile), str(self.sysfile))

    def load_commondata(self, cuts=None):
        """
        Loads a coredata.CommonData object from a core.CommonDataSetSpec object;
        cuts are applied if provided.
        """
        # import here to avoid circular imports
        from validphys.commondataparser import load_commondata
        cd = load_commondata(self)
        if cuts is not None:
            cd = cd.with_cuts(cuts)
        return cd

    @property
    def plot_kinlabels(self):
        return get_plot_kinlabels(self)
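
A hedged usage sketch of the new `load_commondata` method. The provider names and call signatures below are assumptions for illustration (in practice the spec and cuts come from the usual validphys machinery) and are not part of this diff:

from validphys.api import API

# assumption: the dataset name, theory id, and these API entry points are illustrative
spec = API.commondata(dataset_input={"dataset": "NMC"})
cuts = API.cuts(dataset_input={"dataset": "NMC"}, theoryid=200, use_cuts="internal")
cd = spec.load_commondata(cuts=cuts.load())  # coredata.CommonData with cut points dropped
print(cd.ndata)
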
3 changes: 3 additions & 0 deletions validphys2/src/validphys/coredata.py
@@ -263,6 +263,9 @@ def additive_errors(self):
        add_table.columns = add_systype["name"].to_numpy()
        return add_table.loc[:, add_table.columns != "SKIP"]

    @property
    def commondata_table_indices(self):
        return self.commondata_table.index - 1

    def systematic_errors(self, central_values=None):
        """Returns all systematic errors as absolute uncertainties, with a
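
The `- 1` encodes a convention mismatch: old-format commondata tables number their data points from 1, while cuts and numpy-style positional indexing are 0-based. A minimal sketch of what the property returns, with an illustrative three-point table:

import pandas as pd

# old-format tables index data points from 1
commondata_table = pd.DataFrame({"data": [0.1, 0.2, 0.3]}, index=[1, 2, 3])
indices = commondata_table.index - 1      # -> [0, 1, 2], usable as positions
print(commondata_table["data"].to_numpy()[indices])
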
20 changes: 15 additions & 5 deletions validphys2/src/validphys/dataplots.py
@@ -866,7 +866,11 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
"""
info = get_info(dataset)

table = kitable(dataset, info)
try:
table = kitable(dataset, info)
except:
log.warning(f"Problems with kitable loading {dataset.name}")
table = kitable(dataset.commondata, info)
figby = sane_groupby_iter(table, info.figure_by)

basis = obs_pdf_correlations.basis
Expand All @@ -880,7 +884,9 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
plotting_var = info.get_xcol(table)

#TODO: vmin vmax should be global or by figure?
vmin,vmax = min(plotting_var), max(plotting_var)
vmin, vmax = min(plotting_var), max(plotting_var)
if type(vmin) == str or type(vmax) == str:
vmin, vmax = 0, 1
if info.x_scale == 'log':
norm = mcolors.LogNorm(vmin, vmax)
else:
Expand All @@ -889,7 +895,7 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
sm = cm.ScalarMappable(cmap=cm.viridis, norm=norm)

for same_vals, fb in figby:
grid = fullgrid[ np.asarray(fb.index),...]
grid = fullgrid[np.arange(len(fb.index)), ...]


#Use the maximum absolute correlation for plotting purposes
Expand All @@ -906,9 +912,13 @@ def plot_smpdf(pdf, dataset, obs_pdf_correlations, mark_threshold:float=0.9):
h*=2.5
fig,axes = plt.subplots(nrows=nf ,sharex=True, figsize=(w,h), sharey=True)
fig.suptitle(title)
colors = sm.to_rgba(info.get_xcol(fb))
if np.vectorize(isinstance)(info.get_xcol(fb), str).any():
temp = np.linspace(start=0, stop=1, num=len(info.get_xcol(fb)))
colors = sm.to_rgba(temp)
else:
colors = sm.to_rgba(info.get_xcol(fb))
for flindex, (ax, fl) in enumerate(zip(axes, fls)):
for i,color in enumerate(colors):
for i, color in enumerate(colors):
ax.plot(x, grid[i,flindex,:].T, color=color)


Expand Down
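
The color-mapping guard exists because converted kinematics columns can contain strings, which `Normalize`/`to_rgba` cannot handle; string bins are instead spread evenly over [0, 1]. A self-contained sketch of the same technique, with illustrative column values:

import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as mcolors

xcol = np.array(["bin1", "bin2", "bin3"])   # categorical plotting variable
sm = cm.ScalarMappable(cmap=cm.viridis, norm=mcolors.Normalize(0, 1))
if np.vectorize(isinstance)(xcol, str).any():
    positions = np.linspace(start=0, stop=1, num=len(xcol))
    colors = sm.to_rgba(positions)           # one evenly spaced color per bin
else:
    colors = sm.to_rgba(xcol.astype(float))
print(colors.shape)                          # -> (3, 4): one RGBA row per point
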
72 changes: 72 additions & 0 deletions validphys2/src/validphys/make_plotting_files.py
@@ -0,0 +1,72 @@
import os
import sys
import yaml
import shutil
import filecmp

# simunet environment commondata path
old_commondata = "/Users/teto/miniconda3/envs/simunet_release/share/NNPDF/data/commondata"
# nnpdf commondata path
new_commondata = "/Users/teto/Software/nnpdf_git/nnpdf/nnpdf_data/nnpdf_data/commondata"
# test whether the runcard is passed
if len(sys.argv) != 2:
    raise Exception("No runcard is passed!")
card_name = sys.argv[1]
if not os.path.isfile(card_name):
    raise Exception("Runcard does not exist!")
# load runcard
with open(card_name, "rb") as stream:
    card = yaml.safe_load(stream)
# load conversion dictionary
with open(new_commondata + "/dataset_names.yml", "rb") as stream:
    conv = yaml.safe_load(stream)
# load datasets to convert
datasets = card["dataset_inputs"]
# temporary list
temp = []
# back conversion map
back_conv = {}
# loop over datasets to convert
for ds in datasets:
    ds_name = ds["dataset"]
    if ds_name in list(conv.keys()) and "-" in ds_name:
        # save the datasets to map
        temp.append(conv[ds_name])
        # print(f"{ds_name} is in the old format with a new name! (Do it manually)")
    else:
        for cds in conv:
            try:
                flag = ds_name == conv[cds]["dataset"]
            except TypeError:
                flag = ds_name == conv[cds]
            if flag:
                back_conv[ds_name] = cds
# loop over the datasets that we still have to convert
for ds in temp:
    ds_name, ds_var = ds["dataset"], ds["variant"]
    back_conv[ds_name] = []
    for cds in conv:
        try:
            flag = (ds_name == conv[cds]["dataset"]) and (ds_var == conv[cds]["variant"] and "-" not in cds)
        except TypeError:
            flag = ds_name == conv[cds]
        if flag:
            back_conv[ds_name] = cds
# copy
for i, bc in enumerate(back_conv):
    # new file name
    filename_new = f"test_utils/PLOTTING_{bc}.yml"
    # old file name
    if os.path.isfile(old_commondata + f"/PLOTTING_{back_conv[bc]}.yml"):
        filename_old = old_commondata + f"/PLOTTING_{back_conv[bc]}.yml"
    elif os.path.isfile(old_commondata + f"/PLOTTING_{back_conv[bc]}.yaml"):
        filename_old = old_commondata + f"/PLOTTING_{back_conv[bc]}.yaml"
    else:
        print(f"Missing PLOTTING file for {back_conv[bc]}!")
        continue  # nothing to copy; avoids reusing a stale filename_old
    # copy
    shutil.copy(filename_old, filename_new)
    # test the copies
    if filecmp.cmp(filename_old, filename_new):
        print(f"{i+1:>2}. Copied plotting file {back_conv[bc]:>40} -> {bc:>40}!")
    else:
        print(f"{i+1:>2}. Error during copy of plotting file {back_conv[bc]:>40} -> {bc:>40}!")
2 changes: 1 addition & 1 deletion validphys2/src/validphys/pdfplots.py
@@ -630,7 +630,7 @@ def plot_lumi1d(
        if isinstance(gv, MCStats) and show_mc_errors:
            ax.plot(mx, errstddown / norm, linestyle="--", color=color)
            ax.plot(mx, errstdup / norm, linestyle="--", color=color)
-            label_add = r"($68%$ c.l.+$1\sigma$)" if legend_stat_labels else ""
+            label_add = r"($68\%$ c.l.+$1\sigma$)" if legend_stat_labels else ""
            outer = True
        else:
            label_add = r"($68\%$ c.l.)" if legend_stat_labels else ""