Skip to content

Commit

Permalink
Merge branch 'main' into jtcantin-coarse_set_add_dmrg_ref_energies
Browse files Browse the repository at this point in the history
  • Loading branch information
jtcantin committed Dec 10, 2024
2 parents 410ff7f + 6b03107 commit 916a116
Show file tree
Hide file tree
Showing 263 changed files with 315,790 additions and 6,859 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# private working folder that doesn't get pushed to repo:
*_private_working_folder*
*.vscode*

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
87 changes: 0 additions & 87 deletions BubbleML/miniML/TESTING_ONLY_Hamiltonian_features.csv

This file was deleted.

87 changes: 0 additions & 87 deletions BubbleML/miniML/TESTING_ONLY_solver.ccsdt.labels.csv

This file was deleted.

92 changes: 72 additions & 20 deletions BubbleML/miniML/miniML.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,30 +33,31 @@



########################### start of functions #####################


################################################################################
def evaluate(model, test_features, test_labels, model_name):
    '''
    Score the trained ML model ("model" with "model_name") on test_features
    against test_labels, and log the per-class precision, recall and F1-score.

    Parameters:
        model:         fitted estimator; only its predict() method is used here.
        test_features: feature rows passed to model.predict().
        test_labels:   true labels compared against the predictions.
        model_name:    display name used in the log header.

    Returns the f1-score (harmonic mean of precision and recall) as produced by
    precision_recall_fscore_support for labels [0, 1], i.e. one value per class.
    '''
    y_pred = model.predict(test_features)
    labels = np.array([0, 1])  # because I want the results back in this order
    prec, recall, f1_score, support = precision_recall_fscore_support(
        test_labels, y_pred, labels=labels
    )
    # The support is the number of occurrences of each class in y_true.

    # logging applies extra positional arguments with %-formatting, so the
    # message needs a placeholder; passing model_name as the message with a
    # stray argument makes logging report a formatting error instead.
    logging.info('%s Performance:', model_name)
    logging.info('Precision [class (target=False) , class (target=true) ]: {:0.2f}%, {:0.2f}%,'.format(prec[0]*100, prec[1]*100))
    logging.info('Recall [class (target=False) , class (target=true) ]: {:0.2f}%, {:0.2f}%,'.format(recall[0]*100, recall[1]*100))
    logging.info('F1-score [class (target=False) , class (target=true) ]: {:0.2f}%, {:0.2f}%,'.format(f1_score[0]*100, f1_score[1]*100))

    # Full text report for all classes found in the data, printed to stdout.
    cr = classification_report(test_labels, y_pred)
    pprint.pp(cr)
    return f1_score




################################################################################
def trainML(
X,
Y,
Expand All @@ -72,8 +73,6 @@ def trainML(
'''
This function trains a machine learning model (name given by model_name) with data points X and labels Y
with or without hyperparameter optimization and cross-validation (option given by hypopt_cv)
Returns the model used and the accuracy
'''

X_train = X #will be scaling this for svm
Expand Down Expand Up @@ -110,7 +109,7 @@ def trainML(
if hypopt_cv == 0:
#uses all the data for train and tests on the same data
model.fit(X_train,y_train)
accuracy = evaluate(model, X_train, y_train, model_name)
f1_score = evaluate(model, X_train, y_train, model_name)
from pprint import pprint
# Look at parameters used by our current forest (print this into the Text Edit Box). #later
print('Parameters currently in use by base model:\n')
Expand All @@ -124,12 +123,12 @@ def trainML(
cv = kfold_num, n_jobs = -1, verbose = 2)

model.fit(X_train,y_train)
accuracy = evaluate(model, X_train, y_train, model_name)
f1_score = evaluate(model, X_train, y_train, model_name)


#ratio_of_solved_to_truesolved @ above 50% (bounded by min and max of points or convex hull - as in the figure)
#sending original X (it will be transformed)
ratio = compute_ratio_of_solved_to_unsolved(
ml_solvability_ratio = compute_ratio_of_solved_to_unsolved(
X,
y_train,
sc,
Expand All @@ -139,7 +138,7 @@ def trainML(
solver_uuid=solver_uuid,
draw_plot=verbose
)
logging.info('Percent of solvable space: ', str(ratio))
logging.info('Percent of solvable space: ', str(ml_solvability_ratio))

#explain all the predictions in the test set
plt.figure()
Expand Down Expand Up @@ -171,10 +170,17 @@ def trainML(
logging.info(f"wrote probs to file {probs_file_name}.")



return model, accuracy
return ml_solvability_ratio, model, f1_score









################################################################################
def getProjectedData(X, latent_model_name):

if latent_model_name == 'PCA':
Expand Down Expand Up @@ -211,6 +217,7 @@ def getProjectedData(X, latent_model_name):



################################################################################
def getConvexHull(points):

# Compute the convex hull
Expand All @@ -228,6 +235,7 @@ def getConvexHull(points):
return hull


################################################################################
def compute_ratio_of_solved_to_unsolved(
X,
Y,
Expand Down Expand Up @@ -310,14 +318,41 @@ def compute_ratio_of_solved_to_unsolved(


result = np.where(prob[:,1] > conf_thresh)
ratio_solvable = len(result[0])/len(prob[:,1])
ml_solvability_ratio = len(result[0])/len(prob[:,1])

print(f"ml_solvability_ratio: {ml_solvability_ratio}")
return ml_solvability_ratio







return ratio_solvable

############################### end of functions #################










################################################################################
def main(args):

# if args.verbose:
# #configure logging. we are probably running this script one time by itself
# logger = logging.getLogger()
# logger.setLevel(logging.INFO)
# console_handler = logging.StreamHandler()
# formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
# console_handler.setFormatter(formatter)
# logger.addHandler(console_handler)


mini_ml_config_file_name = args.config_file
with open(mini_ml_config_file_name, 'r') as j:
mini_ml_config = json.loads(j.read())
Expand All @@ -327,9 +362,12 @@ def main(args):
df = pd.merge(
df_hams,
df_labels,
on="fcidump_uuid",
how="outer" # fill in NaN when merging if uuids missing from either file.
on="task_uuid",
how="inner"
# how=inner: keep only rows whose task_uuid exists in both files (possibly fewer rows).
# how=outer fill in NaN when merging if uuids missing from either file.
)
# df.to_csv("artifact.miniML.features_and_labels.csv")

selected_features = mini_ml_config["features"]
target = "label" # a column header in the solver_labels.csv file.
Expand All @@ -354,7 +392,7 @@ def main(args):
latent_model_name = 'NNMF'
model_name = 'SVM'
importance_features_desired = 1
model, cr = trainML(
ml_solvability_ratio, model, f1_score = trainML(
X=X,
Y=Y,
latent_model_name=latent_model_name,
Expand All @@ -367,9 +405,23 @@ def main(args):
)


if args.verbose:
logging.info(f"solver_uuid: {args.solver_uuid}")
logging.info(f"ml_solvability_ratio: {ml_solvability_ratio}")
logging.info(f"f1_score: {f1_score}")
print(f"solver_uuid: {args.solver_uuid}")
print(f"ml_solvability_ratio: {ml_solvability_ratio}")
print(f"f1_score: {f1_score}")


return ml_solvability_ratio, f1_score






################################################################################
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""
Expand Down Expand Up @@ -418,7 +470,7 @@ def main(args):
help="""
The/path/to/the solver_labels.csv file. The labels are True/False
to indicate that a solver can find the ground state energy of a
Hamiltonian (by FCIDUMP UUID).
Hamiltonian (by task_uuid).
"""
)

Expand Down
2 changes: 1 addition & 1 deletion BubbleML/miniML/miniML_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"log_fci_dim",
"n_elec",
"n_orbs",
"df_spectral_gap"
"df_gap"
],
"threshold_for_confidence_of_solvability": 0.75
}
Binary file not shown.
Loading

0 comments on commit 916a116

Please sign in to comment.