From b7865fac84c0dd8ff71c3e2290cfe5e8d3b87f09 Mon Sep 17 00:00:00 2001
From: Rishabh Srivastava
Date: Fri, 11 Aug 2023 22:09:57 +0000
Subject: [PATCH] added instructions for running HF models

---
 README.md         | 17 +++++++++
 eval/hf_runner.py | 89 ++++++++++++++++++++++++++++++++++++++---------
 main.py           |  3 +-
 3 files changed, 92 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index acf4d06..3b51b2c 100644
--- a/README.md
+++ b/README.md
@@ -58,6 +58,7 @@ To test your own query generator with our framework, you would need to extend `Q
 
 ### Running the test
 
+#### OpenAI
 To test it out with just 1 question (instead of all 175):
 
 ```bash
@@ -73,6 +74,22 @@ python main.py \
   -v
 ```
 
+#### HuggingFace
+To test it out with just 10 questions (instead of all 175):
+
+```bash
+mkdir results # create directory for storing results
+
+# use the -W ignore option to suppress warnings about sequential use of the pipeline
+python -W ignore main.py \
+  -q data/questions_gen.csv \
+  -o results/results.csv \
+  -g hf \
+  -f query_generators/prompts/sample_hf_prompt.md \
+  -m defog/starcoder-finetune-v3 \
+  -n 10
+```
+
 You can explore the results generated and aggregate the various metrics that you care about to understand your query generator's performance. Happy iterating!
 
 ## Misc
diff --git a/eval/hf_runner.py b/eval/hf_runner.py
index 61b09b7..876c7ad 100644
--- a/eval/hf_runner.py
+++ b/eval/hf_runner.py
@@ -1,8 +1,10 @@
 from eval.eval import compare_df, query_postgres_db, subset_df
 import pandas as pd
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from utils.pruning import prune_metadata_str
+from tqdm import tqdm
+from psycopg2.extensions import QueryCanceledError
 
 def prepare_questions_df(questions_file, num_questions):
     question_query_df = pd.read_csv(questions_file, nrows=num_questions)
@@ -39,6 +41,7 @@ def run_hf_eval(
     prompt_file: str,
     num_questions: int = None,
     model_name : str = "defog/starcoder-finetune-v3",
+    output_file : str = "results.csv",
 ):
     print("preparing questions...")
     # get questions
@@ -50,25 +53,79 @@
     print("questions prepared\nnow loading model...")
     # initialize tokenizer and model
     tokenizer, model = get_tokenizer_model(model_name)
-    tokenizer.pad_token = tokenizer.eos_token
-    print("model loaded\nnow generating predictions...")
+    print("model loaded\nnow generating and evaluating predictions...")
 
     # generate predictions
     eos_token_id = tokenizer.convert_tokens_to_ids(["```"])[0]
-    inputs = tokenizer(df["prompt"].tolist(), return_tensors="pt", padding=True)
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=600,
-        do_sample=False,
-        num_beams=4,
-        num_return_sequences=1,
-        eos_token_id=eos_token_id,
+    pipe = pipeline("text-generation",
+        model=model,
+        tokenizer=tokenizer
     )
-    predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-    df["prediction"] = predictions
-    df['generated_query'] = df['prediction'].apply(lambda x: x.split("```sql")[-1].split(";")[0].strip())
 
     # from here, just do the usual eval stuff
 
+    total_tried = 0
+    total_correct = 0
+    output_rows = []
+    with tqdm(total=len(df)) as pbar:
+        for row in df.to_dict("records"):
+            total_tried += 1
+            generated_query = pipe(
+                row['prompt'],
+                max_new_tokens=600,
+                do_sample=False,
+                num_beams=4,
+                num_return_sequences=1,
+                eos_token_id=eos_token_id,
+                pad_token_id=eos_token_id,
+            )[0]['generated_text'].split("```sql")[-1].split(";")[0].strip()
+
+            row["generated_query"] = generated_query
row["query"] + db_name = row["db_name"] + question = row["question"] + query_category = row["query_category"] + correct = subset = 0 + generated_result = expected_result = None - # export results to CSV before doing anything else - df.to_csv("hf_pred.csv", index=False) \ No newline at end of file + try: + expected_result = query_postgres_db( + golden_query, db_name + ).rename(columns=str.lower) + + generated_result = query_postgres_db( + generated_query, db_name + ).rename(columns=str.lower) + + correct = subset = int( + compare_df( + expected_result, generated_result, query_category, question + ) + ) + if not correct: + subset = subset_df( + df_sub=expected_result, + df_super=generated_result, + query_category=query_category, + question=question, + ) + row["correct"] = int(correct) + row["subset"] = int(subset) + row["error_msg"] = "" + if subset: + total_correct += 1 + except QueryCanceledError as e: + row["timeout"] = 1 + row["error_msg"] = f"QUERY EXECUTION TIMEOUT: {e}" + except Exception as e: + row["error_db_exec"] = 1 + row["error_msg"] = f"QUERY EXECUTION ERROR: {e}" + + output_rows.append(row) + pbar.update(1) + pbar.set_description( + f"Correct so far: {total_correct}/{total_tried} ({100*total_correct/total_tried:.2f}%)" + ) + + output_df = pd.DataFrame(output_rows) + output_df = output_df.sort_values(by=["db_name", "query_category", "question"]) + output_df.to_csv(output_file, index=False, float_format="%.2f") \ No newline at end of file diff --git a/main.py b/main.py index 8fc91fc..e8b0f36 100644 --- a/main.py +++ b/main.py @@ -25,7 +25,8 @@ questions_file = args.questions_file, prompt_file = args.prompt_file, num_questions = args.num_questions, - model_name = args.model + model_name = args.model, + output_file = args.output_file, ) else: raise ValueError(f"Invalid model type: {args.model_type}. Model type must be one of: 'openai', 'hf'")