Commit b7865fa

added instructions for running HF models
1 parent a4caf5a commit b7865fa

File tree

README.md
eval/hf_runner.py
main.py

3 files changed: +92 -17 lines changed

README.md

Lines changed: 17 additions & 0 deletions
@@ -58,6 +58,7 @@ To test your own query generator with our framework, you would need to extend `Q
 
 ### Running the test
 
+#### OpenAI
 To test it out with just 1 question (instead of all 175):
 
 ```bash
@@ -73,6 +74,22 @@ python main.py \
   -v
 ```
 
+#### HuggingFace
+To test it out with just 10 questions (instead of all 175):
+
+```bash
+mkdir results  # create directory for storing results
+
+# use the -W ignore option to suppress warnings about sequential use of the pipeline
+python -W ignore main.py \
+  -q data/questions_gen.csv \
+  -o results/results.csv \
+  -g hf \
+  -f query_generators/prompts/sample_hf_prompt.md \
+  -m defog/starcoder-finetune-v3 \
+  -n 10
+```
+
 You can explore the results generated and aggregate the various metrics that you care about to understand your query generator's performance. Happy iterating!
 
 ## Misc
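As a quick illustration of that last point, here is a minimal sketch of how one might aggregate the results file. It assumes the `results/results.csv` path from the example above and the `correct`, `subset`, and `query_category` columns that `eval/hf_runner.py` (below) writes:

```python
import pandas as pd

# load the per-question results written by the eval run
results = pd.read_csv("results/results.csv")

# overall accuracy across all questions attempted
print(f"exact matches: {results['correct'].mean():.2%}")
print(f"subset matches: {results['subset'].mean():.2%}")

# per-category breakdown to spot where the generator struggles
print(results.groupby("query_category")[["correct", "subset"]].mean())
```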

eval/hf_runner.py

Lines changed: 73 additions & 16 deletions
@@ -1,8 +1,10 @@
 from eval.eval import compare_df, query_postgres_db, subset_df
 import pandas as pd
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from utils.pruning import prune_metadata_str
+from tqdm import tqdm
+from psycopg2.extensions import QueryCanceledError
 
 def prepare_questions_df(questions_file, num_questions):
     question_query_df = pd.read_csv(questions_file, nrows=num_questions)
@@ -39,6 +41,7 @@ def run_hf_eval(
     prompt_file: str,
     num_questions: int = None,
     model_name : str = "defog/starcoder-finetune-v3",
+    output_file : str = "results.csv",
 ):
     print("preparing questions...")
     # get questions
@@ -50,25 +53,79 @@
     print("questions prepared\nnow loading model...")
     # initialize tokenizer and model
     tokenizer, model = get_tokenizer_model(model_name)
-    tokenizer.pad_token = tokenizer.eos_token
 
-    print("model loaded\nnow generating predictions...")
+    print("model loaded\nnow generating and evaluating predictions...")
     # generate predictions
     eos_token_id = tokenizer.convert_tokens_to_ids(["```"])[0]
-    inputs = tokenizer(df["prompt"].tolist(), return_tensors="pt", padding=True)
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=600,
-        do_sample=False,
-        num_beams=4,
-        num_return_sequences=1,
-        eos_token_id=eos_token_id,
+    pipe = pipeline("text-generation",
+        model=model,
+        tokenizer=tokenizer
     )
-    predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-    df["prediction"] = predictions
-    df['generated_query'] = df['prediction'].apply(lambda x: x.split("```sql")[-1].split(";")[0].strip())
 
     # from here, just do the usual eval stuff
+    total_tried = 0
+    total_correct = 0
+    output_rows = []
+    with tqdm(total=len(df)) as pbar:
+        for row in df.to_dict("records"):
+            total_tried += 1
+            generated_query = pipe(
+                row['prompt'],
+                max_new_tokens=600,
+                do_sample=False,
+                num_beams=4,
+                num_return_sequences=1,
+                eos_token_id=eos_token_id,
+                pad_token_id=eos_token_id,
+            )[0]['generated_text'].split("```sql")[-1].split(";")[0].strip()
+
+            row["generated_query"] = generated_query
+            golden_query = row["query"]
+            db_name = row["db_name"]
+            question = row["question"]
+            query_category = row["query_category"]
+            correct = subset = 0
+            generated_result = expected_result = None
 
-    # export results to CSV before doing anything else
-    df.to_csv("hf_pred.csv", index=False)
+            try:
+                expected_result = query_postgres_db(
+                    golden_query, db_name
+                ).rename(columns=str.lower)
+
+                generated_result = query_postgres_db(
+                    generated_query, db_name
+                ).rename(columns=str.lower)
+
+                correct = subset = int(
+                    compare_df(
+                        expected_result, generated_result, query_category, question
+                    )
+                )
+                if not correct:
+                    subset = subset_df(
+                        df_sub=expected_result,
+                        df_super=generated_result,
+                        query_category=query_category,
+                        question=question,
+                    )
+                row["correct"] = int(correct)
+                row["subset"] = int(subset)
+                row["error_msg"] = ""
+                if subset:
+                    total_correct += 1
+            except QueryCanceledError as e:
+                row["timeout"] = 1
+                row["error_msg"] = f"QUERY EXECUTION TIMEOUT: {e}"
+            except Exception as e:
+                row["error_db_exec"] = 1
+                row["error_msg"] = f"QUERY EXECUTION ERROR: {e}"
+
+            output_rows.append(row)
+            pbar.update(1)
+            pbar.set_description(
+                f"Correct so far: {total_correct}/{total_tried} ({100*total_correct/total_tried:.2f}%)"
+            )
+
+    output_df = pd.DataFrame(output_rows)
+    output_df = output_df.sort_values(by=["db_name", "query_category", "question"])
+    output_df.to_csv(output_file, index=False, float_format="%.2f")
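The crux of the new loop is the chain applied to the pipeline output: generation is steered to stop once the model emits the closing code-fence token (passed as both `eos_token_id` and `pad_token_id`), and the raw text is then reduced to the bare SQL between the last "```sql" marker and the first semicolon. A standalone sketch of that parsing step, using a made-up completion string for illustration:

```python
# a hypothetical raw completion; in run_hf_eval this text comes from
# pipe(row['prompt'], ...)[0]['generated_text']
generated_text = (
    "Here is the SQL query:\n"
    "```sql\n"
    "SELECT name, price FROM products WHERE price > 10;\n"
    "```"
)

# take the text after the last ```sql marker, drop everything from the first
# semicolon onward, and trim whitespace -- the same chain used above
generated_query = generated_text.split("```sql")[-1].split(";")[0].strip()
print(generated_query)  # SELECT name, price FROM products WHERE price > 10
```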

main.py

Lines changed: 2 additions & 1 deletion
@@ -25,7 +25,8 @@
         questions_file = args.questions_file,
         prompt_file = args.prompt_file,
         num_questions = args.num_questions,
-        model_name = args.model
+        model_name = args.model,
+        output_file = args.output_file,
     )
 else:
     raise ValueError(f"Invalid model type: {args.model_type}. Model type must be one of: 'openai', 'hf'")
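`main.py` now forwards `args.output_file` into `run_hf_eval`. The flag definition itself is outside this diff; judging from the `-o results/results.csv` option in the README example, the parser presumably gains something like the following hypothetical sketch (flag names are assumptions, not part of this commit):

```python
import argparse

parser = argparse.ArgumentParser()
# assumed flag names, inferred from the README example above --
# the actual parser definitions are not shown in this commit
parser.add_argument("-o", "--output_file", type=str, default="results.csv")
parser.add_argument("-n", "--num_questions", type=int, default=None)
parser.add_argument("-m", "--model", type=str, default="defog/starcoder-finetune-v3")
args = parser.parse_args()
```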
