QOL changes for generations #166

Merged: 21 commits, Jan 8, 2024
Changes from 16 commits
60 changes: 44 additions & 16 deletions bigcode_eval/evaluator.py
@@ -3,6 +3,9 @@
import os
import warnings

from typing import List


from bigcode_eval import tasks
from bigcode_eval.generation import parallel_generations

@@ -24,7 +27,6 @@
################################################################################\
"""


class Evaluator:
def __init__(self, accelerator, model, tokenizer, args):
self.accelerator = accelerator
@@ -38,11 +40,16 @@ def __init__(self, accelerator, model, tokenizer, args):
# code evaluation permission
self.allow_code_execution = args.allow_code_execution

def generate_text(self, task_name):
def generate_text(self, task_name, intermediate_generations=None):
task = tasks.get_task(task_name, self.args)
dataset = task.get_dataset()
# if args.limit is None, use all samples
n_tasks = self.args.limit if self.args.limit else len(dataset)
# if args.limit is used, make sure args.limit_start + args.limit <= len(dataset)
n_tasks = min(self.args.limit, len(dataset) - self.args.limit_start) if self.args.limit else len(dataset)
# when args.limit is None
# adjust n_tasks by args.limit_start to prevent out of bounds issues
if not self.args.limit:
n_tasks -= self.args.limit_start
references = [task.get_reference(dataset[i]) for i in range(self.args.limit_start, self.args.limit_start+n_tasks)]

if self.args.check_references:
@@ -52,41 +59,46 @@ def generate_text(self, task_name):
solutions = [[ref] for ref in references]
return solutions, references

generations = parallel_generations(
generations = [] # list[list[str | None] | None]
if intermediate_generations:
generations = [gen for gen in intermediate_generations if gen]
n_tasks -= len(generations)
intermediate_save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}_intermediate.json"
curr_sample_idx = len(generations)

new_generations = parallel_generations(
task,
dataset,
self.accelerator,
self.model,
self.tokenizer,
n_tasks=n_tasks,
args=self.args,
curr_sample_idx=curr_sample_idx, # curr_sample_idx will be added to limit_start to fix indexing
save_every_k_tasks=self.args.save_every_k_tasks,
intermediate_generations=generations,
intermediate_save_generations_path=intermediate_save_generations_path,
)
generations.extend(new_generations)

if len(generations[0]) > self.args.n_samples:
generations = [l[: self.args.n_samples] for l in generations]
warnings.warn(
f"Number of tasks wasn't proportional to number of devices, we removed extra predictions to only keep nsamples={self.args.n_samples}"
)
return generations, references

def evaluate(self, task_name):
def evaluate(self, task_name, intermediate_generations=None):
task = tasks.get_task(task_name, self.args)
if task.requires_execution and not self.allow_code_execution:
raise ValueError(_WARNING)

generations, references = self.generate_text(task_name)
generations, references = self.generate_text(task_name, intermediate_generations=intermediate_generations)

if self.accelerator.is_main_process:
if not self.args.load_generations_path:
if self.args.save_generations:
with open(self.args.save_generations_path, "w") as fp:
json.dump(generations, fp)
print(
f"generations were saved at {self.args.save_generations_path}"
)
if self.args.save_references:
with open(self.args.save_references_path, "w") as fp:
json.dump(references, fp)
print(f"references were saved at {self.args.save_references_path}")
save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}.json"
self.save_json_files(generations, references, save_generations_path, f"references_{task_name}.json")

# make sure tokenizer plays nice with multiprocessing
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -95,3 +107,19 @@ def evaluate(self, task_name):
print("Evaluating generations...")
results = task.process_results(generations, references)
return results

def save_json_files(
self,
generations: List[str],
references: List[str],
save_generations_path: str,
save_references_path: str,
) -> None:
if self.args.save_generations:
with open(save_generations_path, "w") as fp:
json.dump(generations, fp)
print(f"generations were saved at {save_generations_path}")
if self.args.save_references:
with open(save_references_path, "w") as fp:
json.dump(references, fp)
print(f"references were saved at {save_references_path}")
23 changes: 20 additions & 3 deletions bigcode_eval/generation.py
@@ -1,6 +1,8 @@
import json
from math import ceil

from typing import List, Optional

from accelerate.utils import set_seed
from torch.utils.data.dataloader import DataLoader
from transformers import StoppingCriteria, StoppingCriteriaList
@@ -37,7 +39,19 @@ def __call__(self, input_ids, scores, **kwargs):
return input_ids.shape[1] > int(self.input_length * self.multiplier)


def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks, args):
def parallel_generations(
task,
dataset,
accelerator,
model,
tokenizer,
n_tasks,
args,
curr_sample_idx: int = 0,
save_every_k_tasks: int = -1,
intermediate_generations: Optional[List[Optional[List[Optional[str]]]]] = None,
intermediate_save_generations_path: Optional[str] = None,
):
if args.load_generations_path:
# load generated code
with open(args.load_generations_path) as fp:
@@ -100,7 +114,7 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,
tokenizer,
num_devices=accelerator.state.num_processes,
max_length=args.max_length_generation,
limit_start=args.limit_start,
limit_start=args.limit_start + curr_sample_idx,
n_tasks=n_tasks,
n_copies=n_copies,
prefix=args.prefix,
@@ -131,12 +145,15 @@ def parallel_generations(task, dataset, accelerator, model, tokenizer, n_tasks,
tokenizer,
ds_loader,
n_tasks=n_tasks,
limit_start=args.limit_start,
limit_start=args.limit_start + curr_sample_idx,
batch_size=args.batch_size,
prefix=args.prefix,
instruction_tokens=instruction_tokens,
postprocess=args.postprocess,
is_wrapped=is_loaded_in_8bit or is_loaded_in_4bit,
save_every_k_tasks=save_every_k_tasks,
intermediate_generations=intermediate_generations,
intermediate_save_generations_path=intermediate_save_generations_path,
**gen_kwargs,
)
return generations
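To make the limit_start + curr_sample_idx offset concrete, here is a small worked example with made-up numbers (not from the PR):

# Illustrative numbers only: resuming a run that already produced 40 tasks.
limit_start = 100      # user asked to skip the first 100 dataset rows
already_done = 40      # tasks recovered from intermediate generations
curr_sample_idx = already_done

n_tasks_requested = 200
n_tasks = n_tasks_requested - already_done       # 160 tasks left to generate
first_row = limit_start + curr_sample_idx        # generation resumes at row 140
assert (n_tasks, first_row) == (160, 140)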
85 changes: 70 additions & 15 deletions bigcode_eval/utils.py
@@ -1,6 +1,9 @@
import json
import math
import re
import warnings
from collections import defaultdict
from typing import List, Optional

import torch
from torch.utils.data import IterableDataset
@@ -49,7 +52,7 @@ def __iter__(self):
prompts_encoder = []
infill = []
instruction = []
for sample in range(self.limit_start, self.limit_start+self.n_tasks):
for sample in range(self.limit_start, self.limit_start + self.n_tasks):
prompt_contents = self.task.get_prompt(self.dataset[sample])
if isinstance(prompt_contents, str):
# Normal code completion mode
@@ -111,8 +114,6 @@ def __iter__(self):
return_token_type_ids=return_token_type_ids,
)



if self.n_copies == 1 and self.n_tasks % self.num_devices != 0:
self.n_copies = 2
warnings.warn(
@@ -127,7 +128,9 @@ def __iter__(self):
"ids_encoder": outputs_encoder.input_ids[sample],
"task_id": sample,
"input_len": outputs.attention_mask[sample].sum(),
"input_len_encoder": outputs_encoder.attention_mask[sample].sum(),
"input_len_encoder": outputs_encoder.attention_mask[
sample
].sum(),
}
else:
yield {
@@ -231,14 +234,20 @@ def complete_code(
instruction_tokens=None,
postprocess=True,
is_wrapped=False,
save_every_k_tasks: int = -1,
intermediate_generations: Optional[List[Optional[List[Optional[str]]]]] = None,
intermediate_save_generations_path: Optional[str] = None,
**gen_kwargs,
):
"""Generate multiple codes for each task in the dataset using multiple GPUs with accelerate.
dataloader sends all the prompts from the evaluation dataset to the model as the following:
[p_0_0, p_0_1, ..., p_0_nc-1, p_1_0, ..., p_nt-1_nc-1] where nc is the number of copies of the prompt,
and nt is the number of tasks. nc is such that num_samples(for each task)= nc * batch_size
"""

# keep track of the list of generated codes
# where len(code_gens) = n_tasks and len(code_gens[0]) = number of generated code samples
code_gens: List[List[Optional[str]]] = [[] for _ in range(n_tasks)]
intermediate_generations = [] if not intermediate_generations else intermediate_generations
gen_token_dict = defaultdict(list) # dict of list of generated tokens
for step, batch in tqdm(
enumerate(dataloader),
@@ -251,12 +260,14 @@ def complete_code(
# Set the start_length after which to check for stopping to be the longest input ignoring padding
max_len = batch["input_len"].max().item()
if "ids_encoder" in batch:
max_len += 1 # Add 1 for decoder_start_token_id
max_len += 1 # Add 1 for decoder_start_token_id
gen_kwargs["stopping_criteria"][0].start_length = max_len
if hasattr(task, "max_length_multiplier") and task.max_length_multiplier:
idx = 1 if task.stop_words else 0
gen_kwargs["stopping_criteria"][idx].input_length = batch["input_len"].max().item()

gen_kwargs["stopping_criteria"][idx].input_length = (
batch["input_len"].max().item()
)

inputs = batch["ids"][:, : batch["input_len"]]
if "ids_encoder" in batch:
if is_wrapped:
@@ -306,7 +317,55 @@ def complete_code(
for sample, generated_tokens in zip(generated_tasks, generated_tokens):
gen_token_dict[sample].append(generated_tokens)

code_gens = [[] for _ in range(n_tasks)]
if save_every_k_tasks >= 1 and (step + 1) % save_every_k_tasks == 0:
if not intermediate_save_generations_path:
raise ValueError(
"intermediate_save_generations_path cannot be empty!"
)

code_gens = update_code_gens(
task,
tokenizer,
limit_start,
prefix,
instruction_tokens,
postprocess,
code_gens,
gen_token_dict,
)
with open(intermediate_save_generations_path, "w") as fp:
intermediate_generations.extend(code_gens)
Reviewer (Contributor):
If there are multiple saving steps, I think we'll add the same generations several times into intermediate_generations. This list is also extended in evaluator.py:

generations.extend(new_generations)
Author (Collaborator):
You're right, I made two changes in the new commit:

  1. Instead of extending intermediate_generations when saving, which would duplicate the new generations, extend a deepcopy instead, which prevents duplication.
  2. Instead of extending in the evaluator, return intermediate_generations extended with code_gens from complete_code, which fits the new logic better: since we never mutate intermediate_generations when saving, this returns the non-duplicated generations.
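A minimal sketch of the non-duplicating save described in the reply above; save_intermediate and its arguments are illustrative names, not code from the PR:

import copy
import json

def save_intermediate(intermediate_generations, code_gens, path):
    # Extend a deepcopy so the accumulated list passed in by the caller is never
    # mutated during a save step; repeated save steps therefore cannot re-append
    # the same generations.
    snapshot = copy.deepcopy(intermediate_generations)
    snapshot.extend(code_gens)
    with open(path, "w") as fp:
        json.dump(snapshot, fp)
    return snapshot  # non-duplicated generations for the caller to keep using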

json.dump(intermediate_generations, fp)
print(
f"intermediate generations were saved at {intermediate_save_generations_path}"
)
# reset gen_token_dict - prevent redundant decoding
gen_token_dict = defaultdict(list)

code_gens = update_code_gens(
task,
tokenizer,
limit_start,
prefix,
instruction_tokens,
postprocess,
code_gens,
gen_token_dict,
)

return code_gens


def update_code_gens(
task,
tokenizer,
limit_start,
prefix,
instruction_tokens,
postprocess,
code_gens,
gen_token_dict,
):
for sample, generated_tokens in gen_token_dict.items():
for s in generated_tokens:
if INFILL_MODE or tokenizer.eos_token in task.stop_words:
@@ -315,7 +374,7 @@ def complete_code(
# Treat eos token as a regular stop word not removing it from the output
# If it's removed it may have the effect of removing it in the middle of a
# longer generation in case a batch size > 1 is used, which will result in
# a wrong generation as it won't be used for splitting lateron
# a wrong generation as it won't be used for splitting lateron
gen_code = tokenizer.decode(
s, skip_special_tokens=False, clean_up_tokenization_spaces=False
)
Expand All @@ -338,13 +397,9 @@ def complete_code(
"model output is not postprocessed, this might lower evaluation scores"
)
code_gens[sample].append(gen_code)

return code_gens


import re


def remove_after_return(code):
"""
Takes as input a code, and removes everything that is after the return.
@@ -361,6 +416,6 @@ def remove_after_return(code):
and start_match < len(code)
and code[start_match].strip() != ""
):
return code[0:start_match]
return code[0: start_match]
end_last_match = end_match
return code
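For context, remove_after_return truncates a completion once code continues after a return; a usage sketch with an illustrative completion string, where the expected result follows the docstring rather than a traced run:

completion = (
    "def add(a, b):\n"
    "    return a + b\n"
    "print(add(1, 2))\n"  # stray top-level code after the return
)
cleaned = remove_after_return(completion)
# Per the docstring, everything after the return should be dropped, leaving
# roughly "def add(a, b):\n    return a + b\n".
print(cleaned)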