Add 2 robustness tasks based on MRPC #1

Open · wants to merge 18 commits into main
25,703 changes: 25,703 additions & 0 deletions MRPC Confirmation.ipynb

Large diffs are not rendered by default.

893 changes: 893 additions & 0 deletions MRPC Negative.ipynb

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions README.md
@@ -14,9 +14,8 @@ To benchmark a baseline GPT-2 model with WMT and TyDiQA datasets on GPU, run

```shell
python3 -m evaluation.eval \
-    --model_name_or_path gpt2 \
-    --eval_tasks wmt tydiqa_secondary \
-    --device cuda \
+    --model_name_or_path bigscience/T0_3B \
+    --eval_tasks mrpc_confirmation mrpc_negative \
    --output_dir outputs
```

15 changes: 15 additions & 0 deletions What_Remains_To_Add.md
@@ -0,0 +1,15 @@
# Datasets


WMT19
IMDB
AG News
RTE
MRPC

# Models

T0 (and its descendants)
XGLM and its descendants (see https://huggingface.co/facebook/xglm-564M)
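
All of the datasets above are available on the Hugging Face hub; as a quick sketch of loading them (split choices are illustrative, not taken from this PR):

```python
from datasets import load_dataset

# WMT19 kk-en matches the pair configured in english.json later in this diff.
wmt = load_dataset("wmt19", "kk-en", split="validation")
imdb = load_dataset("imdb", split="test")
ag_news = load_dataset("ag_news", split="test")
rte = load_dataset("glue", "rte", split="validation")
mrpc = load_dataset("glue", "mrpc", split="validation")
```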


21 changes: 21 additions & 0 deletions classification_experiement.sh
@@ -0,0 +1,21 @@
export HF_DATASETS_CACHE="/gpfswork/rech/tts/unm25jp/datasets"
# Launch one SLURM job per (dataset, experiment, model) triple.
for dataset in "mnli" "rte" "mrpc" "wmt" "imdb" "emotion" "ag-news"; do
  for exp in "two-sentences-classification" "single-sentence-classification"; do
    for MODEL_NAME in t5-small t5-base t5-large t5-3b bigscience/T0_3B bigscience/T0pp bigscience/T0p bigscience/T0 openai-gpt gpt2 distilgpt2 EleutherAI/gpt-neo-125M EleutherAI/gpt-neo-1.3B EleutherAI/gpt-j-6B EleutherAI/gpt-neo-2.7B; do
      sbatch --job-name=${MODEL_NAME}${exp}${dataset} \
        --gres=gpu:1 \
        --account=six@gpu \
        --no-requeue \
        --cpus-per-task=10 \
        --hint=nomultithread \
        --time=5:00:00 \
        -C v100-32g \
        --output=jobinfo/${MODEL_NAME}${exp}${dataset}_%j.out \
        --error=jobinfo/${MODEL_NAME}${exp}${dataset}_%j.err \
        --qos=qos_gpu-t3 \
        --wrap="module purge; module load pytorch-gpu/py3/1.7.0; python evaluation/eval.py --model_name_or_path ${MODEL_NAME} --eval_tasks ${exp} --dataset_name ${dataset} --output_dir outputs --tag ${MODEL_NAME}${exp}${dataset}"
    done
  done
done
53 changes: 49 additions & 4 deletions evaluation/eval.py
@@ -4,8 +4,14 @@
from typing import List, Optional

import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments, set_seed
+from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, \
+    TrainingArguments, set_seed
+
+import sys
+
+sys.path.append(os.path.join(os.getcwd(), 'evaluation'))
+sys.path.append(os.path.join(os.getcwd(), 'single-sentence-classification'))
+sys.path.append(os.getcwd())
import evaluation.tasks # noqa: F401
from evaluation.tasks.auto_task import AutoTask
from evaluation.utils.log import get_logger
@@ -17,6 +23,9 @@ class EvaluationArguments:
    Arguments for any adjustable params in this evaluation script
    """

+    dataset_name: str = field(
+        metadata={"help": "The name (or path) of the dataset to evaluate on."}
+    )
    model_name_or_path: str = field(
        metadata={"help": "The model checkpoint that we want to evaluate, could be name or the path."}
    )
@@ -32,6 +41,34 @@

    data_dir: Optional[str] = field(default=None, metadata={"help": "Path to the local dataset folder"})

+    do_sample: Optional[bool] = field(
+        default=False, metadata={"help": "Whether to sample instead of decoding greedily."}
+    )
+    use_multi_gpu: Optional[bool] = field(
+        default=False, metadata={"help": "Whether to parallelize the model across multiple GPUs."}
+    )
+    early_stopping: Optional[bool] = field(
+        default=False, metadata={"help": "Whether to stop beam search as soon as num_beams finished candidates exist."}
+    )
+    min_length: Optional[int] = field(
+        default=None, metadata={"help": "Minimum length of the generated sequence."}
+    )
+    num_beams: Optional[int] = field(
+        default=None, metadata={"help": "Number of beams for beam search."}
+    )
+    temperature: Optional[float] = field(
+        default=None, metadata={"help": "Sampling temperature; only meaningful when do_sample is true."}
+    )
+    top_k: Optional[int] = field(
+        default=None, metadata={"help": "Number of highest-probability vocabulary tokens to keep for top-k filtering."}
+    )
+    top_p: Optional[float] = field(
+        default=None, metadata={"help": "Cumulative probability threshold for nucleus (top-p) filtering."}
+    )
+    repetition_penalty: Optional[float] = field(
+        default=None, metadata={"help": "Penalty for repeated tokens; 1.0 means no penalty."}
+    )
+    length_penalty: Optional[float] = field(
+        default=None,
+        metadata={"help": "Exponential length penalty for beam search; 1.0 means no penalty, values > 1.0 favor longer sequences."}
+    )


def main():
    parser = HfArgumentParser((EvaluationArguments, TrainingArguments))
@@ -63,14 +100,21 @@ def main():
    tokenizer = AutoTokenizer.from_pretrained(eval_args.tokenizer_name or eval_args.model_name_or_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"

-    model = AutoModelForCausalLM.from_pretrained(
+    if "t5" in eval_args.model_name_or_path.lower() or "t0" in eval_args.model_name_or_path.lower():
+        MODEL_TYPE = AutoModelForSeq2SeqLM
+    else:
+        MODEL_TYPE = AutoModelForCausalLM
+    model = MODEL_TYPE.from_pretrained(
        eval_args.model_name_or_path,
        pad_token_id=tokenizer.eos_token_id,
    )
+    if eval_args.use_multi_gpu:
+        model.parallelize()
    model.config.pad_token_id = model.config.eos_token_id
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)
    model.eval()

    # Exporting results
    tag = eval_args.tag or datetime.now().strftime("%y%m%d_%H%M%S")
@@ -82,13 +126,14 @@
        task = AutoTask.from_task_name(
            eval_task,
            model=model,
+            args=eval_args,
            tokenizer=tokenizer,
            device=device,
            english_only=eval_args.english_only,
            data_dir=eval_args.data_dir,
        )
        set_seed(train_args.seed)
-        task.evaluate()
+        task.evaluate(dataset_name=eval_args.dataset_name)
        task.save_metrics(output_dir, logger)


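The new `EvaluationArguments` fields mirror the `transformers` generation parameters one-to-one, but the diff does not show where the tasks consume them. A plausible helper, not part of this PR, that forwards only the explicitly set values to `model.generate`:

```python
from typing import Any, Dict


def build_generate_kwargs(eval_args) -> Dict[str, Any]:
    """Collect generation settings from EvaluationArguments, dropping unset (None) values."""
    candidates = {
        "do_sample": eval_args.do_sample,
        "early_stopping": eval_args.early_stopping,
        "min_length": eval_args.min_length,
        "num_beams": eval_args.num_beams,
        "temperature": eval_args.temperature,
        "top_k": eval_args.top_k,
        "top_p": eval_args.top_p,
        "repetition_penalty": eval_args.repetition_penalty,
        "length_penalty": eval_args.length_penalty,
    }
    return {name: value for name, value in candidates.items() if value is not None}


# Usage inside a task, assuming `model` and tokenized `inputs` are in scope:
# outputs = model.generate(**inputs, **build_generate_kwargs(eval_args))
```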
45 changes: 25 additions & 20 deletions evaluation/tasks/auto_task.py
Original file line number Diff line number Diff line change
@@ -7,18 +7,21 @@

from evaluation.models.loader import load_model
from evaluation.utils.io import load_json, save_json
+from argparse import ArgumentParser


class AutoTask(ABC):
    def __init__(
-        self,
-        model: PreTrainedModel,
-        tokenizer: PreTrainedTokenizerFast,
-        device: torch.device,
-        english_only: bool,
-        data_dir: Optional[str] = None,
+        self,
+        args: ArgumentParser,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerFast,
+        device: torch.device,
+        english_only: bool,
+        data_dir: Optional[str] = None,
    ):
        self.model = model
+        self.args = args
        self.tokenizer = tokenizer
        self.device = device
        self.metrics = {}
@@ -35,17 +38,19 @@ def _get_task(cls, task_name):

    @classmethod
    def from_task_name(
-        cls,
-        task_name: str,
-        model: PreTrainedModel,
-        tokenizer: PreTrainedTokenizerFast,
-        device: torch.device,
-        english_only: bool,
-        data_dir: Optional[str] = None,
+        cls,
+        task_name: str,
+        args: ArgumentParser,
+        model: PreTrainedModel,
+        tokenizer: PreTrainedTokenizerFast,
+        device: torch.device,
+        english_only: bool,
+        data_dir: Optional[str] = None,
    ):
        task = cls._get_task(task_name)
        return task(
            model=model,
+            args=args,
            tokenizer=tokenizer,
            device=device,
            english_only=english_only,
@@ -54,13 +59,13 @@ def from_task_name(

    @classmethod
    def from_spec(
        cls,
        task_name: str,
        model_name_or_path: str,
        tokenizer_name: str,
        device: torch.device,
        english_only: bool,
        data_dir: Optional[str] = None,
    ):
        task = cls._get_task(task_name)
        model = load_model(model_name_or_path)
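The MRPC task implementations themselves live in the notebooks, which are not rendered here. As a rough sketch of a subclass wired into the new `args`/`dataset_name` plumbing (class name and prompt template are invented; assumes a seq2seq model such as T0, so the decoded output contains only the answer):

```python
import torch
from datasets import load_dataset

from evaluation.tasks.auto_task import AutoTask


class MRPCConfirmationTask(AutoTask):  # hypothetical name, for illustration only
    """Asks the model to confirm whether two MRPC sentences are paraphrases."""

    def evaluate(self, dataset_name: str = "mrpc") -> None:
        dataset = load_dataset("glue", dataset_name, split="validation")
        correct = 0
        for sample in dataset:
            prompt = (
                f"Sentence 1: {sample['sentence1']}\n"
                f"Sentence 2: {sample['sentence2']}\n"
                "Do these two sentences mean the same thing? Yes or No?"
            )
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                output = self.model.generate(**inputs, max_new_tokens=5)
            answer = self.tokenizer.decode(output[0], skip_special_tokens=True)
            prediction = 1 if "yes" in answer.lower() else 0
            correct += int(prediction == sample["label"])
        self.metrics["accuracy"] = correct / len(dataset)
```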
Empty file.
5 changes: 5 additions & 0 deletions evaluation/tasks/generation-consistency/english.json
@@ -0,0 +1,5 @@
{
    "pair": "kk-en",
    "stride": 512,
    "batch_size": 8
}
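
The code that consumes this config is not part of the diff; presumably the task loads it at runtime, along these lines (path as added by this PR, variable names invented):

```python
import json
import os

CONFIG_PATH = os.path.join("evaluation", "tasks", "generation-consistency", "english.json")

with open(CONFIG_PATH, encoding="utf-8") as f:
    cfg = json.load(f)

pair = cfg["pair"]            # WMT19 language pair, e.g. "kk-en"
stride = cfg["stride"]        # sliding-window stride for long inputs
batch_size = cfg["batch_size"]
```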