Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alternative evals #675

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`.
- Added support for flash attention and gradient checkpointing to `hf_olmo`.
- Added `basic_algebra` downstream task.

## [v0.5.0](https://github.com/allenai/OLMo/releases/tag/v0.5.0) - 2024-08-26

Expand Down
27 changes: 27 additions & 0 deletions olmo/eval/downstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,32 @@ def __init__(
)


class BasicAlgebra(ArcEasy):
    """Downstream basic-algebra task; prompts are formatted exactly like ArcEasy.

    Each instance looks like::

        {"id": "q793_max1d_max4",
         "question": "a = 4; b = 4; c = 9; d = 7; y = a - b + c - d; print(y);",
         "choices": {"text": ["-23", "2", "18", "-15"],
                     "label": ["A", "B", "C", "D"]},
         "answerKey": "B", "type_tag": "medium"}
    """

    # Scored by plain accuracy, same as the parent ArcEasy task.
    metric_type = "acc"

    def __init__(self, tokenizer, dataset_path="allenai/basic_algebra", dataset_name=None):
        # All prompt construction / scoring logic lives in ArcEasy; this class
        # only points it at the basic-algebra dataset.
        super().__init__(tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name)


class CommonsenseQA(ArcEasy):
"""CommonsenseQA
Example:
Expand Down Expand Up @@ -1612,6 +1638,7 @@ def doc_to_label(self, doc) -> int:
"arc_easy_ppl": ArcEasyCELoss,
"arc_challenge": ArcChallenge,
"basic_arithmetic": BasicArithmetic,
"basic_algebra": BasicAlgebra,
"copa": COPA,
"rte": RTE,
"commitment_bank": CommitmentBank,
Expand Down
3 changes: 2 additions & 1 deletion olmo/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,8 @@ def system_metrics(self) -> Dict[str, float]:
metrics["System/Peak GPU Memory (MB)"] = peak_gpu_mb
return metrics

def log_metrics_to_console(self, prefix: str, metrics: Dict[str, float]):
@classmethod
def log_metrics_to_console(cls, prefix: str, metrics: Dict[str, float]):
def format_float(value: float) -> str:
if value < 0.0001:
return str(value) # scientific notation
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{
"builder_name": "json",
"citation": "",
"config_name": "default",
"dataset_name": "json",
"dataset_size": 136672,
"description": "",
"download_checksums": {
"/Users/akshitab/local/code/OLMo/basic_algebra.jsonl": {
"num_bytes": 189088,
"checksum": null
}
},
"download_size": 189088,
"features": {
"id": {
"dtype": "string",
"_type": "Value"
},
"question": {
"dtype": "string",
"_type": "Value"
},
"choices": {
"text": {
"feature": {
"dtype": "string",
"_type": "Value"
},
"_type": "Sequence"
},
"label": {
"feature": {
"dtype": "string",
"_type": "Value"
},
"_type": "Sequence"
}
},
"answerKey": {
"dtype": "string",
"_type": "Value"
},
"type_tag": {
"dtype": "string",
"_type": "Value"
}
},
"homepage": "",
"license": "",
"size_in_bytes": 325760,
"splits": {
"train": {
"name": "train",
"num_bytes": 136672,
"num_examples": 1008,
"dataset_name": "json"
}
},
"version": {
"version_str": "0.0.0",
"major": 0,
"minor": 0,
"patch": 0
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"_data_files": [
{
"filename": "data-00000-of-00001.arrow"
}
],
"_fingerprint": "d2abd53a1a2e1035",
"_format_columns": null,
"_format_kwargs": {},
"_format_type": null,
"_output_all_columns": false,
"_split": "train"
}
91 changes: 91 additions & 0 deletions scripts/add_math_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""
Script to create downstream eval datasets for math.
"""
import random
from typing import Any, Dict

import numpy as np
import pandas as pd

from olmo.torch_util import seed_all

seed_all(6198)


def run(save_to: str):
    """Generate a multiple-choice basic-algebra eval set and write it as JSONL.

    Creates ~1000 questions of the form
    ``a = 4; b = -7; y = a + b; print(y);`` with four answer choices
    (one correct, three distinct distractors), stratified over
    1-4 variables and 1-3 digit magnitudes (84 questions per cell,
    1008 total).

    :param save_to: Path of the JSONL file to write (one instance per line).
    :returns: The list of instance dicts that were written.
    """
    num_questions = 1000

    # Easy mode: additions and subtractions only.
    simple_operations = ["+", "-"]
    # operations = ['+', '-', '*', '/']

    def build_question(values, operations):
        """Render the assignment/print snippet and compute its answer.

        The answer is folded left-to-right, which matches Python's
        left-associative `+`/`-`, so it equals what exec'ing the snippet
        would print.
        """
        assignments = []
        y_eqn = "y ="
        answer = int(values[0])
        for i, val in enumerate(values):
            var_name = chr(ord("a") + i)
            assignments.append(f"{var_name} = {val}; ")
            y_eqn += f" {var_name}"
            if i < len(values) - 1:
                y_eqn += f" {operations[i]}"
        for op, val in zip(operations, values[1:]):
            answer = answer + int(val) if op == "+" else answer - int(val)
        question = "".join(assignments) + f"{y_eqn}; print(y);"
        return question, answer

    # 84 questions per (var_count, num_digits) cell -> 4 * 3 * 84 = 1008 total.
    how_many = int(np.ceil(num_questions / (4 * 3)))
    q_count = 0
    instances = []
    labels = ["A", "B", "C", "D"]
    for var_count in [1, 2, 3, 4]:
        num_ops = var_count - 1
        for num_digits in [1, 2, 3]:
            min_val = -(10**num_digits)
            max_val = 10**num_digits
            # With only +/- the result is bounded by var_count * max |value|,
            # so every possible answer lies in [min_output, max_output).
            max_output = max_val * var_count
            min_output = -max_output
            for _ in range(how_many):
                values = np.random.randint(min_val, max_val, size=var_count)
                ops = [simple_operations[i] for i in np.random.randint(0, 2, num_ops)]

                # eqn = 'a = 2\nb = 3\nc = -7\ny = a + b - c\nprint(y)\n12
                question, answer = build_question(values, ops)

                # Draw three DISTINCT distractors, none equal to the answer
                # (the naive random.choice approach could repeat a choice).
                wrong: set = set()
                while len(wrong) < 3:
                    candidate = random.randrange(min_output, max_output)
                    if candidate != answer:
                        wrong.add(candidate)
                choices = [str(answer)] + [str(w) for w in wrong]
                random.shuffle(choices)
                answer_key = labels[choices.index(str(answer))]

                q_count += 1

                id_str = f"q{q_count}_max{num_digits}d_max{var_count}"

                # Difficulty tag: single variable -> easy; otherwise small
                # magnitudes -> medium, 3-digit operands -> hard.
                if var_count < 2:
                    type_tag = "easy"
                elif num_digits <= 2:
                    type_tag = "medium"
                else:
                    type_tag = "hard"

                instance = {
                    "id": id_str,
                    "question": question,
                    "choices": {"text": choices, "label": labels},
                    "answerKey": answer_key,
                    "type_tag": type_tag,
                }
                instances.append(instance)

    random.shuffle(instances)
    df = pd.DataFrame.from_records(instances)
    df.to_json(save_to, lines=True, compression=None, orient="records")
    return instances


if __name__ == "__main__":
import sys

run(sys.argv[1])
17 changes: 10 additions & 7 deletions scripts/beaker/ladder-launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,29 @@ else
fi

gantry run \
--workspace ai2/OLMo-training \
--workspace ai2/OLMo-tiny \
--task-name ladder \
--description "OLMo ladder with $*" \
--priority normal \
--priority urgent \
--preemptible \
--beaker-image shanea/olmo-torch2.2-gantry \
--beaker-image shanea/olmo-torch23-gantry \
--cluster ai2/jupiter-cirrascale-2 \
--weka=oe-training-default:/weka/oe-training-default \
--cluster ai2/pluto-cirrascale \
--cluster ai2/allennlp-cirrascale \
--gpus 8 \
$MULTI_NODE_ARGS \
--budget ai2/oe-training \
--no-nfs \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env OLMO_TASK=model \
--env-secret WANDB_API_KEY=DIRKG_WANDB_API_KEY \
--env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \
--env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \
--shared-memory 10GiB \
--venv base \
--yes \
--timeout=-1 \
-- /bin/bash -c "${COMMAND}"

# --weka=oe-training-default:/weka/oe-training-default \
6 changes: 6 additions & 0 deletions scripts/beaker/ladder.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ shift
BEAKER_REPLICA_RANK=$1
shift

## Install flash attn
pip install packaging ninja
export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
pip install flash-attn==2.5.9.post1 --no-build-isolation
pip install '.[train]'

torchrun \
--nnodes ${NUM_NODES}:${NUM_NODES} \
--nproc-per-node 8 \
Expand Down
1 change: 1 addition & 0 deletions scripts/ladder.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ def config_from_args(args: argparse.Namespace) -> TrainConfig:
EvaluatorConfig(label="mmlu_social_sciences_mc_5shot_test", type=EvaluatorType.downstream),
EvaluatorConfig(label="mmlu_other_mc_5shot_test", type=EvaluatorType.downstream),
EvaluatorConfig(label="basic_arithmetic", type=EvaluatorType.downstream),
EvaluatorConfig(label="basic_algebra", type=EvaluatorType.downstream),
EvaluatorConfig(label="trivia_qa_wiki_ppl", type=EvaluatorType.downstream),
EvaluatorConfig(label="natural_qs_open_ppl", type=EvaluatorType.downstream),
EvaluatorConfig(label="arc_easy_ppl", type=EvaluatorType.downstream),
Expand Down
Loading
Loading