Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alternative evals #675

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- Added ability to try loading latest checkpoint from save folder using `--try_load_latest_save`.
- Added support for flash attention and gradient checkpointing to `hf_olmo`.
- Added `basic_algebra` downstream task.

## [v0.5.0](https://github.com/allenai/OLMo/releases/tag/v0.5.0) - 2024-08-26

Expand Down
27 changes: 27 additions & 0 deletions olmo/eval/downstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,6 +855,32 @@ def __init__(
)


class BasicAlgebra(ArcEasy):
    """Downstream basic-algebra task; prompts are formatted exactly like ArcEasy.

    Each instance looks like::

        {"id": "q793_max1d_max4",
         "question": "a = 4; b = 4; c = 9; d = 7; y = a - b + c - d; print(y);",
         "choices": {"text": ["-23", "2", "18", "-15"],
                     "label": ["A", "B", "C", "D"]},
         "answerKey": "B", "type_tag": "medium"}
    """

    # Scored by plain accuracy, same as the parent ArcEasy task.
    metric_type = "acc"

    def __init__(self, tokenizer, dataset_path="allenai/basic_algebra", dataset_name=None):
        # All prompt construction / scoring logic lives in ArcEasy; this class
        # only points it at the basic-algebra dataset.
        super().__init__(tokenizer=tokenizer, dataset_path=dataset_path, dataset_name=dataset_name)


class CommonsenseQA(ArcEasy):
"""CommonsenseQA
Example:
Expand Down Expand Up @@ -1612,6 +1638,7 @@ def doc_to_label(self, doc) -> int:
"arc_easy_ppl": ArcEasyCELoss,
"arc_challenge": ArcChallenge,
"basic_arithmetic": BasicArithmetic,
"basic_algebra": BasicAlgebra,
"copa": COPA,
"rte": RTE,
"commitment_bank": CommitmentBank,
Expand Down
3 changes: 2 additions & 1 deletion olmo/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,8 @@ def system_metrics(self) -> Dict[str, float]:
metrics["System/Peak GPU Memory (MB)"] = peak_gpu_mb
return metrics

def log_metrics_to_console(self, prefix: str, metrics: Dict[str, float]):
@classmethod
def log_metrics_to_console(cls, prefix: str, metrics: Dict[str, float]):
def format_float(value: float) -> str:
if value < 0.0001:
return str(value) # scientific notation
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{
"builder_name": "json",
"citation": "",
"config_name": "default",
"dataset_name": "json",
"dataset_size": 136672,
"description": "",
"download_checksums": {
"/Users/akshitab/local/code/OLMo/basic_algebra.jsonl": {
"num_bytes": 189088,
"checksum": null
}
},
"download_size": 189088,
"features": {
"id": {
"dtype": "string",
"_type": "Value"
},
"question": {
"dtype": "string",
"_type": "Value"
},
"choices": {
"text": {
"feature": {
"dtype": "string",
"_type": "Value"
},
"_type": "Sequence"
},
"label": {
"feature": {
"dtype": "string",
"_type": "Value"
},
"_type": "Sequence"
}
},
"answerKey": {
"dtype": "string",
"_type": "Value"
},
"type_tag": {
"dtype": "string",
"_type": "Value"
}
},
"homepage": "",
"license": "",
"size_in_bytes": 325760,
"splits": {
"train": {
"name": "train",
"num_bytes": 136672,
"num_examples": 1008,
"dataset_name": "json"
}
},
"version": {
"version_str": "0.0.0",
"major": 0,
"minor": 0,
"patch": 0
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"_data_files": [
{
"filename": "data-00000-of-00001.arrow"
}
],
"_fingerprint": "d2abd53a1a2e1035",
"_format_columns": null,
"_format_kwargs": {},
"_format_type": null,
"_output_all_columns": false,
"_split": "train"
}
91 changes: 91 additions & 0 deletions scripts/add_math_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""
Script to create downstream eval datasets for math.
"""
import random
from typing import Any, Dict

import numpy as np
import pandas as pd

from olmo.torch_util import seed_all

seed_all(6198)


def run(save_to: str):
    """Generate a multiple-choice basic-algebra eval set and write it as JSONL.

    Creates ~1000 questions of the form
    ``a = 4; b = -7; y = a + b; print(y);`` with four answer choices
    (one correct, three distinct distractors), stratified over
    1-4 variables and 1-3 digit magnitudes (84 questions per cell,
    1008 total).

    :param save_to: Path of the JSONL file to write (one instance per line).
    :returns: The list of instance dicts that were written.
    """
    num_questions = 1000

    # Easy mode: additions and subtractions only.
    simple_operations = ["+", "-"]
    # operations = ['+', '-', '*', '/']

    def build_question(values, operations):
        """Render the assignment/print snippet and compute its answer.

        The answer is folded left-to-right, which matches Python's
        left-associative `+`/`-`, so it equals what exec'ing the snippet
        would print.
        """
        assignments = []
        y_eqn = "y ="
        answer = int(values[0])
        for i, val in enumerate(values):
            var_name = chr(ord("a") + i)
            assignments.append(f"{var_name} = {val}; ")
            y_eqn += f" {var_name}"
            if i < len(values) - 1:
                y_eqn += f" {operations[i]}"
        for op, val in zip(operations, values[1:]):
            answer = answer + int(val) if op == "+" else answer - int(val)
        question = "".join(assignments) + f"{y_eqn}; print(y);"
        return question, answer

    # 84 questions per (var_count, num_digits) cell -> 4 * 3 * 84 = 1008 total.
    how_many = int(np.ceil(num_questions / (4 * 3)))
    q_count = 0
    instances = []
    labels = ["A", "B", "C", "D"]
    for var_count in [1, 2, 3, 4]:
        num_ops = var_count - 1
        for num_digits in [1, 2, 3]:
            min_val = -(10**num_digits)
            max_val = 10**num_digits
            # With only +/- the result is bounded by var_count * max |value|,
            # so every possible answer lies in [min_output, max_output).
            max_output = max_val * var_count
            min_output = -max_output
            for _ in range(how_many):
                values = np.random.randint(min_val, max_val, size=var_count)
                ops = [simple_operations[i] for i in np.random.randint(0, 2, num_ops)]

                # eqn = 'a = 2\nb = 3\nc = -7\ny = a + b - c\nprint(y)\n12
                question, answer = build_question(values, ops)

                # Draw three DISTINCT distractors, none equal to the answer
                # (the naive random.choice approach could repeat a choice).
                wrong: set = set()
                while len(wrong) < 3:
                    candidate = random.randrange(min_output, max_output)
                    if candidate != answer:
                        wrong.add(candidate)
                choices = [str(answer)] + [str(w) for w in wrong]
                random.shuffle(choices)
                answer_key = labels[choices.index(str(answer))]

                q_count += 1

                id_str = f"q{q_count}_max{num_digits}d_max{var_count}"

                # Difficulty tag: single variable -> easy; otherwise small
                # magnitudes -> medium, 3-digit operands -> hard.
                if var_count < 2:
                    type_tag = "easy"
                elif num_digits <= 2:
                    type_tag = "medium"
                else:
                    type_tag = "hard"

                instance = {
                    "id": id_str,
                    "question": question,
                    "choices": {"text": choices, "label": labels},
                    "answerKey": answer_key,
                    "type_tag": type_tag,
                }
                instances.append(instance)

    random.shuffle(instances)
    df = pd.DataFrame.from_records(instances)
    df.to_json(save_to, lines=True, compression=None, orient="records")
    return instances


if __name__ == "__main__":
import sys

run(sys.argv[1])
17 changes: 10 additions & 7 deletions scripts/beaker/ladder-launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,29 @@ else
fi

gantry run \
--workspace ai2/OLMo-training \
--workspace ai2/OLMo-tiny \
--task-name ladder \
--description "OLMo ladder with $*" \
--priority normal \
--priority urgent \
--preemptible \
--beaker-image shanea/olmo-torch2.2-gantry \
--beaker-image shanea/olmo-torch23-gantry \
--cluster ai2/jupiter-cirrascale-2 \
--weka=oe-training-default:/weka/oe-training-default \
--cluster ai2/pluto-cirrascale \
--cluster ai2/allennlp-cirrascale \
--gpus 8 \
$MULTI_NODE_ARGS \
--budget ai2/oe-training \
--no-nfs \
--env LOG_FILTER_TYPE=local_rank0_only \
--env OMP_NUM_THREADS=8 \
--env OLMO_TASK=model \
--env-secret WANDB_API_KEY=DIRKG_WANDB_API_KEY \
--env-secret AWS_ACCESS_KEY_ID=AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=AWS_SECRET_ACCESS_KEY \
--env-secret WANDB_API_KEY=AKSHITAB_WANDB_API_KEY \
--env-secret AWS_ACCESS_KEY_ID=AKSHITAB_AWS_ACCESS_KEY_ID \
--env-secret AWS_SECRET_ACCESS_KEY=AKSHITAB_AWS_SECRET_ACCESS_KEY \
--shared-memory 10GiB \
--venv base \
--yes \
--timeout=-1 \
-- /bin/bash -c "${COMMAND}"

# --weka=oe-training-default:/weka/oe-training-default \
6 changes: 6 additions & 0 deletions scripts/beaker/ladder.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ shift
BEAKER_REPLICA_RANK=$1
shift

## Install flash attn
pip install packaging ninja
export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE
pip install flash-attn==2.5.9.post1 --no-build-isolation
pip install '.[train]'

torchrun \
--nnodes ${NUM_NODES}:${NUM_NODES} \
--nproc-per-node 8 \
Expand Down
1 change: 1 addition & 0 deletions scripts/ladder.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ def config_from_args(args: argparse.Namespace) -> TrainConfig:
EvaluatorConfig(label="mmlu_social_sciences_mc_5shot_test", type=EvaluatorType.downstream),
EvaluatorConfig(label="mmlu_other_mc_5shot_test", type=EvaluatorType.downstream),
EvaluatorConfig(label="basic_arithmetic", type=EvaluatorType.downstream),
EvaluatorConfig(label="basic_algebra", type=EvaluatorType.downstream),
EvaluatorConfig(label="trivia_qa_wiki_ppl", type=EvaluatorType.downstream),
EvaluatorConfig(label="natural_qs_open_ppl", type=EvaluatorType.downstream),
EvaluatorConfig(label="arc_easy_ppl", type=EvaluatorType.downstream),
Expand Down
Loading
Loading