TurkuNLP · Muennighoff · Jan 27, 2023
diff --git a/tasks/eval_harness/download_bsevalharness.py b/tasks/eval_harness/download_bsevalharness.py
@@ -0,0 +1,33 @@
+# Downloads the specified tasks in the evaluation harness.
+# This is particularly useful when running in environments where the GPU nodes
+# do not have internet access. This way we can pre-download them and use the
+# cached dataset during evaluation.
+
+from lm_eval import tasks
+from lm_eval.tasks import ALL_TASKS
+import argparse
+import os
+
+
+parser = argparse.ArgumentParser(description='Download evaluation harness', allow_abbrev=False)
+parser.add_argument('--task_list', type=str, default = "all", help='Either "all" or comma separated list of tasks to download.')
+args = parser.parse_args()
+
+def main():
+    """Build the requested tasks so that their data ends up in the local cache.
+
+    Reads the module-level ``args.task_list``: the literal ``all`` selects
+    every entry of ``ALL_TASKS``; any other value is split on commas.
+    """
+    if args.task_list == 'all':
+        task_list = ALL_TASKS
+    else:
+        # Tolerate "a, b" and trailing commas: padded or blank entries would
+        # otherwise be handed to the harness verbatim as task names.
+        task_list = [name.strip() for name in args.task_list.split(',') if name.strip()]
+    # The returned dict is not needed; the call is made for its side effect.
+    # NOTE(review): presumably instantiating the tasks triggers the download - confirm against lm_eval.
+    tasks.get_task_dict_promptsource(task_list)
+
+if __name__ == '__main__':
+    main()
diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py
@@ -399,6 +399,7 @@ def tasks_args(parser):
     group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation')
     group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel')
     group.add_argument('--add_denoiser',  default = False, action='store_true', help='Whether to add a denoiser to the model')
+    group.add_argument('--fewshots', type=int, default=0, help='Number of fewshots')
     return parser
 
 from megatron.global_vars import _parse_args
@@ -431,11 +432,11 @@ def main():
         global_results = {"results": {}, "versions": {}}
         timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
         iteration_id = load_path.split("/")[-1].replace("/", "")
-        results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json")
+        results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots.json")
         # Backup file in case of interruption during writing
-        results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json")
+        results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots_backup.json")
         for task_name, task in task_dict.items():
-            results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, None, bootstrap_iters=args.bootstrap_iters)
+            results = evaluator.evaluate(adaptor, {task_name: task}, False, args.fewshots, None, bootstrap_iters=args.bootstrap_iters)
             global_results["results"] = {**global_results["results"], **results["results"]}
             global_results["versions"] = {**global_results["versions"], **results["versions"]}
             if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0:
@@ -445,7 +446,7 @@ def main():
                 with open(results_path_backup, 'w') as outfile:
                     json.dump(global_results, outfile, indent=4)
     else:
-        global_results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters)
+        global_results = evaluator.evaluate(adaptor, task_dict, False, args.fewshots, None, bootstrap_iters=args.bootstrap_iters)
         if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0:
             print(json.dumps(global_results, indent=2))
             with open(args.results_path, 'w') as outfile: