
Commit 989f5f5

adds wandb logging of metrics (#676)
1 parent 69931df commit 989f5f5

File tree

7 files changed, +75 -0 lines changed


docs/source/saving-and-reading-results.mdx

+14
@@ -31,6 +31,20 @@ This will create a Tensorboard dashboard in a HF org set with the `--results-org
 option.
 
 
+## Pushing results to WandB
+
+You can push the results to WandB by setting `--wandb`. This will initialize a WandB
+run and log the results.
+
+WandB arguments need to be set in your environment variables.
+
+```
+export WANDB_PROJECT="lighteval"
+```
+
+You can find a list of variables in the [wandb documentation](https://docs.wandb.ai/guides/track/environment-variables/).
+
+
 ## How to load and investigate details
 
 ### Load from local detail files
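
The docs above only require `WANDB_PROJECT`. As a minimal sketch of a fuller environment setup, the extra variables below are standard wandb settings and are assumptions, not something this commit reads directly:

```
export WANDB_PROJECT="lighteval"   # required: read by the evaluation tracker
export WANDB_ENTITY="my-team"      # optional wandb team/user (assumption)
export WANDB_NAME="gpt2-eval"      # optional run name (assumption)
export WANDB_MODE="offline"        # optional: log locally, sync later (assumption)
```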

pyproject.toml

+1
@@ -110,6 +110,7 @@ multilingual = [
     "pyvi", # for vietnamese tokenizer
 ]
 math = ["latex2sympy2_extended==1.0.6"]
+wandb = ["wandb"]
 
 [project.urls]
 Homepage = "https://github.com/huggingface/lighteval"
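
With the extra declared above, the optional dependency can be installed via pip's extras syntax; a sketch assuming a released package (a source checkout would use `pip install -e ".[wandb]"` instead):

```
pip install "lighteval[wandb]"
```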

src/lighteval/logging/evaluation_tracker.py

+28
@@ -126,6 +126,7 @@ def __init__(
         tensorboard_metric_prefix: str = "eval",
         public: bool = False,
         nanotron_run_info: "GeneralArgs" = None,
+        wandb: bool = False,
     ) -> None:
         """Creates all the necessary loggers for evaluation tracking."""
         self.details_logger = DetailsLogger()
@@ -145,6 +146,7 @@ def __init__(
 
         self.should_push_to_hub = push_to_hub
         self.should_save_details = save_details
+        self.wandb = wandb
 
         self.should_push_results_to_tensorboard = push_to_tensorboard
         self.tensorboard_repo = f"{hub_results_org}/tensorboard_logs"
@@ -153,6 +155,20 @@ def __init__(
 
         self.public = public
 
+        if wandb is True:
+            import wandb
+
+            self.wandb_project = os.environ.get("WANDB_PROJECT", None)
+
+            if self.wandb_project is None:
+                raise ValueError("You need to specify the project name in wandb_args")
+
+            wandb.login()
+            self.wandb_run = wandb.init(
+                project=self.wandb_project,
+                resume="allow",
+            )
+
     @property
     def results(self):
         config_general = asdict(self.general_config_logger)
@@ -222,11 +238,23 @@ def save(self) -> None:
                 results_dict=results_dict,
             )
 
+        if self.wandb is True:
+            self.push_to_wandb(
+                results_dict=results_dict,
+                details_datasets=details_datasets,
+            )
+
         if self.should_push_results_to_tensorboard:
             self.push_to_tensorboard(
                 results=self.metrics_logger.metric_aggregated, details=self.details_logger.compiled_details
             )
 
+    def push_to_wandb(self, results_dict: dict, details_datasets: dict) -> None:
+        self.wandb_run.log(
+            {**results_dict["results"]},
+        )
+        self.wandb_run.finish()
+
     def save_results(self, date_id: str, results_dict: dict):
         output_dir_results = Path(self.output_dir) / "results" / self.general_config_logger.model_name
         self.fs.mkdirs(output_dir_results, exist_ok=True)
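
For reference, a standalone sketch of the wandb run lifecycle the tracker relies on (init, a single log call over the flattened results, then finish); the metric names and values here are hypothetical, not lighteval output:

```
import os
import wandb

# Same precondition as the tracker: the project name must come from the environment.
project = os.environ.get("WANDB_PROJECT")
if project is None:
    raise ValueError("WANDB_PROJECT must be set")

run = wandb.init(project=project, resume="allow")
run.log({"truthfulqa:mc|0": 0.42, "average": 0.42})  # hypothetical metrics
run.finish()  # close the run once the results are pushed
```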

src/lighteval/main_accelerate.py

+8
@@ -85,6 +85,13 @@ def accelerate( # noqa C901
     save_details: Annotated[
         bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
+    wandb: Annotated[
+        bool,
+        Option(
+            help="Push results to wandb. This will only work if you have wandb installed and logged in. We use env variable to configure wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = False,
     # === debug ===
     max_samples: Annotated[
         Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
@@ -112,6 +119,7 @@ def accelerate( # noqa C901
         push_to_tensorboard=push_to_tensorboard,
         public=public_run,
         hub_results_org=results_org,
+        wandb=wandb,
     )
     pipeline_params = PipelineParameters(
         launcher_type=ParallelismManager.ACCELERATE,
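
A hypothetical invocation of the accelerate backend with the new flag; the model and task arguments below are placeholders and not part of this commit:

```
export WANDB_PROJECT="lighteval"
lighteval accelerate "model_name=openai-community/gpt2" "leaderboard|truthfulqa:mc|0|0" --wandb
```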

src/lighteval/main_endpoint.py

+8
@@ -87,6 +87,13 @@ def inference_endpoint(
     save_details: Annotated[
         bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
+    wandb: Annotated[
+        bool,
+        Option(
+            help="Push results to wandb. This will only work if you have wandb installed and logged in. We use env variable to configure wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = False,
     # === debug ===
     max_samples: Annotated[
         Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
@@ -109,6 +116,7 @@ def inference_endpoint(
         push_to_tensorboard=push_to_tensorboard,
         public=public_run,
         hub_results_org=results_org,
+        wandb=wandb,
     )
 
     parallelism_manager = ParallelismManager.NONE  # since we're using inference endpoints in remote

src/lighteval/main_sglang.py

+8
@@ -78,6 +78,13 @@ def sglang(
     save_details: Annotated[
         bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
+    wandb: Annotated[
+        bool,
+        Option(
+            help="Push results to wandb. This will only work if you have wandb installed and logged in. We use env variable to configure wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = False,
     # === debug ===
     max_samples: Annotated[
         Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
@@ -102,6 +109,7 @@ def sglang(
         push_to_tensorboard=push_to_tensorboard,
         public=public_run,
         hub_results_org=results_org,
+        wandb=wandb,
     )
 
     pipeline_params = PipelineParameters(

src/lighteval/main_vllm.py

+8
@@ -81,6 +81,13 @@ def vllm(
     save_details: Annotated[
         bool, Option(help="Save detailed, sample per sample, results.", rich_help_panel=HELP_PANEL_NAME_2)
     ] = False,
+    wandb: Annotated[
+        bool,
+        Option(
+            help="Push results to wandb. This will only work if you have wandb installed and logged in. We use env variable to configure wandb. see here: https://docs.wandb.ai/guides/track/environment-variables/",
+            rich_help_panel=HELP_PANEL_NAME_2,
+        ),
+    ] = False,
     # === debug ===
     max_samples: Annotated[
         Optional[int], Option(help="Maximum number of samples to evaluate on.", rich_help_panel=HELP_PANEL_NAME_3)
@@ -105,6 +112,7 @@ def vllm(
         push_to_tensorboard=push_to_tensorboard,
         public=public_run,
         hub_results_org=results_org,
+        wandb=wandb,
     )
 
     pipeline_params = PipelineParameters(
