Skip to content

Commit

Permalink
Address PR comments
Browse files Browse the repository at this point in the history
  • Loading branch information
manveerxyz committed Sep 29, 2024
1 parent 0d36e74 commit bee6938
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 57 deletions.
2 changes: 0 additions & 2 deletions configs/150M/3090.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
name_model = "150M"
project = "debug_150m_zero_band"
run_id = "2c774d7c830b49e7855f4f9be6ea4d09"

[metric_logger]
type = "dummy"
base_url = "https://protocol-api.primeintellect.ai"

[train]
micro_bs = 16 # change this based on the GPU
Expand Down
22 changes: 0 additions & 22 deletions configs/debug/diloco_http_logger.toml

This file was deleted.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,15 @@ dependencies = [
"transformers>=4.44.2",
"datasets>=3.0.0",
"pydantic_config @ git+https://github.com/samsja/pydantic_config.git@e529c9c",
"einops"
]

[project.optional-dependencies]
all = [
"wandb",
"einops",
"asyncio>=3.4.3",
"aiohttp>=3.10.5",
"requests>=2.32.3",
]

[build-system]
Expand Down
4 changes: 3 additions & 1 deletion src/zeroband/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,9 @@ class TrainConfig(BaseConfig):


class MetricLogger(BaseConfig):
type: Literal["wandb", "dummy", "http"] = "http"
type: Literal["wandb", "dummy", "http"] = "dummy"
batch_size: int = 10
# for http monitor
base_url: str | None = None
auth_token: str | None = None

Expand Down
35 changes: 17 additions & 18 deletions src/zeroband/utils/monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class HttpMonitor:

def __init__(self, config, *args, **kwargs):
self.data = []
self.batch_size = getattr(config.progress_logger, 'batch_size', 10)
self.batch_size = getattr(config.metric_logger, 'batch_size', 10)
self.run_id = config.get('run_id', 'default_run')
self.base_url = config['metric_logger']['base_url']
self.auth_token = config['metric_logger']['auth_token']
Expand All @@ -37,19 +37,19 @@ def _remove_duplicates(self):
self.data = unique_logs

def log(self, data: dict[str, Any]):
import asyncio

# Lowercase the keys in the data dictionary
lowercased_data = {k.lower(): v for k, v in data.items()}
self.data.append(lowercased_data)
if len(self.data) >= self.batch_size:
self._remove_duplicates() # Remove duplicates before sending
self._send_batch()
# schedule as a background asyncio task so sending does not block the training loop
asyncio.create_task(self._send_batch())

async def _send_batch(self):
import aiohttp

def _send_batch(self):
import requests
# Remove duplicates before sending
self._remove_duplicates()

# Send batch of logs to API endpoint
batch = self.data[:self.batch_size]
headers = {
"Content-Type": "application/json",
Expand All @@ -59,13 +59,15 @@ def _send_batch(self):
"logs": batch
}
api = f"{self.base_url}/training_runs/{self.run_id}/logs"
try:
response = requests.post(api, json=payload, headers=headers)
response.raise_for_status()
except requests.RequestException as e:
logger.debug(f"Failed to send batch of logs to http monitor: {e}")
return False


async with aiohttp.ClientSession() as session:
try:
async with session.post(api, json=payload, headers=headers) as response:
await response.raise_for_status()
except aiohttp.ClientError as e:
logger.debug(f"Failed to send batch of logs to http monitor: {e}")
return False

self.data = self.data[self.batch_size:]
return True

Expand All @@ -84,9 +86,6 @@ def _finish(self):
return False

def finish(self):
# Remove duplicates before sending any remaining logs
self._remove_duplicates()

# Send any remaining logs
while self.data:
self._send_batch()
Expand Down
30 changes: 17 additions & 13 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit bee6938

Please sign in to comment.