From daad6b54aa34fbdf685003834975a3871dc8abc1 Mon Sep 17 00:00:00 2001 From: tazlin Date: Fri, 23 Aug 2024 19:29:29 -0400 Subject: [PATCH 01/50] feat: use `horde_engine~=2.14.2` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a10e658d..689a6bdf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - horde_safety==0.2.3 - torch==2.3.1 - ruamel.yaml - - horde_engine==2.13.3 + - horde_engine==2.14.2 - horde_sdk==0.14.0 - horde_model_reference==0.8.1 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index b62a3029..f626a593 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.0 horde_model_reference~=0.8.1 horde_engine~=2.13.3 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.0 horde_model_reference~=0.8.1 horde_engine~=2.14.2 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." diff --git a/requirements.rocm.txt b/requirements.rocm.txt index bc1b4408..6dda2dcb 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -3,7 +3,7 @@ torch==2.3.1+rocm6.0 horde_sdk~=0.14.0 horde_safety~=0.2.3 -horde_engine~=2.13.3 +horde_engine~=2.14.2 horde_model_reference~=0.8.1 python-dotenv diff --git a/requirements.txt b/requirements.txt index f41f457d..5f2b5461 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ torch==2.3.1 horde_sdk~=0.14.0 horde_safety~=0.2.3 -horde_engine~=2.13.3 +horde_engine~=2.14.2 horde_model_reference>=0.8.1 python-dotenv From 5dfc26bb52daa9b8744a8dcb52d9f36e689b14cf Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 24 Aug 2024 08:34:29 -0400 Subject: [PATCH 02/50] fix: use `--novram` approach Despite the name, `--novram` still allows the GPU to be used. However, comfyui uses this flag to much more aggressively avoid leaving tensors in VRAM. I am hoping that this will reduce VRAM OOMs and/or shared memory usage (in windows). --- horde_worker_regen/process_management/worker_entry_points.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/horde_worker_regen/process_management/worker_entry_points.py b/horde_worker_regen/process_management/worker_entry_points.py index dc9c2957..4f851fc9 100644 --- a/horde_worker_regen/process_management/worker_entry_points.py +++ b/horde_worker_regen/process_management/worker_entry_points.py @@ -61,13 +61,15 @@ def start_inference_process( if high_memory_mode: extra_comfyui_args.append("--highvram") + else: + extra_comfyui_args.append("--novram") with logger.catch(reraise=True): hordelib.initialise( setup_logging=None, process_id=process_id, logging_verbosity=0, - force_normal_vram_mode=not high_memory_mode, + force_normal_vram_mode=False, extra_comfyui_args=extra_comfyui_args, ) except Exception as e: From aac8c6ef9cfc113f2de6663e0756ff8b86203516 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 24 Aug 2024 08:53:18 -0400 Subject: [PATCH 03/50] refactor/fix: `unload_models_from_vram` more often With some recent comfyui changes it appears that the logic prior to this commit was not aggressive enough to avoid OOMs with relying on comfyui's internal decision making alone. 
This commit causes the worker to unload models from VRAM immediately after an inference result (if it is not about to be used) and right before post processing. Post processing as implemented today almost always overestimates the amount of free VRAM, and tends to cause OOMs or shared memory usage (on window) so more proactively unloading the model should help minimize that problem. --- .../process_management/inference_process.py | 2 + .../process_management/process_manager.py | 78 ++++++++++++------- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/horde_worker_regen/process_management/inference_process.py b/horde_worker_regen/process_management/inference_process.py index 128f1763..ed06679d 100644 --- a/horde_worker_regen/process_management/inference_process.py +++ b/horde_worker_regen/process_management/inference_process.py @@ -431,6 +431,8 @@ def progress_callback( except Exception as e: logger.error(f"Failed to release inference semaphore: {type(e).__name__} {e}") + self.unload_models_from_vram() + if progress_report.comfyui_progress is not None and progress_report.comfyui_progress.current_step > 0: self.send_heartbeat_message(heartbeat_type=HordeHeartbeatType.INFERENCE_STEP) else: diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 9cc1746f..8a99eb05 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -1669,6 +1669,9 @@ def receive_and_handle_process_messages(self) -> None: break self.total_num_completed_jobs += 1 + + self.unload_models_from_vram(process_with_model=self._process_map[message.process_id]) + if message.time_elapsed is not None: logger.info( f"Inference finished for job {message.sdk_api_job_info.id_} on process {message.process_id}. 
" @@ -2030,34 +2033,7 @@ def start_inference(self) -> None: # Unload all models from vram from any other process that isn't running a job if configured to do so if self.bridge_data.unload_models_from_vram: - next_n_models = list(self.get_next_n_models(self.max_inference_processes)) - for process_info in self._process_map.values(): - if process_info.process_id == process_with_model.process_id: - continue - - if process_info.is_process_busy(): - continue - - if process_info.loaded_horde_model_name is None: - continue - - if len(self.job_deque) == len(self.jobs_in_progress) + len(self.jobs_pending_safety_check): - logger.debug("Not unloading models from VRAM because there are no jobs to make room for.") - continue - - # If the model would be used by another process soon, don't unload it - if process_info.loaded_horde_model_name in next_n_models: - continue - - if process_info.last_control_flag != HordeControlFlag.UNLOAD_MODELS_FROM_VRAM: - process_info.safe_send_message( - HordeControlModelMessage( - control_flag=HordeControlFlag.UNLOAD_MODELS_FROM_VRAM, - horde_model_name=process_info.loaded_horde_model_name, - ), - ) - process_info.last_job_referenced = None - process_info.last_control_flag = HordeControlFlag.UNLOAD_MODELS_FROM_VRAM + self.unload_models_from_vram(process_with_model) logger.info(f"Starting inference for job {next_job.id_} on process {process_with_model.process_id}") # region Log job info @@ -2124,6 +2100,48 @@ def start_inference(self) -> None: ) self.handle_job_fault(faulted_job=next_job, process_info=process_with_model) + def unload_models_from_vram( + self, + process_with_model: HordeProcessInfo, + ) -> None: + """Unload models from VRAM from processes that are not running a job. + + Args: + process_with_model: The process that is running a job. + """ + next_n_models = list(self.get_next_n_models(self.max_inference_processes)) + for process_info in self._process_map.values(): + if process_info.process_id == process_with_model.process_id: + continue + + if process_info.is_process_busy(): + continue + + if process_info.loaded_horde_model_name is None: + continue + + # if len(self.job_deque) == len(self.jobs_in_progress) + len(self.jobs_pending_safety_check): + # logger.debug("Not unloading models from VRAM because there are no jobs to make room for.") + # continue + + if len(self.bridge_data.image_models_to_load) == 1: + logger.debug("Not unloading models from VRAM because there is only one model to load.") + continue + + # If the model would be used by another process soon, don't unload it + if process_info.loaded_horde_model_name in next_n_models: + continue + + if process_info.last_control_flag != HordeControlFlag.UNLOAD_MODELS_FROM_VRAM: + process_info.safe_send_message( + HordeControlModelMessage( + control_flag=HordeControlFlag.UNLOAD_MODELS_FROM_VRAM, + horde_model_name=process_info.loaded_horde_model_name, + ), + ) + process_info.last_job_referenced = None + process_info.last_control_flag = HordeControlFlag.UNLOAD_MODELS_FROM_VRAM + def unload_from_ram(self, process_id: int) -> None: """Unload models from a process, either from VRAM or both VRAM and system RAM. 
@@ -3122,9 +3140,9 @@ async def api_job_pop(self) -> None: # If there are long running jobs, don't start any more even if there is space in the deque if self.should_wait_for_pending_megapixelsteps(): if self.get_pending_megapixelsteps() < 40: - seconds_to_wait = self.get_pending_megapixelsteps() * 0.5 + seconds_to_wait = self.get_pending_megapixelsteps() * 0.6 elif self.get_pending_megapixelsteps() < 80: - seconds_to_wait = self.get_pending_megapixelsteps() * 0.7 + seconds_to_wait = self.get_pending_megapixelsteps() * 0.8 else: seconds_to_wait = self.get_pending_megapixelsteps() * 0.9 From c344985cdf4021bb1f83f73e6cf6294c0180c6a1 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 24 Aug 2024 11:02:57 -0400 Subject: [PATCH 04/50] feat: use `horde_engine~=2.14.3` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 689a6bdf..ced3db97 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - horde_safety==0.2.3 - torch==2.3.1 - ruamel.yaml - - horde_engine==2.14.2 + - horde_engine==2.14.3 - horde_sdk==0.14.0 - horde_model_reference==0.8.1 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index f626a593..d4cbaa0a 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.0 horde_model_reference~=0.8.1 horde_engine~=2.14.2 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.0 horde_model_reference~=0.8.1 horde_engine~=2.14.3 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." 
diff --git a/requirements.rocm.txt b/requirements.rocm.txt index 6dda2dcb..6880403d 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -3,7 +3,7 @@ torch==2.3.1+rocm6.0 horde_sdk~=0.14.0 horde_safety~=0.2.3 -horde_engine~=2.14.2 +horde_engine~=2.14.3 horde_model_reference~=0.8.1 python-dotenv diff --git a/requirements.txt b/requirements.txt index 5f2b5461..64710584 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ torch==2.3.1 horde_sdk~=0.14.0 horde_safety~=0.2.3 -horde_engine~=2.14.2 +horde_engine~=2.14.3 horde_model_reference>=0.8.1 python-dotenv From 364c7b1e25ac10869879333b342380326a0097e8 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 24 Aug 2024 13:42:49 -0400 Subject: [PATCH 05/50] fix: always unload models from ram --- horde_worker_regen/process_management/process_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 8a99eb05..34a99d35 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -2241,8 +2241,7 @@ def unload_models(self) -> None: if process_info.loaded_horde_model_name in next_n_models: continue - if self.get_process_total_ram_usage() > self.target_ram_bytes_used: - self.unload_from_ram(process_info.process_id) + self.unload_from_ram(process_info.process_id) def start_evaluate_safety(self) -> None: """Start evaluating the safety of the next job pending a safety check, if any.""" @@ -3650,6 +3649,7 @@ def print_status_method(self) -> None: [ f"dreamer_name: {self.bridge_data.dreamer_worker_name}", f"(v{horde_worker_regen.__version__})", + f"num_models: {len(self.bridge_data.image_models_to_load)}", f"max_power: {self.bridge_data.max_power}", f"max_threads: {self.max_concurrent_inference_processes}", f"queue_size: {self.bridge_data.queue_size}", From 42344120970f9feed15e0aa6136f10a345b43023 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 24 Aug 2024 13:52:06 -0400 Subject: [PATCH 06/50] fix: more aggressively unload from system ram The worker seems to be holding onto too much system RAM on average. I previously relied on comfyui internals to handle this implicitly but recent changes seem to have broken some assumptions I was making. This is an purposely over-zealous attempt to keep system RAM usage down. 
--- .../process_management/process_manager.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 34a99d35..3029713a 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -1669,8 +1669,9 @@ def receive_and_handle_process_messages(self) -> None: break self.total_num_completed_jobs += 1 - - self.unload_models_from_vram(process_with_model=self._process_map[message.process_id]) + if self.bridge_data.unload_models_from_vram: + self.unload_models_from_vram(process_with_model=self._process_map[message.process_id]) + self.unload_models() if message.time_elapsed is not None: logger.info( @@ -2034,6 +2035,7 @@ def start_inference(self) -> None: # Unload all models from vram from any other process that isn't running a job if configured to do so if self.bridge_data.unload_models_from_vram: self.unload_models_from_vram(process_with_model) + self.unload_models() logger.info(f"Starting inference for job {next_job.id_} on process {process_with_model.process_id}") # region Log job info @@ -2143,7 +2145,7 @@ def unload_models_from_vram( process_info.last_control_flag = HordeControlFlag.UNLOAD_MODELS_FROM_VRAM def unload_from_ram(self, process_id: int) -> None: - """Unload models from a process, either from VRAM or both VRAM and system RAM. + """Unload models from a process. Args: process_id: The process to unload models from. @@ -2154,10 +2156,11 @@ def unload_from_ram(self, process_id: int) -> None: process_info = self._process_map[process_id] if process_info.loaded_horde_model_name is None: - raise ValueError(f"process_id {process_id} is not loaded with a model") + logger.debug(f"Process {process_id} has no model loaded, so nothing to unload") + return if not self._horde_model_map.is_model_loaded(process_info.loaded_horde_model_name): - raise ValueError(f"process_id {process_id} is loaded with a model that is not loaded") + raise ValueError(f"process_id {process_id} is references an invalid model`") if process_info.last_control_flag != HordeControlFlag.UNLOAD_MODELS_FROM_RAM: process_info.safe_send_message( @@ -2220,6 +2223,10 @@ def unload_models(self) -> None: if len(self.job_deque) == len(self.jobs_in_progress) + len(self.jobs_pending_safety_check): return + # 1 thread, 1 model, no need to unload as it should always be in use (or at least available) + if self._max_concurrent_inference_processes == 1 and len(self.bridge_data.image_models_to_load) == 1: + return + next_n_models: set[str] = self.get_next_n_models(self.max_inference_processes) for process_info in self._process_map.values(): @@ -2239,6 +2246,9 @@ def unload_models(self) -> None: continue if process_info.loaded_horde_model_name in next_n_models: + logger.debug( + f"Model {process_info.loaded_horde_model_name} is in use by another process, not unloading", + ) continue self.unload_from_ram(process_info.process_id) @@ -3564,7 +3574,7 @@ async def _process_control_loop(self) -> None: await asyncio.sleep(self._loop_interval / 2) self._replace_all_safety_process() - # self.unload_models() + self.unload_models() if self._shutting_down: self.end_inference_processes() From 5debcf25b12269e24af63d08bbffa8ccbb0eccc6 Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 26 Aug 2024 16:30:03 -0400 Subject: [PATCH 07/50] feat: use `horde_engine~=2.14.4` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- 
requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ced3db97..31398af2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - horde_safety==0.2.3 - torch==2.3.1 - ruamel.yaml - - horde_engine==2.14.3 + - horde_engine==2.14.4 - horde_sdk==0.14.0 - horde_model_reference==0.8.1 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index d4cbaa0a..b036c45e 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.0 horde_model_reference~=0.8.1 horde_engine~=2.14.3 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.0 horde_model_reference~=0.8.1 horde_engine~=2.14.4 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." diff --git a/requirements.rocm.txt b/requirements.rocm.txt index 6880403d..1fd7fc39 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -3,7 +3,7 @@ torch==2.3.1+rocm6.0 horde_sdk~=0.14.0 horde_safety~=0.2.3 -horde_engine~=2.14.3 +horde_engine~=2.14.4 horde_model_reference~=0.8.1 python-dotenv diff --git a/requirements.txt b/requirements.txt index 64710584..f8e94fe1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ torch==2.3.1 horde_sdk~=0.14.0 horde_safety~=0.2.3 -horde_engine~=2.14.3 +horde_engine~=2.14.4 horde_model_reference>=0.8.1 python-dotenv From c90a0196be578098e0af70442416011996c8770f Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 26 Aug 2024 16:35:10 -0400 Subject: [PATCH 08/50] fix: unload models more often and more appropriately --- .../process_management/inference_process.py | 35 +++--- .../process_management/process_manager.py | 104 ++++++++++-------- .../process_management/worker_entry_points.py | 10 +- 3 files changed, 89 insertions(+), 60 deletions(-) diff --git a/horde_worker_regen/process_management/inference_process.py b/horde_worker_regen/process_management/inference_process.py index ed06679d..3fad4338 100644 --- a/horde_worker_regen/process_management/inference_process.py +++ b/horde_worker_regen/process_management/inference_process.py @@ -431,8 +431,6 @@ def progress_callback( except Exception as e: logger.error(f"Failed to release inference semaphore: {type(e).__name__} {e}") - self.unload_models_from_vram() - if progress_report.comfyui_progress is not None and progress_report.comfyui_progress.current_step > 0: self.send_heartbeat_message(heartbeat_type=HordeHeartbeatType.INFERENCE_STEP) else: @@ -634,6 +632,18 @@ def _receive_and_handle_control_message(self, message: HordeControlMessage) -> N ) elif isinstance(message, HordeInferenceControlMessage): if message.control_flag == HordeControlFlag.START_INFERENCE: + if self._active_model_name is None: + self.preload_model( + horde_model_name=message.horde_model_name, + will_load_loras=message.sdk_api_job_info.payload.loras is not None + and len( + message.sdk_api_job_info.payload.loras, + ) + > 0, + seamless_tiling_enabled=message.sdk_api_job_info.payload.tiling, + job_info=message.sdk_api_job_info, + ) + if message.horde_model_name != self._active_model_name: error_message = f"Received START_INFERENCE control message for model {message.horde_model_name} " error_message += f"but currently active model is {self._active_model_name}" @@ -698,17 +708,6 @@ def _receive_and_handle_control_message(self, message: 
HordeControlMessage) -> N else: logger.critical(f"Received unexpected message: {message}") return - elif isinstance(message, HordeControlModelMessage): - if message.control_flag == HordeControlFlag.DOWNLOAD_MODEL: - self.download_model(message.horde_model_name) - elif message.control_flag == HordeControlFlag.UNLOAD_MODELS_FROM_VRAM: - self.unload_models_from_vram() - elif message.control_flag == HordeControlFlag.UNLOAD_MODELS_FROM_RAM: - self.unload_models_from_ram() - else: - logger.critical(f"Received unexpected message: {message}") - return - elif message.control_flag == HordeControlFlag.END_PROCESS: self.send_process_state_change_message( process_state=HordeProcessState.PROCESS_ENDING, @@ -716,3 +715,13 @@ def _receive_and_handle_control_message(self, message: HordeControlMessage) -> N ) self._end_process = True + return + + if isinstance(message, HordeControlModelMessage) and message.control_flag == HordeControlFlag.DOWNLOAD_MODEL: + self.download_model(horde_model_name=message.horde_model_name) + + if isinstance(message, HordeControlMessage): + if message.control_flag == HordeControlFlag.UNLOAD_MODELS_FROM_VRAM: + self.unload_models_from_vram() + elif message.control_flag == HordeControlFlag.UNLOAD_MODELS_FROM_RAM: + self.unload_models_from_ram() diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 3029713a..d3865994 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -1671,7 +1671,6 @@ def receive_and_handle_process_messages(self) -> None: self.total_num_completed_jobs += 1 if self.bridge_data.unload_models_from_vram: self.unload_models_from_vram(process_with_model=self._process_map[message.process_id]) - self.unload_models() if message.time_elapsed is not None: logger.info( @@ -2035,7 +2034,6 @@ def start_inference(self) -> None: # Unload all models from vram from any other process that isn't running a job if configured to do so if self.bridge_data.unload_models_from_vram: self.unload_models_from_vram(process_with_model) - self.unload_models() logger.info(f"Starting inference for job {next_job.id_} on process {process_with_model.process_id}") # region Log job info @@ -2116,33 +2114,41 @@ def unload_models_from_vram( if process_info.process_id == process_with_model.process_id: continue - if process_info.is_process_busy(): + if process_info.process_type != HordeProcessType.INFERENCE: continue - if process_info.loaded_horde_model_name is None: + if process_info.is_process_busy(): continue + if process_info.loaded_horde_model_name is not None: + # if len(self.job_deque) == len(self.jobs_in_progress) + len(self.jobs_pending_safety_check): # logger.debug("Not unloading models from VRAM because there are no jobs to make room for.") # continue - if len(self.bridge_data.image_models_to_load) == 1: - logger.debug("Not unloading models from VRAM because there is only one model to load.") - continue + if len(self.bridge_data.image_models_to_load) == 1: + logger.debug("Not unloading models from VRAM because there is only one model to load.") + continue # If the model would be used by another process soon, don't unload it - if process_info.loaded_horde_model_name in next_n_models: - continue + if process_info.loaded_horde_model_name in next_n_models: + continue - if process_info.last_control_flag != HordeControlFlag.UNLOAD_MODELS_FROM_VRAM: + if process_info.last_control_flag != HordeControlFlag.UNLOAD_MODELS_FROM_VRAM: + 
process_info.safe_send_message( + HordeControlModelMessage( + control_flag=HordeControlFlag.UNLOAD_MODELS_FROM_VRAM, + horde_model_name=process_info.loaded_horde_model_name, + ), + ) + process_info.last_job_referenced = None + process_info.last_control_flag = HordeControlFlag.UNLOAD_MODELS_FROM_VRAM + else: process_info.safe_send_message( - HordeControlModelMessage( + HordeControlMessage( control_flag=HordeControlFlag.UNLOAD_MODELS_FROM_VRAM, - horde_model_name=process_info.loaded_horde_model_name, ), ) - process_info.last_job_referenced = None - process_info.last_control_flag = HordeControlFlag.UNLOAD_MODELS_FROM_VRAM def unload_from_ram(self, process_id: int) -> None: """Unload models from a process. @@ -2155,30 +2161,37 @@ def unload_from_ram(self, process_id: int) -> None: process_info = self._process_map[process_id] - if process_info.loaded_horde_model_name is None: - logger.debug(f"Process {process_id} has no model loaded, so nothing to unload") + if process_info.process_type != HordeProcessType.INFERENCE: + logger.warning(f"Process {process_id} is not an inference process, not unloading models") return - if not self._horde_model_map.is_model_loaded(process_info.loaded_horde_model_name): - raise ValueError(f"process_id {process_id} is references an invalid model`") + if process_info.loaded_horde_model_name is not None: + if not self._horde_model_map.is_model_loaded(process_info.loaded_horde_model_name): + raise ValueError(f"process_id {process_id} is references an invalid model`") + + if process_info.last_control_flag != HordeControlFlag.UNLOAD_MODELS_FROM_RAM: + process_info.safe_send_message( + HordeControlModelMessage( + control_flag=HordeControlFlag.UNLOAD_MODELS_FROM_RAM, + horde_model_name=process_info.loaded_horde_model_name, + ), + ) + + process_info.last_job_referenced = None + process_info.last_control_flag = HordeControlFlag.UNLOAD_MODELS_FROM_RAM - if process_info.last_control_flag != HordeControlFlag.UNLOAD_MODELS_FROM_RAM: + self._horde_model_map.update_entry( + horde_model_name=process_info.loaded_horde_model_name, + load_state=ModelLoadState.ON_DISK, + process_id=process_id, + ) + else: process_info.safe_send_message( - HordeControlModelMessage( + HordeControlMessage( control_flag=HordeControlFlag.UNLOAD_MODELS_FROM_RAM, - horde_model_name=process_info.loaded_horde_model_name, ), ) - process_info.last_job_referenced = None - process_info.last_control_flag = HordeControlFlag.UNLOAD_MODELS_FROM_RAM - - self._horde_model_map.update_entry( - horde_model_name=process_info.loaded_horde_model_name, - load_state=ModelLoadState.ON_DISK, - process_id=process_id, - ) - self._process_map.on_model_load_state_change( process_id=process_id, horde_model_name=None, @@ -2230,26 +2243,27 @@ def unload_models(self) -> None: next_n_models: set[str] = self.get_next_n_models(self.max_inference_processes) for process_info in self._process_map.values(): - if process_info.is_process_busy(): + if process_info.process_type != HordeProcessType.INFERENCE: continue - if process_info.loaded_horde_model_name is None: + if process_info.is_process_busy(): continue - if self._horde_model_map.is_model_loading(process_info.loaded_horde_model_name): - continue + if process_info.loaded_horde_model_name is not None: + if self._horde_model_map.is_model_loading(process_info.loaded_horde_model_name): + continue - if ( - self._horde_model_map.root[process_info.loaded_horde_model_name].horde_model_load_state - == ModelLoadState.IN_USE - ): - continue + if ( + 
self._horde_model_map.root[process_info.loaded_horde_model_name].horde_model_load_state + == ModelLoadState.IN_USE + ): + continue - if process_info.loaded_horde_model_name in next_n_models: - logger.debug( - f"Model {process_info.loaded_horde_model_name} is in use by another process, not unloading", - ) - continue + if process_info.loaded_horde_model_name in next_n_models: + logger.debug( + f"Model {process_info.loaded_horde_model_name} is in use by another process, not unloading", + ) + continue self.unload_from_ram(process_info.process_id) @@ -3574,7 +3588,7 @@ async def _process_control_loop(self) -> None: await asyncio.sleep(self._loop_interval / 2) self._replace_all_safety_process() - self.unload_models() + # self.unload_models() if self._shutting_down: self.end_inference_processes() diff --git a/horde_worker_regen/process_management/worker_entry_points.py b/horde_worker_regen/process_management/worker_entry_points.py index 4f851fc9..22214f03 100644 --- a/horde_worker_regen/process_management/worker_entry_points.py +++ b/horde_worker_regen/process_management/worker_entry_points.py @@ -20,7 +20,9 @@ def start_inference_process( disk_lock: Lock, aux_model_lock: Lock, *, + low_memory_mode: bool = True, high_memory_mode: bool = False, + very_high_memory_mode: bool = False, amd_gpu: bool = False, ) -> None: """Start an inference process. @@ -59,10 +61,14 @@ def start_inference_process( if amd_gpu: extra_comfyui_args.append("--use-pytorch-cross-attention") - if high_memory_mode: + if very_high_memory_mode: + extra_comfyui_args.append("--gpu-only") + elif high_memory_mode: extra_comfyui_args.append("--highvram") - else: + elif low_memory_mode: extra_comfyui_args.append("--novram") + else: + extra_comfyui_args.append("--normalvram") with logger.catch(reraise=True): hordelib.initialise( From 73418c181a86ae9baafb05d2b07acfb4e7a254d1 Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 26 Aug 2024 16:44:29 -0400 Subject: [PATCH 09/50] docs: add missing arg docstrings --- horde_worker_regen/process_management/worker_entry_points.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/horde_worker_regen/process_management/worker_entry_points.py b/horde_worker_regen/process_management/worker_entry_points.py index 22214f03..e6f6ce99 100644 --- a/horde_worker_regen/process_management/worker_entry_points.py +++ b/horde_worker_regen/process_management/worker_entry_points.py @@ -34,7 +34,10 @@ def start_inference_process( inference_semaphore (Semaphore): The semaphore to use to limit concurrent inference. disk_lock (Lock): The lock to use for disk access. aux_model_lock (Lock): The lock to use for auxiliary model downloading. + low_memory_mode (bool, optional): If true, the process will attempt to use less memory. Defaults to True. high_memory_mode (bool, optional): If true, the process will attempt to use more memory. Defaults to False. + very_high_memory_mode (bool, optional): If true, the process will attempt to use even more memory. + Defaults to False. amd_gpu (bool, optional): If true, the process will attempt to use AMD GPU-specific optimisations. Defaults to False. 
""" From 602a958a193dcd47e058cc9a20be0ae5e386d129 Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 26 Aug 2024 22:02:44 -0400 Subject: [PATCH 10/50] feat: more configurable memory management Redefines the broken existing `high_memory_mode` to leverage the recent memory management extension --- horde_worker_regen/bridge_data/data_model.py | 16 +++++----------- .../process_management/inference_process.py | 13 ++++++++++++- .../process_management/process_manager.py | 4 ++-- .../process_management/worker_entry_points.py | 14 +++++++++++--- 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/horde_worker_regen/bridge_data/data_model.py b/horde_worker_regen/bridge_data/data_model.py index 3713f002..2ebc6d8b 100644 --- a/horde_worker_regen/bridge_data/data_model.py +++ b/horde_worker_regen/bridge_data/data_model.py @@ -40,7 +40,7 @@ class reGenBridgeData(CombinedHordeBridgeData): default=None, alias="civitai_api_token", ) - unload_models_from_vram: bool = Field(default=True) + unload_models_from_vram_often: bool = Field(default=True) process_timeout: int = Field(default=900) """The maximum amount of time to allow a job to run before it is killed""" @@ -107,20 +107,14 @@ def validate_performance_modes(self) -> reGenBridgeData: if self.high_memory_mode and not self.very_high_memory_mode: if self.max_threads != 1: - self.max_threads = 1 logger.warning( - "High memory mode is enabled, so the max_threads value has been set to 1.", + "High memory mode is enabled. You may experience performance issues with more than one thread.", ) - if self.queue_size == 0: + if self.unload_models_from_vram_often: logger.warning( - "High memory mode is enabled and works best with a queue_size of 1.", - ) - - if self.queue_size > 1: - self.queue_size = 1 - logger.warning( - "High memory mode is enabled, so the queue_size value has been set to 1.", + "Please let us know if `unload_models_from_vram_often` improves or degrades performance with" + " `high_memory_mode` enabled.", ) if self.cycle_process_on_model_change: diff --git a/horde_worker_regen/process_management/inference_process.py b/horde_worker_regen/process_management/inference_process.py index 3fad4338..e3603c60 100644 --- a/horde_worker_regen/process_management/inference_process.py +++ b/horde_worker_regen/process_management/inference_process.py @@ -87,6 +87,8 @@ def __init__( inference_semaphore: Semaphore, aux_model_lock: Lock, disk_lock: Lock, + *, + high_memory_mode: bool = False, ) -> None: """Initialise the HordeInferenceProcess. @@ -98,6 +100,9 @@ def __init__( inference_semaphore (Semaphore): A semaphore used to limit the number of concurrent inference jobs. aux_model_lock (Lock): A lock used to prevent multiple processes from downloading auxiliary models at the \ disk_lock (Lock): A lock used to prevent multiple processes from accessing disk at the same time. + high_memory_mode (bool, optional): Whether or not to use high memory mode. This mode uses more memory, but\ + may be faster if the system has enough memory and VRAM. \ + Defaults to False. 
""" super().__init__( process_id=process_id, @@ -123,8 +128,12 @@ def __init__( from hordelib.nodes.node_model_loader import HordeCheckpointLoader try: + logger.info(f"Initialising HordeLib with high_memory_mode={high_memory_mode}") with logger.catch(reraise=True): - self._horde = HordeLib(comfyui_callback=self._comfyui_callback) + self._horde = HordeLib( + comfyui_callback=self._comfyui_callback, + aggressive_unloading=not high_memory_mode, + ) self._shared_model_manager = SharedModelManager(do_not_load_model_mangers=True) except Exception as e: logger.critical(f"Failed to initialise HordeLib: {type(e).__name__} {e}") @@ -394,6 +403,8 @@ def preload_model( job_info=job_info, ) + self.send_memory_report_message(include_vram=True) + self.send_process_state_change_message( process_state=HordeProcessState.WAITING_FOR_JOB, info=f"Preloaded model {horde_model_name}", diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index d3865994..c5048e27 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -1669,7 +1669,7 @@ def receive_and_handle_process_messages(self) -> None: break self.total_num_completed_jobs += 1 - if self.bridge_data.unload_models_from_vram: + if self.bridge_data.unload_models_from_vram_often: self.unload_models_from_vram(process_with_model=self._process_map[message.process_id]) if message.time_elapsed is not None: @@ -2032,7 +2032,7 @@ def start_inference(self) -> None: ) # Unload all models from vram from any other process that isn't running a job if configured to do so - if self.bridge_data.unload_models_from_vram: + if self.bridge_data.unload_models_from_vram_often: self.unload_models_from_vram(process_with_model) logger.info(f"Starting inference for job {next_job.id_} on process {process_with_model.process_id}") diff --git a/horde_worker_regen/process_management/worker_entry_points.py b/horde_worker_regen/process_management/worker_entry_points.py index e6f6ce99..ce0bd876 100644 --- a/horde_worker_regen/process_management/worker_entry_points.py +++ b/horde_worker_regen/process_management/worker_entry_points.py @@ -64,14 +64,21 @@ def start_inference_process( if amd_gpu: extra_comfyui_args.append("--use-pytorch-cross-attention") + models_not_to_force_load = [] + if very_high_memory_mode: extra_comfyui_args.append("--gpu-only") elif high_memory_mode: - extra_comfyui_args.append("--highvram") + extra_comfyui_args.append("--normalvram") + models_not_to_force_load = [ + "cascade", + ] elif low_memory_mode: extra_comfyui_args.append("--novram") - else: - extra_comfyui_args.append("--normalvram") + models_not_to_force_load = [ + "sdxl", + "cascade", + ] with logger.catch(reraise=True): hordelib.initialise( @@ -79,6 +86,7 @@ def start_inference_process( process_id=process_id, logging_verbosity=0, force_normal_vram_mode=False, + models_not_to_force_load=models_not_to_force_load, extra_comfyui_args=extra_comfyui_args, ) except Exception as e: From 47f08c7413c686dd2bb5dcde676ad972f0e2aeac Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 26 Aug 2024 22:03:43 -0400 Subject: [PATCH 11/50] chore: log addtl info/config; warn for incorrect high memory mode --- .../process_management/process_manager.py | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index c5048e27..de8a90bc 100644 --- 
a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -369,6 +369,11 @@ def on_memory_report( self[process_id].last_received_timestamp = time.time() + logger.debug( + f"Process {process_id} memory report: " + f"ram: {ram_usage_bytes} vram: {vram_usage_bytes} total vram: {total_vram_bytes}", + ) + def on_process_state_change(self, process_id: int, new_state: HordeProcessState) -> None: """Update the process state for the given process ID. @@ -3673,6 +3678,7 @@ def print_status_method(self) -> None: [ f"dreamer_name: {self.bridge_data.dreamer_worker_name}", f"(v{horde_worker_regen.__version__})", + f"horde user: {self.user_info.username if self.user_info is not None else 'Unknown'}", f"num_models: {len(self.bridge_data.image_models_to_load)}", f"max_power: {self.bridge_data.max_power}", f"max_threads: {self.max_concurrent_inference_processes}", @@ -3689,12 +3695,38 @@ def print_status_method(self) -> None: f"allow_controlnet: {self.bridge_data.allow_controlnet}", f"allow_sdxl_controlnet: {self.bridge_data.allow_sdxl_controlnet}", f"allow_post_processing: {self.bridge_data.allow_post_processing}", + f"custom_models: {bool(self.bridge_data.custom_models)}", f"jobs_pending_safety_check: {len(self.jobs_pending_safety_check)}", f"jobs_being_safety_checked: {len(self.jobs_being_safety_checked)}", f"jobs_in_progress: {len(self.jobs_in_progress)}", ], ), ) + logger.debug( + " | ".join( + [ + f"high_performance_mode: {self.bridge_data.high_performance_mode}", + f"moderate_performance_mode: {self.bridge_data.moderate_performance_mode}", + f"high_memory_mode: {self.bridge_data.high_memory_mode}", + f"very_high_memory_mode: {self.bridge_data.very_high_memory_mode}", + f"unload_models_from_vram_often: {self.bridge_data.unload_models_from_vram_often}", + ], + ), + ) + + logger.debug( + " | ".join( + [ + f"post_process_job_overlap: {self.bridge_data.post_process_job_overlap}", + f"preload_timeout: {self.bridge_data.preload_timeout}", + f"download_timeout: {self.bridge_data.download_timeout}", + f"post_process_timeout: {self.bridge_data.post_process_timeout}", + f"cycle_process_on_model_change: {self.bridge_data.cycle_process_on_model_change}", + f"exit_on_unhandled_faults: {self.bridge_data.exit_on_unhandled_faults}", + ], + ), + ) + jobs = [f"<{x.id_}: {x.model}>" for x in self.job_deque] logger.info(f'Jobs: {", ".join(jobs)}') @@ -3732,6 +3764,19 @@ def print_status_method(self) -> None: "`git pull` and `update-runtime` to update.", ) + for device in self._device_map.root.values(): + total_memory_mb = device.total_memory / 1024 / 1024 + if total_memory_mb < 10_000 and self.bridge_data.high_memory_mode: + logger.warning( + f"Device {device.device_name} ({device.device_index}) has less than 10GB of memory. " + "This may cause issues with `high_memory_mode` enabled.", + ) + elif total_memory_mb > 20_000 and not self.bridge_data.high_memory_mode: + logger.warning( + f"Device {device.device_name} ({device.device_index}) has more than 20GB of memory. 
" + "You should enable `high_memory_mode` in your config to take advantage of this.", + ) + self._last_status_message_time = time.time() _bridge_data_loop_interval = 1.0 From 1c46af6003a1ee0fe0ce1e358761fe2462cee6c0 Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 26 Aug 2024 22:04:28 -0400 Subject: [PATCH 12/50] feat: use `horde_engine~=2.14.5` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 31398af2..490e7152 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - horde_safety==0.2.3 - torch==2.3.1 - ruamel.yaml - - horde_engine==2.14.4 + - horde_engine==2.14.5 - horde_sdk==0.14.0 - horde_model_reference==0.8.1 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index b036c45e..7d55624c 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.0 horde_model_reference~=0.8.1 horde_engine~=2.14.4 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.0 horde_model_reference~=0.8.1 horde_engine~=2.14.5 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." diff --git a/requirements.rocm.txt b/requirements.rocm.txt index 1fd7fc39..6f5833c6 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -3,7 +3,7 @@ torch==2.3.1+rocm6.0 horde_sdk~=0.14.0 horde_safety~=0.2.3 -horde_engine~=2.14.4 +horde_engine~=2.14.5 horde_model_reference~=0.8.1 python-dotenv diff --git a/requirements.txt b/requirements.txt index f8e94fe1..47a5b65f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ torch==2.3.1 horde_sdk~=0.14.0 horde_safety~=0.2.3 -horde_engine~=2.14.4 +horde_engine~=2.14.5 horde_model_reference>=0.8.1 python-dotenv From 9e6009562f82d0002f2a58aad898c1b85444fcd8 Mon Sep 17 00:00:00 2001 From: tazlin Date: Tue, 27 Aug 2024 11:54:55 -0400 Subject: [PATCH 13/50] fix: don't pass memory arg to comfyui when init safety process --- horde_worker_regen/process_management/worker_entry_points.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/horde_worker_regen/process_management/worker_entry_points.py b/horde_worker_regen/process_management/worker_entry_points.py index ce0bd876..716abf7a 100644 --- a/horde_worker_regen/process_management/worker_entry_points.py +++ b/horde_worker_regen/process_management/worker_entry_points.py @@ -149,9 +149,6 @@ def start_safety_process( if amd_gpu: extra_comfyui_args.append("--use-pytorch-cross-attention") - if high_memory_mode: - extra_comfyui_args.append("--highvram") - with logger.catch(reraise=True): hordelib.initialise( setup_logging=None, From 67ba52bb7f1d05ec002a3ef9b1e113f0a6bde850 Mon Sep 17 00:00:00 2001 From: tazlin Date: Tue, 27 Aug 2024 12:29:25 -0400 Subject: [PATCH 14/50] feat: more informative kudos/user log messages --- .../process_management/process_manager.py | 122 +++++++++++++++--- 1 file changed, 107 insertions(+), 15 deletions(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index de8a90bc..760fd1e2 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -949,6 +949,8 @@ def get_process_total_ram_usage(self) -> int: _completed_jobs_lock: Lock_Asyncio 
kudos_generated_this_session: float = 0 + kudos_events: list[tuple[float, float]] + """A deque of kudos events, each is a tuple of the time the event occurred and the amount of kudos generated.""" session_start_time: float = 0 _aiohttp_client_session: aiohttp.ClientSession @@ -1135,6 +1137,8 @@ def __init__( self._process_message_queue = multiprocessing.Queue() + self.kudos_events = [] + self.stable_diffusion_reference = None while self.stable_diffusion_reference is None: @@ -2578,6 +2582,7 @@ async def _do_upload(new_submit: PendingSubmitJob, image_in_buffer_bytes: bytes) self._num_jobs_faulted += 1 self.kudos_generated_this_session += job_submit_response.reward + self.kudos_events.append((time.time(), job_submit_response.reward)) new_submit.succeed(new_submit.kudos_reward, new_submit.kudos_per_second) return new_submit @@ -3388,6 +3393,107 @@ async def api_job_pop(self) -> None: _current_worker_id: str | None = None + def calculate_kudos_info(self) -> None: + """Calculate and log information about the kudos generated in the current session.""" + time_since_session_start = time.time() - self.session_start_time + kudos_per_hour_session = self.kudos_generated_this_session / time_since_session_start * 3600 + + kudos_total_past_hour = self.calculate_kudos_totals() + + kudos_info_string = self.generate_kudos_info_string( + time_since_session_start, + kudos_per_hour_session, + kudos_total_past_hour, + ) + + self.log_kudos_info(kudos_info_string) + + def calculate_kudos_totals(self) -> float: + """Calculate the total kudos generated in the past hour. + + Returns: + float: The total kudos generated in the past hour. + """ + kudos_total_past_hour = 0.0 + num_events_found = 0 + current_time = time.time() + + for event_time, kudos in reversed(self.kudos_events): + if current_time - event_time > 3600: + break + + num_events_found += 1 + kudos_total_past_hour += kudos + + elements_to_remove = len(self.kudos_events) - num_events_found + if elements_to_remove > 0: + self.kudos_events = self.kudos_events[:-elements_to_remove] + + return kudos_total_past_hour + + def generate_kudos_info_string( + self, + time_since_session_start: float, + kudos_per_hour_session: float, + kudos_total_past_hour: float, + ) -> str: + """Generate a string with information about the kudos generated in the current session. + + Args: + time_since_session_start (float): The time since the session started. + kudos_per_hour_session (float): The kudos per hour generated in the current session. + kudos_total_past_hour (float): The total kudos generated in the past hour. + + Returns: + str: A string with information about the kudos generated in the current session. 
+ """ + kudos_info_string_elements = [] + if time_since_session_start < 3600: + kudos_info_string_elements = [ + f"Total Session Kudos: {self.kudos_generated_this_session:.2f} over " + f"{time_since_session_start / 60:.2f} minutes", + ] + else: + kudos_info_string_elements = [ + f"Total Session Kudos: {self.kudos_generated_this_session:.2f} over " + f"{time_since_session_start / 3600:.2f} hours", + ] + + if time_since_session_start > 3600: + kudos_info_string_elements.append( + f"Session: {kudos_per_hour_session:.2f} (actual) kudos/hr", + ) + kudos_info_string_elements.append( + f"Last Hour: {kudos_total_past_hour:.2f} kudos/hr", + ) + else: + kudos_info_string_elements.append( + f"Session: {kudos_per_hour_session:.2f} (extrapolated) kudos/hr", + ) + kudos_info_string_elements.append( + "Last Hour: (pending) kudos/hr", + ) + + return " | ".join(kudos_info_string_elements) + + def log_kudos_info(self, kudos_info_string: str) -> None: + """Log the kudos information string. + + Args: + kudos_info_string (str): The kudos information string to log. + """ + if self.kudos_generated_this_session > 0: + logger.success(kudos_info_string) + + logger.debug(f"len(kudos_events): {len(self.kudos_events)}") + if self.user_info is not None and self.user_info.kudos_details is not None: + logger.info( + f"Total Kudos Accumulated: {self.user_info.kudos_details.accumulated:.2f} " + f"(all workers for {self.user_info.username})", + ) + if self.user_info.kudos_details.accumulated is not None and self.user_info.kudos_details.accumulated < 0: + logger.info("Negative kudos means you've requested more than you've earned. This can be normal.") + async def api_get_user_info(self) -> None: """Get the information associated with this API key from the API.""" if self._shutting_down: @@ -3408,21 +3514,7 @@ async def api_get_user_info(self) -> None: self._user_info_failed_reason = None if self.user_info.kudos_details is not None: - # print kudos this session and kudos per hour based on self.session_start_time - kudos_per_hour = self.kudos_generated_this_session / (time.time() - self.session_start_time) * 3600 - - if self.kudos_generated_this_session > 0: - logger.success( - f"Kudos this session: {self.kudos_generated_this_session:.2f} " - f"(~{kudos_per_hour:.2f} kudos/hour)", - ) - - logger.info(f"Worker Kudos Accumulated: {self.user_info.kudos_details.accumulated:.2f}") - if ( - self.user_info.kudos_details.accumulated is not None - and self.user_info.kudos_details.accumulated < 0 - ): - logger.info("Negative kudos means you've requested more than you've earned. 
This can be normal.") + self.calculate_kudos_info() except _async_client_exceptions as e: self._user_info_failed = True From 55ddc03d2b99623a93770c1a38757689b7a60493 Mon Sep 17 00:00:00 2001 From: tazlin Date: Wed, 28 Aug 2024 07:53:45 -0400 Subject: [PATCH 15/50] fix: more acuate units in log messages --- .../process_management/process_manager.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 760fd1e2..b9b8323d 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -2558,7 +2558,7 @@ async def _do_upload(new_submit: PendingSubmitJob, image_in_buffer_bytes: bytes) if new_submit.completed_job_info.state != GENERATION_STATE.faulted: logger.success( f"Submitted job {new_submit.job_id} (model: " - f"{new_submit.completed_job_info.sdk_api_job_info.model}) for {job_submit_response.reward:.2f} " + f"{new_submit.completed_job_info.sdk_api_job_info.model}) for {job_submit_response.reward:,.2f} " f"kudos. Job popped {time_taken} seconds ago " f"and took {new_submit.completed_job_info.time_to_generate:.2f} " f"to generate. ({kudos_per_second * new_submit.batch_count:.2f} " @@ -3450,28 +3450,28 @@ def generate_kudos_info_string( kudos_info_string_elements = [] if time_since_session_start < 3600: kudos_info_string_elements = [ - f"Total Session Kudos: {self.kudos_generated_this_session:.2f} over " + f"Total Session Kudos: {self.kudos_generated_this_session:,.2f} over " f"{time_since_session_start / 60:.2f} minutes", ] else: kudos_info_string_elements = [ - f"Total Session Kudos: {self.kudos_generated_this_session:.2f} over " + f"Total Session Kudos: {self.kudos_generated_this_session:,.2f} over " f"{time_since_session_start / 3600:.2f} hours", ] if time_since_session_start > 3600: kudos_info_string_elements.append( - f"Session: {kudos_per_hour_session:.2f} (actual) kudos/hr", + f"Session: {kudos_per_hour_session:,.2f} (actual) kudos/hr", ) kudos_info_string_elements.append( - f"Last Hour: {kudos_total_past_hour:.2f} kudos/hr", + f"Last Hour: {kudos_total_past_hour:,.2f} kudos", ) else: kudos_info_string_elements.append( - f"Session: {kudos_per_hour_session:.2f} (extrapolated) kudos/hr", + f"Session: {kudos_per_hour_session:,.2f} (extrapolated) kudos/hr", ) kudos_info_string_elements.append( - "Last Hour: (pending) kudos/hr", + "Last Hour: (pending) kudos", ) return " | ".join(kudos_info_string_elements) @@ -3488,7 +3488,7 @@ def log_kudos_info(self, kudos_info_string: str) -> None: logger.debug(f"len(kudos_events): {len(self.kudos_events)}") if self.user_info is not None and self.user_info.kudos_details is not None: logger.info( - f"Total Kudos Accumulated: {self.user_info.kudos_details.accumulated:.2f} " + f"Total Kudos Accumulated: {self.user_info.kudos_details.accumulated:,.2f} " f"(all workers for {self.user_info.username})", ) if self.user_info.kudos_details.accumulated is not None and self.user_info.kudos_details.accumulated < 0: From 22202b34ed45c3ba381c95b0108b3177e8ba7567 Mon Sep 17 00:00:00 2001 From: tazlin Date: Wed, 28 Aug 2024 16:56:17 -0400 Subject: [PATCH 16/50] fix: pass very high memory mode config to inf. proc. 
--- horde_worker_regen/process_management/process_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index b9b8323d..6c2e336c 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -1341,6 +1341,7 @@ def _start_inference_process(self, pid: int) -> HordeProcessInfo: self._aux_model_lock, ), kwargs={ + "very_high_memory_mode": self.bridge_data.very_high_memory_mode, "high_memory_mode": self.bridge_data.high_memory_mode, "amd_gpu": self._amd_gpu, }, From 6dd52aaa3bae7ee7238f3e4bfca754d0c66d8bd2 Mon Sep 17 00:00:00 2001 From: tazlin Date: Thu, 29 Aug 2024 09:55:54 -0400 Subject: [PATCH 17/50] feat: use `horde_sdk~=0.14.1` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 490e7152..683be640 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -41,6 +41,6 @@ repos: - torch==2.3.1 - ruamel.yaml - horde_engine==2.14.5 - - horde_sdk==0.14.0 + - horde_sdk==0.14.1 - horde_model_reference==0.8.1 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index 7d55624c..ca116636 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.0 horde_model_reference~=0.8.1 horde_engine~=2.14.5 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.1 horde_model_reference~=0.8.1 horde_engine~=2.14.5 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." 
diff --git a/requirements.rocm.txt b/requirements.rocm.txt index 6f5833c6..bf6739ca 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -1,7 +1,7 @@ numpy==1.26.4 torch==2.3.1+rocm6.0 -horde_sdk~=0.14.0 +horde_sdk~=0.14.1 horde_safety~=0.2.3 horde_engine~=2.14.5 horde_model_reference~=0.8.1 diff --git a/requirements.txt b/requirements.txt index 47a5b65f..a80da3a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy==1.26.4 torch==2.3.1 -horde_sdk~=0.14.0 +horde_sdk~=0.14.1 horde_safety~=0.2.3 horde_engine~=2.14.5 horde_model_reference>=0.8.1 From 00b60330c221d0d1978505bf8fd0d06841f26d2f Mon Sep 17 00:00:00 2001 From: tazlin Date: Thu, 29 Aug 2024 10:30:47 -0400 Subject: [PATCH 18/50] fix: print to console `PROCESS_ENDED` message's info This will clarify when the situations such as the shared model manager failing to load or no models being found occur (e.g., when download_models.py isn't) --- horde_worker_regen/process_management/process_manager.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 6c2e336c..75a48c6c 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -1545,7 +1545,11 @@ def receive_and_handle_process_messages(self) -> None: new_state=message.process_state, ) - logger.debug(f"Process {message.process_id} changed state to {message.process_state}") + if message.process_state == HordeProcessState.PROCESS_ENDED: + logger.info(f"Process {message.process_id} has ended with message: {message.info}") + else: + logger.debug(f"Process {message.process_id} changed state to {message.process_state}") + if message.process_state == HordeProcessState.INFERENCE_STARTING: # logger.info(f"Process {message.process_id} is starting inference on model {message.info}") From 3d41fdd44a970852387af5471211c360d66e227a Mon Sep 17 00:00:00 2001 From: tazlin Date: Wed, 11 Sep 2024 17:31:13 -0400 Subject: [PATCH 19/50] chore: version bump --- horde_worker_regen/__init__.py | 2 +- horde_worker_regen/_version_meta.json | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/horde_worker_regen/__init__.py b/horde_worker_regen/__init__.py index c7b48512..2b7db86f 100644 --- a/horde_worker_regen/__init__.py +++ b/horde_worker_regen/__init__.py @@ -8,7 +8,7 @@ ASSETS_FOLDER_PATH = Path(__file__).parent / "assets" -__version__ = "8.1.2" +__version__ = "9.0.0" import pkg_resources # noqa: E402 diff --git a/horde_worker_regen/_version_meta.json b/horde_worker_regen/_version_meta.json index d3329eab..2a0372b3 100644 --- a/horde_worker_regen/_version_meta.json +++ b/horde_worker_regen/_version_meta.json @@ -1,5 +1,5 @@ { - "recommended_version": "8.1.2", + "recommended_version": "9.0.0", "required_min_version": "4.2.7", "required_min_version_update_date": "2024-03-09", "required_min_version_info": { diff --git a/pyproject.toml b/pyproject.toml index 6e80dd6c..df489971 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "horde_worker_regen" -version = "8.1.2" +version = "9.0.0" description = "Allows you to connect to the AI Horde and generate images for users." 
authors = [ {name = "tazlin", email = "tazlin.on.github@gmail.com"}, From 3f7a47c4789db467f91c409014f2b52c2c17b4af Mon Sep 17 00:00:00 2001 From: tazlin Date: Wed, 11 Sep 2024 17:31:17 -0400 Subject: [PATCH 20/50] feat: use `horde_model_reference>=0.9.0` for flux support --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a80da3a7..ca010285 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ torch==2.3.1 horde_sdk~=0.14.1 horde_safety~=0.2.3 horde_engine~=2.14.5 -horde_model_reference>=0.8.1 +horde_model_reference>=0.9.0 python-dotenv ruamel.yaml From 70f7a0b97ea9c9ed739c336424bcd2c7287b7481 Mon Sep 17 00:00:00 2001 From: tazlin Date: Wed, 11 Sep 2024 17:38:59 -0400 Subject: [PATCH 21/50] fix: use latest compat. `horde_model_reference` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 683be640..8aa09cbb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -42,5 +42,5 @@ repos: - ruamel.yaml - horde_engine==2.14.5 - horde_sdk==0.14.1 - - horde_model_reference==0.8.1 + - horde_model_reference==0.9.0 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index ca116636..4d9edbc3 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.1 horde_model_reference~=0.8.1 horde_engine~=2.14.5 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.1 horde_model_reference~=0.9.0 horde_engine~=2.14.5 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." diff --git a/requirements.rocm.txt b/requirements.rocm.txt index bf6739ca..b88a2126 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -4,7 +4,7 @@ torch==2.3.1+rocm6.0 horde_sdk~=0.14.1 horde_safety~=0.2.3 horde_engine~=2.14.5 -horde_model_reference~=0.8.1 +horde_model_reference~=0.9.0 python-dotenv ruamel.yaml From aa0ca394e647152127410a2b26e7bef9fe7c1e57 Mon Sep 17 00:00:00 2001 From: tazlin Date: Wed, 11 Sep 2024 17:47:52 -0400 Subject: [PATCH 22/50] fix: use `horde_sdk==0.14.2` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8aa09cbb..d2827388 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -41,6 +41,6 @@ repos: - torch==2.3.1 - ruamel.yaml - horde_engine==2.14.5 - - horde_sdk==0.14.1 + - horde_sdk==0.14.2 - horde_model_reference==0.9.0 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index 4d9edbc3..c6932798 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.1 horde_model_reference~=0.9.0 horde_engine~=2.14.5 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.2 horde_model_reference~=0.9.0 horde_engine~=2.14.5 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." 
diff --git a/requirements.rocm.txt b/requirements.rocm.txt index b88a2126..8fa4057f 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -1,7 +1,7 @@ numpy==1.26.4 torch==2.3.1+rocm6.0 -horde_sdk~=0.14.1 +horde_sdk~=0.14.2 horde_safety~=0.2.3 horde_engine~=2.14.5 horde_model_reference~=0.9.0 diff --git a/requirements.txt b/requirements.txt index ca010285..b25eb786 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy==1.26.4 torch==2.3.1 -horde_sdk~=0.14.1 +horde_sdk~=0.14.2 horde_safety~=0.2.3 horde_engine~=2.14.5 horde_model_reference>=0.9.0 From e21f192fd98502bb3d2e805d871c919a5d6a2306 Mon Sep 17 00:00:00 2001 From: tazlin Date: Thu, 12 Sep 2024 14:14:17 -0400 Subject: [PATCH 23/50] fix: clarify "currently popped" in log messages --- horde_worker_regen/process_management/process_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 75a48c6c..f6f917ce 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -3840,7 +3840,7 @@ def print_status_method(self) -> None: job_info_message = "Session job info: " + " | ".join( [ - f"popped: {len(self.job_deque)} (eMPS: {self.get_pending_megapixelsteps()})", + f"currently popped: {len(self.job_deque)} (eMPS: {self.get_pending_megapixelsteps()})", f"submitted: {self.total_num_completed_jobs}", f"faulted: {self._num_jobs_faulted}", f"slow_jobs: {self._num_job_slowdowns}", From b0bf64b7189f9e99f48fb3a6df47b78dcfd0beee Mon Sep 17 00:00:00 2001 From: db0 Date: Fri, 13 Sep 2024 08:37:05 +0200 Subject: [PATCH 24/50] doc: custom models --- README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/README.md b/README.md index 07104c47..bcbb0849 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,32 @@ To update: - **Advanced users**: If you do not want to use mamba or you are comfortable with python/venvs, see [README_advanced.md](README_advanced.md). 1. Continue with [Starting/stopping](#startingstopping) instructions above +# Custom Models + +You can host your own image models on the horde which are not available in our model reference, but this process is a bit more complex. + +To start with, you need to manually request the `customizer` role from the horde team. You can ask for it in the Discord channel. This is a manually assigned role to prevent abuse of this feature. + +Once you have the customizer role, you need to download the model files you want to host. Place them in any location on your system. + +Finally, you need to point your worker to their location and provide some information about them. In your bridgeData.yaml, simply add lines like the following: + +``` +custom_models: + - name: Movable figure model XL + baseline: stable_diffusion_xl + filepath: /home/db0/projects/CUSTOM_MODELS/PVCStyleModelMovable_beta25Realistic.safetensors +``` + +Then add the same "name" to your models_to_load. + +If everything was set up correctly, you should now see a `custom_models.json` in your worker directory after the worker starts, and the model should be offered by your worker.
+ + Note that: + + * You cannot serve custom models with the same name as any of our regular models + * The horde doesn't know your model, so it will treat it as an SD 1.5 model for kudos rewards and cannot warn people who use the wrong parameters, such as clip_skip + # Docker See [README_advanced.md](README_advanced.md). @@ -173,3 +199,4 @@ See [README_advanced.md](README_advanced.md). # Model Usage Many models in this project use the CreativeML OpenRAIL License. [Please read the full license here.](https://huggingface.co/spaces/CompVis/stable-diffusion-license) + From 1ab8b07f80d5bdc1fb7b86fe8948312a8b726997 Mon Sep 17 00:00:00 2001 From: db0 Date: Fri, 13 Sep 2024 12:50:37 +0200 Subject: [PATCH 25/50] adds extra_slow_worker and limit_max_steps vars --- bridgeData_template.yaml | 13 +++++++++++++ horde_worker_regen/bridge_data/data_model.py | 5 +++++ .../process_management/process_manager.py | 2 ++ 3 files changed, 20 insertions(+) diff --git a/bridgeData_template.yaml b/bridgeData_template.yaml index 0816d880..157b8bcc 100644 --- a/bridgeData_template.yaml +++ b/bridgeData_template.yaml @@ -113,6 +113,19 @@ allow_lora: false # The number of gigabytes of LoRas too keep cached. This is in addition to the preselected LoRas. max_lora_cache_size: 10 # In gigabytes. Min is 10. +# Set this to true if your worker is extraordinarily slow, such as below 0.1 mps/s +# When your worker is set as extra slow, users can freely choose to skip it when requesting generations +# However, the job timeout for generations you pick up is tripled +# And the whole request itself receives triple the expiry timeout (60 mins, instead of 20 mins) when a slow worker picks it up. +# We hope this can help people onboard older GPU generations to serve people who do not need immediate generations +# IMPORTANT: This option is NOT meant to allow CPU workers. It's just for slow GPUs. There's still A timeout. +extra_slow_worker: false + +# Set this to true to make your worker only pick up jobs requesting steps lower than the model's average steps. +# This is meant to be used for slower workers who can handle a few steps within the limits but might time out if someone asks for 100 or more +# You can also use this if you don't want to serve requests with an extraordinary number of steps whatsoever. +limit_max_steps: false + # Automatically determine the models which have the highest queue and offer those. 
dynamic_models: false # Currently unused in reGen diff --git a/horde_worker_regen/bridge_data/data_model.py b/horde_worker_regen/bridge_data/data_model.py index 2ebc6d8b..f2feee4d 100644 --- a/horde_worker_regen/bridge_data/data_model.py +++ b/horde_worker_regen/bridge_data/data_model.py @@ -75,10 +75,15 @@ class reGenBridgeData(CombinedHordeBridgeData): exit_on_unhandled_faults: bool = Field(default=False) purge_loras_on_download: bool = Field(default=False) + + extra_slow_worker: bool = Field(default=False) + + limit_max_steps: bool = Field(default=False) custom_models: list[dict] = Field( default_factory=list, ) + @model_validator(mode="after") def validate_performance_modes(self) -> reGenBridgeData: diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index f6f917ce..7447267a 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -3296,6 +3296,8 @@ async def api_job_pop(self) -> None: allow_post_processing=self.bridge_data.allow_post_processing, allow_controlnet=self.bridge_data.allow_controlnet, allow_sdxl_controlnet=self.bridge_data.allow_sdxl_controlnet, + extra_slow_worker=self.bridge_data.extra_slow_worker, + limit_max_steps=self.bridge_data.limit_max_steps, allow_lora=self.bridge_data.allow_lora, amount=self.bridge_data.max_batch, ) From 48206d8a5900a231ae6827ba557c9ad6a63aeb36 Mon Sep 17 00:00:00 2001 From: tazlin Date: Fri, 13 Sep 2024 07:56:42 -0400 Subject: [PATCH 26/50] feat: `horde_sdk==0.14.3` for `extra_slow_worker`/`limit_max_steps` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d2827388..8770dcb4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -41,6 +41,6 @@ repos: - torch==2.3.1 - ruamel.yaml - horde_engine==2.14.5 - - horde_sdk==0.14.2 + - horde_sdk==0.14.3 - horde_model_reference==0.9.0 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index c6932798..e4e81f12 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.2 horde_model_reference~=0.9.0 horde_engine~=2.14.5 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.3 horde_model_reference~=0.9.0 horde_engine~=2.14.5 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." 
diff --git a/requirements.rocm.txt b/requirements.rocm.txt index 8fa4057f..eb7536b8 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -1,7 +1,7 @@ numpy==1.26.4 torch==2.3.1+rocm6.0 -horde_sdk~=0.14.2 +horde_sdk~=0.14.3 horde_safety~=0.2.3 horde_engine~=2.14.5 horde_model_reference~=0.9.0 diff --git a/requirements.txt b/requirements.txt index b25eb786..3a96fd47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy==1.26.4 torch==2.3.1 -horde_sdk~=0.14.2 +horde_sdk~=0.14.3 horde_safety~=0.2.3 horde_engine~=2.14.5 horde_model_reference>=0.9.0 From 6d748317659d07349206dc97de678265c9474e75 Mon Sep 17 00:00:00 2001 From: tazlin Date: Fri, 13 Sep 2024 08:00:25 -0400 Subject: [PATCH 27/50] fix: remove redundant bridge data fields (already in SDK) --- horde_worker_regen/bridge_data/data_model.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/horde_worker_regen/bridge_data/data_model.py b/horde_worker_regen/bridge_data/data_model.py index f2feee4d..2ebc6d8b 100644 --- a/horde_worker_regen/bridge_data/data_model.py +++ b/horde_worker_regen/bridge_data/data_model.py @@ -75,15 +75,10 @@ class reGenBridgeData(CombinedHordeBridgeData): exit_on_unhandled_faults: bool = Field(default=False) purge_loras_on_download: bool = Field(default=False) - - extra_slow_worker: bool = Field(default=False) - - limit_max_steps: bool = Field(default=False) custom_models: list[dict] = Field( default_factory=list, ) - @model_validator(mode="after") def validate_performance_modes(self) -> reGenBridgeData: From b1f501375e610233c53738eb9176d3f093412d55 Mon Sep 17 00:00:00 2001 From: tazlin Date: Fri, 13 Sep 2024 08:09:06 -0400 Subject: [PATCH 28/50] style: fix --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index bcbb0849..7efe0387 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ To update: # Custom Models -You can host your own image models on the horde which are not available in our model reference, but this process is a bit more complex. +You can host your own image models on the horde which are not available in our model reference, but this process is a bit more complex. To start with, you need to manually request the `customizer` role from then horde team. You can ask for it in the discord channel. This is a manually assigned role to prevent abuse of this feature. @@ -199,4 +199,3 @@ See [README_advanced.md](README_advanced.md). # Model Usage Many models in this project use the CreativeML OpenRAIL License. 
[Please read the full license here.](https://huggingface.co/spaces/CompVis/stable-diffusion-license) - From f1bbd5eb90d303f64e66031b553739c639293068 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 14 Sep 2024 12:36:22 -0400 Subject: [PATCH 29/50] feat: use `horde_engine~=2.14.6` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8770dcb4..8437508c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - horde_safety==0.2.3 - torch==2.3.1 - ruamel.yaml - - horde_engine==2.14.5 + - horde_engine==2.14.6 - horde_sdk==0.14.3 - horde_model_reference==0.9.0 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index e4e81f12..e7355030 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.3 horde_model_reference~=0.9.0 horde_engine~=2.14.5 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.3 horde_model_reference~=0.9.0 horde_engine~=2.14.6 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." diff --git a/requirements.rocm.txt b/requirements.rocm.txt index eb7536b8..3a989b05 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -3,7 +3,7 @@ torch==2.3.1+rocm6.0 horde_sdk~=0.14.3 horde_safety~=0.2.3 -horde_engine~=2.14.5 +horde_engine~=2.14.6 horde_model_reference~=0.9.0 python-dotenv diff --git a/requirements.txt b/requirements.txt index 3a96fd47..e52af577 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ torch==2.3.1 horde_sdk~=0.14.3 horde_safety~=0.2.3 -horde_engine~=2.14.5 +horde_engine~=2.14.6 horde_model_reference>=0.9.0 python-dotenv From 0934de5ea58b96b7410ef8f5e1d27d549e521186 Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 16 Sep 2024 11:57:05 -0400 Subject: [PATCH 30/50] feat: use `horde_engine==2.15.0` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8437508c..d4688c95 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - horde_safety==0.2.3 - torch==2.3.1 - ruamel.yaml - - horde_engine==2.14.6 + - horde_engine==2.15.0 - horde_sdk==0.14.3 - horde_model_reference==0.9.0 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index e7355030..a1e59a2c 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.3 horde_model_reference~=0.9.0 horde_engine~=2.14.6 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.3 horde_model_reference~=0.9.0 horde_engine~=2.15.0 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." 
diff --git a/requirements.rocm.txt b/requirements.rocm.txt index 3a989b05..7f2d185c 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -3,7 +3,7 @@ torch==2.3.1+rocm6.0 horde_sdk~=0.14.3 horde_safety~=0.2.3 -horde_engine~=2.14.6 +horde_engine~=2.15.0 horde_model_reference~=0.9.0 python-dotenv diff --git a/requirements.txt b/requirements.txt index e52af577..bb0bc0a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ torch==2.3.1 horde_sdk~=0.14.3 horde_safety~=0.2.3 -horde_engine~=2.14.6 +horde_engine~=2.15.0 horde_model_reference>=0.9.0 python-dotenv From 6dfcf9307588b737155e47c0cd2ec334d42445d3 Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 16 Sep 2024 12:10:17 -0400 Subject: [PATCH 31/50] fix: add flux to known slow/vram heavy models --- horde_worker_regen/consts.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/horde_worker_regen/consts.py b/horde_worker_regen/consts.py index ff665e8b..62626c1b 100644 --- a/horde_worker_regen/consts.py +++ b/horde_worker_regen/consts.py @@ -7,8 +7,8 @@ ) -KNOWN_SLOW_MODELS_DIFFICULTIES = {"Stable Cascade 1.0": 6.0} -VRAM_HEAVY_MODELS = ["Stable Cascade 1.0"] +KNOWN_SLOW_MODELS_DIFFICULTIES = {"Stable Cascade 1.0": 6.0, "Flux.1-Schnell fp8 (Compact)": 6.0} +VRAM_HEAVY_MODELS = ["Stable Cascade 1.0", "Flux.1-Schnell fp16 (Compact)", "Flux.1-Schnell fp8 (Compact)"] KNOWN_SLOW_WORKFLOWS = {"qr_code": 2.0} KNOWN_CONTROLNET_WORKFLOWS = ["qr_code"] From 0c1de03e213dd8c04e0d23d6dbdb1e4f2386994a Mon Sep 17 00:00:00 2001 From: tazlin Date: Tue, 17 Sep 2024 12:48:42 -0400 Subject: [PATCH 32/50] fix: enforce constraints on other configs w/ `extra_slow_worker` --- horde_worker_regen/bridge_data/data_model.py | 46 +++++++++++++++++++ .../process_management/process_manager.py | 35 +++++++++++--- 2 files changed, 75 insertions(+), 6 deletions(-) diff --git a/horde_worker_regen/bridge_data/data_model.py b/horde_worker_regen/bridge_data/data_model.py index 2ebc6d8b..5251e10a 100644 --- a/horde_worker_regen/bridge_data/data_model.py +++ b/horde_worker_regen/bridge_data/data_model.py @@ -99,6 +99,52 @@ def validate_performance_modes(self) -> reGenBridgeData: "The queue_size value has been set to 2 because the max_threads value is greater than 2.", ) + if self.extra_slow_worker: + if self.high_performance_mode: + self.high_performance_mode = False + logger.warning( + "Extra slow worker is enabled, so the high_performance_mode value has been set to False.", + ) + if self.moderate_performance_mode: + self.moderate_performance_mode = False + logger.warning( + "Extra slow worker is enabled, so the moderate_performance_mode value has been set to False.", + ) + if self.high_memory_mode: + self.high_memory_mode = False + logger.warning( + "Extra slow worker is enabled, so the high_memory_mode value has been set to False.", + ) + if self.very_high_memory_mode: + self.very_high_memory_mode = False + logger.warning( + "Extra slow worker is enabled, so the very_high_memory_mode value has been set to False.", + ) + if self.queue_size > 0: + self.queue_size = 0 + logger.warning( + "Extra slow worker is enabled, so the queue_size value has been set to 0. " + "This behavior may change in the future.", + ) + if self.max_threads > 1: + self.max_threads = 1 + logger.warning( + "Extra slow worker is enabled, so the max_threads value has been set to 1. 
" + "This behavior may change in the future.", + ) + if self.preload_timeout < 120: + self.preload_timeout = 120 + logger.warning( + "Extra slow worker is enabled, so the preload_timeout value has been set to 120. " + "This behavior may change in the future.", + ) + if not self.post_process_job_overlap: + self.post_process_job_overlap = True + logger.warning( + "Extra slow worker is enabled, so the post_process_job_overlap value has been set to True. " + "This behavior may change in the future.", + ) + if self.very_high_memory_mode and not self.high_memory_mode: self.high_memory_mode = True logger.warning( diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 7447267a..154e30c2 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -3470,16 +3470,16 @@ def generate_kudos_info_string( kudos_info_string_elements.append( f"Session: {kudos_per_hour_session:,.2f} (actual) kudos/hr", ) - kudos_info_string_elements.append( - f"Last Hour: {kudos_total_past_hour:,.2f} kudos", - ) + # kudos_info_string_elements.append( + # f"Last Hour: {kudos_total_past_hour:,.2f} kudos", + # ) else: kudos_info_string_elements.append( f"Session: {kudos_per_hour_session:,.2f} (extrapolated) kudos/hr", ) - kudos_info_string_elements.append( - "Last Hour: (pending) kudos", - ) + # kudos_info_string_elements.append( + # "Last Hour: (pending) kudos", + # ) return " | ".join(kudos_info_string_elements) @@ -3863,6 +3863,23 @@ def print_status_method(self) -> None: "`git pull` and `update-runtime` to update.", ) + if self.bridge_data.extra_slow_worker: + if not self.bridge_data.limit_max_steps: + logger.warning( + "Extra slow worker mode is enabled, but limit_max_steps is not enabled. " + "Consider enabling limit_max_steps to prevent long running jobs.", + ) + if self.bridge_data.max_batch > 1: + logger.warning( + "Extra slow worker mode is enabled, but max_batch is greater than 1. " + "Consider setting max_batch to 1 to prevent long running batch jobs.", + ) + if self.bridge_data.allow_sdxl_controlnet: + logger.warning( + "Extra slow worker mode is enabled, but allow_sdxl_controlnet is enabled. " + "Consider disabling allow_sdxl_controlnet to prevent long running jobs.", + ) + for device in self._device_map.root.values(): total_memory_mb = device.total_memory / 1024 / 1024 if total_memory_mb < 10_000 and self.bridge_data.high_memory_mode: @@ -3875,6 +3892,12 @@ def print_status_method(self) -> None: f"Device {device.device_name} ({device.device_index}) has more than 20GB of memory. " "You should enable `high_memory_mode` in your config to take advantage of this.", ) + elif total_memory_mb > 20_000 and self.bridge_data.extra_slow_worker: + logger.warning( + f"Device {device.device_name} ({device.device_index}) has more than 20GB of memory. " + "There are very few GPUs with this much memory that should be running in extra slow worker " + "mode. 
Consider disabling `extra_slow_worker` in your config.", + ) self._last_status_message_time = time.time() From e142601d1ac2fbb646630d0fbf93049d514b2a73 Mon Sep 17 00:00:00 2001 From: tazlin Date: Tue, 17 Sep 2024 12:49:14 -0400 Subject: [PATCH 33/50] fix: respect `exit_on_unhandled_faults` on deadlocks --- horde_worker_regen/process_management/process_manager.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 154e30c2..e4a2e95e 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -3754,8 +3754,13 @@ def detect_deadlock(self) -> None: and (self._last_deadlock_detected_time + 10) < time.time() and self._process_map.num_busy_processes() == 0 ): - logger.debug("Deadlock still detected after 10 seconds. Attempting to recover.") - self._purge_jobs() + if self.bridge_data.exit_on_unhandled_faults: + logger.error("Exiting due to exit_on_unhandled_faults being enabled") + self._abort() + else: + logger.debug("Deadlock still detected after 10 seconds. Attempting to recover.") + self._purge_jobs() + self._in_deadlock = False elif ( self._in_deadlock From 5ef71b4805bf9edc2343baa5f80ddbbe13e28fe8 Mon Sep 17 00:00:00 2001 From: db0 Date: Fri, 20 Sep 2024 09:35:57 +0200 Subject: [PATCH 34/50] feat: adds remove_maintenance_on_init secret var --- convert_config_to_env.py | 1 - horde_worker_regen/bridge_data/data_model.py | 2 + horde_worker_regen/bridge_data/load_config.py | 2 +- horde_worker_regen/download_models.py | 2 +- .../process_management/main_entry_point.py | 1 - .../process_management/process_manager.py | 41 +++++++++++++++++-- horde_worker_regen/run_worker.py | 2 +- pyproject.toml | 1 + tests/test_bridge_data.py | 2 +- 9 files changed, 44 insertions(+), 10 deletions(-) diff --git a/convert_config_to_env.py b/convert_config_to_env.py index c0712c60..d6c738f4 100644 --- a/convert_config_to_env.py +++ b/convert_config_to_env.py @@ -12,7 +12,6 @@ import argparse from horde_model_reference.model_reference_manager import ModelReferenceManager - from horde_worker_regen.bridge_data.load_config import BridgeDataLoader, ConfigFormat diff --git a/horde_worker_regen/bridge_data/data_model.py b/horde_worker_regen/bridge_data/data_model.py index 5251e10a..5eed4e80 100644 --- a/horde_worker_regen/bridge_data/data_model.py +++ b/horde_worker_regen/bridge_data/data_model.py @@ -76,6 +76,8 @@ class reGenBridgeData(CombinedHordeBridgeData): purge_loras_on_download: bool = Field(default=False) + remove_maintenance_on_init: bool = Field(default=False) + custom_models: list[dict] = Field( default_factory=list, ) diff --git a/horde_worker_regen/bridge_data/load_config.py b/horde_worker_regen/bridge_data/load_config.py index 6da41d6d..56eced63 100644 --- a/horde_worker_regen/bridge_data/load_config.py +++ b/horde_worker_regen/bridge_data/load_config.py @@ -6,13 +6,13 @@ from enum import auto from pathlib import Path -from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_sdk.ai_horde_api.ai_horde_clients import AIHordeAPIManualClient from horde_sdk.ai_horde_worker.model_meta import ImageModelLoadResolver from loguru import logger from ruamel.yaml import YAML from strenum import StrEnum +from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_worker_regen.bridge_data import AIWORKER_REGEN_PREFIX from 
horde_worker_regen.bridge_data.data_model import reGenBridgeData diff --git a/horde_worker_regen/download_models.py b/horde_worker_regen/download_models.py index df17f20c..d97ffaf7 100644 --- a/horde_worker_regen/download_models.py +++ b/horde_worker_regen/download_models.py @@ -12,9 +12,9 @@ def download_all_models( if not load_config_from_env_vars: load_env_vars_from_config() - from horde_model_reference.model_reference_manager import ModelReferenceManager from loguru import logger + from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_worker_regen.bridge_data.load_config import BridgeDataLoader, reGenBridgeData from horde_worker_regen.consts import BRIDGE_CONFIG_FILENAME diff --git a/horde_worker_regen/process_management/main_entry_point.py b/horde_worker_regen/process_management/main_entry_point.py index 04e539f5..f72e28e1 100644 --- a/horde_worker_regen/process_management/main_entry_point.py +++ b/horde_worker_regen/process_management/main_entry_point.py @@ -1,7 +1,6 @@ from multiprocessing.context import BaseContext from horde_model_reference.model_reference_manager import ModelReferenceManager - from horde_worker_regen.bridge_data.data_model import reGenBridgeData from horde_worker_regen.process_management.process_manager import HordeWorkerProcessManager diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index e4a2e95e..1f7ffe72 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -27,18 +27,21 @@ import psutil import yarl from aiohttp import ClientSession -from horde_model_reference.meta_consts import MODEL_REFERENCE_CATEGORY, STABLE_DIFFUSION_BASELINE_CATEGORY -from horde_model_reference.model_reference_manager import ModelReferenceManager -from horde_model_reference.model_reference_records import StableDiffusion_ModelReference from horde_sdk import RequestErrorResponse from horde_sdk.ai_horde_api import GENERATION_STATE -from horde_sdk.ai_horde_api.ai_horde_clients import AIHordeAPIAsyncClientSession, AIHordeAPIAsyncSimpleClient +from horde_sdk.ai_horde_api.ai_horde_clients import ( + AIHordeAPIAsyncClientSession, + AIHordeAPIAsyncSimpleClient, + AIHordeAPISimpleClient, +) from horde_sdk.ai_horde_api.apimodels import ( FindUserRequest, GenMetadataEntry, ImageGenerateJobPopRequest, ImageGenerateJobPopResponse, JobSubmitResponse, + ModifyWorkerRequest, + SingleWorkerDetailsResponse, UserDetailsResponse, ) from horde_sdk.ai_horde_api.consts import KNOWN_UPSCALERS, METADATA_TYPE, METADATA_VALUE @@ -48,6 +51,9 @@ from typing_extensions import override import horde_worker_regen +from horde_model_reference.meta_consts import MODEL_REFERENCE_CATEGORY, STABLE_DIFFUSION_BASELINE_CATEGORY +from horde_model_reference.model_reference_manager import ModelReferenceManager +from horde_model_reference.model_reference_records import StableDiffusion_ModelReference from horde_worker_regen.bridge_data.data_model import reGenBridgeData from horde_worker_regen.bridge_data.load_config import BridgeDataLoader from horde_worker_regen.consts import ( @@ -1114,6 +1120,8 @@ def __init__( logger.debug(f"Target RAM overhead: {self.target_ram_overhead_bytes / 1024 / 1024 / 1024} GB") self.enable_performance_mode() + if self.bridge_data.remove_maintenance_on_init: + self.remove_maintenance() # Get the total memory of each GPU import torch @@ -1158,6 +1166,31 @@ def __init__( logger.error(e) time.sleep(5) + def 
remove_maintenance(self) -> None: + """Removes the maintenance from the named worker.""" + simple_client = AIHordeAPISimpleClient() + worker_details: SingleWorkerDetailsResponse = simple_client.worker_details_by_name( + worker_name=self.bridge_data.dreamer_worker_name, + ) + if worker_details is None: + logger.debug( + f"Worker with name {self.bridge_data.dreamer_worker_name} " + "does not appear to exist already to remove maintenance.", + ) + return + modify_worker_request = ModifyWorkerRequest( + apikey=self.bridge_data.api_key, + worker_id=worker_details.id_, + maintenance=False, + ) + + simple_client.worker_modify(modify_worker_request) + + logger.debug( + f"Ensured worker with name {self.bridge_data.dreamer_worker_name} " + "({worker_details.id_}) is removed from maintenance.", + ) + def enable_performance_mode(self) -> None: """Enable performance mode.""" if self.bridge_data.high_performance_mode: diff --git a/horde_worker_regen/run_worker.py b/horde_worker_regen/run_worker.py index 8f48af80..fe584d27 100644 --- a/horde_worker_regen/run_worker.py +++ b/horde_worker_regen/run_worker.py @@ -20,9 +20,9 @@ def main(ctx: BaseContext, load_from_env_vars: bool = False, *, amd_gpu: bool = False) -> None: """Check for a valid config and start the driver ('main') process for the reGen worker.""" - from horde_model_reference.model_reference_manager import ModelReferenceManager from pydantic import ValidationError + from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_worker_regen.bridge_data.load_config import BridgeDataLoader, reGenBridgeData from horde_worker_regen.consts import BRIDGE_CONFIG_FILENAME from horde_worker_regen.process_management.main_entry_point import start_working diff --git a/pyproject.toml b/pyproject.toml index df489971..5f37eaf8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,7 @@ exclude = ''' | \.mypy_cache | \.tox | \.venv + | venv | _build | buck-out | build diff --git a/tests/test_bridge_data.py b/tests/test_bridge_data.py index 8218e6f7..4a24551c 100644 --- a/tests/test_bridge_data.py +++ b/tests/test_bridge_data.py @@ -2,10 +2,10 @@ import pathlib import pytest -from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_sdk.generic_api.consts import ANON_API_KEY from ruamel.yaml import YAML +from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_worker_regen.bridge_data.data_model import reGenBridgeData from horde_worker_regen.bridge_data.load_config import BridgeDataLoader, ConfigFormat From 519f7344e28c2bd4e67310ff12480c60b969e282 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 21 Sep 2024 16:34:36 -0400 Subject: [PATCH 35/50] style: fix --- horde_worker_regen/process_management/process_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 1f7ffe72..c659033e 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -27,6 +27,9 @@ import psutil import yarl from aiohttp import ClientSession +from horde_model_reference.meta_consts import MODEL_REFERENCE_CATEGORY, STABLE_DIFFUSION_BASELINE_CATEGORY +from horde_model_reference.model_reference_manager import ModelReferenceManager +from horde_model_reference.model_reference_records import StableDiffusion_ModelReference from horde_sdk import RequestErrorResponse from 
horde_sdk.ai_horde_api import GENERATION_STATE from horde_sdk.ai_horde_api.ai_horde_clients import ( @@ -51,9 +54,6 @@ from typing_extensions import override import horde_worker_regen -from horde_model_reference.meta_consts import MODEL_REFERENCE_CATEGORY, STABLE_DIFFUSION_BASELINE_CATEGORY -from horde_model_reference.model_reference_manager import ModelReferenceManager -from horde_model_reference.model_reference_records import StableDiffusion_ModelReference from horde_worker_regen.bridge_data.data_model import reGenBridgeData from horde_worker_regen.bridge_data.load_config import BridgeDataLoader from horde_worker_regen.consts import ( From 91d6f35ce4c39c8ce51f0911651755ed71a7940f Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 21 Sep 2024 18:11:00 -0400 Subject: [PATCH 36/50] fix: less flux slowdown --- .pre-commit-config.yaml | 6 +- horde-bridge.cmd | 2 +- .../process_management/process_manager.py | 57 +++++++++++++------ .../process_management/worker_entry_points.py | 20 ++++--- requirements.dev.txt | 12 ++-- requirements.rocm.txt | 6 +- requirements.txt | 6 +- 7 files changed, 68 insertions(+), 41 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4688c95..e0b37ea0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,7 +19,7 @@ repos: - id: mypy args: [] additional_dependencies: - - pydantic==2.7.4 + - pydantic==2.9.2 - types-requests - types-pytz - types-setuptools @@ -40,7 +40,7 @@ repos: - horde_safety==0.2.3 - torch==2.3.1 - ruamel.yaml - - horde_engine==2.15.0 - - horde_sdk==0.14.3 + - horde_engine==2.15.1 + - horde_sdk==0.14.7 - horde_model_reference==0.9.0 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index a1e59a2c..4d8cbdd4 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.3 horde_model_reference~=0.9.0 horde_engine~=2.15.0 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.7 horde_model_reference~=0.9.0 horde_engine~=2.15.1 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." 
diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index c659033e..5a352831 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -51,7 +51,8 @@ from horde_sdk.ai_horde_api.fields import JobID from loguru import logger from pydantic import BaseModel, ConfigDict, RootModel, ValidationError -from typing_extensions import override +from typing import Literal, Union +from typing_extensions import override, TypeAlias import horde_worker_regen from horde_worker_regen.bridge_data.data_model import reGenBridgeData @@ -103,18 +104,15 @@ _async_client_exceptions = (asyncio.exceptions.TimeoutError, aiohttp.client_exceptions.ClientError, OSError) _excludes_for_job_dump = { - "job_image_results": ..., + "job_image_results": True, "sdk_api_job_info": { - "payload": { - "prompt", - "special", - }, - "skipped": ..., - "source_image": ..., - "source_mask": ..., - "extra_source_images": ..., - "r2_upload": ..., - "r2_uploads": ..., + "payload": {"prompt": True, "special": True}, + "skipped": True, + "source_image": True, + "source_mask": True, + "extra_source_images": True, + "r2_upload": True, + "r2_uploads": True, }, } @@ -1740,7 +1738,7 @@ def receive_and_handle_process_messages(self) -> None: ) logger.debug( - f"Job data: {message.sdk_api_job_info.model_dump(exclude=_excludes_for_job_dump)}", + f"Job data: {message.sdk_api_job_info.model_dump(exclude=_excludes_for_job_dump)}", # type: ignore ) self.completed_jobs.append(job_info) @@ -2789,7 +2787,7 @@ async def api_submit_job(self) -> None: ): model_dump = hji.model_dump( - exclude=_excludes_for_job_dump, + exclude=_excludes_for_job_dump, # type: ignore ) if ( self.stable_diffusion_reference is not None @@ -3158,6 +3156,9 @@ async def _get_source_images(self, job_pop_response: ImageGenerateJobPopResponse return job_pop_response _last_pop_no_jobs_available: bool = False + _too_many_consecutive_failed_jobs: bool = False + _too_many_consecutive_failed_jobs_time: float = 0.0 + _too_many_consecutive_failed_jobs_wait_time = 180 @logger.catch(reraise=True) async def api_job_pop(self) -> None: @@ -3165,6 +3166,18 @@ async def api_job_pop(self) -> None: if self._shutting_down: return + cur_time = time.time() + + if self._too_many_consecutive_failed_jobs: + if ( + cur_time - self._too_many_consecutive_failed_jobs_time + > self._too_many_consecutive_failed_jobs_wait_time + ): + self._too_many_consecutive_failed_jobs = False + self._too_many_consecutive_failed_jobs_time = 0 + logger.debug("Resuming job pops after too many consecutive failed jobs") + return + if self._consecutive_failed_jobs >= 3: logger.error( "Too many consecutive failed jobs, pausing job pops. " @@ -3174,9 +3187,8 @@ async def api_job_pop(self) -> None: if self.bridge_data.exit_on_unhandled_faults: logger.error("Exiting due to exit_on_unhandled_faults being enabled") self._abort() - await asyncio.sleep(180) - self._consecutive_failed_jobs = 0 - logger.info("Resuming job pops") + self._too_many_consecutive_failed_jobs = True + self._too_many_consecutive_failed_jobs_time = cur_time return max_jobs_in_queue = self.bridge_data.queue_size + 1 @@ -3937,6 +3949,17 @@ def print_status_method(self) -> None: "mode. Consider disabling `extra_slow_worker` in your config.", ) + if self._too_many_consecutive_failed_jobs: + time_since_failure = time.time() - self._too_many_consecutive_failed_jobs_time + logger.error( + "Too many consecutive failed jobs. 
This may be due to a misconfiguration or other issue. " + "Please check your logs and configuration.", + ) + logger.error( + f"Time since last job failure: {time_since_failure:.2f}s). " + f"{self._too_many_consecutive_failed_jobs_wait_time} seconds must pass before resuming.", + ) + self._last_status_message_time = time.time() _bridge_data_loop_interval = 1.0 diff --git a/horde_worker_regen/process_management/worker_entry_points.py b/horde_worker_regen/process_management/worker_entry_points.py index 716abf7a..af8abc71 100644 --- a/horde_worker_regen/process_management/worker_entry_points.py +++ b/horde_worker_regen/process_management/worker_entry_points.py @@ -64,21 +64,25 @@ def start_inference_process( if amd_gpu: extra_comfyui_args.append("--use-pytorch-cross-attention") - models_not_to_force_load = [] + models_not_to_force_load = ["flux"] if very_high_memory_mode: extra_comfyui_args.append("--gpu-only") elif high_memory_mode: extra_comfyui_args.append("--normalvram") - models_not_to_force_load = [ - "cascade", - ] + models_not_to_force_load.extend( + [ + "cascade", + ], + ) elif low_memory_mode: extra_comfyui_args.append("--novram") - models_not_to_force_load = [ - "sdxl", - "cascade", - ] + models_not_to_force_load.extend( + [ + "sdxl", + "cascade", + ], + ) with logger.catch(reraise=True): hordelib.initialise( diff --git a/requirements.dev.txt b/requirements.dev.txt index 976ab2bb..35c65242 100644 --- a/requirements.dev.txt +++ b/requirements.dev.txt @@ -1,9 +1,9 @@ -pytest==8.3.1 -mypy==1.11.0 -black==24.4.2 -ruff==0.5.4 -tox~=4.16.0 -pre-commit~=3.7.1 +pytest==8.3.3 +mypy==1.11.2 +black==24.8.0 +ruff==0.6.5 +tox~=4.18.1 +pre-commit~=3.8.0 build>=0.10.0 coverage>=7.2.7 diff --git a/requirements.rocm.txt b/requirements.rocm.txt index 7f2d185c..91395bc6 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -1,9 +1,9 @@ numpy==1.26.4 torch==2.3.1+rocm6.0 -horde_sdk~=0.14.3 +horde_sdk~=0.14.7 horde_safety~=0.2.3 -horde_engine~=2.15.0 +horde_engine~=2.15.1 horde_model_reference~=0.9.0 python-dotenv @@ -13,7 +13,7 @@ wheel python-Levenshtein -pydantic>=2.7.4 +pydantic>=2.9.2 typing_extensions requests StrEnum diff --git a/requirements.txt b/requirements.txt index bb0bc0a1..18ad68a9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ numpy==1.26.4 torch==2.3.1 -horde_sdk~=0.14.3 +horde_sdk~=0.14.7 horde_safety~=0.2.3 -horde_engine~=2.15.0 +horde_engine~=2.15.1 horde_model_reference>=0.9.0 python-dotenv @@ -12,7 +12,7 @@ semver python-Levenshtein -pydantic>=2.7.4 +pydantic>=2.9.2 typing_extensions requests StrEnum From ddc7bfeca1a51afa3e8111b7bc26b7eb61d2f526 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 21 Sep 2024 20:44:52 -0400 Subject: [PATCH 37/50] style: fix --- convert_config_to_env.py | 1 + horde_worker_regen/bridge_data/load_config.py | 2 +- horde_worker_regen/download_models.py | 2 +- horde_worker_regen/process_management/main_entry_point.py | 1 + horde_worker_regen/process_management/process_manager.py | 3 +-- horde_worker_regen/run_worker.py | 2 +- tests/test_bridge_data.py | 2 +- 7 files changed, 7 insertions(+), 6 deletions(-) diff --git a/convert_config_to_env.py b/convert_config_to_env.py index d6c738f4..c0712c60 100644 --- a/convert_config_to_env.py +++ b/convert_config_to_env.py @@ -12,6 +12,7 @@ import argparse from horde_model_reference.model_reference_manager import ModelReferenceManager + from horde_worker_regen.bridge_data.load_config import BridgeDataLoader, ConfigFormat diff --git a/horde_worker_regen/bridge_data/load_config.py 
b/horde_worker_regen/bridge_data/load_config.py index 56eced63..6da41d6d 100644 --- a/horde_worker_regen/bridge_data/load_config.py +++ b/horde_worker_regen/bridge_data/load_config.py @@ -6,13 +6,13 @@ from enum import auto from pathlib import Path +from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_sdk.ai_horde_api.ai_horde_clients import AIHordeAPIManualClient from horde_sdk.ai_horde_worker.model_meta import ImageModelLoadResolver from loguru import logger from ruamel.yaml import YAML from strenum import StrEnum -from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_worker_regen.bridge_data import AIWORKER_REGEN_PREFIX from horde_worker_regen.bridge_data.data_model import reGenBridgeData diff --git a/horde_worker_regen/download_models.py b/horde_worker_regen/download_models.py index d97ffaf7..df17f20c 100644 --- a/horde_worker_regen/download_models.py +++ b/horde_worker_regen/download_models.py @@ -12,9 +12,9 @@ def download_all_models( if not load_config_from_env_vars: load_env_vars_from_config() + from horde_model_reference.model_reference_manager import ModelReferenceManager from loguru import logger - from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_worker_regen.bridge_data.load_config import BridgeDataLoader, reGenBridgeData from horde_worker_regen.consts import BRIDGE_CONFIG_FILENAME diff --git a/horde_worker_regen/process_management/main_entry_point.py b/horde_worker_regen/process_management/main_entry_point.py index f72e28e1..04e539f5 100644 --- a/horde_worker_regen/process_management/main_entry_point.py +++ b/horde_worker_regen/process_management/main_entry_point.py @@ -1,6 +1,7 @@ from multiprocessing.context import BaseContext from horde_model_reference.model_reference_manager import ModelReferenceManager + from horde_worker_regen.bridge_data.data_model import reGenBridgeData from horde_worker_regen.process_management.process_manager import HordeWorkerProcessManager diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 5a352831..8e4ed9ab 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -51,8 +51,7 @@ from horde_sdk.ai_horde_api.fields import JobID from loguru import logger from pydantic import BaseModel, ConfigDict, RootModel, ValidationError -from typing import Literal, Union -from typing_extensions import override, TypeAlias +from typing_extensions import override import horde_worker_regen from horde_worker_regen.bridge_data.data_model import reGenBridgeData diff --git a/horde_worker_regen/run_worker.py b/horde_worker_regen/run_worker.py index fe584d27..8f48af80 100644 --- a/horde_worker_regen/run_worker.py +++ b/horde_worker_regen/run_worker.py @@ -20,9 +20,9 @@ def main(ctx: BaseContext, load_from_env_vars: bool = False, *, amd_gpu: bool = False) -> None: """Check for a valid config and start the driver ('main') process for the reGen worker.""" + from horde_model_reference.model_reference_manager import ModelReferenceManager from pydantic import ValidationError - from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_worker_regen.bridge_data.load_config import BridgeDataLoader, reGenBridgeData from horde_worker_regen.consts import BRIDGE_CONFIG_FILENAME from horde_worker_regen.process_management.main_entry_point import start_working diff --git 
a/tests/test_bridge_data.py b/tests/test_bridge_data.py index 4a24551c..8218e6f7 100644 --- a/tests/test_bridge_data.py +++ b/tests/test_bridge_data.py @@ -2,10 +2,10 @@ import pathlib import pytest +from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_sdk.generic_api.consts import ANON_API_KEY from ruamel.yaml import YAML -from horde_model_reference.model_reference_manager import ModelReferenceManager from horde_worker_regen.bridge_data.data_model import reGenBridgeData from horde_worker_regen.bridge_data.load_config import BridgeDataLoader, ConfigFormat From a52d5a39abb0189a8ee324d843c970393b611870 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 21 Sep 2024 21:40:05 -0400 Subject: [PATCH 38/50] fix: better process crash handling/logging --- .../process_management/process_manager.py | 45 +++++++++++-------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index 8e4ed9ab..aaa7f980 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -1754,9 +1754,11 @@ def receive_and_handle_process_messages(self) -> None: break if completed_job_info is None or completed_job_info.job_image_results is None: - raise ValueError( - f"Expected to find a completed job with ID {message.job_id} but none was found", + logger.error( + f"Expected to find a completed job with ID {message.job_id} but none was found" + "This should only happen when certain process crashes occur.", ) + continue num_images_censored = 0 num_images_csam = 0 @@ -1909,7 +1911,7 @@ def preload_models(self) -> bool: will_load_loras = job.payload.loras is not None and len(job.payload.loras) > 0 seamless_tiling_enabled = job.payload.tiling is not None and job.payload.tiling - available_process.safe_send_message( + if available_process.safe_send_message( HordePreloadInferenceModelMessage( control_flag=HordeControlFlag.PRELOAD_MODEL, horde_model_name=job.model, @@ -1917,21 +1919,23 @@ def preload_models(self) -> bool: seamless_tiling_enabled=seamless_tiling_enabled, sdk_api_job_info=job, ), - ) - available_process.last_control_flag = HordeControlFlag.PRELOAD_MODEL + ): + available_process.last_control_flag = HordeControlFlag.PRELOAD_MODEL - self._horde_model_map.update_entry( - horde_model_name=job.model, - load_state=ModelLoadState.LOADING, - process_id=available_process.process_id, - ) + self._horde_model_map.update_entry( + horde_model_name=job.model, + load_state=ModelLoadState.LOADING, + process_id=available_process.process_id, + ) - self._process_map.on_model_load_state_change( - process_id=available_process.process_id, - horde_model_name=job.model, - last_job_referenced=job, - ) + self._process_map.on_model_load_state_change( + process_id=available_process.process_id, + horde_model_name=job.model, + last_job_referenced=job, + ) + # Even if the message fails to send, we still want to return True so that we can let the main loop + # catch up and potentially replace the process. 
return True return False @@ -2188,11 +2192,12 @@ def unload_models_from_vram( process_info.last_job_referenced = None process_info.last_control_flag = HordeControlFlag.UNLOAD_MODELS_FROM_VRAM else: - process_info.safe_send_message( + if not process_info.safe_send_message( HordeControlMessage( control_flag=HordeControlFlag.UNLOAD_MODELS_FROM_VRAM, ), - ) + ): + self._replace_inference_process(process_info) def unload_from_ram(self, process_id: int) -> None: """Unload models from a process. @@ -3181,7 +3186,7 @@ async def api_job_pop(self) -> None: logger.error( "Too many consecutive failed jobs, pausing job pops. " "Please look into what happened and let the devs know. ", - "Waiting 180 seconds...", + f"Waiting {self._too_many_consecutive_failed_jobs_wait_time} seconds...", ) if self.bridge_data.exit_on_unhandled_faults: logger.error("Exiting due to exit_on_unhandled_faults being enabled") @@ -3959,6 +3964,9 @@ def print_status_method(self) -> None: f"{self._too_many_consecutive_failed_jobs_wait_time} seconds must pass before resuming.", ) + if self._shutting_down: + logger.warning("Shutting down after current jobs are finished...") + self._last_status_message_time = time.time() _bridge_data_loop_interval = 1.0 @@ -4268,7 +4276,6 @@ def timed_unset_recently_recovered() -> None: if self._check_and_replace_process(process_info, timeout, state, error_message): any_replaced = True self._recently_recovered = True - break if any_replaced: threading.Thread(target=timed_unset_recently_recovered).start() From f5ba73c70f506b32be6ae5547848ec2b264f7d91 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sat, 21 Sep 2024 21:53:24 -0400 Subject: [PATCH 39/50] fix: don't default to `low_memory_mode` --- horde_worker_regen/process_management/worker_entry_points.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/horde_worker_regen/process_management/worker_entry_points.py b/horde_worker_regen/process_management/worker_entry_points.py index af8abc71..c7e77941 100644 --- a/horde_worker_regen/process_management/worker_entry_points.py +++ b/horde_worker_regen/process_management/worker_entry_points.py @@ -20,7 +20,7 @@ def start_inference_process( disk_lock: Lock, aux_model_lock: Lock, *, - low_memory_mode: bool = True, + low_memory_mode: bool = False, high_memory_mode: bool = False, very_high_memory_mode: bool = False, amd_gpu: bool = False, From 5e01bfe1985f51f0bdbf454102702f3700d65876 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sun, 22 Sep 2024 08:31:52 -0400 Subject: [PATCH 40/50] fix: detect more deadlocks; fewer crashes w/ unresponsive logic - More fallback logic if there are jobs popped, processes available, but nothing happening. - Resolves certain problems with the unresponsive logic - The case of it ending all jobs after a long period of "No Job" messages from the server followed by successful pops.
- Now no longer shuts down in error while processes are restarting --- .../process_management/process_manager.py | 150 ++++++++++++------ 1 file changed, 98 insertions(+), 52 deletions(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index aaa7f980..f5b9d6f3 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -1210,13 +1210,13 @@ def enable_performance_mode(self) -> None: def is_time_for_shutdown(self) -> bool: """Return true if it is time to shut down.""" - if ( - all( - inference_process.last_process_state == HordeProcessState.PROCESS_ENDING - or inference_process.last_process_state == HordeProcessState.PROCESS_ENDED - for inference_process in self._process_map.values() - ) - and not self._recently_recovered + if self._recently_recovered: + return False + + if all( + inference_process.last_process_state == HordeProcessState.PROCESS_ENDING + or inference_process.last_process_state == HordeProcessState.PROCESS_ENDED + for inference_process in self._process_map.values() ): return True @@ -2900,6 +2900,9 @@ async def api_submit_job(self) -> None: _last_job_pop_time = 0.0 """The time at which the last job was popped from the API.""" + def _last_pop_recently(self) -> bool: + return (time.time() - self._last_job_pop_time) < 10 + _last_job_submitted_time = time.time() """The time at which the last job was submitted to the API.""" @@ -3781,9 +3784,59 @@ async def _process_control_loop(self) -> None: _last_deadlock_detected_time = 0.0 _in_deadlock = False + _in_queue_deadlock = False + _last_queue_deadlock_detected_time = 0.0 + _queue_deadlock_model: str | None = None + _queue_deadlock_process_id: int | None = None def detect_deadlock(self) -> None: """Detect if there are jobs in the queue but no processes doing anything.""" + + def _print_deadlock_info() -> None: + logger.debug(f"Jobs in queue: {len(self.job_deque)}") + logger.debug(f"Jobs in progress: {len(self.jobs_in_progress)}") + logger.debug(f"Jobs pending safety check: {len(self.jobs_pending_safety_check)}") + logger.debug(f"Jobs being safety checked: {len(self.jobs_being_safety_checked)}") + logger.debug(f"Jobs completed: {len(self.completed_jobs)}") + logger.debug(f"Jobs faulted: {self._num_jobs_faulted}") + logger.debug(f"horde_model_map: {self._horde_model_map}") + logger.debug(f"process_map: {self._process_map}") + + if self._last_pop_recently(): + # We just popped a job, lets allow some time for gears to start turning + # before we assume we're in a deadlock + return + + if ( + not self._in_queue_deadlock + and (self._process_map.num_busy_processes() == 0 and len(self.job_deque) > 0) + and len(self.jobs_in_progress) == 0 + ): + + currently_loaded_models = set() + model_process_map: dict[str, int] = {} + for process in self._process_map.values(): + if process.loaded_horde_model_name is not None: + currently_loaded_models.add(process.loaded_horde_model_name) + model_process_map[process.loaded_horde_model_name] = process.process_id + + for job in self.job_deque: + if job.model in currently_loaded_models: + self._in_queue_deadlock = True + self._last_queue_deadlock_detected_time = time.time() + self._queue_deadlock_model = job.model + self._queue_deadlock_process_id = model_process_map[job.model] + + elif self._in_queue_deadlock and (self._last_queue_deadlock_detected_time + 10) < time.time(): + logger.debug("Queue deadlock detected") + _print_deadlock_info() + 
logger.debug(f"Model causing deadlock: {self._queue_deadlock_model}") + if self._queue_deadlock_process_id is not None: + self._replace_inference_process(self._process_map[self._queue_deadlock_process_id]) + self._in_queue_deadlock = False + self._queue_deadlock_model = None + self._queue_deadlock_process_id = None + if ( (not self._in_deadlock) and (len(self.job_deque) > 0 or len(self.jobs_in_progress) > 0 or len(self.jobs_lookup) > 0) @@ -3792,12 +3845,7 @@ def detect_deadlock(self) -> None: self._last_deadlock_detected_time = time.time() self._in_deadlock = True logger.debug("Deadlock detected") - logger.debug(f"Jobs in queue: {len(self.job_deque)}") - logger.debug(f"Jobs in progress: {len(self.jobs_in_progress)}") - logger.debug(f"Jobs pending safety check: {len(self.jobs_pending_safety_check)}") - logger.debug(f"Jobs being safety checked: {len(self.jobs_being_safety_checked)}") - logger.debug(f"Jobs completed: {len(self.completed_jobs)}") - logger.debug(f"Jobs faulted: {self._num_jobs_faulted}") + _print_deadlock_info() elif ( self._in_deadlock and (self._last_deadlock_detected_time + 10) < time.time() @@ -4192,51 +4240,14 @@ def _abort(self) -> None: def replace_hung_processes(self) -> bool: """Replaces processes that haven't checked in since `process_timeout` seconds in bridgeData.""" - now = time.time() - - import threading - - def timed_unset_recently_recovered() -> None: - time.sleep(60) - self._recently_recovered = False - - # If every process hasn't done anything for a while or if we haven't submitted a job for a while, - # AND the last job pop returned a job, we're in a black hole and we need to exit because none of the ways to - # recover worked - if ( - all( - ((now - process_info.last_received_timestamp) > self.bridge_data.process_timeout) - for process_info in self._process_map.values() - ) - or ((now - self._last_job_submitted_time) > self.bridge_data.process_timeout) - ) and not (self._last_pop_no_jobs_available or self._recently_recovered): - self._purge_jobs() - - if self.bridge_data.exit_on_unhandled_faults: - logger.error("All processes have been unresponsive for too long, exiting.") - - self._abort() - logger.error("Exiting due to exit_on_unhandled_faults being enabled") - - return True - - logger.error("All processes have been unresponsive for too long, attempting to recover.") - self._recently_recovered = True - - for process_info in self._process_map.values(): - if process_info.process_type == HordeProcessType.INFERENCE: - self._replace_inference_process(process_info) - - threading.Thread(target=timed_unset_recently_recovered).start() - - return True - if self._shutting_down: return False if self._last_pop_no_jobs_available or self._recently_recovered: return False + now = time.time() + any_replaced = False for process_info in self._process_map.values(): if self._process_map.is_stuck_on_inference(process_info.process_id): @@ -4277,6 +4288,41 @@ def timed_unset_recently_recovered() -> None: any_replaced = True self._recently_recovered = True + import threading + + def timed_unset_recently_recovered() -> None: + time.sleep(self.bridge_data.preload_timeout) + self._recently_recovered = False + + # If every process hasn't done anything for a while or if we haven't submitted a job for a while, + # AND the last job pop returned a job, we're in a black hole and we need to exit because none of the ways to + # recover worked + if ( + all( + ((now - process_info.last_received_timestamp) > self.bridge_data.process_timeout) + for process_info in self._process_map.values() + 
) + or ((now - self._last_job_submitted_time) > self.bridge_data.process_timeout) + ) and not (self._last_pop_no_jobs_available or self._recently_recovered): + self._purge_jobs() + + if self.bridge_data.exit_on_unhandled_faults: + logger.error("All processes have been unresponsive for too long, exiting.") + + self._abort() + logger.error("Exiting due to exit_on_unhandled_faults being enabled") + + return True + + logger.error("All processes have been unresponsive for too long, attempting to recover.") + self._recently_recovered = True + + for process_info in self._process_map.values(): + if process_info.process_type == HordeProcessType.INFERENCE: + self._replace_inference_process(process_info) + + threading.Thread(target=timed_unset_recently_recovered).start() + if any_replaced: threading.Thread(target=timed_unset_recently_recovered).start() From d8178846315c29b3194e9636fc24e474762ad29f Mon Sep 17 00:00:00 2001 From: tazlin Date: Sun, 22 Sep 2024 08:36:39 -0400 Subject: [PATCH 41/50] chore: version bump --- horde_worker_regen/__init__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/horde_worker_regen/__init__.py b/horde_worker_regen/__init__.py index 2b7db86f..0a63a3ec 100644 --- a/horde_worker_regen/__init__.py +++ b/horde_worker_regen/__init__.py @@ -8,7 +8,7 @@ ASSETS_FOLDER_PATH = Path(__file__).parent / "assets" -__version__ = "9.0.0" +__version__ = "9.0.1" import pkg_resources # noqa: E402 diff --git a/pyproject.toml b/pyproject.toml index 5f37eaf8..f5bd7cfb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "horde_worker_regen" -version = "9.0.0" +version = "9.0.1" description = "Allows you to connect to the AI Horde and generate images for users." authors = [ {name = "tazlin", email = "tazlin.on.github@gmail.com"}, From f9729796ecfff6cf1a0224fcd58faec38ddb466f Mon Sep 17 00:00:00 2001 From: tazlin Date: Sun, 22 Sep 2024 14:15:03 -0400 Subject: [PATCH 42/50] fix: use `horde_engine==2.15.2` --- .pre-commit-config.yaml | 2 +- horde-bridge.cmd | 2 +- requirements.rocm.txt | 2 +- requirements.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e0b37ea0..016cdff1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,7 +40,7 @@ repos: - horde_safety==0.2.3 - torch==2.3.1 - ruamel.yaml - - horde_engine==2.15.1 + - horde_engine==2.15.2 - horde_sdk==0.14.7 - horde_model_reference==0.9.0 - semver diff --git a/horde-bridge.cmd b/horde-bridge.cmd index 4d8cbdd4..a03ece1b 100644 --- a/horde-bridge.cmd +++ b/horde-bridge.cmd @@ -5,7 +5,7 @@ cd /d %~dp0 call runtime python -s -m pip -V call python -s -m pip uninstall hordelib -call python -s -m pip install horde_sdk~=0.14.7 horde_model_reference~=0.9.0 horde_engine~=2.15.1 horde_safety~=0.2.3 -U +call python -s -m pip install horde_sdk~=0.14.7 horde_model_reference~=0.9.0 horde_engine~=2.15.2 horde_safety~=0.2.3 -U if %ERRORLEVEL% NEQ 0 ( echo "Please run update-runtime.cmd." 
diff --git a/requirements.rocm.txt b/requirements.rocm.txt index 91395bc6..af71193b 100644 --- a/requirements.rocm.txt +++ b/requirements.rocm.txt @@ -3,7 +3,7 @@ torch==2.3.1+rocm6.0 horde_sdk~=0.14.7 horde_safety~=0.2.3 -horde_engine~=2.15.1 +horde_engine~=2.15.2 horde_model_reference~=0.9.0 python-dotenv diff --git a/requirements.txt b/requirements.txt index 18ad68a9..1f3f4f75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ torch==2.3.1 horde_sdk~=0.14.7 horde_safety~=0.2.3 -horde_engine~=2.15.1 +horde_engine~=2.15.2 horde_model_reference>=0.9.0 python-dotenv From 2f88886251ea335ea7876934a7d83ce83c3811e7 Mon Sep 17 00:00:00 2001 From: tazlin Date: Sun, 22 Sep 2024 15:59:40 -0400 Subject: [PATCH 43/50] fix: reset failed job counter after conseq. pause --- horde_worker_regen/process_management/process_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py index f5b9d6f3..30018182 100644 --- a/horde_worker_regen/process_management/process_manager.py +++ b/horde_worker_regen/process_management/process_manager.py @@ -3180,8 +3180,8 @@ async def api_job_pop(self) -> None: cur_time - self._too_many_consecutive_failed_jobs_time > self._too_many_consecutive_failed_jobs_wait_time ): + self._consecutive_failed_jobs = 0 self._too_many_consecutive_failed_jobs = False - self._too_many_consecutive_failed_jobs_time = 0 logger.debug("Resuming job pops after too many consecutive failed jobs") return From 9aa79b72d80044f68ba58c1e0a0f78a8b91326cc Mon Sep 17 00:00:00 2001 From: tazlin Date: Mon, 23 Sep 2024 10:06:48 -0400 Subject: [PATCH 44/50] feat: time spent w/o jobs logging Tracks the time spent without any available jobs. This will help worker operators identify potential issues with their configuration. A warning will be logged if the worker spends more than 5 minutes without any jobs, suggesting possible actions to increase job demand. 
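[Editor's note] The diff that follows is the authoritative change; purely as an illustration of the bookkeeping described in this commit message, a minimal, hypothetical sketch of "time without jobs" accounting (simplified names, not the worker's actual classes) might look like this:

```python
import time


class NoJobsTracker:
    """Illustrative sketch only: accumulate how long a worker has gone without jobs."""

    def __init__(self) -> None:
        self._last_no_jobs_time = 0.0  # 0.0 means "not currently in a no-jobs streak"
        self.time_spent_no_jobs = 0.0  # cumulative seconds spent without any jobs

    def on_pop_returned_no_jobs(self) -> None:
        now = time.time()
        if self._last_no_jobs_time == 0.0:
            self._last_no_jobs_time = now
        # Add the time since the previous empty pop to the running total.
        self.time_spent_no_jobs += now - self._last_no_jobs_time
        self._last_no_jobs_time = now

    def on_pop_returned_job(self) -> None:
        # A job arrived: end the streak but keep the cumulative total for reporting.
        self._last_no_jobs_time = 0.0
```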
---
 .../process_management/process_manager.py | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/horde_worker_regen/process_management/process_manager.py b/horde_worker_regen/process_management/process_manager.py
index 30018182..1f67fa44 100644
--- a/horde_worker_regen/process_management/process_manager.py
+++ b/horde_worker_regen/process_management/process_manager.py
@@ -3163,6 +3163,8 @@ async def _get_source_images(self, job_pop_response: ImageGenerateJobPopResponse
         return job_pop_response
 
     _last_pop_no_jobs_available: bool = False
+    _last_pop_no_jobs_available_time: float = 0.0
+    _time_spent_no_jobs_available: float = 0.0
     _too_many_consecutive_failed_jobs: bool = False
     _too_many_consecutive_failed_jobs_time: float = 0.0
     _too_many_consecutive_failed_jobs_wait_time = 180
@@ -3397,12 +3399,20 @@ async def api_job_pop(self) -> None:
 
         if job_pop_response.id_ is None:
             logger.info(info_string)
+            cur_time = time.time()
+            if self._last_pop_no_jobs_available_time == 0.0:
+                self._last_pop_no_jobs_available_time = cur_time
+
+            self._time_spent_no_jobs_available += cur_time - self._last_pop_no_jobs_available_time
+            self._last_pop_no_jobs_available_time = cur_time
+
             self._last_pop_no_jobs_available = True
             return
 
         self.job_faults[job_pop_response.id_] = []
 
         self._last_pop_no_jobs_available = False
+        self._last_pop_no_jobs_available_time = 0.0
 
         logger.info(
             f"Popped job {job_pop_response.id_} "
@@ -3949,6 +3959,7 @@ def print_status_method(self) -> None:
                 f"faulted: {self._num_jobs_faulted}",
                 f"slow_jobs: {self._num_job_slowdowns}",
                 f"process_recoveries: {self._num_process_recoveries}",
+                f"{self._time_spent_no_jobs_available:.2f} seconds without jobs",
             ],
         )
 
@@ -4008,10 +4019,17 @@ def print_status_method(self) -> None:
                 "Please check your logs and configuration.",
             )
             logger.error(
-                f"Time since last job failure: {time_since_failure:.2f}s). "
+                f"Time since last job failure: {time_since_failure:.2f}s. "
                 f"{self._too_many_consecutive_failed_jobs_wait_time} seconds must pass before resuming.",
             )
 
+        if self._time_spent_no_jobs_available > 60 * 5:
+            logger.warning(
+                "Your worker spent more than 5 minutes without jobs. This may be due to low demand. "
+                "However, offering more models or increasing your max_power may help increase the number of jobs "
+                "you receive.",
+            )
+
         if self._shutting_down:
             logger.warning("Shutting down after current jobs are finished...")

From c5f1bd9e07997217d052cd0aecf2f699f17f5d0f Mon Sep 17 00:00:00 2001
From: tazlin
Date: Mon, 23 Sep 2024 10:36:14 -0400
Subject: [PATCH 45/50] chore: add suggested settings in README.md

---
 README.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/README.md b/README.md
index 7efe0387..78c075ef 100644
--- a/README.md
+++ b/README.md
@@ -102,6 +102,52 @@ You can double click the provided script files below from a file explorer or run
 1. Make a copy of `bridgeData_template.yaml` to `bridgeData.yaml`
 1. Edit `bridgeData.yaml` and follow the instructions within to fill in your details.
 
+#### Suggested settings
+
+Models are loaded as needed and just-in-time. You can offer as many models as you want, **provided you have an SSD, at least 32GB of RAM, and at least 8GB of VRAM (see [Important Info](#important-info))**. Workers with HDDs are not recommended at this time; those with HDDs should run exactly one model. A typical SD1.5 model is around 2GB, while a typical SDXL model is around 7GB. Offering `all` models is currently around 700GB in total, and we commit to keeping that number below 1TB with any future changes.
+
+> Note: We suggest you disable any 'sleep' or reduced power modes for your system while the worker is running.
+
+- If you have a **24GB+ VRAM card**:
+  ```yaml
+  safety_on_gpu: true
+  high_memory_mode: true
+  high_performance_mode: true
+  post_process_job_overlap: true
+  unload_models_from_vram_often: false
+  max_threads: 1 # If you have Flux/Cascade loaded, otherwise 2 max
+  queue_size: 2 # You can set to 3 if you have 64GB or more of RAM
+  max_batch: 8 # or higher
+  ```
+
+- If you have a **12GB-16GB card**:
+  ```yaml
+  safety_on_gpu: true # Consider setting to `false` if offering Cascade or Flux
+  high_memory_mode: true
+  moderate_performance_mode: true
+  unload_models_from_vram_often: false
+  max_threads: 1
+  max_batch: 4 # or higher
+  ```
+
+- If you have an **8GB-10GB VRAM card**:
+  ```yaml
+  queue_size: 1 # no higher than 1, **or** only offer Flux
+  safety_on_gpu: false
+  max_threads: 1
+  max_power: 32 # no higher than 32
+  max_batch: 4 # no higher than 4
+  allow_post_processing: false # If offering SDXL or Flux, otherwise you may set to true
+  allow_sdxl_controlnet: false
+  ```
+
+  - Be sure to shut down every VRAM-consuming application you can, and do not use the computer for any other purpose while the worker is running.
+
+- Workers with **low-end cards or low performance for other reasons**:
+  ```yaml
+  extra_slow_worker: true
+  # gives you considerably more time to finish each job, but requests will not go to your worker unless the requester opts in (even anon users do not use extra_slow_workers by default). You should only consider using this if you have historically had less than 0.3 MPS/S or less than 3000 kudos/hr consistently **and** you are sure the worker is otherwise configured correctly.
+  limit_max_steps: true
+  # reduces the maximum total number of steps in a single job you will receive based on the model baseline.
+  preload_timeout: 120
+  # gives you more time to load models off disk. **Note**: Abusing this value can lead to a major loss of kudos and may also lead to maintenance mode, even with `extra_slow_worker: true`.
+  ```
+
 ### Starting/stopping

From b54376fe04e8d9eede43b0ea5e0e96f546ad8c9b Mon Sep 17 00:00:00 2001
From: tazlin
Date: Mon, 23 Sep 2024 10:36:35 -0400
Subject: [PATCH 46/50] chore: update `bridgeData_template.yaml`

---
 bridgeData_template.yaml | 307 +++++++++++++++++++++------------------
 1 file changed, 168 insertions(+), 139 deletions(-)

diff --git a/bridgeData_template.yaml b/bridgeData_template.yaml
index 157b8bcc..e87b2a49 100644
--- a/bridgeData_template.yaml
+++ b/bridgeData_template.yaml
@@ -1,259 +1,288 @@
-## Common for all worker Types
+## Common for all worker types
 
-# The horde url
+# !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!!
+# See also the readme's "Suggested settings" section for recommended settings. !!!
+# !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!! !!!
+
+# The Horde URL. Do not change this unless you are using a custom Horde.
 horde_url: "https://aihorde.net"
 
-# The api_key identifies a unique user in the horde
-# Visit https://stablehorde.net/register to create one before you can join
+# The API key identifies a unique user in the Horde
+# Visit https://aihorde.net/register to create one before you can join
 api_key: "0000000000"
 
-# Put other users whose prompts you want to prioritize.
-# The owner's username is always included so you don't need to add it here if you use the key specified in `api_key` for requests +# List of usernames whose prompts you want to prioritize. +# The owner's username is always included, so you don't need to add it here if you use the key specified in `api_key` for requests. priority_usernames: [] -# The amount of parallel jobs to pick up for the horde. -# Only high end cards (e.g, 3080 or better) benefit from this setting. +# The maximum number of parallel jobs to run at the same time. +# Only high-end cards (e.g., 3080 or better) benefit from this setting. # If you have a 20xx or earlier, or a xx60/xx70, do not change this setting from 1. max_threads: 1 +# 24GB+ VRAM: 1 (2 max if Flux/Cascade loaded) +# 12GB-16GB VRAM: 1 +# 8GB-10GB VRAM: 1 -# We will keep this many requests in the queue so we can start working as soon as a thread is available -# This generally should be or 1 or 2. You should never set this higher than 2 if your max_threads is 2. +# Number of requests to keep in the queue to start working as soon as a thread is available. +# Generally should be 1 or 2. Never set this higher than 2 if your max_threads is 2. +# Warning: Increasing this value directly increases system RAM usage significantly. queue_size: 1 +# 24GB+ VRAM: 2 (3 if 64GB+ RAM) +# 8GB-10GB VRAM: 1 (max or only offer flux) -# This will try to pull these many jobs per request and perform batched inference. -# This is way more optimized than doing them 1 by 1, but is slower. -# Keep in mind, that the horde will not give your max batch at your max resolution -# In order to avoid running out of VRAM. -# The Horde will assume you can fulfil your max batch at HALF you max resolution. -# So make sure you can generate your max_batch @ max_power/2 -# Over your half max_power, AI Horde will smartly assign only as much batches -# as it calculates you can achieve. If you start running out of VRAM, reduce -# max_power or max_batch. +# Number of jobs to pull per request and perform batched inference. +# More optimized than doing them one by one but slower. +# Ensure you can generate your max_batch at half your max_power. max_batch: 1 +# 24GB+ VRAM: 8 or higher +# 12GB-16GB VRAM: 4 or higher +# 8GB-10GB VRAM: 4 (no higher than 4) - -# When Enabled will run CLIP model (Checking for potential CSAM or NSFW) on GPU insted of CPU -# Enable this on cards with 12gb or more VRAM to increase the rate you complete jobs -# You can enable this on cards with less VRAM if you do not load SD2.0 or SDXL models, and keep your max_power low (<32) +# Run CLIP model (checking for potential CSAM or NSFW) on GPU instead of CPU. +# Enable this on cards with 12GB or more VRAM to increase job completion rate. +# ~1.2GB of VRAM overhead safety_on_gpu: false +# 24GB+ VRAM: true +# 12GB-16GB VRAM: true (consider false if offering Cascade or Flux) +# 8GB-10GB VRAM: false - -# If set to True, this worker will not only pick up jobs where the user has the required kudos upfront. -# Effectively this will exclude all anonymous accounts, and registered accounts who haven't contributed. -# Users in priority_usernames and trusted users will bypass this restriction +# Only pick up jobs where the user has the required kudos upfront. +# Excludes all anonymous accounts and registered accounts who haven't contributed. require_upfront_kudos: false -# If set, this worker will use this civitai API token when downloading any resources from civitai. 
-# This is required in order to provide LoRas/TIs (or other resources) -# which are marked as requiring a civitai token to download. -# -# You can get your civitai API Key from https://civitai.com/user/account (look for 'Add API Key') -# -# Remove the # from the line below and add your civitai API token to enable this feature. +# Use this Civitai API token when downloading resources from Civitai. +# Required for providing LoRas/TIs or other resources marked as requiring a Civitai token. +# Get your Civitai API Key from https://civitai.com/user/account (look for 'Add API Key'). +# Remove the # from the line below and add your Civitai API token to enable this feature. # civitai_api_token: ####################################### ## Dreamer (Stable Diffusion Worker) ## ####################################### -# The worker name to use when running a dreamer instance. +# Worker name for running a Dreamer instance. dreamer_name: "An Awesome Dreamer" -# This is representation of your max resolution (max pixels) supported. -# The formula is `64 * 64 * 8 * max_power` (giving total pixels) -# e.g.: -# 8 = 512x512 -# 18 = 768x768 -# 32 = 1024x1024 -# 50 = 1280x1280 -# ... - +# Max resolution (max pixels) supported. +# Formula: `64 * 64 * 8 * max_power` (total pixels) +# Examples: +# 8 = 512x512 +# 18 = 768x768 +# 32 = 1024x1024 +# 50 = 1280x1280 max_power: 8 - -# A list of words which you do not want to your worker to accept if they are in the prompt +# Suggested values: +# 8GB-10GB VRAM: 32 (no higher than 32) +# 12GB-16GB VRAM: 32-64 (no higher than 64) +# 24GB+ VRAM: 64-128 (no higher than 128) + +# Use more VRAM on average but reduce time spent loading models. +high_memory_mode: false +# Suggested values: +# 24GB+ VRAM: true +# 12GB-16GB VRAM: true (consider false if offering Cascade or Flux) + +# Fill local queue much faster but may be penalized by the server if you cannot keep up with jobs. +high_performance_mode: false +# Suggested values: +# 24GB+ VRAM: true + +# Fill local queue somewhat faster but may be penalized by the server if you cannot keep up with jobs. +# Overridden by high_performance_mode. +moderate_performance_mode: false +# Suggested values: +# 12GB-16GB VRAM: true + +# Start processing the next job before the current job finishes post-processing. +# Reduces time between jobs but may cause crashes on low RAM or VRAM systems. +post_process_job_overlap: false +# Suggested values: +# 24GB+ VRAM: true + +# Aggressively unload models from VRAM when not in use. +# Should be true for most workers with GPUs with less than 16GB of VRAM. +unload_models_from_vram_often: true +# Suggested values: +# 24GB+ VRAM: false +# 12GB-16GB VRAM: false +# 8GB-10GB VRAM: true + +# List of words to reject if they appear in the prompt. blacklist: [] -# If you do not want to serve NSFW images, set this to false. +# Serve NSFW images if true. nsfw: true -# If you want +# Censor NSFW images if true. censor_nsfw: false -# A list of words for which you always want to censor, even if `nsfw` is true. +# List of words to always censor, even if `nsfw` is true. censorlist: [] -# Accept jobs which use a user-supplied image. +# Accept jobs using a user-supplied image. allow_img2img: true -# Accept jobs which use a user-supplied image and an inpainting specific model. +# Accept jobs using a user-supplied image and an inpainting-specific model. # Forced to false if `allow_img2img` is false. allow_painting: true -# Allow user request which are from behind VPNs. 
-# Note: The worker does not directly interact with user IPs - it only interacts with the stablehorde API. +# Allow user requests from behind VPNs. +# Note: The worker does not directly interact with user IPs - it only interacts with the StableHorde API. allow_unsafe_ip: true -# Allow upscaling, facefixer and other post-generation features to be performed by the worker. +# Allow upscaling, facefixer, and other post-generation features. allow_post_processing: true +# 8GB-10GB VRAM: false (if offering SDXL or Flux, otherwise true) -# Allow controlnet jobs to be done by this worker. -# Note: There is additional RAM/VRAM overhead with this option. Low VRAM cards (<6gb) should be cautious to enable this. +# Allow ControlNet jobs. +# Note: Additional RAM/VRAM overhead. Low VRAM cards (<6GB) should be cautious. allow_controlnet: false -# Allow SDXL jobs with high memory add-ons like controlnet or transparency to be done by this worker. -# Note: There is significant additional RAM/VRAM overhead with this option. Medium VRAM cards (<12gb) should be cautious to enable this. -# Note that if this is true, allow_controlnet must also be true +# Allow SDXL jobs with high memory add-ons like ControlNet or transparency. +# Note: Significant additional RAM/VRAM overhead. Medium VRAM cards (<12GB) should be cautious. +# Note that if this is true, allow_controlnet must also be true. allow_sdxl_controlnet: false +# 16GB+ VRAM: true +# 8GB-10GB VRAM: false -# Allow LoRas to be used. This requires that you have a fast internet connection. -# LoRas will be downloaded on demand. `max_lora_cache_size` controls how many gigabytes you will keep downloaded. -# 5gb of preselected LoRas are always downloaded the first time you start the worker with this setting. +# Allow LoRas to be used. Requires a fast internet connection. +# LoRas will be downloaded on demand. `max_lora_cache_size` controls how many gigabytes to keep downloaded. +# 5GB of preselected LoRas are always downloaded the first time you start the worker with this setting. +# Note that there can be a significant delay when downloading LoRas causing GPU downtime. allow_lora: false -# The number of gigabytes of LoRas too keep cached. This is in addition to the preselected LoRas. -max_lora_cache_size: 10 # In gigabytes. Min is 10. +# Delete any unknown LoRas from the loras folder when `download_models.py` is run. +# Warning: This option will delete any LoRas not in the model reference, including custom LoRas. +purge_loras_on_download: false + +# Number of gigabytes of LoRas to keep cached. Minimum is 10GB. +max_lora_cache_size: 10 -# Set this to true, if your worker is extraordinarily slow, such below 0.1 mps/s -# When your worker is set as extra slow, users can freely choose to skip it when requesting generations -# However you get an the job timeout for generations you pick up is tripled -# And the whole request itself receives triple the expiry timeout (60 mins, instead of 20 mins) when a slow worker picks it up. -# We hope this can help people onboard older GPU generations to serve people who do not need immediate generations -# IMPORTANT: This option is NOT meant to allow CPU workers. It's just for slow GPUs. There's still A timeout. +# Set to true if your worker is extraordinarily slow (below 0.1 mps/s). +# Users can choose to skip it when requesting generations, but job timeout and request expiry timeout are tripled. 
extra_slow_worker: false +# Low-end cards or low performance: true -# Set this to true to make your worker only pick up jobs requesting steps lower than the model's average steps. -# This is meant to be used for slower workers who can handle a few steps within the limits but might time out if someone asks for 100 or more -# You can also use this if you don't want to serve request with an extraordinary amount of steps whatsoever. +# Only pick up jobs requesting steps lower than the model's average steps. +# Useful for slower workers or if you don't want to serve requests with an extraordinary number of steps. limit_max_steps: false +# Low-end cards or low performance: true -# Automatically determine the models which have the highest queue and offer those. +# Automatically determine the models with the highest queue and offer those. dynamic_models: false # Currently unused in reGen -# The number of models to offer when `dynamic_models` is true. +# Number of models to offer when `dynamic_models` is true. number_of_dynamic_models: 0 # Currently unused in reGen -# If `dynamic_models` is true, the maximum number of models to download automatically for that purpose. +# Maximum number of models to download automatically for `dynamic_models`. max_models_to_download: 10 # Currently unused in reGen -# The frequency (in seconds) to output worker summary stats, such as kudos per hour. +# Frequency (in seconds) to output worker summary stats, such as kudos per hour. # Set to zero to disable stats output completely. stats_output_frequency: 30 - -# The location in which stable diffusion ckpt models are stored +# Location where models are stored. cache_home: "./models/" -# The location of the temp directory, also used for the model cache +# Location of the temp directory, also used for the model cache. temp_dir: "./tmp" # Currently unused in reGen - -# Always download models when required without prompting +# Always download models when required without prompting. always_download: true # Currently unused in reGen -# Disable the terminal GUI, which displays information about the worker and the horde. +# Disable the terminal GUI, which displays information about the worker and the Horde. disable_terminal_ui: false # Currently unused in reGen - # Obsolete vram_to_leave_free: "80%" # Currently unused in reGen -# The target amount of system ram to keep free. -# The worker only makes a best effort. You still have to avoid using up too much RAM with other programs. +# Target amount of system RAM to keep free. +# The worker only makes a best effort. Avoid using too much RAM with other programs. ram_to_leave_free: "80%" # Currently unused in reGen # Obsolete disable_disk_cache: false # Currently unused in reGen -# The models to use. -# Instead of a model name you may use of any of the following magic constants: -# "ALL" - means load all possible models. Expect this to take over 1TB of space! -# "TOP n" - load the top "N" most popular models, use for example, "top 5" or "top 3", etc. -# "BOTTOM n" - load the bottom "N" models (i.e., the least popular N models) use for example, "bottom 5" or "bottom 3", etc. -# -# "ALL SD15 MODELS" - All Stable Diffusion 1.5 models -# "ALL SD21 MODELS" - All Stable Diffusion 2.0/2.1 models -# "ALL SDXL MODELS" - All Stable Diffusion XL models -# "ALL INPAINTING MODELS" - All models marked as being for inpainting -# -# "ALL SFW MODELS" - All models marked as being SFW -# "ALL NSFW MODELS" - All models marked as being NSFW -# -# (not currently supported) "ALL