Commit

Merge pull request #2207 from FedML-AI/alay_and_raphael/debug/deployment_at_scale

Add logs in occupy_gpu_ids, and funcs in hardware_utils for debugging
alaydshah authored Jul 2, 2024
2 parents babf08c + 084781f commit 62c4bb8
Showing 2 changed files with 32 additions and 0 deletions.
25 changes: 25 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/hardware_utils.py
@@ -11,6 +11,29 @@
GPU_CARD_UTILS = [NvidiaGPUtil, QualcommNPUtil]


# This function is just for debugging, can be removed at later point
def get_gpu_list_and_realtime_gpu_available_ids() -> (List[dict], List[int]):
    gpu_list = HardwareUtil.get_gpus()
    gpu_count = len(gpu_list)
    realtime_available_gpu_ids = HardwareUtil.get_available_gpu_ids(order='memory', limit=gpu_count, max_load=0.01,
                                                                    max_memory=0.01)
    return gpu_list, realtime_available_gpu_ids

# This function is just for debugging, can be removed at later point
def trim_unavailable_gpu_ids(gpu_ids) -> List[int]:
    # Trim the gpu ids based on the realtime available gpu id list.
    available_gpu_ids = [int(gpu_id) for gpu_id in gpu_ids]
    gpu_list, realtime_available_gpu_ids = get_gpu_list_and_realtime_gpu_available_ids()
    unavailable_gpu_ids = list()

    for gpu_id in available_gpu_ids:
        if gpu_id not in realtime_available_gpu_ids:
            unavailable_gpu_ids.append(gpu_id)

    trimmed_gpu_ids = list(set(available_gpu_ids) - set(unavailable_gpu_ids))
    return trimmed_gpu_ids.copy()


class HardwareUtil(metaclass=Singleton):
    __gpu_util: Optional[GPUCardUtil] = None

@@ -60,6 +83,8 @@ def get_docker_gpu_ids_by_container_name(container_name: str, docker_client: Doc
if __name__ == "__main__":
    gpus = HardwareUtil.get_gpus()
    get_available_gpu_cards = HardwareUtil.get_available_gpu_ids(limit=len(gpus))
    trimmed_gpu_ids = trim_unavailable_gpu_ids(get_available_gpu_cards)
    print(trimmed_gpu_ids)
    device_mapping = HardwareUtil.get_docker_gpu_device_mapping(get_available_gpu_cards, len(get_available_gpu_cards))
    print(gpus)
    print(get_available_gpu_cards)
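
For reference, here is a minimal standalone sketch of the trimming logic the new debug helpers implement. The HardwareUtil query is replaced by a hypothetical fixture list (realtime_ids) so the example runs without GPUs; as in the committed code, the set difference does not guarantee the ordering of the returned ids.

from typing import List

# Hypothetical stand-in for HardwareUtil.get_available_gpu_ids();
# the real helper derives this from live load/memory readings.
realtime_ids = [0, 1, 3]

def trim_unavailable_gpu_ids(gpu_ids) -> List[int]:
    # Normalize to ints, collect ids missing from the realtime list,
    # then drop them via the same set-difference step as the helper above.
    available_gpu_ids = [int(gpu_id) for gpu_id in gpu_ids]
    unavailable_gpu_ids = [g for g in available_gpu_ids if g not in realtime_ids]
    return list(set(available_gpu_ids) - set(unavailable_gpu_ids))

print(trim_unavailable_gpu_ids(["0", "2", "3"]))  # e.g. [0, 3]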
7 changes: 7 additions & 0 deletions python/fedml/computing/scheduler/comm_utils/job_utils.py
@@ -86,6 +86,8 @@ def occupy_gpu_ids(self, run_id, request_gpu_num, device_id, inner_id=None,
    # Get the available GPU list, FEDML_GLOBAL_DEVICE_AVAILABLE_GPU_IDS_TAG-${device_id}
    available_gpu_ids = ComputeCacheManager.get_instance().get_gpu_cache().get_device_available_gpu_ids(
        device_id)
    logging.info(
        f"Available GPU Ids fetched from cache: {available_gpu_ids}")

    logging.info(f"Check worker({device_id})'s realtime gpu availability in DB"
                 f" for run {run_id}: {available_gpu_ids}")
@@ -94,8 +96,11 @@
    if available_gpu_ids is None:
        # Get realtime GPU availability list from the system
        available_gpu_ids = JobRunnerUtils.get_realtime_gpu_available_ids().copy()
        logging.info(f"Cache not set yet, fetching realtime available GPU Ids: {available_gpu_ids}")
    else:
        available_gpu_ids = JobRunnerUtils.trim_unavailable_gpu_ids(available_gpu_ids)
        logging.info(
            f"Trimmed available GPU Ids: {available_gpu_ids}")

    # Get the matched gpu ids string by the request gpu num
    cuda_visible_gpu_ids_str, matched_gpu_num = JobRunnerUtils.request_gpu_ids(request_gpu_num,
@@ -119,6 +124,8 @@

    ComputeCacheManager.get_instance().get_gpu_cache().set_device_available_gpu_ids(
        device_id, available_gpu_ids)

    logging.info(f"Updated cache with following available gpu ids: {available_gpu_ids}")

    # For a single run, could be scale up. So if existed such a key, should extend, not replace
    existed_gpu_nums = ComputeCacheManager.get_instance().get_gpu_cache().get_device_run_num_gpus(
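
Taken together, the new log lines trace the cache-first lookup inside occupy_gpu_ids: read the cached ids, fall back to a realtime query when the cache is empty, trim stale ids otherwise, then write the remainder back. Below is a condensed, runnable sketch of that flow; the ComputeCacheManager and JobRunnerUtils calls are replaced by hypothetical in-memory stand-ins, and the id-matching step is simplified to a list slice.

import logging
from typing import List, Optional

logging.basicConfig(level=logging.INFO)

_cache = {}  # hypothetical stand-in for the device GPU-id cache

def get_realtime_gpu_available_ids() -> List[int]:
    return [0, 1, 2, 3]  # pretend four GPUs are currently free

def trim_unavailable_gpu_ids(gpu_ids: List[int]) -> List[int]:
    realtime = set(get_realtime_gpu_available_ids())
    return [g for g in gpu_ids if g in realtime]

def occupy_gpu_ids(device_id: str, request_gpu_num: int) -> List[int]:
    available_gpu_ids: Optional[List[int]] = _cache.get(device_id)
    logging.info(f"Available GPU Ids fetched from cache: {available_gpu_ids}")

    if available_gpu_ids is None:
        available_gpu_ids = get_realtime_gpu_available_ids().copy()
        logging.info(f"Cache not set yet, fetching realtime available GPU Ids: {available_gpu_ids}")
    else:
        available_gpu_ids = trim_unavailable_gpu_ids(available_gpu_ids)
        logging.info(f"Trimmed available GPU Ids: {available_gpu_ids}")

    # Hand out the first request_gpu_num ids and cache the rest.
    matched = available_gpu_ids[:request_gpu_num]
    _cache[device_id] = available_gpu_ids[request_gpu_num:]
    logging.info(f"Updated cache with following available gpu ids: {_cache[device_id]}")
    return matched

print(occupy_gpu_ids("worker-0", 2))  # e.g. [0, 1]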
