From e4e7824fcefd8014ab2584b846cbf1a62efa25cb Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Wed, 4 Dec 2024 09:03:25 +0100 Subject: [PATCH 1/7] tests/levelzero: don't mix up a ZES and ZE function Didn't seem to break anything so far. Signed-off-by: Brice Goglin --- tests/hwloc/levelzero.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/hwloc/levelzero.c b/tests/hwloc/levelzero.c index c520e897b..f2ada2503 100644 --- a/tests/hwloc/levelzero.c +++ b/tests/hwloc/levelzero.c @@ -164,7 +164,7 @@ int main(void) sdvh = malloc(nbdevices * sizeof(*sdvh)); if (!sdvh) continue; - res = zeDeviceGet(sdrh[i], &nbdevices, sdvh); + res = zesDeviceGet(sdrh[i], &nbdevices, sdvh); if (res != ZE_RESULT_SUCCESS) { free(sdvh); continue; From efcd681cfa26db5bb4e302a142fa58d071d89a59 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Wed, 4 Dec 2024 09:05:11 +0100 Subject: [PATCH 2/7] tests/levelzero: remove some checks for ZES devices ZE and ZES may return devices in different orders. https://github.com/open-mpi/hwloc/pull/595#issuecomment-2515160994 Signed-off-by: Brice Goglin --- tests/hwloc/levelzero.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/hwloc/levelzero.c b/tests/hwloc/levelzero.c index f2ada2503..ea9c6c436 100644 --- a/tests/hwloc/levelzero.c +++ b/tests/hwloc/levelzero.c @@ -185,18 +185,17 @@ int main(void) printf("found OSDev %s\n", osdev->name); err = strncmp(osdev->name, "ze", 2); assert(!err); - assert(atoi(osdev->name+2) == (int) k); + /* don't check the index, + * ZE and ZES device orders may be different inside a single driver. 
+ */ assert(osdev->attr->osdev.types == (HWLOC_OBJ_OSDEV_COPROC|HWLOC_OBJ_OSDEV_GPU)); assert(has_levelzero_backend); - value = hwloc_obj_get_info_by_name(osdev, "LevelZeroDriverIndex"); - assert(value); - assert(atoi(value) == (int) i); - value = hwloc_obj_get_info_by_name(osdev, "LevelZeroDriverDeviceIndex"); - assert(value); - assert(atoi(value) == (int) j); + /* don't check LevelZeroDriverIndex and LevelZeroDriverDeviceIndex, + * ZE and ZES device orders may be different inside a single driver. + */ set = hwloc_bitmap_alloc(); err = hwloc_levelzero_get_sysman_device_cpuset(topology, sdvh[j], set); From cb3a260c27f4c3235061b12b3d9eef80d4ba83d9 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Wed, 4 Dec 2024 11:11:17 +0100 Subject: [PATCH 3/7] levelzero.h: we always return the root/parent device, not a subdevice Refs #698 Signed-off-by: Brice Goglin --- include/hwloc/levelzero.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/hwloc/levelzero.h b/include/hwloc/levelzero.h index 2c972fe06..c1b438456 100644 --- a/include/hwloc/levelzero.h +++ b/include/hwloc/levelzero.h @@ -166,6 +166,10 @@ hwloc_levelzero_get_sysman_device_cpuset(hwloc_topology_t topology __hwloc_attri * topology. If not, the locality of the object may still be found using * hwloc_levelzero_get_device_cpuset(). * + * \note If the input ZE device is actually a subdevice, then its parent + * (root device) is translated, i.e. the main hwloc OS device + * is returned instead of one of its children. + * * \note The corresponding hwloc PCI device may be found by looking * at the result parent pointer (unless PCI devices are filtered out). * @@ -231,6 +235,10 @@ hwloc_levelzero_get_device_osdev(hwloc_topology_t topology, ze_device_handle_t d * topology. If not, the locality of the object may still be found using * hwloc_levelzero_get_device_cpuset(). * + * \note If the input ZES device is actually a subdevice, then its parent + * (root device) is translated, i.e. 
the main hwloc OS device + * is returned instead of one of its children. + * * \note The corresponding hwloc PCI device may be found by looking * at the result parent pointer (unless PCI devices are filtered out). */ From a987736e1a0353a9c197b4d41eccc07413014ce4 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Mon, 12 Jun 2023 11:08:05 +0200 Subject: [PATCH 4/7] levelzero: only get memory info from sysman Now that zesInit() is mandatory, don't bother falling back to the core API, Sysman shouldn't fail. Signed-off-by: Brice Goglin --- hwloc/topology-levelzero.c | 97 +------------------ .../include/levelzero/level_zero/ze_api.h | 7 -- 2 files changed, 4 insertions(+), 100 deletions(-) diff --git a/hwloc/topology-levelzero.c b/hwloc/topology-levelzero.c index 0252bae86..36af7c7bf 100644 --- a/hwloc/topology-levelzero.c +++ b/hwloc/topology-levelzero.c @@ -172,9 +172,9 @@ hwloc__levelzero_cqprops_get(ze_device_handle_t zeh, } static int -hwloc__levelzero_memory_get_from_sysman(zes_device_handle_t zesh, - hwloc_obj_t root_osdev, - unsigned nr_osdevs, hwloc_obj_t *sub_osdevs) +hwloc__levelzero_memory_get(zes_device_handle_t zesh, + hwloc_obj_t root_osdev, + unsigned nr_osdevs, hwloc_obj_t *sub_osdevs) { zes_mem_handle_t *mh; uint32_t nr_mems; @@ -279,95 +279,6 @@ hwloc__levelzero_memory_get_from_sysman(zes_device_handle_t zesh, return 0; } -static void -hwloc__levelzero_memory_get_from_coreapi(ze_device_handle_t zeh, - hwloc_obj_t osdev, - int ignore_ddr) -{ - ze_device_memory_properties_t *mh; - uint32_t nr_mems; - ze_result_t res; - - nr_mems = 0; - res = zeDeviceGetMemoryProperties(zeh, &nr_mems, NULL); - if (res != ZE_RESULT_SUCCESS || !nr_mems) - return; - hwloc_debug("L0/CoreAPI: found %u memories in osdev %s\n", - nr_mems, osdev->name); - - mh = malloc(nr_mems * sizeof(*mh)); - if (mh) { - res = zeDeviceGetMemoryProperties(zeh, &nr_mems, mh); - if (res == ZE_RESULT_SUCCESS) { - unsigned m; - for(m=0; mname); - if (!mh[m].totalSize) - continue; - if (ignore_ddr && 
!strcmp(_name, "DDR")) - continue; - if (!_name[0]) - _name = "Memory"; - snprintf(name, sizeof(name), "LevelZero%sSize", _name); /* HBM or DDR, or Memory if unknown */ - snprintf(value, sizeof(value), "%lluKiB", (unsigned long long) mh[m].totalSize >> 10); - hwloc_obj_add_info(osdev, name, value); - } - } - free(mh); - } -} - - -static void -hwloc__levelzero_memory_get(ze_device_handle_t zeh, zes_device_handle_t zesh, - hwloc_obj_t root_osdev, int is_integrated, - unsigned nr_subdevices, zes_device_handle_t *subzehs, hwloc_obj_t *sub_osdevs) -{ - static int memory_from_coreapi = -1; /* 1 means coreapi, 0 means sysman, -1 means sysman if available or coreapi otherwise */ - static int first = 1; - - if (first) { - char *env; - env = getenv("HWLOC_L0_COREAPI_MEMORY"); - if (env) - memory_from_coreapi = atoi(env); - - if (memory_from_coreapi == -1) { - int ret = hwloc__levelzero_memory_get_from_sysman(zesh, root_osdev, nr_subdevices, sub_osdevs); - if (!ret) { - /* sysman worked, we're done, disable coreapi for next time */ - hwloc_debug("levelzero: sysman/memory succeeded, disabling coreapi memory queries\n"); - memory_from_coreapi = 0; - return; - } - /* sysman failed, enable coreapi */ - hwloc_debug("levelzero: sysman/memory failed, enabling coreapi memory queries\n"); - memory_from_coreapi = 1; - } - - first = 0; - } - - if (memory_from_coreapi > 0) { - unsigned k; - int ignore_ddr = (memory_from_coreapi != 2) && is_integrated; /* DDR ignored in integrated GPUs, it's like the host DRAM */ - hwloc__levelzero_memory_get_from_coreapi(zeh, root_osdev, ignore_ddr); - for(k=0; k Date: Wed, 13 Nov 2024 16:27:13 +0100 Subject: [PATCH 5/7] levelzero: remove the is_integrated variable We don't need it anymore. 
Signed-off-by: Brice Goglin --- hwloc/topology-levelzero.c | 15 +++------------ .../ports/include/levelzero/level_zero/ze_api.h | 1 - 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/hwloc/topology-levelzero.c b/hwloc/topology-levelzero.c index 36af7c7bf..52ba537d7 100644 --- a/hwloc/topology-levelzero.c +++ b/hwloc/topology-levelzero.c @@ -63,14 +63,12 @@ hwloc__levelzero_osdev_array_find(struct hwloc_osdev_array *array, static void hwloc__levelzero_properties_get(ze_device_handle_t zeh, zes_device_handle_t zesh, - hwloc_obj_t osdev, - int *is_integrated_p) + hwloc_obj_t osdev) { ze_result_t res; ze_device_properties_t prop; zes_device_properties_t prop2; int is_subdevice = 0; - int is_integrated = 0; memset(&prop, 0, sizeof(prop)); res = zeDeviceGetProperties(zeh, &prop); @@ -110,14 +108,8 @@ hwloc__levelzero_properties_get(ze_device_handle_t zeh, zes_device_handle_t zesh if (prop.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE) is_subdevice = 1; - - if (prop.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) - is_integrated = 1; } - if (is_integrated_p) - *is_integrated_p = is_integrated; - if (is_subdevice) /* sysman API on subdevice returns the same as root device, and we don't need those duplicate attributes */ return; @@ -462,7 +454,6 @@ hwloc__levelzero_devices_get(struct hwloc_topology *topology, hwloc_obj_t osdev, parent, *subosdevs = NULL; ze_device_properties_t props; zes_uuid_t uuid; - int is_integrated = 0; ze_bool_t onSubdevice = 0; uint32_t subdeviceId = 0; @@ -494,7 +485,7 @@ hwloc__levelzero_devices_get(struct hwloc_topology *topology, snprintf(buffer, sizeof(buffer), "%u", j); hwloc_obj_add_info(osdev, "LevelZeroDriverDeviceIndex", buffer); - hwloc__levelzero_properties_get(zeh, zesh, osdev, &is_integrated); + hwloc__levelzero_properties_get(zeh, zesh, osdev); hwloc__levelzero_cqprops_get(zeh, osdev); @@ -538,7 +529,7 @@ hwloc__levelzero_devices_get(struct hwloc_topology *topology, snprintf(tmp, sizeof(tmp), "%u", k); 
hwloc_obj_add_info(subosdevs[k], "LevelZeroSubdeviceID", tmp); - hwloc__levelzero_properties_get(subzeh, subzesh, subosdevs[k], NULL); + hwloc__levelzero_properties_get(subzeh, subzesh, subosdevs[k]); hwloc__levelzero_cqprops_get(subzeh, subosdevs[k]); } diff --git a/tests/hwloc/ports/include/levelzero/level_zero/ze_api.h b/tests/hwloc/ports/include/levelzero/level_zero/ze_api.h index adbf84336..0cd4de2bf 100644 --- a/tests/hwloc/ports/include/levelzero/level_zero/ze_api.h +++ b/tests/hwloc/ports/include/levelzero/level_zero/ze_api.h @@ -28,7 +28,6 @@ typedef enum _ze_device_type { ZE_DEVICE_TYPE_VPU = 5 } ze_device_type_t; -#define ZE_DEVICE_PROPERTY_FLAG_INTEGRATED (1<<0) #define ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE (1<<1) #define ZE_MAX_DEVICE_UUID_SIZE 16 From f0a69cd92417258cf29952da17f28b4d81b40a6d Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Wed, 13 Nov 2024 16:33:16 +0100 Subject: [PATCH 6/7] levelzero: don't get device properties twice Signed-off-by: Brice Goglin --- hwloc/topology-levelzero.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/hwloc/topology-levelzero.c b/hwloc/topology-levelzero.c index 52ba537d7..50d4b52c8 100644 --- a/hwloc/topology-levelzero.c +++ b/hwloc/topology-levelzero.c @@ -63,16 +63,21 @@ hwloc__levelzero_osdev_array_find(struct hwloc_osdev_array *array, static void hwloc__levelzero_properties_get(ze_device_handle_t zeh, zes_device_handle_t zesh, - hwloc_obj_t osdev) + hwloc_obj_t osdev, ze_device_properties_t *prop) { ze_result_t res; - ze_device_properties_t prop; + ze_device_properties_t _prop; zes_device_properties_t prop2; int is_subdevice = 0; - memset(&prop, 0, sizeof(prop)); - res = zeDeviceGetProperties(zeh, &prop); - if (res == ZE_RESULT_SUCCESS) { + if (!prop) { + /* no properties were given, get ours */ + memset(&_prop, 0, sizeof(_prop)); + res = zeDeviceGetProperties(zeh, &_prop); + if (res == ZE_RESULT_SUCCESS) + prop = &_prop; + } + if (prop) { /* name is the 
model name followed by the deviceID * flags 1<<0 means integrated (vs discrete). */ @@ -81,7 +86,7 @@ hwloc__levelzero_properties_get(ze_device_handle_t zeh, zes_device_handle_t zesh unsigned i; const char *type; - switch (prop.type) { + switch (prop->type) { case ZE_DEVICE_TYPE_GPU: type = "GPU"; break; case ZE_DEVICE_TYPE_CPU: type = "CPU"; break; case ZE_DEVICE_TYPE_FPGA: type = "FPGA"; break; @@ -89,24 +94,24 @@ hwloc__levelzero_properties_get(ze_device_handle_t zeh, zes_device_handle_t zesh case ZE_DEVICE_TYPE_VPU: type = "VPU"; break; default: if (HWLOC_SHOW_ALL_ERRORS()) - fprintf(stderr, "hwloc/levelzero: unexpected device type %u\n", (unsigned) prop.type); + fprintf(stderr, "hwloc/levelzero: unexpected device type %u\n", (unsigned) prop->type); type = "Unknown"; } hwloc_obj_add_info(osdev, "LevelZeroDeviceType", type); - snprintf(tmp, sizeof(tmp), "%u", prop.numSlices); + snprintf(tmp, sizeof(tmp), "%u", prop->numSlices); hwloc_obj_add_info(osdev, "LevelZeroNumSlices", tmp); - snprintf(tmp, sizeof(tmp), "%u", prop.numSubslicesPerSlice); + snprintf(tmp, sizeof(tmp), "%u", prop->numSubslicesPerSlice); hwloc_obj_add_info(osdev, "LevelZeroNumSubslicesPerSlice", tmp); - snprintf(tmp, sizeof(tmp), "%u", prop.numEUsPerSubslice); + snprintf(tmp, sizeof(tmp), "%u", prop->numEUsPerSubslice); hwloc_obj_add_info(osdev, "LevelZeroNumEUsPerSubslice", tmp); - snprintf(tmp, sizeof(tmp), "%u", prop.numThreadsPerEU); + snprintf(tmp, sizeof(tmp), "%u", prop->numThreadsPerEU); hwloc_obj_add_info(osdev, "LevelZeroNumThreadsPerEU", tmp); for(i=0; iuuid.id[i]); hwloc_obj_add_info(osdev, "LevelZeroUUID", uuid); - if (prop.flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE) + if (prop->flags & ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE) is_subdevice = 1; } @@ -485,7 +490,7 @@ hwloc__levelzero_devices_get(struct hwloc_topology *topology, snprintf(buffer, sizeof(buffer), "%u", j); hwloc_obj_add_info(osdev, "LevelZeroDriverDeviceIndex", buffer); - hwloc__levelzero_properties_get(zeh, zesh, osdev); + 
hwloc__levelzero_properties_get(zeh, zesh, osdev, &props); hwloc__levelzero_cqprops_get(zeh, osdev); @@ -529,7 +534,7 @@ hwloc__levelzero_devices_get(struct hwloc_topology *topology, snprintf(tmp, sizeof(tmp), "%u", k); hwloc_obj_add_info(subosdevs[k], "LevelZeroSubdeviceID", tmp); - hwloc__levelzero_properties_get(subzeh, subzesh, subosdevs[k]); + hwloc__levelzero_properties_get(subzeh, subzesh, subosdevs[k], NULL); hwloc__levelzero_cqprops_get(subzeh, subosdevs[k]); } From f630dfcf76d0c2e025209d88650d9df13c01a849 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Mon, 12 Jun 2023 11:12:08 +0200 Subject: [PATCH 7/7] levelzero: only get pci locality info from sysman Now that zesInit() is mandatory, don't bother trying the core API extension first in case sysman wouldn't be available. Hence remove the zeDevicePciGetPropertiesExt() optional detection Signed-off-by: Brice Goglin --- config/hwloc.m4 | 2 -- hwloc/topology-levelzero.c | 21 ------------------- tests/hwloc/ports/Makefile.am | 3 +-- .../include/levelzero/level_zero/ze_api.h | 21 ------------------- 4 files changed, 1 insertion(+), 46 deletions(-) diff --git a/config/hwloc.m4 b/config/hwloc.m4 index 4e9560f04..9df69892a 100644 --- a/config/hwloc.m4 +++ b/config/hwloc.m4 @@ -1413,7 +1413,6 @@ return clGetDeviceIDs(0, 0, 0, NULL, NULL); HWLOC_PKG_CHECK_MODULES([LEVELZERO], [libze_loader], [zesDriverGetDeviceByUuidExp], [level_zero/zes_api.h], [hwloc_levelzero_happy=yes HWLOC_LEVELZERO_REQUIRES=libze_loader - AC_CHECK_LIB([ze_loader], [zeDevicePciGetPropertiesExt], [AC_DEFINE(HWLOC_HAVE_ZEDEVICEPCIGETPROPERTIESEXT, 1, [Define to 1 if zeDevicePciGetPropertiesExt is available])]) ], [hwloc_levelzero_happy=no]) if test x$hwloc_levelzero_happy = xno; then hwloc_levelzero_happy=yes @@ -1423,7 +1422,6 @@ return clGetDeviceIDs(0, 0, 0, NULL, NULL); AC_CHECK_LIB([ze_loader], [zesDriverGetDeviceByUuidExp], [HWLOC_LEVELZERO_LIBS="-lze_loader" - AC_CHECK_LIB([ze_loader], [zeDevicePciGetPropertiesExt], 
[AC_DEFINE(HWLOC_HAVE_ZEDEVICEPCIGETPROPERTIESEXT, 1, [Define to 1 if zeDevicePciGetPropertiesExt is available])]) ], [hwloc_levelzero_happy=no]) ], [hwloc_levelzero_happy=no]) ], [hwloc_levelzero_happy=no]) diff --git a/hwloc/topology-levelzero.c b/hwloc/topology-levelzero.c index 50d4b52c8..14207e7c6 100644 --- a/hwloc/topology-levelzero.c +++ b/hwloc/topology-levelzero.c @@ -555,26 +555,6 @@ hwloc__levelzero_devices_get(struct hwloc_topology *topology, hwloc__levelzero_ports_get(zesh, osdev, nr_subdevices, subosdevs, hports); parent = NULL; -#ifdef HWLOC_HAVE_ZEDEVICEPCIGETPROPERTIESEXT - { /* try getting PCI BDF+speed from core extension */ - ze_pci_ext_properties_t ext_pci; - ext_pci.stype = ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES; - ext_pci.pNext = NULL; - res = zeDevicePciGetPropertiesExt(zeh, &ext_pci); - if (res == ZE_RESULT_SUCCESS) { - parent = hwloc_pci_find_parent_by_busid(topology, - ext_pci.address.domain, - ext_pci.address.bus, - ext_pci.address.device, - ext_pci.address.function); - if (parent && parent->type == HWLOC_OBJ_PCI_DEVICE) { - if (ext_pci.maxSpeed.maxBandwidth > 0) - parent->attr->pcidev.linkspeed = ((float)ext_pci.maxSpeed.maxBandwidth)/1000/1000/1000; - } - } - } -#endif /* HWLOC_HAVE_LEVELZERO_CORE_PCI_EXT */ - if (!parent) { /* try getting PCI BDF+speed from sysman */ zes_pci_properties_t pci; res = zesDevicePciGetProperties(zesh, &pci); @@ -589,7 +569,6 @@ hwloc__levelzero_devices_get(struct hwloc_topology *topology, parent->attr->pcidev.linkspeed = ((float)pci.maxSpeed.maxBandwidth)/1000/1000/1000; } } - } if (!parent) parent = hwloc_get_root_obj(topology); diff --git a/tests/hwloc/ports/Makefile.am b/tests/hwloc/ports/Makefile.am index 645cd1d4c..8addaf2af 100644 --- a/tests/hwloc/ports/Makefile.am +++ b/tests/hwloc/ports/Makefile.am @@ -177,8 +177,7 @@ libhwloc_port_levelzero_la_SOURCES = \ include/levelzero/level_zero/ze_api.h \ include/levelzero/level_zero/zes_api.h libhwloc_port_levelzero_la_CPPFLAGS = $(common_CPPFLAGS) \ - 
-I$(HWLOC_top_srcdir)/tests/hwloc/ports/include/levelzero \ - -DHWLOC_HAVE_ZEDEVICEPCIGETPROPERTIESEXT=1 + -I$(HWLOC_top_srcdir)/tests/hwloc/ports/include/levelzero nodist_libhwloc_port_gl_la_SOURCES = topology-gl.c libhwloc_port_gl_la_SOURCES = \ diff --git a/tests/hwloc/ports/include/levelzero/level_zero/ze_api.h b/tests/hwloc/ports/include/levelzero/level_zero/ze_api.h index 0cd4de2bf..dffb4683b 100644 --- a/tests/hwloc/ports/include/levelzero/level_zero/ze_api.h +++ b/tests/hwloc/ports/include/levelzero/level_zero/ze_api.h @@ -57,25 +57,4 @@ extern ze_result_t zeDeviceGetCommandQueueGroupProperties(ze_driver_handle_t, ui extern ze_result_t zeDeviceGetSubDevices(ze_device_handle_t, uint32_t *, ze_device_handle_t*); -typedef struct ze_pci_address_ext { - uint32_t domain, bus, device, function; -} ze_pci_address_ext_t; - -typedef struct ze_pci_speed_ext { - int64_t maxBandwidth; -} ze_pci_speed_ext_t; - -typedef int ze_structure_type_t; - -#define ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES 0x10008 - -typedef struct ze_pci_ext_properties { - ze_structure_type_t stype; - void* pNext; - ze_pci_address_ext_t address; - ze_pci_speed_ext_t maxSpeed; -} ze_pci_ext_properties_t; - -extern ze_result_t zeDevicePciGetPropertiesExt(ze_device_handle_t, ze_pci_ext_properties_t *); - #endif /* HWLOC_PORT_L0_ZE_API_H */