Skip to content

Commit

Permalink
[shortfin] Add options to set allocators and logical devices per phys…
Browse files Browse the repository at this point in the history
…ical. (#442)

* `allocators=`, `amdgpu_allocators=` and `hostcpu_allocators=` keywords
control application of allocator decorators. Adding specifically to
enable `caching`.
* `amdgpu_logical_devices_per_physical_device=n` will create `n` logical
HAL devices per physical/visible AMDGPU device. This is used for some
forms of multi-device distribution.

Filed #443 with a disabled test which seems to tickle an IREE memory
leak in `iree_hal_configure_allocator_from_specs`.
  • Loading branch information
stellaraccident authored Nov 7, 2024
1 parent 01521fd commit 92170e1
Show file tree
Hide file tree
Showing 13 changed files with 256 additions and 27 deletions.
10 changes: 8 additions & 2 deletions shortfin/examples/python/mobilenet_server/build_model.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,11 @@ echo "Import onnx model"
python -m iree.compiler.tools.import_onnx $onnx_upgrade_path -o $mlir_path

echo "Compile onnx model"
python -m iree.compiler.tools.scripts.ireec \
$mlir_path -o "$vmfb_path" --iree-input-type=onnx --iree-hal-target-backends=llvm-cpu
# Any arguments passed to this script are forwarded as compiler flags. With no
# arguments (or a single empty one) fall back to compiling for the host CPU.
# "$*" is used instead of "$@" because it always expands to exactly one word:
# `[ -z "$@" ]` expands to several words when two or more arguments are given,
# which makes `test` report an error instead of evaluating the condition.
if [ -z "$*" ]; then
  compile_flags="--iree-hal-target-backends=llvm-cpu --iree-llvmcpu-target-cpu=host"
else
  # Join all arguments into one space separated string; it is deliberately
  # expanded unquoted later so that each flag becomes its own word again.
  compile_flags="$*"
fi
echo "Using compile flags: $compile_flags"
python -m iree.compiler.tools.scripts.iree_compile \
$mlir_path -o "$vmfb_path" --iree-input-type=onnx $compile_flags
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,8 @@ def client():
# Done.
writer.close()

lsys = sf.host.CPUSystemBuilder().create_system()
sf.SystemBuilder.default_system_type = "hostcpu"
lsys = sf.SystemBuilder().create_system()
main = Main(lsys, home_dir)
lsys.init_worker.call_threadsafe(client)
lsys.run(main.main())
Expand Down
77 changes: 76 additions & 1 deletion shortfin/python/lib_ext.cc
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,24 @@ or integer values:
**kwargs: Key/value arguments for controlling setup of the system.
)";

// Docstring for the `hostcpu_allocator_specs` property of the HostCPU system
// builder binding. The device-specific keyword for this builder is
// `hostcpu_allocators` (the AMDGPU builder uses `amdgpu_allocators`).
static const char DOCSTRING_HOSTCPU_SYSTEM_BUILDER_HOSTCPU_ALLOCATOR_SPECS[] =
R"(Allocator specs to apply to HOSTCPU devices configured by this builder.
This uses syntax like::
some_allocator
some_allocator:key=value
some_allocator:key=value,key=value
some_allocator:key=value,key=value;other_allocator:key=value
Typical values for `some_allocator` include `caching` and `debug`.
This can be set via a keyword of `hostcpu_allocators`, which will only apply to
HOSTCPU devices or `allocators` which will apply to all contained devices.
Similarly, it is available on a `SHORTFIN_` prefixed env variable if environment
lookup is not disabled.
)";

static const char DOCSTRING_PROGRAM_FUNCTION_INVOCATION[] =
R"(Creates an invocation object targeting the function.
Expand Down Expand Up @@ -1214,7 +1232,17 @@ void BindHostSystem(py::module_ &global_m) {
py::arg("cls") = py::none(), py::kw_only(),
py::arg("env_prefix").none() = "SHORTFIN_",
py::arg("validate_undef") = true, py::arg("kwargs"),
DOCSTRING_HOSTCPU_SYSTEM_BUILDER_CTOR);
DOCSTRING_HOSTCPU_SYSTEM_BUILDER_CTOR)
.def_prop_rw(
"hostcpu_allocator_specs",
[](local::systems::HostCPUSystemBuilder &self) {
return self.hostcpu_allocator_specs();
},
[](local::systems::HostCPUSystemBuilder &self,
std::vector<std::string> specs) {
self.hostcpu_allocator_specs() = std::move(specs);
},
DOCSTRING_HOSTCPU_SYSTEM_BUILDER_HOSTCPU_ALLOCATOR_SPECS);
py::class_<local::systems::HostCPUDevice, local::Device>(m, "HostCPUDevice");
}

Expand All @@ -1236,6 +1264,24 @@ constructor.
**kwargs: Key/value arguments for controlling setup of the system.
)";

// Docstring attached to the `amdgpu_allocator_specs` read/write property of
// the AMDGPU system builder binding.
static const char DOCSTRING_AMDGPU_SYSTEM_BUILDER_AMDGPU_ALLOCATOR_SPECS[] =
R"(Allocator specs to apply to AMDGPU devices configured by this builder.
This uses syntax like::
some_allocator
some_allocator:key=value
some_allocator:key=value,key=value
some_allocator:key=value,key=value;other_allocator:key=value
Typical values for `some_allocator` include `caching` and `debug`.
This can be set via a keyword of `amdgpu_allocators`, which will only apply to
AMDGPU devices or `allocators` which will apply to all contained devices.
Similarly, it is available on a `SHORTFIN_` prefixed env variable if environment
lookup is not disabled.
)";

static const char DOCSTRING_AMDGPU_SYSTEM_BUILDER_CPU_DEVICES_ENABLED[] =
R"(Whether to create a heterogenous system with hostcpu and amdgpu devices.
Expand Down Expand Up @@ -1267,6 +1313,16 @@ environment variable is searched as a fallback in all cases. Multiple paths
can be separated by semicolons on all platforms.
)";

// Docstring for the `logical_devices_per_physical_device` property of the
// AMDGPU system builder binding. The option keyword quoted below must match
// the key the builder actually reads from its config options.
static const char
DOCSTRING_AMDGPU_SYSTEM_BUILDER_LOGICAL_DEVICES_PER_PHYSICAL_DEVICE[] =
R"(Number of logical devices to open per physical, visible device.
This option can be set as an option keyword with the name
"amdgpu_logical_devices_per_physical_device" or the environment variable
"SHORTFIN_AMDGPU_LOGICAL_DEVICES_PER_PHYSICAL_DEVICE" (if `env_prefix` was not
changed at construction).
)";

static const char DOCSTRING_AMDGPU_SYSTEM_BUILDER_TRACING_LEVEL[] =
R"(Tracing level for AMDGPU device behavior.
Expand Down Expand Up @@ -1340,6 +1396,16 @@ void BindAMDGPUSystem(py::module_ &global_m) {
py::arg("env_prefix").none() = "SHORTFIN_",
py::arg("validate_undef") = true, py::arg("kwargs"),
DOCSTRING_AMDGPU_SYSTEM_BUILDER_CTOR)
.def_prop_rw(
"amdgpu_allocator_specs",
[](local::systems::AMDGPUSystemBuilder &self) {
return self.amdgpu_allocator_specs();
},
[](local::systems::AMDGPUSystemBuilder &self,
std::vector<std::string> specs) {
self.amdgpu_allocator_specs() = std::move(specs);
},
DOCSTRING_AMDGPU_SYSTEM_BUILDER_AMDGPU_ALLOCATOR_SPECS)
.def_prop_ro(
"available_devices",
[](local::systems::AMDGPUSystemBuilder &self) {
Expand Down Expand Up @@ -1373,6 +1439,15 @@ void BindAMDGPUSystem(py::module_ &global_m) {
self.tracing_level() = tracing_level;
},
DOCSTRING_AMDGPU_SYSTEM_BUILDER_TRACING_LEVEL)
.def_prop_rw(
"logical_devices_per_physical_device",
[](local::systems::AMDGPUSystemBuilder &self) -> size_t {
return self.logical_devices_per_physical_device();
},
[](local::systems::AMDGPUSystemBuilder &self, size_t value) {
self.logical_devices_per_physical_device() = value;
},
DOCSTRING_AMDGPU_SYSTEM_BUILDER_LOGICAL_DEVICES_PER_PHYSICAL_DEVICE)
.def_prop_rw(
"visible_devices",
[](local::systems::AMDGPUSystemBuilder &self)
Expand Down
1 change: 1 addition & 0 deletions shortfin/src/shortfin/local/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ shortfin_cc_component(
iree_base_base
iree_base_loop_sync
iree_hal_hal
iree_hal_utils_allocators
iree_io_formats_parser_registry
iree_modules_io_parameters_parameters
iree_modules_hal_hal
Expand Down
39 changes: 39 additions & 0 deletions shortfin/src/shortfin/local/system.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include <fmt/core.h>

#include "iree/hal/utils/allocators.h"
#include "shortfin/local/fiber.h"
#include "shortfin/support/logging.h"

Expand Down Expand Up @@ -239,4 +240,42 @@ void System::DeallocateProcess(int64_t pid) {
processes_by_pid_.erase(pid);
}

// -------------------------------------------------------------------------- //
// SystemBuilder
// -------------------------------------------------------------------------- //

void SystemBuilder::ConfigureAllocators(const std::vector<std::string> &specs,
iree_hal_device_t *device,
std::string_view device_debug_desc) {
if (specs.empty()) return;
std::vector<iree_string_view_t> spec_views;
spec_views.reserve(specs.size());
for (auto &spec : specs) {
spec_views.push_back(to_iree_string_view(spec));
}

logging::info("Configure allocator {} = [{}]", device_debug_desc,
fmt::join(specs, " ; "));

SHORTFIN_THROW_IF_ERROR(iree_hal_configure_allocator_from_specs(
spec_views.size(), spec_views.data(), device));
}

std::vector<std::string> SystemBuilder::GetConfigAllocatorSpecs(
    std::optional<std::string_view> specific_config_key) {
  // Look up the device-specific key first, when one was supplied.
  std::optional<std::string_view> raw_specs;
  if (specific_config_key.has_value()) {
    raw_specs = config_options().GetOption(specific_config_key.value());
  }

  // Fall back to the generic key when the specific one yielded nothing.
  if (!raw_specs.has_value()) {
    raw_specs = config_options().GetOption("allocators");
  }

  // Neither key is set: there are no specs.
  if (!raw_specs.has_value()) {
    return {};
  }

  // The option value is a ';' separated list; copy each piece into an owned
  // string so the result does not reference the config storage.
  std::vector<std::string> owned_specs;
  for (auto piece : ConfigOptions::Split(*raw_specs, ';')) {
    owned_specs.emplace_back(piece);
  }
  return owned_specs;
}

} // namespace shortfin::local
21 changes: 21 additions & 0 deletions shortfin/src/shortfin/local/system.h
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,27 @@ class SHORTFIN_API SystemBuilder {
// Construct a System
virtual SystemPtr CreateSystem() = 0;

protected:
// Uses the iree_hal_configure_allocator_from_specs() API to configure
// allocators for `device`. Does nothing if `specs` is empty.
// Each element of `specs` is a single allocator spec of the form:
// some_allocator
// some_allocator:key=value
// some_allocator:key=value,key=value
// A ';' separated config value such as
// some_allocator:key=value,key=value;other_allocator:key=value
// must already have been split into individual specs by the caller (see
// GetConfigAllocatorSpecs()). `device_debug_desc` is only used to label the
// log message emitted before the allocators are configured.
void ConfigureAllocators(const std::vector<std::string> &specs,
iree_hal_device_t *device,
std::string_view device_debug_desc);

// Gets a list of allocator specs from the config. If `specific_config_key`
// is given, this will be consulted first and used if available. Otherwise,
// "allocators" will be used. For SystemBuilders that handle multiple
// device types, the specific key will be something like "amdgpu_allocators"
// or "hostcpu_allocators" and will be used to allow independently scoped
// allocator specs.
// The option value is split on ';' into one string per spec. Returns an
// empty vector when neither key is set.
std::vector<std::string> GetConfigAllocatorSpecs(
std::optional<std::string_view> specific_config_key);

private:
const iree_allocator_t host_allocator_;
ConfigOptions config_options_;
Expand Down
48 changes: 33 additions & 15 deletions shortfin/src/shortfin/local/systems/amdgpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ AMDGPUSystemBuilder::AMDGPUSystemBuilder(iree_allocator_t host_allocator,
available_devices_(host_allocator) {
iree_hal_hip_device_params_initialize(&default_device_params_);
InitializeDefaultSettings();
config_options().ValidateUndef();
}

AMDGPUSystemBuilder::~AMDGPUSystemBuilder() = default;
Expand All @@ -41,6 +42,10 @@ void AMDGPUSystemBuilder::InitializeDefaultSettings() {
}
}

// Gets allocator specs from either "amdgpu_allocators" or the fallback
// "allocators".
amdgpu_allocator_specs_ = GetConfigAllocatorSpecs("amdgpu_allocators");

// HIP options.
// "amdgpu_tracing_level": Matches IREE flag --hip_tracing:
// Permissible values are:
Expand All @@ -51,6 +56,13 @@ void AMDGPUSystemBuilder::InitializeDefaultSettings() {
config_options().GetInt("amdgpu_tracing_level", /*non_negative=*/true);
default_device_params_.stream_tracing = tracing_level ? *tracing_level : 2;

// Override logical_devices_per_physical_device if present.
auto logical_devices_per_physical_device = config_options().GetInt(
"amdgpu_logical_devices_per_physical_device", /*non_negative=*/true);
if (logical_devices_per_physical_device) {
logical_devices_per_physical_device_ = *logical_devices_per_physical_device;
}

// CPU devices.
cpu_devices_enabled_ = config_options().GetBool("amdgpu_cpu_devices_enabled");

Expand Down Expand Up @@ -176,21 +188,27 @@ SystemPtr AMDGPUSystemBuilder::CreateSystem() {
for (size_t instance_ordinal = 0; instance_ordinal < used_device_ids.size();
++instance_ordinal) {
iree_hal_device_id_t device_id = used_device_ids[instance_ordinal];
iree::hal_device_ptr device;
SHORTFIN_THROW_IF_ERROR(iree_hal_driver_create_device_by_id(
hip_hal_driver_, device_id, 0, nullptr, host_allocator(),
device.for_output()));
lsys->InitializeHalDevice(std::make_unique<AMDGPUDevice>(
DeviceAddress(
/*system_device_class=*/SYSTEM_DEVICE_CLASS,
/*logical_device_class=*/LOGICAL_DEVICE_CLASS,
/*hal_driver_prefix=*/HAL_DRIVER_PREFIX,
/*instance_ordinal=*/instance_ordinal,
/*queue_ordinal=*/0,
/*instance_topology_address=*/{0}),
/*hal_device=*/device,
/*node_affinity=*/0,
/*capabilities=*/static_cast<uint32_t>(Device::Capabilities::NONE)));
for (size_t logical_index = 0;
logical_index < logical_devices_per_physical_device_;
++logical_index) {
iree::hal_device_ptr device;
SHORTFIN_THROW_IF_ERROR(iree_hal_driver_create_device_by_id(
hip_hal_driver_, device_id, 0, nullptr, host_allocator(),
device.for_output()));
DeviceAddress address(
/*system_device_class=*/SYSTEM_DEVICE_CLASS,
/*logical_device_class=*/LOGICAL_DEVICE_CLASS,
/*hal_driver_prefix=*/HAL_DRIVER_PREFIX,
/*instance_ordinal=*/instance_ordinal,
/*queue_ordinal=*/0,
/*instance_topology_address=*/{logical_index});
ConfigureAllocators(amdgpu_allocator_specs_, device, address.device_name);
lsys->InitializeHalDevice(std::make_unique<AMDGPUDevice>(
address,
/*hal_device=*/device,
/*node_affinity=*/0,
/*capabilities=*/static_cast<uint32_t>(Device::Capabilities::NONE)));
}
}

// Initialize CPU devices if requested.
Expand Down
20 changes: 20 additions & 0 deletions shortfin/src/shortfin/local/systems/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,26 @@ class SHORTFIN_API AMDGPUSystemBuilder : public HostCPUSystemBuilder {
return visible_devices_;
};

// Mutable access to the list of allocator specs that this builder applies to
// the amdgpu devices it creates. Callers may assign through the reference.
std::vector<std::string> &amdgpu_allocator_specs() { return amdgpu_allocator_specs_; }

// Mutable access to the HIP stream tracing level stored in the default device
// params. Corresponds to the "amdgpu_tracing_level" option, which matches the
// IREE flag --hip_tracing. Permissible values:
//   0 : stream tracing disabled.
//   1 : coarse command buffer level tracing enabled.
//   2 : fine-grained kernel level tracing enabled.
int32_t &tracing_level() {
  return default_device_params_.stream_tracing;
}

// Mutable access to how many logical HAL devices are created for each
// physical, visible device. Creating more than one per physical device gives
// an oversubscribed topology, emulating what would usually be achieved with
// process level isolation. Defaults to 1.
size_t &logical_devices_per_physical_device() { return logical_devices_per_physical_device_; }

// Gets all enumerated available device ids. This triggers enumeration, so
// any settings required for that must already be set. This does no filtering
// and will return all device ids.
Expand All @@ -77,6 +95,8 @@ class SHORTFIN_API AMDGPUSystemBuilder : public HostCPUSystemBuilder {
bool cpu_devices_enabled_ = false;
std::vector<std::string> hip_lib_search_paths_;
std::optional<std::vector<std::string>> visible_devices_;
size_t logical_devices_per_physical_device_ = 1;
std::vector<std::string> amdgpu_allocator_specs_;

// Valid post enumeration.
iree::hal_driver_ptr hip_hal_driver_;
Expand Down
21 changes: 13 additions & 8 deletions shortfin/src/shortfin/local/systems/host.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,9 @@ HostCPUSystemBuilder::Deps::~Deps() {
HostCPUSystemBuilder::HostCPUSystemBuilder(iree_allocator_t host_allocator,
ConfigOptions config_options)
: HostSystemBuilder(host_allocator, std::move(config_options)),
host_cpu_deps_(host_allocator) {}
host_cpu_deps_(host_allocator) {
hostcpu_allocator_specs_ = GetConfigAllocatorSpecs("hostcpu_allocators");
}

HostCPUSystemBuilder::~HostCPUSystemBuilder() = default;

Expand Down Expand Up @@ -199,16 +201,19 @@ void HostCPUSystemBuilder::InitializeHostCPUDevices(System &lsys,
SHORTFIN_THROW_IF_ERROR(iree_hal_driver_create_device_by_id(
driver, it->device_id, 0, nullptr, host_allocator(),
device.for_output()));
ConfigureAllocators(hostcpu_allocator_specs_, device, "hostcpu");

iree_host_size_t queue_index = 0;
for (auto node_id : queue_node_ids_) {
DeviceAddress address(
/*system_device_class=*/SYSTEM_DEVICE_CLASS,
/*logical_device_class=*/LOGICAL_DEVICE_CLASS,
/*hal_driver_prefix=*/HAL_DRIVER_PREFIX,
/*instance_ordinal=*/0,
/*queue_ordinal=*/queue_index,
/*instance_topology_address=*/{queue_index});
lsys.InitializeHalDevice(std::make_unique<HostCPUDevice>(
DeviceAddress(
/*system_device_class=*/SYSTEM_DEVICE_CLASS,
/*logical_device_class=*/LOGICAL_DEVICE_CLASS,
/*hal_driver_prefix=*/HAL_DRIVER_PREFIX,
/*instance_ordinal=*/0,
/*queue_ordinal=*/queue_index,
/*instance_topology_address=*/{queue_index}),
address,
/*hal_device=*/device,
/*node_affinity=*/node_id,
/*capabilities=*/
Expand Down
Loading

0 comments on commit 92170e1

Please sign in to comment.