From 476722ec30b0b1dc0f8e99e62c82f43e8ac77b92 Mon Sep 17 00:00:00 2001
From: "wangjianyu.wjy"
Date: Fri, 15 Nov 2024 11:20:28 +0800
Subject: [PATCH] gpu: support strict gpu share with hami

When a container is allocated less than a whole GPU (gpu memory ratio
below 100), inject CUDA_DEVICE_MEMORY_LIMIT, CUDA_DEVICE_SM_LIMIT and
LD_PRELOAD into the container and bind-mount /usr/local/libvgpu.so to
/libvgpu.so. The gpu memory and gpu core resources are only required in
that shared case; whole-GPU containers get no extra envs or mounts.

Hooks request mounts through the new ContainerResponse.AddContainerMounts
field, which NriDone forwards to the container adjustment.

Signed-off-by: wangjianyu.wjy
---
 pkg/koordlet/runtimehooks/hooks/gpu/gpu.go    | 29 +++++++++++++++++++++
 .../protocol/container_context.go             | 10 +++++--
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/pkg/koordlet/runtimehooks/hooks/gpu/gpu.go b/pkg/koordlet/runtimehooks/hooks/gpu/gpu.go
index 65b6094b2..d35c12b32 100644
--- a/pkg/koordlet/runtimehooks/hooks/gpu/gpu.go
+++ b/pkg/koordlet/runtimehooks/hooks/gpu/gpu.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"strings"
 
+	"github.com/containerd/nri/pkg/api"
 	"k8s.io/klog/v2"
 
 	ext "github.com/koordinator-sh/koordinator/apis/extension"
@@ -70,5 +71,33 @@ func (p *gpuPlugin) InjectContainerGPUEnv(proto protocol.HooksProtocol) error {
 		containerCtx.Response.AddContainerEnvs = make(map[string]string)
 	}
 	containerCtx.Response.AddContainerEnvs[GpuAllocEnv] = strings.Join(gpuIDs, ",")
+
+	gpuResources := devices[0].Resources
+	gpuMemoryRatio, ok := gpuResources[ext.ResourceGPUMemoryRatio]
+	if !ok {
+		return fmt.Errorf("gpu memory ratio not found in gpu resource")
+	}
+	// Limits are injected only when the ratio is below 100, so gpu memory
+	// and gpu core are required only inside this branch.
+	if gpuMemoryRatio.Value() < 100 {
+		gpuMemory, hasMemory := gpuResources[ext.ResourceGPUMemory]
+		if !hasMemory {
+			return fmt.Errorf("gpu memory not found in gpu resource")
+		}
+		gpuCore, hasCore := gpuResources[ext.ResourceGPUCore]
+		if !hasCore {
+			return fmt.Errorf("gpu core not found in gpu resource")
+		}
+		containerCtx.Response.AddContainerEnvs["CUDA_DEVICE_MEMORY_LIMIT"] = fmt.Sprintf("%d", gpuMemory.Value())
+		containerCtx.Response.AddContainerEnvs["CUDA_DEVICE_SM_LIMIT"] = fmt.Sprintf("%d", gpuCore.Value())
+		containerCtx.Response.AddContainerEnvs["LD_PRELOAD"] = "/libvgpu.so"
+		containerCtx.Response.AddContainerMounts = append(containerCtx.Response.AddContainerMounts, &api.Mount{
+			Destination: "/libvgpu.so",
+			Type:        "bind",
+			Source:      "/usr/local/libvgpu.so",
+			Options:     []string{"rbind", "ro"}, // a bind mount needs an explicit bind/rbind option
+		})
+	}
+
 	return nil
 }
diff --git a/pkg/koordlet/runtimehooks/protocol/container_context.go b/pkg/koordlet/runtimehooks/protocol/container_context.go
index a39ba04a9..311fc9bc7 100644
--- a/pkg/koordlet/runtimehooks/protocol/container_context.go
+++ b/pkg/koordlet/runtimehooks/protocol/container_context.go
@@ -181,8 +181,9 @@ func (c *ContainerRequest) FromReconciler(podMeta *statesinformer.PodMeta, conta
 }
 
 type ContainerResponse struct {
-	Resources        Resources
-	AddContainerEnvs map[string]string
+	Resources          Resources
+	AddContainerEnvs   map[string]string
+	AddContainerMounts []*api.Mount // mounts NriDone passes to adjust.AddMount
 }
 
 func (c *ContainerResponse) ProxyDone(resp *runtimeapi.ContainerResourceHookResponse) {
@@ -278,6 +279,11 @@ func (c *ContainerContext) NriDone(executor resourceexecutor.ResourceUpdateExecu
 			adjust.AddEnv(k, v)
 		}
 	}
+	if len(c.Response.AddContainerMounts) != 0 {
+		for _, m := range c.Response.AddContainerMounts {
+			adjust.AddMount(m)
+		}
+	}
 
 	c.Update()
 