Skip to content

Commit

Permalink
gpu: support strict gpu share with hami
Browse files Browse the repository at this point in the history
Signed-off-by: wangjianyu.wjy <[email protected]>
  • Loading branch information
wangjianyu.wjy committed Nov 15, 2024
1 parent b4c53c8 commit 476722e
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
26 changes: 26 additions & 0 deletions pkg/koordlet/runtimehooks/hooks/gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"strings"

"github.com/containerd/nri/pkg/api"
"k8s.io/klog/v2"

ext "github.com/koordinator-sh/koordinator/apis/extension"
Expand Down Expand Up @@ -70,5 +71,30 @@ func (p *gpuPlugin) InjectContainerGPUEnv(proto protocol.HooksProtocol) error {
containerCtx.Response.AddContainerEnvs = make(map[string]string)
}
containerCtx.Response.AddContainerEnvs[GpuAllocEnv] = strings.Join(gpuIDs, ",")

gpuResources := devices[0].Resources
gpuMemory, ok := gpuResources[ext.ResourceGPUMemory]
if !ok {
return fmt.Errorf("gpu memory not found in gpu resource")
}
gpuCore, ok := gpuResources[ext.ResourceGPUCore]
if !ok {
return fmt.Errorf("gpu core not found in gpu resource")
}
gpuMemoryRatio, ok := gpuResources[ext.ResourceGPUMemoryRatio]
if !ok {
return fmt.Errorf("gpu memory ratio not found in gpu resource")
}
if gpuMemoryRatio.Value() < 100 {
containerCtx.Response.AddContainerEnvs["CUDA_DEVICE_MEMORY_LIMIT"] = fmt.Sprintf("%d", gpuMemory.Value())
containerCtx.Response.AddContainerEnvs["CUDA_DEVICE_SM_LIMIT"] = fmt.Sprintf("%d", gpuCore.Value())
containerCtx.Response.AddContainerEnvs["LD_PRELOAD"] = "/libvgpu.so"
containerCtx.Response.AddContainerMounts = append(containerCtx.Response.AddContainerMounts, &api.Mount{
Destination: "/libvgpu.so",
Type: "bind",
Source: "/usr/local/libvgpu.so",
})
}

return nil
}
10 changes: 8 additions & 2 deletions pkg/koordlet/runtimehooks/protocol/container_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,9 @@ func (c *ContainerRequest) FromReconciler(podMeta *statesinformer.PodMeta, conta
}

// ContainerResponse holds the adjustments that runtime hooks request for a
// container; hooks write into it and the *Done methods on the context read it.
//
// NOTE(review): this listing is a rendered diff. The first Resources /
// AddContainerEnvs pair looks like the pre-change lines and the three fields
// after it like the post-change lines — confirm against the real file.
type ContainerResponse struct {
Resources Resources
AddContainerEnvs map[string]string
Resources Resources
// AddContainerEnvs maps environment variable names to the values to inject.
// The GPU hook populates it, and NriDone hands the entries to adjust.AddEnv.
AddContainerEnvs map[string]string
// AddContainerMounts lists additional mounts for the container. NriDone
// passes each element to adjust.AddMount.
AddContainerMounts []*api.Mount
}

func (c *ContainerResponse) ProxyDone(resp *runtimeapi.ContainerResourceHookResponse) {
Expand Down Expand Up @@ -278,6 +279,11 @@ func (c *ContainerContext) NriDone(executor resourceexecutor.ResourceUpdateExecu
adjust.AddEnv(k, v)
}
}
if len(c.Response.AddContainerMounts) != 0 {
for _, m := range c.Response.AddContainerMounts {
adjust.AddMount(m)
}
}

c.Update()

Expand Down

0 comments on commit 476722e

Please sign in to comment.