Skip to content

Commit

Permalink
koordlet: support strict gpu share
Browse files Browse the repository at this point in the history
Signed-off-by: wangjianyu.wjy <[email protected]>
  • Loading branch information
wangjianyu.wjy committed Nov 20, 2024
1 parent b4c53c8 commit c777dc0
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 24 deletions.
8 changes: 8 additions & 0 deletions apis/extension/device_share.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ const (
LabelGPUModel string = NodeDomainPrefix + "/gpu-model"
LabelGPUDriverVersion string = NodeDomainPrefix + "/gpu-driver-version"
LabelSecondaryDeviceWellPlanned string = NodeDomainPrefix + "/secondary-device-well-planned"

LabelGPUIsolationProvider = "koordinator.sh/gpu-isolation-provider"
)

// DeviceAllocations is injected into the Pod as an annotation during the Pre-bind stage.
Expand Down Expand Up @@ -202,6 +204,12 @@ const (
GPUPartitionPolicyPrefer GPUPartitionPolicy = "Prefer"
)

// GPUIsolationProvider is a string type naming a GPU isolation provider.
// The koordlet GPU hook compares the pod annotation stored under
// LabelGPUIsolationProvider against values of this type.
type GPUIsolationProvider string

const (
// GPUIsolationProviderHAMICore is the "HAMi-core" provider value. When a pod
// carries this value, the koordlet GPU hook may add the CUDA_DEVICE_MEMORY_LIMIT,
// CUDA_DEVICE_SM_LIMIT and LD_PRELOAD envs plus a bind mount for /libvgpu.so.
GPUIsolationProviderHAMICore GPUIsolationProvider = "HAMi-core"
)

func GetDeviceAllocations(podAnnotations map[string]string) (DeviceAllocations, error) {
deviceAllocations := DeviceAllocations{}
data, ok := podAnnotations[AnnotationDeviceAllocated]
Expand Down
26 changes: 26 additions & 0 deletions pkg/koordlet/runtimehooks/hooks/gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"strings"

"github.com/containerd/nri/pkg/api"
"k8s.io/klog/v2"

ext "github.com/koordinator-sh/koordinator/apis/extension"
Expand Down Expand Up @@ -70,5 +71,30 @@ func (p *gpuPlugin) InjectContainerGPUEnv(proto protocol.HooksProtocol) error {
containerCtx.Response.AddContainerEnvs = make(map[string]string)
}
containerCtx.Response.AddContainerEnvs[GpuAllocEnv] = strings.Join(gpuIDs, ",")
if containerReq.PodAnnotations[ext.LabelGPUIsolationProvider] == string(ext.GPUIsolationProviderHAMICore) {
gpuResources := devices[0].Resources
gpuMemoryRatio, ok := gpuResources[ext.ResourceGPUMemoryRatio]
if !ok {
return fmt.Errorf("gpu memory ratio not found in gpu resource")
}
if gpuMemoryRatio.Value() < 100 {
gpuMemory, ok := gpuResources[ext.ResourceGPUMemory]
if !ok {
return fmt.Errorf("gpu memory not found in gpu resource")
}
containerCtx.Response.AddContainerEnvs["CUDA_DEVICE_MEMORY_LIMIT"] = fmt.Sprintf("%d", gpuMemory.Value())
gpuCore, ok := gpuResources[ext.ResourceGPUCore]
if ok {
containerCtx.Response.AddContainerEnvs["CUDA_DEVICE_SM_LIMIT"] = fmt.Sprintf("%d", gpuCore.Value())
}
containerCtx.Response.AddContainerEnvs["LD_PRELOAD"] = "/libvgpu.so"
containerCtx.Response.AddContainerMounts = append(containerCtx.Response.AddContainerMounts, &api.Mount{
Destination: "/libvgpu.so",
Type: "bind",
Source: "/data/bin/libvgpu.so",
})
}
}

return nil
}
79 changes: 57 additions & 22 deletions pkg/koordlet/runtimehooks/hooks/gpu/gpu_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package gpu
import (
"testing"

"github.com/containerd/nri/pkg/api"
"github.com/stretchr/testify/assert"

ext "github.com/koordinator-sh/koordinator/apis/extension"
Expand All @@ -31,49 +32,83 @@ func Test_InjectContainerGPUEnv(t *testing.T) {
expectedAllocStr string
expectedError bool
proto protocol.HooksProtocol
expectedMounts []*api.Mount
expectedEnvs map[string]string
}{
{
"test empty proto",
"",
true,
nil,
name: "test empty proto",
expectedAllocStr: "",
expectedError: true,
proto: nil,
},
{
"test normal gpu alloc",
"0,1",
false,
&protocol.ContainerContext{
name: "test normal gpu alloc",
expectedAllocStr: "0,1",
expectedError: false,
proto: &protocol.ContainerContext{
Request: protocol.ContainerRequest{
PodAnnotations: map[string]string{
ext.AnnotationDeviceAllocated: "{\"gpu\": [{\"minor\": 0},{\"minor\": 1}]}",
},
},
},
expectedEnvs: map[string]string{GpuAllocEnv: "0,1"},
},
{
"test empty gpu alloc",
"",
false,
&protocol.ContainerContext{
name: "test empty gpu alloc",
expectedAllocStr: "",
expectedError: false,
proto: &protocol.ContainerContext{
Request: protocol.ContainerRequest{
PodAnnotations: map[string]string{
ext.AnnotationDeviceAllocated: "{\"fpga\": [{\"minor\": 0},{\"minor\": 1}]}",
},
},
},
},
{
name: "gpu share with HAMi",
expectedAllocStr: "1",
expectedError: false,
proto: &protocol.ContainerContext{
Request: protocol.ContainerRequest{
PodAnnotations: map[string]string{
ext.AnnotationDeviceAllocated: `{"gpu":[{"minor":1,"resources":{"koordinator.sh/gpu-core":"50","koordinator.sh/gpu-memory":"16Gi","koordinator.sh/gpu-memory-ratio":"50"}}]}`,
ext.LabelGPUIsolationProvider: string(ext.GPUIsolationProviderHAMICore),
},
},
},
expectedEnvs: map[string]string{
GpuAllocEnv: "1",
"CUDA_DEVICE_MEMORY_LIMIT": "17179869184",
"CUDA_DEVICE_SM_LIMIT": "50",
"LD_PRELOAD": "/libvgpu.so",
},
expectedMounts: []*api.Mount{
{
Destination: "/libvgpu.so",
Type: "bind",
Source: "/data/bin/libvgpu.so",
},
},
},
}
plugin := gpuPlugin{}
for _, tt := range tests {
var containerCtx *protocol.ContainerContext
if tt.proto != nil {
containerCtx = tt.proto.(*protocol.ContainerContext)
}
err := plugin.InjectContainerGPUEnv(containerCtx)
assert.Equal(t, tt.expectedError, err != nil, tt.name)
if tt.proto != nil {
containerCtx := tt.proto.(*protocol.ContainerContext)
assert.Equal(t, containerCtx.Response.AddContainerEnvs[GpuAllocEnv], tt.expectedAllocStr, tt.name)
}
t.Run(tt.name, func(t *testing.T) {
var containerCtx *protocol.ContainerContext
if tt.proto != nil {
containerCtx = tt.proto.(*protocol.ContainerContext)
}
err := plugin.InjectContainerGPUEnv(containerCtx)
assert.Equal(t, tt.expectedError, err != nil, tt.name)
if tt.proto != nil {
containerCtx := tt.proto.(*protocol.ContainerContext)
assert.Equal(t, containerCtx.Response.AddContainerEnvs[GpuAllocEnv], tt.expectedAllocStr, tt.name)
assert.Equal(t, containerCtx.Response.AddContainerEnvs, tt.expectedEnvs, tt.name)
assert.Equal(t, containerCtx.Response.AddContainerMounts, tt.expectedMounts, tt.name)
}
})

}
}
10 changes: 8 additions & 2 deletions pkg/koordlet/runtimehooks/protocol/container_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,9 @@ func (c *ContainerRequest) FromReconciler(podMeta *statesinformer.PodMeta, conta
}

// ContainerResponse collects what the runtime hooks want applied to a container:
// resource settings, environment variables to add, and mounts to add.
// NOTE(review): this span comes from a diff view, so removed and added field
// lines appear together; confirm against the real file.
type ContainerResponse struct {
Resources Resources
AddContainerEnvs map[string]string
Resources Resources
// AddContainerEnvs maps env names to values; entries are passed to adjust.AddEnv.
AddContainerEnvs map[string]string
// AddContainerMounts lists mounts; NriDone passes each one to adjust.AddMount.
AddContainerMounts []*api.Mount
}

func (c *ContainerResponse) ProxyDone(resp *runtimeapi.ContainerResourceHookResponse) {
Expand Down Expand Up @@ -278,6 +279,11 @@ func (c *ContainerContext) NriDone(executor resourceexecutor.ResourceUpdateExecu
adjust.AddEnv(k, v)
}
}
if len(c.Response.AddContainerMounts) != 0 {
for _, m := range c.Response.AddContainerMounts {
adjust.AddMount(m)
}
}

c.Update()

Expand Down

0 comments on commit c777dc0

Please sign in to comment.