Skip to content

Commit e25ad86

Browse files
author
zhangxl56
committed
Support cores and memory overselling by scaling
1 parent d7c9ccf commit e25ad86

File tree

6 files changed

+78
-40
lines changed

6 files changed

+78
-40
lines changed

cmd/vgpu/main.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ func init() {
6868
rootCmd.Flags().UintVar(&config.DeviceSplitCount, "device-split-count", 2, "the number for NVIDIA device split")
6969
rootCmd.Flags().UintVar(&config.GPUMemoryFactor, "gpu-memory-factor", 1, "the default gpu memory block size is 1MB")
7070
rootCmd.Flags().Float64Var(&config.DeviceCoresScaling, "device-cores-scaling", 1.0, "the ratio for NVIDIA device cores scaling")
71+
rootCmd.Flags().Float64Var(&config.DeviceMemoryScaling, "device-memory-scaling", 1.0, "the ratio for NVIDIA device memory scaling")
7172
rootCmd.Flags().StringVar(&config.NodeName, "node-name", viper.GetString("node-name"), "node name")
7273

7374
rootCmd.PersistentFlags().AddGoFlagSet(util.GlobalFlagSet())
@@ -104,13 +105,13 @@ func start() error {
104105
klog.Info("Starting OS watcher.")
105106
sigs := NewOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
106107

107-
nvidiaCfg := util.LoadNvidiaConfig()
108+
nvidiaCfg := util.LoadNvidiaConfig(migStrategyFlag)
108109

109110
cache := nvidiadevice.NewDeviceCache()
110111
cache.Start()
111112
defer cache.Stop()
112113

113-
register := nvidiadevice.NewDeviceRegister(cache)
114+
register := nvidiadevice.NewDeviceRegister(cache, nvidiaCfg)
114115
register.Start()
115116
defer register.Stop()
116117

@@ -122,7 +123,7 @@ restart:
122123
p.Stop()
123124
}
124125
klog.Info("Retreiving plugins.")
125-
migStrategy, err := nvidiadevice.NewMigStrategy(migStrategyFlag)
126+
migStrategy, err := nvidiadevice.NewMigStrategy(nvidiaCfg.MigStrategy)
126127
if err != nil {
127128
return fmt.Errorf("error creating MIG strategy: %v", err)
128129
}

docker/Dockerfile.ubuntu20.04

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ FROM ubuntu:20.04 AS builder
1616
ARG TARGETARCH
1717
RUN apt-get update
1818
RUN apt-get -y install ca-certificates g++ wget
19-
RUN wget -qO- https://storage.googleapis.com/golang/go1.23.7.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -zx
19+
RUN wget -qO- https://storage.googleapis.com/golang/go1.24.6.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -zx
2020
ENV GOPATH=/go
2121
ENV PATH=$GOPATH/bin:/usr/local/go/bin:$PATH
2222
WORKDIR /go/src/volcano.sh/devices

pkg/plugin/vgpu/config/config.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ type NvidiaConfig struct {
3939
DisableCoreLimit bool `yaml:"disableCoreLimit"`
4040
MigGeometriesList []AllowedMigGeometries `yaml:"knownMigGeometries"`
4141
GPUMemoryFactor uint `yaml:"gpuMemoryFactor"`
42+
MigStrategy string `yaml:"migStrategy"`
4243
}
4344

4445
var (
@@ -70,13 +71,14 @@ func Device() device.Interface {
7071
}
7172

7273
var (
73-
DeviceSplitCount uint
74-
GPUMemoryFactor uint
75-
Mode string
76-
DeviceCoresScaling float64
77-
NodeName string
78-
RuntimeSocketFlag string
79-
DisableCoreLimit bool
74+
DeviceSplitCount uint
75+
GPUMemoryFactor uint
76+
Mode string
77+
DeviceCoresScaling float64
78+
DeviceMemoryScaling float64
79+
NodeName string
80+
RuntimeSocketFlag string
81+
DisableCoreLimit bool
8082
)
8183

8284
type MigTemplate struct {

pkg/plugin/vgpu/plugin.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func (m *NvidiaDevicePlugin) initialize() {
126126
m.server = grpc.NewServer([]grpc.ServerOption{}...)
127127
m.health = make(chan *Device)
128128
m.stop = make(chan interface{})
129-
m.virtualDevices, _ = util.GetDevices(config.GPUMemoryFactor)
129+
m.virtualDevices, _ = util.GetDevices(config.GPUMemoryFactor, m.schedulerConfig.DeviceMemoryScaling, m.migStrategy)
130130
}
131131

132132
func (m *NvidiaDevicePlugin) cleanup() {
@@ -374,7 +374,7 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device
374374
}
375375

376376
} else {
377-
_ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices()})
377+
_ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices(m.schedulerConfig.DeviceCoreScaling)})
378378
for {
379379
select {
380380
case <-m.stop:
@@ -383,7 +383,7 @@ func (m *NvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.Device
383383
// FIXME: there is no way to recover from the Unhealthy state.
384384
//d.Health = pluginapi.Unhealthy
385385
log.Printf("'%s' device marked unhealthy: %s", m.resourceName, d.ID)
386-
_ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices()})
386+
_ = s.Send(&pluginapi.ListAndWatchResponse{Devices: m.apiDevices(m.schedulerConfig.DeviceCoreScaling)})
387387
}
388388
}
389389
}
@@ -559,7 +559,7 @@ func (m *NvidiaDevicePlugin) deviceIDsFromUUIDs(uuids []string) []string {
559559
return uuids
560560
}
561561

562-
func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device {
562+
func (m *NvidiaDevicePlugin) apiDevices(coreScaling float64) []*pluginapi.Device {
563563
if strings.Compare(m.migStrategy, "mixed") == 0 {
564564
var pdevs []*pluginapi.Device
565565
for _, d := range m.cachedDevices {
@@ -589,7 +589,11 @@ func (m *NvidiaDevicePlugin) apiDevices() []*pluginapi.Device {
589589
if strings.Compare(m.resourceName, util.ResourceCores) == 0 {
590590
for _, dev := range devices {
591591
i := 0
592-
for i < 100 {
592+
cores := 100
593+
if strings.Compare(m.migStrategy, "none") == 0 {
594+
cores = int(coreScaling * float64(100))
595+
}
596+
for i < cores {
593597
res = append(res, &pluginapi.Device{
594598
ID: fmt.Sprintf("%v-core-%v", dev.ID, i),
595599
Health: dev.Health,

pkg/plugin/vgpu/register.go

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,16 +30,18 @@ import (
3030
type DevListFunc func() []*Device
3131

3232
type DeviceRegister struct {
33-
deviceCache *DeviceCache
34-
unhealthy chan *Device
35-
stopCh chan struct{}
33+
deviceCache *DeviceCache
34+
unhealthy chan *Device
35+
nvidiaConfig *config.NvidiaConfig
36+
stopCh chan struct{}
3637
}
3738

38-
func NewDeviceRegister(deviceCache *DeviceCache) *DeviceRegister {
39+
func NewDeviceRegister(deviceCache *DeviceCache, nvidiaConfig *config.NvidiaConfig) *DeviceRegister {
3940
return &DeviceRegister{
40-
deviceCache: deviceCache,
41-
unhealthy: make(chan *Device),
42-
stopCh: make(chan struct{}),
41+
deviceCache: deviceCache,
42+
nvidiaConfig: nvidiaConfig,
43+
unhealthy: make(chan *Device),
44+
stopCh: make(chan struct{}),
4345
}
4446
}
4547

@@ -75,8 +77,11 @@ func (r *DeviceRegister) apiDevices() *[]*util.DeviceInfo {
7577
}
7678

7779
klog.V(3).Infoln("nvml registered device id=", dev.ID, "memory=", memory.Total, "type=", model)
78-
79-
registeredmem := int32(memory.Total/(1024*1024)) / int32(config.GPUMemoryFactor)
80+
memTotal := memory.Total / (1024 * 1024)
81+
if r.nvidiaConfig.MigStrategy == MigStrategyNone {
82+
memTotal = uint64(float64(memTotal) * r.nvidiaConfig.DeviceMemoryScaling)
83+
}
84+
registeredmem := int32(memTotal / uint64(config.GPUMemoryFactor))
8085
klog.V(3).Infoln("GPUMemoryFactor=", config.GPUMemoryFactor, "registeredmem=", registeredmem)
8186
res = append(res, &util.DeviceInfo{
8287
Id: dev.ID,
@@ -98,7 +103,7 @@ func (r *DeviceRegister) RegisterInAnnotation() error {
98103
klog.Errorln("get node error", err.Error())
99104
return err
100105
}
101-
encodeddevices := util.EncodeNodeDevices(*devices)
106+
encodeddevices := util.EncodeNodeDevices(*devices, r.nvidiaConfig.MigStrategy, r.nvidiaConfig.DeviceCoreScaling)
102107
annos[util.NodeHandshake] = "Reported " + time.Now().String()
103108
annos[util.NodeNvidiaDeviceRegistered] = encodeddevices
104109
klog.Infoln("Reporting devices", encodeddevices, "in", time.Now().String())

pkg/plugin/vgpu/util/util.go

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,14 @@ func DecodeNodeDevices(str string) []*DeviceInfo {
144144
return retval
145145
}
146146

147-
func EncodeNodeDevices(dlist []*DeviceInfo) string {
147+
func EncodeNodeDevices(dlist []*DeviceInfo, migStrategy string, coreScaling float64) string {
148148
tmp := ""
149+
core := "100"
150+
if strings.Compare(migStrategy, "none") == 0 {
151+
core = strconv.FormatInt(int64(coreScaling*100), 10)
152+
}
149153
for _, val := range dlist {
150-
tmp += val.Id + "," + strconv.FormatInt(int64(val.Count), 10) + "," + strconv.Itoa(int(val.Devmem)) + "," + val.Type + "," + strconv.FormatBool(val.Health) + "," + val.Mode + ":"
154+
tmp += val.Id + "," + strconv.FormatInt(int64(val.Count), 10) + "," + strconv.Itoa(int(val.Devmem)) + "," + core + "," + val.Type + "," + strconv.FormatBool(val.Health) + "," + val.Mode + ":"
151155
}
152156
klog.V(3).Infoln("Encoded node Devices", tmp)
153157
return tmp
@@ -171,19 +175,22 @@ func EncodePodDevices(pd PodDevices) string {
171175
return strings.Join(ss, ";")
172176
}
173177

174-
func DecodeContainerDevices(str string) ContainerDevices {
178+
func DecodeContainerDevices(str string) (ContainerDevices, error) {
175179
if len(str) == 0 {
176-
return ContainerDevices{}
180+
return ContainerDevices{}, nil
177181
}
178182
cd := strings.Split(str, ":")
179183
contdev := ContainerDevices{}
180184
tmpdev := ContainerDevice{}
181185
if len(str) == 0 {
182-
return contdev
186+
return contdev, nil
183187
}
184188
for _, val := range cd {
185189
if strings.Contains(val, ",") {
186190
tmpstr := strings.Split(val, ",")
191+
if len(tmpstr) < 4 {
192+
return contdev, fmt.Errorf("invalid container device format: %s", val)
193+
}
187194
tmpdev.UUID = tmpstr[0]
188195
tmpdev.Type = tmpstr[1]
189196
devmem, _ := strconv.ParseInt(tmpstr[2], 10, 32)
@@ -193,23 +200,30 @@ func DecodeContainerDevices(str string) ContainerDevices {
193200
contdev = append(contdev, tmpdev)
194201
}
195202
}
196-
return contdev
203+
return contdev, nil
197204
}
198205

199-
func DecodePodDevices(str string) PodDevices {
206+
func DecodePodDevices(str string) (PodDevices, error) {
200207
if len(str) == 0 {
201-
return PodDevices{}
208+
return PodDevices{}, nil
202209
}
203210
var pd PodDevices
204211
for _, s := range strings.Split(str, ";") {
205-
cd := DecodeContainerDevices(s)
212+
cd, err := DecodeContainerDevices(s)
213+
if err != nil {
214+
return nil, err
215+
}
206216
pd = append(pd, cd)
207217
}
208-
return pd
218+
return pd, nil
209219
}
210220

211221
func GetNextDeviceRequest(dtype string, p v1.Pod) (v1.Container, ContainerDevices, error) {
212-
pdevices := DecodePodDevices(p.Annotations[AssignedIDsToAllocateAnnotations])
222+
pdevices, err := DecodePodDevices(p.Annotations[AssignedIDsToAllocateAnnotations])
223+
if err != nil {
224+
klog.Errorf("failed to decode pod devices: %v", err)
225+
return v1.Container{}, nil, err
226+
}
213227
klog.Infoln("pdevices=", pdevices)
214228
res := ContainerDevices{}
215229
for idx, val := range pdevices {
@@ -228,7 +242,11 @@ func GetNextDeviceRequest(dtype string, p v1.Pod) (v1.Container, ContainerDevice
228242
}
229243

230244
func EraseNextDeviceTypeFromAnnotation(dtype string, p v1.Pod) error {
231-
pdevices := DecodePodDevices(p.Annotations[AssignedIDsToAllocateAnnotations])
245+
pdevices, err := DecodePodDevices(p.Annotations[AssignedIDsToAllocateAnnotations])
246+
if err != nil {
247+
klog.Errorf("failed to decode pod devices: %v", err)
248+
return err
249+
}
232250
res := PodDevices{}
233251
found := false
234252
for _, val := range pdevices {
@@ -383,7 +401,7 @@ func GenerateVirtualDeviceID(id uint, fakeCounter uint) string {
383401
}
384402

385403
// GetDevices returns virtual devices and all physical devices by index.
386-
func GetDevices(gpuMemoryFactor uint) ([]*pluginapi.Device, map[uint]string) {
404+
func GetDevices(gpuMemoryFactor uint, memoryScaling float64, migStrategy string) ([]*pluginapi.Device, map[uint]string) {
387405
n, ret := config.Nvml().DeviceGetCount()
388406
if ret != nvml.SUCCESS {
389407
klog.Fatalf("call nvml.DeviceGetCount with error: %v", ret)
@@ -407,6 +425,9 @@ func GetDevices(gpuMemoryFactor uint) ([]*pluginapi.Device, map[uint]string) {
407425
klog.Fatalf("call GetMemoryInfo with error: %v", ret)
408426
}
409427
deviceGPUMemory := uint(memory.Total / (1024 * 1024))
428+
if strings.Compare(migStrategy, "none") == 0 {
429+
deviceGPUMemory = uint(float64(deviceGPUMemory) * memoryScaling)
430+
}
410431
for j := uint(0); j < deviceGPUMemory/gpuMemoryFactor; j++ {
411432
klog.V(4).Infof("adding virtual device: %d", j)
412433
fakeID := GenerateVirtualDeviceID(id, j)
@@ -543,7 +564,7 @@ func ExtractMigTemplatesFromUUID(uuid string) (string, int, error) {
543564
return templateGroupName, pos, nil
544565
}
545566

546-
func LoadNvidiaConfig() *config.NvidiaConfig {
567+
func LoadNvidiaConfig(migStrategyFlag string) *config.NvidiaConfig {
547568
configs, err := LoadConfigFromCM("volcano-vgpu-device-config")
548569
if err != nil {
549570
klog.InfoS("configMap not found", err.Error())
@@ -555,6 +576,8 @@ func LoadNvidiaConfig() *config.NvidiaConfig {
555576
nvidiaConfig.DeviceSplitCount = config.DeviceSplitCount
556577
nvidiaConfig.DeviceCoreScaling = config.DeviceCoresScaling
557578
nvidiaConfig.GPUMemoryFactor = config.GPUMemoryFactor
579+
nvidiaConfig.DeviceMemoryScaling = config.DeviceMemoryScaling
580+
nvidiaConfig.MigStrategy = migStrategyFlag
558581
if err := readFromConfigFile(&nvidiaConfig); err != nil {
559582
klog.InfoS("readFrom device cm error", err.Error())
560583
}
@@ -583,6 +606,9 @@ func readFromConfigFile(sConfig *config.NvidiaConfig) error {
583606
if val.Devicecorescaling > 0 {
584607
sConfig.DeviceCoreScaling = val.Devicecorescaling
585608
}
609+
if val.Migstrategy != "" {
610+
sConfig.MigStrategy = val.Migstrategy
611+
}
586612
if val.Devicesplitcount > 0 {
587613
sConfig.DeviceSplitCount = val.Devicesplitcount
588614
}

0 commit comments

Comments
 (0)