@@ -144,10 +144,14 @@ func DecodeNodeDevices(str string) []*DeviceInfo {
144144 return retval
145145}
146146
147- func EncodeNodeDevices (dlist []* DeviceInfo ) string {
147+ func EncodeNodeDevices (dlist []* DeviceInfo , migStrategy string , coreScaling float64 ) string {
148148 tmp := ""
149+ core := "100"
150+ if strings .Compare (migStrategy , "none" ) == 0 {
151+ core = strconv .FormatInt (int64 (coreScaling * 100 ), 10 )
152+ }
149153 for _ , val := range dlist {
150- tmp += val .Id + "," + strconv .FormatInt (int64 (val .Count ), 10 ) + "," + strconv .Itoa (int (val .Devmem )) + "," + val .Type + "," + strconv .FormatBool (val .Health ) + "," + val .Mode + ":"
154+ tmp += val .Id + "," + strconv .FormatInt (int64 (val .Count ), 10 ) + "," + strconv .Itoa (int (val .Devmem )) + "," + core + "," + val .Type + "," + strconv .FormatBool (val .Health ) + "," + val .Mode + ":"
151155 }
152156 klog .V (3 ).Infoln ("Encoded node Devices" , tmp )
153157 return tmp
@@ -171,19 +175,22 @@ func EncodePodDevices(pd PodDevices) string {
171175 return strings .Join (ss , ";" )
172176}
173177
174- func DecodeContainerDevices (str string ) ContainerDevices {
178+ func DecodeContainerDevices (str string ) ( ContainerDevices , error ) {
175179 if len (str ) == 0 {
176- return ContainerDevices {}
180+ return ContainerDevices {}, nil
177181 }
178182 cd := strings .Split (str , ":" )
179183 contdev := ContainerDevices {}
180184 tmpdev := ContainerDevice {}
181185 if len (str ) == 0 {
182- return contdev
186+ return contdev , nil
183187 }
184188 for _ , val := range cd {
185189 if strings .Contains (val , "," ) {
186190 tmpstr := strings .Split (val , "," )
191+ if len (tmpstr ) < 4 {
192+ return contdev , fmt .Errorf ("invalid container device format: %s" , val )
193+ }
187194 tmpdev .UUID = tmpstr [0 ]
188195 tmpdev .Type = tmpstr [1 ]
189196 devmem , _ := strconv .ParseInt (tmpstr [2 ], 10 , 32 )
@@ -193,23 +200,30 @@ func DecodeContainerDevices(str string) ContainerDevices {
193200 contdev = append (contdev , tmpdev )
194201 }
195202 }
196- return contdev
203+ return contdev , nil
197204}
198205
199- func DecodePodDevices (str string ) PodDevices {
206+ func DecodePodDevices (str string ) ( PodDevices , error ) {
200207 if len (str ) == 0 {
201- return PodDevices {}
208+ return PodDevices {}, nil
202209 }
203210 var pd PodDevices
204211 for _ , s := range strings .Split (str , ";" ) {
205- cd := DecodeContainerDevices (s )
212+ cd , err := DecodeContainerDevices (s )
213+ if err != nil {
214+ return nil , err
215+ }
206216 pd = append (pd , cd )
207217 }
208- return pd
218+ return pd , nil
209219}
210220
211221func GetNextDeviceRequest (dtype string , p v1.Pod ) (v1.Container , ContainerDevices , error ) {
212- pdevices := DecodePodDevices (p .Annotations [AssignedIDsToAllocateAnnotations ])
222+ pdevices , err := DecodePodDevices (p .Annotations [AssignedIDsToAllocateAnnotations ])
223+ if err != nil {
224+ klog .Errorf ("failed to decode pod devices: %v" , err )
225+ return v1.Container {}, nil , err
226+ }
213227 klog .Infoln ("pdevices=" , pdevices )
214228 res := ContainerDevices {}
215229 for idx , val := range pdevices {
@@ -228,7 +242,11 @@ func GetNextDeviceRequest(dtype string, p v1.Pod) (v1.Container, ContainerDevice
228242}
229243
230244func EraseNextDeviceTypeFromAnnotation (dtype string , p v1.Pod ) error {
231- pdevices := DecodePodDevices (p .Annotations [AssignedIDsToAllocateAnnotations ])
245+ pdevices , err := DecodePodDevices (p .Annotations [AssignedIDsToAllocateAnnotations ])
246+ if err != nil {
247+ klog .Errorf ("failed to decode pod devices: %v" , err )
248+ return err
249+ }
232250 res := PodDevices {}
233251 found := false
234252 for _ , val := range pdevices {
@@ -383,7 +401,7 @@ func GenerateVirtualDeviceID(id uint, fakeCounter uint) string {
383401}
384402
385403// GetDevices returns virtual devices and all physical devices by index.
386- func GetDevices (gpuMemoryFactor uint ) ([]* pluginapi.Device , map [uint ]string ) {
404+ func GetDevices (gpuMemoryFactor uint , memoryScaling float64 , migStrategy string ) ([]* pluginapi.Device , map [uint ]string ) {
387405 n , ret := config .Nvml ().DeviceGetCount ()
388406 if ret != nvml .SUCCESS {
389407 klog .Fatalf ("call nvml.DeviceGetCount with error: %v" , ret )
@@ -407,6 +425,9 @@ func GetDevices(gpuMemoryFactor uint) ([]*pluginapi.Device, map[uint]string) {
407425 klog .Fatalf ("call GetMemoryInfo with error: %v" , ret )
408426 }
409427 deviceGPUMemory := uint (memory .Total / (1024 * 1024 ))
428+ if strings .Compare (migStrategy , "none" ) == 0 {
429+ deviceGPUMemory = uint (float64 (deviceGPUMemory ) * memoryScaling )
430+ }
410431 for j := uint (0 ); j < deviceGPUMemory / gpuMemoryFactor ; j ++ {
411432 klog .V (4 ).Infof ("adding virtual device: %d" , j )
412433 fakeID := GenerateVirtualDeviceID (id , j )
@@ -543,7 +564,7 @@ func ExtractMigTemplatesFromUUID(uuid string) (string, int, error) {
543564 return templateGroupName , pos , nil
544565}
545566
546- func LoadNvidiaConfig () * config.NvidiaConfig {
567+ func LoadNvidiaConfig (migStrategyFlag string ) * config.NvidiaConfig {
547568 configs , err := LoadConfigFromCM ("volcano-vgpu-device-config" )
548569 if err != nil {
549570 klog .InfoS ("configMap not found" , err .Error ())
@@ -555,6 +576,8 @@ func LoadNvidiaConfig() *config.NvidiaConfig {
555576 nvidiaConfig .DeviceSplitCount = config .DeviceSplitCount
556577 nvidiaConfig .DeviceCoreScaling = config .DeviceCoresScaling
557578 nvidiaConfig .GPUMemoryFactor = config .GPUMemoryFactor
579+ nvidiaConfig .DeviceMemoryScaling = config .DeviceMemoryScaling
580+ nvidiaConfig .MigStrategy = migStrategyFlag
558581 if err := readFromConfigFile (& nvidiaConfig ); err != nil {
559582 klog .InfoS ("readFrom device cm error" , err .Error ())
560583 }
@@ -583,6 +606,9 @@ func readFromConfigFile(sConfig *config.NvidiaConfig) error {
583606 if val .Devicecorescaling > 0 {
584607 sConfig .DeviceCoreScaling = val .Devicecorescaling
585608 }
609+ if val .Migstrategy != "" {
610+ sConfig .MigStrategy = val .Migstrategy
611+ }
586612 if val .Devicesplitcount > 0 {
587613 sConfig .DeviceSplitCount = val .Devicesplitcount
588614 }
0 commit comments