Skip to content

Commit e0b3c66

Browse files
nybidarigvisor-bot
authored andcommitted
Add sandbox metrics: cpu and wall time saved.
PiperOrigin-RevId: 864440294
1 parent 8a1f424 commit e0b3c66

File tree

7 files changed

+193
-13
lines changed

7 files changed

+193
-13
lines changed

runsc/boot/controller.go

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,9 @@ const (
6666
// ContMgrExecuteAsync executes a command in a container.
6767
ContMgrExecuteAsync = "containerManager.ExecuteAsync"
6868

69+
// ContMgrGetSavings gets the savings for restored sandboxes.
70+
ContMgrGetSavings = "containerManager.GetSavings"
71+
6972
// ContMgrPortForward starts port forwarding with the sandbox.
7073
ContMgrPortForward = "containerManager.PortForward"
7174

@@ -702,9 +705,10 @@ func (cm *containerManager) onRestoreFailed(err error) {
702705
cm.restorer = nil
703706
}
704707

705-
func (cm *containerManager) onRestoreDone() {
708+
func (cm *containerManager) onRestoreDone(s Savings) {
706709
cm.l.mu.Lock()
707710
cm.l.state = restored
711+
cm.l.savings = s
708712
cm.l.mu.Unlock()
709713
cm.l.restoreDone.Broadcast()
710714
cm.restorer = nil
@@ -1061,3 +1065,20 @@ func (cm *containerManager) ContainerRuntimeState(cid *string, state *ContainerR
10611065
*state = cm.l.containerRuntimeState(*cid)
10621066
return nil
10631067
}
1068+
1069+
// Savings holds the savings with restore.
1070+
type Savings struct {
1071+
// CPUTimeSaved is the CPU time saved at restore.
1072+
CPUTimeSaved gtime.Duration
1073+
// WallTimeSaved is the wall time saved at restore.
1074+
WallTimeSaved gtime.Duration
1075+
}
1076+
1077+
// GetSavings returns the savings for restored sandboxes.
1078+
func (cm *containerManager) GetSavings(_ *struct{}, s *Savings) error {
1079+
log.Debugf("containerManager.GetSavings")
1080+
cm.l.mu.Lock()
1081+
*s = cm.l.savings
1082+
cm.l.mu.Unlock()
1083+
return nil
1084+
}

runsc/boot/loader.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,11 @@ type Loader struct {
304304
// +checklocks:mu
305305
restoreErr error
306306

307+
// savings contains the restore savings (eg CPU and wall time saved).
308+
//
309+
// +checklocks:mu
310+
savings Savings
311+
307312
LoaderExtra
308313
}
309314

runsc/boot/restore.go

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -396,13 +396,9 @@ func (r *restorer) restore(l *Loader) error {
396396
}
397397
}
398398

399-
r.cm.onRestoreDone()
400-
postRestoreThread.Reached("kernel notified")
401-
402-
log.Infof("Restore successful")
403-
404399
// Calculate the CPU time saved for restore.
405400
t, err := state.CPUTime()
401+
var s Savings
406402
if err != nil {
407403
log.Warningf("Failed to get CPU time usage for restore, err: %v", err)
408404
} else {
@@ -415,7 +411,8 @@ func (r *restorer) restore(l *Loader) error {
415411
if err != nil {
416412
log.Warningf("CPU time usage in metadata %v is invalid, err: %v", savedTimeStr, err)
417413
} else {
418-
log.Infof("CPU time saved with restore: %v", (savedTime - t).String())
414+
s.CPUTimeSaved = savedTime - t
415+
log.Infof("CPU time saved with restore: %v ms", s.CPUTimeSaved.Milliseconds())
419416
}
420417
}
421418
}
@@ -433,9 +430,15 @@ func (r *restorer) restore(l *Loader) error {
433430
if err != nil {
434431
log.Warningf("Walltime in metadata %v is invalid, err: %v", savedWtStr, err)
435432
} else {
436-
log.Infof("Walltime saved with restore: %v", (savedWt - wt).String())
433+
s.WallTimeSaved = savedWt - wt
434+
log.Infof("Walltime saved with restore: %v ms", s.WallTimeSaved.Milliseconds())
437435
}
438436
}
437+
438+
r.cm.onRestoreDone(s)
439+
postRestoreThread.Reached("kernel notified")
440+
441+
log.Infof("Restore successful")
439442
}()
440443

441444
// Transfer ownership of the `timer` to a new goroutine.

runsc/container/metric_server_test.go

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
"gvisor.dev/gvisor/pkg/cleanup"
3131
"gvisor.dev/gvisor/pkg/test/testutil"
3232
"gvisor.dev/gvisor/runsc/config"
33+
"gvisor.dev/gvisor/runsc/sandbox"
3334
"gvisor.dev/gvisor/test/metricclient"
3435
)
3536

@@ -910,3 +911,107 @@ func TestMetricServerDoesNotExportZeroValueCounters(t *testing.T) {
910911
t.Logf("Last metric data:\n\n%s\n\n", metricData)
911912
}
912913
}
914+
915+
func TestContainerMetricsAfterRestore(t *testing.T) {
916+
te, cleanup := setupMetrics(t /* forceTempUDS= */, false)
917+
defer cleanup()
918+
919+
imagePath, err := os.MkdirTemp(testutil.TmpDir(), "checkpoint")
920+
if err != nil {
921+
t.Fatalf("os.MkdirTemp failed: %v", err)
922+
}
923+
defer os.RemoveAll(imagePath)
924+
925+
containerID := testutil.RandomContainerID()
926+
args := Args{
927+
ID: containerID,
928+
Spec: te.sleepSpec,
929+
BundleDir: te.bundleDir,
930+
}
931+
cont, err := New(te.sleepConf, args)
932+
if err != nil {
933+
t.Fatalf("error creating container: %v", err)
934+
}
935+
936+
if err := cont.Start(te.sleepConf); err != nil {
937+
t.Fatalf("Cannot start container: %v", err)
938+
}
939+
940+
if err := cont.Checkpoint(te.sleepConf, imagePath, sandbox.CheckpointOpts{Resume: true}); err != nil {
941+
t.Fatalf("Checkpoint failed: %v", err)
942+
}
943+
944+
// Check time saved metrics are zero before restore.
945+
ckpData, err := te.client.GetMetrics(te.testCtx, nil)
946+
if err != nil {
947+
t.Fatalf("Cannot get metrics after restoring container: %v", err)
948+
}
949+
cpuTimeSavedBefore, _, err := ckpData.GetPrometheusContainerInteger(metricclient.WantMetric{
950+
Metric: "testmetric_meta_sandbox_cpu_time_saved_milliseconds",
951+
Sandbox: containerID,
952+
})
953+
if err != nil {
954+
t.Errorf("Cannot get testmetric_meta_sandbox_cpu_time_saved_milliseconds from following data (err: %v):\n\n%s\n\n", err, ckpData)
955+
}
956+
if cpuTimeSavedBefore != 0 {
957+
t.Errorf("testmetric_meta_sandbox_cpu_time_saved_milliseconds should be zero, got %v", cpuTimeSavedBefore)
958+
}
959+
wallTimeSavedBefore, _, err := ckpData.GetPrometheusContainerInteger(metricclient.WantMetric{
960+
Metric: "testmetric_meta_sandbox_wall_time_saved_milliseconds",
961+
Sandbox: containerID,
962+
})
963+
if err != nil {
964+
t.Errorf("Cannot get testmetric_meta_sandbox_wall_time_saved_milliseconds from following data (err: %v):\n\n%s\n\n", err, ckpData)
965+
}
966+
if wallTimeSavedBefore != 0 {
967+
t.Errorf("testmetric_meta_sandbox_wall_time_saved_milliseconds should be zero, got %v", wallTimeSavedBefore)
968+
}
969+
970+
if err := cont.Destroy(); err != nil {
971+
t.Fatalf("Destroy failed: %v", err)
972+
}
973+
974+
// Create new container to restore into.
975+
restoreCont, err := New(te.sleepConf, args)
976+
if err != nil {
977+
t.Fatalf("error creating container for restore: %v", err)
978+
}
979+
defer restoreCont.Destroy()
980+
981+
if err := restoreCont.Restore(te.sleepConf, imagePath, false, false); err != nil {
982+
t.Fatalf("Restore failed: %v", err)
983+
}
984+
985+
var cpuTimeSaved, wallTimeSaved int64
986+
err = testutil.Poll(func() error {
987+
restoreData, err := te.client.GetMetrics(te.testCtx, nil)
988+
if err != nil {
989+
return fmt.Errorf("cannot get metrics after restoring container: %v", err)
990+
}
991+
992+
// Check time saved metrics are non-zero after restore.
993+
cpuTimeSaved, _, err = restoreData.GetPrometheusContainerInteger(metricclient.WantMetric{
994+
Metric: "testmetric_meta_sandbox_cpu_time_saved_milliseconds",
995+
Sandbox: containerID,
996+
})
997+
if err != nil {
998+
return fmt.Errorf("cannot get testmetric_meta_sandbox_cpu_time_saved_milliseconds from following data (err: %v):\n%s", err, restoreData)
999+
}
1000+
wallTimeSaved, _, err = restoreData.GetPrometheusContainerInteger(metricclient.WantMetric{
1001+
Metric: "testmetric_meta_sandbox_wall_time_saved_milliseconds",
1002+
Sandbox: containerID,
1003+
})
1004+
if err != nil {
1005+
return fmt.Errorf("cannot get testmetric_meta_sandbox_wall_time_saved_milliseconds from following data (err: %v):\n%s", err, restoreData)
1006+
}
1007+
if cpuTimeSaved == 0 && wallTimeSaved == 0 {
1008+
return fmt.Errorf("time saved metrics are zero: cpu: %d, wall: %d", cpuTimeSaved, wallTimeSaved)
1009+
}
1010+
return nil
1011+
}, 500*time.Millisecond)
1012+
if err != nil {
1013+
t.Fatalf("Failed to get non-zero time saved metrics: %v", err)
1014+
}
1015+
t.Logf("After restore, cpu_time_saved=%d", cpuTimeSaved)
1016+
t.Logf("After restore, wall_time_saved=%d", wallTimeSaved)
1017+
}

runsc/metricserver/metricserver.go

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -531,11 +531,13 @@ func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadRes
531531
// metric data (if querying metrics from the sandbox process succeeded).
532532
type sandboxMetricsResult struct {
533533
sandboxLoadResult
534-
isRunning bool
535-
isCheckpointed bool
536-
isRestored bool
537-
snapshot *prometheus.Snapshot
538-
err error
534+
isRunning bool
535+
isCheckpointed bool
536+
isRestored bool
537+
cpuTimeSavedMS int64
538+
wallTimeSavedMS int64
539+
snapshot *prometheus.Snapshot
540+
err error
539541
}
540542

541543
// queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
@@ -574,6 +576,8 @@ func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoad
574576
isRunning := false
575577
isCheckpointed := false
576578
isRestored := false
579+
cpuTimeSavedMS := int64(0)
580+
wallTimeSavedMS := int64(0)
577581
var snapshot *prometheus.Snapshot
578582
err := s.err
579583
if err == nil {
@@ -583,12 +587,16 @@ func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoad
583587
isRunning = s.sandbox.IsRunning()
584588
isCheckpointed = s.sandbox.Checkpointed
585589
isRestored = s.sandbox.Restored
590+
cpuTimeSavedMS = s.sandbox.CPUTimeSaved.Milliseconds()
591+
wallTimeSavedMS = s.sandbox.WallTimeSaved.Milliseconds()
586592
}
587593
processSandbox(sandboxMetricsResult{
588594
sandboxLoadResult: s,
589595
isRunning: isRunning,
590596
isCheckpointed: isCheckpointed,
591597
isRestored: isRestored,
598+
cpuTimeSavedMS: cpuTimeSavedMS,
599+
wallTimeSavedMS: wallTimeSavedMS,
592600
snapshot: snapshot,
593601
err: err,
594602
})
@@ -687,6 +695,8 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
687695
sandboxRunning := int64(0)
688696
sandboxCheckpointed := int64(0)
689697
sandboxRestored := int64(0)
698+
sandboxCPUTimeSavedMS := int64(0)
699+
sandboxWallTimeSavedMS := int64(0)
690700
if r.isRunning {
691701
sandboxRunning = 1
692702
meta.numRunningSandboxes++
@@ -698,10 +708,14 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
698708
if r.isRestored {
699709
sandboxRestored = 1
700710
meta.numRestoredSandboxes++
711+
sandboxCPUTimeSavedMS = r.cpuTimeSavedMS
712+
sandboxWallTimeSavedMS = r.wallTimeSavedMS
701713
}
702714
selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
703715
selfMetrics.Add(prometheus.LabeledIntData(&SandboxCheckpointedMetric, nil, sandboxCheckpointed).SetExternalLabels(r.served.extraLabels))
704716
selfMetrics.Add(prometheus.LabeledIntData(&SandboxRestoredMetric, nil, sandboxRestored).SetExternalLabels(r.served.extraLabels))
717+
selfMetrics.Add(prometheus.LabeledIntData(&SandboxCPUTimeSavedMSMetric, nil, sandboxCPUTimeSavedMS).SetExternalLabels(r.served.extraLabels))
718+
selfMetrics.Add(prometheus.LabeledIntData(&SandboxWallTimeSavedMSMetric, nil, sandboxWallTimeSavedMS).SetExternalLabels(r.served.extraLabels))
705719
if r.err == nil {
706720
selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
707721
for _, cap := range r.served.capabilities {

runsc/metricserver/metricserver_metrics.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,16 @@ var (
4040
Type: prometheus.TypeGauge,
4141
Help: "Boolean metric set to 1 for each restored sandbox.",
4242
}
43+
SandboxCPUTimeSavedMSMetric = prometheus.Metric{
44+
Name: "sandbox_cpu_time_saved_milliseconds",
45+
Type: prometheus.TypeGauge,
46+
Help: "CPU time saved for each restored sandbox(in milliseconds).",
47+
}
48+
SandboxWallTimeSavedMSMetric = prometheus.Metric{
49+
Name: "sandbox_wall_time_saved_milliseconds",
50+
Type: prometheus.TypeGauge,
51+
Help: "Wall time saved for each restored sandbox(in milliseconds).",
52+
}
4353
SandboxMetadataMetric = prometheus.Metric{
4454
Name: "sandbox_metadata",
4555
Type: prometheus.TypeGauge,
@@ -95,6 +105,8 @@ var Metrics = []*prometheus.Metric{
95105
&SandboxMetadataMetric,
96106
&SandboxCheckpointedMetric,
97107
&SandboxRestoredMetric,
108+
&SandboxCPUTimeSavedMSMetric,
109+
&SandboxWallTimeSavedMSMetric,
98110
&SandboxCapabilitiesMetric,
99111
&SpecMetadataMetric,
100112
&SandboxCreationMetric,

runsc/sandbox/sandbox.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,12 @@ type Sandbox struct {
229229

230230
// Restored will be true when the sandbox has been restored.
231231
Restored bool `json:"restored"`
232+
233+
// CPUTimeSaved contains the CPU time saved when the sandbox has been restored.
234+
CPUTimeSaved time.Duration `json:"cpuTimeSaved"`
235+
236+
// WallTimeSaved contains the wall time saved when the sandbox has been restored.
237+
WallTimeSaved time.Duration `json:"wallTimeSaved"`
232238
}
233239

234240
// Getpid returns the process ID of the sandbox process.
@@ -1662,6 +1668,20 @@ func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) {
16621668
// ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format.
16631669
func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) {
16641670
log.Debugf("Metrics export sandbox %q", s.ID)
1671+
1672+
// Update time saved metrics before exporting, if not exported already for
1673+
// restored sandboxes.
1674+
if s.Restored && s.CPUTimeSaved == 0 && s.WallTimeSaved == 0 {
1675+
var savings boot.Savings
1676+
err := s.call(boot.ContMgrGetSavings, nil, &savings)
1677+
if err != nil {
1678+
log.Warningf("Failed to get time saved metrics")
1679+
} else {
1680+
s.CPUTimeSaved = savings.CPUTimeSaved
1681+
s.WallTimeSaved = savings.WallTimeSaved
1682+
}
1683+
}
1684+
16651685
var data control.MetricsExportData
16661686
if err := s.call(boot.MetricsExport, &opts, &data); err != nil {
16671687
return nil, err

0 commit comments

Comments
 (0)