diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go
index 3a19e1b48d..d47f9e4ba6 100644
--- a/runsc/boot/controller.go
+++ b/runsc/boot/controller.go
@@ -66,6 +66,9 @@ const (
 	// ContMgrExecuteAsync executes a command in a container.
 	ContMgrExecuteAsync = "containerManager.ExecuteAsync"
 
+	// ContMgrGetSavings gets the time savings for restored sandboxes.
+	ContMgrGetSavings = "containerManager.GetSavings"
+
 	// ContMgrPortForward starts port forwarding with the sandbox.
 	ContMgrPortForward = "containerManager.PortForward"
 
@@ -702,9 +705,10 @@ func (cm *containerManager) onRestoreFailed(err error) {
 	cm.restorer = nil
 }
 
-func (cm *containerManager) onRestoreDone() {
+func (cm *containerManager) onRestoreDone(s Savings) {
 	cm.l.mu.Lock()
 	cm.l.state = restored
+	cm.l.savings = s
 	cm.l.mu.Unlock()
 	cm.l.restoreDone.Broadcast()
 	cm.restorer = nil
@@ -1061,3 +1065,20 @@ func (cm *containerManager) ContainerRuntimeState(cid *string, state *ContainerR
 	*state = cm.l.containerRuntimeState(*cid)
 	return nil
 }
+
+// Savings holds the CPU and wall time saved by a restore.
+type Savings struct {
+	// CPUTimeSaved is the CPU time saved by restore.
+	CPUTimeSaved gtime.Duration
+	// WallTimeSaved is the wall time saved by restore.
+	WallTimeSaved gtime.Duration
+}
+
+// GetSavings returns the time savings for restored sandboxes.
+func (cm *containerManager) GetSavings(_ *struct{}, s *Savings) error {
+	log.Debugf("containerManager.GetSavings")
+	cm.l.mu.Lock()
+	*s = cm.l.savings
+	cm.l.mu.Unlock()
+	return nil
+}
diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go
index 260a6e8f25..af846bf97f 100644
--- a/runsc/boot/loader.go
+++ b/runsc/boot/loader.go
@@ -304,6 +304,11 @@ type Loader struct {
 	// +checklocks:mu
 	restoreErr error
 
+	// savings contains the restore savings (e.g. CPU and wall time saved).
+	//
+	// +checklocks:mu
+	savings Savings
+
 	LoaderExtra
 }
 
diff --git a/runsc/boot/restore.go b/runsc/boot/restore.go
index cc64a9c4cd..acd9be9847 100644
--- a/runsc/boot/restore.go
+++ b/runsc/boot/restore.go
@@ -396,13 +396,9 @@ func (r *restorer) restore(l *Loader) error {
 			}
 		}
 
-		r.cm.onRestoreDone()
-		postRestoreThread.Reached("kernel notified")
-
-		log.Infof("Restore successful")
-
 		// Calculate the CPU time saved for restore.
 		t, err := state.CPUTime()
+		var s Savings
 		if err != nil {
 			log.Warningf("Failed to get CPU time usage for restore, err: %v", err)
 		} else {
@@ -415,7 +411,8 @@ func (r *restorer) restore(l *Loader) error {
 			if err != nil {
 				log.Warningf("CPU time usage in metadata %v is invalid, err: %v", savedTimeStr, err)
 			} else {
-				log.Infof("CPU time saved with restore: %v", (savedTime - t).String())
+				s.CPUTimeSaved = savedTime - t
+				log.Infof("CPU time saved with restore: %d ms", s.CPUTimeSaved.Milliseconds())
 			}
 		}
 	}
@@ -433,9 +430,15 @@ func (r *restorer) restore(l *Loader) error {
 			if err != nil {
 				log.Warningf("Walltime in metadata %v is invalid, err: %v", savedWtStr, err)
 			} else {
-				log.Infof("Walltime saved with restore: %v", (savedWt - wt).String())
+				s.WallTimeSaved = savedWt - wt
+				log.Infof("Walltime saved with restore: %d ms", s.WallTimeSaved.Milliseconds())
 			}
 		}
+
+		r.cm.onRestoreDone(s)
+		postRestoreThread.Reached("kernel notified")
+
+		log.Infof("Restore successful")
 	}()
 
 	// Transfer ownership of the `timer` to a new goroutine.
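Reviewer note: the new `ContMgrGetSavings` RPC is consumed like any other container-manager call. A minimal sketch of the caller side, mirroring the `Sandbox.ExportMetrics` change further down in this diff (`logRestoreSavings` is a hypothetical helper; `Sandbox.call` is the existing urpc helper in runsc/sandbox/sandbox.go):

```go
// logRestoreSavings is a hypothetical helper showing how the new RPC is
// consumed from the sandbox side.
func logRestoreSavings(s *Sandbox) error {
	var savings boot.Savings
	// GetSavings returns zero values until the restore has completed.
	if err := s.call(boot.ContMgrGetSavings, nil, &savings); err != nil {
		return err
	}
	log.Infof("Restore saved %v of CPU time and %v of wall time",
		savings.CPUTimeSaved, savings.WallTimeSaved)
	return nil
}
```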
diff --git a/runsc/container/metric_server_test.go b/runsc/container/metric_server_test.go
index c77da5bdb9..33274796bb 100644
--- a/runsc/container/metric_server_test.go
+++ b/runsc/container/metric_server_test.go
@@ -30,6 +30,7 @@ import (
 	"gvisor.dev/gvisor/pkg/cleanup"
 	"gvisor.dev/gvisor/pkg/test/testutil"
 	"gvisor.dev/gvisor/runsc/config"
+	"gvisor.dev/gvisor/runsc/sandbox"
 	"gvisor.dev/gvisor/test/metricclient"
 )
 
@@ -910,3 +911,107 @@ func TestMetricServerDoesNotExportZeroValueCounters(t *testing.T) {
 		t.Logf("Last metric data:\n\n%s\n\n", metricData)
 	}
 }
+
+func TestTimeSavedMetricsAfterRestore(t *testing.T) {
+	te, cleanup := setupMetrics(t, /* forceTempUDS= */ false)
+	defer cleanup()
+
+	imagePath, err := os.MkdirTemp(testutil.TmpDir(), "checkpoint")
+	if err != nil {
+		t.Fatalf("os.MkdirTemp failed: %v", err)
+	}
+	defer os.RemoveAll(imagePath)
+
+	containerID := testutil.RandomContainerID()
+	args := Args{
+		ID:        containerID,
+		Spec:      te.sleepSpec,
+		BundleDir: te.bundleDir,
+	}
+	cont, err := New(te.sleepConf, args)
+	if err != nil {
+		t.Fatalf("error creating container: %v", err)
+	}
+
+	if err := cont.Start(te.sleepConf); err != nil {
+		t.Fatalf("Cannot start container: %v", err)
+	}
+
+	if err := cont.Checkpoint(te.sleepConf, imagePath, sandbox.CheckpointOpts{Resume: true}); err != nil {
+		t.Fatalf("Checkpoint failed: %v", err)
+	}
+
+	// Check that the time saved metrics are zero before restore.
+	ckpData, err := te.client.GetMetrics(te.testCtx, nil)
+	if err != nil {
+		t.Fatalf("Cannot get metrics after checkpointing container: %v", err)
+	}
+	cpuTimeSavedBefore, _, err := ckpData.GetPrometheusContainerInteger(metricclient.WantMetric{
+		Metric:  "testmetric_meta_sandbox_cpu_time_saved_milliseconds",
+		Sandbox: containerID,
+	})
+	if err != nil {
+		t.Errorf("Cannot get testmetric_meta_sandbox_cpu_time_saved_milliseconds from following data (err: %v):\n\n%s\n\n", err, ckpData)
+	}
+	if cpuTimeSavedBefore != 0 {
+		t.Errorf("testmetric_meta_sandbox_cpu_time_saved_milliseconds should be zero, got %v", cpuTimeSavedBefore)
+	}
+	wallTimeSavedBefore, _, err := ckpData.GetPrometheusContainerInteger(metricclient.WantMetric{
+		Metric:  "testmetric_meta_sandbox_wall_time_saved_milliseconds",
+		Sandbox: containerID,
+	})
+	if err != nil {
+		t.Errorf("Cannot get testmetric_meta_sandbox_wall_time_saved_milliseconds from following data (err: %v):\n\n%s\n\n", err, ckpData)
+	}
+	if wallTimeSavedBefore != 0 {
+		t.Errorf("testmetric_meta_sandbox_wall_time_saved_milliseconds should be zero, got %v", wallTimeSavedBefore)
+	}
+
+	if err := cont.Destroy(); err != nil {
+		t.Fatalf("Destroy failed: %v", err)
+	}
+
+	// Create a new container to restore into.
+	restoreCont, err := New(te.sleepConf, args)
+	if err != nil {
+		t.Fatalf("error creating container for restore: %v", err)
+	}
+	defer restoreCont.Destroy()
+
+	if err := restoreCont.Restore(te.sleepConf, imagePath, false, false); err != nil {
+		t.Fatalf("Restore failed: %v", err)
+	}
+
+	var cpuTimeSaved, wallTimeSaved int64
+	err = testutil.Poll(func() error {
+		restoreData, err := te.client.GetMetrics(te.testCtx, nil)
+		if err != nil {
+			return fmt.Errorf("cannot get metrics after restoring container: %v", err)
+		}
+
+		// Check that the time saved metrics are non-zero after restore.
+		cpuTimeSaved, _, err = restoreData.GetPrometheusContainerInteger(metricclient.WantMetric{
+			Metric:  "testmetric_meta_sandbox_cpu_time_saved_milliseconds",
+			Sandbox: containerID,
+		})
+		if err != nil {
+			return fmt.Errorf("cannot get testmetric_meta_sandbox_cpu_time_saved_milliseconds from following data (err: %v):\n%s", err, restoreData)
+		}
+		wallTimeSaved, _, err = restoreData.GetPrometheusContainerInteger(metricclient.WantMetric{
+			Metric:  "testmetric_meta_sandbox_wall_time_saved_milliseconds",
+			Sandbox: containerID,
+		})
+		if err != nil {
+			return fmt.Errorf("cannot get testmetric_meta_sandbox_wall_time_saved_milliseconds from following data (err: %v):\n%s", err, restoreData)
+		}
+		if cpuTimeSaved == 0 && wallTimeSaved == 0 {
+			return fmt.Errorf("time saved metrics are zero: cpu: %d, wall: %d", cpuTimeSaved, wallTimeSaved)
+		}
+		return nil
+	}, 500*time.Millisecond)
+	if err != nil {
+		t.Fatalf("Failed to get non-zero time saved metrics: %v", err)
+	}
+	t.Logf("After restore, cpu_time_saved=%d", cpuTimeSaved)
+	t.Logf("After restore, wall_time_saved=%d", wallTimeSaved)
+}
diff --git a/runsc/metricserver/metricserver.go b/runsc/metricserver/metricserver.go
index efd4db644e..4463deb475 100644
--- a/runsc/metricserver/metricserver.go
+++ b/runsc/metricserver/metricserver.go
@@ -531,11 +531,13 @@ func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadRes
 // metric data (if querying metrics from the sandbox process succeeded).
 type sandboxMetricsResult struct {
 	sandboxLoadResult
-	isRunning      bool
-	isCheckpointed bool
-	isRestored     bool
-	snapshot       *prometheus.Snapshot
-	err            error
+	isRunning       bool
+	isCheckpointed  bool
+	isRestored      bool
+	cpuTimeSavedMS  int64
+	wallTimeSavedMS int64
+	snapshot        *prometheus.Snapshot
+	err             error
 }
 
 // queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
@@ -574,6 +576,8 @@ func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoad
 			isRunning := false
 			isCheckpointed := false
 			isRestored := false
+			cpuTimeSavedMS := int64(0)
+			wallTimeSavedMS := int64(0)
 			var snapshot *prometheus.Snapshot
 			err := s.err
 			if err == nil {
@@ -583,12 +587,16 @@ func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoad
 				isRunning = s.sandbox.IsRunning()
 				isCheckpointed = s.sandbox.Checkpointed
 				isRestored = s.sandbox.Restored
+				cpuTimeSavedMS = s.sandbox.CPUTimeSaved.Milliseconds()
+				wallTimeSavedMS = s.sandbox.WallTimeSaved.Milliseconds()
 			}
 			processSandbox(sandboxMetricsResult{
 				sandboxLoadResult: s,
 				isRunning:         isRunning,
 				isCheckpointed:    isCheckpointed,
 				isRestored:        isRestored,
+				cpuTimeSavedMS:    cpuTimeSavedMS,
+				wallTimeSavedMS:   wallTimeSavedMS,
 				snapshot:          snapshot,
 				err:               err,
 			})
@@ -687,6 +695,8 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
 		sandboxRunning := int64(0)
 		sandboxCheckpointed := int64(0)
 		sandboxRestored := int64(0)
+		sandboxCPUTimeSavedMS := int64(0)
+		sandboxWallTimeSavedMS := int64(0)
 		if r.isRunning {
 			sandboxRunning = 1
 			meta.numRunningSandboxes++
@@ -698,10 +708,14 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
 		if r.isRestored {
 			sandboxRestored = 1
 			meta.numRestoredSandboxes++
+			sandboxCPUTimeSavedMS = r.cpuTimeSavedMS
+			sandboxWallTimeSavedMS = r.wallTimeSavedMS
 		}
 		selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
 		selfMetrics.Add(prometheus.LabeledIntData(&SandboxCheckpointedMetric, nil, sandboxCheckpointed).SetExternalLabels(r.served.extraLabels))
 		selfMetrics.Add(prometheus.LabeledIntData(&SandboxRestoredMetric, nil, sandboxRestored).SetExternalLabels(r.served.extraLabels))
+		selfMetrics.Add(prometheus.LabeledIntData(&SandboxCPUTimeSavedMSMetric, nil, sandboxCPUTimeSavedMS).SetExternalLabels(r.served.extraLabels))
+		selfMetrics.Add(prometheus.LabeledIntData(&SandboxWallTimeSavedMSMetric, nil, sandboxWallTimeSavedMS).SetExternalLabels(r.served.extraLabels))
 		if r.err == nil {
 			selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
 			for _, cap := range r.served.capabilities {
diff --git a/runsc/metricserver/metricserver_metrics.go b/runsc/metricserver/metricserver_metrics.go
index 5ef2c6fe11..a45def7abf 100644
--- a/runsc/metricserver/metricserver_metrics.go
+++ b/runsc/metricserver/metricserver_metrics.go
@@ -40,6 +40,16 @@ var (
 		Type: prometheus.TypeGauge,
 		Help: "Boolean metric set to 1 for each restored sandbox.",
 	}
+	SandboxCPUTimeSavedMSMetric = prometheus.Metric{
+		Name: "sandbox_cpu_time_saved_milliseconds",
+		Type: prometheus.TypeGauge,
+		Help: "CPU time saved for each restored sandbox (in milliseconds).",
+	}
+	SandboxWallTimeSavedMSMetric = prometheus.Metric{
+		Name: "sandbox_wall_time_saved_milliseconds",
+		Type: prometheus.TypeGauge,
+		Help: "Wall time saved for each restored sandbox (in milliseconds).",
+	}
 	SandboxMetadataMetric = prometheus.Metric{
 		Name: "sandbox_metadata",
 		Type: prometheus.TypeGauge,
@@ -95,6 +105,8 @@ var Metrics = []*prometheus.Metric{
 	&SandboxMetadataMetric,
 	&SandboxCheckpointedMetric,
 	&SandboxRestoredMetric,
+	&SandboxCPUTimeSavedMSMetric,
+	&SandboxWallTimeSavedMSMetric,
 	&SandboxCapabilitiesMetric,
 	&SpecMetadataMetric,
 	&SandboxCreationMetric,
diff --git a/runsc/sandbox/sandbox.go b/runsc/sandbox/sandbox.go
index f4759c5b5f..398b55a3e9 100644
--- a/runsc/sandbox/sandbox.go
+++ b/runsc/sandbox/sandbox.go
@@ -229,6 +229,12 @@ type Sandbox struct {
 
 	// Restored will be true when the sandbox has been restored.
 	Restored bool `json:"restored"`
+
+	// CPUTimeSaved contains the CPU time saved when the sandbox was restored.
+	CPUTimeSaved time.Duration `json:"cpuTimeSaved"`
+
+	// WallTimeSaved contains the wall time saved when the sandbox was restored.
+	WallTimeSaved time.Duration `json:"wallTimeSaved"`
 }
 
 // Getpid returns the process ID of the sandbox process.
@@ -1662,6 +1668,20 @@ func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) {
 // ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format.
 func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) {
 	log.Debugf("Metrics export sandbox %q", s.ID)
+
+	// Update the time saved metrics before exporting, if they have not yet
+	// been fetched for this restored sandbox.
+	if s.Restored && s.CPUTimeSaved == 0 && s.WallTimeSaved == 0 {
+		var savings boot.Savings
+		err := s.call(boot.ContMgrGetSavings, nil, &savings)
+		if err != nil {
+			log.Warningf("Failed to get time saved metrics: %v", err)
+		} else {
+			s.CPUTimeSaved = savings.CPUTimeSaved
+			s.WallTimeSaved = savings.WallTimeSaved
+		}
+	}
+
 	var data control.MetricsExportData
 	if err := s.call(boot.MetricsExport, &opts, &data); err != nil {
 		return nil, err
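For reference, once a restored sandbox has been scraped, the metric server should export the new gauges next to `sandbox_restored`. In the exposition format exercised by the test above, the output looks roughly like this (the sandbox ID and values are illustrative; the `testmetric_meta_` prefix and `sandbox` label come from the test's metric server setup):

```
testmetric_meta_sandbox_restored{sandbox="abc123"} 1
testmetric_meta_sandbox_cpu_time_saved_milliseconds{sandbox="abc123"} 2500
testmetric_meta_sandbox_wall_time_saved_milliseconds{sandbox="abc123"} 4100
```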