23 changes: 22 additions & 1 deletion runsc/boot/controller.go
@@ -66,6 +66,9 @@ const (
// ContMgrExecuteAsync executes a command in a container.
ContMgrExecuteAsync = "containerManager.ExecuteAsync"

// ContMgrGetSavings gets the savings for restored sandboxes.
ContMgrGetSavings = "containerManager.GetSavings"

// ContMgrPortForward starts port forwarding with the sandbox.
ContMgrPortForward = "containerManager.PortForward"

@@ -702,9 +705,10 @@ func (cm *containerManager) onRestoreFailed(err error) {
cm.restorer = nil
}

func (cm *containerManager) onRestoreDone() {
func (cm *containerManager) onRestoreDone(s Savings) {
cm.l.mu.Lock()
cm.l.state = restored
cm.l.savings = s
cm.l.mu.Unlock()
cm.l.restoreDone.Broadcast()
cm.restorer = nil
@@ -1061,3 +1065,20 @@ func (cm *containerManager) ContainerRuntimeState(cid *string, state *ContainerR
*state = cm.l.containerRuntimeState(*cid)
return nil
}

// Savings holds the CPU and wall time saved by restoring the sandbox from a checkpoint.
type Savings struct {
// CPUTimeSaved is the CPU time saved at restore.
CPUTimeSaved gtime.Duration
// WallTimeSaved is the wall time saved at restore.
WallTimeSaved gtime.Duration
}

// GetSavings returns the savings for restored sandboxes.
func (cm *containerManager) GetSavings(_ *struct{}, s *Savings) error {
log.Debugf("containerManager.GetSavings")
cm.l.mu.Lock()
*s = cm.l.savings
cm.l.mu.Unlock()
return nil
}
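
For context, here is a minimal standalone sketch (not part of this change) of how a host-side caller could fetch these values over the new `containerManager.GetSavings` method. The `rpcCaller` interface and `fakeCaller` are hypothetical stand-ins for the sandbox's urpc client; the real call site added by this change is `Sandbox.ExportMetrics` in runsc/sandbox/sandbox.go, which uses `s.call(boot.ContMgrGetSavings, nil, &savings)`.

```go
// Hypothetical illustration only; not part of this change.
package main

import (
	"fmt"
	"time"
)

// Savings mirrors the struct added to runsc/boot/controller.go.
type Savings struct {
	CPUTimeSaved  time.Duration
	WallTimeSaved time.Duration
}

// rpcCaller is a hypothetical stand-in for the sandbox's urpc client
// (e.g. (*sandbox.Sandbox).call).
type rpcCaller interface {
	Call(method string, arg any, result any) error
}

// getSavings issues the containerManager.GetSavings RPC and returns the
// time saved by restoring the sandbox.
func getSavings(c rpcCaller) (Savings, error) {
	var s Savings
	if err := c.Call("containerManager.GetSavings", nil, &s); err != nil {
		return Savings{}, err
	}
	return s, nil
}

// fakeCaller returns canned values so this sketch runs standalone.
type fakeCaller struct{}

func (fakeCaller) Call(_ string, _ any, result any) error {
	*result.(*Savings) = Savings{CPUTimeSaved: 42 * time.Second, WallTimeSaved: 3 * time.Minute}
	return nil
}

func main() {
	s, err := getSavings(fakeCaller{})
	if err != nil {
		panic(err)
	}
	fmt.Printf("cpu=%dms wall=%dms\n", s.CPUTimeSaved.Milliseconds(), s.WallTimeSaved.Milliseconds())
}
```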
5 changes: 5 additions & 0 deletions runsc/boot/loader.go
@@ -304,6 +304,11 @@ type Loader struct {
// +checklocks:mu
restoreErr error

// savings contains the restore savings (e.g., CPU and wall time saved).
//
// +checklocks:mu
savings Savings

LoaderExtra
}

17 changes: 10 additions & 7 deletions runsc/boot/restore.go
@@ -396,13 +396,9 @@ func (r *restorer) restore(l *Loader) error {
}
}

r.cm.onRestoreDone()
postRestoreThread.Reached("kernel notified")

log.Infof("Restore successful")

// Calculate the CPU time saved for restore.
t, err := state.CPUTime()
var s Savings
if err != nil {
log.Warningf("Failed to get CPU time usage for restore, err: %v", err)
} else {
@@ -415,7 +411,8 @@ func (r *restorer) restore(l *Loader) error {
if err != nil {
log.Warningf("CPU time usage in metadata %v is invalid, err: %v", savedTimeStr, err)
} else {
log.Infof("CPU time saved with restore: %v", (savedTime - t).String())
s.CPUTimeSaved = savedTime - t
log.Infof("CPU time saved with restore: %v ms", s.CPUTimeSaved.Milliseconds())
}
}
}
@@ -433,9 +430,15 @@ func (r *restorer) restore(l *Loader) error {
if err != nil {
log.Warningf("Walltime in metadata %v is invalid, err: %v", savedWtStr, err)
} else {
log.Infof("Walltime saved with restore: %v", (savedWt - wt).String())
s.WallTimeSaved = savedWt - wt
log.Infof("Walltime saved with restore: %v ms", s.WallTimeSaved.Milliseconds())
}
}

r.cm.onRestoreDone(s)
postRestoreThread.Reached("kernel notified")

log.Infof("Restore successful")
}()

// Transfer ownership of the `timer` to a new goroutine.
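
To summarize the logic above: the savings are the CPU and wall time recorded in the checkpoint metadata minus the corresponding usage measured right after restore, i.e. the work and elapsed time the restored sandbox did not have to repeat. A minimal standalone sketch of that arithmetic (the durations are made up for illustration):

```go
// Illustration of the savings arithmetic in restorer.restore; not part
// of this change. All values are made up.
package main

import (
	"fmt"
	"time"
)

func main() {
	// Recorded in the checkpoint metadata when the sandbox was saved.
	savedCPU := 90 * time.Second
	savedWall := 5 * time.Minute

	// Measured from the restored sandbox right after restore.
	currentCPU := 2 * time.Second
	currentWall := 3 * time.Second

	// The savings are the difference: work and elapsed time the
	// restored sandbox did not have to repeat.
	cpuSaved := savedCPU - currentCPU
	wallSaved := savedWall - currentWall

	fmt.Printf("CPU time saved with restore: %v ms\n", cpuSaved.Milliseconds())
	fmt.Printf("Walltime saved with restore: %v ms\n", wallSaved.Milliseconds())
}
```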
105 changes: 105 additions & 0 deletions runsc/container/metric_server_test.go
@@ -30,6 +30,7 @@ import (
"gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/test/testutil"
"gvisor.dev/gvisor/runsc/config"
"gvisor.dev/gvisor/runsc/sandbox"
"gvisor.dev/gvisor/test/metricclient"
)

@@ -910,3 +911,107 @@ func TestMetricServerDoesNotExportZeroValueCounters(t *testing.T) {
t.Logf("Last metric data:\n\n%s\n\n", metricData)
}
}

func TestTimeSavedMetricsAfterRestore(t *testing.T) {
te, cleanup := setupMetrics(t, /* forceTempUDS= */ false)
defer cleanup()

imagePath, err := os.MkdirTemp(testutil.TmpDir(), "checkpoint")
if err != nil {
t.Fatalf("os.MkdirTemp failed: %v", err)
}
defer os.RemoveAll(imagePath)

containerID := testutil.RandomContainerID()
args := Args{
ID: containerID,
Spec: te.sleepSpec,
BundleDir: te.bundleDir,
}
cont, err := New(te.sleepConf, args)
if err != nil {
t.Fatalf("error creating container: %v", err)
}

if err := cont.Start(te.sleepConf); err != nil {
t.Fatalf("Cannot start container: %v", err)
}

if err := cont.Checkpoint(te.sleepConf, imagePath, sandbox.CheckpointOpts{Resume: true}); err != nil {
t.Fatalf("Checkpoint failed: %v", err)
}

// Check time saved metrics are zero before restore.
ckpData, err := te.client.GetMetrics(te.testCtx, nil)
if err != nil {
t.Fatalf("Cannot get metrics after restoring container: %v", err)
}
cpuTimeSavedBefore, _, err := ckpData.GetPrometheusContainerInteger(metricclient.WantMetric{
Metric: "testmetric_meta_sandbox_cpu_time_saved_milliseconds",
Sandbox: containerID,
})
if err != nil {
t.Errorf("Cannot get testmetric_meta_sandbox_cpu_time_saved_milliseconds from following data (err: %v):\n\n%s\n\n", err, ckpData)
}
if cpuTimeSavedBefore != 0 {
t.Errorf("testmetric_meta_sandbox_cpu_time_saved_milliseconds should be zero, got %v", cpuTimeSavedBefore)
}
wallTimeSavedBefore, _, err := ckpData.GetPrometheusContainerInteger(metricclient.WantMetric{
Metric: "testmetric_meta_sandbox_wall_time_saved_milliseconds",
Sandbox: containerID,
})
if err != nil {
t.Errorf("Cannot get testmetric_meta_sandbox_wall_time_saved_milliseconds from following data (err: %v):\n\n%s\n\n", err, ckpData)
}
if wallTimeSavedBefore != 0 {
t.Errorf("testmetric_meta_sandbox_wall_time_saved_milliseconds should be zero, got %v", wallTimeSavedBefore)
}

if err := cont.Destroy(); err != nil {
t.Fatalf("Destroy failed: %v", err)
}

// Create new container to restore into.
restoreCont, err := New(te.sleepConf, args)
if err != nil {
t.Fatalf("error creating container for restore: %v", err)
}
defer restoreCont.Destroy()

if err := restoreCont.Restore(te.sleepConf, imagePath, false, false); err != nil {
t.Fatalf("Restore failed: %v", err)
}

var cpuTimeSaved, wallTimeSaved int64
err = testutil.Poll(func() error {
restoreData, err := te.client.GetMetrics(te.testCtx, nil)
if err != nil {
return fmt.Errorf("cannot get metrics after restoring container: %v", err)
}

// Check time saved metrics are non-zero after restore.
cpuTimeSaved, _, err = restoreData.GetPrometheusContainerInteger(metricclient.WantMetric{
Metric: "testmetric_meta_sandbox_cpu_time_saved_milliseconds",
Sandbox: containerID,
})
if err != nil {
return fmt.Errorf("cannot get testmetric_meta_sandbox_cpu_time_saved_milliseconds from following data (err: %v):\n%s", err, restoreData)
}
wallTimeSaved, _, err = restoreData.GetPrometheusContainerInteger(metricclient.WantMetric{
Metric: "testmetric_meta_sandbox_wall_time_saved_milliseconds",
Sandbox: containerID,
})
if err != nil {
return fmt.Errorf("cannot get testmetric_meta_sandbox_wall_time_saved_milliseconds from following data (err: %v):\n%s", err, restoreData)
}
if cpuTimeSaved == 0 && wallTimeSaved == 0 {
return fmt.Errorf("time saved metrics are zero: cpu: %d, wall: %d", cpuTimeSaved, wallTimeSaved)
}
return nil
}, 500*time.Millisecond)
if err != nil {
t.Fatalf("Failed to get non-zero time saved metrics: %v", err)
}
t.Logf("After restore, cpu_time_saved=%d", cpuTimeSaved)
t.Logf("After restore, wall_time_saved=%d", wallTimeSaved)
}
24 changes: 19 additions & 5 deletions runsc/metricserver/metricserver.go
@@ -531,11 +531,13 @@ func (m *metricServer) loadSandboxesLocked(ctx context.Context) []sandboxLoadRes
// metric data (if querying metrics from the sandbox process succeeded).
type sandboxMetricsResult struct {
sandboxLoadResult
isRunning bool
isCheckpointed bool
isRestored bool
snapshot *prometheus.Snapshot
err error
isRunning bool
isCheckpointed bool
isRestored bool
cpuTimeSavedMS int64
wallTimeSavedMS int64
snapshot *prometheus.Snapshot
err error
}

// queryMultiSandboxMetrics queries metric data from multiple loaded sandboxes.
Expand Down Expand Up @@ -574,6 +576,8 @@ func queryMultiSandboxMetrics(ctx context.Context, loadedSandboxes []sandboxLoad
isRunning := false
isCheckpointed := false
isRestored := false
cpuTimeSavedMS := int64(0)
wallTimeSavedMS := int64(0)
var snapshot *prometheus.Snapshot
err := s.err
if err == nil {
@@ -583,12 +587,16 @@
isRunning = s.sandbox.IsRunning()
isCheckpointed = s.sandbox.Checkpointed
isRestored = s.sandbox.Restored
cpuTimeSavedMS = s.sandbox.CPUTimeSaved.Milliseconds()
wallTimeSavedMS = s.sandbox.WallTimeSaved.Milliseconds()
}
processSandbox(sandboxMetricsResult{
sandboxLoadResult: s,
isRunning: isRunning,
isCheckpointed: isCheckpointed,
isRestored: isRestored,
cpuTimeSavedMS: cpuTimeSavedMS,
wallTimeSavedMS: wallTimeSavedMS,
snapshot: snapshot,
err: err,
})
@@ -687,6 +695,8 @@ func (m *metricServer) serveMetrics(w *httpResponseWriter, req *http.Request) ht
sandboxRunning := int64(0)
sandboxCheckpointed := int64(0)
sandboxRestored := int64(0)
sandboxCPUTimeSavedMS := int64(0)
sandboxWallTimeSavedMS := int64(0)
if r.isRunning {
sandboxRunning = 1
meta.numRunningSandboxes++
@@ -698,10 +708,14 @@
if r.isRestored {
sandboxRestored = 1
meta.numRestoredSandboxes++
sandboxCPUTimeSavedMS = r.cpuTimeSavedMS
sandboxWallTimeSavedMS = r.wallTimeSavedMS
}
selfMetrics.Add(prometheus.LabeledIntData(&SandboxRunningMetric, nil, sandboxRunning).SetExternalLabels(r.served.extraLabels))
selfMetrics.Add(prometheus.LabeledIntData(&SandboxCheckpointedMetric, nil, sandboxCheckpointed).SetExternalLabels(r.served.extraLabels))
selfMetrics.Add(prometheus.LabeledIntData(&SandboxRestoredMetric, nil, sandboxRestored).SetExternalLabels(r.served.extraLabels))
selfMetrics.Add(prometheus.LabeledIntData(&SandboxCPUTimeSavedMSMetric, nil, sandboxCPUTimeSavedMS).SetExternalLabels(r.served.extraLabels))
selfMetrics.Add(prometheus.LabeledIntData(&SandboxWallTimeSavedMSMetric, nil, sandboxWallTimeSavedMS).SetExternalLabels(r.served.extraLabels))
if r.err == nil {
selfMetrics.Add(prometheus.LabeledIntData(&SandboxMetadataMetric, r.sandbox.MetricMetadata, 1).SetExternalLabels(r.served.extraLabels))
for _, cap := range r.served.capabilities {
12 changes: 12 additions & 0 deletions runsc/metricserver/metricserver_metrics.go
@@ -40,6 +40,16 @@ var (
Type: prometheus.TypeGauge,
Help: "Boolean metric set to 1 for each restored sandbox.",
}
SandboxCPUTimeSavedMSMetric = prometheus.Metric{
Name: "sandbox_cpu_time_saved_milliseconds",
Type: prometheus.TypeGauge,
Help: "CPU time saved for each restored sandbox(in milliseconds).",
}
SandboxWallTimeSavedMSMetric = prometheus.Metric{
Name: "sandbox_wall_time_saved_milliseconds",
Type: prometheus.TypeGauge,
Help: "Wall time saved for each restored sandbox(in milliseconds).",
}
SandboxMetadataMetric = prometheus.Metric{
Name: "sandbox_metadata",
Type: prometheus.TypeGauge,
@@ -95,6 +105,8 @@ var Metrics = []*prometheus.Metric{
&SandboxMetadataMetric,
&SandboxCheckpointedMetric,
&SandboxRestoredMetric,
&SandboxCPUTimeSavedMSMetric,
&SandboxWallTimeSavedMSMetric,
&SandboxCapabilitiesMetric,
&SpecMetadataMetric,
&SandboxCreationMetric,
20 changes: 20 additions & 0 deletions runsc/sandbox/sandbox.go
@@ -229,6 +229,12 @@ type Sandbox struct {

// Restored will be true when the sandbox has been restored.
Restored bool `json:"restored"`

// CPUTimeSaved contains the CPU time saved when the sandbox has been restored.
CPUTimeSaved time.Duration `json:"cpuTimeSaved"`

// WallTimeSaved contains the wall time saved when the sandbox has been restored.
WallTimeSaved time.Duration `json:"wallTimeSaved"`
}

// Getpid returns the process ID of the sandbox process.
@@ -1662,6 +1668,20 @@ func (s *Sandbox) GetRegisteredMetrics() (*metricpb.MetricRegistration, error) {
// ExportMetrics returns a snapshot of metric values from the sandbox in Prometheus format.
func (s *Sandbox) ExportMetrics(opts control.MetricsExportOpts) (*prometheus.Snapshot, error) {
log.Debugf("Metrics export sandbox %q", s.ID)

// For restored sandboxes, fetch the time saved metrics before exporting if
// they have not been fetched already.
if s.Restored && s.CPUTimeSaved == 0 && s.WallTimeSaved == 0 {
var savings boot.Savings
err := s.call(boot.ContMgrGetSavings, nil, &savings)
if err != nil {
log.Warningf("Failed to get time saved metrics")
} else {
s.CPUTimeSaved = savings.CPUTimeSaved
s.WallTimeSaved = savings.WallTimeSaved
}
}

var data control.MetricsExportData
if err := s.call(boot.MetricsExport, &opts, &data); err != nil {
return nil, err
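
The guard above fetches the savings at most once per restored sandbox: once non-zero values are cached on the Sandbox struct (and persisted with its saved state), later exports skip the RPC. A standalone sketch of that fetch-once pattern, with a hypothetical fetchSavings standing in for the ContMgrGetSavings call:

```go
// Illustration of the fetch-once caching used by Sandbox.ExportMetrics;
// not part of this change. fetchSavings is a hypothetical stand-in for
// the containerManager.GetSavings RPC.
package main

import (
	"fmt"
	"time"
)

type sandbox struct {
	Restored      bool
	CPUTimeSaved  time.Duration
	WallTimeSaved time.Duration
}

var rpcCalls int

// fetchSavings pretends to issue the urpc round trip.
func fetchSavings() (cpu, wall time.Duration) {
	rpcCalls++
	return 42 * time.Second, 3 * time.Minute
}

// maybeUpdateSavings mirrors the guard in ExportMetrics: only restored
// sandboxes without cached values trigger the fetch.
func (s *sandbox) maybeUpdateSavings() {
	if s.Restored && s.CPUTimeSaved == 0 && s.WallTimeSaved == 0 {
		s.CPUTimeSaved, s.WallTimeSaved = fetchSavings()
	}
}

func main() {
	s := &sandbox{Restored: true}
	s.maybeUpdateSavings()
	s.maybeUpdateSavings() // values already cached: no second fetch
	fmt.Printf("fetches: %d, cpu=%dms wall=%dms\n",
		rpcCalls, s.CPUTimeSaved.Milliseconds(), s.WallTimeSaved.Milliseconds())
}
```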