Skip to content

Commit a615cc1

Browse files
committed
stas: temp backport
1 parent cd43363 commit a615cc1

File tree

3 files changed

+215
-44
lines changed

3 files changed

+215
-44
lines changed

go/vt/vttablet/tabletmanager/restore.go

Lines changed: 58 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -145,36 +145,13 @@ func (tm *TabletManager) RestoreData(
145145
}
146146

147147
var (
148-
err error
149-
startTime time.Time
148+
err error
149+
startTime time.Time
150+
backupEngine string
150151
)
151152

152153
defer func() {
153-
stopTime := time.Now()
154-
155-
h := hook.NewSimpleHook("vttablet_restore_done")
156-
h.ExtraEnv = tm.hookExtraEnv()
157-
h.ExtraEnv["TM_RESTORE_DATA_START_TS"] = startTime.UTC().Format(time.RFC3339)
158-
h.ExtraEnv["TM_RESTORE_DATA_STOP_TS"] = stopTime.UTC().Format(time.RFC3339)
159-
h.ExtraEnv["TM_RESTORE_DATA_DURATION"] = stopTime.Sub(startTime).String()
160-
161-
if err != nil {
162-
h.ExtraEnv["TM_RESTORE_DATA_ERROR"] = err.Error()
163-
}
164-
165-
// vttablet_restore_done is best-effort (for now?).
166-
go func() {
167-
// Package vthook already logs the stdout/stderr of hooks when they
168-
// are run, so we don't duplicate that here.
169-
hr := h.Execute()
170-
switch hr.ExitStatus {
171-
case hook.HOOK_SUCCESS:
172-
case hook.HOOK_DOES_NOT_EXIST:
173-
log.Info("No vttablet_restore_done hook.")
174-
default:
175-
log.Warning("vttablet_restore_done hook failed")
176-
}
177-
}()
154+
tm.invokeRestoreDoneHook(startTime, err, backupEngine)
178155
}()
179156

180157
startTime = time.Now()
@@ -185,14 +162,46 @@ func (tm *TabletManager) RestoreData(
185162
RestoreToTimestamp: protoutil.TimeToProto(restoreToTimetamp),
186163
AllowedBackupEngines: allowedBackupEngines,
187164
}
188-
err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, req, mysqlShutdownTimeout)
165+
backupEngine, err = tm.restoreDataLocked(ctx, logger, waitForBackupInterval, deleteBeforeRestore, req, mysqlShutdownTimeout)
189166
if err != nil {
190167
return err
191168
}
192169
return nil
193170
}
194171

195-
func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, request *tabletmanagerdatapb.RestoreFromBackupRequest, mysqlShutdownTimeout time.Duration) error {
172+
func (tm *TabletManager) invokeRestoreDoneHook(startTime time.Time, err error, backupEngine string) {
173+
stopTime := time.Now()
174+
175+
h := hook.NewSimpleHook("vttablet_restore_done")
176+
h.ExtraEnv = tm.hookExtraEnv()
177+
h.ExtraEnv["TM_RESTORE_DATA_START_TS"] = startTime.UTC().Format(time.RFC3339)
178+
h.ExtraEnv["TM_RESTORE_DATA_STOP_TS"] = stopTime.UTC().Format(time.RFC3339)
179+
h.ExtraEnv["TM_RESTORE_DATA_DURATION"] = stopTime.Sub(startTime).String()
180+
181+
if backupEngine != "" {
182+
h.ExtraEnv["TM_RESTORE_DATA_BACKUP_ENGINE"] = backupEngine
183+
}
184+
185+
if err != nil {
186+
h.ExtraEnv["TM_RESTORE_DATA_ERROR"] = err.Error()
187+
}
188+
189+
// vttablet_restore_done is best-effort (for now?).
190+
go func() {
191+
// Package vthook already logs the stdout/stderr of hooks when they
192+
// are run, so we don't duplicate that here.
193+
hr := h.Execute()
194+
switch hr.ExitStatus {
195+
case hook.HOOK_SUCCESS:
196+
case hook.HOOK_DOES_NOT_EXIST:
197+
log.Info("No vttablet_restore_done hook.")
198+
default:
199+
log.Warning("vttablet_restore_done hook failed")
200+
}
201+
}()
202+
}
203+
204+
func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.Logger, waitForBackupInterval time.Duration, deleteBeforeRestore bool, request *tabletmanagerdatapb.RestoreFromBackupRequest, mysqlShutdownTimeout time.Duration) (string, error) {
196205

197206
tablet := tm.Tablet()
198207
originalType := tablet.Type
@@ -203,14 +212,14 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
203212
keyspace := tablet.Keyspace
204213
keyspaceInfo, err := tm.TopoServer.GetKeyspace(ctx, keyspace)
205214
if err != nil {
206-
return err
215+
return "", err
207216
}
208217

209218
// For a SNAPSHOT keyspace, we have to look for backups of BaseKeyspace
210219
// so we will pass the BaseKeyspace in RestoreParams instead of tablet.Keyspace
211220
if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_SNAPSHOT {
212221
if keyspaceInfo.BaseKeyspace == "" {
213-
return vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, fmt.Sprintf("snapshot keyspace %v has no base_keyspace set", tablet.Keyspace))
222+
return "", vterrors.New(vtrpcpb.Code_INVALID_ARGUMENT, fmt.Sprintf("snapshot keyspace %v has no base_keyspace set", tablet.Keyspace))
214223
}
215224
keyspace = keyspaceInfo.BaseKeyspace
216225
log.Infof("Using base_keyspace %v to restore keyspace %v using a backup time of %v", keyspace, tablet.Keyspace, protoutil.TimeFromProto(request.BackupTime).UTC())
@@ -239,12 +248,12 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
239248
}
240249
restoreToTimestamp := protoutil.TimeFromProto(request.RestoreToTimestamp).UTC()
241250
if request.RestoreToPos != "" && !restoreToTimestamp.IsZero() {
242-
return vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "--restore-to-pos and --restore-to-timestamp are mutually exclusive")
251+
return "", vterrors.Errorf(vtrpcpb.Code_INVALID_ARGUMENT, "--restore-to-pos and --restore-to-timestamp are mutually exclusive")
243252
}
244253
if request.RestoreToPos != "" {
245254
pos, err := replication.DecodePosition(request.RestoreToPos)
246255
if err != nil {
247-
return vterrors.Wrapf(err, "restore failed: unable to decode --restore-to-pos: %s", request.RestoreToPos)
256+
return "", vterrors.Wrapf(err, "restore failed: unable to decode --restore-to-pos: %s", request.RestoreToPos)
248257
}
249258
params.RestoreToPos = pos
250259
}
@@ -258,19 +267,19 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
258267
// so we keep our PrimaryTermStartTime (if any) if we aren't actually restoring.
259268
ok, err := mysqlctl.ShouldRestore(ctx, params)
260269
if err != nil {
261-
return err
270+
return "", err
262271
}
263272
if !ok {
264273
params.Logger.Infof("Attempting to restore, but mysqld already contains data. Assuming vttablet was just restarted.")
265-
return nil
274+
return "", nil
266275
}
267276
// We should not become primary after restore, because that would incorrectly
268277
// start a new primary term, and it's likely our data dir will be out of date.
269278
if originalType == topodatapb.TabletType_PRIMARY {
270279
originalType = tm.baseTabletType
271280
}
272281
if err := tm.tmState.ChangeTabletType(ctx, topodatapb.TabletType_RESTORE, DBActionNone); err != nil {
273-
return err
282+
return "", err
274283
}
275284
// Loop until a backup exists, unless we were told to give up immediately.
276285
var backupManifest *mysqlctl.BackupManifest
@@ -292,7 +301,7 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
292301
log.Infof("No backup found. Waiting %v (from -wait_for_backup_interval flag) to check again.", waitForBackupInterval)
293302
select {
294303
case <-ctx.Done():
295-
return ctx.Err()
304+
return "", ctx.Err()
296305
case <-time.After(waitForBackupInterval):
297306
}
298307
}
@@ -308,7 +317,7 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
308317
err = tm.restoreToTimeFromBinlog(ctx, pos, keyspaceInfo.SnapshotTime)
309318
if err != nil {
310319
log.Errorf("unable to restore to the specified time %s, error : %v", keyspaceInfo.SnapshotTime.String(), err)
311-
return nil
320+
return "", nil
312321
}
313322
}
314323
switch {
@@ -321,21 +330,21 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
321330
// up with the primary.
322331
params.Logger.Infof("Restore: disabling replication")
323332
if err := tm.disableReplication(context.Background()); err != nil {
324-
return err
333+
return "", err
325334
}
326335
} else if keyspaceInfo.KeyspaceType == topodatapb.KeyspaceType_NORMAL {
327336
// Reconnect to primary only for "NORMAL" keyspaces
328337
params.Logger.Infof("Restore: starting replication at position %v", pos)
329338
if err := tm.startReplication(context.Background(), pos, originalType); err != nil {
330-
return err
339+
return "", err
331340
}
332341
}
333342
case err == mysqlctl.ErrNoBackup:
334343
// Starting with empty database.
335344
// We just need to initialize replication
336345
_, err := tm.initializeReplication(ctx, originalType)
337346
if err != nil {
338-
return err
347+
return "", err
339348
}
340349
case err == nil && params.DryRun:
341350
// Do nothing here, let the rest of code run
@@ -346,7 +355,7 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
346355
if err := tm.tmState.ChangeTabletType(bgCtx, originalType, DBActionNone); err != nil {
347356
log.Errorf("Could not change back to original tablet type %v: %v", originalType, err)
348357
}
349-
return vterrors.Wrap(err, "Can't restore backup")
358+
return "", vterrors.Wrap(err, "Can't restore backup")
350359
}
351360

352361
// If we had type BACKUP or RESTORE it's better to set our type to the init_tablet_type to make result of the restore
@@ -365,7 +374,13 @@ func (tm *TabletManager) restoreDataLocked(ctx context.Context, logger logutil.L
365374
params.Logger.Infof("Restore: changing tablet type to %v for %s", originalType, tm.tabletAlias.String())
366375
// Change type back to original type if we're ok to serve.
367376
bgCtx := context.Background()
368-
return tm.tmState.ChangeTabletType(bgCtx, originalType, DBActionNone)
377+
378+
var backupEngine string
379+
if backupManifest != nil {
380+
backupEngine = backupManifest.BackupMethod
381+
}
382+
383+
return backupEngine, tm.tmState.ChangeTabletType(bgCtx, originalType, DBActionNone)
369384
}
370385

371386
// restoreToTimeFromBinlog restores to the snapshot time of the keyspace
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
/*
2+
Copyright 2026 The Vitess Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package tabletmanager
18+
19+
import (
20+
"context"
21+
"errors"
22+
"os"
23+
"path/filepath"
24+
"strings"
25+
"testing"
26+
"time"
27+
28+
"github.com/stretchr/testify/assert"
29+
"github.com/stretchr/testify/require"
30+
31+
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
32+
)
33+
34+
func newTestTabletManager(t *testing.T) *TabletManager {
35+
t.Helper()
36+
tablet := &topodatapb.Tablet{
37+
Alias: &topodatapb.TabletAlias{
38+
Cell: "zone1",
39+
Uid: 100,
40+
},
41+
Keyspace: "testkeyspace",
42+
Shard: "-80",
43+
}
44+
tm := &TabletManager{
45+
BatchCtx: context.Background(),
46+
tabletAlias: tablet.Alias,
47+
}
48+
tm.tmState = newTMState(tm, tablet)
49+
return tm
50+
}
51+
52+
// setupHookDir creates a temporary VTROOT with a hook script that dumps
53+
// environment variables into the given output file.
54+
func setupHookDir(t *testing.T) (outputFile string) {
55+
t.Helper()
56+
57+
vtroot := t.TempDir()
58+
t.Setenv("VTROOT", vtroot)
59+
60+
hookDir := filepath.Join(vtroot, "vthook")
61+
require.NoError(t, os.MkdirAll(hookDir, 0o755))
62+
63+
outputFile = filepath.Join(vtroot, "hook_env_output")
64+
hookScript := filepath.Join(hookDir, "vttablet_restore_done")
65+
require.NoError(t, os.WriteFile(hookScript, []byte("#!/bin/bash\nenv > "+outputFile+"\n"), 0o755))
66+
67+
return outputFile
68+
}
69+
70+
func waitForHookOutput(t *testing.T, outputFile string) string {
71+
t.Helper()
72+
var content string
73+
assert.Eventually(t, func() bool {
74+
data, err := os.ReadFile(outputFile)
75+
if err != nil {
76+
return false
77+
}
78+
content = string(data)
79+
return len(content) > 0
80+
}, 5*time.Second, 50*time.Millisecond)
81+
return content
82+
}
83+
84+
func TestInvokeRestoreDoneHook_BackupEngine(t *testing.T) {
85+
outputFile := setupHookDir(t)
86+
tm := newTestTabletManager(t)
87+
88+
startTime := time.Now().Add(-10 * time.Second)
89+
tm.invokeRestoreDoneHook(startTime, nil, "xtrabackup")
90+
91+
content := waitForHookOutput(t, outputFile)
92+
93+
assert.Contains(t, content, "TM_RESTORE_DATA_BACKUP_ENGINE=xtrabackup")
94+
assert.Contains(t, content, "TM_RESTORE_DATA_START_TS=")
95+
assert.Contains(t, content, "TM_RESTORE_DATA_STOP_TS=")
96+
assert.Contains(t, content, "TM_RESTORE_DATA_DURATION=")
97+
assert.Contains(t, content, "TABLET_ALIAS=zone1-0000000100")
98+
assert.Contains(t, content, "KEYSPACE=testkeyspace")
99+
assert.Contains(t, content, "SHARD=-80")
100+
assert.NotContains(t, content, "TM_RESTORE_DATA_ERROR=")
101+
}
102+
103+
func TestInvokeRestoreDoneHook_EmptyBackupEngine(t *testing.T) {
104+
outputFile := setupHookDir(t)
105+
tm := newTestTabletManager(t)
106+
107+
tm.invokeRestoreDoneHook(time.Now(), nil, "")
108+
109+
content := waitForHookOutput(t, outputFile)
110+
111+
assert.NotContains(t, content, "TM_RESTORE_DATA_BACKUP_ENGINE=")
112+
}
113+
114+
func TestInvokeRestoreDoneHook_WithError(t *testing.T) {
115+
outputFile := setupHookDir(t)
116+
tm := newTestTabletManager(t)
117+
118+
restoreErr := errors.New("restore failed: connection refused")
119+
tm.invokeRestoreDoneHook(time.Now(), restoreErr, "builtin")
120+
121+
content := waitForHookOutput(t, outputFile)
122+
123+
assert.Contains(t, content, "TM_RESTORE_DATA_BACKUP_ENGINE=builtin")
124+
assert.Contains(t, content, "TM_RESTORE_DATA_ERROR=restore failed: connection refused")
125+
}
126+
127+
func TestInvokeRestoreDoneHook_ErrorWithoutBackupEngine(t *testing.T) {
128+
outputFile := setupHookDir(t)
129+
tm := newTestTabletManager(t)
130+
131+
restoreErr := errors.New("no backup found")
132+
tm.invokeRestoreDoneHook(time.Now(), restoreErr, "")
133+
134+
content := waitForHookOutput(t, outputFile)
135+
136+
assert.Contains(t, content, "TM_RESTORE_DATA_ERROR=no backup found")
137+
assert.NotContains(t, content, "TM_RESTORE_DATA_BACKUP_ENGINE=")
138+
}
139+
140+
func TestInvokeRestoreDoneHook_Timestamps(t *testing.T) {
141+
outputFile := setupHookDir(t)
142+
tm := newTestTabletManager(t)
143+
144+
startTime := time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC)
145+
tm.invokeRestoreDoneHook(startTime, nil, "xtrabackup")
146+
147+
content := waitForHookOutput(t, outputFile)
148+
149+
assert.Contains(t, content, "TM_RESTORE_DATA_START_TS=2024-01-15T10:30:00Z")
150+
// Verify the duration is present and contains a non-zero value.
151+
for _, line := range strings.Split(content, "\n") {
152+
if duration, ok := strings.CutPrefix(line, "TM_RESTORE_DATA_DURATION="); ok {
153+
assert.NotEmpty(t, duration)
154+
}
155+
}
156+
}

go/vt/vttablet/tabletmanager/rpc_backup.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ func (tm *TabletManager) RestoreFromBackup(ctx context.Context, logger logutil.L
197197
l := logutil.NewTeeLogger(logutil.NewConsoleLogger(), logger)
198198

199199
// Now we can run restore.
200-
err = tm.restoreDataLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */, request, mysqlShutdownTimeout)
200+
_, err = tm.restoreDataLocked(ctx, l, 0 /* waitForBackupInterval */, true /* deleteBeforeRestore */, request, mysqlShutdownTimeout)
201201

202202
// Re-run health check to be sure to capture any replication delay.
203203
tm.QueryServiceControl.BroadcastHealth()

0 commit comments

Comments
 (0)