Skip to content

Commit a54bb82

Browse files
Add VTOrc recovery for mismatch in tablet type (vitessio#17870)
Signed-off-by: Manan Gupta <manan@planetscale.com>
1 parent 470671c commit a54bb82

File tree

22 files changed

+454
-43
lines changed

22 files changed

+454
-43
lines changed

go/test/endtoend/cluster/cluster_process.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,14 @@ type Vttablet struct {
148148
VttabletProcess *VttabletProcess
149149
}
150150

151+
// GetAlias returns the tablet alias.
152+
func (tablet *Vttablet) GetAlias() *topodatapb.TabletAlias {
153+
return &topodatapb.TabletAlias{
154+
Cell: tablet.Cell,
155+
Uid: uint32(tablet.TabletUID),
156+
}
157+
}
158+
151159
// Keyspace : Cluster accepts keyspace to launch it
152160
type Keyspace struct {
153161
Name string

go/test/endtoend/cluster/cluster_util.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030

3131
"vitess.io/vitess/go/vt/grpcclient"
3232
"vitess.io/vitess/go/vt/log"
33+
replicationdatapb "vitess.io/vitess/go/vt/proto/replicationdata"
3334
"vitess.io/vitess/go/vt/vtgate/grpcvtgateconn"
3435

3536
"github.com/buger/jsonparser"
@@ -101,6 +102,15 @@ func GetPrimaryPosition(t *testing.T, vttablet Vttablet, hostname string) (strin
101102
return pos, gtID
102103
}
103104

105+
// FullStatus gets the full status from the given tablet.
106+
func FullStatus(t *testing.T, vttablet *Vttablet, hostname string) *replicationdatapb.FullStatus {
107+
ctx := context.Background()
108+
vtablet := getTablet(vttablet.GrpcPort, hostname)
109+
status, err := tmClient.FullStatus(ctx, vtablet)
110+
require.NoError(t, err)
111+
return status
112+
}
113+
104114
// VerifyRowsInTabletForTable verifies the total number of rows in a table.
105115
// This is used to check that replication has caught up with the changes on primary.
106116
func VerifyRowsInTabletForTable(t *testing.T, vttablet *Vttablet, ksName string, expectedRows int, tableName string) {

go/test/endtoend/vtorc/general/vtorc_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,11 @@ import (
2626
"github.com/stretchr/testify/require"
2727

2828
"vitess.io/vitess/go/mysql"
29+
"vitess.io/vitess/go/protoutil"
2930
"vitess.io/vitess/go/test/endtoend/cluster"
3031
"vitess.io/vitess/go/test/endtoend/vtorc/utils"
3132
"vitess.io/vitess/go/vt/log"
33+
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
3234
"vitess.io/vitess/go/vt/vtorc/inst"
3335
"vitess.io/vitess/go/vt/vtorc/logic"
3436
)
@@ -335,6 +337,34 @@ func TestVTOrcRepairs(t *testing.T) {
335337
0,
336338
)
337339
})
340+
341+
t.Run("Primary tablet's display type doesn't match the topo record", func(t *testing.T) {
342+
// There is no easy way to make a tablet type mismatch with the topo record.
343+
// In production this only happens when the call to update the topo record fails with a timeout,
344+
// but the operation has succeeded. We can't reliably simulate this in a test.
345+
// So, instead we are explicitly changing the tablet record for one of the tablets
346+
// to make it a primary and see that VTOrc detects the mismatch and promotes the tablet.
347+
348+
// Initially check that replication is working as intended
349+
utils.CheckReplication(t, clusterInfo, curPrimary, []*cluster.Vttablet{replica, otherReplica}, 15*time.Second)
350+
351+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
352+
defer cancel()
353+
_, err := clusterInfo.Ts.UpdateTabletFields(ctx, replica.GetAlias(), func(tablet *topodatapb.Tablet) error {
354+
tablet.Type = topodatapb.TabletType_PRIMARY
355+
tablet.PrimaryTermStartTime = protoutil.TimeToProto(time.Now())
356+
return nil
357+
})
358+
require.NoError(t, err)
359+
360+
// Wait for VTOrc to detect the mismatch and promote the tablet.
361+
require.Eventuallyf(t, func() bool {
362+
fs := cluster.FullStatus(t, replica, clusterInfo.ClusterInstance.Hostname)
363+
return fs.TabletType == topodatapb.TabletType_PRIMARY
364+
}, 10*time.Second, 1*time.Second, "Primary tablet's display type didn't match the topo record")
365+
// Also check that the replica gets promoted and can accept writes.
366+
utils.CheckReplication(t, clusterInfo, replica, []*cluster.Vttablet{curPrimary, otherReplica}, 15*time.Second)
367+
})
338368
}
339369

340370
func TestRepairAfterTER(t *testing.T) {

go/test/endtoend/vtorc/readtopologyinstance/main_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
"vitess.io/vitess/go/test/endtoend/cluster"
2626
"vitess.io/vitess/go/test/endtoend/vtorc/utils"
27+
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
2728
"vitess.io/vitess/go/vt/servenv"
2829
"vitess.io/vitess/go/vt/vtorc/config"
2930
"vitess.io/vitess/go/vt/vtorc/inst"
@@ -74,6 +75,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
7475
require.NotNil(t, primaryInstance)
7576
assert.Equal(t, utils.Hostname, primaryInstance.Hostname)
7677
assert.Equal(t, primary.MySQLPort, primaryInstance.Port)
78+
assert.Equal(t, topodatapb.TabletType_PRIMARY, primaryInstance.TabletType)
7779
assert.Contains(t, primaryInstance.InstanceAlias, "zone1")
7880
assert.NotEqual(t, 0, primaryInstance.ServerID)
7981
assert.Greater(t, len(primaryInstance.ServerUUID), 10)
@@ -125,6 +127,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
125127
require.NotNil(t, replicaInstance)
126128
assert.Equal(t, utils.Hostname, replicaInstance.Hostname)
127129
assert.Equal(t, replica.MySQLPort, replicaInstance.Port)
130+
assert.Equal(t, topodatapb.TabletType_REPLICA, replicaInstance.TabletType)
128131
assert.Contains(t, replicaInstance.InstanceAlias, "zone1")
129132
assert.NotEqual(t, 0, replicaInstance.ServerID)
130133
assert.Greater(t, len(replicaInstance.ServerUUID), 10)

go/vt/proto/replicationdata/replicationdata.pb.go

Lines changed: 31 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/vt/proto/replicationdata/replicationdata_vtproto.pb.go

Lines changed: 31 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

go/vt/vtorc/db/generate_base.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ CREATE TABLE database_instance (
4444
alias varchar(256) NOT NULL,
4545
hostname varchar(128) NOT NULL,
4646
port smallint NOT NULL,
47+
tablet_type smallint(5) NOT NULL,
4748
last_checked timestamp not null default (''),
4849
last_seen timestamp NULL DEFAULT NULL,
4950
server_id int NOT NULL,

go/vt/vtorc/inst/analysis.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ const (
3838
DeadPrimaryAndSomeReplicas AnalysisCode = "DeadPrimaryAndSomeReplicas"
3939
PrimaryHasPrimary AnalysisCode = "PrimaryHasPrimary"
4040
PrimaryIsReadOnly AnalysisCode = "PrimaryIsReadOnly"
41+
PrimaryCurrentTypeMismatch AnalysisCode = "PrimaryCurrentTypeMismatch"
4142
PrimarySemiSyncMustBeSet AnalysisCode = "PrimarySemiSyncMustBeSet"
4243
PrimarySemiSyncMustNotBeSet AnalysisCode = "PrimarySemiSyncMustNotBeSet"
4344
ReplicaIsWritable AnalysisCode = "ReplicaIsWritable"
@@ -88,6 +89,7 @@ type ReplicationAnalysis struct {
8889
AnalyzedInstanceAlias string
8990
AnalyzedInstancePrimaryAlias string
9091
TabletType topodatapb.TabletType
92+
CurrentTabletType topodatapb.TabletType
9193
PrimaryTimeStamp time.Time
9294
ClusterDetails ClusterInfo
9395
AnalyzedKeyspace string

go/vt/vtorc/inst/analysis_dao.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,9 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
158158
MIN(
159159
primary_instance.semi_sync_replica_enabled
160160
) AS semi_sync_replica_enabled,
161+
MIN(
162+
primary_instance.tablet_type
163+
) AS current_tablet_type,
161164
SUM(replica_instance.oracle_gtid) AS count_oracle_gtid_replicas,
162165
IFNULL(
163166
SUM(
@@ -300,6 +303,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
300303
}
301304

302305
a.TabletType = tablet.Type
306+
a.CurrentTabletType = topodatapb.TabletType(m.GetInt("current_tablet_type"))
303307
a.AnalyzedKeyspace = m.GetString("keyspace")
304308
a.AnalyzedShard = m.GetString("shard")
305309
a.PrimaryTimeStamp = m.GetTime("primary_timestamp")
@@ -448,6 +452,9 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
448452
a.Analysis = PrimarySemiSyncMustNotBeSet
449453
a.Description = "Primary semi-sync must not be set"
450454
//
455+
} else if a.IsClusterPrimary && a.CurrentTabletType != topodatapb.TabletType_PRIMARY {
456+
a.Analysis = PrimaryCurrentTypeMismatch
457+
a.Description = "Primary tablet's current type is not PRIMARY"
451458
} else if topo.IsReplicaType(a.TabletType) && a.ErrantGTID != "" {
452459
a.Analysis = ErrantGTIDDetected
453460
a.Description = "Tablet has errant GTIDs"

0 commit comments

Comments
 (0)