Skip to content

Commit

Permalink
Fixes for avoidance of hosts taking backup in PRS & ERS (#17300) (#572)
Browse files Browse the repository at this point in the history
This is a backport of upstream's vitessio#17300 . Description of the upstream PR follows:

Description
This PR addresses fixes issues in PRS & ERS preference for tablets that are not taking backups. It does away with redundant field definitions in some of the proto messages and addresses segfault that could take place during ERS.


Signed-off-by: Eduardo J. Ortega U. <[email protected]>
  • Loading branch information
ejortegau authored Dec 18, 2024
1 parent 78921fb commit 90d19bf
Show file tree
Hide file tree
Showing 12 changed files with 663 additions and 791 deletions.
178 changes: 84 additions & 94 deletions go/vt/proto/replicationdata/replicationdata.pb.go

Large diffs are not rendered by default.

38 changes: 2 additions & 36 deletions go/vt/proto/replicationdata/replicationdata_vtproto.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

959 changes: 469 additions & 490 deletions go/vt/proto/tabletmanagerdata/tabletmanagerdata.pb.go

Large diffs are not rendered by default.

72 changes: 2 additions & 70 deletions go/vt/proto/tabletmanagerdata/tabletmanagerdata_vtproto.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 14 additions & 3 deletions go/vt/vtctl/reparentutil/replication.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,9 +251,6 @@ func stopReplicationAndBuildStatusMaps(
logger.Infof("getting replication position from %v", alias)

stopReplicationStatus, err := tmc.StopReplicationAndGetStatus(groupCtx, tabletInfo.Tablet, replicationdatapb.StopReplicationMode_IOTHREADONLY)
m.Lock()
res.tabletsBackupState[alias] = stopReplicationStatus.GetBackupRunning()
m.Unlock()
if err != nil {
sqlErr, isSQLErr := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError)
if isSQLErr && sqlErr != nil && sqlErr.Number() == sqlerror.ERNotReplica {
Expand All @@ -277,6 +274,20 @@ func stopReplicationAndBuildStatusMaps(
err = vterrors.Wrapf(err, "error when getting replication status for alias %v: %v", alias, err)
}
} else {
isTakingBackup := false

// Prefer the most up-to-date information regarding whether the tablet is taking a backup from the After
// replication status, but fall back to the Before status if After is nil.
if stopReplicationStatus.After != nil {
isTakingBackup = stopReplicationStatus.After.BackupRunning
} else if stopReplicationStatus.Before != nil {
isTakingBackup = stopReplicationStatus.Before.BackupRunning
}

m.Lock()
res.tabletsBackupState[alias] = isTakingBackup
m.Unlock()

var sqlThreadRunning bool
// Check if the sql thread was running for the tablet
sqlThreadRunning, err = SQLThreadWasRunning(stopReplicationStatus)
Expand Down
82 changes: 82 additions & 0 deletions go/vt/vtctl/reparentutil/replication_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
waitForAllTablets bool
expectedStatusMap map[string]*replicationdatapb.StopReplicationStatus
expectedPrimaryStatusMap map[string]*replicationdatapb.PrimaryStatus
expectedTakingBackup map[string]bool
expectedTabletsReachable []*topodatapb.Tablet
shouldErr bool
}{
Expand Down Expand Up @@ -375,6 +376,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -443,6 +445,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -527,6 +530,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -621,6 +625,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -714,6 +719,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -786,6 +792,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
Uid: 101,
},
}},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
shouldErr: false,
},
Expand Down Expand Up @@ -847,6 +854,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{
"zone1-0000000100": {
Position: "primary-position-100",
Expand Down Expand Up @@ -921,6 +929,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{}, // zone1-0000000100 fails to demote, so does not appear
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -1044,6 +1053,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
Uid: 101,
},
}},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
shouldErr: false,
},
Expand Down Expand Up @@ -1093,6 +1103,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
After: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-9"},
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000101": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Expand Down Expand Up @@ -1288,8 +1299,78 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
},
}},
stopReplicasTimeout: time.Minute,
expectedTakingBackup: map[string]bool{"zone1-0000000100": false, "zone1-0000000101": false, "zone1-0000000102": false},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
shouldErr: false,
}, {
name: "Handle nil replication status After. No segfaulting when determining backup status, and fall back to Before status",
durability: "none",
tmc: &stopReplicationAndBuildStatusMapsTestTMClient{
stopReplicationAndGetStatusResults: map[string]*struct {
StopStatus *replicationdatapb.StopReplicationStatus
Err error
}{
"zone1-0000000100": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429100:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
},
"zone1-0000000101": {
StopStatus: &replicationdatapb.StopReplicationStatus{
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
},
},
},
tabletMap: map[string]*topo.TabletInfo{
"zone1-0000000100": {
Tablet: &topodatapb.Tablet{
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
},
},
"zone1-0000000101": {
Tablet: &topodatapb.Tablet{
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
},
},
},
ignoredTablets: sets.New[string](),
expectedStatusMap: map[string]*replicationdatapb.StopReplicationStatus{
"zone1-0000000100": {
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429100:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
"zone1-0000000101": {
Before: &replicationdatapb.Status{Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429101:1-5", IoState: int32(replication.ReplicationStateRunning), SqlState: int32(replication.ReplicationStateRunning), BackupRunning: true},
After: nil,
},
},
expectedTakingBackup: map[string]bool{"zone1-0000000100": true, "zone1-0000000101": true},
expectedPrimaryStatusMap: map[string]*replicationdatapb.PrimaryStatus{},
expectedTabletsReachable: []*topodatapb.Tablet{{
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 100,
},
}, {
Type: topodatapb.TabletType_REPLICA,
Alias: &topodatapb.TabletAlias{
Cell: "zone1",
Uid: 101,
},
}},
shouldErr: false,
},
}

Expand Down Expand Up @@ -1317,6 +1398,7 @@ func Test_stopReplicationAndBuildStatusMaps(t *testing.T) {
for idx, tablet := range res.reachableTablets {
assert.True(t, topoproto.IsTabletInList(tablet, tt.expectedTabletsReachable), "TabletsReached[%d] not found - %s", idx, topoproto.TabletAliasString(tablet.Alias))
}
assert.Equal(t, tt.expectedTakingBackup, res.tabletsBackupState)
})
}
}
Expand Down
Loading

0 comments on commit 90d19bf

Please sign in to comment.