Commit 3cf9de4

server: return draining progress if the server controller is draining
This commit fixes a bug where, if the server controller is being drained, the drain command would return a response without populating the IsDraining field. The CLI interprets this as if the drain were already complete.

Example output when running:

```
./cockroach node drain 3 --insecure --url {pgurl:3} --logtostderr=INFO
I251008 15:54:42.613200 15 2@rpc/peer.go:613 [rnode=?,raddr=10.142.1.228:26257,class=system,rpc] 1  connection is now healthy
node is draining... remaining: 3
I251008 15:54:42.622586 1 cli/rpc_node_shutdown.go:184 [-] 2  drain details: tenant servers: 3
node is draining... remaining: 3
I251008 15:54:42.824526 1 cli/rpc_node_shutdown.go:184 [-] 3  drain details: tenant servers: 3
node is draining... remaining: 1
I251008 15:54:43.026405 1 cli/rpc_node_shutdown.go:184 [-] 4  drain details: tenant servers: 1
node is draining... remaining: 1
I251008 15:54:43.228596 1 cli/rpc_node_shutdown.go:184 [-] 5  drain details: tenant servers: 1
node is draining... remaining: 243
I251008 15:54:44.580413 1 cli/rpc_node_shutdown.go:184 [-] 6  drain details: liveness record: 1, range lease iterations: 175, descriptor leases: 67
node is draining... remaining: 0 (complete)
drain ok
```

Release note (bug fix): fixed a bug in the drain command where draining a node using virtual clusters (such as clusters running Physical Cluster Replication) could return before the drain was complete, possibly resulting in shutting down the node while it still had active SQL clients and range leases.

Epic: None
1 parent c9e2699 commit 3cf9de4
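For context on the failure mode, here is a minimal Go sketch of a polling client that, like the CLI, treats a response whose IsDraining field is false (or never populated) as a finished drain. The drainResponse type and pollDrain helper are simplified illustrations, not the actual serverpb types or CLI code:

```go
package main

import (
	"fmt"
	"time"
)

// drainResponse is a simplified stand-in for the server's drain response;
// the field names mirror the ones mentioned in the commit message.
type drainResponse struct {
	IsDraining              bool   // stays false if the server forgets to set it
	DrainRemainingIndicator uint64 // how much drain work is left
}

// pollDrain shows why an unpopulated IsDraining field is harmful: the loop
// exits as soon as IsDraining is false, even if drain work remains.
func pollDrain(send func() drainResponse) {
	for {
		resp := send()
		if !resp.IsDraining {
			// With the bug, a response sent while the server controller was
			// still draining hit this branch and reported success too early.
			fmt.Println("drain ok")
			return
		}
		fmt.Printf("node is draining... remaining: %d\n", resp.DrainRemainingIndicator)
		time.Sleep(200 * time.Millisecond)
	}
}

func main() {
	remaining := uint64(3)
	pollDrain(func() drainResponse {
		if remaining == 0 {
			return drainResponse{IsDraining: false}
		}
		remaining--
		return drainResponse{IsDraining: true, DrainRemainingIndicator: remaining}
	})
}
```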

File tree

3 files changed: +54 −9 lines changed


pkg/server/drain.go

Lines changed: 10 additions & 4 deletions

```diff
@@ -389,10 +389,13 @@ func (s *drainServer) drainInner(
 	return s.drainNode(ctx, reporter, verbose)
 }
 
-// isDraining returns true if either SQL client connections are being drained
-// or if one of the stores on the node is not accepting replicas.
+// isDraining returns true if the SQL client connections are being drained,
+// if one of the stores on the node is not accepting replicas, or if the
+// server controller is being drained.
 func (s *drainServer) isDraining() bool {
-	return s.sqlServer.pgServer.IsDraining() || (s.kvServer.node != nil && s.kvServer.node.IsDraining())
+	return s.sqlServer.pgServer.IsDraining() ||
+		(s.kvServer.node != nil && s.kvServer.node.IsDraining()) ||
+		(s.serverCtl != nil && s.serverCtl.IsDraining())
 }
 
 // drainClients starts draining the SQL layer.
@@ -404,7 +407,10 @@ func (s *drainServer) drainClients(
 	var cancel context.CancelFunc
 	ctx, cancel = context.WithCancel(ctx)
 	defer cancel()
-	shouldDelayDraining := !s.isDraining()
+
+	// If this is the first time we are draining the clients, we delay draining
+	// to allow health probes to notice that the node is not ready.
+	shouldDelayDraining := !s.sqlServer.pgServer.IsDraining()
 
 	// Set the gRPC mode of the node to "draining" and mark the node as "not ready".
 	// Probes to /health?ready=1 will now notice the change in the node's readiness.
```
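The second hunk is the subtle part: now that the aggregate isDraining() also reflects the server controller, the health-probe delay must key off the SQL-client drain specifically. A rough sketch of that distinction, with illustrative names (drainGate, clientsDraining, and otherDraining are not from the patch):

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// drainGate is an illustrative stand-in for the drain server's state.
type drainGate struct {
	clientsDraining atomic.Bool // plays the role of pgServer.IsDraining()
	otherDraining   atomic.Bool // plays the role of serverCtl.IsDraining()
}

// isDraining mirrors the aggregate predicate: true if any subsystem drains.
func (g *drainGate) isDraining() bool {
	return g.clientsDraining.Load() || g.otherDraining.Load()
}

// drainClients delays only before the first *client* drain. Gating the delay
// on the aggregate isDraining() instead would skip it whenever another
// subsystem (here, otherDraining) had already started draining first.
func (g *drainGate) drainClients() {
	if !g.clientsDraining.Load() {
		time.Sleep(10 * time.Millisecond) // stand-in for the health-probe delay
	}
	g.clientsDraining.Store(true)
}

func main() {
	g := &drainGate{}
	g.otherDraining.Store(true)           // server controller started draining first
	fmt.Println(g.isDraining())           // true: aggregate already reports draining
	g.drainClients()                      // still delays: clients weren't draining yet
	fmt.Println(g.clientsDraining.Load()) // true
}
```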

pkg/server/drain_test.go

Lines changed: 40 additions & 5 deletions

```diff
@@ -7,6 +7,7 @@ package server_test
 
 import (
 	"context"
+	gosql "database/sql"
 	"io"
 	"testing"
 	"time"
@@ -20,6 +21,7 @@
 	"github.com/cockroachdb/cockroach/pkg/sql/sqlstats/persistedsqlstats/sqlstatstestutil"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
+	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
 	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
 	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
@@ -34,15 +36,39 @@
 func TestDrain(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
-	doTestDrain(t)
+	testutils.RunTrueAndFalse(t, "secondaryTenants", func(t *testing.T, secondaryTenants bool) {
+		doTestDrain(t, secondaryTenants)
+	})
 }
 
 // doTestDrain runs the drain test.
-func doTestDrain(tt *testing.T) {
+func doTestDrain(tt *testing.T, secondaryTenants bool) {
+	if secondaryTenants {
+		// Draining the tenant server takes a long time under stress. See #155229.
+		skip.UnderRace(tt)
+	}
 	var drainSleepCallCount = 0
 	t := newTestDrainContext(tt, &drainSleepCallCount)
 	defer t.Close()
 
+	var tenantConn *gosql.DB
+	if secondaryTenants {
+		_, err := t.tc.ServerConn(0).Exec("CREATE TENANT hello")
+		require.NoError(tt, err)
+		_, err = t.tc.ServerConn(0).Exec("ALTER TENANT hello START SERVICE SHARED")
+		require.NoError(tt, err)
+
+		// Wait for the tenant to be ready by establishing a test connection.
+		testutils.SucceedsSoon(tt, func() error {
+			tenantConn, err = t.tc.Server(0).SystemLayer().SQLConnE(
+				serverutils.DBName("cluster:hello/defaultdb"))
+			if err != nil {
+				return err
+			}
+			return tenantConn.Ping()
+		})
+	}
+
 	// Issue a probe. We're not draining yet, so the probe should
 	// reflect that.
 	resp := t.sendProbe()
@@ -54,7 +80,12 @@ func doTestDrain(tt *testing.T) {
 	resp = t.sendDrainNoShutdown()
 	t.assertDraining(resp, true)
 	t.assertRemaining(resp, true)
-	t.assertEqual(1, drainSleepCallCount)
+	if !secondaryTenants {
+		// If we have secondary tenants, we might not have waited before draining
+		// the clients at this point because we might have returned early if we are
+		// still draining the serverController.
+		t.assertEqual(1, drainSleepCallCount)
+	}
 
 	// Issue another probe. This checks that the server is still running
 	// (i.e. Shutdown: false was effective), the draining status is
@@ -64,12 +95,14 @@ func doTestDrain(tt *testing.T) {
 	t.assertDraining(resp, true)
 	// probe-only has no remaining.
 	t.assertRemaining(resp, false)
-	t.assertEqual(1, drainSleepCallCount)
-
+	if !secondaryTenants {
+		t.assertEqual(1, drainSleepCallCount)
+	}
 	// Repeat drain commands until we verify that there are zero remaining leases
 	// (i.e. complete). Also validate that the server did not sleep again.
 	testutils.SucceedsSoon(t, func() error {
 		resp = t.sendDrainNoShutdown()
+		t.Logf("drain response: %+v", resp)
 		if !resp.IsDraining {
 			return errors.Newf("expected draining")
 		}
@@ -79,6 +112,7 @@ func doTestDrain(tt *testing.T) {
 		}
 		return nil
 	})
+
 	t.assertEqual(1, drainSleepCallCount)
 
 	// Now issue a drain request without drain but with shutdown.
@@ -189,6 +223,7 @@ func newTestDrainContext(t *testing.T, drainSleepCallCount *int) *testDrainContext {
 		// We need to start the cluster insecure in order to not
 		// care about TLS settings for the RPC client connection.
 		ServerArgs: base.TestServerArgs{
+			DefaultTestTenant: base.TestControlsTenantsExplicitly,
 			Knobs: base.TestingKnobs{
 				Server: &server.TestingKnobs{
 					DrainSleepFn: func(time.Duration) {
```
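The test now runs in two variants via testutils.RunTrueAndFalse. For readers unfamiliar with that helper, an equivalent pattern using only the standard library might look like the sketch below; runTrueAndFalse here is a local stand-in mimicking the helper's apparent behavior, not the CockroachDB implementation (place it in a _test.go file):

```go
package example

import (
	"fmt"
	"testing"
)

// runTrueAndFalse is a stand-in for testutils.RunTrueAndFalse: it runs the
// given test body twice as named subtests, once per boolean value.
func runTrueAndFalse(t *testing.T, name string, fn func(t *testing.T, b bool)) {
	for _, b := range []bool{false, true} {
		t.Run(fmt.Sprintf("%s=%t", name, b), func(t *testing.T) {
			fn(t, b)
		})
	}
}

func TestDrainVariants(t *testing.T) {
	runTrueAndFalse(t, "secondaryTenants", func(t *testing.T, secondaryTenants bool) {
		// Each variant is its own subtest, so a failure is reported as
		// TestDrainVariants/secondaryTenants=true (or =false).
		t.Logf("running with secondaryTenants=%t", secondaryTenants)
	})
}
```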

pkg/server/server_controller.go

Lines changed: 4 additions & 0 deletions

```diff
@@ -168,6 +168,10 @@ func (s *serverController) SetDraining() {
 	}
 }
 
+func (s *serverController) IsDraining() bool {
+	return s.draining.Load()
+}
+
 // start monitors changes to the service mode and updates
 // the running servers accordingly.
 func (c *serverController) start(ctx context.Context, ie isql.Executor) error {
```
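The new accessor suggests the controller's draining state is an atomic flag; the Load() call matches sync/atomic's Bool. A minimal self-contained sketch of that pattern, assuming the field is indeed an atomic.Bool (the struct here is illustrative, not the real serverController):

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// serverController is an illustrative reduction: a single atomic flag that
// SetDraining flips once and IsDraining reads without locking.
type serverController struct {
	draining atomic.Bool
}

func (s *serverController) SetDraining()     { s.draining.Store(true) }
func (s *serverController) IsDraining() bool { return s.draining.Load() }

func main() {
	s := &serverController{}
	fmt.Println(s.IsDraining()) // false: nothing has started draining
	s.SetDraining()
	fmt.Println(s.IsDraining()) // true: safe to read from any goroutine
}
```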
