Commit 93fbb91

server: return draining progress if the server controller is draining
This commit fixes a bug where, if the server controller is being drained, the drain command would return a response without populating the IsDraining field. The CLI interprets this as if draining had already completed.

Example output when running:

```
./cockroach node drain 3 --insecure --url {pgurl:3} --logtostderr=INFO
I251008 15:54:42.613200 15 2@rpc/peer.go:613 [rnode=?,raddr=10.142.1.228:26257,class=system,rpc] 1 connection is now healthy
node is draining... remaining: 3
I251008 15:54:42.622586 1 cli/rpc_node_shutdown.go:184 [-] 2 drain details: tenant servers: 3
node is draining... remaining: 3
I251008 15:54:42.824526 1 cli/rpc_node_shutdown.go:184 [-] 3 drain details: tenant servers: 3
node is draining... remaining: 1
I251008 15:54:43.026405 1 cli/rpc_node_shutdown.go:184 [-] 4 drain details: tenant servers: 1
node is draining... remaining: 1
I251008 15:54:43.228596 1 cli/rpc_node_shutdown.go:184 [-] 5 drain details: tenant servers: 1
node is draining... remaining: 243
I251008 15:54:44.580413 1 cli/rpc_node_shutdown.go:184 [-] 6 drain details: liveness record: 1, range lease iterations: 175, descriptor leases: 67
node is draining... remaining: 0 (complete)
drain ok
```

Release note (bug fix): Fixed a bug in the drain command where draining a node using virtual clusters (such as clusters running Physical Cluster Replication) could return before the drain was complete, possibly resulting in shutting down the node while it still had active SQL clients and range leases.

Epic: None
1 parent 7f9ca6c commit 93fbb91
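
To make the failure mode concrete, here is a minimal, hypothetical sketch of a CLI-side drain polling loop. It is not the actual code in cli/rpc_node_shutdown.go; the field names are modeled on the IsDraining and DrainRemainingIndicator fields the drain RPC reports. If the server never populates IsDraining, the very first response already looks final and the caller proceeds to shut the node down while drain work remains:

```go
// Hypothetical sketch of a drain polling loop; the real CLI logic differs in detail.
package main

import (
	"fmt"
	"time"
)

// drainResponse mimics the drain RPC fields that matter for this bug.
type drainResponse struct {
	IsDraining                bool   // left unset by the bug while the server controller drained
	DrainRemainingIndicator   uint64 // 0 means no drain work remains
	DrainRemainingDescription string
}

// pollDrain keeps issuing drain requests until the server reports no remaining
// work. If IsDraining is never populated, the loop exits immediately and the
// caller treats the drain as complete.
func pollDrain(sendDrain func() drainResponse) {
	for {
		resp := sendDrain()
		if !resp.IsDraining {
			// Bug path: tenant servers are still draining, but the unset field
			// makes it look like there is nothing left to drain.
			fmt.Println("drain ok")
			return
		}
		fmt.Printf("node is draining... remaining: %d %s\n",
			resp.DrainRemainingIndicator, resp.DrainRemainingDescription)
		if resp.DrainRemainingIndicator == 0 {
			fmt.Println("drain ok")
			return
		}
		time.Sleep(200 * time.Millisecond)
	}
}

func main() {
	// Simulate a server that is still draining tenant servers but omits IsDraining.
	pollDrain(func() drainResponse {
		return drainResponse{IsDraining: false, DrainRemainingIndicator: 3}
	})
}
```

With the fix, the server reports IsDraining while the server controller drains its tenant servers, so a loop like this keeps polling until the remaining count reaches zero.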

3 files changed: 54 additions & 9 deletions


pkg/server/drain.go

Lines changed: 10 additions & 4 deletions
```diff
@@ -364,10 +364,13 @@ func (s *drainServer) drainInner(
 	return s.drainNode(ctx, reporter, verbose)
 }
 
-// isDraining returns true if either SQL client connections are being drained
-// or if one of the stores on the node is not accepting replicas.
+// isDraining returns true if the SQL client connections are being drained,
+// if one of the stores on the node is not accepting replicas, or if the
+// server controller is being drained.
 func (s *drainServer) isDraining() bool {
-	return s.sqlServer.pgServer.IsDraining() || (s.kvServer.node != nil && s.kvServer.node.IsDraining())
+	return s.sqlServer.pgServer.IsDraining() ||
+		(s.kvServer.node != nil && s.kvServer.node.IsDraining()) ||
+		(s.serverCtl != nil && s.serverCtl.IsDraining())
 }
 
 // drainClients starts draining the SQL layer.
@@ -379,7 +382,10 @@ func (s *drainServer) drainClients(
 	var cancel context.CancelFunc
 	ctx, cancel = context.WithCancel(ctx)
 	defer cancel()
-	shouldDelayDraining := !s.isDraining()
+
+	// If this is the first time we are draining the clients, we delay draining
+	// to allow health probes to notice that the node is not ready.
+	shouldDelayDraining := !s.sqlServer.pgServer.IsDraining()
 
 	// Set the gRPC mode of the node to "draining" and mark the node as "not ready".
 	// Probes to /health?ready=1 will now notice the change in the node's readiness.
```

pkg/server/drain_test.go

Lines changed: 40 additions & 5 deletions
```diff
@@ -7,6 +7,7 @@ package server_test
 
 import (
 	"context"
+	gosql "database/sql"
 	"io"
 	"testing"
 	"time"
@@ -19,6 +20,7 @@
 	"github.com/cockroachdb/cockroach/pkg/sql/sqlstats"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
+	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
 	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
 	"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
@@ -33,15 +35,39 @@
 func TestDrain(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
-	doTestDrain(t)
+	testutils.RunTrueAndFalse(t, "secondaryTenants", func(t *testing.T, secondaryTenants bool) {
+		doTestDrain(t, secondaryTenants)
+	})
 }
 
 // doTestDrain runs the drain test.
-func doTestDrain(tt *testing.T) {
+func doTestDrain(tt *testing.T, secondaryTenants bool) {
+	if secondaryTenants {
+		// Draining the tenant server takes a long time under stress. See #155229.
+		skip.UnderRace(tt)
+	}
 	var drainSleepCallCount = 0
 	t := newTestDrainContext(tt, &drainSleepCallCount)
 	defer t.Close()
 
+	var tenantConn *gosql.DB
+	if secondaryTenants {
+		_, err := t.tc.ServerConn(0).Exec("CREATE TENANT hello")
+		require.NoError(tt, err)
+		_, err = t.tc.ServerConn(0).Exec("ALTER TENANT hello START SERVICE SHARED")
+		require.NoError(tt, err)
+
+		// Wait for the tenant to be ready by establishing a test connection.
+		testutils.SucceedsSoon(tt, func() error {
+			tenantConn, err = t.tc.Server(0).SystemLayer().SQLConnE(
+				serverutils.DBName("cluster:hello/defaultdb"))
+			if err != nil {
+				return err
+			}
+			return tenantConn.Ping()
+		})
+	}
+
 	// Issue a probe. We're not draining yet, so the probe should
 	// reflect that.
 	resp := t.sendProbe()
@@ -53,7 +79,12 @@ func doTestDrain(tt *testing.T) {
 	resp = t.sendDrainNoShutdown()
 	t.assertDraining(resp, true)
 	t.assertRemaining(resp, true)
-	t.assertEqual(1, drainSleepCallCount)
+	if !secondaryTenants {
+		// If we have secondary tenants, we might not have waited before draining
+		// the clients at this point because we might have returned early if we are
+		// still draining the serverController.
+		t.assertEqual(1, drainSleepCallCount)
+	}
 
 	// Issue another probe. This checks that the server is still running
 	// (i.e. Shutdown: false was effective), the draining status is
@@ -63,12 +94,14 @@ func doTestDrain(tt *testing.T) {
 	t.assertDraining(resp, true)
 	// probe-only has no remaining.
 	t.assertRemaining(resp, false)
-	t.assertEqual(1, drainSleepCallCount)
-
+	if !secondaryTenants {
+		t.assertEqual(1, drainSleepCallCount)
+	}
 	// Repeat drain commands until we verify that there are zero remaining leases
 	// (i.e. complete). Also validate that the server did not sleep again.
 	testutils.SucceedsSoon(t, func() error {
 		resp = t.sendDrainNoShutdown()
+		t.Logf("drain response: %+v", resp)
 		if !resp.IsDraining {
 			return errors.Newf("expected draining")
 		}
@@ -78,6 +111,7 @@ func doTestDrain(tt *testing.T) {
 		}
 		return nil
 	})
+
 	t.assertEqual(1, drainSleepCallCount)
 
 	// Now issue a drain request without drain but with shutdown.
@@ -184,6 +218,7 @@ func newTestDrainContext(t *testing.T, drainSleepCallCount *int) *testDrainConte
 		// We need to start the cluster insecure in order to not
 		// care about TLS settings for the RPC client connection.
 		ServerArgs: base.TestServerArgs{
+			DefaultTestTenant: base.TestControlsTenantsExplicitly,
 			Knobs: base.TestingKnobs{
 				Server: &server.TestingKnobs{
 					DrainSleepFn: func(time.Duration) {
```

pkg/server/server_controller.go

Lines changed: 4 additions & 0 deletions
```diff
@@ -168,6 +168,10 @@ func (s *serverController) SetDraining() {
 	}
 }
 
+func (s *serverController) IsDraining() bool {
+	return s.draining.Load()
+}
+
 // start monitors changes to the service mode and updates
 // the running servers accordingly.
 func (c *serverController) start(ctx context.Context, ie isql.Executor) error {
```
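
The new accessor reads the controller's draining flag, which SetDraining sets. Below is a minimal sketch of that pattern, assuming the `draining` field is a `sync/atomic` Bool (the `.Load()` call above suggests as much); the real serverController carries far more state than this:

```go
// Minimal sketch of the draining-flag pattern, assuming an atomic.Bool field.
package main

import (
	"fmt"
	"sync/atomic"
)

type controller struct {
	draining atomic.Bool
}

// SetDraining marks the controller as draining; safe to call from any goroutine.
func (c *controller) SetDraining() { c.draining.Store(true) }

// IsDraining reports whether draining has started, mirroring the accessor
// added in this commit so the drain server can fold the controller's state
// into its overall isDraining() answer.
func (c *controller) IsDraining() bool { return c.draining.Load() }

func main() {
	c := &controller{}
	fmt.Println(c.IsDraining()) // false
	c.SetDraining()
	fmt.Println(c.IsDraining()) // true
}
```

The drain server's isDraining() can then OR this check in alongside the pgwire and KV node checks, as in the drain.go change above.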
