Skip to content

Commit 64fe2d2

Browse files
committed
add unit test to validate some deadlock scenarios
Signed-off-by: Alexandre Gaudreault <[email protected]>
1 parent cac515a commit 64fe2d2

File tree

2 files changed

+123
-35
lines changed

2 files changed

+123
-35
lines changed

pkg/cache/cluster_test.go

+112-24
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,15 @@ package cache
33
import (
44
"context"
55
"fmt"
6-
"k8s.io/apiextensions-apiserver/pkg/apis/apiextensions"
76
"sort"
87
"strings"
8+
"sync"
99
"testing"
1010
"time"
1111

12+
"golang.org/x/sync/semaphore"
13+
"k8s.io/apiextensions-apiserver/pkg/apis/apiextensions"
14+
1215
"github.com/stretchr/testify/assert"
1316
"github.com/stretchr/testify/require"
1417
appsv1 "k8s.io/api/apps/v1"
@@ -71,6 +74,16 @@ var (
7174
)
7275

7376
func newCluster(t *testing.T, objs ...runtime.Object) *clusterCache {
77+
cache := newClusterWithOptions(t, []UpdateSettingsFunc{}, objs...)
78+
79+
t.Cleanup(func() {
80+
cache.Invalidate()
81+
})
82+
83+
return cache
84+
}
85+
86+
func newClusterWithOptions(t *testing.T, opts []UpdateSettingsFunc, objs ...runtime.Object) *clusterCache {
7487
client := fake.NewSimpleDynamicClient(scheme.Scheme, objs...)
7588
reactor := client.ReactionChain[0]
7689
client.PrependReactor("list", "*", func(action testcore.Action) (handled bool, ret runtime.Object, err error) {
@@ -101,11 +114,14 @@ func newCluster(t *testing.T, objs ...runtime.Object) *clusterCache {
101114
Meta: metav1.APIResource{Namespaced: true},
102115
}}
103116

117+
opts = append([]UpdateSettingsFunc{
118+
SetKubectl(&kubetest.MockKubectlCmd{APIResources: apiResources, DynamicClient: client}),
119+
}, opts...)
120+
104121
cache := NewClusterCache(
105-
&rest.Config{Host: "https://test"}, SetKubectl(&kubetest.MockKubectlCmd{APIResources: apiResources, DynamicClient: client}))
106-
t.Cleanup(func() {
107-
cache.Invalidate()
108-
})
122+
&rest.Config{Host: "https://test"},
123+
opts...,
124+
)
109125
return cache
110126
}
111127

@@ -492,23 +508,23 @@ metadata:
492508
func TestGetManagedLiveObjsFailedConversion(t *testing.T) {
493509
cronTabGroup := "stable.example.com"
494510

495-
testCases := []struct{
496-
name string
497-
localConvertFails bool
511+
testCases := []struct {
512+
name string
513+
localConvertFails bool
498514
expectConvertToVersionCalled bool
499-
expectGetResourceCalled bool
515+
expectGetResourceCalled bool
500516
}{
501517
{
502-
name: "local convert fails, so GetResource is called",
503-
localConvertFails: true,
518+
name: "local convert fails, so GetResource is called",
519+
localConvertFails: true,
504520
expectConvertToVersionCalled: true,
505-
expectGetResourceCalled: true,
521+
expectGetResourceCalled: true,
506522
},
507523
{
508-
name: "local convert succeeds, so GetResource is not called",
509-
localConvertFails: false,
524+
name: "local convert succeeds, so GetResource is not called",
525+
localConvertFails: false,
510526
expectConvertToVersionCalled: true,
511-
expectGetResourceCalled: false,
527+
expectGetResourceCalled: false,
512528
},
513529
}
514530

@@ -557,7 +573,6 @@ metadata:
557573
return testCronTab(), nil
558574
})
559575

560-
561576
managedObjs, err := cluster.GetManagedLiveObjs([]*unstructured.Unstructured{targetDeploy}, func(r *Resource) bool {
562577
return true
563578
})
@@ -816,25 +831,25 @@ func testPod() *corev1.Pod {
816831

817832
func testCRD() *apiextensions.CustomResourceDefinition {
818833
return &apiextensions.CustomResourceDefinition{
819-
TypeMeta: metav1.TypeMeta{
834+
TypeMeta: metav1.TypeMeta{
820835
APIVersion: "apiextensions.k8s.io/v1",
821836
},
822837
ObjectMeta: metav1.ObjectMeta{
823838
Name: "crontabs.stable.example.com",
824839
},
825-
Spec: apiextensions.CustomResourceDefinitionSpec{
840+
Spec: apiextensions.CustomResourceDefinitionSpec{
826841
Group: "stable.example.com",
827842
Versions: []apiextensions.CustomResourceDefinitionVersion{
828843
{
829-
Name: "v1",
830-
Served: true,
844+
Name: "v1",
845+
Served: true,
831846
Storage: true,
832847
Schema: &apiextensions.CustomResourceValidation{
833848
OpenAPIV3Schema: &apiextensions.JSONSchemaProps{
834849
Type: "object",
835850
Properties: map[string]apiextensions.JSONSchemaProps{
836851
"cronSpec": {Type: "string"},
837-
"image": {Type: "string"},
852+
"image": {Type: "string"},
838853
"replicas": {Type: "integer"},
839854
},
840855
},
@@ -855,14 +870,14 @@ func testCRD() *apiextensions.CustomResourceDefinition {
855870
func testCronTab() *unstructured.Unstructured {
856871
return &unstructured.Unstructured{Object: map[string]interface{}{
857872
"apiVersion": "stable.example.com/v1",
858-
"kind": "CronTab",
873+
"kind": "CronTab",
859874
"metadata": map[string]interface{}{
860-
"name": "test-crontab",
875+
"name": "test-crontab",
861876
"namespace": "default",
862877
},
863878
"spec": map[string]interface{}{
864879
"cronSpec": "* * * * */5",
865-
"image": "my-awesome-cron-image",
880+
"image": "my-awesome-cron-image",
866881
},
867882
}}
868883
}
@@ -1006,3 +1021,76 @@ func TestIterateHierachy(t *testing.T) {
10061021
keys)
10071022
})
10081023
}
1024+
1025+
// TestDeadlock_startMissingWatches validates that starting watches will not create a deadlock
1026+
// caused by using improper locking in various callback methods when there is a high load on the
1027+
// system.
1028+
func Test_watchEvents_Deadlock(t *testing.T) {
1029+
// deadlock lock is used to simulate a user function calling the cluster cache while holding a lock
1030+
// and using this lock in callbacks such as OnPopulateResourceInfoHandler.
1031+
deadlock := sync.RWMutex{}
1032+
1033+
hasDeadlock := false
1034+
res1 := testPod()
1035+
res2 := testRS()
1036+
1037+
cluster := newClusterWithOptions(t, []UpdateSettingsFunc{
1038+
// Set low blocking semaphore
1039+
SetListSemaphore(semaphore.NewWeighted(1)),
1040+
// Resync watches often to use the semaphore and trigger the rate limiting behavior
1041+
SetResyncTimeout(500 * time.Millisecond),
1042+
// Use new resource handler to run code in the list callbacks
1043+
SetPopulateResourceInfoHandler(func(un *unstructured.Unstructured, isRoot bool) (info interface{}, cacheManifest bool) {
1044+
if un.GroupVersionKind().GroupKind() == res1.GroupVersionKind().GroupKind() ||
1045+
un.GroupVersionKind().GroupKind() == res2.GroupVersionKind().GroupKind() {
1046+
// Create a bottleneck for resources holding the semaphore
1047+
time.Sleep(2 * time.Second)
1048+
}
1049+
1050+
//// Uncommenting the following code will simulate a deadlock caused by client code holding a lock and
1051+
//// trying to acquire the same lock in the event callback
1052+
// deadlock.RLock()
1053+
// defer deadlock.RUnlock()
1054+
1055+
return
1056+
}),
1057+
}, res1, res2, testDeploy())
1058+
defer func() {
1059+
// Invalidate() is a blocking method and cannot be called safely in case of deadlock
1060+
if !hasDeadlock {
1061+
cluster.Invalidate()
1062+
}
1063+
}()
1064+
1065+
err := cluster.EnsureSynced()
1066+
require.NoError(t, err)
1067+
1068+
for i := 0; i < 2; i++ {
1069+
done := make(chan bool, 1)
1070+
go func() {
1071+
// Stop the watches, so startMissingWatches will restart them
1072+
cluster.stopWatching(res1.GroupVersionKind().GroupKind(), res1.Namespace)
1073+
cluster.stopWatching(res2.GroupVersionKind().GroupKind(), res2.Namespace)
1074+
1075+
// calling startMissingWatches to simulate that a CRD event was received
1076+
// TODO: how to simulate real watch events and test the full watchEvents function?
1077+
err = runSynced(&cluster.lock, func() error {
1078+
deadlock.Lock()
1079+
defer deadlock.Unlock()
1080+
return cluster.startMissingWatches()
1081+
})
1082+
require.NoError(t, err)
1083+
done <- true
1084+
}()
1085+
select {
1086+
case v := <-done:
1087+
require.True(t, v)
1088+
case <-time.After(10 * time.Second):
1089+
hasDeadlock = true
1090+
t.Errorf("timeout reached on attempt %d. It is possible that a deadlock occured", i)
1091+
// Tip: to debug the deadlock, increase the timer to a value higher than X in "go test -timeout X"
1092+
// This will make the test panic with the goroutines information
1093+
t.FailNow()
1094+
}
1095+
}
1096+
}

pkg/cache/resource_test.go

+11-11
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ import (
77
"k8s.io/client-go/rest"
88
)
99

10-
var c = NewClusterCache(&rest.Config{})
10+
var cacheTest = NewClusterCache(&rest.Config{})
1111

1212
func TestIsParentOf(t *testing.T) {
13-
child := c.newResource(mustToUnstructured(testPod()))
14-
parent := c.newResource(mustToUnstructured(testRS()))
15-
grandParent := c.newResource(mustToUnstructured(testDeploy()))
13+
child := cacheTest.newResource(mustToUnstructured(testPod()))
14+
parent := cacheTest.newResource(mustToUnstructured(testRS()))
15+
grandParent := cacheTest.newResource(mustToUnstructured(testDeploy()))
1616

1717
assert.True(t, parent.isParentOf(child))
1818
assert.False(t, grandParent.isParentOf(child))
@@ -22,38 +22,38 @@ func TestIsParentOfSameKindDifferentGroupAndUID(t *testing.T) {
2222
rs := testRS()
2323
rs.APIVersion = "somecrd.io/v1"
2424
rs.SetUID("123")
25-
child := c.newResource(mustToUnstructured(testPod()))
26-
invalidParent := c.newResource(mustToUnstructured(rs))
25+
child := cacheTest.newResource(mustToUnstructured(testPod()))
26+
invalidParent := cacheTest.newResource(mustToUnstructured(rs))
2727

2828
assert.False(t, invalidParent.isParentOf(child))
2929
}
3030

3131
func TestIsServiceParentOfEndPointWithTheSameName(t *testing.T) {
32-
nonMatchingNameEndPoint := c.newResource(strToUnstructured(`
32+
nonMatchingNameEndPoint := cacheTest.newResource(strToUnstructured(`
3333
apiVersion: v1
3434
kind: Endpoints
3535
metadata:
3636
name: not-matching-name
3737
namespace: default
3838
`))
3939

40-
matchingNameEndPoint := c.newResource(strToUnstructured(`
40+
matchingNameEndPoint := cacheTest.newResource(strToUnstructured(`
4141
apiVersion: v1
4242
kind: Endpoints
4343
metadata:
4444
name: helm-guestbook
4545
namespace: default
4646
`))
4747

48-
parent := c.newResource(testService)
48+
parent := cacheTest.newResource(testService)
4949

5050
assert.True(t, parent.isParentOf(matchingNameEndPoint))
5151
assert.Equal(t, parent.Ref.UID, matchingNameEndPoint.OwnerRefs[0].UID)
5252
assert.False(t, parent.isParentOf(nonMatchingNameEndPoint))
5353
}
5454

5555
func TestIsServiceAccountParentOfSecret(t *testing.T) {
56-
serviceAccount := c.newResource(strToUnstructured(`
56+
serviceAccount := cacheTest.newResource(strToUnstructured(`
5757
apiVersion: v1
5858
kind: ServiceAccount
5959
metadata:
@@ -63,7 +63,7 @@ metadata:
6363
secrets:
6464
- name: default-token-123
6565
`))
66-
tokenSecret := c.newResource(strToUnstructured(`
66+
tokenSecret := cacheTest.newResource(strToUnstructured(`
6767
apiVersion: v1
6868
kind: Secret
6969
metadata:

0 commit comments

Comments
 (0)