Skip to content

Commit 914341a

Browse files
committed
Fixed race condition in MsgElement::retrieve.
1 parent a01379b commit 914341a

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+871
-151
lines changed

admin/tools/docker/index/container/dev/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ FROM qserv/qserv:dev
77
USER 0
88

99
#RUN mv /usr/bin/sh /usr/bin/sh.old && ln -s /usr/bin/bash /usr/bin/sh
10-
RUN yum update --assumeyes && yum install --assumeyes bind-utils
10+
RUN yum update --assumeyes && yum install --assumeyes bind-utils gdb screen
1111

1212
USER 1000
1313

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#! /bin/bash -l
2+
# admin/tools/docker/loader/container/dev/clientNum/appClientNum.bash
3+
4+
_term() {
5+
echo "Caught SIGTERM signal!"
6+
kill -TERM "$child" 2>/dev/null
7+
}
8+
9+
trap _term SIGTERM
10+
trap _term SIGKILL
11+
12+
13+
echo appClientScreen $1 $2 $3
14+
15+
screen -dm /home/qserv/dev/qserv/admin/tools/docker/index/container/dev/clientNum/appClientNum $1 $2 $3
16+
17+
child=$!
18+
wait "$child"
19+
tail -f /dev/null

admin/tools/docker/index/container/dev/worker/appWorker.bash

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ echo "child ${child}"
2222
wait "$child"
2323

2424
sleep 10000
25+
tail -f /dev/null
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#! /bin/bash
2+
# admin/tools/docker/loader/container/dev/worker/appWorker.bash
3+
4+
_term() {
5+
echo "Caught SIGTERM signal!"
6+
kill -TERM "$child" 2>/dev/null
7+
}
8+
9+
trap _term SIGTERM
10+
trap _term SIGKILL
11+
12+
screen -dm /home/qserv/dev/qserv/admin/tools/docker/index/container/dev/worker/appWorker.bash
13+
14+
child=$!
15+
echo "child ${child}"
16+
wait "$child"
17+
18+
sleep 10000
19+
tail -f /dev/null

admin/tools/docker/index/index-k8-100m.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ spec:
3333
containers:
3434
- name: imaster-ctr
3535
image: qserv/indexmaster:dev
36+
imagePullPolicy: Always
3637
ports:
3738
- containerPort: 10042
3839
protocol: UDP
@@ -72,6 +73,7 @@ spec:
7273
containers:
7374
- name: iworker-ctr
7475
image: qserv/indexworker:dev
76+
imagePullPolicy: Always
7577
ports:
7678
- containerPort: 10043
7779
protocol: UDP
@@ -113,7 +115,8 @@ spec:
113115
containers:
114116
- name: iclientnum-ctr
115117
image: qserv/indexclientnum:dev
116-
args: ["1", "100000000", "client-k8s-a1.cnf"]
118+
imagePullPolicy: Always
119+
args: ["100000000", "1", "client-k8s-a1.cnf"]
117120
ports:
118121
- containerPort: 10050
119122
protocol: UDP
@@ -153,6 +156,7 @@ spec:
153156
containers:
154157
- name: iclientnum2-ctr
155158
image: qserv/indexclientnum:dev
159+
imagePullPolicy: Always
156160
args: ["200000001", "300000001", "client-k8s-a2.cnf"]
157161
ports:
158162
- containerPort: 10050
@@ -193,6 +197,7 @@ spec:
193197
containers:
194198
- name: iclientnum3-ctr
195199
image: qserv/indexclientnum:dev
200+
imagePullPolicy: Always
196201
args: ["100000001", "200000000", "client-k8s-a3.cnf"]
197202
ports:
198203
- containerPort: 10050
Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
apiVersion: v1
2+
kind: Service
3+
metadata:
4+
name: imaster-svc
5+
labels:
6+
app: index
7+
spec:
8+
ports:
9+
- port: 10042
10+
protocol: UDP
11+
clusterIP: None
12+
selector:
13+
app: imaster-pod
14+
---
15+
apiVersion: apps/v1
16+
kind: StatefulSet
17+
metadata:
18+
name: imaster-sts
19+
labels:
20+
app: index
21+
spec:
22+
serviceName: imaster-svc
23+
podManagementPolicy: Parallel
24+
replicas: 1
25+
selector:
26+
matchLabels:
27+
app: imaster-pod
28+
template:
29+
metadata:
30+
labels:
31+
app: imaster-pod
32+
spec:
33+
containers:
34+
- name: imaster-ctr
35+
image: qserv/indexmaster:dev
36+
imagePullPolicy: Always
37+
ports:
38+
- containerPort: 10042
39+
protocol: UDP
40+
---
41+
apiVersion: v1
42+
kind: Service
43+
metadata:
44+
name: iworker-svc
45+
labels:
46+
app: index
47+
spec:
48+
ports:
49+
- port: 10043
50+
protocol: UDP
51+
clusterIP: None
52+
selector:
53+
app: iworker-pod
54+
---
55+
apiVersion: apps/v1
56+
kind: StatefulSet
57+
metadata:
58+
name: iworker-sts
59+
labels:
60+
app: index
61+
spec:
62+
serviceName: iworker-svc
63+
podManagementPolicy: Parallel
64+
replicas: 3
65+
selector:
66+
matchLabels:
67+
app: iworker-pod
68+
template:
69+
metadata:
70+
labels:
71+
app: iworker-pod
72+
spec:
73+
containers:
74+
- name: iworker-ctr
75+
image: qserv/indexworker:dev
76+
imagePullPolicy: Always
77+
ports:
78+
- containerPort: 10043
79+
protocol: UDP
80+
- containerPort: 10143
81+
protocol: TCP
82+
---
83+
apiVersion: v1
84+
kind: Service
85+
metadata:
86+
name: iclientnum-svc
87+
labels:
88+
app: index
89+
spec:
90+
ports:
91+
- port: 10050
92+
protocol: UDP
93+
clusterIP: None
94+
selector:
95+
app: iclientnum-pod
96+
---
97+
apiVersion: apps/v1
98+
kind: StatefulSet
99+
metadata:
100+
name: iclientnum-sts
101+
labels:
102+
app: index
103+
spec:
104+
serviceName: iclientnum-svc
105+
podManagementPolicy: Parallel
106+
replicas: 1
107+
selector:
108+
matchLabels:
109+
app: iclientnum-pod
110+
template:
111+
metadata:
112+
labels:
113+
app: iclientnum-pod
114+
spec:
115+
containers:
116+
- name: iclientnum-ctr
117+
image: qserv/indexclientnum:dev
118+
imagePullPolicy: Always
119+
args: ["1000000", "1", "client-k8s-a1.cnf"]
120+
ports:
121+
- containerPort: 10050
122+
protocol: UDP
123+
---
124+
apiVersion: v1
125+
kind: Service
126+
metadata:
127+
name: iclientnum2-svc
128+
labels:
129+
app: index
130+
spec:
131+
ports:
132+
- port: 10050
133+
protocol: UDP
134+
clusterIP: None
135+
selector:
136+
app: iclientnum2-pod
137+
---
138+
apiVersion: apps/v1
139+
kind: StatefulSet
140+
metadata:
141+
name: iclientnum2-sts
142+
labels:
143+
app: index
144+
spec:
145+
serviceName: iclientnum2-svc
146+
podManagementPolicy: Parallel
147+
replicas: 1
148+
selector:
149+
matchLabels:
150+
app: iclientnum2-pod
151+
template:
152+
metadata:
153+
labels:
154+
app: iclientnum2-pod
155+
spec:
156+
containers:
157+
- name: iclientnum2-ctr
158+
image: qserv/indexclientnum:dev
159+
imagePullPolicy: Always
160+
args: ["2000001", "3000001", "client-k8s-a2.cnf"]
161+
ports:
162+
- containerPort: 10050
163+
protocol: UDP
164+
---
165+
apiVersion: v1
166+
kind: Service
167+
metadata:
168+
name: iclientnum3-svc
169+
labels:
170+
app: index
171+
spec:
172+
ports:
173+
- port: 10050
174+
protocol: UDP
175+
clusterIP: None
176+
selector:
177+
app: iclientnum3-pod
178+
---
179+
apiVersion: apps/v1
180+
kind: StatefulSet
181+
metadata:
182+
name: iclientnum3-sts
183+
labels:
184+
app: index
185+
spec:
186+
serviceName: iclientnum3-svc
187+
podManagementPolicy: Parallel
188+
replicas: 1
189+
selector:
190+
matchLabels:
191+
app: iclientnum3-pod
192+
template:
193+
metadata:
194+
labels:
195+
app: iclientnum3-pod
196+
spec:
197+
containers:
198+
- name: iclientnum3-ctr
199+
image: qserv/indexclientnum:dev
200+
imagePullPolicy: Always
201+
args: ["1000001", "2000000", "client-k8s-a3.cnf"]
202+
ports:
203+
- containerPort: 10050
204+
protocol: UDP
205+
206+

core/modules/loader/BufferUdp.cc

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ MsgElement::Ptr BufferUdp::readFromSocket(boost::asio::ip::tcp::socket& socket,
4848

4949
// If there's something in the buffer already, get it and return.
5050
// This can happen when the previous read of the socket read multiple elements.
51-
MsgElement::Ptr msgElem = _safeRetrieve();
51+
MsgElement::Ptr msgElem = _safeRetrieve("1readFromSocket&&&" + note);
5252
if (msgElem != nullptr) {
5353
return msgElem;
5454
}
@@ -69,7 +69,7 @@ MsgElement::Ptr BufferUdp::readFromSocket(boost::asio::ip::tcp::socket& socket,
6969

7070
/// Try to retrieve an element (there's no guarantee that an entire element got read in a single read).
7171
// Store original cursor positions so they can be restored if the read fails.
72-
msgElem = _safeRetrieve();
72+
msgElem = _safeRetrieve("2readFromSocket&&&" + note);
7373
if (msgElem != nullptr) {
7474
return msgElem;
7575
}
@@ -117,10 +117,11 @@ void BufferUdp::advanceReadCursor(size_t len) {
117117
}
118118

119119

120-
std::shared_ptr<MsgElement> BufferUdp::_safeRetrieve() {
120+
std::shared_ptr<MsgElement> BufferUdp::_safeRetrieve(std::string const& note) { // &&& delete note, maybe
121121
auto wCursorOriginal = _wCursor;
122122
auto rCursorOriginal = _rCursor;
123-
MsgElement::Ptr msgElem = MsgElement::retrieve(*this);
123+
// throwOnMissing=false since missing data is possible with TCP.
124+
MsgElement::Ptr msgElem = MsgElement::retrieve(*this, note + " _safeRetrieve &&&", false);
124125
if (msgElem != nullptr) {
125126
return msgElem;
126127
} else {
@@ -133,7 +134,20 @@ std::shared_ptr<MsgElement> BufferUdp::_safeRetrieve() {
133134

134135
bool BufferUdp::isRetrieveSafe(size_t len) const {
135136
auto newLen = (_rCursor + len);
136-
return (newLen <= _end && newLen <= _wCursor);
137+
// &&&return (newLen <= _end && newLen <= _wCursor);
138+
bool res = (newLen <= _end && newLen <= _wCursor); // &&&
139+
if (!res) { // &&&
140+
LOGS(_log, LOG_LVL_WARN, "&&& BufferUdp::isRetrieveSafe not safe len=" << len <<
141+
" rCursor=" << (void*)_rCursor <<
142+
" newLen=" << (void*)newLen <<
143+
" wCursor=" << (void*)_wCursor <<
144+
" _end=" << (void*)_end <<
145+
" (newLen<=end)=" << (newLen <= _end) <<
146+
" (newLen<=_wCursor)=" << (newLen <= _wCursor) <<
147+
" res=" << res);
148+
LOGS(_log, LOG_LVL_WARN, "&&& BufferUdp::isRetrieveSafe " << dumpStr(false));
149+
}
150+
return res;
137151
}
138152

139153

@@ -143,6 +157,7 @@ bool BufferUdp::retrieve(void* out, size_t len) {
143157
_rCursor += len;
144158
return true;
145159
}
160+
LOGS(_log, LOG_LVL_WARN, "&&& BufferUdp::retrieve not safe len=" << len);
146161
return false;
147162
}
148163

core/modules/loader/BufferUdp.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ class MsgElement;
4545

4646

4747
/// A buffer for reading and writing. Nothing can be read from the buffer until
48-
/// something has written to it.
48+
/// something has been written to it.
49+
/// TODO: rename this class; the name BufferUdp is not really accurate anymore. &&&
4950
class BufferUdp {
5051
public:
5152
using Ptr = std::shared_ptr<BufferUdp>;
@@ -154,7 +155,7 @@ class BufferUdp {
154155
/// MsgElement is available. If so, return the element and advance _rCursor.
155156
/// Otherwise return nullptr.
156157
/// If a message is not recovered, the buffer is left effectively unchanged.
157-
std::shared_ptr<MsgElement> _safeRetrieve();
158+
std::shared_ptr<MsgElement> _safeRetrieve(std::string const& note);
158159

159160

160161
char* _buffer;

core/modules/loader/Central.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ void Central::_checkDoList() {
7777
while(_loop) {
7878
// Run and then sleep for a second. TODO A more advanced timer should be used
7979
doList->checkList();
80+
LOGS(_log, LOG_LVL_INFO, "&&& SLEEP");
8081
usleep(_loopSleepTime);
8182
}
8283
}

0 commit comments

Comments
 (0)