
Commit a20edb3 (parent: c1ca12b)

Commit message: working?

File tree

4 files changed: +133, -90 lines


python/integration_tests/test_consumer_rebalancing.py (+7, -2)
@@ -210,8 +210,13 @@ def test_tasks_written_once_during_rebalancing() -> None:
         for log_line_index, line in enumerate(lines):
             if "[31mERROR" in line:
                 # If there is an error in log file, capture 10 lines before and after the error line
-                consumer_error_logs.append(f"Error found in consumer_{i}. Logging 10 lines before and after the error line:")
-                for j in range(max(0, log_line_index - 10), min(len(lines) - 1, log_line_index + 10)):
+                consumer_error_logs.append(
+                    f"Error found in consumer_{i}. Logging 10 lines before and after the error line:"
+                )
+                for j in range(
+                    max(0, log_line_index - 10),
+                    min(len(lines) - 1, log_line_index + 10),
+                ):
                     consumer_error_logs.append(lines[j].strip())
                 consumer_error_logs.append("")

src/config.rs (+1)
@@ -136,6 +136,7 @@ impl Config {
                 "session.timeout.ms",
                 self.kafka_session_timeout_ms.to_string(),
             )
+            .set("partition.assignment.strategy", "cooperative-sticky")
             .set("enable.partition.eof", "false")
             .set("enable.auto.commit", "true")
             .set(
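The added `partition.assignment.strategy` setting switches the group to cooperative-sticky (incremental) rebalancing, so a rebalance only revokes the partitions that actually move to another member instead of revoking the whole assignment. A minimal sketch of a consumer built this way with rust-rdkafka's `ClientConfig`; the broker address, group id, and topic below are placeholders, not values from this repository's Config:

use rdkafka::config::ClientConfig;
use rdkafka::consumer::{Consumer, StreamConsumer};

fn build_consumer() -> StreamConsumer {
    let consumer: StreamConsumer = ClientConfig::new()
        // Placeholder connection settings, not taken from this repo's Config.
        .set("bootstrap.servers", "127.0.0.1:9092")
        .set("group.id", "example-group")
        // Incremental (cooperative) rebalancing: only partitions that change owner
        // are revoked, so the rest of the assignment keeps consuming.
        .set("partition.assignment.strategy", "cooperative-sticky")
        .set("enable.auto.commit", "true")
        .create()
        .expect("consumer creation failed");
    consumer
        .subscribe(&["example-topic"])
        .expect("subscribe failed");
    consumer
}

Note that switching an existing group away from an eager strategy generally requires every member to support the cooperative protocol first.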

src/consumer/kafka.rs (+63, -18)
@@ -32,7 +32,7 @@ use tokio::{
         mpsc::{self, unbounded_channel, UnboundedReceiver, UnboundedSender},
         oneshot,
     },
-    task::{self, JoinError, JoinSet},
+    task::{self, JoinError, JoinHandle, JoinSet},
     time::{self, sleep, MissedTickBehavior},
 };
 use tokio_stream::wrappers::UnboundedReceiverStream;
@@ -61,14 +61,16 @@ pub async fn start_consumer(
         .expect("Can't subscribe to specified topics");
 
     handle_os_signals(event_sender.clone());
-    poll_consumer_client(consumer.clone(), client_shutdown_receiver);
+    let rdkafka_driver = poll_consumer_client(consumer.clone(), client_shutdown_receiver);
     handle_events(
         consumer,
         event_receiver,
         client_shutdown_sender,
         spawn_actors,
     )
-    .await
+    .await?;
+    rdkafka_driver.await?;
+    Ok(())
 }
 
 pub fn handle_os_signals(event_sender: UnboundedSender<(Event, SyncSender<()>)>) {
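`poll_consumer_client` now returns the `JoinHandle` of its blocking task, and `start_consumer` awaits it after `handle_events` completes, so the function only returns once the rdkafka driver task has actually finished; a panic in that task also surfaces as a `JoinError` instead of being dropped silently. A minimal sketch of the same pattern, assuming an `anyhow::Result` return type like the surrounding code:

use tokio::task::{self, JoinHandle};

async fn run() -> anyhow::Result<()> {
    // Keep the handle instead of detaching the blocking task.
    let driver: JoinHandle<()> = task::spawn_blocking(|| {
        // ... blocking poll loop would run here ...
    });

    // ... async event handling would run here ...

    // Awaiting the handle waits for the blocking task to finish; a panic inside it
    // becomes a JoinError, which `?` converts into anyhow::Error.
    driver.await?;
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    run().await
}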
@@ -85,23 +87,28 @@ pub fn handle_os_signals(event_sender: UnboundedSender<(Event, SyncSender<()>)>)
 pub fn poll_consumer_client(
     consumer: Arc<StreamConsumer<KafkaContext>>,
     shutdown: oneshot::Receiver<()>,
-) {
+) -> JoinHandle<()> {
     task::spawn_blocking(|| {
         Handle::current().block_on(async move {
             let _guard = elegant_departure::get_shutdown_guard().shutdown_on_drop();
             select! {
                 biased;
-                _ = shutdown => {
-                    debug!("Received shutdown signal, commiting state in sync mode...");
-                    let _ = consumer.commit_consumer_state(rdkafka::consumer::CommitMode::Sync);
-                }
+                _ = shutdown => {}
                 msg = consumer.recv() => {
                     error!("Got unexpected message from consumer client: {:?}", msg);
                 }
-            }
+
+            };
+
+            select! {
+                biased;
+                _ = consumer.recv() => {}
+                _ = sleep(Duration::from_secs(8)) => {}
+            };
+
             debug!("Shutdown complete");
         });
-    });
+    })
 }
 
 #[derive(Debug)]
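With this change the shutdown branch no longer commits synchronously inside the driver task; instead, after the shutdown signal the task waits for at most one more `consumer.recv()` or roughly 8 seconds before logging completion, which puts an upper bound on how long shutdown can block on the client. The general shape of that bounded wait, as a sketch (the 8-second budget simply mirrors the hardcoded value above):

use std::time::Duration;
use tokio::time::sleep;

// Wait for one last piece of work, but never longer than the given budget.
async fn bounded_drain<F: std::future::Future>(work: F, budget: Duration) {
    tokio::select! {
        biased;
        _ = work => { /* finished naturally */ }
        _ = sleep(budget) => { /* gave up after the budget elapsed */ }
    }
}

#[tokio::main]
async fn main() {
    // Stand-in for a final consumer.recv() that may never resolve.
    let pending_work = std::future::pending::<()>();
    bounded_drain(pending_work, Duration::from_secs(8)).await;
    println!("shutdown complete");
}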
@@ -118,8 +125,20 @@ impl KafkaContext {
 impl ClientContext for KafkaContext {}
 
 impl ConsumerContext for KafkaContext {
-    #[instrument(skip_all)]
-    fn pre_rebalance(&self, _: &BaseConsumer<Self>, rebalance: &Rebalance) {
+    #[instrument(skip(self, base_consumer))]
+    fn pre_rebalance(&self, base_consumer: &BaseConsumer<Self>, rebalance: &Rebalance) {
+        if let Rebalance::Assign(tpl) = rebalance {
+            if tpl.count() == 0 {
+                return;
+            }
+        }
+        base_consumer
+            .pause(
+                &base_consumer
+                    .assignment()
+                    .expect("Unable to fetch assigned TPL"),
+            )
+            .expect("Unable to pause consumer");
         let (rendezvous_sender, rendezvous_receiver) = sync_channel(0);
         match rebalance {
             Rebalance::Assign(tpl) => {
@@ -149,6 +168,31 @@ impl ConsumerContext for KafkaContext {
         }
     }
 
+    #[instrument(skip(self, base_consumer))]
+    fn post_rebalance(&self, base_consumer: &BaseConsumer<Self>, rebalance: &Rebalance) {
+        if let Rebalance::Assign(tpl) = rebalance {
+            if tpl.count() == 0 {
+                return;
+            }
+        }
+        let assignment = base_consumer
+            .assignment()
+            .expect("Failed to get assigned TPL");
+        if assignment.count() != 0 {
+            base_consumer
+                .seek_partitions(
+                    base_consumer
+                        .committed(rdkafka::util::Timeout::Never)
+                        .expect("Failed to get commited TPL"),
+                    rdkafka::util::Timeout::Never,
+                )
+                .expect("Failed to seek to commited offset");
+            base_consumer
+                .resume(&assignment)
+                .expect("Failed to resume consumer");
+        }
+    }
+
     #[instrument(skip(self))]
     fn commit_callback(&self, result: KafkaResult<()>, _offsets: &TopicPartitionList) {
         debug!("Got commit callback");
@@ -336,7 +380,7 @@ pub async fn handle_events(
 
     let mut state = ConsumerState::Ready;
 
-    while let ConsumerState::Ready { .. } | ConsumerState::Consuming { .. } = state {
+    while let ConsumerState::Ready | ConsumerState::Consuming { .. } = state {
         select! {
             res = match state {
                 ConsumerState::Consuming(ref mut handles, _) => Either::Left(handles.join_next()),
@@ -352,8 +396,8 @@
             };
             info!("Received event: {:?}", event);
             state = match (state, event) {
-                (ConsumerState::Ready, Event::Assign(assigned)) => {
-                    ConsumerState::Consuming(spawn_actors(consumer.clone(), &assigned), assigned)
+                (ConsumerState::Ready, Event::Assign(tpl)) => {
+                    ConsumerState::Consuming(spawn_actors(consumer.clone(), &tpl), tpl)
                 }
                 (ConsumerState::Ready, Event::Revoke(_)) => {
                     unreachable!("Got partition revocation before the consumer has started")
@@ -364,17 +408,17 @@
                         tpl.is_disjoint(&assigned),
                         "Newly assigned TPL should be disjoint from TPL we're consuming from"
                     );
-                    tpl.append(&mut assigned);
                     debug!(
                         "{} additional topic partitions added after assignment",
                         assigned.len()
                     );
+                    tpl.append(&mut assigned);
                     handles.shutdown(CALLBACK_DURATION).await;
                     ConsumerState::Consuming(spawn_actors(consumer.clone(), &tpl), tpl)
                 }
                 (ConsumerState::Consuming(handles, mut tpl), Event::Revoke(revoked)) => {
                     assert!(
-                        tpl.is_subset(&revoked),
+                        revoked.is_subset(&tpl),
                         "Revoked TPL should be a subset of TPL we're consuming from"
                     );
                     tpl.retain(|e| !revoked.contains(e));
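The flipped assertion matters under cooperative rebalancing: a revocation may now cover only part of the current assignment, so the invariant is that the revoked set is contained in what we are consuming (`revoked.is_subset(&tpl)`), not the reverse. A tiny illustration with plain `HashSet`s (the topic and partition values are made up):

use std::collections::HashSet;

fn main() {
    // Currently assigned (topic, partition) pairs and an incremental revocation.
    let tpl: HashSet<(&str, i32)> = [("tasks", 0), ("tasks", 1), ("tasks", 2)].into();
    let revoked: HashSet<(&str, i32)> = [("tasks", 2)].into();

    // Holds: a partial revocation is always contained in the current assignment.
    assert!(revoked.is_subset(&tpl));
    // The old direction fails here, because the assignment is larger than the
    // revoked set during a partial (cooperative) rebalance.
    assert!(!tpl.is_subset(&revoked));
    println!("subset direction checks out");
}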
@@ -750,7 +794,7 @@ impl CommitClient for StreamConsumer<KafkaContext> {
     }
 }
 
-#[derive(Default)]
+#[derive(Default, Debug)]
 struct HighwaterMark {
     data: HashMap<(String, i32), i64>,
 }
@@ -795,6 +839,7 @@ pub async fn commit(
     while let Some(msgs) = receiver.recv().await {
         let mut highwater_mark = HighwaterMark::new();
         msgs.0.iter().for_each(|msg| highwater_mark.track(msg));
+        debug!("Store: {:?}", highwater_mark);
         consumer.store_offsets(&highwater_mark.into()).unwrap();
     }
     debug!("Shutdown complete");

src/main.rs (+62, -70)
@@ -1,25 +1,17 @@
-use anyhow::{anyhow, Error};
+use anyhow::Error;
 use clap::Parser;
 use std::{sync::Arc, time::Duration};
-use taskbroker::grpc_middleware::MetricsLayer;
-use taskbroker::upkeep::upkeep;
-use tokio::select;
 use tokio::signal::unix::SignalKind;
 use tokio::task::JoinHandle;
-use tonic::transport::Server;
-use tonic_health::server::health_reporter;
 use tracing::{error, info};
 
-use sentry_protos::sentry::v1::consumer_service_server::ConsumerServiceServer;
-
 use taskbroker::config::Config;
 use taskbroker::consumer::{
     deserialize_activation::{self, DeserializeConfig},
     inflight_activation_writer::{ActivationWriterConfig, InflightActivationWriter},
     kafka::start_consumer,
     os_stream_writer::{OsStream, OsStreamWriter},
 };
-use taskbroker::grpc_server::MyConsumerService;
 use taskbroker::inflight_activation_store::InflightActivationStore;
 use taskbroker::logging;
 use taskbroker::metrics;
@@ -50,14 +42,14 @@ async fn main() -> Result<(), Error> {
     let store = Arc::new(InflightActivationStore::new(&config.db_path).await?);
 
     // Upkeep thread
-    let upkeep_task = tokio::spawn({
-        let upkeep_store = store.clone();
-        let upkeep_config = config.clone();
-        async move {
-            upkeep(upkeep_config, upkeep_store).await;
-            Ok(())
-        }
-    });
+    // let upkeep_task = tokio::spawn({
+    //     let upkeep_store = store.clone();
+    //     let upkeep_config = config.clone();
+    //     async move {
+    //         upkeep(upkeep_config, upkeep_store).await;
+    //         Ok(())
+    //     }
+    // });
 
     // Consumer from kafka
     let consumer_task = tokio::spawn({
@@ -89,66 +81,66 @@ async fn main() -> Result<(), Error> {
     });
 
     // GRPC server
-    let grpc_server_task = tokio::spawn({
-        let grpc_store = store.clone();
-        let grpc_config = config.clone();
-        async move {
-            let guard = elegant_departure::get_shutdown_guard().shutdown_on_drop();
-            let (mut health_reporter_fn, health_service) = health_reporter();
-            health_reporter_fn
-                .set_serving::<ConsumerServiceServer<MyConsumerService>>()
-                .await;
-            let addr = format!("{}:{}", grpc_config.grpc_addr, grpc_config.grpc_port)
-                .parse()
-                .expect("Failed to parse address");
-
-            let layers = tower::ServiceBuilder::new()
-                .layer(MetricsLayer::default())
-                .into_inner();
-
-            let server = Server::builder()
-                .layer(layers)
-                .add_service(ConsumerServiceServer::new(MyConsumerService {
-                    store: grpc_store,
-                }))
-                .add_service(health_service)
-                .serve(addr);
-
-            info!("GRPC server listening on {}", addr);
-            select! {
-                biased;
-
-                res = server => {
-                    info!("GRPC server task failed, shutting down");
-                    health_reporter_fn.set_not_serving::<ConsumerServiceServer<MyConsumerService>>().await;
-
-                    // Wait for any running requests to drain
-                    tokio::time::sleep(Duration::from_secs(5)).await;
-                    match res {
-                        Ok(()) => Ok(()),
-                        Err(e) => Err(anyhow!("GRPC server task failed: {:?}", e)),
-                    }
-                }
-                _ = guard.wait() => {
-                    info!("Cancellation token received, shutting down GRPC server");
-                    health_reporter_fn.set_not_serving::<ConsumerServiceServer<MyConsumerService>>().await;
-
-                    // Wait for any running requests to drain
-                    tokio::time::sleep(Duration::from_secs(5)).await;
-                    Ok(())
-                }
-            }
-        }
-    });
+    // let grpc_server_task = tokio::spawn({
+    //     let grpc_store = store.clone();
+    //     let grpc_config = config.clone();
+    //     async move {
+    //         let guard = elegant_departure::get_shutdown_guard().shutdown_on_drop();
+    //         let (mut health_reporter_fn, health_service) = health_reporter();
+    //         health_reporter_fn
+    //             .set_serving::<ConsumerServiceServer<MyConsumerService>>()
+    //             .await;
+    //         let addr = format!("{}:{}", grpc_config.grpc_addr, grpc_config.grpc_port)
+    //             .parse()
+    //             .expect("Failed to parse address");
+
+    //         let layers = tower::ServiceBuilder::new()
+    //             .layer(MetricsLayer::default())
+    //             .into_inner();
+
+    //         let server = Server::builder()
+    //             .layer(layers)
+    //             .add_service(ConsumerServiceServer::new(MyConsumerService {
+    //                 store: grpc_store,
+    //             }))
+    //             .add_service(health_service)
+    //             .serve(addr);
+
+    //         info!("GRPC server listening on {}", addr);
+    //         select! {
+    //             biased;
+
+    //             res = server => {
+    //                 info!("GRPC server task failed, shutting down");
+    //                 health_reporter_fn.set_not_serving::<ConsumerServiceServer<MyConsumerService>>().await;
+
+    //                 // Wait for any running requests to drain
+    //                 tokio::time::sleep(Duration::from_secs(5)).await;
+    //                 match res {
+    //                     Ok(()) => Ok(()),
+    //                     Err(e) => Err(anyhow!("GRPC server task failed: {:?}", e)),
+    //                 }
+    //             }
+    //             _ = guard.wait() => {
+    //                 info!("Cancellation token received, shutting down GRPC server");
+    //                 health_reporter_fn.set_not_serving::<ConsumerServiceServer<MyConsumerService>>().await;
+
+    //                 // Wait for any running requests to drain
+    //                 tokio::time::sleep(Duration::from_secs(5)).await;
+    //                 Ok(())
+    //             }
+    //         }
+    //     }
+    // });
 
     elegant_departure::tokio::depart()
         .on_termination()
         .on_sigint()
         .on_signal(SignalKind::hangup())
         .on_signal(SignalKind::quit())
         .on_completion(log_task_completion("consumer", consumer_task))
-        .on_completion(log_task_completion("grpc_server", grpc_server_task))
-        .on_completion(log_task_completion("upkeep_task", upkeep_task))
+        // .on_completion(log_task_completion("grpc_server", grpc_server_task))
+        // .on_completion(log_task_completion("upkeep_task", upkeep_task))
         .await;
 
     Ok(())
