ref: Type out chains and steps #99
base: main
Changes from all commits
@@ -1,22 +1,21 @@
 import logging
 from dataclasses import dataclass, field
-from typing import Any, Mapping, MutableSequence
+from typing import Any, Mapping, MutableSequence, Optional, Tuple, Union

-from arroyo.backends.kafka.consumer import KafkaPayload
+from arroyo.backends.kafka.consumer import Headers, KafkaPayload
 from arroyo.processing.strategies import CommitOffsets
 from arroyo.processing.strategies.abstract import (
     ProcessingStrategy,
     ProcessingStrategyFactory,
 )
 from arroyo.processing.strategies.run_task import RunTask
-from arroyo.types import (
-    Commit,
-    Message,
-    Partition,
-)
+from arroyo.types import Commit, FilteredPayload, Message, Partition
+from sentry_kafka_schemas import get_codec
+from sentry_kafka_schemas.codecs import Codec

 from sentry_streams.adapters.arroyo.routes import Route, RoutedValue
 from sentry_streams.adapters.arroyo.steps import ArroyoStep
+from sentry_streams.pipeline.message import Message as StreamsMessage

 logger = logging.getLogger(__name__)
@@ -41,6 +40,8 @@ class ArroyoConsumer:
     """

     source: str
+    stream_name: str
+    header_filter: Optional[Tuple[str, bytes]] = None
     steps: MutableSequence[ArroyoStep] = field(default_factory=list)

     def add_step(self, step: ArroyoStep) -> None:
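For readers of the PR, a minimal sketch of how these new fields might be used when constructing the consumer. The import path, topic name, and header key/value below are assumptions for illustration only and are not part of the diff.

# Hypothetical usage of the fields added above. The module path is assumed
# (it is not shown in this capture) and the header/topic values are examples.
from sentry_streams.adapters.arroyo.consumer import ArroyoConsumer

consumer = ArroyoConsumer(
    source="myinput",
    stream_name="ingest-metrics",
    # Only messages whose Kafka headers contain ("event_type", b"metric") are
    # routed; all other messages become FilteredPayload in add_route below.
    header_filter=("event_type", b"metric"),
)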
@@ -59,9 +60,26 @@ def build_strategy(self, commit: Commit) -> ProcessingStrategy[Any]:
         follow.
         """

-        def add_route(message: Message[KafkaPayload]) -> RoutedValue:
-            value = message.payload.value
-            return RoutedValue(route=Route(source=self.source, waypoints=[]), payload=value)
+        def add_route(message: Message[KafkaPayload]) -> Union[FilteredPayload, RoutedValue]:
+            filtered = False
+            if self.header_filter:
+                headers: Headers = message.payload.headers
+                if self.header_filter not in headers:
+                    filtered = True
+
+            if filtered:
+                return FilteredPayload()
+            else:
+                value = message.payload.value
+                try:
+                    schema: Codec[Any] = get_codec(self.stream_name)
Review comment on the get_codec call: I think you should not need to run the …
+                except Exception:
+                    raise ValueError(f"Kafka topic {self.stream_name} has no associated schema")
+
+                return RoutedValue(
+                    route=Route(source=self.source, waypoints=[]),
+                    payload=StreamsMessage(schema=schema, payload=value),
Review comment on the StreamsMessage wrapping (see the codec sketch after this hunk): This identifies the schema of the messages based on the topic it comes from. I chose to do it this way, where we wrap it in this … If we can just bake parsing/deserialization into the Source, then all this is not needed.
+                )

         strategy: ProcessingStrategy[Any] = CommitOffsets(commit)
         for step in reversed(self.steps):
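To make the comment above concrete, here is a minimal sketch (not part of the PR) of the topic-to-schema lookup that add_route relies on, resolving the codec once rather than per message. It assumes the ingest-metrics topic has a registered schema and that sentry_kafka_schemas exposes the Codec.decode/ValidationError interface used below.

from typing import Any, Optional

from sentry_kafka_schemas import get_codec
from sentry_kafka_schemas.codecs import Codec, ValidationError

# Resolve the codec once for the topic. get_codec raises for unknown topics,
# which is the error add_route converts into a ValueError above.
codec: Codec[Any] = get_codec("ingest-metrics")

def decode_or_drop(raw: bytes) -> Optional[Any]:
    # Codec.decode validates the payload against the topic schema and raises
    # ValidationError for payloads that do not conform.
    try:
        return codec.decode(raw, validate=True)
    except ValidationError:
        return None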
@@ -1,32 +1,32 @@
-import json
-
-from sentry_streams.pipeline import Batch, FlatMap, Map, streaming_source
-from sentry_streams.pipeline.batch import unbatch
-from sentry_streams.pipeline.function_template import InputType
-
-
-def build_batch_str(batch: list[InputType]) -> str:
-    d = {"batch": batch}
-
-    return json.dumps(d)
+from typing import Callable, MutableSequence, Union, cast

+from sentry_kafka_schemas.schema_types.ingest_metrics_v1 import IngestMetric

-def build_message_str(message: str) -> str:
-    d = {"message": message}
-
-    return json.dumps(d)
+from sentry_streams.pipeline import Batch, FlatMap, streaming_source
+from sentry_streams.pipeline.batch import unbatch
+from sentry_streams.pipeline.chain import Parser, Serializer
+from sentry_streams.pipeline.message import Message

+pipeline = streaming_source(
+    name="myinput",
+    stream_name="ingest-metrics",
+)

-pipeline = (
-    streaming_source(
-        name="myinput",
-        stream_name="events",
-    )
-    .apply("mybatch", Batch(batch_size=5))  # User simply provides the batch size
-    .apply("myunbatch", FlatMap(function=unbatch))
-    .apply("mymap", Map(function=build_message_str))
-    .sink(
-        "kafkasink",
-        stream_name="transformed-events",
-    )  # flush the batches to the Sink
-)
+# TODO: Figure out why the concrete type of InputType is not showing up in the type hint of chain1
+chain1 = pipeline.apply("parser", Parser(msg_type=IngestMetric)).apply(
+    "mybatch", Batch(batch_size=5)
+)  # User simply provides the batch size
+
+chain2 = chain1.apply(
+    "myunbatch",
+    FlatMap(
+        function=cast(
+            Union[Callable[[Message[MutableSequence[IngestMetric]]], Message[IngestMetric]], str],
+            unbatch,
Review comment on the cast around unbatch (see the note after this diff): Because … Obviously this specific …
Review comment: I wonder whether it would be easier if we had …
+        )
+    ),
+)

+chain3 = chain2.apply("serializer", Serializer()).sink(
+    "kafkasink2", stream_name="transformed-events"
+)  # flush the batches to the Sink
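As a side note on the review discussion above, the cast only affects the type checker; at runtime unbatch is passed through unchanged. A standalone sketch (not sentry_streams code) of that behavior:

# Standalone illustration: typing.cast is a runtime no-op. It only tells the
# type checker to treat the value as the stated type, which is why the example
# above can hand the generic unbatch to FlatMap under a concrete signature.
from typing import Callable, Union, cast

def shout(text: str) -> str:
    return text.upper()

handler = cast(Union[Callable[[str], str], str], shout)
assert handler is shout  # the object is unchanged, only its static type differs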