
Commit fed7e01

fpacifici and bmckerry authored

Add an Arroyo Adapter (#68)

* feat(api): add broadcast functionality (#32)
* add unit test for broadcast in pipeline config
* add Broadcast step to API + unit test
* use implicit broadcasting
* use implicit broadcasting
* whoops
* revert user_functions split so tests pass
* add broadcast tests
* add examples/broadcast.py
* fix CI
* address comments
* use unittest ANY
* fix branch test
* fix CI pipeline path
* user_functions was removed
* use test functions for tests
* Add test
* Add the main consumer
* Arroyo Adapter
* Add conftest
* Fix tests
* Add flatmap method

---------
Co-authored-by: Ben McKerry <[email protected]>

1 parent f8a2f3d commit fed7e01

File tree

17 files changed: +936 additions, -12 deletions


.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions

@@ -39,6 +39,7 @@ repos:
         pytest==7.1.2,
         types-requests,
         responses,
+        "sentry-arroyo>=2.18.2",
       ]
       files: ^sentry_streams/.+
 - repo: https://github.com/pycqa/isort

.vscode/settings.json

Lines changed: 4 additions & 1 deletion

@@ -5,5 +5,8 @@
   ],
   "python.testing.unittestEnabled": false,
   "python.testing.pytestEnabled": true,
-  "mypy-type-checker.args": ["--strict"],
+  "mypy-type-checker.args": [
+    "--strict"
+  ],
+  "editor.formatOnSave": true
 }

sentry_flink/pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ requires-python = ">=3.11,<3.12"
 dependencies = [
     "apache-flink==1.20.0",
     "requests>=2.32.3",
-    "sentry-streams>=0.0.9",
+    "sentry-streams==0.0.9",
     # Figure out a way to get Flink work without setuptools.
     "setuptools==75.8.0",
 ]

sentry_flink/uv.lock

Lines changed: 8 additions & 9 deletions
Generated lockfile; diff not rendered by default.

sentry_streams/pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -11,6 +11,7 @@ readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
     "requests>=2.32.3",
+    "sentry-arroyo>=2.18.2",
 ]

 [dependency-groups]
New file, 3 additions & 0 deletions (the sentry_streams.adapters.arroyo package __init__, per the imports below):

from sentry_streams.adapters.arroyo.adapter import ArroyoAdapter

__all__ = ["ArroyoAdapter"]
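With this re-export in place, callers can import the adapter from the package root rather than from the adapter module; for example:

# Equivalent to importing from sentry_streams.adapters.arroyo.adapter,
# thanks to the re-export in the package __init__.
from sentry_streams.adapters.arroyo import ArroyoAdapter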
New file, 227 additions & 0 deletions (the sentry_streams.adapters.arroyo.adapter module):

from __future__ import annotations

from typing import Any, Mapping, MutableMapping, TypedDict

from arroyo.backends.kafka.configuration import (
    build_kafka_configuration,
    build_kafka_consumer_configuration,
)
from arroyo.backends.kafka.consumer import KafkaConsumer, KafkaPayload, KafkaProducer
from arroyo.processing.processor import StreamProcessor
from arroyo.types import Topic

from sentry_streams.adapters.arroyo.consumer import (
    ArroyoConsumer,
    ArroyoStreamingFactory,
)
from sentry_streams.adapters.arroyo.routes import Route
from sentry_streams.adapters.arroyo.steps import FilterStep, KafkaSinkStep, MapStep
from sentry_streams.adapters.stream_adapter import PipelineConfig, StreamAdapter
from sentry_streams.pipeline.pipeline import (
    Filter,
    FlatMapStep,
    KafkaSink,
    KafkaSource,
    Map,
    Reduce,
    Sink,
    Source,
)


class KafkaConsumerConfig(TypedDict):
    bootstrap_servers: str
    auto_offset_reset: str
    consumer_group: str
    additional_settings: Mapping[str, Any]


class KafkaProducerConfig(TypedDict):
    bootstrap_servers: str
    additional_settings: Mapping[str, Any]


class KafkaSources:
    def __init__(
        self,
        sources_config: Mapping[str, KafkaConsumerConfig],
        sources_override: Mapping[str, KafkaConsumer] = {},
    ) -> None:
        super().__init__()

        self.__sources_config = sources_config

        # Overrides are for unit testing purposes
        self.__source_topics: MutableMapping[str, Topic] = {}
        self.__sources: MutableMapping[str, KafkaConsumer] = {**sources_override}

    def add_source(self, step: Source) -> None:
        """
        Builds an Arroyo Kafka consumer as a stream source.
        By default it uses the configuration provided to the adapter.

        It is possible to override the configuration by providing an
        instantiated consumer for unit testing purposes.
        """
        # TODO: Provide a better way to get the logical stream name from
        # the Source step. We should not have to assert it is a Kafka source.
        assert isinstance(step, KafkaSource), "Only Kafka Sources are supported"
        source_name = step.name
        if source_name not in self.__sources:
            config = self.__sources_config.get(source_name)
            assert config, f"Config not provided for source {source_name}"
            self.__sources[source_name] = KafkaConsumer(
                build_kafka_consumer_configuration(
                    default_config=config["additional_settings"],
                    bootstrap_servers=config["bootstrap_servers"],
                    auto_offset_reset=config["auto_offset_reset"],
                    group_id=config["consumer_group"],
                )
            )

        self.__source_topics[source_name] = Topic(step.logical_topic)

    def get_topic(self, source: str) -> Topic:
        return self.__source_topics[source]

    def get_consumer(self, source: str) -> KafkaConsumer:
        return self.__sources[source]


class ArroyoAdapter(StreamAdapter[Route, Route]):

    def __init__(
        self,
        sources_config: Mapping[str, KafkaConsumerConfig],
        sinks_config: Mapping[str, KafkaProducerConfig],
        sources_override: Mapping[str, KafkaConsumer] = {},
        sinks_override: Mapping[str, KafkaProducer] = {},
    ) -> None:
        super().__init__()

        self.__sources = KafkaSources(sources_config, sources_override)
        self.__sinks_config = sinks_config

        # Overrides are for unit testing purposes
        self.__sinks: MutableMapping[str, Any] = {**sinks_override}

        self.__consumers: MutableMapping[str, ArroyoConsumer] = {}

    @classmethod
    def build(cls, config: PipelineConfig) -> ArroyoAdapter:
        return cls(
            config["sources_config"],
            config["sinks_config"],
            config.get("sources_override", {}),
            config.get("sinks_override", {}),
        )

    def source(self, step: Source) -> Route:
        """
        Builds an Arroyo Kafka consumer as a stream source.
        By default it uses the configuration provided to the adapter.

        It is possible to override the configuration by providing an
        instantiated consumer for unit testing purposes.
        """
        source_name = step.name
        self.__sources.add_source(step)
        self.__consumers[source_name] = ArroyoConsumer(source_name)

        return Route(source_name, [])

    def sink(self, step: Sink, stream: Route) -> Route:
        """
        Builds an Arroyo Kafka producer as a stream sink.
        By default it uses the configuration provided to the adapter.

        It is possible to override the configuration by providing an
        instantiated producer for unit testing purposes.
        """
        # TODO: Provide a better way to get the logical stream name from
        # the Sink step. We should not have to assert it is a Kafka sink.
        assert isinstance(step, KafkaSink), "Only Kafka Sinks are supported"

        sink_name = step.name
        if sink_name not in self.__sinks:
            config = self.__sinks_config.get(sink_name)
            assert config, f"Config not provided for sink {sink_name}"
            producer = KafkaProducer(
                build_kafka_configuration(
                    default_config=config["additional_settings"],
                    bootstrap_servers=config["bootstrap_servers"],
                )
            )
        else:
            producer = self.__sinks[sink_name]

        assert (
            stream.source in self.__consumers
        ), f"Stream starting at source {stream.source} not found when adding a producer"

        self.__consumers[stream.source].add_step(
            KafkaSinkStep(route=stream, producer=producer, topic_name=step.logical_topic)
        )

        return stream

    def map(self, step: Map, stream: Route) -> Route:
        """
        Builds a map operator for the platform the adapter supports.
        """
        assert (
            stream.source in self.__consumers
        ), f"Stream starting at source {stream.source} not found when adding a map"

        self.__consumers[stream.source].add_step(MapStep(route=stream, pipeline_step=step))
        return stream

    def flat_map(self, step: FlatMapStep, stream: Route) -> Route:
        """
        Builds a flat-map operator for the platform the adapter supports.
        """
        raise NotImplementedError

    def filter(self, step: Filter, stream: Route) -> Route:
        """
        Builds a filter operator for the platform the adapter supports.
        """
        assert (
            stream.source in self.__consumers
        ), f"Stream starting at source {stream.source} not found when adding a filter"

        self.__consumers[stream.source].add_step(FilterStep(route=stream, pipeline_step=step))
        return stream

    def reduce(
        self,
        step: Reduce,
        stream: Route,
    ) -> Route:
        """
        Builds a reduce operator for the platform the adapter supports.
        """
        raise NotImplementedError

    def get_processor(self, source: str) -> StreamProcessor[KafkaPayload]:
        """
        Returns the stream processor for the given source.
        """
        factory = ArroyoStreamingFactory(self.__consumers[source])

        return StreamProcessor(
            consumer=self.__sources.get_consumer(source),
            topic=self.__sources.get_topic(source),
            processor_factory=factory,
        )

    def run(self) -> None:
        """
        Starts the pipeline.
        """
        # TODO: Support multiple consumers
        assert len(self.__consumers) == 1, "Only one consumer is supported"
        source = next(iter(self.__consumers))

        processor = self.get_processor(source)
        processor.run()
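For context, a minimal sketch of driving the adapter by hand. The config shape follows the TypedDicts and build() above; the source and sink names ("ingest", "publish") are hypothetical and would have to match the pipeline's step names, and the pipeline runner that calls source(), map(), and sink() is outside this diff:

from sentry_streams.adapters.arroyo import ArroyoAdapter

# Hypothetical names; the nested keys mirror KafkaConsumerConfig and
# KafkaProducerConfig, and the top-level keys mirror ArroyoAdapter.build().
config = {
    "sources_config": {
        "ingest": {
            "bootstrap_servers": "localhost:9092",
            "auto_offset_reset": "earliest",
            "consumer_group": "example-group",
            "additional_settings": {},
        },
    },
    "sinks_config": {
        "publish": {
            "bootstrap_servers": "localhost:9092",
            "additional_settings": {},
        },
    },
}

adapter = ArroyoAdapter.build(config)
# A pipeline runner would register steps here, e.g.
#   route = adapter.source(...); adapter.map(...); adapter.sink(...)
adapter.run()  # asserts exactly one consumer, then runs its StreamProcessor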
New file, 79 additions & 0 deletions (the sentry_streams.adapters.arroyo.consumer module):

from dataclasses import dataclass, field
from typing import Any, Mapping, MutableSequence

from arroyo.backends.kafka.consumer import KafkaPayload
from arroyo.processing.strategies import CommitOffsets
from arroyo.processing.strategies.abstract import (
    ProcessingStrategy,
    ProcessingStrategyFactory,
)
from arroyo.processing.strategies.run_task import RunTask
from arroyo.types import (
    Commit,
    Message,
    Partition,
)

from sentry_streams.adapters.arroyo.routes import Route, RoutedValue
from sentry_streams.adapters.arroyo.steps import ArroyoStep


@dataclass
class ArroyoConsumer:
    """
    Intermediate representation of a single Arroyo application composed
    of multiple steps.

    Arroyo does not support branches, but the streaming platform does,
    so we need to fake them in Arroyo. This is done by turning the
    branched pipeline into a sequence and making all messages go through
    the steps of all the branches. The route is used to filter out the
    messages that do not belong to a given branch.

    An Arroyo application is built from the last step to the first,
    because every step references the one that follows it. The streaming
    platform lets you define the pipeline in sequence from the first
    step to the last, so this intermediate representation collects the
    pipeline and builds it in reverse order for Arroyo.
    """

    source: str
    steps: MutableSequence[ArroyoStep] = field(default_factory=list)

    def add_step(self, step: ArroyoStep) -> None:
        """
        Append a pipeline step to the Arroyo consumer.
        """
        assert step.route.source == self.source
        self.steps.append(step)

    def build_strategy(self, commit: Commit) -> ProcessingStrategy[Any]:
        """
        Build the Arroyo consumer, wiring up the steps in reverse order.

        It also adds a strategy at the beginning that wraps each payload
        in a RoutedValue carrying the route the message is supposed to
        follow.
        """

        def add_route(message: Message[KafkaPayload]) -> RoutedValue:
            value = message.payload.value
            return RoutedValue(route=Route(source=self.source, waypoints=[]), payload=value)

        strategy: ProcessingStrategy[Any] = CommitOffsets(commit)
        for step in reversed(self.steps):
            strategy = step.build(strategy)

        return RunTask(add_route, strategy)


class ArroyoStreamingFactory(ProcessingStrategyFactory[Any]):
    def __init__(self, consumer: ArroyoConsumer) -> None:
        self.consumer = consumer

    def create_with_partitions(
        self,
        commit: Commit,
        _: Mapping[Partition, int],
    ) -> ProcessingStrategy[Any]:
        return self.consumer.build_strategy(commit)
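The reverse loop in build_strategy exists because each Arroyo strategy is constructed around the strategy that follows it, so the terminal CommitOffsets has to exist first. A self-contained toy model (deliberately not the real ArroyoStep API) showing that wiring in reverse yields execution in pipeline order:

from typing import Callable, List

# Each "strategy" is a callable that forwards to its successor, mimicking
# how step.build(next_strategy) wraps the strategy that follows it.
Strategy = Callable[[str], None]

def make_step(name: str, next_strategy: Strategy) -> Strategy:
    def strategy(msg: str) -> None:
        next_strategy(f"{name}({msg})")
    return strategy

terminal: Strategy = print                    # stands in for CommitOffsets
steps: List[str] = ["map", "filter", "sink"]  # pipeline order

strategy = terminal
for step in reversed(steps):                  # mirrors build_strategy's loop
    strategy = make_step(step, strategy)

strategy("payload")  # prints sink(filter(map(payload))): messages traverse
                     # the steps in pipeline order despite reverse construction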
