
Commit 7cd4890

Merge branch 'dev'

2 parents: b5fe0fe + 0979dca

28 files changed: +20245 -582 lines

README.md (+11 -1)

@@ -186,7 +186,6 @@ Bodhi-Cast is under the Apache License 2.0. See the [LICENSE](LICENSE) file for
 - [ ] Get shore wind forecast
 - [x] Change data fetching utils to classes
 - [x] Add tests for all endpoints
-- [ ] Add API auth
 - [x] Add user spot creation endpoint

 ### Airflow:
@@ -208,3 +207,14 @@ Bodhi-Cast is under the Apache License 2.0. See the [LICENSE](LICENSE) file for
 - [x] Configure DAGs for production
 - [x] Set up env for production
 - [x] Add auto tests for data verification as DAGs
+
+### Week Ending 240218
+
+- [ ] Remove any excessive logging in DAGs
+- [ ] Incorporate realtime data from available weather stations
+- [ ] add stations to new table in postgres
+- [ ] create producer to push to kafka topic
+- [ ] create consumer to write latest to redis
+- [ ] add nearby realtime data to spot pages
+- [ ] add endpoint to find stations within the radius of a given spot
+- [ ] display nearby station data on frontend
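
Not part of this commit: a minimal sketch of the "create consumer to write latest to redis" roadmap item above. It assumes a `noaa_station_latest_data` topic carrying JSON readings (a topic name that does appear later in this commit), a Redis instance reachable as `redis:6379`, the same SASL settings used by the DAGs below, and an illustrative payload field `station_id`; all of those are assumptions, not part of the repository.

```python
# Hypothetical sketch (not in this commit): keep only the latest reading per
# station in Redis while consuming from Kafka.
import json
import os

import redis
from confluent_kafka import Consumer

conf = {
    "bootstrap.servers": "kafka:9092",      # assumed broker, matching the DAG configs below
    "group.id": "station-latest-to-redis",  # hypothetical consumer group
    "enable.auto.commit": True,
    "auto.offset.reset": "earliest",
    "security.protocol": "SASL_PLAINTEXT",
    "sasl.mechanisms": "PLAIN",
    "sasl.username": os.environ.get("KAFKA_DEFAULT_USERS"),
    "sasl.password": os.environ.get("KAFKA_DEFAULT_PASSWORDS"),
}

r = redis.Redis(host="redis", port=6379, decode_responses=True)  # assumed Redis service

c = Consumer(conf)
c.subscribe(["noaa_station_latest_data"])
try:
    while True:
        msg = c.poll(10.0)
        if msg is None:
            continue
        if msg.error():
            raise RuntimeError(msg.error())
        reading = json.loads(msg.value())
        station_id = reading["station_id"]  # assumed payload field
        # Overwrite so Redis always holds the most recent reading for each station.
        r.set(f"station:{station_id}:latest", json.dumps(reading))
finally:
    c.close()
```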

airflow/dags/gefs_wave_etl_from_kafka.py (+62 -25)

@@ -8,6 +8,7 @@
 import requests
 import xarray as xr
 from airflow.decorators import task
+from airflow.exceptions import AirflowFailException
 from airflow.operators.empty import EmptyOperator
 from airflow.sensors.external_task import ExternalTaskSensor
 from confluent_kafka import Consumer, KafkaException
@@ -37,8 +38,8 @@
     "email": ["[email protected]"],
     "email_on_failure": False,
     "email_on_retry": False,
-    "retries": 1,
-    "retry_delay": pendulum.duration(minutes=5),
+    "retries": 5,
+    "retry_delay": pendulum.duration(minutes=1),
 }


@@ -192,7 +193,8 @@ def df_to_db(df, engine, table_name):
             dtype={"location": Geography(geometry_type="POINT", srid=4326)},
         )
         entry_id = df["valid_time"].unique()
-        entry_id = entry_id[0].strftime("%Y-%m-%d %H:%M:%S")
+        entry_id_datetime = pd.to_datetime(entry_id[0])
+        entry_id_str = entry_id_datetime.strftime("%Y-%m-%d %H:%M:%S")
         print(f"Successfully wrote grib2 file for {entry_id}")
     except SQLAlchemyError as e:
         print(f"An error occurred: {e}")
@@ -207,30 +209,65 @@ def df_to_db(df, engine, table_name):
     is_paused_upon_creation=False,
 ) as dag:

-    # ExternalTaskSensor to wait for gefs_wave_urls_to_kafka DAG to complete
-    wait_for_gefs_wave_urls_to_kafka = ExternalTaskSensor(
-        task_id="wait_for_gefs_wave_urls_to_kafka",
-        external_dag_id="gefs_wave_urls_to_kafka",
-        external_task_id=None,  # `None` will wait for the entire task to complete
-        timeout=7200,  # Timeout before failing task
-        poke_interval=60,  # Seconds to wait between checks
-        mode="reschedule",  # Reschedule for long waits to free up worker slots
-    )
-
-    @task
-    def consume_and_process_from_kafka():
-        consume_from_kafka(
-            topic=topic,
-            engine=engine,
-            table_name=table_name,
-            bs=8,
-            sasl_username=sasl_username,
-            sasl_password=sasl_password,
-        )
+    def taskflow():
+        conf = {
+            "bootstrap.servers": "kafka:9092",
+            "group.id": "airflow-consumers",
+            "enable.auto.commit": False,
+            "auto.offset.reset": "earliest",  # consume from the start of topic
+            "security.protocol": "SASL_PLAINTEXT",
+            "sasl.mechanisms": "PLAIN",
+            "sasl.username": sasl_username,
+            "sasl.password": sasl_password,
+            "max.poll.interval.ms": 900000,
+        }
+
+        # # ExternalTaskSensor to wait for gefs_wave_urls_to_kafka DAG to complete
+        # wait_for_gefs_wave_urls_to_kafka = ExternalTaskSensor(
+        #     task_id="wait_for_gefs_wave_urls_to_kafka",
+        #     external_dag_id="gefs_wave_urls_to_kafka",
+        #     external_task_id=None,  # `None` will wait for the entire task to complete
+        #     timeout=7200,  # Timeout before failing task
+        #     poke_interval=60,  # Seconds to wait between checks
+        #     mode="reschedule",  # Reschedule for long waits to free up worker slots
+        # )
+
+        @task
+        def check_for_messages():
+            c = Consumer(conf)
+            c.subscribe([topic])
+            logging.info(f"{conf}")
+            # Poll for messages
+            msg = c.poll(30.0)
+            c.close()
+
+            if msg is None:
+                logging.info("No new messages found. Task will be retried.")
+                raise AirflowFailException(
+                    "No new messages found. Task will be explicitly failed to trigger retry."
+                )
+            else:
+                logging.info("New messages found. Proceeding to consume and process.")
+                return True
+
+        @task
+        def consume_and_process_from_kafka():
+            consume_from_kafka(
+                topic=topic,
+                engine=engine,
+                table_name=table_name,
+                bs=8,
+                sasl_username=sasl_username,
+                sasl_password=sasl_password,
+            )
+
+        check_result = check_for_messages()
+        consume_task = consume_and_process_from_kafka()
+        check_result >> consume_task

-    data = consume_and_process_from_kafka()
+    # wait_for_gefs_wave_urls_to_kafka >> data

-    wait_for_gefs_wave_urls_to_kafka >> data
+    dag = taskflow()

 if __name__ == "__main__":
     dag.test()
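
One of the roadmap items added to the README above, "add endpoint to find stations within the radius of a given spot", is not implemented in this commit. A hypothetical sketch of such a query, reusing the PostGIS geography(Point, 4326) convention visible in df_to_db, might look like the following; the `stations` table, `location`/`station_id` columns, and connection string are all assumptions.

```python
# Hypothetical sketch (not in this commit): find stations within radius_m meters
# of a spot, assuming a PostGIS `stations` table with a geography(Point, 4326)
# column named `location`.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql+psycopg2://user:password@postgres:5432/bodhi")  # placeholder DSN


def stations_within_radius(lat: float, lon: float, radius_m: float):
    query = text(
        """
        SELECT station_id
        FROM stations
        WHERE ST_DWithin(
            location,
            ST_SetSRID(ST_MakePoint(:lon, :lat), 4326)::geography,
            :radius_m
        )
        """
    )
    with engine.connect() as conn:
        rows = conn.execute(query, {"lat": lat, "lon": lon, "radius_m": radius_m})
        # geography + ST_DWithin means radius_m is interpreted in meters
        return [row.station_id for row in rows]
```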

airflow/dags/gefs_wave_urls_to_kafka.py (+1 -2)

@@ -8,7 +8,6 @@
 from airflow.decorators import task
 from bs4 import BeautifulSoup
 from confluent_kafka import Consumer, KafkaException, Producer
-from confluent_kafka.admin import AdminClient, ConfigResource

 from airflow import DAG

@@ -26,7 +25,7 @@
     "email": ["[email protected]"],
     "email_on_failure": False,
     "email_on_retry": False,
-    "retries": 1,
+    "retries": 5,
     "retry_delay": pendulum.duration(minutes=5),
 }

airflow/dags/monitor_gefs_urls_kafka.py (-70)

This file was deleted.

airflow/dags/monitor_kafka_topics.py (+77)

@@ -0,0 +1,77 @@
+import logging
+import os
+
+import pendulum
+from airflow.decorators import task
+from confluent_kafka import Consumer, KafkaException
+
+from airflow import DAG
+
+sasl_username = os.environ.get("KAFKA_DEFAULT_USERS")
+sasl_password = os.environ.get("KAFKA_DEFAULT_PASSWORDS")
+
+
+start_date = pendulum.datetime(2024, 1, 1)
+
+default_args = {
+    "owner": "airflow",
+    "depends_on_past": False,
+    "start_date": start_date,
+    "email": ["[email protected]"],
+    "email_on_failure": False,
+    "email_on_retry": False,
+    "retries": 1,
+    "retry_delay": pendulum.duration(minutes=5),
+}
+
+
+with DAG(
+    "monitor_kafka_topics",
+    default_args=default_args,
+    description="Monitor the content of Kafka topics",
+    schedule=None,
+    catchup=False,
+) as dag:
+
+    def taskflow():
+
+        @task
+        def monitor_kafka_topic(topic, sasl_username=sasl_username, sasl_password=sasl_password):
+            conf = {
+                "bootstrap.servers": "kafka:9092",
+                "group.id": "airflow-consumers",
+                "enable.auto.commit": False,
+                "auto.offset.reset": "earliest",  # consume from the start of topic
+                "security.protocol": "SASL_PLAINTEXT",
+                "sasl.mechanisms": "PLAIN",
+                "sasl.username": sasl_username,
+                "sasl.password": sasl_password,
+            }
+
+            c = Consumer(conf)
+
+            c.subscribe([topic])
+
+            while True:
+                msg = c.poll(9.0)
+                if msg is None:
+                    logging.info(f"No more messages in topic {topic}")
+                    break
+                if msg.error():
+                    logging.error(f"Error consuming from topic {topic}: {msg.error()}")
+                    raise KafkaException(msg.error())
+                logging.info("Received message: {}".format(msg.value().decode("utf-8")))
+            # Skipping committing the offsets so that this can be run as many times as needed
+            c.close()
+
+        monitor_wave_urls = monitor_kafka_topic(
+            topic="gefs_wave_urls", sasl_username=sasl_username, sasl_password=sasl_password
+        )
+
+        monitor_station_data = monitor_kafka_topic(
+            topic="noaa_station_latest_data",
+            sasl_username=sasl_username,
+            sasl_password=sasl_password,
+        )
+
+    dag = taskflow()
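
Not part of this commit: the roadmap's "create producer to push to kafka topic" item is still open, and the monitoring DAG above will only show something once data is published to `noaa_station_latest_data`. A hypothetical producer sketch, reusing the broker and SASL settings seen in these DAGs, could look like this; the payload fields are illustrative, not a schema defined by the repository.

```python
# Hypothetical sketch (not in this commit): publish one JSON station reading to
# the topic that monitor_kafka_topics subscribes to.
import json
import os

from confluent_kafka import Producer

conf = {
    "bootstrap.servers": "kafka:9092",  # assumed broker, matching the DAG configs above
    "security.protocol": "SASL_PLAINTEXT",
    "sasl.mechanisms": "PLAIN",
    "sasl.username": os.environ.get("KAFKA_DEFAULT_USERS"),
    "sasl.password": os.environ.get("KAFKA_DEFAULT_PASSWORDS"),
}

p = Producer(conf)

reading = {  # illustrative payload only
    "station_id": "46026",
    "wave_height_m": 1.8,
    "observed_at": "2024-02-18T12:00:00Z",
}

# Key by station id so readings for one station stay in the same partition.
p.produce("noaa_station_latest_data", key=reading["station_id"], value=json.dumps(reading))
p.flush()
```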
