-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproducer_consumer_parse_recipes.py
108 lines (85 loc) · 3.15 KB
/
producer_consumer_parse_recipes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 17 17:58:03 2020
@author: vicma
"""
import json
from time import sleep
from bs4 import BeautifulSoup
from kafka import KafkaConsumer, KafkaProducer
def publish_message(producer_instance, topic_name, key, value):
try:
key_bytes = bytes(key, encoding='utf-8')
value_bytes = bytes(value, encoding='utf-8')
producer_instance.send(topic_name, key=key_bytes, value=value_bytes)
producer_instance.flush()
print('Message published successfully.')
except Exception as ex:
print('Exception in publishing message')
print(str(ex))
def connect_kafka_producer():
_producer = None
try:
_producer = KafkaProducer(bootstrap_servers=['localhost:9092'], api_version=(0, 10))
except Exception as ex:
print('Exception while connecting Kafka')
print(str(ex))
finally:
return _producer
def parse(markup):
title = '-'
submit_by = '-'
description = '-'
calories = 0
ingredients = []
rec = {}
try:
soup = BeautifulSoup(markup, features="html5lib")
# title
title_section = soup.select('.recipe-summary__h1')
# submitter
submitter_section = soup.select('.submitter__name')
# description
description_section = soup.select('.submitter__description')
# ingredients
ingredients_section = soup.select('.recipe-ingred_txt')
# calories
calories_section = soup.select('.calorie-count')
if calories_section:
calories = calories_section[0].text.replace('cals', '').strip()
if ingredients_section:
for ingredient in ingredients_section:
ingredient_text = ingredient.text.strip()
if 'Add all ingredients to list' not in ingredient_text and ingredient_text != '':
ingredients.append({'step': ingredient.text.strip()})
if description_section:
description = description_section[0].text.strip().replace('"', '')
if submitter_section:
submit_by = submitter_section[0].text.strip()
if title_section:
title = title_section[0].text
rec = {'title': title, 'submitter': submit_by, 'description': description, 'calories': calories,
'ingredients': ingredients}
except Exception as ex:
print('Exception while parsing')
print(str(ex))
finally:
return json.dumps(rec)
if __name__ == '__main__':
print('Running Consumer..')
parsed_records = []
topic_name = 'raw_recipes'
parsed_topic_name = 'parsed_recipes'
consumer = KafkaConsumer(topic_name, auto_offset_reset='earliest',
bootstrap_servers=['localhost:9092'], api_version=(0, 10), consumer_timeout_ms=1000)
for msg in consumer:
html = msg.value
result = parse(html)
parsed_records.append(result)
consumer.close()
sleep(5)
if len(parsed_records) > 0:
print('Publishing records..')
producer = connect_kafka_producer()
for rec in parsed_records:
publish_message(producer, parsed_topic_name, 'parsed', rec)