Skip to content

Commit

Permalink
Exceptions will now be caught and logged
Browse files Browse the repository at this point in the history
  • Loading branch information
NieThor committed Aug 17, 2020
1 parent 1ff763c commit ff92e2a
Showing 1 changed file with 146 additions and 142 deletions.
288 changes: 146 additions & 142 deletions VDE web scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ async def scrape_events(driver):
"""

global curr_last_times
channel = await client.get_entity('t.me/vdeyoungnet') # getting the telegram channel
channel = await client.get_entity('@TN_freqtrade_14052020_bot')#t.me/vdeyoungnet') # getting the telegram channel
events = {}
i = 0
while True:
Expand All @@ -130,147 +130,151 @@ async def scrape_events(driver):
curr_hour = datetime.datetime.now().hour
if 21 < curr_hour or curr_hour < 9: # no messages between 21:00 and 9:00
continue
driver.get(base_url)
wait = WebDriverWait(driver, 10)

# wait until events are lazy-loaded
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ci-teaser-automatic')))

# first three events are teasers
events_raw = driver.find_elements_by_class_name('ci-teaser-automatic')[0:3]

# others are available in the search-list
events_raw += driver.find_elements_by_class_name('ci-search-teaser')

# only posting if event is in less than 4 weeks?
curr_time_w_offset = datetime.datetime.now() + datetime.timedelta(days=40)

# only posting maximum every week
# posting_offset = datetime.datetime.now() - datetime.timedelta(days=7)

for event_raw in events_raw:
event_url = event_raw.find_element_by_tag_name('a').get_attribute('href')
if event_url not in events.keys():
event = VdeEvent()
event.event_url = event_url
else:
event = events[event_url]
resp = requests.get(event_url)
soup = BeautifulSoup(resp.text, 'html.parser')
if 'vde.com' in event_url: # some events are hosted on vde.com

# getting event information from the html code considering classes and content
event.title = prettify_string(soup.find('h1', {'class': 'ci-h2'}).get_text())
event.img_url = 'https://www.vde.com' + \
soup.find('div', {'class': 'ci-image-caption'}).find('img')['srcset']

table = soup.find('table', {'class': 'ci-stencil-event-table'})
tbody = table.find('tbody')
if None is tbody:
tbody = table
rows = tbody.find_all('tr')
temp_tds = rows[0].find_all('td')
temp_start = temp_tds[1].get_text()
if len(temp_tds) > 2:
temp_start += f' {temp_tds[2].get_text()}'
event.start = strip_time(temp_start)
temp_tds = rows[1].find_all('td')
temp_end = temp_tds[1].get_text()
if len(temp_tds) > 2:
temp_end += f' {temp_tds[2].get_text()}'
event.end = strip_time(temp_end)

def find_by_str(tag, search_str):
return tag.name == 'div' and tag.has_attr('class') and 'row-1' in tag[
'class'] and search_str in tag.get_text()

def find_location(tag):
return find_by_str(tag, 'Veranstaltungsort')
event.location = [prettify_string(
soup.find(find_location).parent.find_all('div')[1].find('p').get_text())]

def find_language(tag):
return find_by_str(tag, 'Veranstaltungs-Sprache')

def find_desc(tag):
return find_by_str(tag, 'Beschreibung')

def find_desc_2(tag):
return find_by_str(tag, 'Bemerkung')

event.language = [soup.find(find_language).parent.find('img').get('title')]

desc = soup.find(find_desc)
if None is desc:
desc = soup.find(find_desc_2)
if None is not desc:
event.description = prettify_string_with_list(desc.parent.find_all('div')[1])

elif 'vde-verlag.de' in event_url: # some events are hosted on vde-verlag.de

# getting event information from the html code considering classes and content
event.title = prettify_string(soup.find('h1', {'class': 'hyphenate'}).get_text())
event.img_url = 'https://www.vde-verlage.de' + \
soup.find('img', {'id': 'cover'})['src']
starts = []
ends = []
locations = []
table = soup.find('table', {'id': 'seminartermine'})
rows = table.find('tbody').find_all('tr')
for row in rows:
start_col, end_col, loc_col = row.find_all('td')[1:4]
starts.append(start_col.get_text())
ends.append(end_col.get_text())
locations.append(loc_col.find('span', {'class': 'hidden-xs'}).get_text())

event.start = strip_time(starts)
event.end = strip_time(ends)
event.location = locations
event.description = prettify_string(
soup.find('div', {'id': 'beschreibung'}).get_text())
if event.event_url in curr_last_times:
continue
# only post event if not been posted recently and event is coming up soon
if (None is event.last_posting_time) and event.start[0] <= curr_time_w_offset: # or event.last_posting_time < posting_offset) \
message = f'[{event.title}]({event_url})\n'
for i in range(len(event.start)):
if len(event.start) > 1:
message += f'{i + 1}. Termin\n'
message += f'Beginn: {event.start[i].strftime(datetime_format) + " Uhr" if type(event.start[i]) is datetime.datetime else event.start[i].strftime(date_format)}\n'
if (type(event.start[i]) is datetime.date
and type(event.end[i]) is datetime.date
and event.start[i] != event.end[i]) \
or (type(event.start[i]) is datetime.datetime
and type(event.end[i]) is datetime.date
and event.start[i].date() != event.end[i]):
message += f'Ende: {event.end[i].strftime(datetime_format if type(event.end[i]) is datetime.datetime else date_format)}\n'
if len(event.language) != 0:
message += 'Sprache: '
if 'de' in event.language[0]:
message += "\U0001f1e9\U0001f1ea"
else:
message += "\U0001f1ec\U0001f1e7"
message += '\n'
event.location[i] = event.location[i].replace('\n', ', ')
message += f'Ort: {event.location[i]}\n\n'
if None is not event.description:
message += f'__Beschreibung:__\n{event.description}'
image = requests.get(event.img_url, stream=True).content
with open('temp_image', 'wb') as img_file:
img_file.write(image)
with open('temp_image', 'rb') as img_file:
pass
await client.send_message(entity=channel, message=f'[{event.title}]({event.event_url})\n',
file=img_file)

await client.send_message(entity=channel, message=message,
link_preview=True) # , file=img_file) not usable right now, because text is limited elsewise
event.last_posting_time = datetime.datetime.now()
curr_last_times[event.event_url] = event.last_posting_time
with open('last_scraping_time.json', 'w') as time_write:
json.dump(datetime_dict_to_str(curr_last_times), time_write)

events[event.event_url] = event # update or append event
try:
driver.get(base_url)
wait = WebDriverWait(driver, 10)

# wait until events are lazy-loaded
wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ci-teaser-automatic')))

# first three events are teasers
events_raw = driver.find_elements_by_class_name('ci-teaser-automatic')[0:3]

# others are available in the search-list
events_raw += driver.find_elements_by_class_name('ci-search-teaser')

# only posting if event is in less than 4 weeks?
curr_time_w_offset = datetime.datetime.now() + datetime.timedelta(days=40)

# only posting maximum every week
# posting_offset = datetime.datetime.now() - datetime.timedelta(days=7)

for event_raw in events_raw:
event_url = event_raw.find_element_by_tag_name('a').get_attribute('href')
if event_url not in events.keys():
event = VdeEvent()
event.event_url = event_url
else:
event = events[event_url]
resp = requests.get(event_url)
soup = BeautifulSoup(resp.text, 'html.parser')
if 'vde.com' in event_url: # some events are hosted on vde.com

# getting event information from the html code considering classes and content
event.title = prettify_string(soup.find('h1', {'class': 'ci-h2'}).get_text())
event.img_url = 'https://www.vde.com' + \
soup.find('div', {'class': 'ci-image-caption'}).find('img')['srcset']

table = soup.find('table', {'class': 'ci-stencil-event-table'})
tbody = table.find('tbody')
if None is tbody:
tbody = table
rows = tbody.find_all('tr')
temp_tds = rows[0].find_all('td')
temp_start = temp_tds[1].get_text()
if len(temp_tds) > 2:
temp_start += f' {temp_tds[2].get_text()}'
event.start = strip_time(temp_start)
temp_tds = rows[1].find_all('td')
temp_end = temp_tds[1].get_text()
if len(temp_tds) > 2:
temp_end += f' {temp_tds[2].get_text()}'
event.end = strip_time(temp_end)

def find_by_str(tag, search_str):
return tag.name == 'div' and tag.has_attr('class') and 'row-1' in tag[
'class'] and search_str in tag.get_text()

def find_location(tag):
return find_by_str(tag, 'Veranstaltungsort')
event.location = [prettify_string(
soup.find(find_location).parent.find_all('div')[1].find('p').get_text())]

def find_language(tag):
return find_by_str(tag, 'Veranstaltungs-Sprache')

def find_desc(tag):
return find_by_str(tag, 'Beschreibung')

def find_desc_2(tag):
return find_by_str(tag, 'Bemerkung')

event.language = [soup.find(find_language).parent.find('img').get('title')]

desc = soup.find(find_desc)
if None is desc:
desc = soup.find(find_desc_2)
if None is not desc:
event.description = prettify_string_with_list(desc.parent.find_all('div')[1])

elif 'vde-verlag.de' in event_url: # some events are hosted on vde-verlag.de

# getting event information from the html code considering classes and content
event.title = prettify_string(soup.find('h1', {'class': 'hyphenate'}).get_text())
event.img_url = 'https://www.vde-verlage.de' + \
soup.find('img', {'id': 'cover'})['src']
starts = []
ends = []
locations = []
table = soup.find('table', {'id': 'seminartermine'})
rows = table.find('tbody').find_all('tr')
for row in rows:
start_col, end_col, loc_col = row.find_all('td')[1:4]
starts.append(start_col.get_text())
ends.append(end_col.get_text())
locations.append(loc_col.find('span', {'class': 'hidden-xs'}).get_text())

event.start = strip_time(starts)
event.end = strip_time(ends)
event.location = locations
event.description = prettify_string(
soup.find('div', {'id': 'beschreibung'}).get_text())
if event.event_url in curr_last_times:
continue
# only post event if not been posted recently and event is coming up soon
if (None is event.last_posting_time) and event.start[0] <= curr_time_w_offset: # or event.last_posting_time < posting_offset) \
message = f'[{event.title}]({event_url})\n'
for i in range(len(event.start)):
if len(event.start) > 1:
message += f'{i + 1}. Termin\n'
message += f'Beginn: {event.start[i].strftime(datetime_format) + " Uhr" if type(event.start[i]) is datetime.datetime else event.start[i].strftime(date_format)}\n'
if (type(event.start[i]) is datetime.date
and type(event.end[i]) is datetime.date
and event.start[i] != event.end[i]) \
or (type(event.start[i]) is datetime.datetime
and type(event.end[i]) is datetime.date
and event.start[i].date() != event.end[i]):
message += f'Ende: {event.end[i].strftime(datetime_format if type(event.end[i]) is datetime.datetime else date_format)}\n'
if len(event.language) != 0:
message += 'Sprache: '
if 'de' in event.language[0]:
message += "\U0001f1e9\U0001f1ea"
else:
message += "\U0001f1ec\U0001f1e7"
message += '\n'
event.location[i] = event.location[i].replace('\n', ', ')
message += f'Ort: {event.location[i]}\n\n'
if None is not event.description:
message += f'__Beschreibung:__\n{event.description}'
image = requests.get(event.img_url, stream=True).content
with open('temp_image', 'wb') as img_file:
img_file.write(image)
with open('temp_image', 'rb') as img_file:
pass
await client.send_message(entity=channel, message=f'[{event.title}]({event.event_url})\n',
file=img_file)

await client.send_message(entity=channel, message=message,
link_preview=True) # , file=img_file) not usable right now, because text is limited elsewise
event.last_posting_time = datetime.datetime.now()
curr_last_times[event.event_url] = event.last_posting_time
with open('last_scraping_time.json', 'w') as time_write:
json.dump(datetime_dict_to_str(curr_last_times), time_write)

events[event.event_url] = event # update or append event
except Exception as e:
with open('log.txt', 'a+') as log_file:
log_file.write(f'{datetime.datetime.now().strftime(datetime_format)}: {str(e)}\n')


if __name__ == '__main__':
Expand Down

0 comments on commit ff92e2a

Please sign in to comment.