NodyHub · Zaunei · Mar 25, 2018 · Mar 25, 2018 · Mar 25, 2018 · Mar 25, 2018
diff --git a/config.yaml.dist b/config.yaml.dist
@@ -12,6 +12,12 @@ loop:
 # 	- https://www.wg-gesucht.de/...
 urls:
 
+# Addresses often contain city district names that Google
+# Maps does not handle well. Each word listed here (exact,
+# case-sensitive match) is removed from the address.
+blacklist:
+  - Innenstadt
+
 # If an expose includes an address, the bot is capable of
 # displaying the distance and time to travel (duration) to
 # some configured other addresses, for specific kinds of
@@ -22,6 +28,7 @@ urls:
 # 	- "bicyle"
 #	- "transit" (public transport)
 #	- "driving"
+#	- "walking"
 # 
 # The example configuration below includes a place for
 # "John", located at the main train station of munich.

diff --git a/flathunter/crawl_immobilienscout.py b/flathunter/crawl_immobilienscout.py
@@ -53,18 +53,21 @@ def extract_data(self, soup):
         address_fields = soup.find_all(lambda e: e.has_attr('class') and 'result-list-entry__address' in e['class'])
 
         for idx, title_el in enumerate(title_elements):
-            attr_els = attr_container_els[idx].find_all('dd')
-            address = address_fields[idx].text.strip()
-            details = {
-                'id': expose_ids[idx],
-                'url': expose_urls[idx],
-                'title': title_el.text.strip().replace('NEU', ''),
-                'price': attr_els[0].text.strip().split(' ')[0].strip(),
-                'size': attr_els[1].text.strip().split(' ')[0].strip() + " qm",
-                'rooms': attr_els[2].text.strip().split(' ')[0].strip() + " Zi.",
-                'address': address
-            }
-            entries.append(details)
+            try:
+                attr_els = attr_container_els[idx].find_all('dd')
+                address = address_fields[idx].text.strip().replace(',', '')
+                details = {
+                    'id': expose_ids[idx],
+                    'url': expose_urls[idx],
+                    'title': title_el.text.strip().replace('NEU', ''),
+                    'price': attr_els[0].text.strip().split(' ')[0].strip(),
+                    'size': attr_els[1].text.strip().split(' ')[0].strip() + " qm",
+                    'rooms': attr_els[2].text.strip().split(' ')[0].strip() + " Zi.",
+                    'address': address
+                }
+                entries.append(details)
+            except:
+                print("Error")
 
         self.__log__.debug('extracted: ' + str(entries))
         return entries
diff --git a/flathunter/crawl_wggesucht.py b/flathunter/crawl_wggesucht.py
@@ -1,6 +1,7 @@
 import logging
-import requests
 import re
+
+import requests
 from bs4 import BeautifulSoup
 
 
@@ -38,33 +39,33 @@ def get_page(self, search_url, page_no):
         resp = requests.get(search_url)  # TODO add page_no in url
         if resp.status_code != 200:
             self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
-        return BeautifulSoup(resp.content, 'html.parser')
+        return BeautifulSoup(resp.content, 'lxml')
 
     def extract_data(self, soup):
         entries = []
 
-        findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('ad--'))
+        findings = soup.find_all(lambda e: e.has_attr('id') and e['id'].startswith('liste-'))
         existingFindings = list(
-            filter(lambda e: e.has_attr('class') and not 'listenansicht-inactive' in e['class'], findings))
+            filter(lambda e: e.has_attr('class') and not 'display-none' in e['class'], findings))
 
         baseurl = 'https://www.wg-gesucht.de/'
         for row in existingFindings:
-            url = baseurl + row['adid']  # u'wohnungen-in-Muenchen-Altstadt-Lehel.6038357.html'
-            id = int(url.split('.')[-2])
-            rooms = row.find(lambda e: e.has_attr('class') and 'ang_spalte_zimmer' in e['class']).text.strip()  # u'3'
-            price = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_miete' in e['class']).text.strip()  # u'433\u20ac'
-            size = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_groesse' in e['class']).text.strip()  # u'75m\xb2'
-            district = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_stadt' in e['class']).text.strip()  # u'Altstadt-Lehel'
-            date = row.find(
-                lambda e: e.has_attr('class') and 'ang_spalte_freiab' in e['class']).text.strip()  # u'21.03.17'
+            infostring = row.find(
+                lambda e: e.name == "div" and e.has_attr('class') and 'list-details-panel-inner' in e[
+                    'class']).p.text.strip()
+            rooms = re.findall(r'\d[-]Zimmer[-]Wohnung', infostring)[0][:1]
+            date = re.findall(r'\d{2}.\d{2}.\d{4}', infostring)[0]
+            detail = row.find_all(lambda e: e.name == "a" and e.has_attr('class') and 'detailansicht' in e['class']);
+            title = detail[2].text.strip()
+            url = baseurl + detail[0]["href"]
+            size_price = detail[0].text.strip()
+            price = re.findall(r'\d{2,4}\s€', size_price)[0]
+            size = re.findall(r'\d{2,4}\sm²', size_price)[0]
 
             details = {
                 'id': int(url.split('.')[-2]),
                 'url': url,
-                'title': "Wohnung in %s ab dem %s" % (district, date),
+                'title': "%s ab dem %s" % (title, date),
                 'price': price,
                 'size': size,
                 'rooms': rooms + " Zi.",
@@ -78,9 +79,7 @@ def extract_data(self, soup):
 
     def load_address(self, url):
         # extract address from expose itself
-        exposeHTML = requests.get(url).content
-        exposeSoup = BeautifulSoup(exposeHTML, 'html.parser')
-        address_raw = exposeSoup.find(lambda e: e.has_attr('onclick') and '#map_tab' in e['onclick']).text
-        address = address_raw.strip().split('\n')[0] + ", " + address_raw.strip().split('\n')[-1].strip()
-
+        r = requests.get(url)
+        flat = BeautifulSoup(r.content, 'lxml')
+        address = ' '.join(flat.find('div', {"class": "col-sm-4 mb10"}).find("a", {"href": "#"}).text.strip().split())
         return address
diff --git a/flathunter/hunter.py b/flathunter/hunter.py
@@ -1,16 +1,20 @@
+import datetime
 import logging
-import requests
 import re
-import urllib
-import datetime
 import time
+import urllib
+
+import requests
+
 from flathunter.sender_telegram import SenderTelegram
 
+
 class Hunter:
     __log__ = logging.getLogger(__name__)
     GM_MODE_TRANSIT = 'transit'
     GM_MODE_BICYCLE = 'bicycling'
     GM_MODE_DRIVING = 'driving'
+    GM_MODE_WALKING = 'walking'
 
     def hunt_flats(self, config, searchers, id_watch):
         sender = SenderTelegram(config)
@@ -50,6 +54,10 @@ def hunt_flats(self, config, searchers, id_watch):
                             self.__log__.debug("Loaded address %s for url %s" % (address, url))
                             break
 
+                # drop blacklisted district names (whole-word match) from the address
+                blacklist = config.get('blacklist', list())
+                address = ' '.join(filter(lambda x: x not in blacklist, address.split()))
+
                 # calculdate durations
                 message = config.get('message', "").format(
                     title=expose['title'],
@@ -74,16 +82,16 @@ def get_formatted_durations(self, config, address):
                 dest = duration.get('destination')
                 name = duration.get('name')
                 for mode in duration.get('modes', list()):
-                    if 'gm_id' in mode and 'title' in mode and 'key' in config.get('google_maps_api',dict()):
+                    if 'gm_id' in mode and 'title' in mode and 'key' in config.get('google_maps_api', dict()):
                         duration = self.get_gmaps_distance(config, address, dest, mode['gm_id'])
                         out += "> %s (%s): %s\n" % (name, mode['title'], duration)
 
         return out.strip()
 
     def get_gmaps_distance(self, config, address, dest, mode):
         # get timestamp for next monday at 9:00:00 o'clock
-        now = datetime.datetime.today().replace(hour=9,minute=0,second=0)
-        next_monday = now + datetime.timedelta(days=(7-now.weekday()))
+        now = datetime.datetime.today().replace(hour=9, minute=0, second=0)
+        next_monday = now + datetime.timedelta(days=(7 - now.weekday()))
         arrival_time = str(int(time.mktime(next_monday.timetuple())))
 
         # decode from unicode and url encode addresses
@@ -92,12 +100,12 @@ def get_gmaps_distance(self, config, address, dest, mode):
         self.__log__.debug("Got address: %s" % address)
 
         # get google maps config stuff
-        base_url = config.get('google_maps_api',dict()).get('url')
-        gm_key = config.get('google_maps_api',dict()).get('key')
+        base_url = config.get('google_maps_api', dict()).get('url')
+        gm_key = config.get('google_maps_api', dict()).get('key')
 
         if not gm_key and mode != self.GM_MODE_DRIVING:
             self.__log__.warning("No Google Maps API key configured and without using a mode different from "
-                                    "'driving' is not allowed. Downgrading to mode 'drinving' thus. ")
+                                 "'driving' is not allowed. Downgrading to mode 'drinving' thus. ")
             mode = 'driving'
             base_url = base_url.replace('&key={key}', '')
 
@@ -113,14 +121,13 @@ def get_gmaps_distance(self, config, address, dest, mode):
         for row in result['rows']:
             for element in row['elements']:
                 if 'status' in element and element['status'] != 'OK':
-                    self.__log__.warning("For address %s we got the status message: %s" % (address,element['status']))
+                    self.__log__.warning("For address %s we got the status message: %s" % (address, element['status']))
                     self.__log__.debug("We got this result: %s" % repr(result))
                     continue
-                    self.__log__.debug("Got distance and duration: %s / %s (%i seconds)"
-                                       % (element['distance']['text'], element['duration']['text'],
-                                          element['duration']['value'])
-                                       )
+                self.__log__.debug("Got distance and duration: %s / %s (%i seconds)"
+                                   % (element['distance']['text'], element['duration']['text'],
+                                      element['duration']['value'])
+                                   )
                 distances[element['duration']['value']] = '%s (%s)' % \
                                                           (element['duration']['text'], element['distance']['text'])
         return distances[min(distances.keys())] if distances else None
-
diff --git a/requirements.txt b/requirements.txt
@@ -6,3 +6,4 @@ idna==2.5
 PyYAML==3.12
 requests==2.18.1
 urllib3==1.21.1
+lxml==4.2.1