Merge branch 'master' into v1.2.1

chicago-justice-project · Sep 18, 2019 · 9024375 · 9024375
2 parents 37e381b + 48f9ace
commit 9024375
Show file tree

Hide file tree

Showing 5 changed files with 219 additions and 70 deletions.
diff --git a/CODEOWNERS b/CODEOWNERS
@@ -0,0 +1 @@
+* @jherzberg @mchladek
diff --git a/lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py b/lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py
@@ -23,9 +23,9 @@
 else:
     raise Exception('BAD ARGUMENTS')
 
-crime_df = df.ix[df.loc[:, 'OEMC':'TASR'].any(1), :]
+crime_df = df.loc[df.loc[:, 'OEMC':'TASR'].any(1), :]
 crime_df = crime_df.append(
-    df.ix[~df['relevant'], :].sample(n=min(3000, (~df['relevant']).sum()),
+    df.loc[~df['relevant'], :].sample(n=min(3000, (~df['relevant']).sum()),
                                      axis=0)
 )
 

diff --git a/lib/tagnews/geoloc/tag.py b/lib/tagnews/geoloc/tag.py
@@ -1,21 +1,21 @@
 from __future__ import division
 
-import os
-from collections import namedtuple
 import glob
-import time
 import json
+import os
 import re
+import time
+from collections import namedtuple
+from contextlib import ExitStack, redirect_stderr
 
-import requests
-import pandas as pd
 import numpy as np
+import pandas as pd
+import requests
 from shapely.geometry import shape, Point
 
+from tagnews.utils.neighborhoods import neighborhoods
 from .. import utils
 
-from contextlib import ExitStack, redirect_stderr
-
 with ExitStack() as stack:
     null_stream = open(os.devnull, "w")
     stack.enter_context(null_stream)
@@ -26,12 +26,15 @@
 Contains the CrimeTags class that allows tagging of articles.
 """
 
-MODEL_LOCATION = os.path.join(os.path.split(__file__)[0],
-                              os.path.join('models', 'lstm', 'saved'))
+MODEL_LOCATION = os.path.join(
+    os.path.split(__file__)[0], os.path.join("models", "lstm", "saved")
+)
 
 COMMUNITY_AREAS_FILE = os.path.join(
-    os.path.split(__file__)[0], '..', 'data',
-    'Boundaries - Community Areas (current).geojson'
+    os.path.split(__file__)[0],
+    "..",
+    "data",
+    "Boundaries - Community Areas (current).geojson",
 )
 
 
@@ -50,39 +53,48 @@ def post_process(geostring):
     processed_geostring : str
     """
     # Merge multiple whitespaces into one
-    geostring = ' '.join(geostring.split())
+    geostring = " ".join(geostring.split())
 
     # gisgraphy struggles with things like "55th and Woodlawn".
     # replace "...<number><number ender, e.g. th or rd> and..."
     # with two zeros.
     # \100 does not work correclty so we need to add a separator.
-    geostring = re.sub(r'([0-9]+)(th|rd|st) and',
-                       r'\1<__internal_separator__>00 and',
-                       geostring)
-    geostring = geostring.replace('<__internal_separator__>', '')
+    geostring = re.sub(
+        r"([0-9]+)(th|rd|st) and", r"\1<__internal_separator__>00 and", geostring
+    )
+    geostring = geostring.replace("<__internal_separator__>", "")
 
     # remove stopwords, only if they are internal, i.e.
     # the geostring doesn't start with "block ...".
-    for stopword in ['block', 'of', 'and']:
-        geostring = geostring.replace(' {} '.format(stopword), ' ')
+    for stopword in ["block", "of", "and"]:
+        geostring = geostring.replace(" {} ".format(stopword), " ")
 
     return geostring
 
 
-_base_geocoder_url = ('http://ec2-34-228-58-223.compute-1.amazonaws.com'
-                      ':4000/v1/search?text={}')
+_base_geocoder_url = (
+    "http://ec2-34-228-58-223.compute-1.amazonaws.com" ":4000/v1/search?text={}"
+)
 
-GeocodeResults = namedtuple('GeocodeResults', ['coords_raw',
-                                               'full_responses_raw',
-                                               'scores_raw',
-                                               'coords_post',
-                                               'full_responses_post',
-                                               'scores_post'])
+GeocodeResults = namedtuple(
+    "GeocodeResults",
+    [
+        "coords_raw",
+        "full_responses_raw",
+        "scores_raw",
+        "coords_post",
+        "full_responses_post",
+        "scores_post",
+    ],
+)
 
 
-def get_lat_longs_from_geostrings(geostring_list, post_process_f=None,
-                                  sleep_secs=0,
-                                  geocoder_url_formatter=_base_geocoder_url):
+def get_lat_longs_from_geostrings(
+    geostring_list,
+    post_process_f=None,
+    sleep_secs=0,
+    geocoder_url_formatter=_base_geocoder_url,
+):
     """
     Geo-code each geostring in `geostring_list` into lat/long values.
     Also return the full response from the geocoding service.
@@ -129,29 +141,30 @@ def _geocode(lst):
         full_responses = []
         for addr_str in lst:
             try:
-                g = json.loads(requests.get(
-                    geocoder_url_formatter.format(addr_str)
-                ).text)
+                g = json.loads(
+                    requests.get(geocoder_url_formatter.format(addr_str)).text
+                )
             except Exception:
                 g = {}
             full_responses.append(g)
             time.sleep(sleep_secs)
 
         def _get_latlong(g):
             try:
-                return g['features'][0]['geometry']['coordinates']
+                return g["features"][0]["geometry"]["coordinates"]
             except (KeyError, IndexError):
                 return [np.nan, np.nan]
 
         def _get_confidence(g):
             try:
-                return g['features'][0]['properties']['confidence']
+                return g["features"][0]["properties"]["confidence"]
             except (KeyError, IndexError):
                 return np.nan
 
-        coords = pd.DataFrame([_get_latlong(g) for g in full_responses],
-                              columns=['long', 'lat'])
-        coords = coords[['lat', 'long']] # it makes me feel better, OK?
+        coords = pd.DataFrame(
+            [_get_latlong(g) for g in full_responses], columns=["long", "lat"]
+        )
+        coords = coords[["lat", "long"]]  # it makes me feel better, OK?
         scores = np.array([_get_confidence(g) for g in full_responses])
 
         return full_responses, coords, scores
@@ -162,12 +175,14 @@ def _get_confidence(g):
         [post_process_f(geo_s) for geo_s in geostring_list]
     )
 
-    return GeocodeResults(coords_raw=coords_raw,
-                          full_responses_raw=full_responses_raw,
-                          scores_raw=scores_raw,
-                          coords_post=coords_post,
-                          full_responses_post=full_responses_post,
-                          scores_post=scores_post)
+    return GeocodeResults(
+        coords_raw=coords_raw,
+        full_responses_raw=full_responses_raw,
+        scores_raw=scores_raw,
+        coords_post=coords_post,
+        full_responses_post=full_responses_post,
+        scores_post=scores_post,
+    )
 
 
 def load_model(location=MODEL_LOCATION):
@@ -178,29 +193,32 @@ def load_model(location=MODEL_LOCATION):
 
     The files with the most recent timestamp are loaded.
     """
-    models = glob.glob(os.path.join(location, 'weights*.hdf5'))
+    models = glob.glob(os.path.join(location, "weights*.hdf5"))
     if not models:
-        raise RuntimeError(('No models to load. Run'
-                            ' "python -m tagnews.geoloc.models.'
-                            'lstm.save_model"'))
+        raise RuntimeError(
+            (
+                "No models to load. Run"
+                ' "python -m tagnews.geoloc.models.'
+                'lstm.save_model"'
+            )
+        )
 
     model = keras.models.load_model(models[-1])
 
     return model
 
 
-class GeoCoder():
+class GeoCoder:
     def __init__(self):
         self.model = load_model()
         self.glove = utils.load_vectorizer.load_glove(
-            os.path.join(os.path.split(__file__)[0],
-                         '../data/glove.6B.50d.txt')
+            os.path.join(os.path.split(__file__)[0], "../data/glove.6B.50d.txt")
         )
         with open(COMMUNITY_AREAS_FILE) as f:
             d = json.load(f)
             self.com_areas = {
-                f['properties']['community']: shape(f['geometry'])
-                for f in d['features']
+                f["properties"]["community"]: shape(f["geometry"])
+                for f in d["features"]
             }
 
     def pre_process(self, s):
@@ -223,12 +241,14 @@ def pre_process(self, s):
             Has shape (1, N, M) where N is the number of words and M
             is the size of the word vectors, currently M is 51.
         """
-        words = s.split() # split along white space.
-        data = pd.concat([pd.DataFrame([[w[0].isupper()] if w else [False]
-                                        for w in words]),
-                          (self.glove.reindex(words).fillna(0)
-                           .reset_index(drop=True))],
-                         axis='columns')
+        words = s.split()  # split along white space.
+        data = pd.concat(
+            [
+                pd.DataFrame([[w[0].isupper()] if w else [False] for w in words]),
+                (self.glove.reindex(words).fillna(0).reset_index(drop=True)),
+            ],
+            axis="columns",
+        )
         return words, np.expand_dims(data, axis=0)
 
     def extract_geostring_probs(self, s):
@@ -271,24 +291,26 @@ def extract_geostrings(self, s, prob_thresh=0.5):
         geostrings : list of lists of strings
             The list of extracted geostrings from the article text.
             Each word is kept separated in the list.
-            Examle:
+            Example:
                 [['1300', 'W.', 'Halsted'], ['Ohio']]
         """
         words, probs = self.extract_geostring_probs(s)
         above_thresh = probs >= prob_thresh
 
-        words = ['filler'] + words + ['filler']
-        above_thresh = np.concatenate([[False],
-                                       above_thresh,
-                                       [False]]).astype(np.int32)
+        words = ["filler"] + words + ["filler"]
+        probs = np.append(0, np.append(probs, 0))
+
+        above_thresh = np.concatenate([[False], above_thresh, [False]]).astype(np.int32)
         switch_ons = np.where(np.diff(above_thresh) == 1)[0] + 1
         switch_offs = np.where(np.diff(above_thresh) == -1)[0] + 1
 
         geostrings = []
+        probstrings = []
         for on, off in zip(switch_ons, switch_offs):
             geostrings.append(words[on:off])
+            probstrings.append(probs[on:off])
 
-        return geostrings
+        return geostrings, probstrings
 
     @staticmethod
     def lat_longs_from_geostring_lists(geostring_lists, **kwargs):
@@ -317,7 +339,7 @@ def lat_longs_from_geostring_lists(geostring_lists, **kwargs):
             of absolute rule.
         """
         out = get_lat_longs_from_geostrings(
-            [' '.join(gl) for gl in geostring_lists], **kwargs
+            [" ".join(gl) for gl in geostring_lists], **kwargs
         )
 
         return out.coords_post, out.scores_post
@@ -340,11 +362,40 @@ def community_area_from_coords(self, coords):
         """
         out = []
         for _, coord in coords.iterrows():
-            p = Point(coord['long'], coord['lat'])
+            p = Point(coord["long"], coord["lat"])
             for com_name, com_shape in self.com_areas.items():
                 if com_shape.contains(p):
                     out.append(com_name)
                     break
             else:
-                out.append('')
+                out.append("")
         return out
+
+    def best_geostring(self, extracted_strs_and_probs: tuple):
+        """
+        Pick the extracted geostring with the highest mean word probability.
+
+        Parameters
+        ----------
+        extracted_strs_and_probs : 2-tuple
+            The geostrings (each a list of words) at index zero and the
+            matching lists of per-word probabilities at index one.
+
+        Returns
+        -------
+        list of str
+            The words of the best geostring among those that contain a known
+            neighborhood name or have at least three words. An empty list is
+            returned when no geostring qualifies.
+        """
+        candidates = []
+        for geostring, probs in zip(
+            extracted_strs_and_probs[0], extracted_strs_and_probs[1]
+        ):
+            joined = " ".join(geostring).lower()
+            if len(geostring) >= 3 or any(n.lower() in joined for n in neighborhoods):
+                candidates.append((sum(probs) / len(probs), geostring))
+        if not candidates:
+            # Nothing qualified; max() would raise ValueError on an empty list.
+            return []
+        return max(candidates, key=lambda pair: pair[0])[1]
diff --git a/lib/tagnews/tests/test_geocoder.py b/lib/tagnews/tests/test_geocoder.py
@@ -23,7 +23,7 @@ def test_extract_geostring_probs(self):
         max_word = words[np.argmax(probs)]
         geostrings = self.model.extract_geostrings(article,
                                                    prob_thresh=max_prob-0.001)
-        assert max_word in [word for geostring in geostrings for word in geostring]
+        assert max_word in [word for geostring in geostrings for word in geostring][0]
 
     def test_extract_geostring_probs_word_not_in_glove(self):
         """