diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..5f5a7cf --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1 @@ +* @jherzberg @mchladek \ No newline at end of file diff --git a/lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py b/lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py index e7ccabc..4027554 100644 --- a/lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py +++ b/lib/tagnews/crimetype/models/binary_stemmed_logistic/save_model.py @@ -23,9 +23,9 @@ else: raise Exception('BAD ARGUMENTS') -crime_df = df.ix[df.loc[:, 'OEMC':'TASR'].any(1), :] +crime_df = df.loc[df.loc[:, 'OEMC':'TASR'].any(1), :] crime_df = crime_df.append( - df.ix[~df['relevant'], :].sample(n=min(3000, (~df['relevant']).sum()), + df.loc[~df['relevant'], :].sample(n=min(3000, (~df['relevant']).sum()), axis=0) ) diff --git a/lib/tagnews/geoloc/tag.py b/lib/tagnews/geoloc/tag.py index 9ee53ea..d04fc19 100644 --- a/lib/tagnews/geoloc/tag.py +++ b/lib/tagnews/geoloc/tag.py @@ -1,21 +1,21 @@ from __future__ import division -import os -from collections import namedtuple import glob -import time import json +import os import re +import time +from collections import namedtuple +from contextlib import ExitStack, redirect_stderr -import requests -import pandas as pd import numpy as np +import pandas as pd +import requests from shapely.geometry import shape, Point +from tagnews.utils.neighborhoods import neighborhoods from .. import utils -from contextlib import ExitStack, redirect_stderr - with ExitStack() as stack: null_stream = open(os.devnull, "w") stack.enter_context(null_stream) @@ -26,12 +26,15 @@ Contains the CrimeTags class that allows tagging of articles. 
""" -MODEL_LOCATION = os.path.join(os.path.split(__file__)[0], - os.path.join('models', 'lstm', 'saved')) +MODEL_LOCATION = os.path.join( + os.path.split(__file__)[0], os.path.join("models", "lstm", "saved") +) COMMUNITY_AREAS_FILE = os.path.join( - os.path.split(__file__)[0], '..', 'data', - 'Boundaries - Community Areas (current).geojson' + os.path.split(__file__)[0], + "..", + "data", + "Boundaries - Community Areas (current).geojson", ) @@ -50,39 +53,48 @@ def post_process(geostring): processed_geostring : str """ # Merge multiple whitespaces into one - geostring = ' '.join(geostring.split()) + geostring = " ".join(geostring.split()) # gisgraphy struggles with things like "55th and Woodlawn". # replace "... and..." # with two zeros. # \100 does not work correclty so we need to add a separator. - geostring = re.sub(r'([0-9]+)(th|rd|st) and', - r'\1<__internal_separator__>00 and', - geostring) - geostring = geostring.replace('<__internal_separator__>', '') + geostring = re.sub( + r"([0-9]+)(th|rd|st) and", r"\1<__internal_separator__>00 and", geostring + ) + geostring = geostring.replace("<__internal_separator__>", "") # remove stopwords, only if they are internal, i.e. # the geostring doesn't start with "block ...". 
- for stopword in ['block', 'of', 'and']: - geostring = geostring.replace(' {} '.format(stopword), ' ') + for stopword in ["block", "of", "and"]: + geostring = geostring.replace(" {} ".format(stopword), " ") return geostring -_base_geocoder_url = ('http://ec2-34-228-58-223.compute-1.amazonaws.com' - ':4000/v1/search?text={}') +_base_geocoder_url = ( + "http://ec2-34-228-58-223.compute-1.amazonaws.com" ":4000/v1/search?text={}" +) -GeocodeResults = namedtuple('GeocodeResults', ['coords_raw', - 'full_responses_raw', - 'scores_raw', - 'coords_post', - 'full_responses_post', - 'scores_post']) +GeocodeResults = namedtuple( + "GeocodeResults", + [ + "coords_raw", + "full_responses_raw", + "scores_raw", + "coords_post", + "full_responses_post", + "scores_post", + ], +) -def get_lat_longs_from_geostrings(geostring_list, post_process_f=None, - sleep_secs=0, - geocoder_url_formatter=_base_geocoder_url): +def get_lat_longs_from_geostrings( + geostring_list, + post_process_f=None, + sleep_secs=0, + geocoder_url_formatter=_base_geocoder_url, +): """ Geo-code each geostring in `geostring_list` into lat/long values. Also return the full response from the geocoding service. 
@@ -129,9 +141,9 @@ def _geocode(lst): full_responses = [] for addr_str in lst: try: - g = json.loads(requests.get( - geocoder_url_formatter.format(addr_str) - ).text) + g = json.loads( + requests.get(geocoder_url_formatter.format(addr_str)).text + ) except Exception: g = {} full_responses.append(g) @@ -139,19 +151,20 @@ def _geocode(lst): def _get_latlong(g): try: - return g['features'][0]['geometry']['coordinates'] + return g["features"][0]["geometry"]["coordinates"] except (KeyError, IndexError): return [np.nan, np.nan] def _get_confidence(g): try: - return g['features'][0]['properties']['confidence'] + return g["features"][0]["properties"]["confidence"] except (KeyError, IndexError): return np.nan - coords = pd.DataFrame([_get_latlong(g) for g in full_responses], - columns=['long', 'lat']) - coords = coords[['lat', 'long']] # it makes me feel better, OK? + coords = pd.DataFrame( + [_get_latlong(g) for g in full_responses], columns=["long", "lat"] + ) + coords = coords[["lat", "long"]] # it makes me feel better, OK? scores = np.array([_get_confidence(g) for g in full_responses]) return full_responses, coords, scores @@ -162,12 +175,14 @@ def _get_confidence(g): [post_process_f(geo_s) for geo_s in geostring_list] ) - return GeocodeResults(coords_raw=coords_raw, - full_responses_raw=full_responses_raw, - scores_raw=scores_raw, - coords_post=coords_post, - full_responses_post=full_responses_post, - scores_post=scores_post) + return GeocodeResults( + coords_raw=coords_raw, + full_responses_raw=full_responses_raw, + scores_raw=scores_raw, + coords_post=coords_post, + full_responses_post=full_responses_post, + scores_post=scores_post, + ) def load_model(location=MODEL_LOCATION): @@ -178,29 +193,32 @@ def load_model(location=MODEL_LOCATION): The files with the most recent timestamp are loaded. 
""" - models = glob.glob(os.path.join(location, 'weights*.hdf5')) + models = glob.glob(os.path.join(location, "weights*.hdf5")) if not models: - raise RuntimeError(('No models to load. Run' - ' "python -m tagnews.geoloc.models.' - 'lstm.save_model"')) + raise RuntimeError( + ( + "No models to load. Run" + ' "python -m tagnews.geoloc.models.' + 'lstm.save_model"' + ) + ) model = keras.models.load_model(models[-1]) return model -class GeoCoder(): +class GeoCoder: def __init__(self): self.model = load_model() self.glove = utils.load_vectorizer.load_glove( - os.path.join(os.path.split(__file__)[0], - '../data/glove.6B.50d.txt') + os.path.join(os.path.split(__file__)[0], "../data/glove.6B.50d.txt") ) with open(COMMUNITY_AREAS_FILE) as f: d = json.load(f) self.com_areas = { - f['properties']['community']: shape(f['geometry']) - for f in d['features'] + f["properties"]["community"]: shape(f["geometry"]) + for f in d["features"] } def pre_process(self, s): @@ -223,12 +241,14 @@ def pre_process(self, s): Has shape (1, N, M) where N is the number of words and M is the size of the word vectors, currently M is 51. """ - words = s.split() # split along white space. - data = pd.concat([pd.DataFrame([[w[0].isupper()] if w else [False] - for w in words]), - (self.glove.reindex(words).fillna(0) - .reset_index(drop=True))], - axis='columns') + words = s.split() # split along white space. + data = pd.concat( + [ + pd.DataFrame([[w[0].isupper()] if w else [False] for w in words]), + (self.glove.reindex(words).fillna(0).reset_index(drop=True)), + ], + axis="columns", + ) return words, np.expand_dims(data, axis=0) def extract_geostring_probs(self, s): @@ -271,24 +291,26 @@ def extract_geostrings(self, s, prob_thresh=0.5): geostrings : list of lists of strings The list of extracted geostrings from the article text. Each word is kept separated in the list. 
- Examle: + Example: [['1300', 'W.', 'Halsted'], ['Ohio']] """ words, probs = self.extract_geostring_probs(s) above_thresh = probs >= prob_thresh - words = ['filler'] + words + ['filler'] - above_thresh = np.concatenate([[False], - above_thresh, - [False]]).astype(np.int32) + words = ["filler"] + words + ["filler"] + probs = np.append(0, np.append(probs, 0)) + + above_thresh = np.concatenate([[False], above_thresh, [False]]).astype(np.int32) switch_ons = np.where(np.diff(above_thresh) == 1)[0] + 1 switch_offs = np.where(np.diff(above_thresh) == -1)[0] + 1 geostrings = [] + probstrings = [] for on, off in zip(switch_ons, switch_offs): geostrings.append(words[on:off]) + probstrings.append(probs[on:off]) - return geostrings + return geostrings, probstrings @staticmethod def lat_longs_from_geostring_lists(geostring_lists, **kwargs): @@ -317,7 +339,7 @@ def lat_longs_from_geostring_lists(geostring_lists, **kwargs): of absolute rule. """ out = get_lat_longs_from_geostrings( - [' '.join(gl) for gl in geostring_lists], **kwargs + [" ".join(gl) for gl in geostring_lists], **kwargs ) return out.coords_post, out.scores_post @@ -340,11 +362,40 @@ def community_area_from_coords(self, coords): """ out = [] for _, coord in coords.iterrows(): - p = Point(coord['long'], coord['lat']) + p = Point(coord["long"], coord["lat"]) for com_name, com_shape in self.com_areas.items(): if com_shape.contains(p): out.append(com_name) break else: - out.append('') + out.append("") return out + + def best_geostring(self, extracted_strs_and_probs: tuple): + """ + + Parameters + ---------- + extracted_strs_and_probs : 2-tuple + A 2-tuple of two lists containing a list of extracted geostrings at index zero + and a list of extracted geostring probabilities at index one + + Returns + ------- + 2-tuple of one geostring of the best geostring + """ + consider = [[], []] + for geostring, probs in zip( + extracted_strs_and_probs[0], extracted_strs_and_probs[1] + ): + is_neighborhood = False + for 
neighborhood in neighborhoods: + if neighborhood.lower() in " ".join(geostring).lower(): + is_neighborhood = True + if is_neighborhood or len(geostring) >= 3: + consider[0].append((geostring)) + consider[1].append((probs)) + + avgs = [sum(i) / len(i) for i in consider[1]] + max_index = avgs.index(max(avgs)) + return consider[0][max_index] diff --git a/lib/tagnews/tests/test_geocoder.py b/lib/tagnews/tests/test_geocoder.py index e610759..cb6e288 100644 --- a/lib/tagnews/tests/test_geocoder.py +++ b/lib/tagnews/tests/test_geocoder.py @@ -23,7 +23,7 @@ def test_extract_geostring_probs(self): max_word = words[np.argmax(probs)] geostrings = self.model.extract_geostrings(article, prob_thresh=max_prob-0.001) - assert max_word in [word for geostring in geostrings for word in geostring] + assert max_word in [word for geostring in geostrings for word in geostring][0] def test_extract_geostring_probs_word_not_in_glove(self): """ diff --git a/lib/tagnews/utils/neighborhoods.py b/lib/tagnews/utils/neighborhoods.py new file mode 100644 index 0000000..aa7f9f5 --- /dev/null +++ b/lib/tagnews/utils/neighborhoods.py @@ -0,0 +1,97 @@ +neighborhoods = [ + "Andersonville", + "Archer Heights", + "Ashburn", + "Ashburn Estates", + "Austin", + "Avondale", + "Belmont Central", + "Beverly", + "Beverly Woods", + "Brainerd", + "Bridgeport", + "Brighton Park", + "Bronzeville", + "Bucktown", + "Burnside", + "Calumet Heights", + "Canaryville", + "Clearing", + "Chatham", + "Chinatown", + "Cottage Grove Heights", + "Cragin", + "Dunning", + "East Chicago", + "Edison Park", + "Edgebrook", + "Edgewater", + "Englewood", + "Ford City", + "Gage Park", + "Galewood", + "Garfield Park", + "Garfield Ridge", + "Gold Coast", + "Grand Crossing", + "Gresham", + "Hamilton Park", + "Humboldt Park", + "Hyde Park", + "Jefferson Park", + "Kelvyn Park", + "Kenwood", + "Kilbourn Park", + "Lake Meadows", + "Lakeview", + "Lawndale", + "Lincoln Park", + "Lincoln Square", + "Little Village", + "Logan Square", + "Longwood 
Manor", + "Loop", + "Marquette Park", + "McKinley Park", + "Midway", + "Morgan Park", + "Montclare", + "Mount Greenwood", + "North Center", + "Norwood Park", + "Old Irving Park", + "Old Town", + "Park Manor", + "Pilsen", + "Princeton Park", + "Portage Park", + "Pullman", + "Ravenswood", + "River North", + "River West", + "Rogers Park", + "Roscoe Village", + "Roseland", + "Sauganash", + "Schorsch Village", + "Scottsdale", + "South Chicago", + "South Deering", + "South Loop", + "South Shore", + "Streeterville", + "Tri-Taylor", + "Ukrainian Village", + "United Center", + "Uptown", + "Vittum Park", + "Washington Heights", + "West Elsdon", + "West Loop", + "West Pullman", + "Westlawn", + "Wicker Park", + "Woodlawn", + "Wrigleyville", + "Wrightwood", +]