Skip to content

Commit

Permalink
Merge branch 'master' into v1.2.1
Browse files Browse the repository at this point in the history
  • Loading branch information
jlherzberg authored Sep 18, 2019
2 parents 37e381b + 48f9ace commit 9024375
Show file tree
Hide file tree
Showing 5 changed files with 219 additions and 70 deletions.
1 change: 1 addition & 0 deletions CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* @jherzberg @mchladek
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
else:
raise Exception('BAD ARGUMENTS')

crime_df = df.ix[df.loc[:, 'OEMC':'TASR'].any(1), :]
crime_df = df.loc[df.loc[:, 'OEMC':'TASR'].any(1), :]
crime_df = crime_df.append(
df.ix[~df['relevant'], :].sample(n=min(3000, (~df['relevant']).sum()),
df.loc[~df['relevant'], :].sample(n=min(3000, (~df['relevant']).sum()),
axis=0)
)

Expand Down
185 changes: 118 additions & 67 deletions lib/tagnews/geoloc/tag.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
from __future__ import division

import os
from collections import namedtuple
import glob
import time
import json
import os
import re
import time
from collections import namedtuple
from contextlib import ExitStack, redirect_stderr

import requests
import pandas as pd
import numpy as np
import pandas as pd
import requests
from shapely.geometry import shape, Point

from tagnews.utils.neighborhoods import neighborhoods
from .. import utils

from contextlib import ExitStack, redirect_stderr

with ExitStack() as stack:
null_stream = open(os.devnull, "w")
stack.enter_context(null_stream)
Expand All @@ -26,12 +26,15 @@
Contains the CrimeTags class that allows tagging of articles.
"""

MODEL_LOCATION = os.path.join(os.path.split(__file__)[0],
os.path.join('models', 'lstm', 'saved'))
MODEL_LOCATION = os.path.join(
os.path.split(__file__)[0], os.path.join("models", "lstm", "saved")
)

COMMUNITY_AREAS_FILE = os.path.join(
os.path.split(__file__)[0], '..', 'data',
'Boundaries - Community Areas (current).geojson'
os.path.split(__file__)[0],
"..",
"data",
"Boundaries - Community Areas (current).geojson",
)


Expand All @@ -50,39 +53,48 @@ def post_process(geostring):
processed_geostring : str
"""
# Merge multiple whitespaces into one
geostring = ' '.join(geostring.split())
geostring = " ".join(geostring.split())

# gisgraphy struggles with things like "55th and Woodlawn".
# replace "...<number><number ender, e.g. th or rd> and..."
# with two zeros.
# \100 does not work correctly so we need to add a separator.
geostring = re.sub(r'([0-9]+)(th|rd|st) and',
r'\1<__internal_separator__>00 and',
geostring)
geostring = geostring.replace('<__internal_separator__>', '')
geostring = re.sub(
r"([0-9]+)(th|rd|st) and", r"\1<__internal_separator__>00 and", geostring
)
geostring = geostring.replace("<__internal_separator__>", "")

# remove stopwords, only if they are internal, i.e.
# the geostring doesn't start with "block ...".
for stopword in ['block', 'of', 'and']:
geostring = geostring.replace(' {} '.format(stopword), ' ')
for stopword in ["block", "of", "and"]:
geostring = geostring.replace(" {} ".format(stopword), " ")

return geostring


_base_geocoder_url = ('http://ec2-34-228-58-223.compute-1.amazonaws.com'
':4000/v1/search?text={}')
_base_geocoder_url = (
"http://ec2-34-228-58-223.compute-1.amazonaws.com" ":4000/v1/search?text={}"
)

GeocodeResults = namedtuple('GeocodeResults', ['coords_raw',
'full_responses_raw',
'scores_raw',
'coords_post',
'full_responses_post',
'scores_post'])
GeocodeResults = namedtuple(
"GeocodeResults",
[
"coords_raw",
"full_responses_raw",
"scores_raw",
"coords_post",
"full_responses_post",
"scores_post",
],
)


def get_lat_longs_from_geostrings(geostring_list, post_process_f=None,
sleep_secs=0,
geocoder_url_formatter=_base_geocoder_url):
def get_lat_longs_from_geostrings(
geostring_list,
post_process_f=None,
sleep_secs=0,
geocoder_url_formatter=_base_geocoder_url,
):
"""
Geo-code each geostring in `geostring_list` into lat/long values.
Also return the full response from the geocoding service.
Expand Down Expand Up @@ -129,29 +141,30 @@ def _geocode(lst):
full_responses = []
for addr_str in lst:
try:
g = json.loads(requests.get(
geocoder_url_formatter.format(addr_str)
).text)
g = json.loads(
requests.get(geocoder_url_formatter.format(addr_str)).text
)
except Exception:
g = {}
full_responses.append(g)
time.sleep(sleep_secs)

def _get_latlong(g):
try:
return g['features'][0]['geometry']['coordinates']
return g["features"][0]["geometry"]["coordinates"]
except (KeyError, IndexError):
return [np.nan, np.nan]

def _get_confidence(g):
try:
return g['features'][0]['properties']['confidence']
return g["features"][0]["properties"]["confidence"]
except (KeyError, IndexError):
return np.nan

coords = pd.DataFrame([_get_latlong(g) for g in full_responses],
columns=['long', 'lat'])
coords = coords[['lat', 'long']] # it makes me feel better, OK?
coords = pd.DataFrame(
[_get_latlong(g) for g in full_responses], columns=["long", "lat"]
)
coords = coords[["lat", "long"]] # it makes me feel better, OK?
scores = np.array([_get_confidence(g) for g in full_responses])

return full_responses, coords, scores
Expand All @@ -162,12 +175,14 @@ def _get_confidence(g):
[post_process_f(geo_s) for geo_s in geostring_list]
)

return GeocodeResults(coords_raw=coords_raw,
full_responses_raw=full_responses_raw,
scores_raw=scores_raw,
coords_post=coords_post,
full_responses_post=full_responses_post,
scores_post=scores_post)
return GeocodeResults(
coords_raw=coords_raw,
full_responses_raw=full_responses_raw,
scores_raw=scores_raw,
coords_post=coords_post,
full_responses_post=full_responses_post,
scores_post=scores_post,
)


def load_model(location=MODEL_LOCATION):
Expand All @@ -178,29 +193,32 @@ def load_model(location=MODEL_LOCATION):
The files with the most recent timestamp are loaded.
"""
models = glob.glob(os.path.join(location, 'weights*.hdf5'))
models = glob.glob(os.path.join(location, "weights*.hdf5"))
if not models:
raise RuntimeError(('No models to load. Run'
' "python -m tagnews.geoloc.models.'
'lstm.save_model"'))
raise RuntimeError(
(
"No models to load. Run"
' "python -m tagnews.geoloc.models.'
'lstm.save_model"'
)
)

model = keras.models.load_model(models[-1])

return model


class GeoCoder():
class GeoCoder:
def __init__(self):
self.model = load_model()
self.glove = utils.load_vectorizer.load_glove(
os.path.join(os.path.split(__file__)[0],
'../data/glove.6B.50d.txt')
os.path.join(os.path.split(__file__)[0], "../data/glove.6B.50d.txt")
)
with open(COMMUNITY_AREAS_FILE) as f:
d = json.load(f)
self.com_areas = {
f['properties']['community']: shape(f['geometry'])
for f in d['features']
f["properties"]["community"]: shape(f["geometry"])
for f in d["features"]
}

def pre_process(self, s):
Expand All @@ -223,12 +241,14 @@ def pre_process(self, s):
Has shape (1, N, M) where N is the number of words and M
is the size of the word vectors, currently M is 51.
"""
words = s.split() # split along white space.
data = pd.concat([pd.DataFrame([[w[0].isupper()] if w else [False]
for w in words]),
(self.glove.reindex(words).fillna(0)
.reset_index(drop=True))],
axis='columns')
words = s.split() # split along white space.
data = pd.concat(
[
pd.DataFrame([[w[0].isupper()] if w else [False] for w in words]),
(self.glove.reindex(words).fillna(0).reset_index(drop=True)),
],
axis="columns",
)
return words, np.expand_dims(data, axis=0)

def extract_geostring_probs(self, s):
Expand Down Expand Up @@ -271,24 +291,26 @@ def extract_geostrings(self, s, prob_thresh=0.5):
geostrings : list of lists of strings
The list of extracted geostrings from the article text.
Each word is kept separated in the list.
Examle:
Example:
[['1300', 'W.', 'Halsted'], ['Ohio']]
"""
words, probs = self.extract_geostring_probs(s)
above_thresh = probs >= prob_thresh

words = ['filler'] + words + ['filler']
above_thresh = np.concatenate([[False],
above_thresh,
[False]]).astype(np.int32)
words = ["filler"] + words + ["filler"]
probs = np.append(0, np.append(probs, 0))

above_thresh = np.concatenate([[False], above_thresh, [False]]).astype(np.int32)
switch_ons = np.where(np.diff(above_thresh) == 1)[0] + 1
switch_offs = np.where(np.diff(above_thresh) == -1)[0] + 1

geostrings = []
probstrings = []
for on, off in zip(switch_ons, switch_offs):
geostrings.append(words[on:off])
probstrings.append(probs[on:off])

return geostrings
return geostrings, probstrings

@staticmethod
def lat_longs_from_geostring_lists(geostring_lists, **kwargs):
Expand Down Expand Up @@ -317,7 +339,7 @@ def lat_longs_from_geostring_lists(geostring_lists, **kwargs):
of absolute rule.
"""
out = get_lat_longs_from_geostrings(
[' '.join(gl) for gl in geostring_lists], **kwargs
[" ".join(gl) for gl in geostring_lists], **kwargs
)

return out.coords_post, out.scores_post
Expand All @@ -340,11 +362,40 @@ def community_area_from_coords(self, coords):
"""
out = []
for _, coord in coords.iterrows():
p = Point(coord['long'], coord['lat'])
p = Point(coord["long"], coord["lat"])
for com_name, com_shape in self.com_areas.items():
if com_shape.contains(p):
out.append(com_name)
break
else:
out.append('')
out.append("")
return out

def best_geostring(self, extracted_strs_and_probs: tuple):
    """
    Pick the most probable geostring among the plausible candidates.

    A geostring is only considered when it has at least three words or
    when it contains the name of a known neighborhood (case-insensitive
    substring match against the module-level `neighborhoods`). Among the
    considered geostrings, the one with the highest mean per-word
    probability wins; ties go to the earliest one.

    Parameters
    ----------
    extracted_strs_and_probs : 2-tuple
        A 2-tuple of two parallel lists: the extracted geostrings
        (each a list of words) at index zero and the matching
        per-word probabilities at index one.

    Returns
    -------
    best_geostring : list of str
        The words of the best geostring. An empty list is returned
        when no geostring qualifies (including empty input) instead
        of failing on `max()` of an empty sequence.
    """
    candidates = []
    for geostring, probs in zip(
        extracted_strs_and_probs[0], extracted_strs_and_probs[1]
    ):
        # Cheap length check first; only scan the neighborhood names
        # for short geostrings, and stop at the first match.
        if len(geostring) >= 3:
            keep = True
        else:
            joined = " ".join(geostring).lower()
            keep = any(name.lower() in joined for name in neighborhoods)
        if keep:
            candidates.append((geostring, probs))

    if not candidates:
        # Nothing plausible was extracted; there is no "best" geostring.
        return []

    avgs = [sum(probs) / len(probs) for _, probs in candidates]
    # list.index returns the first position of the max, so ties keep
    # the earliest candidate.
    return candidates[avgs.index(max(avgs))][0]
2 changes: 1 addition & 1 deletion lib/tagnews/tests/test_geocoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_extract_geostring_probs(self):
max_word = words[np.argmax(probs)]
geostrings = self.model.extract_geostrings(article,
prob_thresh=max_prob-0.001)
assert max_word in [word for geostring in geostrings for word in geostring]
assert max_word in [word for geostring in geostrings for word in geostring][0]

def test_extract_geostring_probs_word_not_in_glove(self):
"""
Expand Down
Loading

0 comments on commit 9024375

Please sign in to comment.