From 35bf91cb5eabfeab39c0d10dd916293caa179271 Mon Sep 17 00:00:00 2001 From: Allen Tran Date: Sun, 19 Dec 2021 17:09:49 -0800 Subject: [PATCH 01/10] autofind categories and common column names --- data/stiles_data/parse_la_data.py | 384 ++++++++++-------------------- 1 file changed, 129 insertions(+), 255 deletions(-) diff --git a/data/stiles_data/parse_la_data.py b/data/stiles_data/parse_la_data.py index 58d997b..edccff5 100644 --- a/data/stiles_data/parse_la_data.py +++ b/data/stiles_data/parse_la_data.py @@ -1,11 +1,33 @@ import argparse from pathlib import Path +from typing import List import geopandas as gpd class CityParser(object): + # these are case insensitive + name_common_columns = ['name_common', 'species', 'com_name'] + name_botanical_columns = ['name_botanical', 'botanical', 'botanicaln'] + address_columns = ['address'] + diameter_min_in_columns = ['diameter_min_in'] + diameter_max_in_columns = ['diameter_max_in'] + diameter_columns = ['diameter'] + height_min_feet_columns = ['height_min_feet'] + height_max_feet_columns = ['height_max_feet'] + tree_id_columns = ['tree_id', 'inventoryid', 'tree', 'inventoryi', 'treeid'] + est_value_columns = ['estimated_value', 'est_value', 'estvalue'] + + height_tuples = [ + ('height', 'height_min_feet', 'height_max_feet'), + ('HEIGHT_RAN', 'height_min_feet', 'height_max_feet'), + ] + diameter_tuples = [ + ('diameter', 'diameter_min_in', 'diameter_max_in'), + ('DBH', 'diameter_min_in', 'diameter_max_in'), + ] + def __init__(self, path: Path): geo_jsons = [p for p in path.iterdir() if p.is_file() and p.suffix == '.geojson'] self.city = path.parts[-1] @@ -15,10 +37,59 @@ def __init__(self, path: Path): else: self.geo_json_path = None - def get_maximal_df(self): + def get_min_max_columns(self, df, range_col_tuples): + for (range_col, min_col, max_col) in range_col_tuples: + if range_col in df.columns: + return self.cat_parser(df, min_col, max_col, range_col) + return df + + def get_column(self, df, potential_columns: 
List[str], titleize=False): + column_name = potential_columns[0] + df_columns = [s.strip().lower() for s in df.columns] + for potential_column in potential_columns: + if potential_column in df_columns: + idx = df_columns.index(potential_column) + column = df.columns[idx] + if titleize: + df[column_name] = df[column].str.title() + else: + df[column_name] = df[column] + return df + return df + + def filter_columns(self, df): + potential_columns = { + 'name_common', + 'latitude', + 'longitude', + 'city', + 'diameter_min_in', + 'diameter_max_in', + 'height_min_feet', + 'height_max_feet', + } + actual_columns = potential_columns & set(df.columns) + return df[list(actual_columns)] + + def read_df(self): assert self.geo_json_path - df = gpd.read_file(str(self.geo_json_path.absolute())).assign(city=self.city) - return self.lat_lon_from_geometry(df) + return gpd.read_file(str(self.geo_json_path.absolute())).assign(city=self.city) + + def get_maximal_df(self, df=None): + if df is None: + df = self.read_df() + df = self.lat_lon_from_geometry(df) + df = self.get_column(df, self.address_columns, titleize=True) + df = self.get_column(df, self.name_common_columns, titleize=True) + df = self.get_column(df, self.name_botanical_columns, titleize=True) + + df = self.get_min_max_columns(df, self.height_tuples) + df = self.get_min_max_columns(df, self.diameter_tuples) + + df = self.get_column(df, self.diameter_min_in_columns) + df = self.get_column(df, self.diameter_max_in_columns) + + return self.filter_columns(df).drop_duplicates() @staticmethod def lat_lon_from_geometry(df, y_is_lat=True): @@ -33,12 +104,11 @@ def lat_lon_from_geometry(df, y_is_lat=True): ) @staticmethod - def cat_parser(df, min_field, max_field, og_field, cats): - actual_cats = [cat for cat in df[og_field].unique().tolist() if set(cat.strip()) != {'-'}] - if len(actual_cats) > len(cats): - raise RuntimeError(f'{len(cats)} categories but categories in df={df[og_field].unique().tolist()}') - df[min_field] = -1 - 
df[max_field] = -1 + def cat_parser(df, min_field, max_field, og_field, cats=None): + if cats is None: + cats = [cat.strip() for cat in df[og_field].unique().tolist() if set(cat.strip()) != {'-'}] + df[min_field] = None + df[max_field] = None for cat in cats: mask = df[og_field].str.strip() == cat if len(cat.split('-')) == 2: @@ -59,26 +129,12 @@ class LosAngelesCityParser(CityParser): def __init__(self, path: Path): super().__init__(path) - def get_maximal_df(self): - df = super().get_maximal_df() - df = df.assign(name_common=df['species'].str.title()) - df = self.lat_lon_from_geometry(df) - return df[['name_common', 'latitude', 'longitude', 'city']] - class LosAngelesCountyParser(CityParser): def __init__(self, path: Path): super().__init__(path) - def get_maximal_df(self): - df = super().get_maximal_df() - df = df.assign( - name_common=df['SPECIES'].str.title(), - diameter_min_in=df['DIAMETER'] - ) - return df[['name_common', 'latitude', 'longitude', 'diameter_min_in', 'city']] - class AgouraHillsParser(CityParser): @@ -86,88 +142,23 @@ def __init__(self, path: Path): super().__init__(path) def get_maximal_df(self): - df = super().get_maximal_df() + df = self.read_df() df = df.assign( - tree_id=df['InventoryID'], - name_common=df['species'].str.title(), - name_botanical=df['botanical'].str.title(), address=df['Address'].astype(str).str.cat(df['Street'].str.title(), sep=' ') - ) - df = self.cat_parser( - df, - 'diameter_min_in', - 'diameter_max_in', - 'DBH', - ['0-6', '07-12', '13-18', '19-24', '25-30', '31+'] - ) - df = self.cat_parser( - df, - 'height_min_feet', - 'height_max_feet', - 'height', - ['01-15', '15-30', '30-45', '45-60', '60+'] - ) - - return df[ - [ - 'tree_id', - 'name_common', - 'name_botanical', - 'address', - 'height_min_feet', - 'city', - 'height_max_feet', - 'latitude', - 'longitude', - 'diameter_min_in', - 'diameter_max_in', - ] - ] + ).drop('Address', axis=1) + df = super().get_maximal_df(df=df) + return self.filter_columns(df) class 
AlhambraParser(CityParser): - def __init__(self, path: Path): - super().__init__(path) - def get_maximal_df(self): - df = super().get_maximal_df() + df = self.read_df() df = df.assign( - name_common=df['species'].str.title(), - name_botanical=df['BotanicalN'].str.title(), - tree_id=df['Tree'], address=df['Address'].astype(str).str.cat(df['Street'].str.title(), sep=' ') - ) - df = self.cat_parser( - df, - 'diameter_min_in', - 'diameter_max_in', - 'DBH', - ['0-6', '07-12', '13-18', '19-24', '25-30', '31+'] - ) - df = self.cat_parser( - df, - 'height_min_feet', - 'height_max_feet', - 'height', - ['01-15', '15-30', '30-45', '45-60', '60+'] - ) - return df[ - [ - 'name_common', - 'name_botanical', - 'city', - 'tree_id', - 'address', - 'height_min_feet', - 'height_max_feet', - 'latitude', - 'longitude', - 'diameter_min_in', - 'diameter_max_in', - - ] - ] + ).drop('Address', axis=1) + df = super().get_maximal_df(df=df) + return self.filter_columns(df) class ArcadiaParser(CityParser): @@ -176,23 +167,12 @@ def __init__(self, path: Path): super().__init__(path) def get_maximal_df(self): - df = super().get_maximal_df() + df = self.read_df() df = df.assign( - tree_id=df['TREE_ID'], - name_common=df['COM_NAME'].str.title(), address=df['ADDR'].str.split('ARCADIA').str[0].str.title() ) - return df[ - [ - 'name_common', - 'city', - 'tree_id', - 'address', - 'latitude', - 'longitude', - - ] - ] + df = super().get_maximal_df(df=df) + return self.filter_columns(df) class BellflowerParser(CityParser): @@ -201,45 +181,12 @@ def __init__(self, path: Path): super().__init__(path) def get_maximal_df(self): - df = super().get_maximal_df() + df = self.read_df() df = df.assign( - estimated_value=df['EstValue'], - name_common=df['species'].str.title(), - name_botanical=df['botanical'].str.title(), address=df['Address'].astype(str).str.cat(df['Street'].str.title(), sep=' '), - tree_id=df['InventoryID'], - ) - df = self.cat_parser( - df, - 'diameter_min_in', - 'diameter_max_in', - 'DBH', - 
['0-6', '07-12', '13-18', '19-24', '25-30', '31+', '---'] - ) - df = self.cat_parser( - df, - 'height_min_feet', - 'height_max_feet', - 'height', - ['01-15', '15-30', '30-45', '45-60', '60+', '---'] - ) - - return df[ - [ - 'name_common', - 'name_botanical', - 'city', - 'tree_id', - 'address', - 'estimated_value', - 'height_min_feet', - 'height_max_feet', - 'latitude', - 'longitude', - 'diameter_min_in', - 'diameter_max_in', - ] - ] + ).drop('Address', axis=1) + df = super().get_maximal_df(df=df) + return self.filter_columns(df) class BellGardensParser(CityParser): @@ -248,43 +195,12 @@ def __init__(self, path: Path): super().__init__(path) def get_maximal_df(self): - df = super().get_maximal_df() + df = self.read_df() df = df.assign( - name_common=df['species'].str.title(), - name_botanical=df['BOTANICALN'].str.title(), - tree_id=df['INVENTORYI'], address=df['ADDRESS'].astype(str).str.cat(df['STREET'].str.title(), sep=' ') - ) - df = self.cat_parser( - df, - 'diameter_min_in', - 'diameter_max_in', - 'DBH', - ['0-6', '07-12', '13-18', '19-24', '25-30', '31+'] - ) - df = self.cat_parser( - df, - 'height_min_feet', - 'height_max_feet', - 'height', - ['01-15', '15-30', '30-45', '45-60', '60+'] - ) - - return df[ - [ - 'name_common', - 'name_botanical', - 'city', - 'tree_id', - 'address', - 'height_min_feet', - 'height_max_feet', - 'latitude', - 'longitude', - 'diameter_min_in', - 'diameter_max_in', - ] - ] + ).drop('ADDRESS', axis=1) + df = super().get_maximal_df(df=df) + return self.filter_columns(df) class ArtesiaParser(CityParser): @@ -293,92 +209,48 @@ def __init__(self, path: Path): super().__init__(path) def get_maximal_df(self): - df = super().get_maximal_df() + df = self.read_df() df = df.assign( - tree_id=df['INVENTORYI'], address=df['ADDRESS'].astype(str).str.cat(df['STREET'].str.title(), sep=' '), - name_botanical=df['BOTANICALN'].str.title(), - name_common=df['species'].str.title(), - ) - df = self.cat_parser( - df, - 'diameter_min_in', - 
'diameter_max_in', - 'DBH', - ['0-6', '07-12', '13-18', '19-24', '25-30', '31+'] - ) - df = self.cat_parser( - df, - 'height_min_feet', - 'height_max_feet', - 'height', - ['01-15', '15-30', '30-45', '45-60', '60+'] - ) - return df[ - [ - 'name_common', - 'name_botanical', - 'city', - 'tree_id', - 'address', - 'height_min_feet', - 'height_max_feet', - 'latitude', - 'longitude', - 'diameter_min_in', - 'diameter_max_in', - - ] - ] + ).drop('ADDRESS', axis=1) + df = super().get_maximal_df(df=df) + return self.filter_columns(df) class BeverlyHillsParser(CityParser): - def __init__(self, path: Path): - super().__init__(path) - def get_maximal_df(self): - df = super().get_maximal_df() + df = self.read_df() df = df.assign( - tree_id=df['TREEID'], address=df['ADDRESS'].astype(str).str.cat(df['STREET'].str.title(), sep=' '), - name_botanical=df['BOTANICAL'].str.title(), - name_common=df['species'].str.title(), - ) - df = self.cat_parser( - df, - 'height_min_feet', - 'height_max_feet', - 'HEIGHT_RAN', - ['1-15', '16-30', '31-45', '46-60', '>60', '------', ''] - ) - return df[ - [ - 'name_common', - 'name_botanical', - 'city', - 'tree_id', - 'address', - 'height_min_feet', - 'height_max_feet', - 'latitude', - 'longitude', - ] - ] + ).drop('ADDRESS', axis=1) + df = super().get_maximal_df(df=df) + return self.filter_columns(df) + + +class LongBeachParser(CityParser): + def get_maximal_df(self, df=None): + df = self.read_df().drop('ADDRESS', axis=1) + df = super().get_maximal_df(df=df) + return self.filter_columns(df) + + +# class SantaClarita(CityParser): class StilesDataParser(object): mapper = { - 'los-angeles-city': LosAngelesCityParser, - 'los-angeles-county': LosAngelesCountyParser, - 'agoura-hills': AgouraHillsParser, - 'alhambra' : AlhambraParser, - 'arcadia': ArcadiaParser, - 'artesia': ArtesiaParser, - 'bell-gardens': BellGardensParser, - 'bellflower': BellflowerParser, - 'beverly-hills': BeverlyHillsParser + # 'los-angeles-city': LosAngelesCityParser, + # 
'los-angeles-county': LosAngelesCountyParser, + # 'agoura-hills': AgouraHillsParser, + # 'alhambra' : AlhambraParser, + # 'arcadia': ArcadiaParser, + # 'artesia': ArtesiaParser, + # 'bell-gardens': BellGardensParser, + # 'bellflower': BellflowerParser, + # 'beverly-hills': BeverlyHillsParser, + 'long-beach': LongBeachParser } def __init__(self, data_path): @@ -392,7 +264,9 @@ def parse_all(self): if city in self.mapper: city_parser = self.mapper[city](data_dir) if city_parser.geo_json_path: - city_parser.get_maximal_df() + df = city_parser.get_maximal_df() + print(city, len(df)) + print(df) if __name__ == "__main__": From 3eb969fdd88a4280359a5a7af66c5a57099ba8f8 Mon Sep 17 00:00:00 2001 From: Allen Tran Date: Thu, 23 Dec 2021 10:52:59 -0800 Subject: [PATCH 02/10] custom parsers up to Pomona in order of size --- data/stiles_data/parse_la_data.py | 147 ++++++++++++++++++------------ 1 file changed, 91 insertions(+), 56 deletions(-) diff --git a/data/stiles_data/parse_la_data.py b/data/stiles_data/parse_la_data.py index edccff5..821153b 100644 --- a/data/stiles_data/parse_la_data.py +++ b/data/stiles_data/parse_la_data.py @@ -1,7 +1,8 @@ import argparse from pathlib import Path -from typing import List +from typing import List, Set +import pandas as pd import geopandas as gpd @@ -10,13 +11,15 @@ class CityParser(object): # these are case insensitive name_common_columns = ['name_common', 'species', 'com_name'] name_botanical_columns = ['name_botanical', 'botanical', 'botanicaln'] + condition = ['condition', 'treecondition'] address_columns = ['address'] diameter_min_in_columns = ['diameter_min_in'] diameter_max_in_columns = ['diameter_max_in'] - diameter_columns = ['diameter'] + exact_diameter_columns = ['exact_diameter', 'diameter', 'exact_dbh', 'trunk_diam', 'actualdbh'] height_min_feet_columns = ['height_min_feet'] height_max_feet_columns = ['height_max_feet'] - tree_id_columns = ['tree_id', 'inventoryid', 'tree', 'inventoryi', 'treeid'] + exact_height_columns = 
['exact_height', 'exact_heigh', 'height', 'actualheight'] + tree_id_columns = ['tree_id', 'inventoryid', 'tree', 'inventoryi', 'treeid', 'objectid'] est_value_columns = ['estimated_value', 'est_value', 'estvalue'] height_tuples = [ @@ -28,17 +31,23 @@ class CityParser(object): ('DBH', 'diameter_min_in', 'diameter_max_in'), ] - def __init__(self, path: Path): - geo_jsons = [p for p in path.iterdir() if p.is_file() and p.suffix == '.geojson'] - self.city = path.parts[-1] - assert len(geo_jsons) <= 1 + def __init__(self, city, path: Path, geojson_path: Path=None): + self.city = city + geo_jsons = [] + if path: + geo_jsons = [p for p in path.iterdir() if p.is_file() and p.suffix == '.geojson'] + assert len(geo_jsons) <= 1 if len(geo_jsons) > 0: self.geo_json_path = geo_jsons[-1] + elif geojson_path: + self.geo_json_path = geojson_path else: self.geo_json_path = None - def get_min_max_columns(self, df, range_col_tuples): + def get_min_max_columns(self, df, range_col_tuples, skip_col: Set[str] = None): for (range_col, min_col, max_col) in range_col_tuples: + if skip_col and range_col in skip_col: + continue if range_col in df.columns: return self.cat_parser(df, min_col, max_col, range_col) return df @@ -60,6 +69,13 @@ def get_column(self, df, potential_columns: List[str], titleize=False): def filter_columns(self, df): potential_columns = { 'name_common', + 'name_botanical', + 'condition', + 'exact_diameter', + 'exact_height', + 'tree_id', + 'estimated_value', + 'address', 'latitude', 'longitude', 'city', @@ -106,7 +122,7 @@ def lat_lon_from_geometry(df, y_is_lat=True): @staticmethod def cat_parser(df, min_field, max_field, og_field, cats=None): if cats is None: - cats = [cat.strip() for cat in df[og_field].unique().tolist() if set(cat.strip()) != {'-'}] + cats = [cat.strip() for cat in df[og_field].unique().tolist() if cat is not None and set(cat.strip()) != {'-'}] df[min_field] = None df[max_field] = None for cat in cats: @@ -125,22 +141,7 @@ def cat_parser(df, 
min_field, max_field, og_field, cats=None): return df -class LosAngelesCityParser(CityParser): - def __init__(self, path: Path): - super().__init__(path) - - -class LosAngelesCountyParser(CityParser): - - def __init__(self, path: Path): - super().__init__(path) - - class AgouraHillsParser(CityParser): - - def __init__(self, path: Path): - super().__init__(path) - def get_maximal_df(self): df = self.read_df() df = df.assign( @@ -151,7 +152,6 @@ def get_maximal_df(self): class AlhambraParser(CityParser): - def get_maximal_df(self): df = self.read_df() df = df.assign( @@ -162,10 +162,6 @@ def get_maximal_df(self): class ArcadiaParser(CityParser): - - def __init__(self, path: Path): - super().__init__(path) - def get_maximal_df(self): df = self.read_df() df = df.assign( @@ -176,10 +172,6 @@ def get_maximal_df(self): class BellflowerParser(CityParser): - - def __init__(self, path: Path): - super().__init__(path) - def get_maximal_df(self): df = self.read_df() df = df.assign( @@ -190,10 +182,6 @@ def get_maximal_df(self): class BellGardensParser(CityParser): - - def __init__(self, path: Path): - super().__init__(path) - def get_maximal_df(self): df = self.read_df() df = df.assign( @@ -204,10 +192,6 @@ def get_maximal_df(self): class ArtesiaParser(CityParser): - - def __init__(self, path: Path): - super().__init__(path) - def get_maximal_df(self): df = self.read_df() df = df.assign( @@ -218,9 +202,9 @@ def get_maximal_df(self): class BeverlyHillsParser(CityParser): - def get_maximal_df(self): df = self.read_df() + df = df.rename(columns={'height': 'exact_height'}) df = df.assign( address=df['ADDRESS'].astype(str).str.cat(df['STREET'].str.title(), sep=' '), ).drop('ADDRESS', axis=1) @@ -235,14 +219,59 @@ def get_maximal_df(self, df=None): return self.filter_columns(df) -# class SantaClarita(CityParser): +class SantaClaritaParser(CityParser): + def get_maximal_df(self, df=None): + df = self.read_df() + df = df[df['PROP_ADR'].notnull()] + df = df.assign( + 
address=df['PROP_ADR'].astype(str).str.cat(df['PROPSTREET'].str.title(), sep=' '), + ).drop('ADDRESS', axis=1) + df = super().get_maximal_df(df=df) + return self.filter_columns(df) + + +class PasadenaParser(CityParser): + def get_maximal_df(self, df=None): + df = self.read_df() + df['Botanical'] = df['Genus'].str.cat(df['Species'], sep=' ').str.title() + df = df.drop('Species', axis=1) + + df['House_Numb'] = df['House_Numb'].astype(pd.Int64Dtype()) + mask = df['Street_Dir'].isnull() + df['Street'] = df['Street_Nam'].astype(str).str.cat(df['Street_Typ'], sep=' ') + df['Address'] = df['House_Numb'].astype(str).str.cat(df['Street'], sep=' ') + df.loc[mask, 'Address'] = df.loc[mask, 'House_Numb'].astype(str).str.cat( + df.loc[mask, 'Street_Nam'], sep=' ' + ).str.cat(df.loc[mask, 'Street_Typ'], sep=' ') + + df = super().get_maximal_df(df=df) + return self.filter_columns(df) + + +class GlendaleParser(CityParser): + def get_maximal_df(self, df=None): + df = self.read_df() + df = df.drop('Address', axis=1) + df['address'] = df['OnAddress'].astype(str).str.cat(df['OnStreet'].astype(str).str.strip(), sep=' ') + df = super().get_maximal_df(df=df) + return self.filter_columns(df) + + +class PomonaParser(CityParser): + def get_maximal_df(self, df=None): + df = self.read_df() + df['address'] = df['ADDRESS'].astype(str).str.cat(df['STREET'].astype(str).str.strip(), sep=' ') + df = df.drop('ADDRESS', axis=1) + + df = super().get_maximal_df(df=df) + return self.filter_columns(df) class StilesDataParser(object): mapper = { - # 'los-angeles-city': LosAngelesCityParser, - # 'los-angeles-county': LosAngelesCountyParser, + # 'los-angeles-city': CityParser, + # 'los-angeles-county': CityParser, # 'agoura-hills': AgouraHillsParser, # 'alhambra' : AlhambraParser, # 'arcadia': ArcadiaParser, @@ -250,23 +279,29 @@ class StilesDataParser(object): # 'bell-gardens': BellGardensParser, # 'bellflower': BellflowerParser, # 'beverly-hills': BeverlyHillsParser, - 'long-beach': LongBeachParser + # 
'long-beach': LongBeachParser, + # 'santa-clarita': SantaClaritaParser, + # 'pasadena': PasadenaParser, + # 'glendale': GlendaleParser, + 'pomona': PomonaParser, } def __init__(self, data_path): root_path = Path(data_path) - self.data_dirs = ([x for x in root_path.iterdir() if x.is_dir()]) + self.data_dirs = {x.name: x for x in root_path.iterdir() if x.is_dir()} + all_path = Path(f'{data_path}/all') + self.geojsons = {geojson_path.name.split('.')[0]: geojson_path for geojson_path in all_path.glob('*.geojson')} def parse_all(self): - for data_dir in self.data_dirs: - city = data_dir.parts[-1] - if city != 'all': - if city in self.mapper: - city_parser = self.mapper[city](data_dir) - if city_parser.geo_json_path: - df = city_parser.get_maximal_df() - print(city, len(df)) - print(df) + for city in self.mapper: + if city in self.mapper: + city_parser = self.mapper[city]( + city, + self.data_dirs[city] if city in self.data_dirs else None, + self.geojsons[city] if city in self.geojsons else None + ) + if city_parser.geo_json_path: + df = city_parser.get_maximal_df() if __name__ == "__main__": From 5c43625c7323e5942f538e943d1f319da878a9eb Mon Sep 17 00:00:00 2001 From: Allen Tran Date: Thu, 23 Dec 2021 10:58:16 -0800 Subject: [PATCH 03/10] bug in drop column for santa clarita --- data/stiles_data/parse_la_data.py | 32 +++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/data/stiles_data/parse_la_data.py b/data/stiles_data/parse_la_data.py index 821153b..709f815 100644 --- a/data/stiles_data/parse_la_data.py +++ b/data/stiles_data/parse_la_data.py @@ -225,7 +225,7 @@ def get_maximal_df(self, df=None): df = df[df['PROP_ADR'].notnull()] df = df.assign( address=df['PROP_ADR'].astype(str).str.cat(df['PROPSTREET'].str.title(), sep=' '), - ).drop('ADDRESS', axis=1) + ) df = super().get_maximal_df(df=df) return self.filter_columns(df) @@ -270,19 +270,19 @@ def get_maximal_df(self, df=None): class StilesDataParser(object): mapper = { - # 
'los-angeles-city': CityParser, - # 'los-angeles-county': CityParser, - # 'agoura-hills': AgouraHillsParser, - # 'alhambra' : AlhambraParser, - # 'arcadia': ArcadiaParser, - # 'artesia': ArtesiaParser, - # 'bell-gardens': BellGardensParser, - # 'bellflower': BellflowerParser, - # 'beverly-hills': BeverlyHillsParser, - # 'long-beach': LongBeachParser, - # 'santa-clarita': SantaClaritaParser, - # 'pasadena': PasadenaParser, - # 'glendale': GlendaleParser, + 'los-angeles-city': CityParser, + 'los-angeles-county': CityParser, + 'agoura-hills': AgouraHillsParser, + 'alhambra' : AlhambraParser, + 'arcadia': ArcadiaParser, + 'artesia': ArtesiaParser, + 'bell-gardens': BellGardensParser, + 'bellflower': BellflowerParser, + 'beverly-hills': BeverlyHillsParser, + 'long-beach': LongBeachParser, + 'santa-clarita': SantaClaritaParser, + 'pasadena': PasadenaParser, + 'glendale': GlendaleParser, 'pomona': PomonaParser, } @@ -293,6 +293,7 @@ def __init__(self, data_path): self.geojsons = {geojson_path.name.split('.')[0]: geojson_path for geojson_path in all_path.glob('*.geojson')} def parse_all(self): + dfs = [] for city in self.mapper: if city in self.mapper: city_parser = self.mapper[city]( @@ -302,6 +303,9 @@ def parse_all(self): ) if city_parser.geo_json_path: df = city_parser.get_maximal_df() + dfs.append(df) + + print(len(pd.concat(dfs))) if __name__ == "__main__": From 720456f84477b8bfce52d45d0067b5252b67b7ff Mon Sep 17 00:00:00 2001 From: Allen Tran Date: Fri, 24 Dec 2021 19:51:29 -0800 Subject: [PATCH 04/10] gcloud api to Tree DB --- treeapi/.dockerignore | 7 ++++ treeapi/Dockerfile | 25 +++++++++++ treeapi/main.py | 91 ++++++++++++++++++++++++++++++++++++++++ treeapi/requirements.txt | 5 +++ 4 files changed, 128 insertions(+) create mode 100644 treeapi/.dockerignore create mode 100644 treeapi/Dockerfile create mode 100644 treeapi/main.py create mode 100644 treeapi/requirements.txt diff --git a/treeapi/.dockerignore b/treeapi/.dockerignore new file mode 100644 index 
0000000..3e4bdd9 --- /dev/null +++ b/treeapi/.dockerignore @@ -0,0 +1,7 @@ +Dockerfile +README.md +*.pyc +*.pyo +*.pyd +__pycache__ +.pytest_cache diff --git a/treeapi/Dockerfile b/treeapi/Dockerfile new file mode 100644 index 0000000..d49d5d7 --- /dev/null +++ b/treeapi/Dockerfile @@ -0,0 +1,25 @@ + +# Use the official lightweight Python image. +# https://hub.docker.com/_/python +FROM python:3.8.2-slim + +# Allow statements and log messages to immediately appear in the Knative logs +ENV PYTHONUNBUFFERED True + +# Copy local code to the container image. +ENV APP_HOME /app +WORKDIR $APP_HOME +COPY . ./ + +RUN apt-get update -y \ + && apt-get install -y gcc libpq-dev + +# Install production dependencies. +RUN pip3 install --no-cache-dir -r requirements.txt + +# Run the web service on container startup. Here we use the gunicorn +# webserver, with one worker process and 8 threads. +# For environments with multiple CPU cores, increase the number of workers +# to be equal to the cores available. +# Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. 
+CMD exec gunicorn --bind :$PORT --workers 1 --worker-class uvicorn.workers.UvicornWorker --threads 8 main:app diff --git a/treeapi/main.py b/treeapi/main.py new file mode 100644 index 0000000..57cebc6 --- /dev/null +++ b/treeapi/main.py @@ -0,0 +1,91 @@ +import os + +import pymysql +from google.cloud.sql.connector import connector +from fastapi import FastAPI + +app = FastAPI() + + +class DBConn(object): + def __init__(self, password): + self.connection = connector.connect( + instance_connection_string='total-ensign-336021:us-west1:public-tree-map', + driver='pymysql', + user='root', + password=password, + db='publictrees' + ) + + def __enter__(self): + return self.connection + + def __exit__(self, type, value, traceback): + self.connection.close() + + +@app.get("/random/") +async def get_random_tree(): + sql = f""" + SELECT + tree_id, + name_common, + name_botanical, + address, + city, + diameter_min_in, + diameter_max_in, + exact_diameter, + height_min_ft, + height_max_ft, + exact_height, + estimated_value, + tree_condition, + ST_LATITUDE(location) AS latitude, + ST_LONGITUDE(location) AS longitude + FROM trees + LIMIT 1 + """ + with DBConn(os.environ['TREE_DB_PASS']) as conn: + cursor = conn.cursor(pymysql.cursors.DictCursor) + cursor.execute(sql) + return cursor.fetchall() + + +@app.get("/trees/") +async def get_tree(lat1, lng1, lat2, lng2, lat3, lng3, lat4, lng4): + lats = [lat1, lat2, lat3, lat4] + lngs = [lng1, lng2, lng3, lng4] + lat_lngs = ' '.join(zip(lats, lngs)) + csv = ','.join(lat_lngs) + polygon_str = f'POLYGON(({csv}, {lat_lngs[0]}))' + sql = f""" + SELECT + tree_id, + name_common, + name_botanical, + address, + city, + diameter_min_in, + diameter_max_in, + exact_diameter, + height_min_ft, + height_max_ft, + exact_height, + estimated_value, + tree_condition, + ST_LATITUDE(MY_POINT) AS latitude, + ST_LONGITUDE(location) AS longitude + FROM trees + WHERE + MBRContains( + ST_GeomFromText('%s'), + location + ) + """ + with 
DBConn(os.environ['TREE_DB_PASS']) as conn: + cursor = conn.cursor(pymysql.cursors.DictCursor) + cursor.execute(sql, polygon_str) + results = cursor.fetchall() + + return results \ No newline at end of file diff --git a/treeapi/requirements.txt b/treeapi/requirements.txt new file mode 100644 index 0000000..48ebc1a --- /dev/null +++ b/treeapi/requirements.txt @@ -0,0 +1,5 @@ +fastapi +uvicorn[standard] +gunicorn +cloud-sql-python-connector[pymysql] +pymysql \ No newline at end of file From 2ca772fd24d53a4d45d340bfb70b8d4f9b1f387a Mon Sep 17 00:00:00 2001 From: Allen Tran Date: Wed, 29 Dec 2021 18:49:33 -0800 Subject: [PATCH 05/10] download images, upload trees, cloud build instead of circleci --- .dockerignore | 10 + .gcloudignore | 11 + Dockerfile | 28 +++ Makefile | 7 + cloudbuild.yaml | 9 + data/species_attributes.csv | 69 +++++- download_images.py | 145 +++++++++++ .../parse_la_data.py => parse_la_data.py | 229 +++++++++++++----- pruning_planting.py | 6 +- requirements.txt | 9 +- upload_trees.py | 229 ++++++++++++++++++ 11 files changed, 678 insertions(+), 74 deletions(-) create mode 100644 .dockerignore create mode 100644 .gcloudignore create mode 100644 Dockerfile create mode 100644 cloudbuild.yaml create mode 100644 download_images.py rename data/stiles_data/parse_la_data.py => parse_la_data.py (56%) create mode 100644 upload_trees.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..6f59a4a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,10 @@ +Dockerfile +README.md +*.pyc +*.pyo +*.pyd +__pycache__ +.pytest_cache +*.pdf +*.dbf +*.jpg \ No newline at end of file diff --git a/.gcloudignore b/.gcloudignore new file mode 100644 index 0000000..4a03439 --- /dev/null +++ b/.gcloudignore @@ -0,0 +1,11 @@ +README.md +trees.csv +.git/ +*.pyc +*.pyo +*.pyd +__pycache__ +.pytest_cache +*.pdf +*.jpg +stiles.trees.csv diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..a93a089 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ 
+ +# Use the official lightweight Python image. +# https://hub.docker.com/_/python +FROM python:3.8.2-slim + +# Copy local code to the container image. +ENV APP_HOME /app +WORKDIR $APP_HOME +COPY . ./ + +RUN apt-get update -y \ + && apt-get install -y build-essential curl + +ENV NODE_VERSION 10.15.1 +RUN curl -o- https://raw.githubusercontent.com/creationix/nvm/v0.33.11/install.sh | bash +RUN . $HOME/.nvm/nvm.sh \ + && nvm install $NODE_VERSION \ + && nvm alias default $NODE_VERSION \ + && nvm use default \ + && npm install + +RUN pip3 install --no-cache-dir -r requirements.txt + +#RUN echo 'source $NVM_DIR/nvm.sh' >> $BASH_ENV +#RUN echo 'export PATH="$HOME/miniconda/bin:$PATH"' >> $BASH_ENV +#RUN echo 'source activate public-tree-map' >> $BASH_ENV + +RUN ["/bin/bash", "-c", ". $HOME/.nvm/nvm.sh && make release-gc"] \ No newline at end of file diff --git a/Makefile b/Makefile index b6e378c..e352057 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ # Anything that needs to be done before the other rules run +SHELL := /bin/bash setup: mkdir -p build/data @@ -10,6 +11,12 @@ release: setup | node download-images.js \ | node split-trees.js build/data +release-gc: setup + curl 'https://data.smgov.net/resource/w8ue-6cnd.csv?$$limit=200' \ + | node parse-trees.js \ + | python3 upload_trees.py + python3 download_images.py + # Runs the pipeline using local data, but skips the CPU-intensive python tasks img-test: setup cat data/trees.csv \ diff --git a/cloudbuild.yaml b/cloudbuild.yaml new file mode 100644 index 0000000..cd51f85 --- /dev/null +++ b/cloudbuild.yaml @@ -0,0 +1,9 @@ +steps: +- name: 'gcr.io/cloud-builders/docker' + entrypoint: 'bash' + secretEnv: ['TREE_DB_PASS'] + args: ['-c', 'docker build -t us-west1-docker.pkg.dev/$PROJECT_ID/cloud-run-source-deploy/etl-image:prod .'] +availableSecrets: + secretManager: + - versionName: projects/$PROJECT_ID/secrets/trees-db-password/versions/latest + env: TREE_DB_PASS \ No newline at end of file diff --git 
a/data/species_attributes.csv b/data/species_attributes.csv index 10aefd6..4879833 100644 --- a/data/species_attributes.csv +++ b/data/species_attributes.csv @@ -1,24 +1,66 @@ botanical_name,botanical_synonyms,sm_botanical_name,common_name,Species ID,family_botanical_name,family_common_name,native,EOL_ID,EOL_overview_URL,simplified_IUCN_status,IUCN_status,IUCN_DOI_or_URL,shade_production,form,type,Cal_IPC_rating,CAL_IPC_url,Irrigation_Requirements × Chitalpa tashkentensis,,Chitalpa tashkentensis,CHITALPA,262,Bignoniaceae,Bignonia,exotic,49307441,https://eol.org/pages/49307441/overview,not listed,not listed,,filtered,rounded,,,, -Acacia baileyana,,Acacia baileyana,BAILEY ACACIA,144,Fabaceae,Legume,watch,649008,http://eol.org/pages/649008/overview,not listed,not listed,,dense,spreading,evergreen,watch,https://www.cal-ipc.org/plants/risk/acacia-baileyana-risk/,"none, once established" -Acacia baileyana 'Purpurea',,Acacia baileyana 'Purpurea',Purple Acacia,1102,Fabaceae,Legume,watch,649008,http://eol.org/pages/649008/overview,not listed,not listed,,dense,spreading,evergreen,watch,https://www.cal-ipc.org/plants/risk/acacia-baileyana-risk/,"none, once established" +Abies concolor,,,White Fir,,Pinaceae,Pine,native,1033078,https://eol.org/pages/1033078,not listed,not listed,,,conical,evergreen,,, +Abies grandis,,,Grand Fir,,Pinaceae,Pine,native,1033074,https://eol.org/pages/1033074,,,,,conical,evergreen,,, +Abies magnifica,,,Red Fir,,Pinaceae,Pine,native,1061727,https://eol.org/pages/1061727,,,,,,,,, +Abies pinsapo,,,Spanish Fir,,Pinaceae,Pine,exotic,1061726,https://eol.org/pages/1061726,,,,,conical,evergreen,,, +Abies spp.,,,Fir,,Pinaceae,Pine,,13998,https://eol.org/pages/13998,,,,,,,,, +Abutilon pictum thompsonii,Abutilon striatum,,Painted Albutilon,,Malvaceae,Mallow,exotic,597500,https://eol.org/pages/597500,,,,,,,,, +Acacia baileyana,,Acacia baileyana,BAILEY ACACIA,144,Fabaceae,Legume,invasive,649008,http://eol.org/pages/649008/overview,not listed,not 
listed,,dense,spreading,evergreen,watch,https://www.cal-ipc.org/plants/risk/acacia-baileyana-risk/,"none, once established" +Acacia baileyana 'Purpurea',,Acacia baileyana 'Purpurea',Purple Acacia,1102,Fabaceae,Legume,invasive,649008,http://eol.org/pages/649008/overview,not listed,not listed,,dense,spreading,evergreen,watch,https://www.cal-ipc.org/plants/risk/acacia-baileyana-risk/,"none, once established" Acacia cognata,Acacia subporosa,Acacia cognata,River Wattle,274,Fabaceae,Legume,exotic,660740,http://eol.org/pages/660740/overview,not listed,not listed,,dense,"small, spreading",,,, -Acacia longifolia,,Acacia longifolia,Sydney Golden Wattle,338,Fabaceae,Legume,exotic,690308,http://eol.org/pages/690308/overview,not listed,not listed,,dense,rounded,,,, -Acacia melanoxylon,,Acacia melanoxylon,BLACK ACACIA,145,Fabaceae,Legume,limited,8684941,http://eol.org/pages/8684941/overview,not listed,not listed,,dense,rounded,evergreen,limited,https://www.cal-ipc.org/plants/paf/acacia-melanoxylon-plant-assessment-form/,"none, once established" +Acacia cultriformis,,,Knife Acacia,,Fabaceae,Legume,exotic,661694,https://eol.org/pages/661694,,,,,,evergreen,,, +Acacia cyclops,"[""acacia cyclopsis"", ""Acacia cyclopis""]",,Cyclop Acacia,,Fabaceae,Legume,exotic,661835,https://eol.org/pages/661835,,,,,,,,, +Acacia dealbata,"[""Acacia decurrens var. 
dealbata"",]",,Silver Wattle,,Fabaceae,Legume,invasive,684065,https://eol.org/pages/684065,,,,,,,,, +Acacia decurrens,,,Green Wattle,,Fabaceae,Legume,exotic,663357,https://eol.org/pages/663357,,,,,,,,, +Acacia floribunda,,,Gossamer Wattle,,Fabaceae,Legume,exotic,646757,https://eol.org/pages/646757,,,,,,,,, +Acacia jennerae,,,Coonavittra Wattle,,Fabaceae,Legume,exotic,655359,https://eol.org/pages/655359,,,,,,,,, +Acacia longifolia,,Acacia longifolia,Sydney Golden Wattle,338,Fabaceae,Legume,invasive,690308,http://eol.org/pages/690308/overview,not listed,not listed,,dense,rounded,,,, +Acacia longissima,,,Narrow-leaf Wattle,,Fabaceae,Legume,exotic,661222,https://eol.org/pages/661222,,,,,,,,, +Acacia melanoxylon,,Acacia melanoxylon,BLACK ACACIA,145,Fabaceae,Legume,invasive,8684941,http://eol.org/pages/8684941/overview,not listed,not listed,,dense,rounded,evergreen,limited,https://www.cal-ipc.org/plants/paf/acacia-melanoxylon-plant-assessment-form/,"none, once established" +Acacia paradoxa,"[""Acacia ornithophora"", ""Acacia undulata"", ""Mimosa paradoxa"", ""Racosperma paradoxum"", ""Acacia armata"", ""Acacia hybrida""]",,Kangaroo Acacia,,Fabaceae,Legume,invasive,648693,https://eol.org/pages/648693,,,,,,,,, +Acacia pendula,"[""Racosperma pendulum"",]",,Weeping Acacia,,Fabaceae,Legume,exotic,688908,https://eol.org/pages/688908,,,,,,,,, +Acacia podalyriifolia,,,Pearl Acacia,,Fabaceae,Legume,exotic,691015,https://eol.org/pages/691015,,,,,,,,, +Acacia pycnantha,,,Golden Wattle,,Fabaceae,Legume,exotic,689739,https://eol.org/pages/689739,,,,,,,,, +Acacia redolens,,,Bank Catclaw,,Fabaceae,Legume,exotic,660800,https://eol.org/pages/660800,,,,,,,,, +Acacia retiniodes,,,Water Wattle,,Fabaceae,Legume,exotic,703681,https://eol.org/pages/703681,,,,,,,,, +Acacia salicina,,,Willow Acacia,,Fabaceae,Legume,exotic,643418,https://eol.org/pages/643418,,,,,,,,, +Acacia saligna,"[""Acacia cyanophylla"", ""Acacia bracteata""]",,Blue-Leaf 
Wattle,,Fabaceae,Legume,exotic,690383,https://eol.org/pages/690383,,,,,,,,, +Vachellia farnesiana,"[""acacia smallii"", ""Acacia farnesiana"",]",,Sweet Acacia,,Fabaceae,Legume,exotic,52202878,https://eol.org/pages/52202878,,,,,,,,, +Acacia spp.,,,Acacia,,Fabaceae,Legume,exotic,39940157,https://eol.org/pages/39940157,,,,,,,,, Acacia stenophylla,Acacia stenophylla var. linearis,Acacia stenophylla,Shoestring Acacia,415,Fabaceae,Legume,exotic,643396,http://eol.org/pages/643396/overview,not listed,not listed,,filtered,"weeping, pendulous",,,, Acca sellowiana,Feijoa sellowiana,Acca sellowiana,Pinapple Guava,207,Myrtaceae,Myrtle,exotic,2508674,http://eol.org/pages/2508674/overview,not listed,not listed,,filtered,rounded,,,, +Acer buergerianum,"[""Acer buergeranum"",]",,Trident Maple,,Sapindaceae,Soapberry,exotic,2888944,https://eol.org/pages/2888944,,,,,rounded,deciduous,,, +Acer circinatum,,,Vine Maple,,Sapindaceae,Soapberry,native,582251,https://eol.org/pages/582251,,,,,,,,, +Acer japonicum,,,Full Moon Maple,,Sapindaceae,Soapberry,exotic,2888970,https://eol.org/pages/2888970,,,,,,,,, +Acer macrophyllum,,,Big Leaf Maple,,Sapindaceae,Soapberry,native,582252,https://eol.org/pages/582252,,,,,,,,, +Acer negundo,,,Box Elder,,Sapindaceae,Soapberry,native,583069,https://eol.org/pages/583069,,,,,,,,, +Acer oblongum,,,Evergreen Maple,,Sapindaceae,Soapberry,exotic,2888990,https://eol.org/pages/2888990,,,,,,,,, Acer palmatum,,Acer palmatum,JAPANESE MAPLE,188,Sapindaceae,Soapberry,exotic,596824,http://eol.org/pages/596824/overview,Least Concern,Least Concern,http://dx.doi.org/10.2305/IUCN.UK.2017-3.RLTS.T193845A2285627.en,dense,"spreading, vase",,,,moderate +Acer palmatum 'bloodgood',,,Bloodgood Japanese Maple,,Sapindaceae,Soapberry,exotic,596824,http://eol.org/pages/596824/overview,,,,,,,,, +Acer palmatum 'green',,,Green Japanese Maple,,Sapindaceae,Soapberry,exotic,596824,http://eol.org/pages/596824/overview,,,,,,,,, +Acer palmatum 'red',,,Red Japanese
Maple,,Sapindaceae,Soapberry,exotic,596824,http://eol.org/pages/596824/overview,,,,,,,,, Acer paxii,,Acer paxii,Evergreen Maple,137,Sapindaceae,Soapberry,exotic,2888996,http://eol.org/pages/2888996/overview,not listed,not listed,,dense,rounded,,,, Acer rubrum,Acer rubrum var. drummondii,Acer rubrum,Red Maple,104,Sapindaceae,Soapberry,exotic,582246,http://eol.org/pages/582246/overview,Least Concern,Learn Concern,http://dx.doi.org/10.2305/IUCN.UK.2017-3.RLTS.T193860A2287111.en,filtered,"Conical, rounded, Spreading",,,, +Acer platanoides,,,Norway Maple,,Sapindaceae,Soapberry,exotic,583070,https://eol.org/pages/583070,,,,,,,,, +Acer pseudoplatanus,,,Sycamore Maple,,Sapindaceae,Soapberry,exotic,583073,https://eol.org/pages/583073,,,,,,,,, Acer saccharinum,,Acer saccharinum,SILVER MAPLE,115,Sapindaceae,Soapberry,exotic,583072,http://eol.org/pages/583072/overview,Least Concern,Learn Concern,http://dx.doi.org/10.2305/IUCN.UK.2017-3.RLTS.T193862A2287256.en,dense,"spreading, vase",,,, +Acer saccharum,,,Sugar Maple,,Sapindaceae,Soapberry,exotic,582247,https://eol.org/pages/582247,,,,,,,,, Acer spp.,,Acer spp.,Maple,72,Sapindaceae,Soapberry,exotic,47125858,https://eol.org/pages/47125858/overview,not listed,not listed,,dense,rounded,,,, +Acer x freemanii,Acer rubrum x saccharinum,,Freeman Maple,,Sapindaceae,Soapberry,exotic,49123875,https://eol.org/pages/49123875,,,,,,,,, +Acer × freemanii 'autumn fantasy',,,Autumn Fantasy Maple,,Sapindaceae,Soapberry,exotic,49123875,https://eol.org/pages/49123875,,,,,,,,, +Acoelorrhaphe wrightii,,,Paurotis palm,,Arecaceae,Palm,exotic,1127723,https://eol.org/pages/1127723,,,,,,,,, +Acrocarpus fraxinifolius,,,Pink Cedar,,Fabaceae,Legume,exotic,695211,https://eol.org/pages/695211,,,,,,deciduous,,, +Aesculus californica,Hippocastanatum californica,,California Buckeye,,Sapindaceae,Soapberry,native,581628,https://eol.org/pages/581628,,,,,,,,, +Aesculus × carnea,Aesculus carnea,,Red Horse 
Chestnut,,Sapindaceae,Soapberry,exotic,52558635,https://eol.org/pages/52558635,,,,,,,,, +Aesculus hippocastanum,,,Horse Chestnut,,Sapindaceae,Soapberry,exotic,582243,https://eol.org/pages/582243,,,,,,,,, Afrocarpus elongatus 'Icee Blue',Podocarpus elongatus,Afrocarpus elongatus 'Icee Blue',Icee Blue Yellow-Wood,2034,Podocarpaceae,Podocarp,exotic,323467,https://eol.org/pages/323467,not listed,not listed,,dense,rounded,evergeen,,, Afrocarpus falcatus,Afrocarpus gracilior;Podocarpus gracilior,Afrocarpus falcatus,African fern pine,2086,Podocarpaceae,Podocarp,exotic,1033604,https://eol.org/pages/1033604,Least Concern,Least Concern,http://dx.doi.org/10.2305/IUCN.UK.2013-1.RLTS.T42438A2980290.en,dense,rounded,evergreen,,, Afrocarpus falcatus,,Afrocarpus gracilior,FERN PINE,46,Podocarpaceae,Podocarp,exotic,1033605,http://eol.org/pages/1033605/overview,Least Concern,Least Concern,http://dx.doi.org/10.2305/IUCN.UK.2013-1.RLTS.T42439A2980350.en,dense,rounded,evergeen,,, Afrocarpus macrophyllus,,Afrocarpus macrophyllus,YEW PINE,134,Podocarpaceae,Podocarp,exotic,1059922,http://eol.org/pages/1059922/overview,not listed,not listed,,dense,rounded,evergeen,,, Agathis robusta,,Agathis robusta,Queensland Kauri,269,Araucariaceae,Araucaria,exotic,1033628,http://eol.org/pages/1033628/overview,Least Concern,Least Concern,http://dx.doi.org/10.2305/IUCN.UK.2013-1.RLTS.T16437966A2960124.en,dense,rounded,,,, Agonis flexuosa,,Agonis flexuosa,PEPPERMINT TREE,307,Myrtaceae,Myrtle,exotic,5448625,http://eol.org/pages/5448625/overview,not listed,not listed,,filtered,pendulous,,,,minimal -Ailanthus altissima,,Ailanthus altissima,TREE OF HEAVEN,233,Simaroubaceae,Quassia,moderate,5614169,http://eol.org/pages/5614169/overview,not listed,not listed,,dense,"spreading, vase",,moderate,https://www.cal-ipc.org/plants/paf/ailanthus-altissima-plant-assessment-form/, +Ailanthus altissima,,Ailanthus altissima,TREE OF HEAVEN,233,Simaroubaceae,Quassia,invasive,5614169,http://eol.org/pages/5614169/overview,not 
listed,not listed,,dense,"spreading, vase",,moderate,https://www.cal-ipc.org/plants/paf/ailanthus-altissima-plant-assessment-form/, Albizia julibrissin,,Albizia julibrissin,SILK TREE,76,Fabaceae,Legume,exotic,640054,http://eol.org/pages/640054/overview,not listed,not listed,,filtered,rounded,deciduous,,,minimal Allocasuarina verticillata,Casuarina excelsa;Casuarina stricta,Allocasuarina verticillata,Drooping She-Oak,1881,Casuarinaceae,Beefwood,exotic,628407,http://eol.org/pages/628407/overview,not listed,not listed,,filtered,"vase, pendulous",,,, Alnus cordata,,Alnus cordata,ITALIAN ALDER,187,Betulaceae,Birch,exotic,1145955,http://eol.org/pages/1145955/overview,Least Concern,Least Concern,http://dx.doi.org/10.2305/IUCN.UK.2017-3.RLTS.T194657A117268007.en,dense,"Conical, Spreading",,,,moderate @@ -29,7 +71,7 @@ Araucaria columnaris,Araucaria cookii,Araucaria columnaris,STAR PINE,306,Araucar Araucaria heterophylla,Araucaria excelsa,Araucaria heterophylla,NORFOLK ISLAND PINE,84,Araucariaceae,Araucaria,exotic,1033727,http://eol.org/pages/1033727/overview,Vulnerable,Vulnerable D2,http://dx.doi.org/10.2305/IUCN.UK.2011-2.RLTS.T30497A9548582.en,dense,"Conical, Spreading",evergreen,,,moderate Arbutus 'Marina',,Arbutus 'Marina',MARINA ARBUTUS,486,Ericaceae,Heather,exotic,71122,http://eol.org/pages/71122/overview,not listed,not listed,,dense,rounded,evergreen,,, Arbutus unedo,,Arbutus unedo,STRAWBERRY TREE,315,Ericaceae,Heather,exotic,583608,http://eol.org/pages/583608/overview,Least Concern,Least Concern,http://dx.doi.org/10.2305/IUCN.UK.2017-3.RLTS.T202930A68076133.en,dense,rounded,evergreen,,,minimal -Archontophoenix cunninghamiana,,Archontophoenix cunninghamiana,KING PALM,63,Arecaceae,Palm,exotic,1136266,http://eol.org/pages/1136266/overview,not listed,not listed,,"little, none",palm,evergreen,,, +Archontophoenix cunninghamiana,"[""archontophoenix cunningham"", ""archontophoenix cunningham""]",Archontophoenix cunninghamiana,KING 
PALM,63,Arecaceae,Palm,exotic,1136266,http://eol.org/pages/1136266/overview,not listed,not listed,,"little, none",palm,evergreen,,, Asphalted well,,Asphalted well,Asphalted well,638,,,,,,not listed,not listed,,,,,,, Auranticarpa rhombifolium,Pittosporum rhombifolium,Auranticarpa rhombifolium,QUEENSLAND PITTOSPORUM,100,Pittosporaceae,Cheesewood,exotic,5556305,http://eol.org/pages/5556305/overview,not listed,not listed,,dense,rounded,,,, Bauhinia × blakeana,Bauhinia purpurea × Bauhinia variegata,Bauhinia blakeana,HONG KONG ORCHID TREE,250,Fabaceae,Legume,exotic,641500,http://eol.org/pages/641500/overview,not listed,not listed,,dense,"pendulous, spreading",,,,minimal @@ -135,8 +177,9 @@ Gleditsia triacanthos,,Gleditsia triacanthos,HONEY LOCUST,55,Fabaceae,Legume,exo Grevillea robusta,,Grevillea robusta,SILK OAK,112,Proteaceae,Protea,watch,582736,http://eol.org/pages/582736/overview,not listed,not listed,,filtered,spreading,evergreen,watch,https://www.cal-ipc.org/plants/risk/grevillea-robusta-risk/,minimal Hakea drupacea,Banksia heterophylla,Hakea suaveolens,SWEET HAKEA,270,Proteaceae,Protea,exotic,5511103,http://eol.org/pages/5511103/overview,not listed,not listed,,filtered,rounded,,,, Handroanthus chrysotrichus,Tabebuia chrysotricha,Handroanthus chrysotrichus,Golden Trumpet Tree,179,Bignoniaceae,Bignonia,exotic,5637482,http://eol.org/pages/5637482/overview,not listed,not listed,,dense,rounded,,,, -Handroanthus heptaphyllus,Tabebuia impetiginosa ,Handroanthus impetiginosus,Pink Trumpet Tree,403,Bignoniaceae,Bignonia,exotic,5637444,http://eol.org/pages/5637444,not listed,not listed,,"filtered, dense","rounded, vase",deciduous,,, +Handroanthus heptaphyllus,Tabebuia impetiginosa,Handroanthus impetiginosus,Pink Trumpet Tree,403,Bignoniaceae,Bignonia,exotic,5637444,http://eol.org/pages/5637444,not listed,not listed,,"filtered, dense","rounded, vase",deciduous,,, Harpephyllum caffrum,,Harpephyllum caffrum,KAFFIR 
PLUM,192,Anacardiaceae,Sumac,exotic,6935240,http://eol.org/pages/6935240/overview,not listed,not listed,,dense,rounded,,,,minimal +Hesperocyparis arizonica,"[""cupressus arizonica"", ""Cupressus arizonica var. arizonica""]",,Arizona Cypress,,Cupressaceae,Cypress,exotic,,,,,,,,,,, Hesperocyparis arizonica var. glabra,Cupressus arizonica var. glabra,Cupressus glabra,Smoothbark Arizona Cypress,142,Cupressaceae,Cypress,exotic,49307043,https://eol.org/pages/49307043,Near Threatened,Near Threatened,http://dx.doi.org/10.2305/IUCN.UK.2013-1.RLTS.T19708408A19708411.en,dense,Columnar,,,,minimal Hesperocyparis macrocarpa,Cupressus macrocarpa;Cupressus lambertiana,Hesperocyparis macrocarpa,MONTEREY CYPRESS,332,Cupressaceae,Cypress,native,1034856,http://eol.org/pages/1034856/overview,Vulnerable,Vulnerable D2,http://www.iucnredlist.org/details/30375/0,filtered,conical,,,, Heteromeles arbutifolia,,Heteromeles arbutifolia,TOYON,232,Rosaceae,Rose,native,47383069,https://eol.org/pages/47383069,not listed,not listed,,dense,rounded,evergreen,,, @@ -224,6 +267,7 @@ Pittosporum crassifolium,,Pittosporum crassifolium,KARO,322,Pittosporaceae,Chees Pittosporum tobira,,Pittosporum tobira,MOCK ORANGE,198,Pittosporaceae,Cheesewood,exotic,583390,http://eol.org/pages/583390/overview,not listed,not listed,,dense,rounded,evergreen,,,moderate Pittosporum undulatum,,Pittosporum undulatum,VICTORIAN BOX,125,Pittosporaceae,Cheesewood,watch,583391,http://eol.org/pages/583391/overview,not listed,not listed,,dense,rounded,evergreen,watch,https://www.cal-ipc.org/plants/risk/pittosporum-undulatum-risk/,moderate Pittosporum viridiflorum,,Pittosporum viridiflorum,CAPE PITTOSPORUM,345,Pittosporaceae,Cheesewood,exotic,47136685,https://eol.org/pages/47136685/overview,not listed,not listed,,dense,rounded,evergreen,,,moderate +Planting site,"[""Vacant site"", ""planting site large"", ""planting site medium"", ""planting site small""]",Vacant site,VACANT SITE,238,,,,,,not listed,not listed,,,,,,, Platanus × 
hispanica,Platanus × acerifolia;Platanus occidentalis × Platanus orientalis;,Platanus X hispanica,London Plane,70,Platanaceae,Plane-tree,exotic,49950980,https://eol.org/pages/49950980/overview,not listed,not listed,,dense,"vase, spreading",,,, Platanus × hispanica 'Bloodgood',Platanus × acerifolia 'Bloodgood',Platanus X hispanica 'Bloodgood',Bloodgood Plane,1079,Platanaceae,Plane-tree,exotic,49950980,https://eol.org/pages/49950980/overview,not listed,not listed,,filtered,rounded,deciduous,,, Platanus × hispanica 'Yarwood',Platanus × acerifolia 'Yarwood',Platanus X hispanica 'Yarwood',Yarwood Plane,933,Platanaceae,Plane-tree,exotic,49950980,https://eol.org/pages/49950980/overview,not listed,not listed,,filtered,rounded,deciduous,,, @@ -231,6 +275,7 @@ Platanus mexicana,,Platanus mexicana,Mexican Sycamore,440,Platanaceae,Plane-tree Platanus racemosa,,Platanus racemosa,CALIFORNIA SYCAMORE,23,Platanaceae,Plane-tree,native,594707,http://eol.org/pages/594707/overview,not listed,not listed,,filtered,spreading,evergreen,,, Platanus x hispanica 'Columbia',Platanus × acerifolia 'Columbia',Platanus X hispanica 'Columbia',Columbia Plane,1360,Platanaceae,Plane-tree,exotic,49950980,https://eol.org/pages/49950980/overview,not listed,not listed,,dense,"rounded, spreading",deciduous,,, Platycladus orientalis,,Platycladus orientalis,ORIENTAL ARBORVITAE,200,Cupressaceae,Cypress,exotic,323359,http://eol.org/pages/323359/overview,Near Threatened,Near Threatened,http://dx.doi.org/10.2305/IUCN.UK.2013-1.RLTS.T31305A2803944.en,dense,rounded,evergreen,,,moderate +Podocarpus henkelii,Afrocarpus henkelii,,Long Leafed Yellowwood,,Podocarpaceae,Podocarp,exotic,1033708,https://eol.org/pages/1033708,Endangered,,,,,,,, Prosopis glandulosa,,Prosopis glandulosa,Mesquite,905,Fabaceae,Legume,native,416627,http://eol.org/pages/416627/overview,not listed,not listed,,little,,deciduous,,,"none, once established" Prunus × blireiana,,Prunus blireiana,FLOWERING 
PLUM,47,Rosaceae,Rose,exotic,39934521,https://eol.org/pages/39934521,not listed,not listed,,filtered,,,,,moderate Prunus armeniaca,,Prunus armeniaca,APRICOT,6,Rosaceae,Rose,exotic,301091,http://eol.org/pages/301091/overview,Endangered,Endangered B2ab(iii),http://dx.doi.org/10.2305/IUCN.UK.2007.RLTS.T63405A12666025.en,filtered,"Rounded, Spreading, Vase",deciduous,,,moderate @@ -271,7 +316,7 @@ Spathodea campanulata,,Spathodea campanulata,African Tulip Tree,383,Bignoniaceae Sphaeropteris cooperi,Alsophila cooperi,Cyathea cooperi,Australian Tree Fern,739,Cyatheaceae,Tree fern,exotic,483203,http://eol.org/pages/483203/overview,not listed,not listed,,filtered,fern,,,, Stenocarpus sinuatus,,Stenocarpus sinuatus,FIREWHEEL TREE,171,Proteaceae,Protea,exotic,582737,http://eol.org/pages/582737/overview,not listed,not listed,,filtered,,,,,moderate Strelitzia nicolai,,Strelitzia nicolai,GIANT BIRD OF PARADISE,321,Strelitziaceae,Strelitzia,exotic,345179,http://eol.org/pages/345179/overview,not listed,not listed,,"little, none",palm,evergreen,,,moderate -Stump,,Stump,STUMP,225,,,,,,not listed,not listed,,,,,,, +Stump,"[""Dead tree""]",Stump,STUMP,225,,,,,,not listed,not listed,,,,,,, Stump - not accessible,,Stump - not accessible,STUMP - NOT ACCESSIBLE,1434,,,,,,not listed,not listed,,,,,,, Syagrus romanzoffiana,Arecastrum romanzoffianum,Syagrus romanzoffianum,QUEEN PALM,99,Arecaceae,Palm,exotic,1129524,http://eol.org/pages/1129524/overview,not listed,not listed,,"little, none",palm,evergreen,,,moderate Syzygium australe,Eugenia australis;Eugenia myrtifolia,Syzygium paniculatum,BRUSH CHERRY,17,Myrtaceae,Myrtle,exotic,2508667,http://eol.org/pages/2508667/overview,not listed,not listed,,dense,rounded,evergreen,,,moderate @@ -292,8 +337,7 @@ Ulmus parvifolia 'Drake',,Ulmus parvifolia 'Drake',DRAKE ELM,462,Ulmaceae,Elm,ex Ulmus pumila,,Ulmus pumila,SIBERIAN ELM,111,Ulmaceae,Elm,exotic,594950,http://www.eol.org/pages/594950/overview,Least Concern,Least 
Concern,http://dx.doi.org/10.2305/IUCN.UK.2018-1.RLTS.T61967372A61967374.en,filtered,"spreading, rounded",deciduous,,,moderate Umbellularia californica,,Umbellularia californica,California Bay,20,Lauraceae,Laurel,native,596841,http://eol.org/pages/596841/overview,not listed,not listed,,filtered,rounded,evergreen,,,"none, once established" Unidentified spp.,,Unidentified spp.,Unidentified Tree,1506,,,,,,not listed,not listed,,,,,,, -Unsuitable site,,Unsuitable site,UNSUITABLE SITE,237,,,,,,not listed,not listed,,,,,,, -Vacant site,,Vacant site,VACANT SITE,238,,,,,,not listed,not listed,,,,,,, +Unsuitable site,"[""poor planting site""]",Unsuitable site,UNSUITABLE SITE,237,,,,,,not listed,not listed,,,,,,, Viburnum spp.,,Viburnum spp.,Viburnum,571,Adoxaceae,Adoxas,exotic,490016,https://eol.org/pages/490016/overview,not listed,not listed,,dense,,,,,moderate Washingtonia filifera,,Washingtonia filifera,CALIFORNIA FAN PALM,21,Arecaceae,Palm,native,1127834,http://www.eol.org/pages/1127834/overview,Lower Risk/near threatened,Lower Risk/near threatened,http://dx.doi.org/10.2305/IUCN.UK.1998.RLTS.T38725A10145920.en,little,,,,,"none, once established" Washingtonia filifera x robusta,,Washingtonia filifera X robusta,Filibuster Hybrid Fan Palm,557,Arecaceae,Palm,exotic,,,not listed,not listed,,"little, none",palm,evergreen,,,"none, once established" @@ -302,5 +346,8 @@ Wisteria sinensis,,Wisteria sinensis,CHINESE WISTERIA,662,Fabaceae,Legume,exotic Wodyetia bifurcata,,Wodyetia bifurcata,FOXTAIL PALM,604,Arecaceae,Palm,exotic,1127809,http://eol.org/pages/1127809/overview,Lower Risk/conservation dependent,Lower Risk/conservation dependent,http://dx.doi.org/10.2305/IUCN.UK.1998.RLTS.T38733A10146773.en,"little, none",palm,evergreen,,,moderate Yucca elephantipes,Yucca gigantea,Yucca elephantipes,Giant Yucca,224,Asparagaceae,Asparagus,exotic,1083612,https://eol.org/pages/1083612/overview,not listed,not listed,,filtered,palm,evergreen,,,"none, once established" Yucca gloriosa,,Yucca 
gloriosa,SPANISH DAGGER,223,Asparagaceae,Asparagus,exotic,1083624,http://www.eol.org/pages/1083624/overview,not listed,not listed,,little/none,palm,evergreen,,,"none, once established" -Yucca spp.,,Yucca spp.,Yucca Species,135,Asparagaceae,Asparagus,exotic,23768875,https://eol.org/pages/23768875,not listed,not listed,,"little, none",palm,evergreen,,,"none, once established" +Yucca spp.,,Yucca spp.,Yucca,135,Asparagaceae,Asparagus,exotic,23768875,https://eol.org/pages/23768875,not listed,not listed,,"little, none",palm,evergreen,,,"none, once established" +Zamia spp.,,,Zamia,,Zamiaceae,Cycad,exotic,39967576,https://eol.org/pages/39967576,,,,,,,,, Zelkova serrata,,Zelkova serrata,Sawtooth Zelkova,216,Ulmaceae,Elm,exotic,484119,http://eol.org/pages/484119/overview,not listed,not listed,,filtered,"spreading, vase",deciduous,,, +Zelkova serrata 'village green',,,Village Green Japanese Zelkova,,Ulmaceae,Elm,exotic,484119,http://eol.org/pages/484119/overview,not listed,not listed,,filtered,"spreading, vase",deciduous,,, +Ziziphus jujuba,"[""zizyphus jujuba""]",,Chinese jujube,,Rhamnaceae,Buckthorn,exotic,582338,https://eol.org/pages/582338,,,,,,,,, \ No newline at end of file diff --git a/download_images.py b/download_images.py new file mode 100644 index 0000000..f805124 --- /dev/null +++ b/download_images.py @@ -0,0 +1,145 @@ +import os +from typing import Optional, Set +import dataclasses +import hashlib + +from google.cloud import storage +from PIL import Image +from io import BytesIO +import requests +import pandas as pd +import pymysql.cursors +from upload_trees import DBCursor + + +@dataclasses.dataclass +class TreeImage: + species_id: int + retrieval_url: str + hashed_url: str + img_type: str + description: str + stored_url: Optional[str] + image: Optional + author: Optional[str] + author_url: Optional[str] + + +class ImageDownloader(object): + MAX_SIZE = (1024, 1024) + + def __init__(self): + + self.bucket = storage.Client().bucket('public-tree-map-images') + + 
def insert_tree_into_db(self, tree: TreeImage): + with DBCursor(os.environ['TREE_DB_PASS']) as conn: + cursor = conn.cursor(pymysql.cursors.DictCursor) + cursor.execute( + """ + INSERT INTO images( + stored_url, + original_url, + details, + hashed_original_url, + species_id, + author, + author_url + ) + VALUES (%s, %s, %s, %s, %s, %s, %s) + """, + ( + tree.stored_url, + tree.retrieval_url, + tree.description if tree.description else None, + tree.hashed_url, + tree.species_id, + tree.author if tree.author else None, + tree.author_url if tree.author_url else None + ) + ) + conn.commit() + + def get_and_upload_image(self, tree_image: TreeImage): + r = requests.get(tree_image.retrieval_url) + if r.ok: + img = Image.open(BytesIO(r.content)) + img.thumbnail(self.MAX_SIZE) + tree_image.image = img + image_key = f'{tree_image.hashed_url}.{tree_image.img_type}' + blob = self.bucket.blob(image_key) + blob.upload_from_string( + r.content, + content_type=r.headers['Content-Type'] + ) + tree_image.stored_url = f'https://storage.googleapis.com/public-tree-map-images/{image_key}' + self.insert_tree_into_db(tree_image) + return tree_image + + def get_tree_images(self, tree_id, eol_id, existing_images: Set[str]): + + assert os.environ.get('TREE_SALT') is not None + + url = f'http://eol.org/api/pages/1.0.json?id={eol_id}&images_per_page=3&videos_per_page=0&sounds_per_page=0&maps_per_page=0&texts_per_page=0&details=true&taxonomy=false' + r = requests.get(url) + images_to_retrieve = [] + if r.ok: + request_body = r.json() + data_objects = request_body['taxonConcept']['dataObjects'] + if data_objects: + for data_object in data_objects: + hashed_url = hashlib.md5(f"{data_object['eolMediaURL']}{os.environ['TREE_SALT']}".encode('utf-8')).hexdigest() + if hashed_url not in existing_images: + images_to_retrieve.append( + TreeImage( + tree_id, + data_object['eolMediaURL'], + hashed_url, + data_object['dataSubtype'], + data_object['description'], + None, + None,
data_object['rightsHolder'].strip() if data_object['rightsHolder'] else None, + f'https://eol.org/pages/{eol_id}/media' + ) + ) + + if images_to_retrieve: + for image in images_to_retrieve: + uploaded_tree = self.get_and_upload_image(image) + if uploaded_tree: + existing_images.add(uploaded_tree.hashed_url) + + def get_trees_without_images(self): + with DBCursor(os.environ['TREE_DB_PASS']) as conn: + cursor = conn.cursor(pymysql.cursors.DictCursor) + cursor.execute( + """ + SELECT + S.id, + S.eol_id, + COUNT(DISTINCT I.id) AS cnt + FROM species S + LEFT JOIN images I ON S.id = I.species_id + GROUP BY 1, 2 + HAVING cnt < 3 + """ + ) + tree_results = cursor.fetchall() + cursor.execute( + """ + SELECT + hashed_original_url + FROM images + """ + ) + image_results = set([row['hashed_original_url'] for row in cursor.fetchall()]) + return pd.DataFrame(tree_results), image_results + + +if __name__ == "__main__": + img_download = ImageDownloader() + assert 'TREE_DB_PASS' in os.environ, 'TREE_DB_PASS must be supplied via the environment (never hardcode credentials)' + trees_df, hashed_urls = img_download.get_trees_without_images() + for row in trees_df.itertuples(): + img_download.get_tree_images(row.id, row.eol_id, hashed_urls) diff --git a/data/stiles_data/parse_la_data.py b/parse_la_data.py similarity index 56% rename from data/stiles_data/parse_la_data.py rename to parse_la_data.py index 709f815..0e62445 100644 --- a/data/stiles_data/parse_la_data.py +++ b/parse_la_data.py @@ -1,17 +1,23 @@ import argparse +import getpass +import json from pathlib import Path from typing import List, Set +import numpy as np import pandas as pd import geopandas as gpd +from shapely.geometry import Point + +from upload_trees import DBTreeUploader, DBCursor class CityParser(object): # these are case insensitive - name_common_columns = ['name_common', 'species', 'com_name'] - name_botanical_columns = ['name_botanical', 'botanical', 'botanicaln'] - condition = ['condition', 'treecondition'] + name_common_columns = ['name_common', 'species', 'com_name',
'trees_spec'] + name_botanical_columns = ['name_botanical', 'botanical', 'botanicaln', 'botanicalna', 'trees_bota'] + condition = ['condition', 'treecondition', 'treeconditi'] address_columns = ['address'] diameter_min_in_columns = ['diameter_min_in'] diameter_max_in_columns = ['diameter_max_in'] @@ -19,7 +25,7 @@ class CityParser(object): height_min_feet_columns = ['height_min_feet'] height_max_feet_columns = ['height_max_feet'] exact_height_columns = ['exact_height', 'exact_heigh', 'height', 'actualheight'] - tree_id_columns = ['tree_id', 'inventoryid', 'tree', 'inventoryi', 'treeid', 'objectid'] + tree_id_columns = ['tree_id', 'inventoryid', 'tree', 'inventoryi', 'treeid', 'objectid', 'trees_ogc_'] est_value_columns = ['estimated_value', 'est_value', 'estvalue'] height_tuples = [ @@ -34,9 +40,16 @@ class CityParser(object): def __init__(self, city, path: Path, geojson_path: Path=None): self.city = city geo_jsons = [] + csvs = [] if path: geo_jsons = [p for p in path.iterdir() if p.is_file() and p.suffix == '.geojson'] + csvs = [p for p in path.iterdir() if p.is_file() and p.suffix == '.csv'] assert len(geo_jsons) <= 1 + assert len(csvs) <= 1 + if len(csvs) > 0: + self.csv_path = csvs[-1] + else: + self.csv_path = None if len(geo_jsons) > 0: self.geo_json_path = geo_jsons[-1] elif geojson_path: @@ -48,15 +61,21 @@ def get_min_max_columns(self, df, range_col_tuples, skip_col: Set[str] = None): for (range_col, min_col, max_col) in range_col_tuples: if skip_col and range_col in skip_col: continue - if range_col in df.columns: - return self.cat_parser(df, min_col, max_col, range_col) - return df - - def get_column(self, df, potential_columns: List[str], titleize=False): + for col in [range_col, range_col.upper(), range_col.lower()]: + if col in df.columns: + try: + return self.cat_parser(df, min_col, max_col, col), col + except AttributeError: + pass + return df, None + + def get_column(self, df, potential_columns: List[str], exclude_col: str = None, titleize=False): 
column_name = potential_columns[0] df_columns = [s.strip().lower() for s in df.columns] for potential_column in potential_columns: if potential_column in df_columns: + if exclude_col is not None and exclude_col.lower() == potential_column.lower(): + continue idx = df_columns.index(potential_column) column = df.columns[idx] if titleize: @@ -76,8 +95,7 @@ def filter_columns(self, df): 'tree_id', 'estimated_value', 'address', - 'latitude', - 'longitude', + 'geometry', 'city', 'diameter_min_in', 'diameter_max_in', @@ -85,40 +103,62 @@ def filter_columns(self, df): 'height_max_feet', } actual_columns = potential_columns & set(df.columns) - return df[list(actual_columns)] + return df[list(actual_columns)].rename( + columns={ + 'height_min_feet': 'height_min_ft', + 'height_max_feet': 'height_max_ft', + } + ) def read_df(self): - assert self.geo_json_path - return gpd.read_file(str(self.geo_json_path.absolute())).assign(city=self.city) + assert self.geo_json_path or self.csv_path + if self.csv_path: + df = pd.read_csv(self.csv_path.absolute()).assign(city=self.city) + if 'longitude' in df.columns: + df['geometry'] = gpd.points_from_xy(df['longitude'], df['latitude']) + elif 'LONGITUDE' in df.columns: + df['geometry'] = gpd.points_from_xy(df['LONGITUDE'], df['LATITUDE']) + else: + raise RuntimeError(f'CSV: {self.csv_path} has no geometry') + else: + df = gpd.read_file(str(self.geo_json_path.absolute())).assign(city=self.city) + return df + + def cleanup_columns(self, df): + str_cols = [ + 'name_common', + 'name_botanical', + 'condition', + 'tree_id', + 'address', + ] + for col in str_cols: + df[col] = df[col].str.strip() def get_maximal_df(self, df=None): if df is None: df = self.read_df() - df = self.lat_lon_from_geometry(df) + df = self.get_column(df, self.address_columns, titleize=True) df = self.get_column(df, self.name_common_columns, titleize=True) df = self.get_column(df, self.name_botanical_columns, titleize=True) + df = self.get_column(df, self.tree_id_columns) + 
df = self.get_column(df, self.condition) + df = self.get_column(df, self.est_value_columns) + + df, height_col = self.get_min_max_columns(df, self.height_tuples) + df, diameter_col = self.get_min_max_columns(df, self.diameter_tuples) - df = self.get_min_max_columns(df, self.height_tuples) - df = self.get_min_max_columns(df, self.diameter_tuples) + df = self.get_column(df, self.exact_height_columns, exclude_col=height_col) + df = self.get_column(df, self.exact_diameter_columns, exclude_col=diameter_col) df = self.get_column(df, self.diameter_min_in_columns) df = self.get_column(df, self.diameter_max_in_columns) + df = self.get_column(df, self.height_max_feet_columns) + df = self.get_column(df, self.height_min_feet_columns) return self.filter_columns(df).drop_duplicates() - @staticmethod - def lat_lon_from_geometry(df, y_is_lat=True): - if y_is_lat: - return df.assign( - latitude=df['geometry'].apply(lambda p: p.y), - longitude=df['geometry'].apply(lambda p: p.x) - ) - return df.assign( - latitude=df['geometry'].apply(lambda p: p.x), - longitude=df['geometry'].apply(lambda p: p.y) - ) - @staticmethod def cat_parser(df, min_field, max_field, og_field, cats=None): if cats is None: @@ -147,8 +187,7 @@ def get_maximal_df(self): df = df.assign( address=df['Address'].astype(str).str.cat(df['Street'].str.title(), sep=' ') ).drop('Address', axis=1) - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class AlhambraParser(CityParser): @@ -157,8 +196,7 @@ def get_maximal_df(self): df = df.assign( address=df['Address'].astype(str).str.cat(df['Street'].str.title(), sep=' ') ).drop('Address', axis=1) - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class ArcadiaParser(CityParser): @@ -167,8 +205,7 @@ def get_maximal_df(self): df = df.assign( address=df['ADDR'].str.split('ARCADIA').str[0].str.title() ) - df = super().get_maximal_df(df=df) - return 
self.filter_columns(df) + return super().get_maximal_df(df=df) class BellflowerParser(CityParser): @@ -177,8 +214,7 @@ def get_maximal_df(self): df = df.assign( address=df['Address'].astype(str).str.cat(df['Street'].str.title(), sep=' '), ).drop('Address', axis=1) - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class BellGardensParser(CityParser): @@ -187,8 +223,7 @@ def get_maximal_df(self): df = df.assign( address=df['ADDRESS'].astype(str).str.cat(df['STREET'].str.title(), sep=' ') ).drop('ADDRESS', axis=1) - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class ArtesiaParser(CityParser): @@ -197,8 +232,7 @@ def get_maximal_df(self): df = df.assign( address=df['ADDRESS'].astype(str).str.cat(df['STREET'].str.title(), sep=' '), ).drop('ADDRESS', axis=1) - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class BeverlyHillsParser(CityParser): @@ -208,15 +242,13 @@ def get_maximal_df(self): df = df.assign( address=df['ADDRESS'].astype(str).str.cat(df['STREET'].str.title(), sep=' '), ).drop('ADDRESS', axis=1) - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class LongBeachParser(CityParser): def get_maximal_df(self, df=None): df = self.read_df().drop('ADDRESS', axis=1) - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class SantaClaritaParser(CityParser): @@ -226,8 +258,16 @@ def get_maximal_df(self, df=None): df = df.assign( address=df['PROP_ADR'].astype(str).str.cat(df['PROPSTREET'].str.title(), sep=' '), ) - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) + + +class SantaClaritaParksParser(CityParser): + def get_maximal_df(self, df=None): + df = self.read_df() + df = df.assign( + 
address=df['ADDRESS'].astype(str).str.cat(df['STREET'].str.title(), sep=' '), + ).drop('ADDRESS', axis=1) + return super().get_maximal_df(df=df) class PasadenaParser(CityParser): @@ -244,8 +284,7 @@ def get_maximal_df(self, df=None): df.loc[mask, 'Street_Nam'], sep=' ' ).str.cat(df.loc[mask, 'Street_Typ'], sep=' ') - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class GlendaleParser(CityParser): @@ -253,8 +292,7 @@ def get_maximal_df(self, df=None): df = self.read_df() df = df.drop('Address', axis=1) df['address'] = df['OnAddress'].astype(str).str.cat(df['OnStreet'].astype(str).str.strip(), sep=' ') - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class PomonaParser(CityParser): @@ -263,8 +301,7 @@ def get_maximal_df(self, df=None): df['address'] = df['ADDRESS'].astype(str).str.cat(df['STREET'].astype(str).str.strip(), sep=' ') df = df.drop('ADDRESS', axis=1) - df = super().get_maximal_df(df=df) - return self.filter_columns(df) + return super().get_maximal_df(df=df) class StilesDataParser(object): @@ -273,14 +310,15 @@ class StilesDataParser(object): 'los-angeles-city': CityParser, 'los-angeles-county': CityParser, 'agoura-hills': AgouraHillsParser, - 'alhambra' : AlhambraParser, - 'arcadia': ArcadiaParser, + # 'alhambra' : AlhambraParser, + # 'arcadia': ArcadiaParser, 'artesia': ArtesiaParser, - 'bell-gardens': BellGardensParser, + # 'bell-gardens': BellGardensParser, 'bellflower': BellflowerParser, 'beverly-hills': BeverlyHillsParser, 'long-beach': LongBeachParser, - 'santa-clarita': SantaClaritaParser, + 'santa-clarita-parks': SantaClaritaParksParser, + # 'santa-clarita': SantaClaritaParser, 'pasadena': PasadenaParser, 'glendale': GlendaleParser, 'pomona': PomonaParser, @@ -305,13 +343,86 @@ def parse_all(self): df = city_parser.get_maximal_df() dfs.append(df) - print(len(pd.concat(dfs))) + df = pd.concat(dfs) + str_cols = [ + 'name_common', + 
'name_botanical', + 'address', + 'city', + 'condition', + ] + for col in str_cols: + df[col] = df[col].astype(str, skipna=True) + mask = df[col].notnull() + df.loc[mask, col] = df.loc[mask, col].str.strip() + + return df + + +class SpeciesMatcher(object): + def __init__(self, df): + self.species_df = pd.read_csv('data/species_attributes.csv') + # TODO (Remove this after Emily renames) + self.species_df = self.species_df.drop_duplicates('botanical_name') + self.synonym_df = SpeciesMatcher.generate_synonyms( + self.species_df.copy(), 'botanical_name', ['sm_botanical_name'], ['botanical_synonyms'] + ) + self.df = df + + @staticmethod + def generate_synonyms(df, base_column, regular_columns, json_columns): + synonyms = [] + for row in df.itertuples(): + row_synonyms = set() + base_name = getattr(row, base_column) + row_synonyms.add(base_name.lower()) + for column in regular_columns: + if not isinstance(getattr(row, column), float) and getattr(row, column): + row_synonyms.add(getattr(row, column).lower()) + for column in json_columns: + json_value = getattr(row, column) + if not isinstance(json_value, float): + try: + row_synonyms |= set([s.lower() for s in json.loads(json_value)]) + except json.JSONDecodeError: + row_synonyms.add(json_value.lower()) + synonyms.append(list(row_synonyms)) + + df['synonym'] = synonyms + return df.explode('synonym') + + def match(self): + df = self.df.assign(synonym=self.df['name_botanical'].str.lower()) + return pd.merge( + df, + self.synonym_df[['synonym', 'botanical_name']], + on='synonym' + ).drop(['name_botanical', 'synonym'], axis=1) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--datapath", required=True, type=str) + parser.add_argument("--host", required=False, type=str, default=None) args = parser.parse_args() data_parser = StilesDataParser(args.datapath) - data_parser.parse_all() + # df = data_parser.parse_all() + # df['location'] = gpd.GeoSeries(df['geometry']).apply(lambda x: Point(x.y, 
x.x)).to_wkt() + # df.to_csv('stiles.trees.csv', index=False) + # assert False + df = pd.read_csv('stiles.trees.csv') + matcher = SpeciesMatcher(df) + matched_df = matcher.match() + uploader = DBTreeUploader(getpass.getpass('Password...?')) + uploader.truncate_trees() + uploader.delete_species() + uploader.upload_species(matcher.species_df) + species_mapper = uploader.get_species_ids_mapper() + matched_df['species_id'] = matched_df['botanical_name'].map(species_mapper) + assert matched_df['species_id'].notnull().all() + uploader.upload_trees( + matched_df.assign(state='CA').rename( + columns={col: col.lower() for col in matched_df.columns} + ).rename(columns={'Species ID': 'species_id'}) + ) diff --git a/pruning_planting.py b/pruning_planting.py index c23a98a..7617442 100644 --- a/pruning_planting.py +++ b/pruning_planting.py @@ -17,8 +17,8 @@ def load_dataset(name, line_to_points=False): reproject it into WGS84, and return the geodataframe. """ # Load the street planting shape data, reprojecting into WGS84 - gdf = gpd.read_file(name, crs='+init=epsg:2229') - gdf = gdf.to_crs({'init': 'epsg:4326', 'no_defs': True}) + gdf = gpd.read_file(name, crs='2229') + gdf = gdf.to_crs(4326) if line_to_points: dfs = [] for row in gdf.itertuples(): @@ -134,7 +134,7 @@ def collapse_years(tree): trees = gpd.GeoDataFrame( trees, geometry=gpd.points_from_xy(trees['longitude'], trees['latitude']), - crs={'init': 'epsg:4326'} + crs=4326 ) trees["pruning_year"] = trees.apply(collapse_years, axis=1) diff --git a/requirements.txt b/requirements.txt index a3e3678..2c26196 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,8 @@ -geopandas==0.4.0 \ No newline at end of file +python-geohash +geopandas +pandas +rtree +cloud-sql-python-connector[pymysql] +pymysql +Pillow +google-cloud-storage diff --git a/upload_trees.py b/upload_trees.py new file mode 100644 index 0000000..b6695e0 --- /dev/null +++ b/upload_trees.py @@ -0,0 +1,229 @@ +import os +from typing import Dict +import json 
+import sys + +# import pymysql +# from google.cloud.sql.connector import connector +# import pandas as pd +import geopandas as gpd +import numpy as np +import pandas as pd +import pymysql.cursors +from google.cloud.sql.connector import connector + + +class DBTreeUploader(object): + + def __init__(self, password): + self.password = password + + def truncate_trees(self): + self._truncate_table('trees') + + def delete_species(self): + with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + conn.cursor().execute( + f""" + DELETE FROM species; + """ + ) + conn.cursor().execute( + f""" + ALTER TABLE species AUTO_INCREMENT = 1; + """ + ) + conn.commit() + + def _truncate_table(self, table): + with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + conn.cursor().execute( + f""" + TRUNCATE TABLE {table}; + """ + ) + conn.cursor().execute( + f""" + ALTER TABLE {table} AUTO_INCREMENT = 1; + """ + ) + conn.commit() + + def truncate_sm_trees(self): + with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + conn.cursor().execute( + """ + DELETE FROM trees + WHERE city = 'Santa Monica' + """ + ) + conn.commit() + + def get_species_ids_mapper(self) -> Dict[str, int]: + with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + cursor = conn.cursor(pymysql.cursors.DictCursor) + cursor.execute( + """ + SELECT id, botanical_name + FROM species + """ + ) + results = cursor.fetchall() + + return pd.DataFrame(results).set_index('botanical_name').to_dict()['id'] + + def upload_trees(self, df: pd.DataFrame, batch_size=100000): + with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + sql = """ + INSERT INTO trees( + tree_id, + species_id, + address, + state, + city, + tree_condition, + diameter_min_in, + diameter_max_in, + exact_diameter, + height_min_ft, + height_max_ft, + exact_height, + estimated_value, + location, + heritage, 
+ heritage_year, + heritage_number, + heritage_text + ) + VALUES ( + %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, ST_GeomFromText(%s, 4269), %s, %s, %s, %s + ) + """ + df['batch'] = np.random.randint(batch_size, size=len(df)) + df = df.where((pd.notnull(df)), None) + for _, batch_df in df.groupby('batch'): + conn.cursor().executemany( + sql, + [ + ( + int(row.tree_id) if row.tree_id is not None and row.tree_id != np.nan else None, + row.species_id, + row.address, + row.state, + row.city, + row.tree_condition if hasattr(row, 'tree_condition') else None, + row.diameter_min_in, + row.diameter_max_in, + row.exact_diameter if hasattr(row, 'exact_diameter') else None, + row.height_min_ft, + row.height_max_ft, + row.exact_height if hasattr(row, 'exact_height') else None, + row.estimated_value if hasattr(row, 'estimated_value') else None, + row.location, + row.heritage if hasattr(row, 'heritage') else False, + row.heritage_year if hasattr(row, 'heritage_year') else None, + row.heritage_number if hasattr(row, 'heritage_number') else None, + row.heritage_text if hasattr(row, 'heritage_text') else None + ) for row in batch_df.itertuples() + ] + ) + conn.commit() + + def upload_species(self, df: pd.DataFrame): + with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + sql = """ + INSERT INTO species( + botanical_name, + common_name, + family_botanical_name, + family_common_name, + native, + eol_id, + eol_overview_url, + simplified_iucn_status, + iucn_status, + iucn_doi_or_url, + shade_production, + form, + type, + cal_ipc_url, + irrigation_requirements, + species_id + ) + VALUES ( + %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s + ) + """ + df = df.where((pd.notnull(df)), None) + conn.cursor().executemany( + sql, + [ + ( + row.botanical_name, + row.common_name, + row.family_botanical_name, + row.family_common_name, + row.native, + int(row.eol_id) if row.eol_id is not None else None, + row.eol_overview_url, + 
row.simplified_iucn_status, + row.iucn_status, + row.iucn_doi_or_url, + row.shade_production, + row.form, + row.type, + row.cal_ipc_url, + row.irrigation_requirements, + row.species_id + ) for row in df.itertuples() + ] + ) + + conn.commit() + + +class SMTreeUploader(DBTreeUploader): + + def __init__(self): + super().__init__(None) + + def prepare_df(self, df): + df['location'] = gpd.GeoSeries(gpd.points_from_xy(df['longitude'], df['latitude'])).to_wkt() + self.df = df.rename(columns={ + 'heritageYear': 'heritage_year', + 'heritageNumber': 'heritage_number', + 'heritageText': 'heritage_text', + }) + + +if __name__ == "__main__": + + # Load the trees dataset. + json_lines = [] + for line in sys.stdin: + json_lines.append(json.loads(line)) + + df = pd.DataFrame(json_lines) + uploader = SMTreeUploader() + species_mapper = uploader.get_species_ids_mapper() + df['species_id'] = df['botanical_name'].map(species_mapper) + assert df['species_id'].notnull().all() + print(df, file=sys.stderr) + uploader.upload_trees(uploader.df) + + +class DBCursor(object): + def __init__(self, password): + self.connection = connector.connect( + instance_connection_string='total-ensign-336021:us-west1:public-tree-map', + driver='pymysql', + user='root', + password=password, + db='publictrees' + ) + + def __enter__(self): + return self.connection + + def __exit__(self, type, value, traceback): + self.connection.close() \ No newline at end of file From 223cf9ac5538bf406b550ad525cde909de300f3e Mon Sep 17 00:00:00 2001 From: Allen Tran Date: Wed, 29 Dec 2021 18:50:20 -0800 Subject: [PATCH 06/10] rm --- download_images.py | 1 - 1 file changed, 1 deletion(-) diff --git a/download_images.py b/download_images.py index f805124..34d6ad3 100644 --- a/download_images.py +++ b/download_images.py @@ -139,7 +139,6 @@ def get_trees_without_images(self): if __name__ == "__main__": img_download = ImageDownloader() - os.environ['TREE_DB_PASS'] = "hFitnGfIvD1COm24" trees_df, hashed_urls = 
img_download.get_trees_without_images() for row in trees_df.itertuples(): img_download.get_tree_images(row.id, row.eol_id, hashed_urls) From db63562a6ffb103585bdf45132ddc3f3caf2bbd1 Mon Sep 17 00:00:00 2001 From: Allen Tran Date: Sat, 8 Jan 2022 12:27:40 -0800 Subject: [PATCH 07/10] ETL and API via Google Cloud Run --- .gcloudignore | 4 ++ Dockerfile | 14 +---- Makefile | 5 -- cloudbuild.yaml | 7 +-- download_images.py | 51 ++++++++-------- etl_http_listener.py | 39 ++++++++++++ parse_la_data.py | 19 +++--- requirements.txt | 7 ++- sm_parser.py | 42 +++++++++++++ treeapi/main.py | 116 ++++++++++++++++++++++++++++------- upload_trees.py | 141 ++++++++++++++++++++++++++++--------------- 11 files changed, 312 insertions(+), 133 deletions(-) create mode 100644 etl_http_listener.py create mode 100644 sm_parser.py diff --git a/.gcloudignore b/.gcloudignore index 4a03439..5437b5d 100644 --- a/.gcloudignore +++ b/.gcloudignore @@ -9,3 +9,7 @@ __pycache__ *.pdf *.jpg stiles.trees.csv +node_modules/ +build/ +tmp/ +cloud_sql_proxy diff --git a/Dockerfile b/Dockerfile index a93a089..4754161 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,18 +11,6 @@ COPY . ./ RUN apt-get update -y \ && apt-get install -y build-essential curl -ENV NODE_VERSION 10.15.1 -RUN curl -o- https://raw.githubusercontent.com/creationix/nvm/v0.33.11/install.sh | bash -RUN . $HOME/.nvm/nvm.sh \ - && nvm install $NODE_VERSION \ - && nvm alias default $NODE_VERSION \ - && nvm use default \ - && npm install - RUN pip3 install --no-cache-dir -r requirements.txt -#RUN echo 'source $NVM_DIR/nvm.sh' >> $BASH_ENV -#RUN echo 'export PATH="$HOME/miniconda/bin:$PATH"' >> $BASH_ENV -#RUN echo 'source activate public-tree-map' >> $BASH_ENV - -RUN ["/bin/bash", "-c", ". 
$HOME/.nvm/nvm.sh && make release-gc"] \ No newline at end of file +CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 etl_http_listener:app \ No newline at end of file diff --git a/Makefile b/Makefile index e352057..35b6892 100644 --- a/Makefile +++ b/Makefile @@ -11,11 +11,6 @@ release: setup | node download-images.js \ | node split-trees.js build/data -release-gc: setup - curl 'https://data.smgov.net/resource/w8ue-6cnd.csv?$$limit=200' \ - | node parse-trees.js \ - | python3 upload_trees.py - python3 download_images.py # Runs the pipeline using local data, but skips the CPU-intensive python tasks img-test: setup diff --git a/cloudbuild.yaml b/cloudbuild.yaml index cd51f85..5062922 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -1,9 +1,4 @@ steps: - name: 'gcr.io/cloud-builders/docker' entrypoint: 'bash' - secretEnv: ['TREE_DB_PASS'] - args: ['-c', 'docker build -t us-west1-docker.pkg.dev/$PROJECT_ID/cloud-run-source-deploy/etl-image:prod .'] -availableSecrets: - secretManager: - - versionName: projects/$PROJECT_ID/secrets/trees-db-password/versions/latest - env: TREE_DB_PASS \ No newline at end of file + args: ['-c', 'docker build -t us-west1-docker.pkg.dev/$PROJECT_ID/cloud-run-source-deploy/etl-image:prod .'] \ No newline at end of file diff --git a/download_images.py b/download_images.py index 34d6ad3..b3c260e 100644 --- a/download_images.py +++ b/download_images.py @@ -19,7 +19,6 @@ class TreeImage: hashed_url: str img_type: str description: str - stored_url: Optional[str] image: Optional author: Optional[str] author_url: Optional[str] @@ -33,12 +32,12 @@ def __init__(self): self.bucket = storage.Client().bucket('public-tree-map-images') def insert_tree_into_db(self, tree: TreeImage): - with DBCursor(os.environ['TREE_DB_PASS']) as conn: + with DBCursor() as conn: cursor = conn.cursor(pymysql.cursors.DictCursor) cursor.execute( """ INSERT INTO images( - stored_url, + extension, original_url, details, hashed_original_url, @@ -49,7 +48,7 
@@ def insert_tree_into_db(self, tree: TreeImage): VALUES (%s, %s, %s, %s, %s, %s, %s) """, ( - tree.stored_url, + tree.img_type, tree.retrieval_url, tree.description if tree.description else None, tree.hashed_url, @@ -61,20 +60,22 @@ def insert_tree_into_db(self, tree: TreeImage): conn.commit() def get_and_upload_image(self, tree_image: TreeImage): - r = requests.get(tree_image.retrieval_url) - if r.ok: - img = Image.open(BytesIO(r.content)) - img.thumbnail(self.MAX_SIZE) - tree_image.image = img - image_key = f'{tree_image.hashed_url}.{tree_image.img_type}' - blob = self.bucket.blob(image_key) - blob.upload_from_string( - r.content, - content_type=r.headers['Content-Type'] - ) - tree_image.stored_url = f'https://storage.googleapis.com/public-tree-map-images/{image_key}' - self.insert_tree_into_db(tree_image) - return tree_image + image_key = f'{tree_image.hashed_url}.{tree_image.img_type}' + blob = self.bucket.blob(image_key) + if not blob.exists(): + r = requests.get(tree_image.retrieval_url) + if r.ok: + img = Image.open(BytesIO(r.content)) + img.thumbnail(self.MAX_SIZE) + tree_image.image = img + with BytesIO(img.tobytes()) as f: + blob.upload_from_file( + f, + content_type=r.headers['Content-Type'] + ) + + self.insert_tree_into_db(tree_image) + return tree_image def get_tree_images(self, tree_id, eol_id, existing_images: Set[str]): @@ -85,7 +86,7 @@ def get_tree_images(self, tree_id, eol_id, existing_images: Set[str]): images_to_retrieve = [] if r.ok: request_body = r.json() - data_objects = request_body['taxonConcept']['dataObjects'] + data_objects = request_body['taxonConcept'].get('dataObjects') if data_objects: for data_object in data_objects: hashed_url = hashlib.md5(f"{data_object['eolMediaURL']}{os.environ['TREE_SALT']}".encode('utf-8')).hexdigest() @@ -96,11 +97,10 @@ def get_tree_images(self, tree_id, eol_id, existing_images: Set[str]): data_object['eolMediaURL'], hashed_url, data_object['dataSubtype'], - data_object['description'], - None, + 
data_object['description'] if 'description' in data_object else None, None, - data_object['rightsHolder'].strip() if data_object['rightsHolder'] else None, - f'https://eol.org/pages/${eol_id}/media' + data_object['rightsHolder'].strip() if 'rightsHolder' in data_object else None, + f'https://eol.org/pages/{int(eol_id)}/media' ) ) @@ -111,7 +111,7 @@ def get_tree_images(self, tree_id, eol_id, existing_images: Set[str]): existing_images.add(uploaded_tree.hashed_url) def get_trees_without_images(self): - with DBCursor(os.environ['TREE_DB_PASS']) as conn: + with DBCursor() as conn: cursor = conn.cursor(pymysql.cursors.DictCursor) cursor.execute( """ @@ -140,5 +140,6 @@ def get_trees_without_images(self): if __name__ == "__main__": img_download = ImageDownloader() trees_df, hashed_urls = img_download.get_trees_without_images() - for row in trees_df.itertuples(): + for idx, row in enumerate(trees_df.itertuples()): + print(f'{idx}/{len(trees_df)}') img_download.get_tree_images(row.id, row.eol_id, hashed_urls) diff --git a/etl_http_listener.py b/etl_http_listener.py new file mode 100644 index 0000000..61db80e --- /dev/null +++ b/etl_http_listener.py @@ -0,0 +1,39 @@ +import os + +import pandas as pd +from flask import Flask +from sm_parser import parse_trees +import upload_trees +import parse_la_data +import download_images + +app = Flask(__name__) + + +def download_tree_images(): + img_download = download_images.ImageDownloader() + trees_df, hashed_urls = img_download.get_trees_without_images() + for idx, row in enumerate(trees_df.itertuples()): + img_download.get_tree_images(row.id, row.eol_id, hashed_urls) + + +@app.route("/") +def upload_sm_trees(): + df = pd.read_csv('https://data.smgov.net/resource/w8ue-6cnd.csv?$limit=50000') + df = parse_trees(df=df, stdout=False) + matcher = parse_la_data.SpeciesMatcher(df) + matched_df = matcher.match(how='left').drop('species_id', axis=1) + uploader = upload_trees.SMTreeUploader() + species_mapper = 
uploader.get_species_ids_mapper() + matched_df['species_id'] = matched_df['botanical_name'].map(species_mapper) + uploader.truncate_sm_trees() + uploader.prepare_df(matched_df) + uploader.upload_trees(uploader.df, batch_size=0) + + download_tree_images() + + return 'SUCCESS' + + +if __name__ == "__main__": + app.run(debug=False, host="0.0.0.0", port=int(os.environ.get("PORT", 8080))) \ No newline at end of file diff --git a/parse_la_data.py b/parse_la_data.py index 0e62445..1dcb4b5 100644 --- a/parse_la_data.py +++ b/parse_la_data.py @@ -4,12 +4,11 @@ from pathlib import Path from typing import List, Set -import numpy as np import pandas as pd import geopandas as gpd from shapely.geometry import Point -from upload_trees import DBTreeUploader, DBCursor +from upload_trees import DBTreeUploader class CityParser(object): @@ -391,11 +390,12 @@ def generate_synonyms(df, base_column, regular_columns, json_columns): df['synonym'] = synonyms return df.explode('synonym') - def match(self): + def match(self, how='inner'): df = self.df.assign(synonym=self.df['name_botanical'].str.lower()) return pd.merge( df, self.synonym_df[['synonym', 'botanical_name']], + how=how, on='synonym' ).drop(['name_botanical', 'synonym'], axis=1) @@ -414,15 +414,18 @@ def match(self): df = pd.read_csv('stiles.trees.csv') matcher = SpeciesMatcher(df) matched_df = matcher.match() - uploader = DBTreeUploader(getpass.getpass('Password...?')) + uploader = DBTreeUploader() uploader.truncate_trees() - uploader.delete_species() - uploader.upload_species(matcher.species_df) + uploader.update_species( + matcher.species_df.rename(columns={'Species ID': 'species_id'}).rename( + columns={col: col.lower() for col in matcher.species_df.columns} + ) + ) species_mapper = uploader.get_species_ids_mapper() matched_df['species_id'] = matched_df['botanical_name'].map(species_mapper) assert matched_df['species_id'].notnull().all() uploader.upload_trees( - matched_df.assign(state='CA').rename( + 
matched_df.assign(state='CA').rename(columns={'Species ID': 'species_id'}).rename( columns={col: col.lower() for col in matched_df.columns} - ).rename(columns={'Species ID': 'species_id'}) + ) ) diff --git a/requirements.txt b/requirements.txt index 2c26196..2b9d1df 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,9 @@ -python-geohash geopandas -pandas +pandas==1.1.5 +Flask==2.0.2 +gunicorn==20.1.0 rtree -cloud-sql-python-connector[pymysql] pymysql Pillow google-cloud-storage +cloud-sql-python-connector[pymysql] diff --git a/sm_parser.py b/sm_parser.py new file mode 100644 index 0000000..3bef796 --- /dev/null +++ b/sm_parser.py @@ -0,0 +1,42 @@ +import sys +from io import StringIO + +import pandas as pd + + +def parse_trees(df=None, stdout=False): + heritage_trees = pd.read_csv('data/heritage_trees.csv') + if df is None: + df = pd.read_csv(StringIO(''.join(sys.stdin.readlines()))).rename(columns={'Tree ID': 'tree_id'}) + else: + df = df.rename(columns={'Tree ID': 'tree_id'}) + df = pd.merge(df, heritage_trees, how='left', on='tree_id').rename( + columns={ + 'Species ID': 'species_id', + 'Name Botanical': 'name_botanical', + 'Height Min': 'height_min_ft', + 'Height Max': 'height_max_ft', + 'DBH Min': 'diameter_min_in', + 'DBH Max': 'diameter_max_in', + 'Latitude': 'latitude', + 'Longitude': 'longitude', + 'Location Description': 'location_description', + 'year_added': 'heritageYear', + 'heritage_number': 'heritageNumber', + 'text': 'heritageText' + } + ) + df = df.assign( + address=df['Address'].astype(str).str.cat(df['Street'], sep=' '), + city='Santa Monica', + state='CA', + heritage=df['heritageNumber'].notnull() + ) + if stdout: + print(df.to_json(orient='records'), file=sys.stdout) + else: + return df + + +if __name__ == "__main__": + parse_trees(True) \ No newline at end of file diff --git a/treeapi/main.py b/treeapi/main.py index 57cebc6..430b897 100644 --- a/treeapi/main.py +++ b/treeapi/main.py @@ -1,4 +1,5 @@ import os +import json import 
pymysql from google.cloud.sql.connector import connector @@ -8,14 +9,22 @@ class DBConn(object): - def __init__(self, password): - self.connection = connector.connect( - instance_connection_string='total-ensign-336021:us-west1:public-tree-map', - driver='pymysql', - user='root', - password=password, - db='publictrees' - ) + def __init__(self, password, local=False): + if local: + self.connection = pymysql.connect( + host='localhost', + password=password, + db='publictrees', + user='root' + ) + else: + self.connection = connector.connect( + instance_connection_string=os.environ['TREE_DB_CONNECTION_STR'], + driver='pymysql', + user='root', + password=password, + db='publictrees' + ) def __enter__(self): return self.connection @@ -52,34 +61,92 @@ async def get_random_tree(): return cursor.fetchall() -@app.get("/trees/") -async def get_tree(lat1, lng1, lat2, lng2, lat3, lng3, lat4, lng4): - lats = [lat1, lat2, lat3, lat4] - lngs = [lng1, lng2, lng3, lng4] - lat_lngs = ' '.join(zip(lats, lngs)) - csv = ','.join(lat_lngs) - polygon_str = f'POLYGON(({csv}, {lat_lngs[0]}))' +@app.get("/tree/{tree_id}") +async def get_tree(tree_id): sql = f""" - SELECT - tree_id, - name_common, - name_botanical, + SELECT + botanical_name AS name_botanical, + common_name AS name_common, + family_botanical_name AS family_name_botanical, + family_common_name AS family_name_common, address, city, + state, diameter_min_in, diameter_max_in, exact_diameter, height_min_ft, height_max_ft, exact_height, + native AS nativity, estimated_value, tree_condition, - ST_LATITUDE(MY_POINT) AS latitude, - ST_LONGITUDE(location) AS longitude - FROM trees + shade_production, + irrigation_requirements, + form, + type, + iucn_status, + iucn_doi_or_url, + ST_LATITUDE(location) AS latitude, + ST_LONGITUDE(location) AS longitude, + heritage, + heritage_number AS heritageNumber, + heritage_text AS heritageText, + heritage_year AS heritageYear, + JSON_ARRAYAGG( + JSON_OBJECT( + 'url', + 
CONCAT('https://storage.googleapis.com/public-tree-map-images/', hashed_original_url, '.', + extension), + 'author', JSON_OBJECT( + 'name', author, + 'url', author_url + ) + ) + ) AS images + FROM trees T + INNER JOIN species s on T.species_id = s.id + INNER JOIN images i on s.id = i.species_id + WHERE + T.id = %s + GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 + """ + with DBConn(os.environ['TREE_DB_PASS']) as conn: + cursor = conn.cursor(pymysql.cursors.DictCursor) + cursor.execute(sql, tree_id) + result = cursor.fetchone() + + if result: + result['images'] = json.loads(result['images']) + return result + + +@app.get("/trees/") +async def get_trees(lat1, lng1, lat2, lng2, lat3, lng3, lat4, lng4): + lats = [lat1, lat2, lat3, lat4] + lngs = [lng1, lng2, lng3, lng4] + lat_lngs = [] + for lat, lng in zip(lats, lngs): + lat_lngs.append(f'{lat} {lng}') + csv = ','.join(lat_lngs) + polygon_str = f'POLYGON(({csv}, {lat_lngs[0]}))' + sql = f""" + SELECT + T.id AS tree_id, + botanical_name AS name_botanical, + common_name AS name_common, + family_botanical_name AS family_name_botanical, + family_common_name AS family_name_common, + iucn_status, + native AS nativity, + ST_LATITUDE(location) AS latitude, + ST_LONGITUDE(location) AS longitude, + heritage + FROM trees T + INNER JOIN species s on T.species_id = s.id WHERE MBRContains( - ST_GeomFromText('%s'), + ST_GeomFromText(%s, 4269), location ) """ @@ -88,4 +155,7 @@ async def get_tree(lat1, lng1, lat2, lng2, lat3, lng3, lat4, lng4): cursor.execute(sql, polygon_str) results = cursor.fetchall() + if results: + for tree in results: + tree['heritage'] = True if tree['heritage'] else False return results \ No newline at end of file diff --git a/upload_trees.py b/upload_trees.py index b6695e0..0e94fa1 100644 --- a/upload_trees.py +++ b/upload_trees.py @@ -3,9 +3,7 @@ import json import sys -# import pymysql -# from google.cloud.sql.connector import connector -# import 
pandas as pd +import pymysql import geopandas as gpd import numpy as np import pandas as pd @@ -13,16 +11,30 @@ from google.cloud.sql.connector import connector -class DBTreeUploader(object): +class DBCursor(object): + def __init__(self, password=None): + self.connection = connector.connect( + os.environ['TREE_DB_CONNECTION_STR'], + 'pymysql', + user='root', + password=password if password else os.environ['TREE_DB_PASS'], + db='publictrees' + ) - def __init__(self, password): - self.password = password + def __enter__(self): + return self.connection + + def __exit__(self, type, value, traceback): + self.connection.close() + + +class DBTreeUploader(object): def truncate_trees(self): self._truncate_table('trees') def delete_species(self): - with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + with DBCursor() as conn: conn.cursor().execute( f""" DELETE FROM species; @@ -36,7 +48,7 @@ def delete_species(self): conn.commit() def _truncate_table(self, table): - with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + with DBCursor() as conn: conn.cursor().execute( f""" TRUNCATE TABLE {table}; @@ -50,7 +62,7 @@ def _truncate_table(self, table): conn.commit() def truncate_sm_trees(self): - with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + with DBCursor() as conn: conn.cursor().execute( """ DELETE FROM trees @@ -60,7 +72,7 @@ def truncate_sm_trees(self): conn.commit() def get_species_ids_mapper(self) -> Dict[str, int]: - with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + with DBCursor() as conn: cursor = conn.cursor(pymysql.cursors.DictCursor) cursor.execute( """ @@ -73,7 +85,7 @@ def get_species_ids_mapper(self) -> Dict[str, int]: return pd.DataFrame(results).set_index('botanical_name').to_dict()['id'] def upload_trees(self, df: pd.DataFrame, batch_size=100000): - with DBCursor(self.password if self.password else 
os.environ['TREE_DB_PASS']) as conn: + with DBCursor() as conn: sql = """ INSERT INTO trees( tree_id, @@ -99,7 +111,10 @@ def upload_trees(self, df: pd.DataFrame, batch_size=100000): %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, ST_GeomFromText(%s, 4269), %s, %s, %s, %s ) """ - df['batch'] = np.random.randint(batch_size, size=len(df)) + if batch_size == 0: + df['batch'] = 0 + else: + df['batch'] = np.random.randint(int(len(df) / batch_size), size=len(df)) df = df.where((pd.notnull(df)), None) for _, batch_df in df.groupby('batch'): conn.cursor().executemany( @@ -129,8 +144,67 @@ def upload_trees(self, df: pd.DataFrame, batch_size=100000): ) conn.commit() - def upload_species(self, df: pd.DataFrame): - with DBCursor(self.password if self.password else os.environ['TREE_DB_PASS']) as conn: + def update_species(self, df): + with DBCursor() as conn: + cursor = conn.cursor(pymysql.cursors.DictCursor) + cursor.execute( + """ + SELECT botanical_name FROM species + """ + ) + botanical_names = set([row['botanical_name'] for row in cursor.fetchall()]) + update_df = df[df['botanical_name'].isin(botanical_names)] + cursor.executemany( + """ + UPDATE species + SET + common_name = %s, + family_botanical_name = %s, + family_common_name = %s, + native = %s, + eol_id = %s, + eol_overview_url = %s, + simplified_iucn_status = %s, + iucn_status = %s, + iucn_doi_or_url = %s, + shade_production = %s, + form = %s, + type = %s, + cal_ipc_url = %s, + irrigation_requirements = %s, + species_id = %s + WHERE + botanical_name = %s + """, + [ + ( + row.common_name, + row.family_botanical_name, + row.family_common_name, + row.native, + int(row.eol_id) if row.eol_id is not None else None, + row.eol_overview_url, + row.simplified_iucn_status, + row.iucn_status, + row.iucn_doi_or_url, + row.shade_production, + row.form, + row.type, + row.cal_ipc_url, + row.irrigation_requirements, + row.species_id, + row.botanical_name, + ) for row in update_df.itertuples() + ] + + ) + + write_df = 
df[~df['botanical_name'].isin(botanical_names)] + self.upload_species(write_df) + + @staticmethod + def upload_species(df: pd.DataFrame): + with DBCursor() as conn: sql = """ INSERT INTO species( botanical_name, @@ -185,45 +259,12 @@ def upload_species(self, df: pd.DataFrame): class SMTreeUploader(DBTreeUploader): def __init__(self): - super().__init__(None) + super().__init__() def prepare_df(self, df): - df['location'] = gpd.GeoSeries(gpd.points_from_xy(df['longitude'], df['latitude'])).to_wkt() + df['location'] = gpd.GeoSeries(gpd.points_from_xy(df['latitude'], df['longitude'])).to_wkt() self.df = df.rename(columns={ 'heritageYear': 'heritage_year', 'heritageNumber': 'heritage_number', 'heritageText': 'heritage_text', - }) - - -if __name__ == "__main__": - - # Load the trees dataset. - json_lines = [] - for line in sys.stdin: - json_lines.append(json.loads(line)) - - df = pd.DataFrame(json_lines) - uploader = SMTreeUploader() - species_mapper = uploader.get_species_ids_mapper() - df['species_id'] = df['botanical_name'].map(species_mapper) - assert df['species_id'].notnull().all() - print(df, file=sys.stderr) - uploader.upload_trees(uploader.df) - - -class DBCursor(object): - def __init__(self, password): - self.connection = connector.connect( - instance_connection_string='total-ensign-336021:us-west1:public-tree-map', - driver='pymysql', - user='root', - password=password, - db='publictrees' - ) - - def __enter__(self): - return self.connection - - def __exit__(self, type, value, traceback): - self.connection.close() \ No newline at end of file + }) \ No newline at end of file From 5a08dfc49b8a339f8e8cee565f8c52f6b889c141 Mon Sep 17 00:00:00 2001 From: allen Date: Sat, 11 Jun 2022 14:40:43 -0700 Subject: [PATCH 08/10] use cloud sql proxy and sockets for access --- treeapi/Dockerfile | 2 +- treeapi/main.py | 93 +++++++++++++++++----------------------- treeapi/requirements.txt | 4 +- 3 files changed, 43 insertions(+), 56 deletions(-) diff --git 
a/treeapi/Dockerfile b/treeapi/Dockerfile index d49d5d7..7bc29ba 100644 --- a/treeapi/Dockerfile +++ b/treeapi/Dockerfile @@ -22,4 +22,4 @@ RUN pip3 install --no-cache-dir -r requirements.txt # For environments with multiple CPU cores, increase the number of workers # to be equal to the cores available. # Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling. -CMD exec gunicorn --bind :$PORT --workers 1 --worker-class uvicorn.workers.UvicornWorker --threads 8 main:app +CMD exec gunicorn --bind :$PORT --workers 1 --worker-class uvicorn.workers.UvicornWorker --threads 8 --timeout 0 main:app diff --git a/treeapi/main.py b/treeapi/main.py index 430b897..ae9f2f0 100644 --- a/treeapi/main.py +++ b/treeapi/main.py @@ -1,64 +1,52 @@ import os import json -import pymysql -from google.cloud.sql.connector import connector +import sqlalchemy from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware app = FastAPI() +origins = [ + "*", + "http://localhost", + "http://localhost:8080", +] +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) -class DBConn(object): - def __init__(self, password, local=False): - if local: - self.connection = pymysql.connect( - host='localhost', - password=password, - db='publictrees', - user='root' - ) - else: - self.connection = connector.connect( - instance_connection_string=os.environ['TREE_DB_CONNECTION_STR'], - driver='pymysql', - user='root', - password=password, - db='publictrees' - ) - def __enter__(self): - return self.connection - - def __exit__(self, type, value, traceback): - self.connection.close() +def init_connection_engine(local=False): + prepend_str = '/home/allen/Downloads' if local else '' + return sqlalchemy.create_engine( + sqlalchemy.engine.url.URL.create( + drivername="mysql+pymysql", + username='root', + password=os.environ['TREE_DB_PASS'], + database="publictrees", + 
query={ + "unix_socket": f"{prepend_str}/cloudsql/{os.environ['TREE_DB_CONNECTION_STR']}" + } + ), + ) @app.get("/random/") async def get_random_tree(): sql = f""" SELECT - tree_id, - name_common, - name_botanical, - address, - city, - diameter_min_in, - diameter_max_in, - exact_diameter, - height_min_ft, - height_max_ft, - exact_height, - estimated_value, - tree_condition, - ST_LATITUDE(location) AS latitude, - ST_LONGITUDE(location) AS longitude + ST_LATITUDE(location) AS lat, + ST_LONGITUDE(location) AS lng FROM trees - LIMIT 1 + WHERE rand() <= 0.1 """ - with DBConn(os.environ['TREE_DB_PASS']) as conn: - cursor = conn.cursor(pymysql.cursors.DictCursor) - cursor.execute(sql) - return cursor.fetchall() + with init_connection_engine().connect() as conn: + return conn.execute(sql).mappings().all() @app.get("/tree/{tree_id}") @@ -111,10 +99,9 @@ async def get_tree(tree_id): T.id = %s GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 """ - with DBConn(os.environ['TREE_DB_PASS']) as conn: - cursor = conn.cursor(pymysql.cursors.DictCursor) - cursor.execute(sql, tree_id) - result = cursor.fetchone() + with init_connection_engine() as conn: + resultset = conn.execute(sql, tree_id).mappings() + result = resultset.fetchone() if result: result['images'] = json.loads(result['images']) @@ -150,12 +137,12 @@ async def get_trees(lat1, lng1, lat2, lng2, lat3, lng3, lat4, lng4): location ) """ - with DBConn(os.environ['TREE_DB_PASS']) as conn: - cursor = conn.cursor(pymysql.cursors.DictCursor) - cursor.execute(sql, polygon_str) - results = cursor.fetchall() + with init_connection_engine() as conn: + resultset = conn.execute(sql, polygon_str).mappings() + results = resultset.fetchall() if results: for tree in results: tree['heritage'] = True if tree['heritage'] else False - return results \ No newline at end of file + return results + diff --git a/treeapi/requirements.txt b/treeapi/requirements.txt index 48ebc1a..d557fe3 
100644 --- a/treeapi/requirements.txt +++ b/treeapi/requirements.txt @@ -1,5 +1,5 @@ fastapi uvicorn[standard] gunicorn -cloud-sql-python-connector[pymysql] -pymysql \ No newline at end of file +pymysql +sqlalchemy \ No newline at end of file From ea6c83831157ef22f78873a5b01356a1a495cf60 Mon Sep 17 00:00:00 2001 From: Allen Tran Date: Sat, 4 Nov 2023 18:51:20 -0700 Subject: [PATCH 09/10] readme, sql fixes for front-end (nulls from inner joins, return tree id) --- treeapi/README.md | 15 +++++++++++ treeapi/main.py | 64 +++++++++++++++++++++++++++++++---------------- 2 files changed, 57 insertions(+), 22 deletions(-) create mode 100644 treeapi/README.md diff --git a/treeapi/README.md b/treeapi/README.md new file mode 100644 index 0000000..8a55aeb --- /dev/null +++ b/treeapi/README.md @@ -0,0 +1,15 @@ +# Database Connection + +You need to set the environment variables +```angular2html +EXPORT TREE_DB_PASS="password" +EXPORT TREE_DB_CONNECTION_STR="lively-sentry-336718:us-west1:public-tree-map-db" +``` + +Also make sure in `main.py` that `LOCAL=True` for local development. + + +Assuming the database instance is running, you need to run `cloud-sql-proxy` which uses unix-sockets. 
+```angular2html +./cloud-sql-proxy --unix-socket ~/cloudsql --credentials-file lively-sentry-336718-fc01c1868439.json lively-sentry-336718:us-west1:public-tree-map-db +``` \ No newline at end of file diff --git a/treeapi/main.py b/treeapi/main.py index ae9f2f0..55e529b 100644 --- a/treeapi/main.py +++ b/treeapi/main.py @@ -2,7 +2,7 @@ import json import sqlalchemy -from fastapi import FastAPI +from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware app = FastAPI() @@ -12,6 +12,8 @@ "http://localhost:8080", ] +LOCAL = True + app.add_middleware( CORSMiddleware, allow_origins=origins, @@ -20,13 +22,14 @@ allow_headers=["*"], ) +random_tree_cache = {} def init_connection_engine(local=False): - prepend_str = '/home/allen/Downloads' if local else '' + prepend_str = '/Users/allent/' if local else '' return sqlalchemy.create_engine( sqlalchemy.engine.url.URL.create( drivername="mysql+pymysql", - username='root', + username='root' if local else os.environ['TREE_DB_USER'], password=os.environ['TREE_DB_PASS'], database="publictrees", query={ @@ -37,16 +40,25 @@ def init_connection_engine(local=False): @app.get("/random/") -async def get_random_tree(): - sql = f""" - SELECT - ST_LATITUDE(location) AS lat, - ST_LONGITUDE(location) AS lng - FROM trees - WHERE rand() <= 0.1 - """ - with init_connection_engine().connect() as conn: - return conn.execute(sql).mappings().all() +async def get_random_tree(request: Request): + ip_address = request.client.host + ip_hash = sum([int(x) for x in ip_address if x.isdigit()]) % 11 + if ip_hash in random_tree_cache: + return random_tree_cache[ip_hash] + else: + sql = f""" + SELECT + ST_LATITUDE(location) AS lat, + ST_LONGITUDE(location) AS lng + FROM trees + WHERE id % 11 = :ip_hash + """ + with init_connection_engine(LOCAL).connect() as conn: + random_tree_cache[ip_hash] = conn.execute( + sqlalchemy.text(sql), + {'ip_hash': ip_hash} + ).mappings().all() + return random_tree_cache[ip_hash] 
@app.get("/tree/{tree_id}") @@ -91,19 +103,21 @@ async def get_tree(tree_id): 'url', author_url ) ) - ) AS images + ) AS images, + T.id AS tree_id FROM trees T INNER JOIN species s on T.species_id = s.id - INNER JOIN images i on s.id = i.species_id + LEFT JOIN images i on s.id = i.species_id WHERE - T.id = %s + T.id = :tree_id GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 """ - with init_connection_engine() as conn: - resultset = conn.execute(sql, tree_id).mappings() + with init_connection_engine(LOCAL).connect() as conn: + resultset = conn.execute(sqlalchemy.text(sql), {'tree_id': tree_id}).mappings() result = resultset.fetchone() if result: + result = dict(result) result['images'] = json.loads(result['images']) return result @@ -133,15 +147,21 @@ async def get_trees(lat1, lng1, lat2, lng2, lat3, lng3, lat4, lng4): INNER JOIN species s on T.species_id = s.id WHERE MBRContains( - ST_GeomFromText(%s, 4269), + ST_GeomFromText(:polygon, 4269), location - ) + ) AND + T.id IS NOT NULL """ - with init_connection_engine() as conn: - resultset = conn.execute(sql, polygon_str).mappings() + + with init_connection_engine(LOCAL).connect() as conn: + resultset = conn.execute( + sqlalchemy.text(sql), + {'polygon': polygon_str} + ).mappings() results = resultset.fetchall() if results: + results = [dict(r) for r in results] for tree in results: tree['heritage'] = True if tree['heritage'] else False return results From e532b2a68ccb3e4816cee367640d08b0f016f87c Mon Sep 17 00:00:00 2001 From: Allen Tran Date: Sat, 18 Nov 2023 17:46:52 -0800 Subject: [PATCH 10/10] use common db access point --- treeapi/db_conn.py | 18 +++++ .../download_images.py | 70 ++++++++----------- treeapi/main.py | 17 +---- upload_trees.py | 4 +- 4 files changed, 53 insertions(+), 56 deletions(-) create mode 100644 treeapi/db_conn.py rename download_images.py => treeapi/download_images.py (72%) diff --git a/treeapi/db_conn.py b/treeapi/db_conn.py new 
file mode 100644 index 0000000..af129c9 --- /dev/null +++ b/treeapi/db_conn.py @@ -0,0 +1,18 @@ +import os + +import sqlalchemy + + +def init_connection_engine(local=False): + prepend_str = '/Users/allent/' if local else '' + return sqlalchemy.create_engine( + sqlalchemy.engine.url.URL.create( + drivername="mysql+pymysql", + username='root' if local else os.environ['TREE_DB_USER'], + password=os.environ['TREE_DB_PASS'], + database="publictrees", + query={ + "unix_socket": f"{prepend_str}/cloudsql/{os.environ['TREE_DB_CONNECTION_STR']}" + } + ), + ) diff --git a/download_images.py b/treeapi/download_images.py similarity index 72% rename from download_images.py rename to treeapi/download_images.py index b3c260e..e113be5 100644 --- a/download_images.py +++ b/treeapi/download_images.py @@ -8,8 +8,9 @@ from io import BytesIO import requests import pandas as pd -import pymysql.cursors -from upload_trees import DBCursor +import sqlalchemy + +from db_conn import init_connection_engine @dataclasses.dataclass @@ -27,36 +28,25 @@ class TreeImage: class ImageDownloader(object): MAX_SIZE = (1024, 1024) - def __init__(self): + def __init__(self, local): self.bucket = storage.Client().bucket('public-tree-map-images') + self.local = local def insert_tree_into_db(self, tree: TreeImage): - with DBCursor() as conn: - cursor = conn.cursor(pymysql.cursors.DictCursor) - cursor.execute( - """ - INSERT INTO images( - extension, - original_url, - details, - hashed_original_url, - species_id, - author, - author_url - ) - VALUES (%s, %s, %s, %s, %s, %s, %s) - """, - ( - tree.img_type, - tree.retrieval_url, - tree.description if tree.description else None, - tree.hashed_url, - tree.species_id, - tree.author if tree.author else None, - tree.author_url if tree.author_url else None - ) + with init_connection_engine(self.local).connect() as conn: + table = sqlalchemy.Table('images', sqlalchemy.MetaData(), autoload_with=conn) + stmt = sqlalchemy.insert(table).values( + extension=tree.img_type, + 
original_url=tree.retrieval_url, + details=tree.description if tree.description else None, + hashed_original_url=tree.hashed_url, + species_id=tree.species_id, + author=tree.author if tree.author else None, + author_url=tree.author_url if tree.author_url else None ) + + conn.execute(stmt) conn.commit() def get_and_upload_image(self, tree_image: TreeImage): @@ -68,11 +58,13 @@ def get_and_upload_image(self, tree_image: TreeImage): img = Image.open(BytesIO(r.content)) img.thumbnail(self.MAX_SIZE) tree_image.image = img - with BytesIO(img.tobytes()) as f: - blob.upload_from_file( - f, - content_type=r.headers['Content-Type'] - ) + byte_stream = BytesIO() + img.save(byte_stream, format=img.format) + byte_stream.seek(0) + blob.upload_from_file( + byte_stream, + content_type=r.headers['Content-Type'] + ) self.insert_tree_into_db(tree_image) return tree_image @@ -111,9 +103,8 @@ def get_tree_images(self, tree_id, eol_id, existing_images: Set[str]): existing_images.add(uploaded_tree.hashed_url) def get_trees_without_images(self): - with DBCursor() as conn: - cursor = conn.cursor(pymysql.cursors.DictCursor) - cursor.execute( + with init_connection_engine(self.local).connect() as conn: + sql = sqlalchemy.text( """ SELECT S.id, @@ -125,20 +116,21 @@ def get_trees_without_images(self): HAVING cnt < 3 """ ) - tree_results = cursor.fetchall() - cursor.execute( + tree_results = conn.execute(sql).mappings().fetchall() + sql = sqlalchemy.text( """ SELECT hashed_original_url FROM images """ ) - image_results = set([row['hashed_original_url'] for row in cursor.fetchall()]) + result_set = conn.execute(sql).mappings() + image_results = set([row['hashed_original_url'] for row in result_set.fetchall()]) return pd.DataFrame(tree_results), image_results if __name__ == "__main__": - img_download = ImageDownloader() + img_download = ImageDownloader(local=True) trees_df, hashed_urls = img_download.get_trees_without_images() for idx, row in enumerate(trees_df.itertuples()): 
print(f'{idx}/{len(trees_df)}') diff --git a/treeapi/main.py b/treeapi/main.py index 55e529b..5813022 100644 --- a/treeapi/main.py +++ b/treeapi/main.py @@ -1,10 +1,11 @@ -import os import json import sqlalchemy from fastapi import FastAPI, Request from fastapi.middleware.cors import CORSMiddleware +from db_conn import init_connection_engine + app = FastAPI() origins = [ "*", @@ -24,20 +25,6 @@ random_tree_cache = {} -def init_connection_engine(local=False): - prepend_str = '/Users/allent/' if local else '' - return sqlalchemy.create_engine( - sqlalchemy.engine.url.URL.create( - drivername="mysql+pymysql", - username='root' if local else os.environ['TREE_DB_USER'], - password=os.environ['TREE_DB_PASS'], - database="publictrees", - query={ - "unix_socket": f"{prepend_str}/cloudsql/{os.environ['TREE_DB_CONNECTION_STR']}" - } - ), - ) - @app.get("/random/") async def get_random_tree(request: Request): diff --git a/upload_trees.py b/upload_trees.py index 0e94fa1..642bc77 100644 --- a/upload_trees.py +++ b/upload_trees.py @@ -8,12 +8,12 @@ import numpy as np import pandas as pd import pymysql.cursors -from google.cloud.sql.connector import connector +from google.cloud.sql.connector import Connector class DBCursor(object): def __init__(self, password=None): - self.connection = connector.connect( + self.connection = Connector().connect( os.environ['TREE_DB_CONNECTION_STR'], 'pymysql', user='root',