From 794ec3a09284e927ac4cd11a48713c4c9cc79a3a Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Thu, 5 Dec 2019 16:17:11 -0700 Subject: [PATCH 01/33] [DEV-4015] initial progress --- .../matviews/functions_and_enums.sql | 31 +++++++++++++++++++ .../helpers/download_annotation_functions.py | 22 +++++++++---- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/usaspending_api/database_scripts/matviews/functions_and_enums.sql b/usaspending_api/database_scripts/matviews/functions_and_enums.sql index ed88698794..6792b6934a 100644 --- a/usaspending_api/database_scripts/matviews/functions_and_enums.sql +++ b/usaspending_api/database_scripts/matviews/functions_and_enums.sql @@ -29,3 +29,34 @@ AS $$ RETURN result::public.total_obligation_bins; END; $$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION public.urlencode(str_val TEXT) +RETURNS text +IMMUTABLE PARALLEL SAFE +AS $$ + DECLARE + DECLARE result text; + BEGIN + result = REPLACE($1, '%', '%25'); + result = REPLACE(result, ' ', '%20'); + result = REPLACE(result, '!', '%21'); + result = REPLACE(result, '#', '%23'); + result = REPLACE(result, '$', '%24'); + result = REPLACE(result, '&', '%26'); + result = REPLACE(result, '''', '%27'); + result = REPLACE(result, '(', '%28'); + result = REPLACE(result, ')', '%29'); + result = REPLACE(result, '*', '%2A'); + result = REPLACE(result, '+', '%2B'); + result = REPLACE(result, ',', '%2C'); + result = REPLACE(result, '/', '%2F'); + result = REPLACE(result, ':', '%3A'); + result = REPLACE(result, ';', '%3B'); + result = REPLACE(result, '=', '%3D'); + result = REPLACE(result, '?', '%3F'); + result = REPLACE(result, '@', '%40'); + result = REPLACE(result, '[', '%5B'); + result = REPLACE(result, ']', '%5D'); + RETURN result; + END; +$$ LANGUAGE plpgsql; diff --git a/usaspending_api/download/helpers/download_annotation_functions.py b/usaspending_api/download/helpers/download_annotation_functions.py index 7b5e5bc9d1..7f216e225b 100644 
--- a/usaspending_api/download/helpers/download_annotation_functions.py +++ b/usaspending_api/download/helpers/download_annotation_functions.py @@ -2,7 +2,7 @@ from usaspending_api.common.helpers.orm_helpers import FiscalYear from usaspending_api.settings import HOST from django.db.models.functions import Concat -from django.db.models import Value +from django.db.models import Func, F, Value AWARD_URL = f"{HOST}/#/award/" if "localhost" in HOST else f"https://{HOST}/#/award/" @@ -18,7 +18,9 @@ def universal_transaction_matview_annotations(): ";", distinct=True, ), - "usaspending_permalink": Concat(Value(AWARD_URL), "transaction__award__generated_unique_award_id"), + "usaspending_permalink": Concat( + Value(AWARD_URL), Func(F("transaction__award__generated_unique_award_id"), function="urlencode"), Value("/") + ), } return annotation_fields @@ -31,7 +33,9 @@ def universal_award_matview_annotations(): "federal_accounts_funding_this_award": StringAgg( "award__financial_set__treasury_account__federal_account__federal_account_code", ";", distinct=True ), - "usaspending_permalink": Concat(Value(AWARD_URL), "award__generated_unique_award_id"), + "usaspending_permalink": Concat( + Value(AWARD_URL), Func(F("award__generated_unique_award_id"), function="urlencode"), Value("/") + ), } return annotation_fields @@ -44,7 +48,9 @@ def idv_order_annotations(): "federal_accounts_funding_this_award": StringAgg( "financial_set__treasury_account__federal_account__federal_account_code", ";", distinct=True ), - "usaspending_permalink": Concat(Value(AWARD_URL), "generated_unique_award_id"), + "usaspending_permalink": Concat( + Value(AWARD_URL), Func(F("generated_unique_award_id"), function="urlencode"), Value("/") + ), } return annotation_fields @@ -58,7 +64,9 @@ def idv_transaction_annotations(): "federal_accounts_funding_this_award": StringAgg( "award__financial_set__treasury_account__federal_account__federal_account_code", ";", distinct=True ), - "usaspending_permalink": 
Concat(Value(AWARD_URL), "award__generated_unique_award_id"), + "usaspending_permalink": Concat( + Value(AWARD_URL), Func(F("award__generated_unique_award_id"), function="urlencode"), Value("/") + ), } return annotation_fields @@ -73,6 +81,8 @@ def subaward_annotations(): "prime_award_treasury_accounts_funding_this_award": StringAgg( "award__financial_set__treasury_account__tas_rendering_label", ";", distinct=True ), - "usaspending_permalink": Concat(Value(AWARD_URL), "award__generated_unique_award_id"), + "usaspending_permalink": Concat( + Value(AWARD_URL), Func(F("award__generated_unique_award_id"), function="urlencode"), Value("/") + ), } return annotation_fields From d5cd27c21c738facfd6dfde80f4e1b5643860cfa Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Fri, 6 Dec 2019 11:02:06 -0700 Subject: [PATCH 02/33] [DEV-4015] added optimized urlencode function --- .../matviews/functions_and_enums.sql | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/usaspending_api/database_scripts/matviews/functions_and_enums.sql b/usaspending_api/database_scripts/matviews/functions_and_enums.sql index 6792b6934a..3c2e3403ab 100644 --- a/usaspending_api/database_scripts/matviews/functions_and_enums.sql +++ b/usaspending_api/database_scripts/matviews/functions_and_enums.sql @@ -30,7 +30,7 @@ AS $$ END; $$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION public.urlencode(str_val TEXT) +CREATE OR REPLACE FUNCTION public.urlencode(str_val text) RETURNS text IMMUTABLE PARALLEL SAFE AS $$ @@ -60,3 +60,19 @@ AS $$ RETURN result; END; $$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION public.urlencode_arr(url text) +RETURNS text AS $$ +BEGIN + RETURN ( + WITH str AS ( + SELECT + CASE WHEN $1 ~ '^[% !#$&''\(\)*+,/:;=?@\[\]]' THEN array[''] END || regexp_split_to_array ($1, '([% !#$&''\(\)*+,/:;=?@\[\]])+', 'i') plain, + ARRAY(SELECT (regexp_matches ($1, '(([% !#$&''\(\)*+,/:;=?@\[\]])+)', 'gi'))[1]) special + ) + 
SELECT coalesce(string_agg(plain[i] || CASE WHEN COALESCE(special[i], '') = '' THEN '' ELSE UPPER(CONCAT('%', to_hex(get_byte(special[i]::bytea, 0)))) END, ''), $1) + FROM str, + (SELECT generate_series(1, array_upper(special,1) + 1) i FROM str) as series + ); +END +$$ LANGUAGE plpgsql IMMUTABLE STRICT; \ No newline at end of file From 47cbb0411f98d04efe998d267fdcc566611945a3 Mon Sep 17 00:00:00 2001 From: Brian Zito Date: Fri, 6 Dec 2019 13:38:54 -0500 Subject: [PATCH 03/33] added columns --- .../download/tests/integration/test_download_status.py | 2 +- .../download/v2/download_column_historical_lookups.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/usaspending_api/download/tests/integration/test_download_status.py b/usaspending_api/download/tests/integration/test_download_status.py index 5e329ba368..cf0dae159b 100644 --- a/usaspending_api/download/tests/integration/test_download_status.py +++ b/usaspending_api/download/tests/integration/test_download_status.py @@ -135,7 +135,7 @@ def test_download_assistance_status(client, download_test_data, refresh_matviews assert resp.status_code == status.HTTP_200_OK assert resp.json()["total_rows"] == 1 - assert resp.json()["total_columns"] == 88 + assert resp.json()["total_columns"] == 89 # Test with columns specified dl_resp = client.post( diff --git a/usaspending_api/download/v2/download_column_historical_lookups.py b/usaspending_api/download/v2/download_column_historical_lookups.py index c0624209e7..d1f0836580 100644 --- a/usaspending_api/download/v2/download_column_historical_lookups.py +++ b/usaspending_api/download/v2/download_column_historical_lookups.py @@ -1236,9 +1236,11 @@ "broker_subaward__place_of_perform_country_na", ), ("prime_award_description", "broker_subaward__award_description"), + ("prime_award_project_title", "broker_subaward__program_title"), ("prime_award_naics_code", "broker_subaward__naics"), ("prime_award_naics_description", "broker_subaward__naics_description"), 
("subaward_type", "broker_subaward__subaward_type"), + ("subaward_fsrs_report_id", "broker_subaward__internal_id"), ("subaward_report_year", "broker_subaward__subaward_report_year"), ("subaward_report_month", "broker_subaward__subaward_report_month"), ("subaward_number", "broker_subaward__subaward_number"), @@ -1298,6 +1300,7 @@ ("subawardee_highly_compensated_officer_5_name", "broker_subaward__sub_high_comp_officer5_full_na"), ("subawardee_highly_compensated_officer_5_amount", "broker_subaward__sub_high_comp_officer5_amount"), ("usaspending_permalink", None), # to be filled in by annotation + ("subaward_fsrs_report_last_modified_date", "broker_subaward__date_submitted"), ] ), "d2": OrderedDict( @@ -1356,9 +1359,11 @@ "broker_subaward__place_of_perform_country_na", ), ("prime_award_description", "broker_subaward__award_description"), + ("prime_award_project_title", "broker_subaward__program_title"), ("prime_award_cfda_number", "broker_subaward__cfda_numbers"), ("prime_award_cfda_title", "broker_subaward__cfda_titles"), ("subaward_type", "broker_subaward__subaward_type"), + ("subaward_fsrs_report_id", "broker_subaward__internal_id"), ("subaward_report_year", "broker_subaward__subaward_report_year"), ("subaward_report_month", "broker_subaward__subaward_report_month"), ("subaward_number", "broker_subaward__subaward_number"), @@ -1418,6 +1423,7 @@ ("subawardee_highly_compensated_officer_5_name", "broker_subaward__sub_high_comp_officer5_full_na"), ("subawardee_highly_compensated_officer_5_amount", "broker_subaward__sub_high_comp_officer5_amount"), ("usaspending_permalink", None), # to be filled in by annotation + ("subaward_fsrs_report_last_modified_date", "broker_subaward__date_submitted"), ] ), }, From 309838d74325674228468f68fed2cd875388f1b4 Mon Sep 17 00:00:00 2001 From: Brian Zito Date: Fri, 6 Dec 2019 14:39:55 -0500 Subject: [PATCH 04/33] renamed columns --- .../download/v2/download_column_historical_lookups.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/usaspending_api/download/v2/download_column_historical_lookups.py b/usaspending_api/download/v2/download_column_historical_lookups.py index d1f0836580..7784cb3b8e 100644 --- a/usaspending_api/download/v2/download_column_historical_lookups.py +++ b/usaspending_api/download/v2/download_column_historical_lookups.py @@ -1241,8 +1241,8 @@ ("prime_award_naics_description", "broker_subaward__naics_description"), ("subaward_type", "broker_subaward__subaward_type"), ("subaward_fsrs_report_id", "broker_subaward__internal_id"), - ("subaward_report_year", "broker_subaward__subaward_report_year"), - ("subaward_report_month", "broker_subaward__subaward_report_month"), + ("subaward_fsrs_report_year", "broker_subaward__subaward_report_year"), + ("subaward_fsrs_report_month", "broker_subaward__subaward_report_month"), ("subaward_number", "broker_subaward__subaward_number"), ("subaward_amount", "broker_subaward__subaward_amount"), ("subaward_action_date", "broker_subaward__sub_action_date"), @@ -1364,8 +1364,8 @@ ("prime_award_cfda_title", "broker_subaward__cfda_titles"), ("subaward_type", "broker_subaward__subaward_type"), ("subaward_fsrs_report_id", "broker_subaward__internal_id"), - ("subaward_report_year", "broker_subaward__subaward_report_year"), - ("subaward_report_month", "broker_subaward__subaward_report_month"), + ("subaward_fsrs_report_year", "broker_subaward__subaward_report_year"), + ("subaward_fsrs_report_month", "broker_subaward__subaward_report_month"), ("subaward_number", "broker_subaward__subaward_number"), ("subaward_amount", "broker_subaward__subaward_amount"), ("subaward_action_date", "broker_subaward__sub_action_date"), From 2b30ef3988be297aab8f335b77524c305b56dc2f Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Fri, 6 Dec 2019 13:42:46 -0700 Subject: [PATCH 05/33] [DEV-4015] Seems to function as desired --- .../matviews/functions_and_enums.sql | 66 +++++++------------ 
.../commands/populate_monthly_delta_files.py | 3 +- 2 files changed, 26 insertions(+), 43 deletions(-) diff --git a/usaspending_api/database_scripts/matviews/functions_and_enums.sql b/usaspending_api/database_scripts/matviews/functions_and_enums.sql index 3c2e3403ab..a7a92a251c 100644 --- a/usaspending_api/database_scripts/matviews/functions_and_enums.sql +++ b/usaspending_api/database_scripts/matviews/functions_and_enums.sql @@ -30,49 +30,33 @@ AS $$ END; $$ LANGUAGE plpgsql; -CREATE OR REPLACE FUNCTION public.urlencode(str_val text) -RETURNS text + +CREATE OR REPLACE FUNCTION public.urlencode(INOUT str_val text) IMMUTABLE PARALLEL SAFE AS $$ - DECLARE - DECLARE result text; BEGIN - result = REPLACE($1, '%', '%25'); - result = REPLACE(result, ' ', '%20'); - result = REPLACE(result, '!', '%21'); - result = REPLACE(result, '#', '%23'); - result = REPLACE(result, '$', '%24'); - result = REPLACE(result, '&', '%26'); - result = REPLACE(result, '''', '%27'); - result = REPLACE(result, '(', '%28'); - result = REPLACE(result, ')', '%29'); - result = REPLACE(result, '*', '%2A'); - result = REPLACE(result, '+', '%2B'); - result = REPLACE(result, ',', '%2C'); - result = REPLACE(result, '/', '%2F'); - result = REPLACE(result, ':', '%3A'); - result = REPLACE(result, ';', '%3B'); - result = REPLACE(result, '=', '%3D'); - result = REPLACE(result, '?', '%3F'); - result = REPLACE(result, '@', '%40'); - result = REPLACE(result, '[', '%5B'); - result = REPLACE(result, ']', '%5D'); - RETURN result; + -- Only percent-encode special characters, pass all unicode and other ascii characters + -- IMPORTANT! 
handle % first, otherwise it can return incorrect results + -- Appears to be inefficient, but it beat an "optimized" algorithm using regex and arrays + str_val := REPLACE(str_val, '%', '%25'); + str_val := REPLACE(str_val, ' ', '%20'); + str_val := REPLACE(str_val, '!', '%21'); + str_val := REPLACE(str_val, '#', '%23'); + str_val := REPLACE(str_val, '$', '%24'); + str_val := REPLACE(str_val, '&', '%26'); + str_val := REPLACE(str_val, '''', '%27'); + str_val := REPLACE(str_val, '(', '%28'); + str_val := REPLACE(str_val, ')', '%29'); + str_val := REPLACE(str_val, '*', '%2A'); + str_val := REPLACE(str_val, '+', '%2B'); + str_val := REPLACE(str_val, ',', '%2C'); + str_val := REPLACE(str_val, '/', '%2F'); + str_val := REPLACE(str_val, ':', '%3A'); + str_val := REPLACE(str_val, ';', '%3B'); + str_val := REPLACE(str_val, '=', '%3D'); + str_val := REPLACE(str_val, '?', '%3F'); + str_val := REPLACE(str_val, '@', '%40'); + str_val := REPLACE(str_val, '[', '%5B'); + str_val := REPLACE(str_val, ']', '%5D'); END; $$ LANGUAGE plpgsql; - -CREATE OR REPLACE FUNCTION public.urlencode_arr(url text) -RETURNS text AS $$ -BEGIN - RETURN ( - WITH str AS ( - SELECT - CASE WHEN $1 ~ '^[% !#$&''\(\)*+,/:;=?@\[\]]' THEN array[''] END || regexp_split_to_array ($1, '([% !#$&''\(\)*+,/:;=?@\[\]])+', 'i') plain, - ARRAY(SELECT (regexp_matches ($1, '(([% !#$&''\(\)*+,/:;=?@\[\]])+)', 'gi'))[1]) special - ) - SELECT coalesce(string_agg(plain[i] || CASE WHEN COALESCE(special[i], '') = '' THEN '' ELSE UPPER(CONCAT('%', to_hex(get_byte(special[i]::bytea, 0)))) END, ''), $1) - FROM str, - (SELECT generate_series(1, array_upper(special,1) + 1) i FROM str) as series - ); -END -$$ LANGUAGE plpgsql IMMUTABLE STRICT; \ No newline at end of file diff --git a/usaspending_api/download/management/commands/populate_monthly_delta_files.py b/usaspending_api/download/management/commands/populate_monthly_delta_files.py index eeaab9c4b8..8fd74ec77a 100644 --- 
a/usaspending_api/download/management/commands/populate_monthly_delta_files.py +++ b/usaspending_api/download/management/commands/populate_monthly_delta_files.py @@ -370,8 +370,7 @@ def apply_annotations_to_sql(raw_query, aliases): # Match aliases with their values values_list = [ - '{} AS "{}"'.format(deriv_dict[alias] if alias in deriv_dict else selects_list.pop(0), alias) - for alias in aliases + f'{deriv_dict[alias] if alias in deriv_dict else selects_list.pop(0)} AS "{alias}"' for alias in aliases ] return raw_query.replace(query_before_from, ", ".join(values_list)) From 642448878a887eb387b75c53550fc51a37e7be18 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Sat, 7 Dec 2019 14:02:25 -0700 Subject: [PATCH 06/33] [DEV-4015] start of new tests --- .../tests/test_custom_sql_functions.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 usaspending_api/database_scripts/tests/test_custom_sql_functions.py diff --git a/usaspending_api/database_scripts/tests/test_custom_sql_functions.py b/usaspending_api/database_scripts/tests/test_custom_sql_functions.py new file mode 100644 index 0000000000..8072b7fa3b --- /dev/null +++ b/usaspending_api/database_scripts/tests/test_custom_sql_functions.py @@ -0,0 +1,58 @@ +import pytest +import urllib + +from django.db import connection +from model_mommy import mommy + + +URLENCODE_FUNCTION_NAME = "urlencode" + + +@pytest.fixture() +def add_fun_awards(db): + mommy.make("awards.award", generated_unique_award_id="ASST_AGG_ABCDEFG_0123456") + mommy.make("awards.award", generated_unique_award_id="ASST_AGG_abcdefg_9876543") + mommy.make("awards.award", generated_unique_award_id="ASST_AGG_..._...") + mommy.make("awards.award", generated_unique_award_id="ASST_AGG_---_---") + mommy.make("awards.award", generated_unique_award_id="ASST_AGG_1008DRCATTHP 01^~@01906470531201403_7022") + mommy.make("awards.award", 
generated_unique_award_id="ASST_AGG_12C30000000000006122970000 121/21000_12C3") + mommy.make("awards.award", generated_unique_award_id="ASST_AGG_17.302-MARYLAND-PRINCE GEORGE'S-20081231-10_1635") + mommy.make("awards.award", generated_unique_award_id="ASST_NON_30180J015 MOD#2_1448") + mommy.make("awards.award", generated_unique_award_id="ASST_NON_5% RECAP_8630") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_GS30FY0027QP0019405Â_4732_GS30FY0027_4732") + mommy.make("awards.award", generated_unique_award_id="ASST_NON_R!D1102A37 10_12E2") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_[_]_test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_(_)_test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_=_+_test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_?_?_test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_:_;_test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_,__test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_$$$_test") + mommy.make("awards.award", generated_unique_award_id="FUN_TXT_⎺╲_❪ツ❫_╱⎺") + mommy.make("awards.award", generated_unique_award_id="FUN_TXT_☰☱☲☳☴☵☶☷") + + +def test_urlencoding_no_change(add_fun_awards): + test_sql = f"SELECT generated_unique_award_id, {URLENCODE_FUNCTION_NAME}(generated_unique_award_id) from awards" + with connection.cursor() as cursor: + cursor.execute(test_sql) + results = cursor.fetchall() + + for i in range(4): + assert results[i][0] == results[i][1], "Safe ASCII characters were incorrectly modified!" 
+ + +def test_urlencoding_with_urllib(add_fun_awards): + test_sql = f"SELECT generated_unique_award_id, {URLENCODE_FUNCTION_NAME}(generated_unique_award_id) from awards" + with connection.cursor() as cursor: + cursor.execute(test_sql) + results = cursor.fetchall() + + for result in results: + msg = f"Original '{result[0]}' doesn't match reverse quote '{urllib.parse.unquote(result[1])}'" + assert urllib.parse.unquote(result[1]) == result[0], msg + + # TODO: escape UNICODE + # urlib_ver = urllib.parse.unquote(result[0], safe="^") + # msg = f"Custom SQL result '{result[1]}' doesn't match urllib function's '{urlib_ver}'" + # assert urlib_ver == result[1], msg From 534b1096bf55f6e1ef8fa69fb5ff677bf254b5e7 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Mon, 9 Dec 2019 10:15:46 -0700 Subject: [PATCH 07/33] [DEV-4015] completed unicode encoding --- .../matviews/functions_and_enums.sql | 19 +++++++++++++++++ .../tests/test_custom_sql_functions.py | 21 ++++++++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/usaspending_api/database_scripts/matviews/functions_and_enums.sql b/usaspending_api/database_scripts/matviews/functions_and_enums.sql index a7a92a251c..20b331de60 100644 --- a/usaspending_api/database_scripts/matviews/functions_and_enums.sql +++ b/usaspending_api/database_scripts/matviews/functions_and_enums.sql @@ -34,6 +34,11 @@ $$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION public.urlencode(INOUT str_val text) IMMUTABLE PARALLEL SAFE AS $$ + DECLARE + DECLARE i text; + DECLARE temp_array text[]; + DECLARE building_array text[]; + DECLARE hex_code text; BEGIN -- Only percent-encode special characters, pass all unicode and other ascii characters -- IMPORTANT! 
handle % first, otherwise it can return incorrect results @@ -58,5 +63,19 @@ AS $$ str_val := REPLACE(str_val, '@', '%40'); str_val := REPLACE(str_val, '[', '%5B'); str_val := REPLACE(str_val, ']', '%5D'); + + FOREACH i IN ARRAY ARRAY(SELECT (regexp_matches (str_val, '([^[:ascii:]])', 'gi'))[1]) LOOP + hex_code = UPPER(encode(i::bytea, 'hex')); + + building_array = '{}'; + temp_array = string_to_array(hex_code, null); + FOR i IN 0..char_length(hex_code) - 1 LOOP + IF i % 2 = 0 THEN + building_array := array_append(building_array, '%'); + END IF; + building_array := array_append(building_array, temp_array[i+1]); + END LOOP; + str_val := REPLACE(str_val, i, array_to_string(building_array, '')); + END LOOP; END; $$ LANGUAGE plpgsql; diff --git a/usaspending_api/database_scripts/tests/test_custom_sql_functions.py b/usaspending_api/database_scripts/tests/test_custom_sql_functions.py index 8072b7fa3b..5f0437315b 100644 --- a/usaspending_api/database_scripts/tests/test_custom_sql_functions.py +++ b/usaspending_api/database_scripts/tests/test_custom_sql_functions.py @@ -12,8 +12,8 @@ def add_fun_awards(db): mommy.make("awards.award", generated_unique_award_id="ASST_AGG_ABCDEFG_0123456") mommy.make("awards.award", generated_unique_award_id="ASST_AGG_abcdefg_9876543") - mommy.make("awards.award", generated_unique_award_id="ASST_AGG_..._...") - mommy.make("awards.award", generated_unique_award_id="ASST_AGG_---_---") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_..._..._..._...") + mommy.make("awards.award", generated_unique_award_id="CONT_IDV_---_---") mommy.make("awards.award", generated_unique_award_id="ASST_AGG_1008DRCATTHP 01^~@01906470531201403_7022") mommy.make("awards.award", generated_unique_award_id="ASST_AGG_12C30000000000006122970000 121/21000_12C3") mommy.make("awards.award", generated_unique_award_id="ASST_AGG_17.302-MARYLAND-PRINCE GEORGE'S-20081231-10_1635") @@ -48,11 +48,18 @@ def test_urlencoding_with_urllib(add_fun_awards): 
cursor.execute(test_sql) results = cursor.fetchall() + for result in results: + urlib_ver = urllib.parse.quote(result[0], safe="^") + msg = f"Custom SQL result '{result[1]}' doesn't match urllib function's '{urlib_ver}'" + assert urlib_ver == result[1], msg + + +def test_reverse_urlencoding_with_urllib(add_fun_awards): + test_sql = f"SELECT generated_unique_award_id, {URLENCODE_FUNCTION_NAME}(generated_unique_award_id) from awards" + with connection.cursor() as cursor: + cursor.execute(test_sql) + results = cursor.fetchall() + for result in results: msg = f"Original '{result[0]}' doesn't match reverse quote '{urllib.parse.unquote(result[1])}'" assert urllib.parse.unquote(result[1]) == result[0], msg - - # TODO: escape UNICODE - # urlib_ver = urllib.parse.unquote(result[0], safe="^") - # msg = f"Custom SQL result '{result[1]}' doesn't match urllib function's '{urlib_ver}'" - # assert urlib_ver == result[1], msg From 59bf0512f5e36cf60e96974699c9388ff318b5e0 Mon Sep 17 00:00:00 2001 From: sethstoudenmier Date: Mon, 9 Dec 2019 17:06:42 -0800 Subject: [PATCH 08/33] add additional fields to es transaction index --- .../v2/lookups/elasticsearch_lookups.py | 8 +- usaspending_api/common/matview_manager.py | 5 +- .../etl/transaction_delta_view.sql | 52 ++++- .../universal_transaction_matview.json | 8 +- usaspending_api/etl/es_etl_helpers.py | 35 ++- .../etl/es_transaction_template.json | 211 +++++++++++++----- .../search/v2/elasticsearch_helper.py | 2 +- 7 files changed, 248 insertions(+), 73 deletions(-) diff --git a/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py b/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py index 86280b6aed..77b5aa957c 100644 --- a/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py +++ b/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py @@ -6,12 +6,12 @@ TRANSACTIONS_LOOKUP = { - "Recipient Name": "recipient_name", + "Recipient Name": "recipient_name.keyword", "Action Date": "action_date", "Transaction Amount": 
"transaction_amount", - "Award Type": "type_description", - "Awarding Agency": "awarding_toptier_agency_name", - "Awarding Sub Agency": "awarding_subtier_agency_name", + "Award Type": "type_description.keyword", + "Awarding Agency": "awarding_toptier_agency_name.keyword", + "Awarding Sub Agency": "awarding_subtier_agency_name.keyword", "Funding Agency": "funding_toptier_agency_name", "Funding Sub Agency": "funding_subtier_agency_name", "Issued Date": "period_of_performance_start_date", diff --git a/usaspending_api/common/matview_manager.py b/usaspending_api/common/matview_manager.py index b34c29199a..5d3bfa3b7e 100644 --- a/usaspending_api/common/matview_manager.py +++ b/usaspending_api/common/matview_manager.py @@ -9,10 +9,7 @@ DEPENDENCY_FILEPATH = settings.APP_DIR / "database_scripts" / "matviews" / "functions_and_enums.sql" JSON_DIR = settings.APP_DIR / "database_scripts" / "matview_sql_generator" MATVIEW_GENERATOR_FILE = settings.APP_DIR / "database_scripts" / "matview_generator" / "matview_sql_generator.py" -OVERLAY_VIEWS = [ - settings.APP_DIR / "database_scripts" / "matviews" / "vw_award_search.sql", - settings.APP_DIR / "database_scripts" / "etl" / "transaction_delta_view.sql", -] +OVERLAY_VIEWS = [settings.APP_DIR / "database_scripts" / "matviews" / "vw_award_search.sql"] DROP_OLD_MATVIEWS = settings.APP_DIR / "database_scripts" / "matviews" / "drop_old_matviews.sql" MATERIALIZED_VIEWS = OrderedDict( [ diff --git a/usaspending_api/database_scripts/etl/transaction_delta_view.sql b/usaspending_api/database_scripts/etl/transaction_delta_view.sql index afa28db73a..c0d66b1e37 100644 --- a/usaspending_api/database_scripts/etl/transaction_delta_view.sql +++ b/usaspending_api/database_scripts/etl/transaction_delta_view.sql @@ -35,9 +35,11 @@ SELECT UTM.award_category, UTM.recipient_unique_id, UTM.parent_recipient_unique_id, + CONCAT(UTM.recipient_hash, '-', case when UTM.parent_recipient_unique_id is null then 'R' else 'C' end) as recipient_hash, 
UTM.recipient_name, UTM.action_date, + DATE(UTM.action_date + interval '3 months') AS fiscal_action_date, AWD.period_of_performance_start_date, AWD.period_of_performance_current_end_date, FPDS.ordering_period_end_date, @@ -47,9 +49,14 @@ SELECT UTM.federal_action_obligation AS transaction_amount, UTM.face_value_loan_guarantee, UTM.original_loan_subsidy_cost, + UTM.generated_pragmatic_obligation, UTM.awarding_agency_id, UTM.funding_agency_id, + AA.toptier_agency_id AS awarding_toptier_agency_id, + FA.toptier_agency_id AS funding_toptier_agency_id, + AA.subtier_agency_id AS awarding_subtier_agency_id, + FA.subtier_agency_id AS funding_subtier_agency_id, UTM.awarding_toptier_agency_name, UTM.funding_toptier_agency_name, UTM.awarding_subtier_agency_name, @@ -58,7 +65,13 @@ SELECT UTM.funding_toptier_agency_abbreviation, UTM.awarding_subtier_agency_abbreviation, UTM.funding_subtier_agency_abbreviation, + TAA.toptier_code AS awarding_toptier_agency_code, + TFA.toptier_code AS funding_toptier_agency_code, + SAA.subtier_code AS awarding_subtier_agency_code, + SFA.subtier_code AS funding_subtier_agency_code, + CFDA.id AS cfda_id, + UTM.cfda_number, UTM.cfda_title, '' AS cfda_popular_name, UTM.type_of_contract_pricing, @@ -83,10 +96,43 @@ SELECT UTM.recipient_location_county_name, UTM.recipient_location_zip5, UTM.recipient_location_congressional_code, - UTM.recipient_location_city_name + UTM.recipient_location_city_name, + + UTM.treasury_account_identifiers, + ACCT.federal_accounts, + UTM.business_categories FROM universal_transaction_matview UTM -JOIN transaction_normalized TM ON (UTM.transaction_id = TM.id) +INNER JOIN transaction_normalized TM ON (UTM.transaction_id = TM.id) LEFT JOIN transaction_fpds FPDS ON (UTM.transaction_id = FPDS.transaction_id) LEFT JOIN transaction_fabs FABS ON (UTM.transaction_id = FABS.transaction_id) -LEFT OUTER JOIN awards AWD ON (UTM.award_id = AWD.id); +LEFT JOIN awards AWD ON (UTM.award_id = AWD.id) +-- Similar joins are already performed 
oon universal_transaction_matview, however, to avoid making the matview larger +-- than needed they have been placed here. Feel free to phase out if the columns gained from the following joins are +-- added to the universal_transaction_matview. +LEFT JOIN agency AA ON (TM.awarding_agency_id = AA.id) +LEFT JOIN agency FA ON (TM.funding_agency_id = FA.id) +LEFT JOIN toptier_agency TAA ON (AA.toptier_agency_id = TAA.toptier_agency_id) +LEFT JOIN subtier_agency SAA ON (AA.subtier_agency_id = SAA.subtier_agency_id) +LEFT JOIN toptier_agency TFA ON (FA.toptier_agency_id = TFA.toptier_agency_id) +LEFT JOIN subtier_agency SFA ON (FA.subtier_agency_id = SFA.subtier_agency_id) +LEFT JOIN references_cfda CFDA ON (FABS.cfda_number = CFDA.program_number) +LEFT JOIN ( + SELECT + faba.award_id, + JSONB_AGG( + DISTINCT JSONB_BUILD_OBJECT( + 'id', fa.id, + 'account_title', fa.account_title, + 'federal_account_code', fa.federal_account_code + ) + ) federal_accounts + FROM + federal_account fa + INNER JOIN treasury_appropriation_account taa ON fa.id = taa.federal_account_id + INNER JOIN financial_accounts_by_awards faba ON taa.treasury_account_identifier = faba.treasury_account_id + WHERE + faba.award_id IS NOT NULL + GROUP BY + faba.award_id +) ACCT ON (ACCT.award_id = TM.award_id);; diff --git a/usaspending_api/database_scripts/matview_generator/universal_transaction_matview.json b/usaspending_api/database_scripts/matview_generator/universal_transaction_matview.json index 19584cd326..f662fc06d3 100644 --- a/usaspending_api/database_scripts/matview_generator/universal_transaction_matview.json +++ b/usaspending_api/database_scripts/matview_generator/universal_transaction_matview.json @@ -45,18 +45,18 @@ " COALESCE(transaction_fpds.place_of_perform_country_n, transaction_fabs.place_of_perform_country_n) AS pop_country_name,", " COALESCE(transaction_fpds.place_of_perform_country_c, transaction_fabs.place_of_perform_country_c, 'USA') AS pop_country_code,", " 
COALESCE(transaction_fpds.place_of_performance_state, transaction_fabs.place_of_perfor_state_code) AS pop_state_code,", - " COALESCE(transaction_fpds.place_of_perform_county_co, transaction_fabs.place_of_perform_county_co) AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.place_of_perform_county_co, transaction_fabs.place_of_perform_county_co), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " COALESCE(transaction_fpds.place_of_perform_county_na, transaction_fabs.place_of_perform_county_na) AS pop_county_name,", " COALESCE(transaction_fpds.place_of_performance_zip5, transaction_fabs.place_of_performance_zip5) AS pop_zip5,", - " COALESCE(transaction_fpds.place_of_performance_congr, transaction_fabs.place_of_performance_congr) AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.place_of_performance_congr, transaction_fabs.place_of_performance_congr), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM COALESCE(transaction_fpds.place_of_perform_city_name, transaction_fabs.place_of_performance_city)) AS pop_city_name,", "", " CASE WHEN COALESCE(transaction_fpds.legal_entity_country_code, transaction_fabs.legal_entity_country_code) = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fpds.legal_entity_country_code, transaction_fabs.legal_entity_country_code, 'USA') END AS recipient_location_country_code,", " COALESCE(transaction_fpds.legal_entity_country_name, transaction_fabs.legal_entity_country_name) AS recipient_location_country_name,", " COALESCE(transaction_fpds.legal_entity_state_code, transaction_fabs.legal_entity_state_code) AS recipient_location_state_code,", - " COALESCE(transaction_fpds.legal_entity_county_code, transaction_fabs.legal_entity_county_code) AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.legal_entity_county_code, 
transaction_fabs.legal_entity_county_code), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " COALESCE(transaction_fpds.legal_entity_county_name, transaction_fabs.legal_entity_county_name) AS recipient_location_county_name,", - " COALESCE(transaction_fpds.legal_entity_congressional, transaction_fabs.legal_entity_congressional) AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.legal_entity_congressional, transaction_fabs.legal_entity_congressional), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " COALESCE(transaction_fpds.legal_entity_zip5, transaction_fabs.legal_entity_zip5) AS recipient_location_zip5,", " TRIM(TRAILING FROM COALESCE(transaction_fpds.legal_entity_city_name, transaction_fabs.legal_entity_city_name)) AS recipient_location_city_name,", "", diff --git a/usaspending_api/etl/es_etl_helpers.py b/usaspending_api/etl/es_etl_helpers.py index 69ad11e8c4..2544b9083a 100644 --- a/usaspending_api/etl/es_etl_helpers.py +++ b/usaspending_api/etl/es_etl_helpers.py @@ -43,8 +43,10 @@ "award_category", "recipient_unique_id", "parent_recipient_unique_id", + "recipient_hash", "recipient_name", "action_date", + "fiscal_action_date", "period_of_performance_start_date", "period_of_performance_current_end_date", "ordering_period_end_date", @@ -54,8 +56,13 @@ "transaction_amount", "face_value_loan_guarantee", "original_loan_subsidy_cost", + "generated_pragmatic_obligation", "awarding_agency_id", "funding_agency_id", + "awarding_toptier_agency_id", + "funding_toptier_agency_id", + "awarding_subtier_agency_id", + "funding_subtier_agency_id", "awarding_toptier_agency_name", "funding_toptier_agency_name", "awarding_subtier_agency_name", @@ -64,6 +71,12 @@ "funding_toptier_agency_abbreviation", "awarding_subtier_agency_abbreviation", "funding_subtier_agency_abbreviation", + "awarding_toptier_agency_code", + 
"funding_toptier_agency_code", + "awarding_subtier_agency_code", + "funding_subtier_agency_code", + "cfda_id", + "cfda_number", "cfda_title", "cfda_popular_name", "type_of_contract_pricing", @@ -87,6 +100,9 @@ "recipient_location_zip5", "recipient_location_congressional_code", "recipient_location_city_name", + "treasury_account_identifiers", + "federal_accounts", + "business_categories", ] UPDATE_DATE_SQL = " AND update_date >= '{}'" @@ -150,6 +166,15 @@ def __init__(self, *args): # ============================================================================== +def convert_postgres_array_as_string_to_list(array_as_string: str) -> list: + """ + Postgres arrays are stored in CSVs as strings. Elasticsearch is able to handle lists of items, but needs to + be passed a list instead of a string. In the case of an empty array, return null + For example, "{this,is,a,postgres,array}" -> ["this", "is", "a", "postgres", "array"]. + """ + return array_as_string.replace("{", "").replace("}", "").split(",") if len(array_as_string) > 2 else None + + def process_guarddog(process_list): """ pass in a list of multiprocess Process objects. @@ -276,9 +301,15 @@ def download_csv(count_sql, copy_sql, filename, job_id, skip_counts, verbose): def csv_chunk_gen(filename, chunksize, job_id): printf({"msg": "Opening {} (batch size = {})".format(filename, chunksize), "job": job_id, "f": "ES Ingest"}) + # Need a specific converter to handle converting strings to correct data types (e.g. string -> array) + converters = { + "business_categories": convert_postgres_array_as_string_to_list, + "treasury_account_identifiers": convert_postgres_array_as_string_to_list, + "federal_accounts": lambda string_to_convert: json.loads(string_to_convert) if string_to_convert else None, + } # Panda's data type guessing causes issues for Elasticsearch. 
Explicitly cast using dictionary - dtype = {k: str for k in VIEW_COLUMNS} - for file_df in pd.read_csv(filename, dtype=dtype, header=0, chunksize=chunksize): + dtype = {k: str for k in VIEW_COLUMNS if k not in converters} + for file_df in pd.read_csv(filename, dtype=dtype, converters=converters, header=0, chunksize=chunksize): file_df = file_df.where(cond=(pd.notnull(file_df)), other=None) yield file_df.to_dict(orient="records") diff --git a/usaspending_api/etl/es_transaction_template.json b/usaspending_api/etl/es_transaction_template.json index 0de2acb8df..50b4435e6c 100644 --- a/usaspending_api/etl/es_transaction_template.json +++ b/usaspending_api/etl/es_transaction_template.json @@ -66,16 +66,13 @@ "type": "integer" }, "piid": { - "index": false, - "type": "text" + "type": "keyword" }, "fain": { - "index": false, - "type": "text" + "type": "keyword" }, "uri": { - "index": false, - "type": "text" + "type": "keyword" }, "award_description": { "type": "text", @@ -94,16 +91,15 @@ "type": "text" }, "type_description": { - "type": "keyword", + "type": "text", "fields": { - "raw": { - "type": "text" + "keyword": { + "type": "keyword" } } }, "award_category": { - "type": "text", - "index": false + "type": "keyword" }, "recipient_unique_id": { "type": "text" @@ -111,11 +107,14 @@ "parent_recipient_unique_id": { "type": "text" }, + "recipient_hash": { + "type": "keyword" + }, "recipient_name": { - "type": "keyword", + "type": "text", "fields": { - "raw": { - "type": "text" + "keyword": { + "type": "keyword" } } }, @@ -123,26 +122,27 @@ "type": "date", "format": "yyyy-MM-dd" }, + "fiscal_action_date": { + "type": "date", + "format": "yyyy-MM-dd" + }, "period_of_performance_start_date": { "type": "date", "format": "yyyy-MM-dd" }, "period_of_performance_current_end_date": { "type": "date", - "format": "yyyy-MM-dd", - "index": false + "format": "yyyy-MM-dd" }, "ordering_period_end_date": { "type": "date", "format": "yyyy-MM-dd" }, "transaction_fiscal_year": { - "type": 
"integer", - "index": false + "type": "integer" }, "award_fiscal_year": { - "type": "integer", - "index": false + "type": "integer" }, "award_amount": { "type": "scaled_float", @@ -160,72 +160,137 @@ "type": "scaled_float", "scaling_factor": 100 }, + "generated_pragmatic_obligation": { + "type": "scaled_float", + "scaling_factor": 100 + }, "awarding_agency_id": { - "type": "integer", - "index": false + "type": "integer" }, "funding_agency_id": { - "type": "integer", - "index": false + "type": "integer" + }, + "awarding_toptier_agency_id": { + "type": "integer" + }, + "awarding_subtier_agency_id": { + "type": "integer" + }, + "funding_toptier_agency_id": { + "type": "integer" + }, + "funding_subtier_agency_id": { + "type": "integer" }, "awarding_toptier_agency_name": { - "type": "keyword", + "type": "text", "fields": { - "raw": { - "type": "text" + "keyword": { + "type": "keyword" } } }, "funding_toptier_agency_name": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "awarding_subtier_agency_name": { - "type": "keyword", + "type": "text", "fields": { - "raw": { - "type": "text" + "keyword": { + "type": "keyword" } } }, "funding_subtier_agency_name": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "awarding_toptier_agency_abbreviation": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "funding_toptier_agency_abbreviation": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "awarding_subtier_agency_abbreviation": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "funding_subtier_agency_abbreviation": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } + }, + "awarding_toptier_agency_code": { + "type": "keyword" + }, + "funding_toptier_agency_code": { + "type": "keyword" + }, + 
"awarding_subtier_agency_code": { + "type": "keyword" + }, + "funding_subtier_agency_code": { + "type": "keyword" + }, + "cfda_id": { + "type": "integer" + }, + "cfda_number": { + "type": "keyword" }, - "cfda_title": { "type": "text", - "index": false + "fields": { + "keyword": { + "type": "keyword" + } + } }, "cfda_popular_name": { "type": "text", "index": false }, "type_of_contract_pricing": { - "type": "text", - "index": false + "type": "text" }, "type_set_aside": { - "type": "text", - "index": false + "type": "text" }, "extent_competed": { - "type": "text", - "index": false + "type": "text" }, "pulled_from": { - "type": "text", - "index": false + "type": "text" + }, + "type": { + "type": "keyword", + "null_value": "NULL" }, "pop_country_code": { "type": "keyword" @@ -237,16 +302,26 @@ "type": "keyword" }, "pop_county_code": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "pop_county_name": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "pop_zip5": { "type": "text" }, "pop_congressional_code": { - "type": "text" + "type": "keyword" }, "pop_city_name": { "type": "text", @@ -267,16 +342,26 @@ "type": "keyword" }, "recipient_location_county_code": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "recipient_location_county_name": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "recipient_location_zip5": { "type": "text" }, "recipient_location_congressional_code": { - "type": "text" + "type": "keyword" }, "recipient_location_city_name": { "type": "text", @@ -286,9 +371,25 @@ } } }, - "type": { - "type": "keyword", - "null_value": "NULL" + "treasury_account_identifiers": { + "type": "keyword" + }, + "federal_accounts": { + "type": "nested", + "properties": { + "id": { + "type": "integer" + }, + "account_title": { + "type": "keyword" + }, + "federal_account_code": { + "type": 
"keyword" + } + } + }, + "business_categories": { + "type": "keyword" } } } diff --git a/usaspending_api/search/v2/elasticsearch_helper.py b/usaspending_api/search/v2/elasticsearch_helper.py index bae8debe7a..acbcc0c739 100644 --- a/usaspending_api/search/v2/elasticsearch_helper.py +++ b/usaspending_api/search/v2/elasticsearch_helper.py @@ -11,7 +11,7 @@ logger = logging.getLogger("console") DOWNLOAD_QUERY_SIZE = settings.MAX_DOWNLOAD_LIMIT -KEYWORD_DATATYPE_FIELDS = ["{}.raw".format(i) for i in KEYWORD_DATATYPE_FIELDS] +KEYWORD_DATATYPE_FIELDS = [field for field in KEYWORD_DATATYPE_FIELDS] TRANSACTIONS_LOOKUP.update({v: k for k, v in TRANSACTIONS_LOOKUP.items()}) From 055fe039433cb5383400791f64beb5867415781a Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Tue, 10 Dec 2019 09:06:14 -0700 Subject: [PATCH 09/33] [DEV-4015] encode the full list of encodable characters --- .../matviews/functions_and_enums.sql | 46 ++++++------------- .../tests/test_custom_sql_functions.py | 37 +++++++++------ 2 files changed, 37 insertions(+), 46 deletions(-) diff --git a/usaspending_api/database_scripts/matviews/functions_and_enums.sql b/usaspending_api/database_scripts/matviews/functions_and_enums.sql index 20b331de60..217eec9d61 100644 --- a/usaspending_api/database_scripts/matviews/functions_and_enums.sql +++ b/usaspending_api/database_scripts/matviews/functions_and_enums.sql @@ -36,46 +36,28 @@ IMMUTABLE PARALLEL SAFE AS $$ DECLARE DECLARE i text; - DECLARE temp_array text[]; - DECLARE building_array text[]; - DECLARE hex_code text; + DECLARE reassemble_array text[]; + DECLARE hex_code_array text[]; BEGIN - -- Only percent-encode special characters, pass all unicode and other ascii characters + -- Percent-encode all characters which aren't considered the "unreserved list" + -- of URL characters: A-Za-z0-9\-_.~ + -- (This is the most conservative approach when implementing URLencoding) + -- The logic appears to be inefficient, but it 
beat more "optimized" algorithms + -- IMPORTANT! handle % first, otherwise it can return incorrect results - -- Appears to be inefficient, but it beat an "optimized" algorithm using regex and arrays str_val := REPLACE(str_val, '%', '%25'); - str_val := REPLACE(str_val, ' ', '%20'); - str_val := REPLACE(str_val, '!', '%21'); - str_val := REPLACE(str_val, '#', '%23'); - str_val := REPLACE(str_val, '$', '%24'); - str_val := REPLACE(str_val, '&', '%26'); - str_val := REPLACE(str_val, '''', '%27'); - str_val := REPLACE(str_val, '(', '%28'); - str_val := REPLACE(str_val, ')', '%29'); - str_val := REPLACE(str_val, '*', '%2A'); - str_val := REPLACE(str_val, '+', '%2B'); - str_val := REPLACE(str_val, ',', '%2C'); - str_val := REPLACE(str_val, '/', '%2F'); - str_val := REPLACE(str_val, ':', '%3A'); - str_val := REPLACE(str_val, ';', '%3B'); - str_val := REPLACE(str_val, '=', '%3D'); - str_val := REPLACE(str_val, '?', '%3F'); - str_val := REPLACE(str_val, '@', '%40'); - str_val := REPLACE(str_val, '[', '%5B'); - str_val := REPLACE(str_val, ']', '%5D'); - FOREACH i IN ARRAY ARRAY(SELECT (regexp_matches (str_val, '([^[:ascii:]])', 'gi'))[1]) LOOP - hex_code = UPPER(encode(i::bytea, 'hex')); + FOREACH i IN ARRAY ARRAY(SELECT DISTINCT (regexp_matches (str_val, '([^A-Za-z0-9\-_.~%])', 'gi'))[1]) LOOP + reassemble_array = '{}'; + hex_code_array = string_to_array(UPPER(encode(i::bytea, 'hex')), NULL); - building_array = '{}'; - temp_array = string_to_array(hex_code, null); - FOR i IN 0..char_length(hex_code) - 1 LOOP + FOR i IN 0..array_length(hex_code_array, 1) - 1 LOOP IF i % 2 = 0 THEN - building_array := array_append(building_array, '%'); + reassemble_array := array_append(reassemble_array, '%'); END IF; - building_array := array_append(building_array, temp_array[i+1]); + reassemble_array := array_append(reassemble_array, hex_code_array[i+1]); END LOOP; - str_val := REPLACE(str_val, i, array_to_string(building_array, '')); + str_val := REPLACE(str_val, i, 
array_to_string(reassemble_array, '')); END LOOP; END; $$ LANGUAGE plpgsql; diff --git a/usaspending_api/database_scripts/tests/test_custom_sql_functions.py b/usaspending_api/database_scripts/tests/test_custom_sql_functions.py index 5f0437315b..b6b10fc182 100644 --- a/usaspending_api/database_scripts/tests/test_custom_sql_functions.py +++ b/usaspending_api/database_scripts/tests/test_custom_sql_functions.py @@ -10,10 +10,10 @@ @pytest.fixture() def add_fun_awards(db): - mommy.make("awards.award", generated_unique_award_id="ASST_AGG_ABCDEFG_0123456") - mommy.make("awards.award", generated_unique_award_id="ASST_AGG_abcdefg_9876543") - mommy.make("awards.award", generated_unique_award_id="CONT_AWD_..._..._..._...") - mommy.make("awards.award", generated_unique_award_id="CONT_IDV_---_---") + mommy.make("awards.award", generated_unique_award_id="CONT_IDV_ABCDEFG_0123456") + mommy.make("awards.award", generated_unique_award_id="CONT_IDV_abcdefg_9876543") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_._.._..._....") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_-_--_---_----") mommy.make("awards.award", generated_unique_award_id="ASST_AGG_1008DRCATTHP 01^~@01906470531201403_7022") mommy.make("awards.award", generated_unique_award_id="ASST_AGG_12C30000000000006122970000 121/21000_12C3") mommy.make("awards.award", generated_unique_award_id="ASST_AGG_17.302-MARYLAND-PRINCE GEORGE'S-20081231-10_1635") @@ -21,15 +21,24 @@ def add_fun_awards(db): mommy.make("awards.award", generated_unique_award_id="ASST_NON_5% RECAP_8630") mommy.make("awards.award", generated_unique_award_id="CONT_AWD_GS30FY0027QP0019405Â_4732_GS30FY0027_4732") mommy.make("awards.award", generated_unique_award_id="ASST_NON_R!D1102A37 10_12E2") - mommy.make("awards.award", generated_unique_award_id="CONT_AWD_[_]_test") - mommy.make("awards.award", generated_unique_award_id="CONT_AWD_(_)_test") - mommy.make("awards.award", generated_unique_award_id="CONT_AWD_=_+_test") - 
mommy.make("awards.award", generated_unique_award_id="CONT_AWD_?_?_test") - mommy.make("awards.award", generated_unique_award_id="CONT_AWD_:_;_test") - mommy.make("awards.award", generated_unique_award_id="CONT_AWD_,__test") - mommy.make("awards.award", generated_unique_award_id="CONT_AWD_$$$_test") - mommy.make("awards.award", generated_unique_award_id="FUN_TXT_⎺╲_❪ツ❫_╱⎺") - mommy.make("awards.award", generated_unique_award_id="FUN_TXT_☰☱☲☳☴☵☶☷") + mommy.make("awards.award", generated_unique_award_id="CONT_IDV_[_]_test") + mommy.make("awards.award", generated_unique_award_id="CONT_IDV_(_)_test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_(())_[[]]_test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_==_++_test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_?_??_test") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_^_^^_^^^") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_::_;;_:::;;;") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_,_,,_,,,") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_$_$$_$$$") + mommy.make("awards.award", generated_unique_award_id="CONT_AWD_%_%%_%%%%") + mommy.make("awards.award", generated_unique_award_id="☰☱☳☲☶☴ ൠൠൠ ☴☶☲☳☱☰") + mommy.make("awards.award", generated_unique_award_id="❋❋❋ ALL YOUR BASE ARE BELONG TO US ❋❋❋") + mommy.make("awards.award", generated_unique_award_id="⎺╲_❪ツ❫_╱⎺") + mommy.make("awards.award", generated_unique_award_id="питон е јазик на компјутер и змија") + mommy.make("awards.award", generated_unique_award_id="如果科羅拉多被鋪平會比得克薩斯州大") + mommy.make("awards.award", generated_unique_award_id="епстеин се није убио") + mommy.make("awards.award", generated_unique_award_id="kjo frazë nuk bën mirëkuptim") + mommy.make("awards.award", generated_unique_award_id="何者なにものかによって、爆発物ばくはつぶつが仕掛しかけられたようです。") def test_urlencoding_no_change(add_fun_awards): @@ -49,7 +58,7 @@ def 
test_urlencoding_with_urllib(add_fun_awards): results = cursor.fetchall() for result in results: - urlib_ver = urllib.parse.quote(result[0], safe="^") + urlib_ver = urllib.parse.quote(result[0], safe="") msg = f"Custom SQL result '{result[1]}' doesn't match urllib function's '{urlib_ver}'" assert urlib_ver == result[1], msg From e66761f15c31242f2a7e066f95efde4a0feb9606 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Tue, 10 Dec 2019 12:03:28 -0700 Subject: [PATCH 10/33] [DEV-1566] updated monthly files --- .../management/commands/populate_monthly_files.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/usaspending_api/download/management/commands/populate_monthly_files.py b/usaspending_api/download/management/commands/populate_monthly_files.py index e5128dd7f2..499ec3126b 100644 --- a/usaspending_api/download/management/commands/populate_monthly_files.py +++ b/usaspending_api/download/management/commands/populate_monthly_files.py @@ -204,7 +204,7 @@ def handle(self, *args, **options): toptier_agencies = list(toptier_agencies.values("name", "toptier_agency_id", "toptier_code")) # Adding 'all' to prevent duplication of code if include_all: - toptier_agencies.append({"name": "All", "toptier_agency_id": "all", "toptier_code": "all"}) + toptier_agencies.append({"name": "All", "toptier_agency_id": "all", "toptier_code": "All"}) if not fiscal_years: fiscal_years = range(2001, generate_fiscal_year(current_date) + 1) @@ -226,10 +226,10 @@ def handle(self, *args, **options): start_date = "{}-10-01".format(fiscal_year - 1) end_date = "{}-09-30".format(fiscal_year) for award_type in award_types: - file_name = "{}_{}_{}".format(fiscal_year, agency["toptier_code"], award_type.capitalize()) - full_file_name = "{}_Full_{}.zip".format(file_name, updated_date_timestamp) + file_name = f"FY{fiscal_year}_{agency['toptier_code']}_{award_type.capitalize()}" + full_file_name = 
f"{file_name}_Full_{updated_date_timestamp}.zip" if not clobber and file_name in reuploads: - logger.info("Skipping already uploaded: {}".format(full_file_name)) + logger.info(f"Skipping already uploaded: {full_file_name}") continue if placeholders: empty_file = empty_contracts_file if award_type == "contracts" else empty_asssistance_file From d20a3775701b6c671eaa05962d3cf203a6f82bf3 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Tue, 10 Dec 2019 12:15:23 -0700 Subject: [PATCH 11/33] [DEV-1566] updated Delta filenames --- .../management/commands/populate_monthly_delta_files.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/usaspending_api/download/management/commands/populate_monthly_delta_files.py b/usaspending_api/download/management/commands/populate_monthly_delta_files.py index eeaab9c4b8..699bea91df 100644 --- a/usaspending_api/download/management/commands/populate_monthly_delta_files.py +++ b/usaspending_api/download/management/commands/populate_monthly_delta_files.py @@ -151,10 +151,11 @@ def create_local_file(self, award_type, source, agency_code, generate_since): # Create file paths and working directory timestamp = datetime.strftime(datetime.now(), "%Y%m%d%H%M%S%f") - working_dir = "{}_{}_delta_gen_{}/".format(settings.CSV_LOCAL_PATH, agency_code, timestamp) + working_dir = f"{settings.CSV_LOCAL_PATH}_{agency_code}_delta_gen_{timestamp}/" if not os.path.exists(working_dir): os.mkdir(working_dir) - source_name = "{}_{}_Delta_{}".format(agency_code, award_type, datetime.strftime(date.today(), "%Y%m%d")) + agency_str = 'All' if agency_code == 'all' else agency_code + source_name = f"FY(All)-{agency_str}_{award_type}_Delta_{datetime.strftime(date.today(), '%Y%m%d')}" source_path = os.path.join(working_dir, "{}.csv".format(source_name)) # Create a unique temporary file with the raw query @@ -185,7 +186,7 @@ def create_local_file(self, award_type, source, agency_code, generate_since): 
zipfile_path = "{}{}.zip".format(settings.CSV_LOCAL_PATH, source_name) logger.info("Creating compressed file: {}".format(os.path.basename(zipfile_path))) - split_and_zip_data_files(zipfile_path, source_path, source_name) + split_and_zip_data_files(zipfile_path, source_path, source_name, "csv") else: zipfile_path = None From 8d9c37a3c19d10c148b2201727c6d06ec5b91a58 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Tue, 10 Dec 2019 13:58:41 -0700 Subject: [PATCH 12/33] [DEV-1566] implemented new account zipfilenames --- usaspending_api/download/download_utils.py | 32 ++++++++++++++++++---- usaspending_api/download/lookups.py | 9 ++++-- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/usaspending_api/download/download_utils.py b/usaspending_api/download/download_utils.py index 29b89fb559..dc4a7831b7 100644 --- a/usaspending_api/download/download_utils.py +++ b/usaspending_api/download/download_utils.py @@ -7,23 +7,45 @@ def create_unique_filename(json_request, request_agency=None): + import json + print(f"agency: {request_agency} request: {json.dumps(json_request)}") + if json_request.get("is_for_idv"): download_name = "IDV_" + slugify_text_for_file_names(json_request.get("piid"), "UNKNOWN", 50) elif json_request.get("is_for_contract"): download_name = "CONT_" + slugify_text_for_file_names(json_request.get("piid"), "UNKNOWN", 50) elif json_request.get("is_for_assistance"): download_name = "ASST_" + slugify_text_for_file_names(json_request.get("assistance_id"), "UNKNOWN", 50) + elif json_request["request_type"] == "account": + file_name_template = obtain_zip_filename_format(json_request["download_types"]) + prefix = obtain_filename_prefix_from_agency_id(request_agency) + level = "FA" if json_request["account_level"] == "federal_account" else "TAS" + additional_quarters = "" + + if json_request["filters"]["quarter"] != 1: + additional_quarters = f"-Q{json_request['filters']['quarter']}" + + download_name = 
file_name_template.format(fy=json_request["filters"]["fy"], date_range=additional_quarters, level=level, agency=prefix) else: + # request_type: assistance, award, idv, account, contract download_types = json_request["download_types"] prefix = obtain_filename_prefix_from_agency_id(request_agency) award_type_name = create_award_level_string(download_types) download_name = "{}_{}".format(prefix, award_type_name) - timestamped_file_name = get_timestamped_filename("{}.zip".format(download_name)) + + datetime_format = "%Y-%m-%d_H%HM%MS%S%f" + timestamped_file_name = get_timestamped_filename(f"{download_name}.zip", datetime_format=datetime_format) return timestamped_file_name +def obtain_zip_filename_format(download_types): + if len(download_types) > 1: + raise NotImplementedError + return VALUE_MAPPINGS[download_types[0]]["zipfile_template"] + + def obtain_filename_prefix_from_agency_id(request_agency): - result = "all" + result = "All" if request_agency: toptier_agency_filter = ToptierAgency.objects.filter(toptier_agency_id=request_agency).first() if toptier_agency_filter: @@ -32,13 +54,11 @@ def obtain_filename_prefix_from_agency_id(request_agency): def create_award_level_string(download_types): - return "_".join(VALUE_MAPPINGS[award_level]["download_name"] for award_level in download_types) + return "+".join(VALUE_MAPPINGS[award_level]["download_name"] for award_level in download_types) def get_timestamped_filename(filename, datetime_format="%Y%m%d%H%M%S%f"): - """ - Gets a Timestamped file name to prevent conflicts on S3 Uploading - """ + """Return an updated filename to include current timestamp""" file_sans_extension, file_extension = filename.split(".") timestamp = datetime.strftime(datetime.now(timezone.utc), datetime_format) return "{}_{}.{}".format(file_sans_extension, timestamp, file_extension) diff --git a/usaspending_api/download/lookups.py b/usaspending_api/download/lookups.py index a90a8ffd98..9d74906054 100644 --- a/usaspending_api/download/lookups.py +++ 
b/usaspending_api/download/lookups.py @@ -94,7 +94,8 @@ "source_type": "account", "table": AppropriationAccountBalances, "table_name": "account_balances", - "download_name": "account_balances", + "download_name": "AccountBalances", + "zipfile_template": "FY{fy}Q1{date_range}_{level}_{agency}_AccountBalances", "filter_function": account_download_filter, }, # Object Class Program Activity Account Data @@ -102,14 +103,16 @@ "source_type": "account", "table": FinancialAccountsByProgramActivityObjectClass, "table_name": "object_class_program_activity", - "download_name": "account_breakdown_by_program_activity_object_class", + "download_name": "AccountBreakdownByProgramActivityObjectClass", + "zipfile_template": "FY{fy}Q1{date_range}_{level}_{agency}_AccountBreakdownByProgramActivityObjectClass", "filter_function": account_download_filter, }, "award_financial": { "source_type": "account", "table": FinancialAccountsByAwards, "table_name": "award_financial", - "download_name": "account_breakdown_by_award", + "download_name": "AccountBreakdownByAward", + "zipfile_template": "FY{fy}Q1{date_range}_{level}_{agency}_AccountBreakdownByAward", "filter_function": account_download_filter, }, "idv_orders": { From 57d03d89167e9e8bdd67668eca5d87bb50496805 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Tue, 10 Dec 2019 15:23:13 -0700 Subject: [PATCH 13/33] [DEV-1566] most of the functionality is present --- usaspending_api/download/download_utils.py | 5 +++-- .../filestreaming/download_generation.py | 19 ++++++++++++++----- usaspending_api/download/lookups.py | 6 +++--- usaspending_api/download/v2/views.py | 3 +++ 4 files changed, 23 insertions(+), 10 deletions(-) diff --git a/usaspending_api/download/download_utils.py b/usaspending_api/download/download_utils.py index dc4a7831b7..93dcf436f5 100644 --- a/usaspending_api/download/download_utils.py +++ b/usaspending_api/download/download_utils.py @@ -28,10 +28,11 @@ def 
create_unique_filename(json_request, request_agency=None): download_name = file_name_template.format(fy=json_request["filters"]["fy"], date_range=additional_quarters, level=level, agency=prefix) else: # request_type: assistance, award, idv, account, contract + prefix = "All_" if json_request.get("constraint_type", "") == "year" else "" download_types = json_request["download_types"] - prefix = obtain_filename_prefix_from_agency_id(request_agency) + agency = obtain_filename_prefix_from_agency_id(request_agency) award_type_name = create_award_level_string(download_types) - download_name = "{}_{}".format(prefix, award_type_name) + download_name = f"{prefix}{agency}_{award_type_name}" datetime_format = "%Y-%m-%d_H%HM%MS%S%f" timestamped_file_name = get_timestamped_filename(f"{download_name}.zip", datetime_format=datetime_format) diff --git a/usaspending_api/download/filestreaming/download_generation.py b/usaspending_api/download/filestreaming/download_generation.py index 75f301eb2f..f5e887209c 100644 --- a/usaspending_api/download/filestreaming/download_generation.py +++ b/usaspending_api/download/filestreaming/download_generation.py @@ -9,6 +9,7 @@ import time import traceback +from datetime import datetime, timezone from django.conf import settings from usaspending_api.awards.v2.filters.filter_helpers import add_date_range_comparison_types @@ -177,14 +178,14 @@ def get_download_sources(json_request): def parse_source(source, columns, download_job, working_dir, piid, assistance_id, zip_file_path, limit, extension): """Write to delimited text file(s) and zip file(s) using the source data""" d_map = { - "d1": "contracts", - "d2": "assistance", + "d1": "Contracts", + "d2": "Assistance", "treasury_account": "treasury_account", "federal_account": "federal_account", } if download_job and download_job.monthly_download: - # Use existing detailed filename from parent file for monthly files - # e.g. 
`019_Assistance_Delta_20180917_%s.csv` + # For monthly archives, use the existing detailed zip filename for the data files + # e.g. FY(All)-012_Contracts_Delta_20191108.zip -> FY(All)-012_Contracts_Delta_20191108_%.csv source_name = strip_file_extension(download_job.file_name) elif source.is_for_idv or source.is_for_contract: file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"] @@ -193,8 +194,16 @@ def parse_source(source, columns, download_job, working_dir, piid, assistance_id file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"] source_name = file_name_pattern.format(assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN", 50)) else: + print(f"Source: {source} zip: {zip_file_path}, download {download_job.json_request}") download_name = VALUE_MAPPINGS[source.source_type]["download_name"] - source_name = f"{source.agency_code}_{d_map[source.file_type]}_{download_name}" + if zip_file_path.startswith("All"): + prefix = "All_" + agency = f"_{'All' if source.agency_code == 'all' else source.agency_code}" + else: + prefix = "" + agency = "" + timestamp = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d_H%HM%MS%S") + source_name = f"{prefix}{d_map[source.file_type]}{agency}_{download_name}_{timestamp}" source_query = source.row_emitter(columns) source.file_name = f"{source_name}.{extension}" diff --git a/usaspending_api/download/lookups.py b/usaspending_api/download/lookups.py index 9d74906054..d0b4bbcc58 100644 --- a/usaspending_api/download/lookups.py +++ b/usaspending_api/download/lookups.py @@ -61,7 +61,7 @@ "source_type": "award", "table": AwardSearchView, "table_name": "award", - "download_name": "prime_awards", + "download_name": "PrimeAwardSummaries", "contract_data": "award__latest_transaction__contract_data", "assistance_data": "award__latest_transaction__assistance_data", "filter_function": universal_award_matview_filter, @@ -72,7 +72,7 @@ "source_type": "award", "table": UniversalTransactionView, 
"table_name": "transaction", - "download_name": "prime_transactions", + "download_name": "PrimeTransactions", "contract_data": "transaction__contract_data", "assistance_data": "transaction__assistance_data", "filter_function": universal_transaction_matview_filter, @@ -83,7 +83,7 @@ "source_type": "award", "table": SubawardView, "table_name": "subaward", - "download_name": "subawards", + "download_name": "Subawards", "contract_data": "award__latest_transaction__contract_data", "assistance_data": "award__latest_transaction__assistance_data", "filter_function": subaward_download, diff --git a/usaspending_api/download/v2/views.py b/usaspending_api/download/v2/views.py index fccc1afdfe..e63f69dfc2 100644 --- a/usaspending_api/download/v2/views.py +++ b/usaspending_api/download/v2/views.py @@ -22,6 +22,7 @@ class RowLimitedIDVDownloadViewSet(BaseDownloadViewSet): endpoint_doc = "usaspending_api/api_contracts/contracts/v2/download/idv.md" def post(self, request): + request.data["constraint_type"] = "row_count" return BaseDownloadViewSet.post(self, request, "idv") @@ -33,6 +34,7 @@ class RowLimitedContractDownloadViewSet(BaseDownloadViewSet): endpoint_doc = "usaspending_api/api_contracts/contracts/v2/download/contract.md" def post(self, request): + request.data["constraint_type"] = "row_count" return BaseDownloadViewSet.post(self, request, "contract") @@ -44,6 +46,7 @@ class RowLimitedAssistanceDownloadViewSet(BaseDownloadViewSet): endpoint_doc = "usaspending_api/api_contracts/contracts/v2/download/assistance.md" def post(self, request): + request.data["constraint_type"] = "row_count" return BaseDownloadViewSet.post(self, request, "assistance") From d2931f3cca62ee6b747d0a8d96baffe2b3663633 Mon Sep 17 00:00:00 2001 From: sethstoudenmier Date: Wed, 11 Dec 2019 08:57:17 -0800 Subject: [PATCH 14/33] suggested changes and fix es tests --- usaspending_api/conftest_helpers.py | 11 ++++++++--- .../database_scripts/etl/transaction_delta_view.sql | 2 +- 
usaspending_api/etl/es_etl_helpers.py | 4 ++-- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/usaspending_api/conftest_helpers.py b/usaspending_api/conftest_helpers.py index eb361398ac..12146a227f 100644 --- a/usaspending_api/conftest_helpers.py +++ b/usaspending_api/conftest_helpers.py @@ -43,12 +43,17 @@ def update_index(self): def _add_contents(self): """ - Get all of the transactions presented in the view and - stuff them into the Elasticsearch index. + Get all of the transactions presented in the view and stuff them into the Elasticsearch index. + The view is only needed to load the transactions into Elasticsearch so it is dropped after each use. """ + transaction_delta_view_sql = open( + str(settings.APP_DIR / "database_scripts" / "etl" / "transaction_delta_view.sql"), "r" + ).read() with connection.cursor() as cursor: - cursor.execute("SELECT * FROM {}".format(settings.ES_TRANSACTIONS_ETL_VIEW_NAME)) + cursor.execute(transaction_delta_view_sql) + cursor.execute(f"SELECT * FROM {settings.ES_TRANSACTIONS_ETL_VIEW_NAME};") transactions = ordered_dictionary_fetcher(cursor) + cursor.execute(f"DROP VIEW IF EXISTS {settings.ES_TRANSACTIONS_ETL_VIEW_NAME};") for transaction in transactions: self.client.index( diff --git a/usaspending_api/database_scripts/etl/transaction_delta_view.sql b/usaspending_api/database_scripts/etl/transaction_delta_view.sql index c0d66b1e37..a97dcd5701 100644 --- a/usaspending_api/database_scripts/etl/transaction_delta_view.sql +++ b/usaspending_api/database_scripts/etl/transaction_delta_view.sql @@ -135,4 +135,4 @@ LEFT JOIN ( faba.award_id IS NOT NULL GROUP BY faba.award_id -) ACCT ON (ACCT.award_id = TM.award_id);; +) ACCT ON (ACCT.award_id = TM.award_id); diff --git a/usaspending_api/etl/es_etl_helpers.py b/usaspending_api/etl/es_etl_helpers.py index 2544b9083a..90b7109314 100644 --- a/usaspending_api/etl/es_etl_helpers.py +++ b/usaspending_api/etl/es_etl_helpers.py @@ -169,10 +169,10 @@ def __init__(self, *args): def 
convert_postgres_array_as_string_to_list(array_as_string: str) -> list: """ Postgres arrays are stored in CSVs as strings. Elasticsearch is able to handle lists of items, but needs to - be passed a list instead of a string. In the case of an empty array, return null + be passed a list instead of a string. In the case of an empty array, return null. For example, "{this,is,a,postgres,array}" -> ["this", "is", "a", "postgres", "array"]. """ - return array_as_string.replace("{", "").replace("}", "").split(",") if len(array_as_string) > 2 else None + return array_as_string[1:-1].split(",") if len(array_as_string) > 2 else None def process_guarddog(process_list): From ca3808ad0bc7ecf109185eb51e044b43b11c21d5 Mon Sep 17 00:00:00 2001 From: sethstoudenmier Date: Wed, 11 Dec 2019 10:50:55 -0800 Subject: [PATCH 15/33] requested changes --- usaspending_api/conftest_helpers.py | 2 +- usaspending_api/search/v2/elasticsearch_helper.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/usaspending_api/conftest_helpers.py b/usaspending_api/conftest_helpers.py index 12146a227f..2a5809c2bd 100644 --- a/usaspending_api/conftest_helpers.py +++ b/usaspending_api/conftest_helpers.py @@ -53,7 +53,7 @@ def _add_contents(self): cursor.execute(transaction_delta_view_sql) cursor.execute(f"SELECT * FROM {settings.ES_TRANSACTIONS_ETL_VIEW_NAME};") transactions = ordered_dictionary_fetcher(cursor) - cursor.execute(f"DROP VIEW IF EXISTS {settings.ES_TRANSACTIONS_ETL_VIEW_NAME};") + cursor.execute(f"DROP VIEW {settings.ES_TRANSACTIONS_ETL_VIEW_NAME};") for transaction in transactions: self.client.index( diff --git a/usaspending_api/search/v2/elasticsearch_helper.py b/usaspending_api/search/v2/elasticsearch_helper.py index acbcc0c739..bf40fa3887 100644 --- a/usaspending_api/search/v2/elasticsearch_helper.py +++ b/usaspending_api/search/v2/elasticsearch_helper.py @@ -11,8 +11,6 @@ logger = logging.getLogger("console") DOWNLOAD_QUERY_SIZE = settings.MAX_DOWNLOAD_LIMIT 
-KEYWORD_DATATYPE_FIELDS = [field for field in KEYWORD_DATATYPE_FIELDS] - TRANSACTIONS_LOOKUP.update({v: k for k, v in TRANSACTIONS_LOOKUP.items()}) From 6d093971cfae693adf2bd47ff29770c60e427f98 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Wed, 11 Dec 2019 14:54:19 -0700 Subject: [PATCH 16/33] [DEV-1566] more progress --- usaspending_api/download/download_utils.py | 23 ++++++++---- .../filestreaming/download_generation.py | 37 ++++++++++++++----- usaspending_api/download/lookups.py | 21 ++++++----- .../commands/populate_monthly_delta_files.py | 2 +- .../download/v2/base_download_viewset.py | 3 +- .../download/v2/request_validations.py | 2 + .../download/v2/year_limited_downloads.py | 1 - 7 files changed, 58 insertions(+), 31 deletions(-) diff --git a/usaspending_api/download/download_utils.py b/usaspending_api/download/download_utils.py index 93dcf436f5..5adcdaa3b0 100644 --- a/usaspending_api/download/download_utils.py +++ b/usaspending_api/download/download_utils.py @@ -7,9 +7,6 @@ def create_unique_filename(json_request, request_agency=None): - import json - print(f"agency: {request_agency} request: {json.dumps(json_request)}") - if json_request.get("is_for_idv"): download_name = "IDV_" + slugify_text_for_file_names(json_request.get("piid"), "UNKNOWN", 50) elif json_request.get("is_for_contract"): @@ -25,10 +22,14 @@ def create_unique_filename(json_request, request_agency=None): if json_request["filters"]["quarter"] != 1: additional_quarters = f"-Q{json_request['filters']['quarter']}" - download_name = file_name_template.format(fy=json_request["filters"]["fy"], date_range=additional_quarters, level=level, agency=prefix) + download_name = file_name_template.format( + fy=json_request["filters"]["fy"], date_range=additional_quarters, level=level, agency=prefix + ) else: - # request_type: assistance, award, idv, account, contract - prefix = "All_" if json_request.get("constraint_type", "") == "year" else "" + if 
json_request.get("constraint_type", "") == "year": + prefix = "All_" if request_agency == "all" else f"{request_agency}_" + else: + prefix = "" download_types = json_request["download_types"] agency = obtain_filename_prefix_from_agency_id(request_agency) award_type_name = create_award_level_string(download_types) @@ -47,7 +48,7 @@ def obtain_zip_filename_format(download_types): def obtain_filename_prefix_from_agency_id(request_agency): result = "All" - if request_agency: + if request_agency and request_agency != "all": toptier_agency_filter = ToptierAgency.objects.filter(toptier_agency_id=request_agency).first() if toptier_agency_filter: result = toptier_agency_filter.toptier_code @@ -55,7 +56,13 @@ def obtain_filename_prefix_from_agency_id(request_agency): def create_award_level_string(download_types): - return "+".join(VALUE_MAPPINGS[award_level]["download_name"] for award_level in download_types) + type_list = [] + for award_level in download_types: + if "type_name" in VALUE_MAPPINGS[award_level]: + type_list.append(VALUE_MAPPINGS[award_level]["type_name"]) + else: + type_list.append(VALUE_MAPPINGS[award_level]["download_name"]) + return "+".join(type_list) def get_timestamped_filename(filename, datetime_format="%Y%m%d%H%M%S%f"): diff --git a/usaspending_api/download/filestreaming/download_generation.py b/usaspending_api/download/filestreaming/download_generation.py index f5e887209c..886ea3c95f 100644 --- a/usaspending_api/download/filestreaming/download_generation.py +++ b/usaspending_api/download/filestreaming/download_generation.py @@ -124,7 +124,7 @@ def generate_download(download_job): def get_download_sources(json_request): download_sources = [] for download_type in json_request["download_types"]: - agency_id = json_request["filters"].get("agency", "all") + agency_id = json_request.get("agency", "all") filter_function = VALUE_MAPPINGS[download_type]["filter_function"] download_type_table = VALUE_MAPPINGS[download_type]["table"] @@ -180,8 +180,8 @@ def 
parse_source(source, columns, download_job, working_dir, piid, assistance_id d_map = { "d1": "Contracts", "d2": "Assistance", - "treasury_account": "treasury_account", - "federal_account": "federal_account", + "treasury_account": "TAS", + "federal_account": "FA", } if download_job and download_job.monthly_download: # For monthly archives, use the existing detailed zip filename for the data files @@ -194,16 +194,33 @@ def parse_source(source, columns, download_job, working_dir, piid, assistance_id file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"] source_name = file_name_pattern.format(assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN", 50)) else: - print(f"Source: {source} zip: {zip_file_path}, download {download_job.json_request}") - download_name = VALUE_MAPPINGS[source.source_type]["download_name"] - if zip_file_path.startswith("All"): - prefix = "All_" - agency = f"_{'All' if source.agency_code == 'all' else source.agency_code}" + file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"] + if_agency = "_" + + if source.agency_code == "all": + agency = "All" else: - prefix = "" + agency = str(source.agency_code) + + request = json.loads(download_job.json_request) + filters = request["filters"] + fy = filters.get("fy") + date_range = "" + if filters.get("quarter") != 1: + date_range = f"-Q{filters.get('quarter')}" + if request.get("limit"): agency = "" + if_agency = "" timestamp = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d_H%HM%MS%S") - source_name = f"{prefix}{d_map[source.file_type]}{agency}_{download_name}_{timestamp}" + source_name = file_name_pattern.format( + agency=agency, + level=d_map[source.file_type], + timestamp=timestamp, + fy=fy, + date_range=date_range, + if_agency=if_agency, + type=d_map[source.file_type], + ) source_query = source.row_emitter(columns) source.file_name = f"{source_name}.{extension}" diff --git a/usaspending_api/download/lookups.py b/usaspending_api/download/lookups.py 
index d0b4bbcc58..c051b1b3b4 100644 --- a/usaspending_api/download/lookups.py +++ b/usaspending_api/download/lookups.py @@ -61,7 +61,8 @@ "source_type": "award", "table": AwardSearchView, "table_name": "award", - "download_name": "PrimeAwardSummaries", + "type_name": "PrimeAwardSummaries", + "download_name": "{agency}{if_agency}{type}_PrimeAwardSummaries_{timestamp}", "contract_data": "award__latest_transaction__contract_data", "assistance_data": "award__latest_transaction__assistance_data", "filter_function": universal_award_matview_filter, @@ -72,7 +73,8 @@ "source_type": "award", "table": UniversalTransactionView, "table_name": "transaction", - "download_name": "PrimeTransactions", + "type_name": "PrimeTransactions", + "download_name": "{agency}{if_agency}{type}_PrimeTransactions_{timestamp}", "contract_data": "transaction__contract_data", "assistance_data": "transaction__assistance_data", "filter_function": universal_transaction_matview_filter, @@ -83,7 +85,8 @@ "source_type": "award", "table": SubawardView, "table_name": "subaward", - "download_name": "Subawards", + "type_name": "Subawards", + "download_name": "{agency}{if_agency}{type}_Subawards_{timestamp}", "contract_data": "award__latest_transaction__contract_data", "assistance_data": "award__latest_transaction__assistance_data", "filter_function": subaward_download, @@ -94,8 +97,8 @@ "source_type": "account", "table": AppropriationAccountBalances, "table_name": "account_balances", - "download_name": "AccountBalances", - "zipfile_template": "FY{fy}Q1{date_range}_{level}_{agency}_AccountBalances", + "download_name": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBalances_{timestamp}", + "zipfile_template": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBalances", "filter_function": account_download_filter, }, # Object Class Program Activity Account Data @@ -103,16 +106,16 @@ "source_type": "account", "table": FinancialAccountsByProgramActivityObjectClass, "table_name": "object_class_program_activity", - 
"download_name": "AccountBreakdownByProgramActivityObjectClass", - "zipfile_template": "FY{fy}Q1{date_range}_{level}_{agency}_AccountBreakdownByProgramActivityObjectClass", + "download_name": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass_{timestamp}", + "zipfile_template": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass", "filter_function": account_download_filter, }, "award_financial": { "source_type": "account", "table": FinancialAccountsByAwards, "table_name": "award_financial", - "download_name": "AccountBreakdownByAward", - "zipfile_template": "FY{fy}Q1{date_range}_{level}_{agency}_AccountBreakdownByAward", + "download_name": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBreakdownByAward_{timestamp}", + "zipfile_template": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBreakdownByAward", "filter_function": account_download_filter, }, "idv_orders": { diff --git a/usaspending_api/download/management/commands/populate_monthly_delta_files.py b/usaspending_api/download/management/commands/populate_monthly_delta_files.py index 0b6f4cae58..ccba5afe84 100644 --- a/usaspending_api/download/management/commands/populate_monthly_delta_files.py +++ b/usaspending_api/download/management/commands/populate_monthly_delta_files.py @@ -154,7 +154,7 @@ def create_local_file(self, award_type, source, agency_code, generate_since): working_dir = f"{settings.CSV_LOCAL_PATH}_{agency_code}_delta_gen_{timestamp}/" if not os.path.exists(working_dir): os.mkdir(working_dir) - agency_str = 'All' if agency_code == 'all' else agency_code + agency_str = "All" if agency_code == "all" else agency_code source_name = f"FY(All)-{agency_str}_{award_type}_Delta_{datetime.strftime(date.today(), '%Y%m%d')}" source_path = os.path.join(working_dir, "{}.csv".format(source_name)) diff --git a/usaspending_api/download/v2/base_download_viewset.py b/usaspending_api/download/v2/base_download_viewset.py index aea8ac9476..e189b908f6 100644 
--- a/usaspending_api/download/v2/base_download_viewset.py +++ b/usaspending_api/download/v2/base_download_viewset.py @@ -64,8 +64,7 @@ def post(self, request, request_type="award"): cached_filename = cached_download["file_name"] return self.get_download_response(file_name=cached_filename) - request_agency = json_request.get("filters", {}).get("agency", None) - final_output_zip_name = create_unique_filename(json_request, request_agency) + final_output_zip_name = create_unique_filename(json_request, json_request["agency"]) download_job = DownloadJob.objects.create( job_status_id=JOB_STATUS_DICT["ready"], file_name=final_output_zip_name, json_request=ordered_json_request ) diff --git a/usaspending_api/download/v2/request_validations.py b/usaspending_api/download/v2/request_validations.py index 8dc76fa728..5869c0270c 100644 --- a/usaspending_api/download/v2/request_validations.py +++ b/usaspending_api/download/v2/request_validations.py @@ -34,6 +34,7 @@ def validate_award_request(request_data): json_request = {"download_types": award_levels, "filters": {}} # Set defaults of non-required parameters + json_request["agency"] = request_data["filters"]["agency"] if request_data["filters"].get("agency") else "all" json_request["columns"] = request_data.get("columns", []) json_request["file_format"] = str(request_data.get("file_format", "csv")).lower() @@ -193,6 +194,7 @@ def validate_account_request(request_data): raise InvalidParameterException("Invalid Parameter: submission_type must be {}".format(valid_submissions)) json_request["download_types"] = [filters["submission_type"]] + json_request["agency"] = request_data["filters"]["agency"] if request_data["filters"].get("agency") else "all" # Validate the rest of the filters check_types_and_assign_defaults(filters, json_request["filters"], ACCOUNT_FILTER_DEFAULTS) diff --git a/usaspending_api/download/v2/year_limited_downloads.py b/usaspending_api/download/v2/year_limited_downloads.py index 46cc67aad4..a3b9bca3b8 100644 --- 
a/usaspending_api/download/v2/year_limited_downloads.py +++ b/usaspending_api/download/v2/year_limited_downloads.py @@ -78,6 +78,5 @@ def process_filters(self, request_data): del filters["sub_agency"] else: filters["agencies"] = [{"type": "awarding", "tier": "toptier", "name": toptier_name}] - del filters["agency"] request_data["filters"] = filters From d15de9c94859772a9b164c0219aac4f408846b7b Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Wed, 11 Dec 2019 15:58:00 -0700 Subject: [PATCH 17/33] [DEV-1566] minor cleanup --- usaspending_api/download/v2/base_download_viewset.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/usaspending_api/download/v2/base_download_viewset.py b/usaspending_api/download/v2/base_download_viewset.py index e189b908f6..b367cf3118 100644 --- a/usaspending_api/download/v2/base_download_viewset.py +++ b/usaspending_api/download/v2/base_download_viewset.py @@ -58,13 +58,11 @@ def post(self, request, request_type="award"): if cached_download and not settings.IS_LOCAL: # By returning the cached files, there should be no duplicates on a daily basis - write_to_log( - message="Generating file from cached download job ID: {}".format(cached_download["download_job_id"]) - ) + write_to_log(message=f"Generating file from cached download job ID: {cached_download['download_job_id']}") cached_filename = cached_download["file_name"] return self.get_download_response(file_name=cached_filename) - final_output_zip_name = create_unique_filename(json_request, json_request["agency"]) + final_output_zip_name = create_unique_filename(json_request, json_request.get("agency", "all")) download_job = DownloadJob.objects.create( job_status_id=JOB_STATUS_DICT["ready"], file_name=final_output_zip_name, json_request=ordered_json_request ) @@ -82,7 +80,7 @@ def process_request(self, download_job): # Send a SQS message that will be processed by another server which will eventually run # 
download_generation.write_csvs(**kwargs) (see download_sqs_worker.py) write_to_log( - message="Passing download_job {} to SQS".format(download_job.download_job_id), download_job=download_job + message=f"Passing download_job {download_job.download_job_id} to SQS", download_job=download_job ) queue = get_sqs_queue_resource(queue_name=settings.BULK_DOWNLOAD_SQS_QUEUE_NAME) queue.send_message(MessageBody=str(download_job.download_job_id)) @@ -92,7 +90,7 @@ def get_download_response(self, file_name): download job""" download_job = DownloadJob.objects.filter(file_name=file_name).first() if not download_job: - raise NotFound("Download job with filename {} does not exist.".format(file_name)) + raise NotFound(f"Download job with filename {file_name} does not exist.") # Compile url to file if settings.IS_LOCAL: From 6dc6a9bb626ad8a51c7bdfc787434d7106c1270c Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Thu, 12 Dec 2019 07:24:34 -0700 Subject: [PATCH 18/33] [DEV-1566] using valid character for filename concat --- usaspending_api/download/download_utils.py | 15 +++++++++------ .../filestreaming/download_generation.py | 14 +++++--------- usaspending_api/download/lookups.py | 18 +++++++++--------- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/usaspending_api/download/download_utils.py b/usaspending_api/download/download_utils.py index 5adcdaa3b0..caf293e25f 100644 --- a/usaspending_api/download/download_utils.py +++ b/usaspending_api/download/download_utils.py @@ -17,13 +17,9 @@ def create_unique_filename(json_request, request_agency=None): file_name_template = obtain_zip_filename_format(json_request["download_types"]) prefix = obtain_filename_prefix_from_agency_id(request_agency) level = "FA" if json_request["account_level"] == "federal_account" else "TAS" - additional_quarters = "" - - if json_request["filters"]["quarter"] != 1: - additional_quarters = f"-Q{json_request['filters']['quarter']}" download_name = 
file_name_template.format( - fy=json_request["filters"]["fy"], date_range=additional_quarters, level=level, agency=prefix + data_quarters=construct_data_date_range(json_request["filters"]), level=level, agency=prefix ) else: if json_request.get("constraint_type", "") == "year": @@ -62,7 +58,7 @@ def create_award_level_string(download_types): type_list.append(VALUE_MAPPINGS[award_level]["type_name"]) else: type_list.append(VALUE_MAPPINGS[award_level]["download_name"]) - return "+".join(type_list) + return "And".join(type_list) def get_timestamped_filename(filename, datetime_format="%Y%m%d%H%M%S%f"): @@ -78,3 +74,10 @@ def log_new_download_job(request, download_job): download_job=download_job, other_params={"request_addr": get_remote_addr(request)}, ) + + +def construct_data_date_range(provided_filters: dict) -> str: + string = f"FY{provided_filters.get('fy')}Q1" + if provided_filters.get("quarter") != 1: + string += f"-Q{provided_filters.get('quarter')}" + return string diff --git a/usaspending_api/download/filestreaming/download_generation.py b/usaspending_api/download/filestreaming/download_generation.py index 886ea3c95f..520ad74c30 100644 --- a/usaspending_api/download/filestreaming/download_generation.py +++ b/usaspending_api/download/filestreaming/download_generation.py @@ -19,6 +19,7 @@ from usaspending_api.common.helpers.orm_helpers import generate_raw_quoted_query from usaspending_api.common.helpers.text_helpers import slugify_text_for_file_names from usaspending_api.common.retrieve_file_from_uri import RetrieveFileFromUri +from usaspending_api.download.download_utils import construct_data_date_range from usaspending_api.download.filestreaming.download_source import DownloadSource from usaspending_api.download.filestreaming.file_description import build_file_description, save_file_description from usaspending_api.download.filestreaming.zip_file import append_files_to_zip_file @@ -195,7 +196,7 @@ def parse_source(source, columns, download_job, working_dir, 
piid, assistance_id source_name = file_name_pattern.format(assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN", 50)) else: file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"] - if_agency = "_" + agency_is_optional = "_" if source.agency_code == "all": agency = "All" @@ -204,21 +205,16 @@ def parse_source(source, columns, download_job, working_dir, piid, assistance_id request = json.loads(download_job.json_request) filters = request["filters"] - fy = filters.get("fy") - date_range = "" - if filters.get("quarter") != 1: - date_range = f"-Q{filters.get('quarter')}" if request.get("limit"): agency = "" - if_agency = "" + agency_is_optional = "" timestamp = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d_H%HM%MS%S") source_name = file_name_pattern.format( agency=agency, + agency_is_optional=agency_is_optional, + data_quarters=construct_data_date_range(filters), level=d_map[source.file_type], timestamp=timestamp, - fy=fy, - date_range=date_range, - if_agency=if_agency, type=d_map[source.file_type], ) diff --git a/usaspending_api/download/lookups.py b/usaspending_api/download/lookups.py index c051b1b3b4..26e6619c55 100644 --- a/usaspending_api/download/lookups.py +++ b/usaspending_api/download/lookups.py @@ -62,7 +62,7 @@ "table": AwardSearchView, "table_name": "award", "type_name": "PrimeAwardSummaries", - "download_name": "{agency}{if_agency}{type}_PrimeAwardSummaries_{timestamp}", + "download_name": "{agency}{agency_is_optional}{type}_PrimeAwardSummaries_{timestamp}", "contract_data": "award__latest_transaction__contract_data", "assistance_data": "award__latest_transaction__assistance_data", "filter_function": universal_award_matview_filter, @@ -74,7 +74,7 @@ "table": UniversalTransactionView, "table_name": "transaction", "type_name": "PrimeTransactions", - "download_name": "{agency}{if_agency}{type}_PrimeTransactions_{timestamp}", + "download_name": "{agency}{agency_is_optional}{type}_PrimeTransactions_{timestamp}", 
"contract_data": "transaction__contract_data", "assistance_data": "transaction__assistance_data", "filter_function": universal_transaction_matview_filter, @@ -86,7 +86,7 @@ "table": SubawardView, "table_name": "subaward", "type_name": "Subawards", - "download_name": "{agency}{if_agency}{type}_Subawards_{timestamp}", + "download_name": "{agency}{agency_is_optional}{type}_Subawards_{timestamp}", "contract_data": "award__latest_transaction__contract_data", "assistance_data": "award__latest_transaction__assistance_data", "filter_function": subaward_download, @@ -97,8 +97,8 @@ "source_type": "account", "table": AppropriationAccountBalances, "table_name": "account_balances", - "download_name": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBalances_{timestamp}", - "zipfile_template": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBalances", + "download_name": "{data_quarters}_{agency}_{level}_AccountBalances_{timestamp}", + "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBalances", "filter_function": account_download_filter, }, # Object Class Program Activity Account Data @@ -106,16 +106,16 @@ "source_type": "account", "table": FinancialAccountsByProgramActivityObjectClass, "table_name": "object_class_program_activity", - "download_name": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass_{timestamp}", - "zipfile_template": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass", + "download_name": "{data_quarters}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass_{timestamp}", + "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass", "filter_function": account_download_filter, }, "award_financial": { "source_type": "account", "table": FinancialAccountsByAwards, "table_name": "award_financial", - "download_name": "FY{fy}Q1{date_range}_{agency}_{level}_AccountBreakdownByAward_{timestamp}", - "zipfile_template": 
"FY{fy}Q1{date_range}_{agency}_{level}_AccountBreakdownByAward", + "download_name": "{data_quarters}_{agency}_{level}_AccountBreakdownByAward_{timestamp}", + "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBreakdownByAward", "filter_function": account_download_filter, }, "idv_orders": { From a86a6113daa822c676027a4fc7d339ff0c6f190c Mon Sep 17 00:00:00 2001 From: Kirk Barden Date: Thu, 12 Dec 2019 09:36:53 -0500 Subject: [PATCH 19/33] dev-4006: convert report period to FYQ --- usaspending_api/accounts/v2/filters/account_download.py | 2 ++ .../download/v2/download_column_historical_lookups.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/usaspending_api/accounts/v2/filters/account_download.py b/usaspending_api/accounts/v2/filters/account_download.py index 91668a8332..240449eb9a 100644 --- a/usaspending_api/accounts/v2/filters/account_download.py +++ b/usaspending_api/accounts/v2/filters/account_download.py @@ -168,6 +168,7 @@ def generate_treasury_account_query(queryset, account_type, tas_id): Value("-"), "{}__federal_account__main_account_code".format(tas_id), ), + "submission_period": FiscalYearAndQuarter("reporting_period_end"), } # Derive recipient_parent_name @@ -188,6 +189,7 @@ def generate_federal_account_query(queryset, account_type, tas_id): "{}__federal_account__main_account_code".format(tas_id), ), "agency_name": get_agency_name_annotation(tas_id, "agency_id"), + "submission_period": FiscalYearAndQuarter("reporting_period_end"), } # Derive recipient_parent_name for award_financial downloads diff --git a/usaspending_api/download/v2/download_column_historical_lookups.py b/usaspending_api/download/v2/download_column_historical_lookups.py index 7784cb3b8e..77cc02bfe2 100644 --- a/usaspending_api/download/v2/download_column_historical_lookups.py +++ b/usaspending_api/download/v2/download_column_historical_lookups.py @@ -1585,7 +1585,7 @@ "award_financial": { "treasury_account": OrderedDict( [ - ("submission_period", 
"reporting_period_end"), + ("submission_period", "submission_period"), # Column is appended to in account_download.py ("allocation_transfer_agency_identifier", "treasury_account__allocation_transfer_agency_id"), ("agency_identifier", "treasury_account__agency_id"), ("beginning_period_of_availability", "treasury_account__beginning_period_of_availability"), @@ -1649,7 +1649,7 @@ ), "federal_account": OrderedDict( [ - ("submission_period", "reporting_period_end"), + ("submission_period", "submission_period"), # Column is appended to in account_download.py ("federal_account_symbol", "federal_account_symbol"), # Column is appended to in account_download.py ("federal_account_name", "treasury_account__federal_account__account_title"), ("agency_name", "agency_name"), # Column is appended to in account_download.py From 363018bb69023bb9eb1cd98815aac8aede5b0cb6 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Thu, 12 Dec 2019 10:46:14 -0700 Subject: [PATCH 20/33] [DEV-1566] all renames completed --- usaspending_api/download/download_utils.py | 44 +++++++++---------- usaspending_api/download/lookups.py | 6 +-- .../download/v2/base_download_viewset.py | 4 +- .../download/v2/list_monthly_downloads.py | 6 +-- .../download/v2/year_limited_downloads.py | 2 +- 5 files changed, 29 insertions(+), 33 deletions(-) diff --git a/usaspending_api/download/download_utils.py b/usaspending_api/download/download_utils.py index caf293e25f..5b91c53810 100644 --- a/usaspending_api/download/download_utils.py +++ b/usaspending_api/download/download_utils.py @@ -6,40 +6,43 @@ from usaspending_api.references.models import ToptierAgency -def create_unique_filename(json_request, request_agency=None): +def create_unique_filename(json_request, origination=None): + timestamp = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d_H%HM%MS%S%f") + request_agency = json_request.get("agency", "all") + if json_request.get("is_for_idv"): - download_name = "IDV_" + 
slugify_text_for_file_names(json_request.get("piid"), "UNKNOWN", 50) + slug_text = slugify_text_for_file_names(json_request.get("piid"), "UNKNOWN", 50) + download_name = f"IDV_{slug_text}_{timestamp}.zip" elif json_request.get("is_for_contract"): - download_name = "CONT_" + slugify_text_for_file_names(json_request.get("piid"), "UNKNOWN", 50) + slug_text = slugify_text_for_file_names(json_request.get("piid"), "UNKNOWN", 50) + download_name = f"CONT_{slug_text}_{timestamp}.zip" elif json_request.get("is_for_assistance"): - download_name = "ASST_" + slugify_text_for_file_names(json_request.get("assistance_id"), "UNKNOWN", 50) + slug_text = slugify_text_for_file_names(json_request.get("assistance_id"), "UNKNOWN", 50) + download_name = f"ASST_{slug_text}_{timestamp}.zip" elif json_request["request_type"] == "account": file_name_template = obtain_zip_filename_format(json_request["download_types"]) - prefix = obtain_filename_prefix_from_agency_id(request_agency) + agency = obtain_filename_prefix_from_agency_id(request_agency) level = "FA" if json_request["account_level"] == "federal_account" else "TAS" + data_quarters = construct_data_date_range(json_request["filters"]) download_name = file_name_template.format( - data_quarters=construct_data_date_range(json_request["filters"]), level=level, agency=prefix + agency=agency, data_quarters=data_quarters, level=level, timestamp=timestamp, ) - else: - if json_request.get("constraint_type", "") == "year": - prefix = "All_" if request_agency == "all" else f"{request_agency}_" - else: - prefix = "" + else: # "award" downloads download_types = json_request["download_types"] - agency = obtain_filename_prefix_from_agency_id(request_agency) + agency = "" award_type_name = create_award_level_string(download_types) - download_name = f"{prefix}{agency}_{award_type_name}" + if origination == "bulk_download": + agency = obtain_filename_prefix_from_agency_id(request_agency) + "_" + download_name = 
f"{agency}{award_type_name}_{timestamp}.zip" - datetime_format = "%Y-%m-%d_H%HM%MS%S%f" - timestamped_file_name = get_timestamped_filename(f"{download_name}.zip", datetime_format=datetime_format) - return timestamped_file_name + return download_name def obtain_zip_filename_format(download_types): if len(download_types) > 1: raise NotImplementedError - return VALUE_MAPPINGS[download_types[0]]["zipfile_template"] + return VALUE_MAPPINGS[download_types[0]]["zipfile_template"] + ".zip" def obtain_filename_prefix_from_agency_id(request_agency): @@ -61,13 +64,6 @@ def create_award_level_string(download_types): return "And".join(type_list) -def get_timestamped_filename(filename, datetime_format="%Y%m%d%H%M%S%f"): - """Return an updated filename to include current timestamp""" - file_sans_extension, file_extension = filename.split(".") - timestamp = datetime.strftime(datetime.now(timezone.utc), datetime_format) - return "{}_{}.{}".format(file_sans_extension, timestamp, file_extension) - - def log_new_download_job(request, download_job): write_to_download_log( message="Starting new download job [{}]".format(download_job.download_job_id), diff --git a/usaspending_api/download/lookups.py b/usaspending_api/download/lookups.py index 26e6619c55..0dda4745d8 100644 --- a/usaspending_api/download/lookups.py +++ b/usaspending_api/download/lookups.py @@ -98,7 +98,7 @@ "table": AppropriationAccountBalances, "table_name": "account_balances", "download_name": "{data_quarters}_{agency}_{level}_AccountBalances_{timestamp}", - "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBalances", + "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBalances_{timestamp}", "filter_function": account_download_filter, }, # Object Class Program Activity Account Data @@ -107,7 +107,7 @@ "table": FinancialAccountsByProgramActivityObjectClass, "table_name": "object_class_program_activity", "download_name": 
"{data_quarters}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass_{timestamp}", - "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass", + "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass_{timestamp}", "filter_function": account_download_filter, }, "award_financial": { @@ -115,7 +115,7 @@ "table": FinancialAccountsByAwards, "table_name": "award_financial", "download_name": "{data_quarters}_{agency}_{level}_AccountBreakdownByAward_{timestamp}", - "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBreakdownByAward", + "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBreakdownByAward_{timestamp}", "filter_function": account_download_filter, }, "idv_orders": { diff --git a/usaspending_api/download/v2/base_download_viewset.py b/usaspending_api/download/v2/base_download_viewset.py index b367cf3118..bd2206dbb7 100644 --- a/usaspending_api/download/v2/base_download_viewset.py +++ b/usaspending_api/download/v2/base_download_viewset.py @@ -31,7 +31,7 @@ class BaseDownloadViewSet(APIView): bucket_name=settings.BULK_DOWNLOAD_S3_BUCKET_NAME, redirect_dir=settings.BULK_DOWNLOAD_S3_REDIRECT_DIR ) - def post(self, request, request_type="award"): + def post(self, request, request_type="award", origination=None): if request_type == "award": json_request = validate_award_request(request.data) elif request_type == "idv": @@ -62,7 +62,7 @@ def post(self, request, request_type="award"): cached_filename = cached_download["file_name"] return self.get_download_response(file_name=cached_filename) - final_output_zip_name = create_unique_filename(json_request, json_request.get("agency", "all")) + final_output_zip_name = create_unique_filename(json_request, origination=origination) download_job = DownloadJob.objects.create( job_status_id=JOB_STATUS_DICT["ready"], file_name=final_output_zip_name, json_request=ordered_json_request ) diff --git 
a/usaspending_api/download/v2/list_monthly_downloads.py b/usaspending_api/download/v2/list_monthly_downloads.py index 218e2c2df0..d254e3ccb7 100644 --- a/usaspending_api/download/v2/list_monthly_downloads.py +++ b/usaspending_api/download/v2/list_monthly_downloads.py @@ -37,7 +37,7 @@ def post(self, request): # Capitalize type_param and retrieve agency information from agency ID download_type = type_param.capitalize() if agency_id == "all": - agency = {"toptier_code": "all", "name": "All", "abbreviation": None} + agency = {"toptier_code": "All", "name": "All", "abbreviation": None} else: agency_check = ToptierAgency.objects.filter(toptier_agency_id=agency_id).values( "toptier_code", "name", "abbreviation" @@ -48,9 +48,9 @@ def post(self, request): raise InvalidParameterException("{} agency not found".format(agency_id)) # Populate regex - monthly_download_prefixes = "{}_{}_{}".format(fiscal_year, agency["toptier_code"], download_type) + monthly_download_prefixes = f"FY{fiscal_year}_{agency['toptier_code']}_{download_type}" monthly_download_regex = r"{}_Full_.*\.zip".format(monthly_download_prefixes) - delta_download_prefixes = "{}_{}".format(agency["toptier_code"], download_type) + delta_download_prefixes = f"FY(All)-{agency['toptier_code']}_{download_type}" delta_download_regex = r"{}_Delta_.*\.zip".format(delta_download_prefixes) # Retrieve and filter the files we need diff --git a/usaspending_api/download/v2/year_limited_downloads.py b/usaspending_api/download/v2/year_limited_downloads.py index a3b9bca3b8..76cc296b83 100644 --- a/usaspending_api/download/v2/year_limited_downloads.py +++ b/usaspending_api/download/v2/year_limited_downloads.py @@ -17,7 +17,7 @@ def post(self, request): # TODO: update front end to use the Common Filter Object and get rid of this function self.process_filters(request.data) - return BaseDownloadViewSet.post(self, request, "award") + return BaseDownloadViewSet.post(self, request, "award", "bulk_download") def process_filters(self, 
request_data): """Filter function to update Bulk Download parameters to shared parameters""" From ba2d13dc1931e7bd5c60f53613b671aa741f3711 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Thu, 12 Dec 2019 12:59:00 -0700 Subject: [PATCH 21/33] [DEV-1566] added support for new filenames to list_monthly_files --- .../download/v2/list_monthly_downloads.py | 32 ++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/usaspending_api/download/v2/list_monthly_downloads.py b/usaspending_api/download/v2/list_monthly_downloads.py index d254e3ccb7..38e80b4bba 100644 --- a/usaspending_api/download/v2/list_monthly_downloads.py +++ b/usaspending_api/download/v2/list_monthly_downloads.py @@ -51,7 +51,7 @@ def post(self, request): monthly_download_prefixes = f"FY{fiscal_year}_{agency['toptier_code']}_{download_type}" monthly_download_regex = r"{}_Full_.*\.zip".format(monthly_download_prefixes) delta_download_prefixes = f"FY(All)-{agency['toptier_code']}_{download_type}" - delta_download_regex = r"{}_Delta_.*\.zip".format(delta_download_prefixes) + delta_download_regex = r"FY\(All\)-{}_{}_Delta_.*\.zip".format(agency["toptier_code"], download_type) # Retrieve and filter the files we need bucket = boto3.resource("s3", region_name=self.s3_handler.region).Bucket(self.s3_handler.bucketRoute) @@ -68,6 +68,36 @@ def post(self, request): ) ) + ########################################## + # TEMPORARY 2019/12/12. 
REMOVE after 2020/01/15 + # KEEP old_* prefix and regex around until monthly files using the new format are + # generated and accessible in S3 + if agency["toptier_code"] == "All": + agency["toptier_code"] = "all" + old_monthly_download_prefixes = "{}_{}_{}".format(fiscal_year, agency["toptier_code"], download_type) + old_monthly_download_regex = r"{}_Full_.*\.zip".format(old_monthly_download_prefixes) + old_delta_download_prefixes = "{}_{}".format(agency["toptier_code"], download_type) + old_delta_download_regex = r"{}_Delta_.*\.zip".format(old_delta_download_prefixes) + + monthly_download_names.extend( + list( + filter( + re.compile(old_monthly_download_regex).search, + [key.key for key in bucket.objects.filter(Prefix=old_monthly_download_prefixes)], + ) + ) + ) + delta_download_names.extend( + list( + filter( + re.compile(old_delta_download_regex).search, + [key.key for key in bucket.objects.filter(Prefix=old_delta_download_prefixes)], + ) + ) + ) + ########################################## + ########################################## + # Generate response downloads = [] for filename in monthly_download_names: From 01011add126cf1d1ed2d13fc00dd5f8d5140d286 Mon Sep 17 00:00:00 2001 From: sethstoudenmier Date: Thu, 12 Dec 2019 12:08:41 -0800 Subject: [PATCH 22/33] requested changes and updated a few es fields --- .../etl/transaction_delta_view.sql | 3 +- .../mv_contract_award_search.json | 8 ++-- .../mv_directpayment_award_search.json | 8 ++-- .../mv_grant_award_search.json | 8 ++-- .../mv_idv_award_search.json | 8 ++-- .../mv_loan_award_search.json | 8 ++-- .../mv_other_award_search.json | 8 ++-- .../mv_pre2008_award_search.json | 8 ++-- .../matview_generator/subaward_view.json | 8 ++-- .../summary_transaction_geo_view.json | 8 ++-- .../summary_transaction_month_view.json | 8 ++-- .../summary_transaction_view.json | 8 ++-- usaspending_api/etl/es_etl_helpers.py | 1 - .../etl/es_transaction_template.json | 38 +++++++++++++++---- 
.../etl/management/commands/es_rapidloader.py | 23 +++++++++-- 15 files changed, 95 insertions(+), 58 deletions(-) diff --git a/usaspending_api/database_scripts/etl/transaction_delta_view.sql b/usaspending_api/database_scripts/etl/transaction_delta_view.sql index a97dcd5701..8e7e9ea877 100644 --- a/usaspending_api/database_scripts/etl/transaction_delta_view.sql +++ b/usaspending_api/database_scripts/etl/transaction_delta_view.sql @@ -77,7 +77,6 @@ SELECT UTM.type_of_contract_pricing, UTM.type_set_aside, UTM.extent_competed, - UTM.pulled_from, UTM.type, UTM.pop_country_code, @@ -107,7 +106,7 @@ INNER JOIN transaction_normalized TM ON (UTM.transaction_id = TM.id) LEFT JOIN transaction_fpds FPDS ON (UTM.transaction_id = FPDS.transaction_id) LEFT JOIN transaction_fabs FABS ON (UTM.transaction_id = FABS.transaction_id) LEFT JOIN awards AWD ON (UTM.award_id = AWD.id) --- Similar joins are already performed oon universal_transaction_matview, however, to avoid making the matview larger +-- Similar joins are already performed on universal_transaction_matview, however, to avoid making the matview larger -- than needed they have been placed here. Feel free to phase out if the columns gained from the following joins are -- added to the universal_transaction_matview. 
LEFT JOIN agency AA ON (TM.awarding_agency_id = AA.id) diff --git a/usaspending_api/database_scripts/matview_generator/mv_contract_award_search.json b/usaspending_api/database_scripts/matview_generator/mv_contract_award_search.json index 2770d1144d..f20ecad10e 100644 --- a/usaspending_api/database_scripts/matview_generator/mv_contract_award_search.json +++ b/usaspending_api/database_scripts/matview_generator/mv_contract_award_search.json @@ -59,20 +59,20 @@ " CASE WHEN COALESCE(transaction_fpds.legal_entity_country_code) = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fpds.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " transaction_fpds.legal_entity_country_name AS recipient_location_country_name,", " transaction_fpds.legal_entity_state_code AS recipient_location_state_code,", - " transaction_fpds.legal_entity_county_code AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.legal_entity_county_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " transaction_fpds.legal_entity_county_name AS recipient_location_county_name,", - " transaction_fpds.legal_entity_congressional AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.legal_entity_congressional, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " transaction_fpds.legal_entity_zip5 AS recipient_location_zip5,", " TRIM(TRAILING FROM transaction_fpds.legal_entity_city_name) AS recipient_location_city_name,", "", " transaction_fpds.place_of_perform_country_n AS pop_country_name,", " COALESCE(transaction_fpds.place_of_perform_country_c, 'USA') AS pop_country_code,", " transaction_fpds.place_of_performance_state AS pop_state_code,", - " transaction_fpds.place_of_perform_county_co AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.place_of_perform_county_co, 
'^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " transaction_fpds.place_of_perform_county_na AS pop_county_name,", " NULL::text AS pop_city_code,", " transaction_fpds.place_of_performance_zip5 AS pop_zip5,", - " transaction_fpds.place_of_performance_congr AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.place_of_performance_congr, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM transaction_fpds.place_of_perform_city_name) AS pop_city_name,", "", " NULL::text AS cfda_number,", diff --git a/usaspending_api/database_scripts/matview_generator/mv_directpayment_award_search.json b/usaspending_api/database_scripts/matview_generator/mv_directpayment_award_search.json index 1b55d9d538..05982980cd 100644 --- a/usaspending_api/database_scripts/matview_generator/mv_directpayment_award_search.json +++ b/usaspending_api/database_scripts/matview_generator/mv_directpayment_award_search.json @@ -56,20 +56,20 @@ " CASE WHEN transaction_fabs.legal_entity_country_code = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fabs.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " transaction_fabs.legal_entity_country_name AS recipient_location_country_name,", " transaction_fabs.legal_entity_state_code AS recipient_location_state_code,", - " transaction_fabs.legal_entity_county_code AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.legal_entity_county_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " transaction_fabs.legal_entity_county_name AS recipient_location_county_name,", - " transaction_fabs.legal_entity_congressional AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.legal_entity_congressional, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS 
recipient_location_congressional_code,", " transaction_fabs.legal_entity_zip5 AS recipient_location_zip5,", " TRIM(TRAILING FROM transaction_fabs.legal_entity_city_name) AS recipient_location_city_name,", "", " transaction_fabs.place_of_perform_country_n AS pop_country_name,", " COALESCE(transaction_fabs.place_of_perform_country_c,'USA') AS pop_country_code,", " transaction_fabs.place_of_perfor_state_code AS pop_state_code,", - " transaction_fabs.place_of_perform_county_co AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.place_of_perform_county_co, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " transaction_fabs.place_of_perform_county_na AS pop_county_name,", " transaction_fabs.place_of_performance_code AS pop_city_code,", " transaction_fabs.place_of_performance_zip5 AS pop_zip5,", - " transaction_fabs.place_of_performance_congr AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.place_of_performance_congr, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM transaction_fabs.place_of_performance_city) AS pop_city_name,", "", " transaction_fabs.cfda_number,", diff --git a/usaspending_api/database_scripts/matview_generator/mv_grant_award_search.json b/usaspending_api/database_scripts/matview_generator/mv_grant_award_search.json index d0abb14fb5..9f76ec65d4 100644 --- a/usaspending_api/database_scripts/matview_generator/mv_grant_award_search.json +++ b/usaspending_api/database_scripts/matview_generator/mv_grant_award_search.json @@ -56,20 +56,20 @@ " CASE WHEN transaction_fabs.legal_entity_country_code = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fabs.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " transaction_fabs.legal_entity_country_name AS recipient_location_country_name,", " transaction_fabs.legal_entity_state_code AS recipient_location_state_code,", - " 
transaction_fabs.legal_entity_county_code AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.legal_entity_county_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " transaction_fabs.legal_entity_county_name AS recipient_location_county_name,", - " transaction_fabs.legal_entity_congressional AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.legal_entity_congressional, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " transaction_fabs.legal_entity_zip5 AS recipient_location_zip5,", " TRIM(TRAILING FROM transaction_fabs.legal_entity_city_name) AS recipient_location_city_name,", "", " transaction_fabs.place_of_perform_country_n AS pop_country_name,", " COALESCE(transaction_fabs.place_of_perform_country_c,'USA') AS pop_country_code,", " transaction_fabs.place_of_perfor_state_code AS pop_state_code,", - " transaction_fabs.place_of_perform_county_co AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.place_of_perform_county_co, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " transaction_fabs.place_of_perform_county_na AS pop_county_name,", " transaction_fabs.place_of_performance_code AS pop_city_code,", " transaction_fabs.place_of_performance_zip5 AS pop_zip5,", - " transaction_fabs.place_of_performance_congr AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.place_of_performance_congr, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM transaction_fabs.place_of_performance_city) AS pop_city_name,", "", " transaction_fabs.cfda_number,", diff --git a/usaspending_api/database_scripts/matview_generator/mv_idv_award_search.json b/usaspending_api/database_scripts/matview_generator/mv_idv_award_search.json index 
ffa28f570e..1c30f2929e 100644 --- a/usaspending_api/database_scripts/matview_generator/mv_idv_award_search.json +++ b/usaspending_api/database_scripts/matview_generator/mv_idv_award_search.json @@ -59,20 +59,20 @@ " CASE WHEN COALESCE(transaction_fpds.legal_entity_country_code) = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fpds.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " transaction_fpds.legal_entity_country_name AS recipient_location_country_name,", " transaction_fpds.legal_entity_state_code AS recipient_location_state_code,", - " transaction_fpds.legal_entity_county_code AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.legal_entity_county_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " transaction_fpds.legal_entity_county_name AS recipient_location_county_name,", - " transaction_fpds.legal_entity_congressional AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.legal_entity_congressional, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " transaction_fpds.legal_entity_zip5 AS recipient_location_zip5,", " TRIM(TRAILING FROM transaction_fpds.legal_entity_city_name) AS recipient_location_city_name,", "", " transaction_fpds.place_of_perform_country_n AS pop_country_name,", " COALESCE(transaction_fpds.place_of_perform_country_c, 'USA') AS pop_country_code,", " transaction_fpds.place_of_performance_state AS pop_state_code,", - " transaction_fpds.place_of_perform_county_co AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.place_of_perform_county_co, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " transaction_fpds.place_of_perform_county_na AS pop_county_name,", " NULL::text AS pop_city_code,", " transaction_fpds.place_of_performance_zip5 AS pop_zip5,", - " 
transaction_fpds.place_of_performance_congr AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.place_of_performance_congr, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM transaction_fpds.place_of_perform_city_name) AS pop_city_name,", "", " NULL::text AS cfda_number,", diff --git a/usaspending_api/database_scripts/matview_generator/mv_loan_award_search.json b/usaspending_api/database_scripts/matview_generator/mv_loan_award_search.json index f5562206c7..4f7ead4ae9 100644 --- a/usaspending_api/database_scripts/matview_generator/mv_loan_award_search.json +++ b/usaspending_api/database_scripts/matview_generator/mv_loan_award_search.json @@ -56,20 +56,20 @@ " CASE WHEN transaction_fabs.legal_entity_country_code = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fabs.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " transaction_fabs.legal_entity_country_name AS recipient_location_country_name,", " transaction_fabs.legal_entity_state_code AS recipient_location_state_code,", - " transaction_fabs.legal_entity_county_code AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.legal_entity_county_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " transaction_fabs.legal_entity_county_name AS recipient_location_county_name,", - " transaction_fabs.legal_entity_congressional AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.legal_entity_congressional, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " transaction_fabs.legal_entity_zip5 AS recipient_location_zip5,", " TRIM(TRAILING FROM transaction_fabs.legal_entity_city_name) AS recipient_location_city_name,", "", " transaction_fabs.place_of_perform_country_n AS pop_country_name,", " 
COALESCE(transaction_fabs.place_of_perform_country_c,'USA') AS pop_country_code,", " transaction_fabs.place_of_perfor_state_code AS pop_state_code,", - " transaction_fabs.place_of_perform_county_co AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.place_of_perform_county_co, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " transaction_fabs.place_of_perform_county_na AS pop_county_name,", " transaction_fabs.place_of_performance_code AS pop_city_code,", " transaction_fabs.place_of_performance_zip5 AS pop_zip5,", - " transaction_fabs.place_of_performance_congr AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.place_of_performance_congr, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM transaction_fabs.place_of_performance_city) AS pop_city_name,", "", " transaction_fabs.cfda_number,", diff --git a/usaspending_api/database_scripts/matview_generator/mv_other_award_search.json b/usaspending_api/database_scripts/matview_generator/mv_other_award_search.json index 5ca571717e..07620faeab 100644 --- a/usaspending_api/database_scripts/matview_generator/mv_other_award_search.json +++ b/usaspending_api/database_scripts/matview_generator/mv_other_award_search.json @@ -56,20 +56,20 @@ " CASE WHEN transaction_fabs.legal_entity_country_code = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fabs.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " transaction_fabs.legal_entity_country_name AS recipient_location_country_name,", " transaction_fabs.legal_entity_state_code AS recipient_location_state_code,", - " transaction_fabs.legal_entity_county_code AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.legal_entity_county_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " 
transaction_fabs.legal_entity_county_name AS recipient_location_county_name,", - " transaction_fabs.legal_entity_congressional AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.legal_entity_congressional, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " transaction_fabs.legal_entity_zip5 AS recipient_location_zip5,", " TRIM(TRAILING FROM transaction_fabs.legal_entity_city_name) AS recipient_location_city_name,", "", " transaction_fabs.place_of_perform_country_n AS pop_country_name,", " COALESCE(transaction_fabs.place_of_perform_country_c,'USA') AS pop_country_code,", " transaction_fabs.place_of_perfor_state_code AS pop_state_code,", - " transaction_fabs.place_of_perform_county_co AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.place_of_perform_county_co, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " transaction_fabs.place_of_perform_county_na AS pop_county_name,", " transaction_fabs.place_of_performance_code AS pop_city_code,", " transaction_fabs.place_of_performance_zip5 AS pop_zip5,", - " transaction_fabs.place_of_performance_congr AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fabs.place_of_performance_congr, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM transaction_fabs.place_of_performance_city) AS pop_city_name,", "", " transaction_fabs.cfda_number,", diff --git a/usaspending_api/database_scripts/matview_generator/mv_pre2008_award_search.json b/usaspending_api/database_scripts/matview_generator/mv_pre2008_award_search.json index f6e3ac8397..ae68b595f0 100644 --- a/usaspending_api/database_scripts/matview_generator/mv_pre2008_award_search.json +++ b/usaspending_api/database_scripts/matview_generator/mv_pre2008_award_search.json @@ -59,20 +59,20 @@ " CASE WHEN 
COALESCE(transaction_fpds.legal_entity_country_code) = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fpds.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " transaction_fpds.legal_entity_country_name AS recipient_location_country_name,", " transaction_fpds.legal_entity_state_code AS recipient_location_state_code,", - " transaction_fpds.legal_entity_county_code AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.legal_entity_county_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " transaction_fpds.legal_entity_county_name AS recipient_location_county_name,", - " transaction_fpds.legal_entity_congressional AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.legal_entity_congressional, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " transaction_fpds.legal_entity_zip5 AS recipient_location_zip5,", " TRIM(TRAILING FROM transaction_fpds.legal_entity_city_name) AS recipient_location_city_name,", "", " transaction_fpds.place_of_perform_country_n AS pop_country_name,", " COALESCE(transaction_fpds.place_of_perform_country_c, 'USA') AS pop_country_code,", " transaction_fpds.place_of_performance_state AS pop_state_code,", - " transaction_fpds.place_of_perform_county_co AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.place_of_perform_county_co, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " transaction_fpds.place_of_perform_county_na AS pop_county_name,", " NULL::text AS pop_city_code,", " transaction_fpds.place_of_performance_zip5 AS pop_zip5,", - " transaction_fpds.place_of_performance_congr AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(transaction_fpds.place_of_performance_congr, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS 
pop_congressional_code,", " TRIM(TRAILING FROM transaction_fpds.place_of_perform_city_name) AS pop_city_name,", "", " NULL::text AS cfda_number,", diff --git a/usaspending_api/database_scripts/matview_generator/subaward_view.json b/usaspending_api/database_scripts/matview_generator/subaward_view.json index fafb4e5c73..5205fbeebd 100644 --- a/usaspending_api/database_scripts/matview_generator/subaward_view.json +++ b/usaspending_api/database_scripts/matview_generator/subaward_view.json @@ -71,23 +71,23 @@ " recipient_location_city_name,", " recipient_location_state_code,", " recipient_location_state_name,", - " recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(recipient_location_county_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " recipient_location_county_name,", " LEFT(COALESCE(recipient_location_zip4, ''), 5) AS recipient_location_zip5,", " recipient_location_street_address,", - " recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(recipient_location_congressional_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", "", " pop_country_name,", " COALESCE(pop_country_code,'USA') as pop_country_code,", " pop_state_code,", " pop_state_name,", - " pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(pop_county_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " pop_county_name,", " pop_city_code,", " pop_city_name,", " LEFT(COALESCE(pop_zip4, ''), 5) AS pop_zip5,", " pop_street_address,", - " pop_congressional_code", + " LPAD(CAST(CAST((REGEXP_MATCH(pop_congressional_code, '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code", "FROM", " subaward AS sub", "LEFT OUTER JOIN psc ON product_or_service_code = psc.code", diff --git a/usaspending_api/database_scripts/matview_generator/summary_transaction_geo_view.json 
b/usaspending_api/database_scripts/matview_generator/summary_transaction_geo_view.json index bb59b502e7..56cbd7090b 100644 --- a/usaspending_api/database_scripts/matview_generator/summary_transaction_geo_view.json +++ b/usaspending_api/database_scripts/matview_generator/summary_transaction_geo_view.json @@ -13,19 +13,19 @@ " CASE WHEN COALESCE(transaction_fpds.legal_entity_country_code, transaction_fabs.legal_entity_country_code) = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fpds.legal_entity_country_code, transaction_fabs.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " COALESCE(transaction_fpds.legal_entity_country_name, transaction_fabs.legal_entity_country_name) AS recipient_location_country_name,", " COALESCE(transaction_fpds.legal_entity_state_code, transaction_fabs.legal_entity_state_code) AS recipient_location_state_code,", - " COALESCE(transaction_fpds.legal_entity_county_code, transaction_fabs.legal_entity_county_code) AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.legal_entity_county_code, transaction_fabs.legal_entity_county_code), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " COALESCE(transaction_fpds.legal_entity_county_name, transaction_fabs.legal_entity_county_name) AS recipient_location_county_name,", - " COALESCE(transaction_fpds.legal_entity_congressional, transaction_fabs.legal_entity_congressional) AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.legal_entity_congressional, transaction_fabs.legal_entity_congressional), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " COALESCE(transaction_fpds.legal_entity_zip5, transaction_fabs.legal_entity_zip5) AS recipient_location_zip5,", " TRIM(TRAILING FROM COALESCE(transaction_fpds.legal_entity_city_name, transaction_fabs.legal_entity_city_name)) AS 
recipient_location_city_name,", "", " COALESCE(transaction_fpds.place_of_perform_country_n, transaction_fabs.place_of_perform_country_n) AS pop_country_name,", " COALESCE(transaction_fpds.place_of_perform_country_c, transaction_fabs.place_of_perform_country_c,'USA') AS pop_country_code,", " COALESCE(transaction_fpds.place_of_performance_state, transaction_fabs.place_of_perfor_state_code) AS pop_state_code,", - " COALESCE(transaction_fpds.place_of_perform_county_co, transaction_fabs.place_of_perform_county_co) AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.place_of_perform_county_co, transaction_fabs.place_of_perform_county_co), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " COALESCE(transaction_fpds.place_of_perform_county_na, transaction_fabs.place_of_perform_county_na) AS pop_county_name,", " COALESCE(transaction_fpds.place_of_performance_zip5, transaction_fabs.place_of_performance_zip5) AS pop_zip5,", - " COALESCE(transaction_fpds.place_of_performance_congr, transaction_fabs.place_of_performance_congr) AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.place_of_performance_congr, transaction_fabs.place_of_performance_congr), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM COALESCE(transaction_fpds.place_of_perform_city_name, transaction_fabs.place_of_performance_city)) AS pop_city_name,", "", " transaction_normalized.awarding_agency_id,", diff --git a/usaspending_api/database_scripts/matview_generator/summary_transaction_month_view.json b/usaspending_api/database_scripts/matview_generator/summary_transaction_month_view.json index a40409b661..9d5790b148 100644 --- a/usaspending_api/database_scripts/matview_generator/summary_transaction_month_view.json +++ b/usaspending_api/database_scripts/matview_generator/summary_transaction_month_view.json @@ -13,19 +13,19 @@ " CASE WHEN 
COALESCE(transaction_fpds.legal_entity_country_code, transaction_fabs.legal_entity_country_code) = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fpds.legal_entity_country_code, transaction_fabs.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " COALESCE(transaction_fpds.legal_entity_country_name, transaction_fabs.legal_entity_country_name) AS recipient_location_country_name,", " COALESCE(transaction_fpds.legal_entity_state_code, transaction_fabs.legal_entity_state_code) AS recipient_location_state_code,", - " COALESCE(transaction_fpds.legal_entity_county_code, transaction_fabs.legal_entity_county_code) AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.legal_entity_county_code, transaction_fabs.legal_entity_county_code), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " COALESCE(transaction_fpds.legal_entity_county_name, transaction_fabs.legal_entity_county_name) AS recipient_location_county_name,", - " COALESCE(transaction_fpds.legal_entity_congressional, transaction_fabs.legal_entity_congressional) AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.legal_entity_congressional, transaction_fabs.legal_entity_congressional), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " COALESCE(transaction_fpds.legal_entity_zip5, transaction_fabs.legal_entity_zip5) AS recipient_location_zip5,", " TRIM(TRAILING FROM COALESCE(transaction_fpds.legal_entity_city_name, transaction_fabs.legal_entity_city_name)) AS recipient_location_city_name,", "", " COALESCE(transaction_fpds.place_of_perform_country_n, transaction_fabs.place_of_perform_country_n) AS pop_country_name,", " COALESCE(transaction_fpds.place_of_perform_country_c, transaction_fabs.place_of_perform_country_c,'USA') AS pop_country_code,", " 
COALESCE(transaction_fpds.place_of_performance_state, transaction_fabs.place_of_perfor_state_code) AS pop_state_code,", - " COALESCE(transaction_fpds.place_of_perform_county_co, transaction_fabs.place_of_perform_county_co) AS pop_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.place_of_perform_county_co, transaction_fabs.place_of_perform_county_co), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " COALESCE(transaction_fpds.place_of_perform_county_na, transaction_fabs.place_of_perform_county_na) AS pop_county_name,", " COALESCE(transaction_fpds.place_of_performance_zip5, transaction_fabs.place_of_performance_zip5) AS pop_zip5,", - " COALESCE(transaction_fpds.place_of_performance_congr, transaction_fabs.place_of_performance_congr) AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.place_of_performance_congr, transaction_fabs.place_of_performance_congr), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM COALESCE(transaction_fpds.place_of_perform_city_name, transaction_fabs.place_of_performance_city)) AS pop_city_name,", "", " transaction_normalized.awarding_agency_id,", diff --git a/usaspending_api/database_scripts/matview_generator/summary_transaction_view.json b/usaspending_api/database_scripts/matview_generator/summary_transaction_view.json index bee7996f7e..18c9958f9a 100644 --- a/usaspending_api/database_scripts/matview_generator/summary_transaction_view.json +++ b/usaspending_api/database_scripts/matview_generator/summary_transaction_view.json @@ -13,19 +13,19 @@ " CASE WHEN COALESCE(transaction_fpds.legal_entity_country_code, transaction_fabs.legal_entity_country_code) = 'UNITED STATES' THEN 'USA' ELSE COALESCE(transaction_fpds.legal_entity_country_code, transaction_fabs.legal_entity_country_code,'USA') END AS recipient_location_country_code,", " 
COALESCE(transaction_fpds.legal_entity_country_name, transaction_fabs.legal_entity_country_name) AS recipient_location_country_name,", " COALESCE(transaction_fpds.legal_entity_state_code, transaction_fabs.legal_entity_state_code) AS recipient_location_state_code,", - " COALESCE(transaction_fpds.legal_entity_county_code, transaction_fabs.legal_entity_county_code) AS recipient_location_county_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.legal_entity_county_code, transaction_fabs.legal_entity_county_code), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS recipient_location_county_code,", " COALESCE(transaction_fpds.legal_entity_county_name, transaction_fabs.legal_entity_county_name) AS recipient_location_county_name,", - " COALESCE(transaction_fpds.legal_entity_congressional, transaction_fabs.legal_entity_congressional) AS recipient_location_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.legal_entity_congressional, transaction_fabs.legal_entity_congressional), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS recipient_location_congressional_code,", " COALESCE(transaction_fpds.legal_entity_zip5, transaction_fabs.legal_entity_zip5) AS recipient_location_zip5,", " TRIM(TRAILING FROM COALESCE(transaction_fpds.legal_entity_city_name, transaction_fabs.legal_entity_city_name)) AS recipient_location_city_name,", "", " COALESCE(transaction_fpds.place_of_perform_country_n, transaction_fabs.place_of_perform_country_n) AS pop_country_name,", " COALESCE(transaction_fpds.place_of_perform_country_c, transaction_fabs.place_of_perform_country_c,'USA') AS pop_country_code,", " COALESCE(transaction_fpds.place_of_performance_state, transaction_fabs.place_of_perfor_state_code) AS pop_state_code,", - " COALESCE(transaction_fpds.place_of_perform_county_co, transaction_fabs.place_of_perform_county_co) AS pop_county_code,", + " 
LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.place_of_perform_county_co, transaction_fabs.place_of_perform_county_co), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 3, '0') AS pop_county_code,", " COALESCE(transaction_fpds.place_of_perform_county_na, transaction_fabs.place_of_perform_county_na) AS pop_county_name,", " COALESCE(transaction_fpds.place_of_performance_zip5, transaction_fabs.place_of_performance_zip5) AS pop_zip5,", - " COALESCE(transaction_fpds.place_of_performance_congr, transaction_fabs.place_of_performance_congr) AS pop_congressional_code,", + " LPAD(CAST(CAST((REGEXP_MATCH(COALESCE(transaction_fpds.place_of_performance_congr, transaction_fabs.place_of_performance_congr), '^[A-Z]*(\\d+)(?:\\.\\d+)?$'))[1] AS smallint) AS text), 2, '0') AS pop_congressional_code,", " TRIM(TRAILING FROM COALESCE(transaction_fpds.place_of_perform_city_name, transaction_fabs.place_of_performance_city)) AS pop_city_name,", "", " transaction_normalized.awarding_agency_id,", diff --git a/usaspending_api/etl/es_etl_helpers.py b/usaspending_api/etl/es_etl_helpers.py index 90b7109314..29ab41d465 100644 --- a/usaspending_api/etl/es_etl_helpers.py +++ b/usaspending_api/etl/es_etl_helpers.py @@ -82,7 +82,6 @@ "type_of_contract_pricing", "type_set_aside", "extent_competed", - "pulled_from", "type", "pop_country_code", "pop_country_name", diff --git a/usaspending_api/etl/es_transaction_template.json b/usaspending_api/etl/es_transaction_template.json index 50b4435e6c..7e0c12146b 100644 --- a/usaspending_api/etl/es_transaction_template.json +++ b/usaspending_api/etl/es_transaction_template.json @@ -79,13 +79,23 @@ "analyzer": "stemmer_analyzer" }, "product_or_service_code": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "product_or_service_description": { "type": "text" }, "naics_code": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "naics_description": { "type": 
"text" @@ -277,16 +287,28 @@ "index": false }, "type_of_contract_pricing": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "type_set_aside": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "extent_competed": { - "type": "text" - }, - "pulled_from": { - "type": "text" + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } }, "type": { "type": "keyword", diff --git a/usaspending_api/etl/management/commands/es_rapidloader.py b/usaspending_api/etl/management/commands/es_rapidloader.py index 7e67926025..3922dd7761 100644 --- a/usaspending_api/etl/management/commands/es_rapidloader.py +++ b/usaspending_api/etl/management/commands/es_rapidloader.py @@ -95,6 +95,12 @@ def add_arguments(self, parser): help="Processes transactions updated on or after the UTC date/time provided. yyyy-mm-dd hh:mm:ss is always " "a safe format. Wrap in quotes if date/time contains spaces.", ) + parser.add_argument( + "--skip-delete-index", + action="store_true", + help="When creating a new index skip the step that deletes the old indexes and swaps the aliases. 
" + "Only used when --create-new-index is provided.", + ) def handle(self, *args, **options): self.elasticsearch_client = instantiate_elasticsearch_client() @@ -175,9 +181,12 @@ def run_load_steps(self) -> None: def complete_process(self) -> None: if self.config["create_new_index"]: - printf({"msg": "Closing old indices and adding aliases"}) set_final_index_config(self.elasticsearch_client, self.config["index_name"]) - swap_aliases(self.elasticsearch_client, self.config["index_name"]) + if self.config["skip_delete_index"]: + printf({"msg": "Skipping deletion of old indices"}) + else: + printf({"msg": "Closing old indices and adding aliases"}) + swap_aliases(self.elasticsearch_client, self.config["index_name"]) if self.config["snapshot"]: printf({"msg": "Taking snapshot"}) @@ -191,7 +200,15 @@ def complete_process(self) -> None: def process_cli_parameters(options: dict, es_client) -> None: default_datetime = datetime.strptime("{}+0000".format(settings.API_SEARCH_MIN_DATE), "%Y-%m-%d%z") - simple_args = ("process_deletes", "create_new_index", "snapshot", "index_name", "directory", "skip_counts") + simple_args = ( + "process_deletes", + "create_new_index", + "snapshot", + "index_name", + "directory", + "skip_counts", + "skip_delete_index", + ) config = set_config(simple_args, options) config["fiscal_years"] = fiscal_years_for_processing(options) From f0ba9f4074e1586a1e4b72c7a0dda66e05d3b9a8 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Thu, 12 Dec 2019 13:28:18 -0700 Subject: [PATCH 23/33] [DEV-1566] switched dashes to underscores for deltas --- .../management/commands/populate_monthly_delta_files.py | 2 +- usaspending_api/download/v2/list_monthly_downloads.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/usaspending_api/download/management/commands/populate_monthly_delta_files.py b/usaspending_api/download/management/commands/populate_monthly_delta_files.py index ccba5afe84..44d475fe39 100644 
--- a/usaspending_api/download/management/commands/populate_monthly_delta_files.py +++ b/usaspending_api/download/management/commands/populate_monthly_delta_files.py @@ -155,7 +155,7 @@ def create_local_file(self, award_type, source, agency_code, generate_since): if not os.path.exists(working_dir): os.mkdir(working_dir) agency_str = "All" if agency_code == "all" else agency_code - source_name = f"FY(All)-{agency_str}_{award_type}_Delta_{datetime.strftime(date.today(), '%Y%m%d')}" + source_name = f"FY(All)_{agency_str}_{award_type}_Delta_{datetime.strftime(date.today(), '%Y%m%d')}" source_path = os.path.join(working_dir, "{}.csv".format(source_name)) # Create a unique temporary file with the raw query diff --git a/usaspending_api/download/v2/list_monthly_downloads.py b/usaspending_api/download/v2/list_monthly_downloads.py index 38e80b4bba..138700869f 100644 --- a/usaspending_api/download/v2/list_monthly_downloads.py +++ b/usaspending_api/download/v2/list_monthly_downloads.py @@ -50,8 +50,8 @@ def post(self, request): # Populate regex monthly_download_prefixes = f"FY{fiscal_year}_{agency['toptier_code']}_{download_type}" monthly_download_regex = r"{}_Full_.*\.zip".format(monthly_download_prefixes) - delta_download_prefixes = f"FY(All)-{agency['toptier_code']}_{download_type}" - delta_download_regex = r"FY\(All\)-{}_{}_Delta_.*\.zip".format(agency["toptier_code"], download_type) + delta_download_prefixes = f"FY(All)_{agency['toptier_code']}_{download_type}" + delta_download_regex = r"FY\(All\)_{}_{}_Delta_.*\.zip".format(agency["toptier_code"], download_type) # Retrieve and filter the files we need bucket = boto3.resource("s3", region_name=self.s3_handler.region).Bucket(self.s3_handler.bucketRoute) From 330463e5978cf947d57e93891b983641c09908e8 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Thu, 12 Dec 2019 13:57:21 -0700 Subject: [PATCH 24/33] [DEV-1566] Added check to make Keyword Search and Advanced Search downloads 
match --- usaspending_api/download/download_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/usaspending_api/download/download_utils.py b/usaspending_api/download/download_utils.py index 5b91c53810..2abc83f4ef 100644 --- a/usaspending_api/download/download_utils.py +++ b/usaspending_api/download/download_utils.py @@ -29,11 +29,14 @@ def create_unique_filename(json_request, origination=None): agency=agency, data_quarters=data_quarters, level=level, timestamp=timestamp, ) else: # "award" downloads - download_types = json_request["download_types"] agency = "" - award_type_name = create_award_level_string(download_types) - if origination == "bulk_download": + + # Since Keyword Search uses the "Bulk Download" Endpoint for unknown reasons, + # check for the specific filter to mimic the Advanced Search download filename + if origination == "bulk_download" and "elasticsearch_keyword" not in json_request["filters"]: agency = obtain_filename_prefix_from_agency_id(request_agency) + "_" + + award_type_name = create_award_level_string(json_request["download_types"]) download_name = f"{agency}{award_type_name}_{timestamp}.zip" return download_name From 24a7e55d5c3a3c0ae3586cfac0d33a845838d5f8 Mon Sep 17 00:00:00 2001 From: sethstoudenmier Date: Thu, 12 Dec 2019 14:36:07 -0800 Subject: [PATCH 25/33] update treasury accounts on index --- .../etl/transaction_delta_view.sql | 29 +++++++++++++++++-- usaspending_api/etl/es_etl_helpers.py | 15 ++++++++-- .../etl/es_transaction_template.json | 25 +++++++++------- 3 files changed, 52 insertions(+), 17 deletions(-) diff --git a/usaspending_api/database_scripts/etl/transaction_delta_view.sql b/usaspending_api/database_scripts/etl/transaction_delta_view.sql index 8e7e9ea877..da3c9ba8c9 100644 --- a/usaspending_api/database_scripts/etl/transaction_delta_view.sql +++ b/usaspending_api/database_scripts/etl/transaction_delta_view.sql @@ -97,8 +97,8 @@ SELECT UTM.recipient_location_congressional_code, 
UTM.recipient_location_city_name, - UTM.treasury_account_identifiers, - ACCT.federal_accounts, + TREASURY_ACCT.treasury_accounts, + FEDERAL_ACCT.federal_accounts, UTM.business_categories FROM universal_transaction_matview UTM @@ -116,6 +116,29 @@ LEFT JOIN subtier_agency SAA ON (AA.subtier_agency_id = SAA.subtier_agency_id) LEFT JOIN toptier_agency TFA ON (FA.toptier_agency_id = TFA.toptier_agency_id) LEFT JOIN subtier_agency SFA ON (FA.subtier_agency_id = SFA.subtier_agency_id) LEFT JOIN references_cfda CFDA ON (FABS.cfda_number = CFDA.program_number) +LEFT JOIN ( + SELECT + faba.award_id, + JSONB_AGG( + DISTINCT JSONB_BUILD_OBJECT( + 'aid', taa.agency_id, + 'ata', taa.allocation_transfer_agency_id, + 'main', taa.main_account_code, + 'sub', taa.sub_account_code, + 'bpoa', taa.beginning_period_of_availability, + 'epoa', taa.ending_period_of_availability, + 'a', taa.availability_type_code + ) + ) treasury_accounts + FROM + federal_account fa + INNER JOIN treasury_appropriation_account taa ON (fa.id = taa.federal_account_id) + INNER JOIN financial_accounts_by_awards faba ON (taa.treasury_account_identifier = faba.treasury_account_id) + WHERE + faba.award_id IS NOT NULL + GROUP BY + faba.award_id +) TREASURY_ACCT ON (TREASURY_ACCT.award_id = UTM.award_id) LEFT JOIN ( SELECT faba.award_id, @@ -134,4 +157,4 @@ LEFT JOIN ( faba.award_id IS NOT NULL GROUP BY faba.award_id -) ACCT ON (ACCT.award_id = TM.award_id); +) FEDERAL_ACCT ON (FEDERAL_ACCT.award_id = UTM.award_id); diff --git a/usaspending_api/etl/es_etl_helpers.py b/usaspending_api/etl/es_etl_helpers.py index 29ab41d465..287c5e9576 100644 --- a/usaspending_api/etl/es_etl_helpers.py +++ b/usaspending_api/etl/es_etl_helpers.py @@ -99,7 +99,7 @@ "recipient_location_zip5", "recipient_location_congressional_code", "recipient_location_city_name", - "treasury_account_identifiers", + "treasury_accounts", "federal_accounts", "business_categories", ] @@ -174,6 +174,15 @@ def 
convert_postgres_array_as_string_to_list(array_as_string: str) -> list: return array_as_string[1:-1].split(",") if len(array_as_string) > 2 else None +def convert_postgres_json_array_as_string_to_json(json_array_as_string: str) -> dict: + """ + Postgres JSON arrays (jsonb) are stored in CSVs as strings. Elasticsearch is able to handle + JSON arrays with nested types, but needs to be passed a parsed JSON list instead of a string. + In the case of an empty string, return None. + """ + return json.loads(json_array_as_string) if json_array_as_string else None + + def process_guarddog(process_list): """ pass in a list of multiprocess Process objects. @@ -303,8 +312,8 @@ def csv_chunk_gen(filename, chunksize, job_id): # Need a specific converter to handle converting strings to correct data types (e.g. string -> array) converters = { "business_categories": convert_postgres_array_as_string_to_list, - "treasury_account_identifiers": convert_postgres_array_as_string_to_list, - "federal_accounts": lambda string_to_convert: json.loads(string_to_convert) if string_to_convert else None, + "treasury_accounts": convert_postgres_json_array_as_string_to_json, + "federal_accounts": convert_postgres_json_array_as_string_to_json, } # Panda's data type guessing causes issues for Elasticsearch. 
Explicitly cast using dictionary dtype = {k: str for k in VIEW_COLUMNS if k not in converters} diff --git a/usaspending_api/etl/es_transaction_template.json b/usaspending_api/etl/es_transaction_template.json index 7e0c12146b..fff59855a9 100644 --- a/usaspending_api/etl/es_transaction_template.json +++ b/usaspending_api/etl/es_transaction_template.json @@ -393,21 +393,24 @@ } } }, - "treasury_account_identifiers": { - "type": "keyword" + "treasury_accounts": { + "type": "nested", + "properties": { + "aid": {"type": "keyword"}, + "ata": {"type": "keyword"}, + "main": {"type": "keyword"}, + "sub": {"type": "keyword"}, + "bpoa": {"type": "keyword"}, + "epoa": {"type": "keyword"}, + "a": {"type": "keyword"} + } }, "federal_accounts": { "type": "nested", "properties": { - "id": { - "type": "integer" - }, - "account_title": { - "type": "keyword" - }, - "federal_account_code": { - "type": "keyword" - } + "id": {"type": "integer"}, + "account_title": {"type": "keyword"}, + "federal_account_code": {"type": "keyword"} } }, "business_categories": { From 284042de7d12bac34e029db65add90f65f9b8a84 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Fri, 13 Dec 2019 12:03:55 -0700 Subject: [PATCH 26/33] [DEV-1566] cleanup per review --- usaspending_api/download/download_utils.py | 6 ++---- .../download/filestreaming/download_generation.py | 5 ++--- usaspending_api/download/lookups.py | 6 +++--- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/usaspending_api/download/download_utils.py b/usaspending_api/download/download_utils.py index 2abc83f4ef..56e084ae58 100644 --- a/usaspending_api/download/download_utils.py +++ b/usaspending_api/download/download_utils.py @@ -11,11 +11,9 @@ def create_unique_filename(json_request, origination=None): request_agency = json_request.get("agency", "all") if json_request.get("is_for_idv"): - slug_text = slugify_text_for_file_names(json_request.get("piid"), "UNKNOWN", 50) - download_name = 
f"IDV_{slug_text}_{timestamp}.zip" + download_name = f"IDV_{slugify_text_for_file_names(json_request.get('piid'), 'UNKNOWN', 50)}_{timestamp}.zip" elif json_request.get("is_for_contract"): - slug_text = slugify_text_for_file_names(json_request.get("piid"), "UNKNOWN", 50) - download_name = f"CONT_{slug_text}_{timestamp}.zip" + download_name = f"CONT_{slugify_text_for_file_names(json_request.get('piid'), 'UNKNOWN', 50)}_{timestamp}.zip" elif json_request.get("is_for_assistance"): slug_text = slugify_text_for_file_names(json_request.get("assistance_id"), "UNKNOWN", 50) download_name = f"ASST_{slug_text}_{timestamp}.zip" diff --git a/usaspending_api/download/filestreaming/download_generation.py b/usaspending_api/download/filestreaming/download_generation.py index 520ad74c30..b6170227a6 100644 --- a/usaspending_api/download/filestreaming/download_generation.py +++ b/usaspending_api/download/filestreaming/download_generation.py @@ -196,7 +196,6 @@ def parse_source(source, columns, download_job, working_dir, piid, assistance_id source_name = file_name_pattern.format(assistance_id=slugify_text_for_file_names(assistance_id, "UNKNOWN", 50)) else: file_name_pattern = VALUE_MAPPINGS[source.source_type]["download_name"] - agency_is_optional = "_" if source.agency_code == "all": agency = "All" @@ -207,11 +206,11 @@ def parse_source(source, columns, download_job, working_dir, piid, assistance_id filters = request["filters"] if request.get("limit"): agency = "" - agency_is_optional = "" + else: + agency = f"{agency}_" timestamp = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d_H%HM%MS%S") source_name = file_name_pattern.format( agency=agency, - agency_is_optional=agency_is_optional, data_quarters=construct_data_date_range(filters), level=d_map[source.file_type], timestamp=timestamp, diff --git a/usaspending_api/download/lookups.py b/usaspending_api/download/lookups.py index 0dda4745d8..6fb6b10c06 100644 --- a/usaspending_api/download/lookups.py +++ 
b/usaspending_api/download/lookups.py @@ -62,7 +62,7 @@ "table": AwardSearchView, "table_name": "award", "type_name": "PrimeAwardSummaries", - "download_name": "{agency}{agency_is_optional}{type}_PrimeAwardSummaries_{timestamp}", + "download_name": "{agency}{type}_PrimeAwardSummaries_{timestamp}", "contract_data": "award__latest_transaction__contract_data", "assistance_data": "award__latest_transaction__assistance_data", "filter_function": universal_award_matview_filter, @@ -74,7 +74,7 @@ "table": UniversalTransactionView, "table_name": "transaction", "type_name": "PrimeTransactions", - "download_name": "{agency}{agency_is_optional}{type}_PrimeTransactions_{timestamp}", + "download_name": "{agency}{type}_PrimeTransactions_{timestamp}", "contract_data": "transaction__contract_data", "assistance_data": "transaction__assistance_data", "filter_function": universal_transaction_matview_filter, @@ -86,7 +86,7 @@ "table": SubawardView, "table_name": "subaward", "type_name": "Subawards", - "download_name": "{agency}{agency_is_optional}{type}_Subawards_{timestamp}", + "download_name": "{agency}{type}_Subawards_{timestamp}", "contract_data": "award__latest_transaction__contract_data", "assistance_data": "award__latest_transaction__assistance_data", "filter_function": subaward_download, From 47d3b8692f341a1e30254dfe06acb4b3f14b02c2 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Wed, 18 Dec 2019 08:09:42 -0700 Subject: [PATCH 27/33] [DEV-1566] extra underscore for accounts downloads --- usaspending_api/download/filestreaming/download_generation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/usaspending_api/download/filestreaming/download_generation.py b/usaspending_api/download/filestreaming/download_generation.py index b6170227a6..ddb715b810 100644 --- a/usaspending_api/download/filestreaming/download_generation.py +++ b/usaspending_api/download/filestreaming/download_generation.py @@ -206,7 +206,7 @@ 
def parse_source(source, columns, download_job, working_dir, piid, assistance_id filters = request["filters"] if request.get("limit"): agency = "" - else: + elif source.file_type not in ("treasury_account", "federal_account"): agency = f"{agency}_" timestamp = datetime.strftime(datetime.now(timezone.utc), "%Y-%m-%d_H%HM%MS%S") source_name = file_name_pattern.format( @@ -221,7 +221,7 @@ def parse_source(source, columns, download_job, working_dir, piid, assistance_id source.file_name = f"{source_name}.{extension}" source_path = os.path.join(working_dir, source.file_name) - write_to_log(message=f"Preparing to download data as {source_name}", download_job=download_job) + write_to_log(message=f"Preparing to download data as {source.file_name}", download_job=download_job) # Generate the query file; values, limits, dates fixed temp_file, temp_file_path = generate_temp_query_file(source_query, limit, source, download_job, columns, extension) From 28a75112bd87bb5ed6019c9ba1bd5ca797ed53ae Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Thu, 19 Dec 2019 10:28:22 -0700 Subject: [PATCH 28/33] [DEV-4032] removing the .keyword from the mapping --- .../awards/v2/lookups/elasticsearch_lookups.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py b/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py index 77b5aa957c..86280b6aed 100644 --- a/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py +++ b/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py @@ -6,12 +6,12 @@ TRANSACTIONS_LOOKUP = { - "Recipient Name": "recipient_name.keyword", + "Recipient Name": "recipient_name", "Action Date": "action_date", "Transaction Amount": "transaction_amount", - "Award Type": "type_description.keyword", - "Awarding Agency": "awarding_toptier_agency_name.keyword", - "Awarding Sub Agency": "awarding_subtier_agency_name.keyword", + "Award Type": 
"type_description", + "Awarding Agency": "awarding_toptier_agency_name", + "Awarding Sub Agency": "awarding_subtier_agency_name", "Funding Agency": "funding_toptier_agency_name", "Funding Sub Agency": "funding_subtier_agency_name", "Issued Date": "period_of_performance_start_date", From 888e3922ada2c9179a635e131be0ee063f818b97 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Thu, 19 Dec 2019 11:01:04 -0700 Subject: [PATCH 29/33] [DEV-4032] added tests to cover bug --- .../tests/test_spending_by_transaction.py | 106 ++++++++++++++++-- 1 file changed, 99 insertions(+), 7 deletions(-) diff --git a/usaspending_api/search/tests/test_spending_by_transaction.py b/usaspending_api/search/tests/test_spending_by_transaction.py index 4b6f039047..5085706b23 100644 --- a/usaspending_api/search/tests/test_spending_by_transaction.py +++ b/usaspending_api/search/tests/test_spending_by_transaction.py @@ -1,18 +1,36 @@ import json import pytest +from model_mommy import mommy from time import perf_counter from rest_framework import status -@pytest.mark.skip +ENDPOINT = "/api/v2/search/spending_by_transaction/" + + +@pytest.fixture +def transaction_data(db): + mommy.make( + "awards.TransactionNormalized", + id=1, + award_id=1, + action_date="2010-10-01", + is_fpds=True, + type="A", + description="test", + ) + mommy.make("awards.TransactionFPDS", transaction_id=1, legal_entity_zip5="abcde", piid="IND12PB00323") + mommy.make("awards.Award", id=1, latest_transaction_id=1, is_fpds=True, type="A", piid="IND12PB00323") + + @pytest.mark.django_db -def test_spending_by_transaction_kws_success(client): +def test_spending_by_transaction_kws_success(client, elasticsearch_transaction_index): """Verify error on bad autocomplete request for budget function.""" resp = client.post( - "/api/v2/search/spending_by_transaction/", + ENDPOINT, content_type="application/json", data=json.dumps( { @@ -34,9 +52,7 @@ def 
test_spending_by_transaction_kws_failure(client): """Verify error on bad autocomplete request for budget function.""" - resp = client.post( - "/api/v2/search/spending_by_transaction/", content_type="application/json", data=json.dumps({"filters": {}}) - ) + resp = client.post(ENDPOINT, content_type="application/json", data=json.dumps({"filters": {}})) assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY @@ -52,8 +68,84 @@ def test_no_intersection(client, refresh_matviews): } api_start = perf_counter() - resp = client.post("/api/v2/search/spending_by_award", content_type="application/json", data=json.dumps(request)) + resp = client.post(ENDPOINT, content_type="application/json", data=json.dumps(request)) api_end = perf_counter() assert resp.status_code == status.HTTP_200_OK assert api_end - api_start < 0.5, "Response took over 0.5s! Investigate why" assert len(resp.data["results"]) == 0, "Results returned, there should be 0" + + +@pytest.mark.django_db +def test_all_fields_returned(client, transaction_data, elasticsearch_transaction_index): + + elasticsearch_transaction_index.update_index() + + fields = [ + "Recipient Name", + "Action Date", + "Transaction Amount", + "Award Type", + "Awarding Agency", + "Awarding Sub Agency", + "Funding Agency", + "Funding Sub Agency", + "Issued Date", + "Loan Value", + "Subsidy Cost", + "Mod", + "Award ID", + "awarding_agency_id", + "internal_id", + "generated_internal_id", + "Last Date to Order", + ] + + request = { + "filters": {"keyword": "test", "award_type_codes": ["A", "B", "C", "D"]}, + "fields": fields, + "page": 1, + "limit": 5, + "sort": "Award ID", + "order": "desc", + } + + resp = client.post(ENDPOINT, content_type="application/json", data=json.dumps(request)) + + assert resp.status_code == status.HTTP_200_OK + assert len(resp.data["results"]) > 0 + for result in resp.data["results"]: + for field in fields: + assert field in result, f"Response item is missing field {field}" + + assert "Sausage" not in result + 
assert "A" not in result + + +@pytest.mark.django_db +def test_subset_of_fields_returned(client, transaction_data, elasticsearch_transaction_index): + + elasticsearch_transaction_index.update_index() + + fields = ["Award ID", "Recipient Name", "Mod"] + + request = { + "filters": {"keyword": "test", "award_type_codes": ["A", "B", "C", "D"]}, + "fields": fields, + "page": 1, + "limit": 5, + "sort": "Award ID", + "order": "desc", + } + + resp = client.post(ENDPOINT, content_type="application/json", data=json.dumps(request)) + + assert resp.status_code == status.HTTP_200_OK + assert len(resp.data["results"]) > 0 + for result in resp.data["results"]: + for field in fields: + assert field in result, f"Response item is missing field {field}" + + assert "internal_id" in result + assert "generated_internal_id" in result + + assert "Last Date to Order" not in result From 3d5f90132c3c3e87a0590f917fd59c63beb6c7ac Mon Sep 17 00:00:00 2001 From: Brian Zito Date: Fri, 20 Dec 2019 10:00:01 -0500 Subject: [PATCH 30/33] dev-3993 removed header from subgrants --- .../download/v2/download_column_historical_lookups.py | 1 - 1 file changed, 1 deletion(-) diff --git a/usaspending_api/download/v2/download_column_historical_lookups.py b/usaspending_api/download/v2/download_column_historical_lookups.py index 77cc02bfe2..56420fb939 100644 --- a/usaspending_api/download/v2/download_column_historical_lookups.py +++ b/usaspending_api/download/v2/download_column_historical_lookups.py @@ -1359,7 +1359,6 @@ "broker_subaward__place_of_perform_country_na", ), ("prime_award_description", "broker_subaward__award_description"), - ("prime_award_project_title", "broker_subaward__program_title"), ("prime_award_cfda_number", "broker_subaward__cfda_numbers"), ("prime_award_cfda_title", "broker_subaward__cfda_titles"), ("subaward_type", "broker_subaward__subaward_type"), From 17e0fdde1d3bd401fa4f2434b693e5a923a3f740 Mon Sep 17 00:00:00 2001 From: Brian Zito Date: Fri, 20 Dec 2019 11:09:45 -0500 Subject: 
[PATCH 31/33] fixed tests --- .../download/tests/integration/test_download_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/usaspending_api/download/tests/integration/test_download_status.py b/usaspending_api/download/tests/integration/test_download_status.py index cf0dae159b..5e329ba368 100644 --- a/usaspending_api/download/tests/integration/test_download_status.py +++ b/usaspending_api/download/tests/integration/test_download_status.py @@ -135,7 +135,7 @@ def test_download_assistance_status(client, download_test_data, refresh_matviews assert resp.status_code == status.HTTP_200_OK assert resp.json()["total_rows"] == 1 - assert resp.json()["total_columns"] == 89 + assert resp.json()["total_columns"] == 88 # Test with columns specified dl_resp = client.post( From 228ce68d252dc3955ace2f9f181c30406a669505 Mon Sep 17 00:00:00 2001 From: sethstoudenmier Date: Mon, 23 Dec 2019 11:28:13 -0800 Subject: [PATCH 32/33] [Warmfix][DEV-4032] Fix sorting on keyword search --- .../v2/lookups/elasticsearch_lookups.py | 16 ++++++---- .../tests/test_spending_by_transaction.py | 30 +++++++++++++++++++ .../search/v2/elasticsearch_helper.py | 17 +++++++---- 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py b/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py index 86280b6aed..aa79834319 100644 --- a/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py +++ b/usaspending_api/awards/v2/lookups/elasticsearch_lookups.py @@ -6,12 +6,12 @@ TRANSACTIONS_LOOKUP = { - "Recipient Name": "recipient_name", + "Recipient Name": "recipient_name.keyword", "Action Date": "action_date", "Transaction Amount": "transaction_amount", - "Award Type": "type_description", - "Awarding Agency": "awarding_toptier_agency_name", - "Awarding Sub Agency": "awarding_subtier_agency_name", + "Award Type": "type_description.keyword", + "Awarding Agency": "awarding_toptier_agency_name.keyword", + "Awarding Sub 
Agency": "awarding_subtier_agency_name.keyword", "Funding Agency": "funding_toptier_agency_name", "Funding Sub Agency": "funding_subtier_agency_name", "Issued Date": "period_of_performance_start_date", @@ -25,9 +25,15 @@ "Last Date to Order": "ordering_period_end_date", } +TRANSACTIONS_SOURCE_LOOKUP = {key: value.replace(".keyword", "") for key, value in TRANSACTIONS_LOOKUP.items()} INDEX_ALIASES_TO_AWARD_TYPES = deepcopy(all_award_types_mappings) INDEX_ALIASES_TO_AWARD_TYPES["directpayments"] = INDEX_ALIASES_TO_AWARD_TYPES.pop("direct_payments") INDEX_ALIASES_TO_AWARD_TYPES["other"] = INDEX_ALIASES_TO_AWARD_TYPES.pop("other_financial_assistance") -KEYWORD_DATATYPE_FIELDS = ["recipient_name", "awarding_toptier_agency_name", "awarding_subtier_agency_name"] +KEYWORD_DATATYPE_FIELDS = [ + "recipient_name.keyword", + "awarding_toptier_agency_name.keyword", + "awarding_subtier_agency_name.keyword", + "type_description.keyword", +] diff --git a/usaspending_api/search/tests/test_spending_by_transaction.py b/usaspending_api/search/tests/test_spending_by_transaction.py index 5085706b23..d2bbb78cd0 100644 --- a/usaspending_api/search/tests/test_spending_by_transaction.py +++ b/usaspending_api/search/tests/test_spending_by_transaction.py @@ -149,3 +149,33 @@ def test_subset_of_fields_returned(client, transaction_data, elasticsearch_trans assert "generated_internal_id" in result assert "Last Date to Order" not in result + + +@pytest.mark.django_db +def test_columns_can_be_sorted(client, transaction_data, elasticsearch_transaction_index): + + elasticsearch_transaction_index.update_index() + + fields = [ + "Action Date", + "Award ID", + "Awarding Agency", + "Awarding Sub Agency", + "Award Type", + "Mod", + "Recipient Name", + "Action Date", + ] + + request = { + "filters": {"keyword": "test", "award_type_codes": ["A", "B", "C", "D"]}, + "fields": fields, + "page": 1, + "limit": 5, + "order": "desc", + } + + for field in fields: + request["sort"] = field + resp = 
client.post(ENDPOINT, content_type="application/json", data=json.dumps(request)) + assert resp.status_code == status.HTTP_200_OK, f"Failed to sort column: {field}" diff --git a/usaspending_api/search/v2/elasticsearch_helper.py b/usaspending_api/search/v2/elasticsearch_helper.py index bf40fa3887..045688121c 100644 --- a/usaspending_api/search/v2/elasticsearch_helper.py +++ b/usaspending_api/search/v2/elasticsearch_helper.py @@ -3,15 +3,18 @@ from django.conf import settings -from usaspending_api.awards.v2.lookups.elasticsearch_lookups import KEYWORD_DATATYPE_FIELDS -from usaspending_api.awards.v2.lookups.elasticsearch_lookups import INDEX_ALIASES_TO_AWARD_TYPES -from usaspending_api.awards.v2.lookups.elasticsearch_lookups import TRANSACTIONS_LOOKUP +from usaspending_api.awards.v2.lookups.elasticsearch_lookups import ( + TRANSACTIONS_LOOKUP, + TRANSACTIONS_SOURCE_LOOKUP, + KEYWORD_DATATYPE_FIELDS, + INDEX_ALIASES_TO_AWARD_TYPES, +) from usaspending_api.common.elasticsearch.client import es_client_query logger = logging.getLogger("console") DOWNLOAD_QUERY_SIZE = settings.MAX_DOWNLOAD_LIMIT -TRANSACTIONS_LOOKUP.update({v: k for k, v in TRANSACTIONS_LOOKUP.items()}) +TRANSACTIONS_SOURCE_LOOKUP.update({v: k for k, v in TRANSACTIONS_SOURCE_LOOKUP.items()}) def es_sanitize(input_string): @@ -36,7 +39,9 @@ def es_minimal_sanitize(keyword): def swap_keys(dictionary_): - return dict((TRANSACTIONS_LOOKUP.get(old_key, old_key), new_key) for (old_key, new_key) in dictionary_.items()) + return dict( + (TRANSACTIONS_SOURCE_LOOKUP.get(old_key, old_key), new_key) for (old_key, new_key) in dictionary_.items() + ) def format_for_frontend(response): @@ -65,7 +70,7 @@ def search_transactions(request_data, lower_limit, limit): """ keyword = request_data["filters"]["keywords"] - query_fields = [TRANSACTIONS_LOOKUP[i] for i in request_data["fields"]] + query_fields = [TRANSACTIONS_SOURCE_LOOKUP[i] for i in request_data["fields"]] query_fields.extend(["award_id", 
"generated_unique_award_id"]) query_sort = TRANSACTIONS_LOOKUP[request_data["sort"]] query = { From 69f79bf374ed570a46f97035fffc3b0371b8b663 Mon Sep 17 00:00:00 2001 From: Tony Sappe <22781949+tony-sappe@users.noreply.github.com> Date: Thu, 26 Dec 2019 14:01:45 -0700 Subject: [PATCH 33/33] [DEV-1566] names were too long for windows --- usaspending_api/download/lookups.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/usaspending_api/download/lookups.py b/usaspending_api/download/lookups.py index 6fb6b10c06..bc8c5a03df 100644 --- a/usaspending_api/download/lookups.py +++ b/usaspending_api/download/lookups.py @@ -106,8 +106,8 @@ "source_type": "account", "table": FinancialAccountsByProgramActivityObjectClass, "table_name": "object_class_program_activity", - "download_name": "{data_quarters}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass_{timestamp}", - "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBreakdownByProgramActivityObjectClass_{timestamp}", + "download_name": "{data_quarters}_{agency}_{level}_AccountBreakdownByPA-OC_{timestamp}", + "zipfile_template": "{data_quarters}_{agency}_{level}_AccountBreakdownByPA-OC_{timestamp}", "filter_function": account_download_filter, }, "award_financial": {