diff --git a/usaspending_api/accounts/v2/filters/account_download.py b/usaspending_api/accounts/v2/filters/account_download.py index c74cf1cc1e..398c4c1a90 100644 --- a/usaspending_api/accounts/v2/filters/account_download.py +++ b/usaspending_api/accounts/v2/filters/account_download.py @@ -47,7 +47,7 @@ get_submission_ids_for_periods, ) -AWARD_URL = f"{HOST}/#/award/" if "localhost" in HOST else f"https://{HOST}/#/award/" +AWARD_URL = f"{HOST}/award/" if "localhost" in HOST else f"https://{HOST}/award/" def account_download_filter(account_type, download_table, filters, account_level="treasury_account"): diff --git a/usaspending_api/api_contracts/contracts/v2/download/count.md b/usaspending_api/api_contracts/contracts/v2/download/count.md index 644819bab4..518b66a515 100644 --- a/usaspending_api/api_contracts/contracts/v2/download/count.md +++ b/usaspending_api/api_contracts/contracts/v2/download/count.md @@ -48,7 +48,7 @@ Returns the number of transactions that would be included in a download request + `calculated_transaction_count` (required, number) The calculated count of all transactions which would be included in the download files. + `maximum_transaction_limit` (required, number) - The current allowed maximum number of transactions in a row-limited download. Visit https://www.usaspending.gov/#/download_center/custom_award_data to download larger volumes of data. + The current allowed maximum number of transactions in a row-limited download. Visit https://www.usaspending.gov/download_center/custom_award_data to download larger volumes of data. + `messages` (optional, array[string]) An array of warnings or instructional directives to aid consumers of this endpoint with development and debugging. + Body diff --git a/usaspending_api/api_contracts/contracts/v2/reporting/agencies/overview.md b/usaspending_api/api_contracts/contracts/v2/reporting/agencies/overview.md index 01e9556cfb..ee93f19a9b 100644 --- a/usaspending_api/api_contracts/contracts/v2/reporting/agencies/overview.md +++ b/usaspending_api/api_contracts/contracts/v2/reporting/agencies/overview.md @@ -36,7 +36,7 @@ This endpoint returns an overview list of government agencies submission data. + Members + `toptier_code` + `current_total_budget_authority_amount` - + `missing_tas_accounts_total` + + `tas_accounts_total` + `missing_tas_accounts_count` + `agency_name` + `obligation_difference` @@ -82,8 +82,8 @@ This endpoint returns an overview list of government agencies submission data. 
"missing_tas_accounts_count": 20 }, "obligation_difference": 436376232652.87, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 3, + "unlinked_assistance_award_count": 2, "assurance_statement_url": "https://files-nonprod.usaspending.gov/agency_submissions/Raw%20DATA%20Act%20Files/2020/P09/075%20-%20Department%20of%20Health%20and%20Human%20Services%20(HHS)/2020-P09-075_Department%20of%20Health%20and%20Human%20Services%20(HHS)-Assurance_Statement.txt" }, { diff --git a/usaspending_api/api_contracts/contracts/v2/reporting/agencies/toptier_code/fiscal_year/fiscal_period/unlinked_awards/type.md b/usaspending_api/api_contracts/contracts/v2/reporting/agencies/toptier_code/fiscal_year/fiscal_period/unlinked_awards/type.md new file mode 100644 index 0000000000..f4066d0b70 --- /dev/null +++ b/usaspending_api/api_contracts/contracts/v2/reporting/agencies/toptier_code/fiscal_year/fiscal_period/unlinked_awards/type.md @@ -0,0 +1,37 @@ +FORMAT: 1A +HOST: https://api.usaspending.gov + +# Agencies' Unlinked Awards [/api/v2/reporting/agencies/{toptier_code}/{fiscal_year}/{fiscal_period}/unlinked_awards/{type}/] + +This endpoint is used to power USAspending.gov's About the Data \| Agencies unlinked data modals. + +## GET + +This endpoint returns the number of unlinked and linked awards for the agency in the provided fiscal year and period. + ++ Parameters + + `toptier_code`: `020` (required, string) + The specific agency code. + + `fiscal_year`: 2020 (required, number) + The fiscal year of the submission + + `fiscal_period`: 10 (required, number) + The fiscal period of the submission. valid values: 2-12 (2 = November ... 12 = September) + For retrieving quarterly submissions, provide the period which equals 'quarter * 3' (e.g. Q2 = P6) + + `type`: `assistance` (required, enum[string]) + + Members + + `assistance` + + `procurement` + ++ Response 200 (application/json) + + + Attributes (object) + + `unlinked_file_c_award_count` (required, number) + + `unlinked_file_d_award_count` (required, number) + + `total_linked_award_count` (required, number) + + Body + + { + "unlinked_file_c_award_count": 123213, + "unlinked_file_d_award_count": 43543, + "total_linked_award_count": 12321312 + } \ No newline at end of file diff --git a/usaspending_api/api_contracts/contracts/v2/reporting/agencies/toptier_code/overview.md b/usaspending_api/api_contracts/contracts/v2/reporting/agencies/toptier_code/overview.md index ebb356fa01..16b9e4186a 100644 --- a/usaspending_api/api_contracts/contracts/v2/reporting/agencies/toptier_code/overview.md +++ b/usaspending_api/api_contracts/contracts/v2/reporting/agencies/toptier_code/overview.md @@ -32,7 +32,7 @@ This endpoint returns an overview of government agency submission data. + `fiscal_period` + `fiscal_year` + `missing_tas_accounts_count` - + `missing_tas_accounts_total` + + `tas_accounts_total` + `obligation_difference` + `percent_of_total_budgetary_resources` + `recent_publication_date` @@ -78,8 +78,8 @@ This endpoint returns an overview of government agency submission data. 
"missing_tas_accounts_count": 9 }, "obligation_difference": 12581114.45, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 2, + "unlinked_assistance_award_count": 5, "assurance_statement_url": "https://files.usaspending.gov/agency_submissions/Raw%20DATA%20Act%20Files/2020/P07/020%20-%20Department%20of%20the%20Treasury%20(TREAS)/2020-P07-020_Department%20of%20the%20Treasury%20(TREAS)-Assurance_Statement.txt" }, { diff --git a/usaspending_api/api_contracts/contracts/v2/reporting/placeholder.md b/usaspending_api/api_contracts/contracts/v2/reporting/placeholder.md deleted file mode 100644 index 2891c541f8..0000000000 --- a/usaspending_api/api_contracts/contracts/v2/reporting/placeholder.md +++ /dev/null @@ -1,21 +0,0 @@ -FORMAT: 1A -HOST: https://api.usaspending.gov - -# Placeholder [/api/v2/reporting/placeholder/] - -Description of the endpoint as a whole not taking into account the different HTTP methods. - -## GET - -Description of the endpoint using the above HTTP method. - - -+ Response 200 (application/json) - + Attributes - + `status` (required, string) - - + Body - - { - "status": "success" - } diff --git a/usaspending_api/api_contracts/contracts/v2/subawards.md b/usaspending_api/api_contracts/contracts/v2/subawards.md index ef6f70249c..70f4e7aee9 100644 --- a/usaspending_api/api_contracts/contracts/v2/subawards.md +++ b/usaspending_api/api_contracts/contracts/v2/subawards.md @@ -47,15 +47,37 @@ This endpoint returns a filtered set of subawards. + `results` (required, array[SubawardResponse], fixed-type) + `page_metadata` (required, PageMetadataObject) + + Body + + { + "page_metadata": { + "page": 1, + "next": 2, + "previous": null, + "hasNext": true, + "hasPrevious": false + }, + "results": [ + { + "id": 119270129, + "subaward_number": "Z981002", + "description": "DEVELOPMENT OF A SELF-SUSTAINED WIRELESS INTEGRATED STRUCTURAL HEALTH MONITORING SYSTEM FOR HIGHWAY BRIDGES", + "action_date": "2011-10-27", + "amount": 110000.0, + "recipient_name": "URS GROUP, INC." + } + ] + } + # Data Structures ## SubawardResponse (object) -+ `subaward_number` (required, string) -+ `amount` (required, number) + `id` (required, number) ++ `subaward_number` (required, string) ++ `description` (required, string) + `action_date` (required, string) ++ `amount` (required, number) + `recipient_name` (required, string) -+ `description` (required, string) ## PageMetadataObject (object) + `page` (required, number) diff --git a/usaspending_api/api_docs/management/commands/generate_model_markdown.py b/usaspending_api/api_docs/management/commands/generate_model_markdown.py index e25079ab2e..f0cfa801ad 100644 --- a/usaspending_api/api_docs/management/commands/generate_model_markdown.py +++ b/usaspending_api/api_docs/management/commands/generate_model_markdown.py @@ -7,7 +7,7 @@ class Command(BaseCommand): help = "Generates a markdown file of a model's fields and help text \ for use in documentation \ Usage: `python manage.py generate_model_markdown `" - logger = logging.getLogger("console") + logger = logging.getLogger("script") friendly_names = { "ForeignKey": "Relation", diff --git a/usaspending_api/api_docs/markdown/endpoints.md b/usaspending_api/api_docs/markdown/endpoints.md index 591ac9877f..d24c57577d 100644 --- a/usaspending_api/api_docs/markdown/endpoints.md +++ b/usaspending_api/api_docs/markdown/endpoints.md @@ -147,6 +147,7 @@ The currently available endpoints are listed in the following table. 
|[/api/v2/reporting/agencies/overview/](/api/v2/reporting/agencies/overview/)|GET| Returns About the Data information about all agencies with submissions in a provided fiscal year and period| |[/api/v2/reporting/agencies/publish_dates/](/api/v2/reporting/agencies/publish_dates/)|GET| Returns submission publication and certification information about all agencies with submissions in a provided fiscal year and period| |[/api/v2/reporting/agencies////submission_history/](/api/v2/reporting/agencies/020/2020/12/submission_history/)|GET| Returns a list of submission publication dates and certified dates for the provided agency for the provided fiscal year and period. | +|[/api/v2/reporting/agencies////unlinked_awards//](/api/v2/reporting/agencies/020/2020/12/unlinked_awards/procurement/)|GET| Returns counts of an agency's linked and unlinked awards for a given period. | |[/api/v2/search/new_awards_over_time/](/api/v2/search/new_awards_over_time/)|POST| Returns a list of time periods with the new awards in the appropriate period within the provided time range | |[/api/v2/search/spending_by_award/](/api/v2/search/spending_by_award/)|POST| Returns the fields of the filtered awards | |[/api/v2/search/spending_by_award_count/](/api/v2/search/spending_by_award_count/)|POST| Returns the number of awards in each award type (Contracts, IDV, Loans, Direct Payments, Grants, and Other) | diff --git a/usaspending_api/api_docs/unused_markdown/request_recipes.md b/usaspending_api/api_docs/unused_markdown/request_recipes.md index bb816db975..36974cfbe4 100644 --- a/usaspending_api/api_docs/unused_markdown/request_recipes.md +++ b/usaspending_api/api_docs/unused_markdown/request_recipes.md @@ -96,4 +96,4 @@ POST # Postman Collections -[Postman](https://www.getpostman.com/) is a free app for making easy API requests. You can also use it to import and inspect a collection of pre-generated API requests. [Here is a postman collection](https://raw.githubusercontent.com/fedspendingtransparency/usaspending-api/master/usaspending_api/static_doc_files/docs/usaspending_searchpage_postmancollection.json) you can use to see how we generate the visualizations on [the search page](https://www.usaspending.gov/#/search/). +[Postman](https://www.getpostman.com/) is a free app for making easy API requests. You can also use it to import and inspect a collection of pre-generated API requests. [Here is a postman collection](https://raw.githubusercontent.com/fedspendingtransparency/usaspending-api/master/usaspending_api/static_doc_files/docs/usaspending_searchpage_postmancollection.json) you can use to see how we generate the visualizations on [the search page](https://www.usaspending.gov/search/). diff --git a/usaspending_api/api_docs/unused_markdown/using_the_api.md b/usaspending_api/api_docs/unused_markdown/using_the_api.md index 91e17af5d7..5ccbe7f055 100644 --- a/usaspending_api/api_docs/unused_markdown/using_the_api.md +++ b/usaspending_api/api_docs/unused_markdown/using_the_api.md @@ -365,11 +365,11 @@ The endpoints described in this section generate files that reflect the site's u #### Award Data Archive -On a monthly basis, the website pre-generates a series of commonly used files based on the agency, fiscal year, and award type. You can find these on the [Award Data Archive](https://www.usaspending.gov/#/download_center/award_data_archive) page. You can also access this information via the API's [List Downloads Endpoint](https://api.usaspending.gov/api/v2/bulk_download/list_monthly_files/). 
+On a monthly basis, the website pre-generates a series of commonly used files based on the agency, fiscal year, and award type. You can find these on the [Award Data Archive](https://www.usaspending.gov/download_center/award_data_archive) page. You can also access this information via the API's [List Downloads Endpoint](https://api.usaspending.gov/api/v2/bulk_download/list_monthly_files/). #### Generating Download Files -**Reminder**: Before using these endpoints, check the [Award Data Archive](https://usaspending.gov/#/download_center/award_data_archive) for pre-generated files +**Reminder**: Before using these endpoints, check the [Award Data Archive](https://usaspending.gov/download_center/award_data_archive) for pre-generated files There are several downloadable endpoints, all with different features/constraints. diff --git a/usaspending_api/awards/management/commands/fix_missing_agencies.py b/usaspending_api/awards/management/commands/fix_missing_agencies.py index 5a3ad039fe..201dc11eae 100644 --- a/usaspending_api/awards/management/commands/fix_missing_agencies.py +++ b/usaspending_api/awards/management/commands/fix_missing_agencies.py @@ -10,7 +10,7 @@ from django.core.management.base import BaseCommand from django.db import connection -logger = logging.getLogger("console") +logger = logging.getLogger("script") BATCH_SIZE = 10000 diff --git a/usaspending_api/awards/management/commands/load_subawards.py b/usaspending_api/awards/management/commands/load_subawards.py index 479e2217b9..3952d4f4db 100644 --- a/usaspending_api/awards/management/commands/load_subawards.py +++ b/usaspending_api/awards/management/commands/load_subawards.py @@ -9,7 +9,7 @@ from usaspending_api.etl.operations.subaward.update_city_county import update_subaward_city_county -logger = logging.getLogger("console") +logger = logging.getLogger("script") class Command(mixins.ETLMixin, BaseCommand): diff --git a/usaspending_api/awards/management/commands/restock_parent_award.py b/usaspending_api/awards/management/commands/restock_parent_award.py index 732ccd68e9..39cf5a2b22 100644 --- a/usaspending_api/awards/management/commands/restock_parent_award.py +++ b/usaspending_api/awards/management/commands/restock_parent_award.py @@ -8,7 +8,7 @@ class Command(BaseCommand): help = "Empty and repopulate parent_award table with IDV aggregates and counts" - logger = logging.getLogger("console") + logger = logging.getLogger("script") def add_arguments(self, parser): diff --git a/usaspending_api/awards/management/commands/update_transaction_fiscal_year.py b/usaspending_api/awards/management/commands/update_transaction_fiscal_year.py index a04001da2d..cddf66afad 100644 --- a/usaspending_api/awards/management/commands/update_transaction_fiscal_year.py +++ b/usaspending_api/awards/management/commands/update_transaction_fiscal_year.py @@ -6,7 +6,7 @@ class Command(BaseCommand): help = "Updates the fiscal year for all transactions based on their individual action dates" - logger = logging.getLogger("console") + logger = logging.getLogger("script") def handle(self, *args, **options): all_transactions = TransactionNormalized.objects.all() diff --git a/usaspending_api/awards/tests/integration/test_subaward_endpoint.py b/usaspending_api/awards/tests/integration/test_subaward_endpoint.py index 43d638acab..4739ca1501 100644 --- a/usaspending_api/awards/tests/integration/test_subaward_endpoint.py +++ b/usaspending_api/awards/tests/integration/test_subaward_endpoint.py @@ -8,22 +8,22 @@ subaward_1, subaward_2, subaward_3, + subaward_10, + 
subaward_11, subaward_12, ) @pytest.mark.django_db -def test_subaward_success(client): - - resp = client.post( - "/api/v2/subawards/", content_type="application/json", data=json.dumps({"order": "desc", "limit": 100}) - ) +def test_subaward_no_params(client): + create_subaward_test_data(subaward_1, subaward_2, subaward_3) + resp = client.post("/api/v2/subawards/", content_type="application/json") assert resp.status_code == status.HTTP_200_OK + assert len(json.loads(resp.content.decode("utf-8"))["results"]) == 3 @pytest.mark.django_db def test_subaward_failure(client): - resp = client.post( "/api/v2/subawards/", content_type="application/json", @@ -33,33 +33,54 @@ def test_subaward_failure(client): @pytest.mark.django_db -def test_subaward_query_1(client): +def test_subaward_limit(client): create_subaward_test_data(subaward_1, subaward_2, subaward_3) resp = client.post( "/api/v2/subawards/", content_type="application/json", - data=json.dumps({"order": "desc", "limit": 100, "award_id": 99}), + data=json.dumps({"limit": 2}), ) - assert len(json.loads(resp.content.decode("utf-8"))["results"]) == 3 + assert len(json.loads(resp.content.decode("utf-8"))["results"]) == 2 @pytest.mark.django_db -def test_subaward_query_2(client): - create_subaward_test_data(subaward_12) +def test_subaward_filters(client): + create_subaward_test_data(subaward_1, subaward_2, subaward_3, subaward_12, subaward_11) + resp = client.post( + "/api/v2/subawards/", + content_type="application/json", + data=json.dumps({"award_id": 99}), + ) + assert len(json.loads(resp.content.decode("utf-8"))["results"]) == 4 + + resp = client.post( + "/api/v2/subawards/", + content_type="application/json", + data=json.dumps({"award_id": 88}), + ) + results = json.loads(resp.content.decode("utf-8"))["results"] + assert len(results) == 1 + assert results[0]["id"] == 12 + resp = client.post( "/api/v2/subawards/", content_type="application/json", - data=json.dumps({"order": "desc", "limit": 100, "award_id": 88}), + data=json.dumps({"award_id": "generated_unique_award_id_for_88"}), ) - assert json.loads(resp.content.decode("utf-8"))["results"][0]["id"] == 12 + results = json.loads(resp.content.decode("utf-8"))["results"] + assert len(results) == 1 + assert results[0]["id"] == 12 @pytest.mark.django_db -def test_subaward_query_3(client): - create_subaward_test_data(subaward_12) +def test_subaward_sorting(client): + create_subaward_test_data(subaward_1, subaward_12, subaward_10, subaward_2, subaward_3) resp = client.post( "/api/v2/subawards/", content_type="application/json", - data=json.dumps({"order": "desc", "limit": 100, "award_id": "generated_unique_award_id_for_88"}), + data=json.dumps({"sort": "description", "order": "asc"}), ) - assert json.loads(resp.content.decode("utf-8"))["results"][0]["id"] == 12 + results = json.loads(resp.content.decode("utf-8"))["results"] + assert len(results) == 5 + assert results[0]["id"] == 1 + assert results[4]["id"] == 12 diff --git a/usaspending_api/awards/v2/views/subawards.py b/usaspending_api/awards/v2/views/subawards.py index 7360c4a65f..8e71c93653 100644 --- a/usaspending_api/awards/v2/views/subawards.py +++ b/usaspending_api/awards/v2/views/subawards.py @@ -66,10 +66,15 @@ def _business_logic(self, request_data): queryset = queryset.values(*list(self.subaward_lookup.values())) + # always secondary-sort by PK in case a repeating value (e.g. 
subaward_number) crosses pages, so suborder isn't arbitrary if request_data["order"] == "desc": - queryset = queryset.order_by(F(self.subaward_lookup[request_data["sort"]]).desc(nulls_last=True)) + queryset = queryset.order_by( + F(self.subaward_lookup[request_data["sort"]]).desc(nulls_last=True), F("subaward_id").desc() + ) else: - queryset = queryset.order_by(F(self.subaward_lookup[request_data["sort"]]).asc(nulls_first=True)) + queryset = queryset.order_by( + F(self.subaward_lookup[request_data["sort"]]).asc(nulls_first=True), F("subaward_id").asc() + ) rows = list(queryset[lower_limit : upper_limit + 1]) return [{k: row[v] for k, v in self.subaward_lookup.items()} for row in rows] @@ -79,7 +84,5 @@ def post(self, request): request_data = self._parse_and_validate_request(request.data) results = self._business_logic(request_data) page_metadata = get_simple_pagination_metadata(len(results), request_data["limit"], request_data["page"]) - response = {"page_metadata": page_metadata, "results": results[: request_data["limit"]]} - return Response(response) diff --git a/usaspending_api/broker/helpers/delete_fabs_transactions.py b/usaspending_api/broker/helpers/delete_fabs_transactions.py index 7ccbe62160..7a27be7383 100644 --- a/usaspending_api/broker/helpers/delete_fabs_transactions.py +++ b/usaspending_api/broker/helpers/delete_fabs_transactions.py @@ -5,7 +5,7 @@ from usaspending_api.common.helpers.timing_helpers import timer -logger = logging.getLogger("console") +logger = logging.getLogger("script") def delete_fabs_transactions(ids_to_delete): diff --git a/usaspending_api/broker/helpers/delete_stale_fabs.py b/usaspending_api/broker/helpers/delete_stale_fabs.py index 0f937c1d28..105328f4c8 100644 --- a/usaspending_api/broker/helpers/delete_stale_fabs.py +++ b/usaspending_api/broker/helpers/delete_stale_fabs.py @@ -7,7 +7,7 @@ from usaspending_api.broker.helpers.find_related_awards import find_related_awards -logger = logging.getLogger("console") +logger = logging.getLogger("script") @transaction.atomic diff --git a/usaspending_api/broker/helpers/last_load_date.py b/usaspending_api/broker/helpers/last_load_date.py index 64f797f878..6b19858bbe 100644 --- a/usaspending_api/broker/helpers/last_load_date.py +++ b/usaspending_api/broker/helpers/last_load_date.py @@ -1,9 +1,13 @@ +import logging + from datetime import timedelta from usaspending_api.broker import lookups from usaspending_api.broker.models import ExternalDataLoadDate from usaspending_api.common.helpers.date_helper import cast_datetime_to_utc +logger = logging.getLogger("script") + def get_last_load_date(key, lookback_minutes=None, default=None): """ @@ -25,7 +29,10 @@ def get_last_load_date(key, lookback_minutes=None, default=None): .first() ) if last_load_date is None: + logger.warning(f"No record of a previous run for `{key}` was found!") return default + else: + logger.info(f"Value for previous `{key}` ETL: {last_load_date}") if lookback_minutes is not None: last_load_date -= timedelta(minutes=lookback_minutes) return last_load_date diff --git a/usaspending_api/broker/helpers/store_deleted_fabs.py b/usaspending_api/broker/helpers/store_deleted_fabs.py index 6d66ca3448..76b1ced175 100644 --- a/usaspending_api/broker/helpers/store_deleted_fabs.py +++ b/usaspending_api/broker/helpers/store_deleted_fabs.py @@ -6,7 +6,7 @@ from django.conf import settings -logger = logging.getLogger("console") +logger = logging.getLogger("script") def store_deleted_fabs(ids_to_delete): diff --git
a/usaspending_api/broker/helpers/upsert_fabs_transactions.py b/usaspending_api/broker/helpers/upsert_fabs_transactions.py index 6440bd8802..06f3ae87bc 100644 --- a/usaspending_api/broker/helpers/upsert_fabs_transactions.py +++ b/usaspending_api/broker/helpers/upsert_fabs_transactions.py @@ -18,7 +18,7 @@ from usaspending_api.references.models import Agency -logger = logging.getLogger("console") +logger = logging.getLogger("script") BATCH_FETCH_SIZE = 25000 diff --git a/usaspending_api/broker/management/commands/fabs_nightly_loader.py b/usaspending_api/broker/management/commands/fabs_nightly_loader.py index d4ad82ca50..60292a5906 100644 --- a/usaspending_api/broker/management/commands/fabs_nightly_loader.py +++ b/usaspending_api/broker/management/commands/fabs_nightly_loader.py @@ -11,8 +11,7 @@ delete_fabs_transactions, get_delete_pks_for_afa_keys, ) -from usaspending_api.broker.helpers.last_load_date import get_last_load_date -from usaspending_api.broker.helpers.last_load_date import update_last_load_date +from usaspending_api.broker.helpers.last_load_date import get_last_load_date, update_last_load_date from usaspending_api.broker.helpers.upsert_fabs_transactions import upsert_fabs_transactions from usaspending_api.broker.models import ExternalDataLoadDate from usaspending_api.common.helpers.date_helper import cast_datetime_to_naive, datetime_command_line_argument_type @@ -48,7 +47,7 @@ def get_incremental_load_start_datetime(): to prevent FABS transactions submitted between when the source records are copied from Broker and when FABS transactions are processed from being skipped. - An unfortunate side effect of the lookback is that some submissions may be processed more than + An unfortunate side effect of the look back is that some submissions may be processed more than once. This SHOULDN'T cause any problems since the FABS loader is designed to be able to reload transactions, but it could add to the run time. To minimize reprocessing, keep the LAST_LOAD_LOOKBACK_MINUTES value as small as possible while still preventing skips. To be @@ -65,10 +64,12 @@ def get_incremental_load_start_datetime(): max_updated_at = TransactionFABS.objects.aggregate(Max("updated_at"))["updated_at__max"] if max_updated_at is None: return last_load_date + else: + logger.info(f"Most recent update_date in `transaction_fabs` {max_updated_at}") # We add a little tiny bit of time to the max_updated_at to prevent us from always reprocessing # records since the SQL that grabs new records is using updated_at >=. I realize this is a hack - # but the pipeline is already running for too long so anything we can do to prevent enlongating + # but the pipeline is already running for too long so anything we can do to prevent elongating # it should be welcome. max_updated_at += timedelta(milliseconds=UPDATED_AT_MODIFIER_MS) @@ -203,7 +204,7 @@ def handle(self, *args, **options): if is_incremental_load: start_datetime = get_incremental_load_start_datetime() - logger.info("Processing data for FABS starting from %s" % start_datetime) + logger.info(f"Processing data for FABS starting from {start_datetime} (includes offset)") # We only perform deletes with incremental loads. 
with timer("obtaining delete records", logger.info): @@ -212,13 +213,14 @@ def handle(self, *args, **options): ids_to_delete = get_delete_pks_for_afa_keys(ids_to_delete) logger.info(f"{len(ids_to_delete):,} delete ids found in total") - with timer("retrieving/diff-ing FABS Data", logger.info): + with timer("retrieving IDs of FABS to process", logger.info): ids_to_upsert = get_fabs_transaction_ids(ids, afa_ids, start_datetime, end_datetime) update_award_ids = delete_fabs_transactions(ids_to_delete) if is_incremental_load else [] upsert_fabs_transactions(ids_to_upsert, update_award_ids) if is_incremental_load: + logger.info(f"Storing {processing_start_datetime} for the next incremental run") update_last_load_date("fabs", processing_start_datetime) logger.info("FABS UPDATE FINISHED!") diff --git a/usaspending_api/broker/management/commands/load_broker_static_data.py b/usaspending_api/broker/management/commands/load_broker_static_data.py index 5dbf4c42ed..9dc0eea391 100644 --- a/usaspending_api/broker/management/commands/load_broker_static_data.py +++ b/usaspending_api/broker/management/commands/load_broker_static_data.py @@ -6,7 +6,7 @@ from usaspending_api.broker import lookups from usaspending_api.broker.models import ExternalDataType -logger = logging.getLogger("console") +logger = logging.getLogger("script") @transaction.atomic diff --git a/usaspending_api/broker/management/commands/update_agency_code_name_fabs_fpds.py b/usaspending_api/broker/management/commands/update_agency_code_name_fabs_fpds.py index 9ead835527..406c2ffcbf 100644 --- a/usaspending_api/broker/management/commands/update_agency_code_name_fabs_fpds.py +++ b/usaspending_api/broker/management/commands/update_agency_code_name_fabs_fpds.py @@ -6,7 +6,7 @@ from django.core.management.base import BaseCommand, CommandError from django.db import connection -logger = logging.getLogger("console") +logger = logging.getLogger("script") class Command(BaseCommand): diff --git a/usaspending_api/broker/management/commands/update_awarding_agencies.py b/usaspending_api/broker/management/commands/update_awarding_agencies.py index 9c257cd112..bf7a679379 100644 --- a/usaspending_api/broker/management/commands/update_awarding_agencies.py +++ b/usaspending_api/broker/management/commands/update_awarding_agencies.py @@ -8,7 +8,7 @@ from usaspending_api.references.models import Agency -logger = logging.getLogger("console") +logger = logging.getLogger("script") agency_no_sub_map = { diff --git a/usaspending_api/broker/management/commands/update_duns.py b/usaspending_api/broker/management/commands/update_duns.py index a32f586da3..fc6d1aac44 100644 --- a/usaspending_api/broker/management/commands/update_duns.py +++ b/usaspending_api/broker/management/commands/update_duns.py @@ -9,7 +9,7 @@ import logging -logger = logging.getLogger("console") +logger = logging.getLogger("script") class Command(BaseCommand): diff --git a/usaspending_api/broker/management/commands/update_transactions.py b/usaspending_api/broker/management/commands/update_transactions.py index 40eef914d8..9f713b545e 100644 --- a/usaspending_api/broker/management/commands/update_transactions.py +++ b/usaspending_api/broker/management/commands/update_transactions.py @@ -14,7 +14,7 @@ from usaspending_api.etl.award_helpers import update_awards, update_procurement_awards, update_assistance_awards -logger = logging.getLogger("console") +logger = logging.getLogger("script") exception_logger = logging.getLogger("exceptions") # Lists to store for update_awards and update_procurement_awards 
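The fabs_nightly_loader.py hunks above log the previous ETL date and the newest transaction_fabs.updated_at before picking the incremental start datetime. A minimal sketch of that windowing idea follows; incremental_start, LOOKBACK_MINUTES, and UPDATED_AT_MODIFIER_MS are illustrative stand-ins rather than the loader's real names and settings, and returning the later of the two candidates is an assumption since the hunk ends before the function's return statement.

    # Minimal sketch of the incremental-window idea, not the loader's actual code.
    from datetime import datetime, timedelta, timezone
    from typing import Optional

    LOOKBACK_MINUTES = 15        # illustrative stand-in for the configured lookback
    UPDATED_AT_MODIFIER_MS = 1   # illustrative stand-in for the loader's modifier

    def incremental_start(last_load_date: datetime, max_updated_at: Optional[datetime]) -> datetime:
        # Look back from the catalogued last-load date so source records copied
        # from Broker just before the previous run are not skipped.
        windowed = last_load_date - timedelta(minutes=LOOKBACK_MINUTES)
        if max_updated_at is None:
            return windowed
        # Nudge the newest updated_at forward so an "updated_at >=" extraction
        # query does not reprocess the boundary record on every run.
        nudged = max_updated_at + timedelta(milliseconds=UPDATED_AT_MODIFIER_MS)
        return max(windowed, nudged)  # assumption: the later candidate wins

    print(incremental_start(datetime(2020, 12, 31, 6, 0, tzinfo=timezone.utc), None))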
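The subawards.py change earlier in this diff appends the primary key to the ORDER BY so that rows sharing a sort value (such as a repeated subaward_number) keep a deterministic order across pages. The same pattern in isolation, as a sketch that assumes a hypothetical queryset whose model uses subaward_id as its primary key:

    # Generic sketch of a deterministic sort with a primary-key tiebreaker.
    from django.db.models import F

    def apply_stable_ordering(queryset, sort_column, descending):
        # Ties on sort_column are broken by the primary key, so paging through
        # the queryset never repeats or skips a row at page boundaries.
        if descending:
            return queryset.order_by(F(sort_column).desc(nulls_last=True), F("subaward_id").desc())
        return queryset.order_by(F(sort_column).asc(nulls_first=True), F("subaward_id").asc())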
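The new unlinked_awards contract added earlier in this diff documents a GET endpoint that returns three counts. A small usage sketch against the documented URL pattern, using the example values from the contract and endpoints table (agency 020, FY 2020, period 10, type procurement); the printed keys are the three fields the contract defines, and the requests library is an assumed client choice:

    import requests

    url = "https://api.usaspending.gov/api/v2/reporting/agencies/020/2020/10/unlinked_awards/procurement/"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    payload = response.json()
    # The contract defines exactly these three keys in the response body.
    print(payload["unlinked_file_c_award_count"])
    print(payload["unlinked_file_d_award_count"])
    print(payload["total_linked_award_count"])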
diff --git a/usaspending_api/common/helpers/etl_helpers.py b/usaspending_api/common/helpers/etl_helpers.py index d862746029..62f2d045d4 100644 --- a/usaspending_api/common/helpers/etl_helpers.py +++ b/usaspending_api/common/helpers/etl_helpers.py @@ -7,7 +7,7 @@ from usaspending_api.common.helpers.sql_helpers import read_sql_file -logger = logging.getLogger("console") +logger = logging.getLogger("script") _ETL_SQL_FILE_PATH = settings.APP_DIR / "etl" / "management" / "sql" / "c_file_linkage" @@ -47,23 +47,21 @@ def update_c_to_d_linkages(type, count=True, submission_id=None): else: raise InvalidParameterException("Invalid type provided to process C to D linkages.") - file_paths = [str(_ETL_SQL_FILE_PATH / file_name) for file_name in file_names] - if count: starting_unlinked_count = get_unlinked_count(file_name=unlinked_count_file_name) logger.info("Current count of unlinked %s records: %s" % (type, str(starting_unlinked_count))) total_start = datetime.now() - for file_name in file_paths: + for file_name in file_names: start = datetime.now() - logger.info("Running %s" % file_name) - sql_commands = read_sql_file(file_path=file_name) + logger.info(f"Running {file_name}") + sql_commands = read_sql_file(file_path=str(_ETL_SQL_FILE_PATH / file_name)) for command in sql_commands: submission_id_clause = f"and faba_sub.submission_id = {submission_id}" if submission_id else "" command = command.format(submission_id_clause=submission_id_clause) with connection.cursor() as cursor: cursor.execute(command) - logger.info("Finished %s in %s seconds" % (file_name, str(datetime.now() - start))) + logger.info(f"Finished {file_name} in {str(datetime.now() - start)} seconds") if count: ending_unlinked_count = get_unlinked_count(file_name=unlinked_count_file_name) diff --git a/usaspending_api/common/management/commands/clear_usaspending_cache.py b/usaspending_api/common/management/commands/clear_usaspending_cache.py index 016986f40b..befe1b194a 100644 --- a/usaspending_api/common/management/commands/clear_usaspending_cache.py +++ b/usaspending_api/common/management/commands/clear_usaspending_cache.py @@ -10,7 +10,7 @@ class Command(BaseCommand): """ help = "Clears the usaspending-cache" - logger = logging.getLogger("console") + logger = logging.getLogger("script") def handle(self, *args, **options): self.logger.info("Clearing usaspending-cache...") diff --git a/usaspending_api/common/management/commands/disaster_spending_report.py b/usaspending_api/common/management/commands/disaster_spending_report.py index a5a02f6530..414823cd9b 100644 --- a/usaspending_api/common/management/commands/disaster_spending_report.py +++ b/usaspending_api/common/management/commands/disaster_spending_report.py @@ -22,7 +22,7 @@ ASSISTANCE_SQL = read_text_file("usaspending_api/common/management/sql/disaster_spending_assistance.sql") CONTRACT_SQL = read_text_file("usaspending_api/common/management/sql/disaster_spending_contracts.sql") -logger = logging.getLogger("console") +logger = logging.getLogger("script") def dump_to_csv(filepath, data_lines): diff --git a/usaspending_api/common/management/commands/matview_runner.py b/usaspending_api/common/management/commands/matview_runner.py index 5a047aa99d..b98691fa4e 100644 --- a/usaspending_api/common/management/commands/matview_runner.py +++ b/usaspending_api/common/management/commands/matview_runner.py @@ -22,7 +22,7 @@ ) from usaspending_api.common.helpers.sql_helpers import get_database_dsn_string -logger = logging.getLogger("console") +logger = logging.getLogger("script") class 
Command(BaseCommand): diff --git a/usaspending_api/common/management/commands/repair_sequences.py b/usaspending_api/common/management/commands/repair_sequences.py index 3505aa2bf1..dfaf7b616d 100644 --- a/usaspending_api/common/management/commands/repair_sequences.py +++ b/usaspending_api/common/management/commands/repair_sequences.py @@ -10,7 +10,7 @@ class Command(BaseCommand): """ help = "Generate SQL to repair primary key sequences" - logger = logging.getLogger("console") + logger = logging.getLogger("script") def handle(self, *args, **options): fixable_apps = ["accounts", "awards", "common", "financial_activities", "references", "submissions"] diff --git a/usaspending_api/conftest.py b/usaspending_api/conftest.py index dcb03e022f..d27a522fd7 100644 --- a/usaspending_api/conftest.py +++ b/usaspending_api/conftest.py @@ -144,7 +144,10 @@ def elasticsearch_transaction_index(db): See test_demo_elasticsearch_tests.py for sample usage. """ elastic_search_index = TestElasticSearchIndex("transaction") - with override_settings(ES_TRANSACTIONS_QUERY_ALIAS_PREFIX=elastic_search_index.alias_prefix): + with override_settings( + ES_TRANSACTIONS_QUERY_ALIAS_PREFIX=elastic_search_index.alias_prefix, + ES_TRANSACTIONS_WRITE_ALIAS=elastic_search_index.etl_config["write_alias"], + ): yield elastic_search_index elastic_search_index.delete_index() @@ -159,7 +162,10 @@ def elasticsearch_award_index(db): See test_award_index_elasticsearch_tests.py for sample usage. """ elastic_search_index = TestElasticSearchIndex("award") - with override_settings(ES_AWARDS_QUERY_ALIAS_PREFIX=elastic_search_index.alias_prefix): + with override_settings( + ES_AWARDS_QUERY_ALIAS_PREFIX=elastic_search_index.alias_prefix, + ES_AWARDS_WRITE_ALIAS=elastic_search_index.etl_config["write_alias"], + ): yield elastic_search_index elastic_search_index.delete_index() @@ -174,7 +180,10 @@ def elasticsearch_account_index(db): See test_account_index_elasticsearch_tests.py for sample usage. 
""" elastic_search_index = TestElasticSearchIndex("covid19-faba") - with override_settings(ES_COVID19_FABA_QUERY_ALIAS_PREFIX=elastic_search_index.alias_prefix): + with override_settings( + ES_COVID19_FABA_QUERY_ALIAS_PREFIX=elastic_search_index.alias_prefix, + ES_COVID19_FABA_WRITE_ALIAS=elastic_search_index.etl_config["write_alias"], + ): yield elastic_search_index elastic_search_index.delete_index() diff --git a/usaspending_api/conftest_helpers.py b/usaspending_api/conftest_helpers.py index 2808c1e3cf..5a3647d3a1 100644 --- a/usaspending_api/conftest_helpers.py +++ b/usaspending_api/conftest_helpers.py @@ -8,6 +8,7 @@ from pathlib import Path from string import Template +from usaspending_api.etl.elasticsearch_loader_helpers.index_config import create_load_alias from usaspending_api.common.sqs.sqs_handler import ( UNITTEST_FAKE_QUEUE_NAME, _FakeUnitTestFileBackedSQSQueue, @@ -24,7 +25,6 @@ transform_covid19_faba_data, transform_transaction_data, ) -from usaspending_api.etl.management.commands.es_configure import retrieve_index_template class TestElasticSearchIndex: @@ -38,15 +38,13 @@ def __init__(self, index_type): self.index_name = self._generate_index_name() self.alias_prefix = self.index_name self.client = Elasticsearch([settings.ES_HOSTNAME], timeout=settings.ES_TIMEOUT) - self.template = retrieve_index_template(f"{self.index_type.replace('-', '_')}_template") - self.mappings = json.loads(self.template)["mappings"] self.etl_config = { "load_type": self.index_type, "index_name": self.index_name, "query_alias_prefix": self.alias_prefix, "verbose": False, "verbosity": 0, - "write_alias": self.index_name + "-alias", + "write_alias": self.index_name + "-load-alias", "process_deletes": True, } self.worker = TaskSpec( @@ -67,17 +65,30 @@ def __init__(self, index_type): def delete_index(self): self.client.indices.delete(self.index_name, ignore_unavailable=True) - def update_index(self, **options): + def update_index(self, load_index: bool = True, **options): """ To ensure a fresh Elasticsearch index, delete the old one, update the materialized views, re-create the Elasticsearch index, create aliases for the index, and add contents. """ self.delete_index() - self.client.indices.create(index=self.index_name, body=self.template) + call_command("es_configure", "--template-only", f"--load-type={self.index_type}") + self.client.indices.create(index=self.index_name) create_award_type_aliases(self.client, self.etl_config) - self._add_contents(**options) - call_command("es_configure", "--load-type", self.index_type) + create_load_alias(self.client, self.etl_config) + self.etl_config["max_query_size"] = self._get_max_query_size() + if load_index: + self._add_contents(**options) + + def _get_max_query_size(self): + upper_name = "" + if self.index_type == "award": + upper_name = "AWARDS" + elif self.index_type == "covid19-faba": + upper_name = "COVID19_FABA" + elif self.index_type == "transaction": + upper_name = "TRANSACTIONS" + return getattr(settings, f"ES_{upper_name}_MAX_RESULT_WINDOW") def _add_contents(self, **options): """ @@ -145,9 +156,19 @@ def _add_contents(self, **options): # Force newly added documents to become searchable. 
self.client.indices.refresh(self.index_name) - @classmethod - def _generate_index_name(cls): - return f"test-{datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-%M-%S-%f')}-{generate_random_string()}" + def _generate_index_name(self): + required_suffix = "" + if self.index_type == "award": + required_suffix = "-" + settings.ES_AWARDS_NAME_SUFFIX + elif self.index_type == "transaction": + required_suffix = "-" + settings.ES_TRANSACTIONS_NAME_SUFFIX + elif self.index_type == "covid19-faba": + required_suffix = "-" + settings.ES_COVID19_FABA_NAME_SUFFIX + return ( + f"test-{datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-%M-%S-%f')}" + f"-{generate_random_string()}" + f"{required_suffix}" + ) def ensure_broker_server_dblink_exists(): diff --git a/usaspending_api/data/AssistanceSummary_download_readme.txt b/usaspending_api/data/AssistanceSummary_download_readme.txt index fdd0215457..f9f0d090f1 100644 --- a/usaspending_api/data/AssistanceSummary_download_readme.txt +++ b/usaspending_api/data/AssistanceSummary_download_readme.txt @@ -1,8 +1,8 @@ ========ABOUT THESE FILES======== -This ZIP file was generated from a specific Assistance Award Summary Page on USAspending.gov, located at https://www.usaspending.gov/#/award/[AWARD_ID] +This ZIP file was generated from a specific Assistance Award Summary Page on USAspending.gov, located at https://www.usaspending.gov/award/[AWARD_ID] -Data Element Definitions: A searchable Data Dictionary that defines every data element in the included files can be found here: https://www.usaspending.gov/#/download_center/data_dictionary. We have also included a copy in this download for convenience. Note that the dictionary is updated periodically. +Data Element Definitions: A searchable Data Dictionary that defines every data element in the included files can be found here: https://www.usaspending.gov/download_center/data_dictionary. We have also included a copy in this download for convenience. Note that the dictionary is updated periodically. Empty Files: When no data is available for a given file, its contents will only contain column headers (no records will be included). @@ -28,4 +28,4 @@ This file contains transaction-level data for all of the modifications made to t File: Data_Dictionary_Crosswalk.xlsx -This file contains the data dictionary covering all elements available for download from USAspending.gov. You can find an online and up-to-date version of the data dictionary here: https://www.usaspending.gov/#/download_center/data_dictionary \ No newline at end of file +This file contains the data dictionary covering all elements available for download from USAspending.gov. You can find an online and up-to-date version of the data dictionary here: https://www.usaspending.gov/download_center/data_dictionary \ No newline at end of file diff --git a/usaspending_api/data/COVID-19_download_readme.txt b/usaspending_api/data/COVID-19_download_readme.txt index 5e701cf8ba..8a850faaca 100644 --- a/usaspending_api/data/COVID-19_download_readme.txt +++ b/usaspending_api/data/COVID-19_download_readme.txt @@ -1,8 +1,8 @@ ========ABOUT THESE FILES======== -This ZIP file was generated from the COVID-19 Profile page on USAspending.gov, located at https://www.usaspending.gov/#/disaster/covid-19. +This ZIP file was generated from the COVID-19 Profile page on USAspending.gov, located at https://www.usaspending.gov/disaster/covid-19. 
-Data Element Definitions: A searchable Data Dictionary that defines every data element in the included files can be found here: https://www.usaspending.gov/#/download_center/data_dictionary. We have also included a copy in this download for convenience. The dictionary is updated periodically as the data model is improved or download headers change. +Data Element Definitions: A searchable Data Dictionary that defines every data element in the included files can be found here: https://www.usaspending.gov/download_center/data_dictionary. We have also included a copy in this download for convenience. The dictionary is updated periodically as the data model is improved or download headers change. Split Files: The # in all filenames defaults to 1; if the number of rows in a given file is large enough to warrant breaking it into multiple files, then additional files will be present and appended with 2, 3, etc. instead. @@ -14,7 +14,7 @@ This file captures the total resources made available under COVID-19 supplementa Each row of this file contains a Treasury Account Symbol (TAS) followed by COVID-19 Disaster Emergency Fund Code (DEFC)(one of 5 codes used to track COVID-19 supplemental appropriations from the 4 bills that authorized it), followed by the dollar balances associated with that TAS + DEFC combination. Non-COVID supplemental appropriations are NOT captured in this file, but this data is available from our Custom Account Download page. -The data in this file is reported to the Governmentwide Treasury Account Symbol Adjusted Trial Balance System (GTAS) on a monthly basis (other than fiscal year period 01) from audited agency financial systems. The data is provided at the TAS + DEFC level for increased granularity, but may rolled up by TAS, Federal Account, or DEFC as desired, using the appropriate columns. +The data in this file is reported to the Government wide Treasury Account Symbol Adjusted Trial Balance System (GTAS) on a monthly basis (other than fiscal year period 01) from audited agency financial systems. The data is provided at the TAS + DEFC level for increased granularity, but may rolled up by TAS, Federal Account, or DEFC as desired, using the appropriate columns. Only the latest relevant data is included in this file. For the status of these dollar amounts at earlier points in time, please visit Custom Account download and select the particular period of interest. @@ -63,4 +63,4 @@ The data in this file is primarily sourced from that reported by prime grant rec File: Data_Dictionary_Crosswalk.xlsx -This file contains the data dictionary covering all elements available for download from USAspending.gov. You can find an online and up-to-date version of the data dictionary here: https://www.usaspending.gov/#/download_center/data_dictionary +This file contains the data dictionary covering all elements available for download from USAspending.gov. 
You can find an online and up-to-date version of the data dictionary here: https://www.usaspending.gov/download_center/data_dictionary diff --git a/usaspending_api/data/ContractSummary_download_readme.txt b/usaspending_api/data/ContractSummary_download_readme.txt index 432bebb1e7..600d2b2524 100644 --- a/usaspending_api/data/ContractSummary_download_readme.txt +++ b/usaspending_api/data/ContractSummary_download_readme.txt @@ -1,8 +1,8 @@ ========ABOUT THESE FILES======== -This ZIP file was generated from a specific Contract Award Summary Page on USAspending.gov, located at https://www.usaspending.gov/#/award/[AWARD_ID] +This ZIP file was generated from a specific Contract Award Summary Page on USAspending.gov, located at https://www.usaspending.gov/award/[AWARD_ID] -Data Element Definitions: A searchable Data Dictionary that defines every data element in the included files can be found here: https://www.usaspending.gov/#/download_center/data_dictionary. We have also included a copy in this download for convenience. Note that the dictionary is updated periodically. +Data Element Definitions: A searchable Data Dictionary that defines every data element in the included files can be found here: https://www.usaspending.gov/download_center/data_dictionary. We have also included a copy in this download for convenience. Note that the dictionary is updated periodically. Empty Files: When no data is available for a given file, its contents will only contain column headers (no records will be included). @@ -28,4 +28,4 @@ This file contains transaction-level data for all of the modifications made to t File: Data_Dictionary_Crosswalk.xlsx -This file contains the data dictionary covering all elements available for download from USAspending.gov. You can find an online and up-to-date version of the data dictionary here: https://www.usaspending.gov/#/download_center/data_dictionary \ No newline at end of file +This file contains the data dictionary covering all elements available for download from USAspending.gov. You can find an online and up-to-date version of the data dictionary here: https://www.usaspending.gov/download_center/data_dictionary \ No newline at end of file diff --git a/usaspending_api/data/idv_download_readme.txt b/usaspending_api/data/idv_download_readme.txt index f0a14ae886..17e9ae1bc6 100644 --- a/usaspending_api/data/idv_download_readme.txt +++ b/usaspending_api/data/idv_download_readme.txt @@ -3,7 +3,7 @@ # Lines starting with # are ignored. ========ABOUT THESE FILES======== -Data Element Definitions: A searchable Data Dictionary that defines every data element in the included files can be found here: https://www.usaspending.gov/#/download_center/data_dictionary. We have also included a copy in this download for convenience. +Data Element Definitions: A searchable Data Dictionary that defines every data element in the included files can be found here: https://www.usaspending.gov/download_center/data_dictionary. We have also included a copy in this download for convenience. Empty Files: When no data is available for a given file, its contents will only contain column headers (no records will be included). @@ -27,4 +27,4 @@ This file contains transaction-level data representing all of the modifications File: Data_Dictionary_Crosswalk.xlsx -This file contains the data dictionary covering all elements available for download from USAspending.gov. 
You can find an online and up-to-date version of the data dictionary here: https://www.usaspending.gov/#/download_center/data_dictionary +This file contains the data dictionary covering all elements available for download from USAspending.gov. You can find an online and up-to-date version of the data dictionary here: https://www.usaspending.gov/download_center/data_dictionary diff --git a/usaspending_api/disaster/management/sql/disaster_covid19_file_d1_awards.sql b/usaspending_api/disaster/management/sql/disaster_covid19_file_d1_awards.sql index e785a928ee..6cf3c90cb5 100644 --- a/usaspending_api/disaster/management/sql/disaster_covid19_file_d1_awards.sql +++ b/usaspending_api/disaster/management/sql/disaster_covid19_file_d1_awards.sql @@ -270,7 +270,7 @@ SELECT "awards"."officer_4_amount" AS "highly_compensated_officer_4_amount", "awards"."officer_5_name" AS "highly_compensated_officer_5_name", "awards"."officer_5_amount" AS "highly_compensated_officer_5_amount", - CONCAT('https://www.usaspending.gov/#/award/', urlencode("awards"."generated_unique_award_id"), '/') AS "usaspending_permalink", + CONCAT('https://www.usaspending.gov/award/', urlencode("awards"."generated_unique_award_id"), '/') AS "usaspending_permalink", "transaction_fpds"."last_modified" AS "last_modified_date" FROM "awards" INNER JOIN "transaction_fpds" ON ("awards"."latest_transaction_id" = "transaction_fpds"."transaction_id") diff --git a/usaspending_api/disaster/management/sql/disaster_covid19_file_d2_awards.sql b/usaspending_api/disaster/management/sql/disaster_covid19_file_d2_awards.sql index 02e0793287..a0e090ec51 100644 --- a/usaspending_api/disaster/management/sql/disaster_covid19_file_d2_awards.sql +++ b/usaspending_api/disaster/management/sql/disaster_covid19_file_d2_awards.sql @@ -85,7 +85,7 @@ SELECT "awards"."officer_4_amount" AS "highly_compensated_officer_4_amount", "awards"."officer_5_name" AS "highly_compensated_officer_5_name", "awards"."officer_5_amount" AS "highly_compensated_officer_5_amount", - CONCAT('https://www.usaspending.gov/#/award/', urlencode("awards"."generated_unique_award_id"), '/') AS "usaspending_permalink", + CONCAT('https://www.usaspending.gov/award/', urlencode("awards"."generated_unique_award_id"), '/') AS "usaspending_permalink", "transaction_fabs"."modified_at" AS "last_modified_date" FROM "awards" INNER JOIN "transaction_fabs" ON ("awards"."latest_transaction_id" = "transaction_fabs"."transaction_id") diff --git a/usaspending_api/disaster/management/sql/disaster_covid19_file_f_contracts.sql b/usaspending_api/disaster/management/sql/disaster_covid19_file_f_contracts.sql index a3f1ea5d8d..f78d599437 100644 --- a/usaspending_api/disaster/management/sql/disaster_covid19_file_f_contracts.sql +++ b/usaspending_api/disaster/management/sql/disaster_covid19_file_f_contracts.sql @@ -100,7 +100,7 @@ SELECT "broker_subaward"."sub_high_comp_officer4_amount" AS "subawardee_highly_compensated_officer_4_amount", "broker_subaward"."sub_high_comp_officer5_full_na" AS "subawardee_highly_compensated_officer_5_name", "broker_subaward"."sub_high_comp_officer5_amount" AS "subawardee_highly_compensated_officer_5_amount", - CONCAT ('https://www.usaspending.gov/#/award/' , urlencode("awards"."generated_unique_award_id"), '/') AS "usaspending_permalink", + CONCAT ('https://www.usaspending.gov/award/' , urlencode("awards"."generated_unique_award_id"), '/') AS "usaspending_permalink", "broker_subaward"."date_submitted" AS "subaward_fsrs_report_last_modified_date" FROM "subaward" INNER JOIN "awards" ON 
("subaward"."award_id" = "awards"."id") diff --git a/usaspending_api/disaster/management/sql/disaster_covid19_file_f_grants.sql b/usaspending_api/disaster/management/sql/disaster_covid19_file_f_grants.sql index 474c8b4b3b..5a070949f8 100644 --- a/usaspending_api/disaster/management/sql/disaster_covid19_file_f_grants.sql +++ b/usaspending_api/disaster/management/sql/disaster_covid19_file_f_grants.sql @@ -96,7 +96,7 @@ SELECT "broker_subaward"."sub_high_comp_officer4_amount" AS "subawardee_highly_compensated_officer_4_amount", "broker_subaward"."sub_high_comp_officer5_full_na" AS "subawardee_highly_compensated_officer_5_name", "broker_subaward"."sub_high_comp_officer5_amount" AS "subawardee_highly_compensated_officer_5_amount", - CONCAT('https://www.usaspending.gov/#/award/', urlencode("awards"."generated_unique_award_id"), '/') AS "usaspending_permalink", + CONCAT('https://www.usaspending.gov/award/', urlencode("awards"."generated_unique_award_id"), '/') AS "usaspending_permalink", "broker_subaward"."date_submitted" AS "subaward_fsrs_report_last_modified_date" FROM "subaward" INNER JOIN "awards" ON ("subaward"."award_id" = "awards"."id") diff --git a/usaspending_api/download/helpers/download_annotation_functions.py b/usaspending_api/download/helpers/download_annotation_functions.py index 434beb7b08..14df735d67 100644 --- a/usaspending_api/download/helpers/download_annotation_functions.py +++ b/usaspending_api/download/helpers/download_annotation_functions.py @@ -27,7 +27,7 @@ from usaspending_api.settings import HOST -AWARD_URL = f"{HOST}/#/award/" if "localhost" in HOST else f"https://{HOST}/#/award/" +AWARD_URL = f"{HOST}/award/" if "localhost" in HOST else f"https://{HOST}/award/" def filter_limit_to_closed_periods(submission_query_path: str = "") -> Q: diff --git a/usaspending_api/download/management/commands/populate_monthly_delta_files.py b/usaspending_api/download/management/commands/populate_monthly_delta_files.py index 2f6c241658..b562423fd4 100644 --- a/usaspending_api/download/management/commands/populate_monthly_delta_files.py +++ b/usaspending_api/download/management/commands/populate_monthly_delta_files.py @@ -226,15 +226,12 @@ def split_transaction_id(tid): return pd.Series(tid.split("_") + [tid]) def add_deletion_records(self, source_path, working_dir, award_type, agency_code, source, generate_since): - """ Retrieve deletion files from S3 and append necessary records to the end of the file """ + """Retrieve deletion files from S3 and append necessary records to the end of the file""" logger.info("Retrieving deletion records from S3 files and appending to the CSV") # Retrieve all SubtierAgency IDs within this TopTierAgency - subtier_agencies = list( - SubtierAgency.objects.filter(agency__toptier_agency__toptier_code=agency_code).values_list( - "subtier_code", flat=True - ) - ) + filter = {"agency__toptier_agency__toptier_code": agency_code} + subtier_agencies = list(SubtierAgency.objects.filter(**filter).values_list("subtier_code", flat=True)) # Create a list of keys in the bucket that match the date range we want bucket = boto3.resource("s3", region_name=settings.USASPENDING_AWS_REGION).Bucket( @@ -246,7 +243,7 @@ def add_deletion_records(self, source_path, working_dir, award_type, agency_code match_date = self.check_regex_match(award_type, key.key, generate_since) if match_date: # Create a local copy of the deletion file - delete_filepath = "{}{}".format(working_dir, key.key) + delete_filepath = f"{working_dir}{key.key}" bucket.download_file(key.key, delete_filepath) df = 
pd.read_csv(delete_filepath) os.remove(delete_filepath) @@ -257,6 +254,7 @@ def add_deletion_records(self, source_path, working_dir, award_type, agency_code .apply(self.split_transaction_id) .replace("-none-", "") .replace("-NONE-", "") + .reset_index() # adding to handle API bug which caused a Series to be returned .rename(columns=AWARD_MAPPINGS[award_type]["column_headers"]) ) @@ -270,13 +268,13 @@ def add_deletion_records(self, source_path, working_dir, award_type, agency_code # Reorder columns to make it CSV-ready, and append df = self.organize_deletion_columns(source, df, award_type, match_date) - logger.info("Found {} deletion records to include".format(len(df.index))) + logger.info(f"Found {len(df.index):,} deletion records to include") all_deletions = all_deletions.append(df, ignore_index=True) - # Only append to file if there are any records if len(all_deletions.index) == 0: logger.info("No deletion records to append to file") else: + logger.info(f"Appending {len(all_deletions.index):,} records to file") self.add_deletions_to_file(all_deletions, award_type, source_path) def organize_deletion_columns(self, source, dataframe, award_type, match_date): diff --git a/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py b/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py new file mode 100644 index 0000000000..1bdf2d4350 --- /dev/null +++ b/usaspending_api/download/tests/integration/test_populate_monthly_delta_files.py @@ -0,0 +1,480 @@ +import zipfile +import datetime +import pytest +import os + +from django.core.management import call_command +from os import listdir +from model_mommy import mommy +from csv import reader + +from usaspending_api.settings import HOST +from usaspending_api.awards.models import TransactionDelta +from usaspending_api.common.helpers.generic_helper import generate_test_db_connection_string +from usaspending_api.download.v2.download_column_historical_lookups import query_paths + + +@pytest.fixture +@pytest.mark.django_db(transaction=True) +def monthly_download_delta_data(db, monkeypatch): + mommy.make("references.ToptierAgency", toptier_agency_id=1, toptier_code="001", name="Test_Agency") + mommy.make("references.Agency", pk=1, toptier_agency_id=1) + mommy.make("references.ToptierAgency", toptier_agency_id=2, toptier_code="002", name="Test_Agency 2") + mommy.make("references.Agency", pk=2, toptier_agency_id=2) + i = 1 + fiscal_year = 2020 + mommy.make( + "awards.Award", + id=i, + generated_unique_award_id="CONT_AWD_1_0_0", + is_fpds=True, + type="B", + type_description="Purchase Order", + piid=f"piid{i}", + awarding_agency_id=1, + funding_agency_id=1, + fiscal_year=fiscal_year, + ) + mommy.make("awards.FinancialAccountsByAwards", award_id=i) + mommy.make( + "awards.TransactionNormalized", + award_id=i, + id=i, + is_fpds=True, + transaction_unique_id=i, + usaspending_unique_transaction_id="", + type="B", + type_description="Purchase Order", + period_of_performance_start_date=datetime.datetime(fiscal_year, 5, 7), + period_of_performance_current_end_date=datetime.datetime(fiscal_year, 5, 7), + action_date=datetime.datetime(fiscal_year, 5, 7), + federal_action_obligation=100, + modification_number="", + description="a description", + drv_award_transaction_usaspend=1, + drv_current_total_award_value_amount_adjustment=1, + drv_potential_total_award_value_amount_adjustment=1, + last_modified_date=datetime.datetime(fiscal_year, 5, 7), + certified_date=datetime.datetime(fiscal_year, 5, 7), + 
create_date=datetime.datetime(fiscal_year, 5, 7), + update_date=datetime.datetime(fiscal_year, 5, 7), + fiscal_year=fiscal_year, + awarding_agency_id=1, + funding_agency_id=1, + original_loan_subsidy_cost=100.0, + face_value_loan_guarantee=100.0, + funding_amount=100.0, + non_federal_funding_amount=100.0, + unique_award_key=1, + business_categories=[], + ) + mommy.make( + "awards.TransactionFPDS", + transaction_id=i, + detached_award_procurement_id=i, + detached_award_proc_unique=f"test{i}", + piid=f"piid{i}", + agency_id=1, + awarding_sub_tier_agency_c="001", + awarding_sub_tier_agency_n="Test_Agency", + awarding_agency_code="001", + awarding_agency_name="Test_Agency", + parent_award_id=f"000{i}", + award_modification_amendme="1", + contract_award_type="B", + contract_award_type_desc="Contract", + created_at=datetime.datetime(fiscal_year, 5, 7), + updated_at=datetime.datetime(fiscal_year, 5, 7), + ) + TransactionDelta.objects.update_or_create_transaction(i) + + monkeypatch.setenv("DOWNLOAD_DATABASE_URL", generate_test_db_connection_string()) + + +@pytest.mark.django_db(transaction=True) +def test_all_agencies(monthly_download_delta_data, monkeypatch): + call_command("populate_monthly_delta_files", "--debugging_skip_deleted", "--last_date=2020-12-31") + file_list = listdir("csv_downloads") + formatted_date = datetime.datetime.strftime(datetime.date.today(), "%Y%m%d") + assert f"FY(All)_All_Contracts_Delta_{formatted_date}.zip" in file_list + os.remove(os.path.normpath(f"csv_downloads/FY(All)_All_Contracts_Delta_{formatted_date}.zip")) + + +@pytest.mark.django_db(transaction=True) +def test_specific_agency(monthly_download_delta_data, monkeypatch): + contract_data = [ + "C", + "1", + "test1", + "CONT_AWD_1_0_0", + "piid1", + "1", + "", + "", + "", + "0001", + "", + "100.00", + "", + "", + "", + "", + "", + "", + "", + "", + "2020-05-07", + "2020", + "", + "", + "", + "", + "", + "001", + "Test_Agency", + "001", + "Test_Agency", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "B", + "Contract", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + f"{HOST}/award/CONT_AWD_1_0_0/" if "localhost" in HOST else 
f"https://{HOST}/award/CONT_AWD_1_0_0/", + "", + ] + call_command("populate_monthly_delta_files", "--agencies=1", "--debugging_skip_deleted", "--last_date=2020-12-31") + file_list = listdir("csv_downloads") + formatted_date = datetime.datetime.strftime(datetime.date.today(), "%Y%m%d") + assert f"FY(All)_001_Contracts_Delta_{formatted_date}.zip" in file_list + with zipfile.ZipFile( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{formatted_date}.zip"), "r" + ) as zip_ref: + zip_ref.extractall("csv_downloads") + assert f"FY(All)_001_Contracts_Delta_{formatted_date}_1.csv" in listdir("csv_downloads") + with open( + os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{formatted_date}_1.csv"), "r" + ) as contract_file: + csv_reader = reader(contract_file) + row_count = 0 + for row in csv_reader: + if row_count == 0: + assert row == [s[:63] for s in query_paths["transaction"]["d1"].keys()] + else: + assert row == contract_data + row_count += 1 + assert row_count == 2 + os.remove(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{formatted_date}.zip")) + os.remove(os.path.normpath(f"csv_downloads/FY(All)_001_Contracts_Delta_{formatted_date}_1.csv")) + + +@pytest.mark.django_db(transaction=True) +def test_award_types(client, monthly_download_delta_data, monkeypatch): + call_command( + "populate_monthly_delta_files", + "--agencies=1", + "--award_types=assistance", + "--debugging_skip_deleted", + "--last_date=2020-12-31", + ) + file_list = listdir("csv_downloads") + formatted_date = datetime.datetime.strftime(datetime.date.today(), "%Y%m%d") + assert f"FY(All)_001_Assistance_Delta_{formatted_date}.zip" not in file_list + + mommy.make( + "awards.Award", + id=2, + is_fpds=False, + type="02", + type_description="Block Grant", + fain="fain2", + awarding_agency_id=2, + funding_agency_id=2, + fiscal_year=2020, + ) + mommy.make( + "awards.TransactionNormalized", + award_id=2, + id=2, + is_fpds=False, + transaction_unique_id=2, + type="02", + type_description="Block Grant", + period_of_performance_start_date=datetime.datetime(2020, 5, 7), + period_of_performance_current_end_date=datetime.datetime(2020, 5, 7), + action_date=datetime.datetime(2020, 5, 7), + last_modified_date=datetime.datetime(2020, 5, 7), + certified_date=datetime.datetime(2020, 5, 7), + create_date=datetime.datetime(2020, 5, 7), + update_date=datetime.datetime(2020, 5, 7), + fiscal_year=2020, + awarding_agency_id=1, + funding_agency_id=1, + unique_award_key=2, + ) + mommy.make( + "awards.TransactionFABS", + transaction_id=2, + fain="fain2", + awarding_agency_code="001", + awarding_sub_tier_agency_c=1, + awarding_agency_name="Test_Agency", + awarding_sub_tier_agency_n="Test_Agency", + ) + mommy.make("awards.TransactionDelta", transaction_id=2, created_at=datetime.datetime.now()) + call_command( + "populate_monthly_delta_files", + "--agencies=1", + "--award_types=assistance", + "--debugging_skip_deleted", + "--last_date=2020-12-31", + ) + file_list = listdir("csv_downloads") + formatted_date = datetime.datetime.strftime(datetime.date.today(), "%Y%m%d") + assert f"FY(All)_001_Assistance_Delta_{formatted_date}.zip" in file_list + os.remove(os.path.normpath(f"csv_downloads/FY(All)_001_Assistance_Delta_{formatted_date}.zip")) diff --git a/usaspending_api/download/tests/integration/test_populate_monthly_files.py b/usaspending_api/download/tests/integration/test_populate_monthly_files.py new file mode 100644 index 0000000000..9c2ebc5db1 --- /dev/null +++ 
b/usaspending_api/download/tests/integration/test_populate_monthly_files.py @@ -0,0 +1,629 @@ +import pytest +import datetime +import zipfile +import os + +from django.core.management import call_command +from csv import reader +from model_mommy import mommy + +from usaspending_api.download.lookups import JOB_STATUS +from usaspending_api.download.v2.download_column_historical_lookups import query_paths + + +def delete_files(): + file_list = os.listdir("csv_downloads") + for file in file_list: + if file != "README.md": + os.remove(os.path.normpath(f"csv_downloads/{file}")) + + +def generate_contract_data(fiscal_year, i): + return [ + i, + i, + f"piid{i}", + "" # "contract_transaction_unique_key", "contract_award_unique_key", "award_id_piid", "modification_number", + "", + "", + "", + "", # "transaction_number", "parent_award_agency_id", "parent_award_agency_name", "parent_award_id_piid", + "", + "", + "", # "parent_award_modification_number", "federal_action_obligation", "total_dollars_obligated", + "", + "", + "", # "base_and_exercised_options_value", "current_total_value_of_award", "base_and_all_options_value", + "", + "", + "", # "potential_total_value_of_award", "disaster_emergency_fund_codes_for_overall_award", + "", + "", # "outlayed_amount_funded_by_COVID-19_supplementals_for_overall_award", "obligated_amount_funded_by_COVID-19_supplementals_for_overall_award", + "", + "", # "action_date", "action_date_fiscal_year", + f"05/07{fiscal_year}", + f"05/07/{fiscal_year}", + "", # "period_of_performance_start_date", "period_of_performance_current_end_date", "period_of_performance_potential_end_date", + "", + "", + "001", + "Test_Agency", + "001", # "ordering_period_end_date", "solicitation_date", "awarding_agency_code", "awarding_agency_name", "awarding_sub_agency_code", + "Test_Agency", + "", + "", + "", + "", # "awarding_sub_agency_name", "awarding_office_code", "awarding_office_name", "funding_agency_code", "funding_agency_name", + "", + "", + "", + "", # "funding_sub_agency_code", "funding_sub_agency_name", "funding_office_code", "funding_office_name", + "", + "", + "", # "treasury_accounts_funding_this_award", "federal_accounts_funding_this_award", "object_classes_funding_this_award", + "", + "", + "", + "", # "program_activities_funding_this_award", "foreign_funding", "foreign_funding_description", "sam_exception", + "", + "", + "", + "", + "", # "sam_exception_description", "recipient_duns", "recipient_name", "recipient_doing_business_as_name", "cage_code", + "", + "", + "", + "", # "recipient_parent_duns", "recipient_parent_name", "recipient_country_code", "recipient_country_name", + "", + "", + "", + "", # "recipient_address_line_1", "recipient_address_line_2", "recipient_city_name", "recipient_county_name", + "", + "", + "", + "", # "recipient_state_code", "recipient_state_name", "recipient_zip_4_code", "recipient_congressional_district", + "", + "", + "", # "recipient_phone_number", "recipient_fax_number", "primary_place_of_performance_country_code", + "", + "", # "primary_place_of_performance_country_name", "primary_place_of_performance_city_name", + "", + "", # "primary_place_of_performance_county_name", "primary_place_of_performance_state_code", + "", + "", # "primary_place_of_performance_state_name", "primary_place_of_performance_zip_4", + "", + "", + "", + "", # "primary_place_of_performance_congressional_district", "award_or_idv_flag", "award_type_code", "award_type", + "", + "", + "", + "", + "", # "idv_type_code", "idv_type", "multiple_or_single_award_idv_code", 
"multiple_or_single_award_idv", "type_of_idc_code", + "", + "", + "", + "", + "", # "type_of_idc", "type_of_contract_pricing_code", "type_of_contract_pricing", "award_description", "action_type_code", + "", + "", + "", + "", # "action_type", "solicitation_identifier", "number_of_actions", "inherently_governmental_functions", + "", + "", + "", # "inherently_governmental_functions_description", "product_or_service_code", "product_or_service_code_description", + "", + "", + "", + "", # "contract_bundling_code", "contract_bundling ", "dod_claimant_program_code", "dod_claimant_program_description", + "", + "", + "", + "", # "naics_code", "naics_description", "recovered_materials_sustainability_code", "recovered_materials_sustainability", + "", + "", + "", # "domestic_or_foreign_entity_code", "domestic_or_foreign_entity", "dod_acquisition_program_code", + "", + "", # "dod_acquisition_program_description", "information_technology_commercial_item_category_code", + "", + "", + "", # "information_technology_commercial_item_category ", "epa_designated_product_code", "epa_designated_product", + "", + "", + "", # "country_of_product_or_service_origin_code","country_of_product_or_service_origin", "place_of_manufacture_code ", + "", + "", + "", + "", + "", # place_of_manufacture", "subcontracting_plan_code", "subcontracting_plan", "extent_competed_code", "extent_competed", + "", + "", + "", + "", # "solicitation_procedures_code", "solicitation_procedures", "type_of_set_aside_code", "type_of_set_aside", + "", + "", + "", + "", + "", # "evaluated_preference_code", "evaluated_preference", "research_code", "research", "fair_opportunity_limited_sources_code", + "", + "", + "", # "fair_opportunity_limited_sources", "other_than_full_and_open_competition_code", "other_than_full_and_open_competition", + "", + "", + "", # "number_of_offers_received", "commercial_item_acquisition_procedures_code", "commercial_item_acquisition_procedures", + "", + "", # "small_business_competitiveness_demonstration_program", "simplified_procedures_for_certain_commercial_items_code", + "", + "", + "", + "", # "simplified_procedures_for_certain_commercial_items", "a76_fair_act_action_code", "a76_fair_act_action", "fed_biz_opps_code", + "", + "", + "", + "", # "fed_biz_opps", "local_area_set_aside_code", "local_area_set_aside", "price_evaluation_adjustment_preference_percent_difference", + "", + "", + "", # "clinger_cohen_act_planning_code", "clinger_cohen_act_planning", "materials_supplies_articles_equipment_code", + "", + "", + "", + "", # "materials_supplies_articles_equipment", "labor_standards_code", "labor_standards", "construction_wage_rate_requirements_code", + "", + "", + "", # "construction_wage_rate_requirements", "interagency_contracting_authority_code", "interagency_contracting_authority", + "", + "", + "", + "", + "", # "other_statutory_authority", "program_acronym", "parent_award_type_code", "parent_award_type", "parent_award_single_or_multiple_code", + "", + "", + "", + "", # "parent_award_single_or_multiple", "major_program", "national_interest_action_code", "national_interest_action", + "", + "", + "", + "", # "cost_or_pricing_data_code", "cost_or_pricing_data", "cost_accounting_standards_clause_code", "cost_accounting_standards_clause", + "", + "", + "", + "", # "government_furnished_property_code", "government_furnished_property", "sea_transportation_code", "sea_transportation", + "", + "", + "", + "", # "undefinitized_action_code", "undefinitized_action", "consolidated_contract_code", "consolidated_contract", + "", 
+ "", + "", # "performance_based_service_acquisition_code", "performance_based_service_acquisition", "multi_year_contract_code", + "", + "", + "", + "", # "multi_year_contract", "contract_financing_code", "contract_financing", "purchase_card_as_payment_method_code", + "", + "", # "purchase_card_as_payment_method", "contingency_humanitarian_or_peacekeeping_operation_code", + "", + "", # "contingency_humanitarian_or_peacekeeping_operation", "alaskan_native_corporation_owned_firm", + "", + "", + "", # "american_indian_owned_business", "indian_tribe_federally_recognized", "native_hawaiian_organization_owned_firm", + "", + "", + "", + "", # "tribally_owned_firm", "veteran_owned_business", "service_disabled_veteran_owned_business", "woman_owned_business", + "", + "", + "", # "women_owned_small_business", "economically_disadvantaged_women_owned_small_business", "joint_venture_women_owned_small_business", + "", + "", + "", # "joint_venture_economic_disadvantaged_women_owned_small_bus", "minority_owned_business", "subcontinent_asian_asian_indian_american_owned_business", + "", + "", + "", + "", # "asian_pacific_american_owned_business", "black_american_owned_business", "hispanic_american_owned_business", "native_american_owned_business", + "", + "", + "", # "other_minority_owned_business", "contracting_officers_determination_of_business_size", "contracting_officers_determination_of_business_size_code", + "", + "", + "", + "", # "emerging_small_business", "community_developed_corporation_owned_firm", "labor_surplus_area_firm", "us_federal_government", + "", + "", + "", + "", + "", # "federally_funded_research_and_development_corp", "federal_agency", "us_state_government", "us_local_government", "city_local_government", + "", + "", + "", + "", # "county_local_government", "inter_municipal_local_government", "local_government_owned", "municipality_local_government", + "", + "", + "", + "", # "school_district_local_government ", "township_local_government", "us_tribal_government ", "foreign_government", + "", + "", + "", + "", + "", # "organizational_type", "corporate_entity_not_tax_exempt", "corporate_entity_tax_exempt", "partnership_or_limited_liability_partnership", + "", + "", + "", + "", # "sole_proprietorship", "small_agricultural_cooperative", "international_organization", "us_government_entity", + "", + "", + "", + "", + "", # "community_development_corporation", "domestic_shelter", "educational_institution", "foundation", "hospital_flag", + "", + "", + "", + "", + "", # "manufacturer_of_goods", "veterinary_hospital", "hispanic_servicing_institution", "receives_contracts", "receives_financial_assistance", + "", + "", + "", + "", # "receives_contracts_and_financial_assistance ", "airport_authority", "council_of_governments", "housing_authorities_public_tribal", + "", + "", + "", + "", + "", # "interstate_entity", "planning_commission", "port_authority", "transit_authority", "subchapter_scorporation", + "", + "", + "", + "", # "limited_liability_corporation ", "foreign_owned", "for_profit_organization", "nonprofit_organization", + "", + "", + "", + "", # "other_not_for_profit_organization", "the_ability_one_program", "private_university_or_college ", "state_controlled_institution_of_higher_learning", + "", + "", + "", + "", + "", # "1862_land_grant_college", "1890_land_grant_college", "1994_land_grant_college", "minority_institution", "historically_black_college ", + "", + "", + "", + "", # "tribal_college", "alaskan_native_servicing_institution", "native_hawaiian_servicing_institution", 
"school_of_forestry", + "", + "", + "", + "", # "veterinary_college", "dot_certified_disadvantage", "self_certified_small_disadvantaged_business", "small_disadvantaged_business", + "", + "", + "", # "c8a_program_participant", "historically_underutilized_business_zone_hubzone_firm", "sba_certified_8a_joint_venture", + "", + "", + "", # "highly_compensated_officer_1_name", "highly_compensated_officer_1_amount", "highly_compensated_officer_2_name", + "", + "", + "", # "highly_compensated_officer_2_amount", "highly_compensated_officer_3_name", "highly_compensated_officer_3_amount", + "", + "", + "", # "highly_compensated_officer_4_name", "highly_compensated_officer_4_amount", "highly_compensated_officer_5_name", + "", + "www.usaspending.gov/award/CONT_AWD_1_0_0", + f"05/07/{fiscal_year}", # "highly_compensated_officer_5_amount", "usaspending_permalink", "last_modified_date" + ] + + +def generate_assistance_data(fiscal_year, i): + return [ + i + 100, + i + 100, + f"fain{i+100}", + "", # "assistance_transaction_unique_key", "assistance_award_unique_key", "award_id_fain", "modification_number", + "", + "", + "", + "", + "", # "award_id_uri", "sai_number", "federal_action_obligation", "total_obligated_amount", "non_federal_funding_amount", + "", + "", + "", + "", # "total_non_federal_funding_amount", "face_value_of_loan", "original_loan_subsidy_cost", "total_face_value_of_loan", + "", + "", # "total_loan_subsidy_cost", "disaster_emergency_fund_codes_for_overall_award", + "", + "", + "", # "outlayed_amount_funded_by_COVID-19_supplementals_for_overall_award", "obligated_amount_funded_by_COVID-19_supplementals_for_overall_award","action_date", + "", + "", + "", # "action_date_fiscal_year", "period_of_performance_start_date", "period_of_performance_current_end_date", + "001", + "Test_Agency", + "001", + "Test_Agency", # "awarding_agency_code", "awarding_agency_name", "awarding_sub_agency_code", "awarding_sub_agency_name", + "", + "", + "", + "", # "awarding_office_code", "awarding_office_name", "funding_agency_code", "funding_agency_name", + "", + "", + "", + "", # "funding_sub_agency_code", "funding_sub_agency_name", "funding_office_code", "funding_office_name", + "", + "", # "treasury_accounts_funding_this_award", "federal_accounts_funding_this_award", + "", + "", # "object_classes_funding_this_award", "recipient_duns", "recipient_name", + "", + "", + "", + "", # "recipient_parent_duns", "recipient_parent_name", "recipient_country_code", "recipient_country_name", + "", + "", + "", + "", # "recipient_address_line_1", "recipient_address_line_2", "recipient_city_code", "recipient_city_name", + "", + "", + "", + "", # "recipient_county_code", "recipient_county_name", "recipient_state_code", "recipient_state_name", + "", + "", + "", + "", # "recipient_zip_code", "recipient_zip_last_4_code", "recipient_congressional_district", "recipient_foreign_city_name", + "", + "", + "", + "", # "recipient_foreign_province_name", "recipient_foreign_postal_code", "primary_place_of_performance_scope", + "", + "", # "primary_place_of_performance_country_code", "primary_place_of_performance_country_name", + "", + "", # "primary_place_of_performance_code", "primary_place_of_performance_city_name", + "", + "", # "primary_place_of_performance_county_code", "primary_place_of_performance_county_name", + "", + "", # "primary_place_of_performance_state_name", "primary_place_of_performance_zip_4", + "", + "", # "primary_place_of_performance_congressional_district","primary_place_of_performance_foreign_location", + "", + "", + 
"02", + "Block Grant", + "", # "cfda_number", "cfda_title", "assistance_type_code", "assistance_type_description", "award_description", + "", + "", + "", # "business_funds_indicator_code", "business_funds_indicator_description", "business_types_code", + "", + "", + "", # "business_types_description", "correction_delete_indicator_code", "correction_delete_indicator_description", + "", + "", + "", + "", # "action_type_code", "action_type_description", "record_type_code", "record_type_description", + "", + "", + "", # "highly_compensated_officer_1_name", "highly_compensated_officer_1_amount", "highly_compensated_officer_2_name", + "", + "", + "", # "highly_compensated_officer_2_amount", "highly_compensated_officer_3_name", "highly_compensated_officer_3_amount", + "", + "", + "", # "highly_compensated_officer_4_name", "highly_compensated_officer_4_amount", "highly_compensated_officer_5_name", + "", + "http://www.usaspending.gov/award/ASST_NON_2_0_0", + f"05/07/{fiscal_year}", # "highly_compensated_officer_5_amount", "usaspending_permalink", "last_modified_date" + ] + + +@pytest.fixture +def monthly_download_data(db, monkeypatch): + for js in JOB_STATUS: + mommy.make("download.JobStatus", job_status_id=js.id, name=js.name, description=js.desc) + + mommy.make("references.ToptierAgency", toptier_agency_id=1, toptier_code="001", name="Test_Agency") + mommy.make("references.Agency", pk=1, toptier_agency_id=1) + mommy.make("references.ToptierAgency", toptier_agency_id=2, toptier_code="002", name="Test_Agency 2") + mommy.make("references.Agency", pk=2, toptier_agency_id=2) + i = 1 + for fiscal_year in range(2001, 2021): + mommy.make( + "awards.Award", + id=i, + generated_unique_award_id="CONT_AWD_1_0_0", + is_fpds=True, + type="B", + type_description="Purchase Order", + piid=f"piid{i}", + awarding_agency_id=1, + funding_agency_id=1, + latest_transaction_id=i, + fiscal_year=fiscal_year, + ) + mommy.make( + "awards.TransactionFPDS", + transaction_id=i, + detached_award_procurement_id=i, + detached_award_proc_unique=f"test{i}", + piid=f"piid{i}", + agency_id=1, + awarding_sub_tier_agency_c="001", + awarding_sub_tier_agency_n="Test_Agency", + awarding_agency_code="001", + awarding_agency_name="Test_Agency", + parent_award_id=f"000{i}", + award_modification_amendme="1", + contract_award_type="B", + contract_award_type_desc="Contract", + created_at=datetime.datetime(fiscal_year, 5, 7), + updated_at=datetime.datetime(fiscal_year, 5, 7), + ) + mommy.make( + "awards.TransactionNormalized", + award_id=i, + id=i, + is_fpds=True, + transaction_unique_id=i, + usaspending_unique_transaction_id="", + type="B", + type_description="Purchase Order", + period_of_performance_start_date=datetime.datetime(fiscal_year, 5, 7), + period_of_performance_current_end_date=datetime.datetime(fiscal_year, 5, 7), + action_date=datetime.datetime(fiscal_year, 5, 7), + federal_action_obligation=100, + modification_number="", + description="a description", + drv_award_transaction_usaspend=1, + drv_current_total_award_value_amount_adjustment=1, + drv_potential_total_award_value_amount_adjustment=1, + last_modified_date=datetime.datetime(fiscal_year, 5, 7), + certified_date=datetime.datetime(fiscal_year, 5, 7), + create_date=datetime.datetime(fiscal_year, 5, 7), + update_date=datetime.datetime(fiscal_year, 5, 7), + fiscal_year=fiscal_year, + awarding_agency_id=1, + funding_agency_id=1, + original_loan_subsidy_cost=100.0, + face_value_loan_guarantee=100.0, + funding_amount=100.0, + non_federal_funding_amount=100.0, + unique_award_key=1, + 
business_categories=[], + ) + mommy.make( + "awards.Award", + id=i + 100, + generated_unique_award_id="ASST_NON_2_0_0", + is_fpds=False, + type="02", + type_description="Block Grant", + fain=f"fain{i}", + awarding_agency_id=1, + funding_agency_id=1, + latest_transaction_id=i + 100, + fiscal_year=fiscal_year, + ) + mommy.make( + "awards.TransactionFABS", + transaction_id=i + 100, + fain=f"fain{i}", + awarding_agency_code="001", + awarding_sub_tier_agency_c=1, + awarding_agency_name="Test_Agency", + awarding_sub_tier_agency_n="Test_Agency", + ) + mommy.make( + "awards.TransactionNormalized", + award_id=i + 100, + id=i + 100, + is_fpds=False, + transaction_unique_id=i + 100, + usaspending_unique_transaction_id="", + type="02", + type_description="Block Grant", + period_of_performance_start_date=datetime.datetime(fiscal_year, 5, 7), + period_of_performance_current_end_date=datetime.datetime(fiscal_year, 5, 7), + action_date=datetime.datetime(fiscal_year, 5, 7), + federal_action_obligation=100, + modification_number=f"{i+100}", + description="a description", + drv_award_transaction_usaspend=1, + drv_current_total_award_value_amount_adjustment=1, + drv_potential_total_award_value_amount_adjustment=1, + last_modified_date=datetime.datetime(fiscal_year, 5, 7), + certified_date=datetime.datetime(fiscal_year, 5, 7), + create_date=datetime.datetime(fiscal_year, 5, 7), + update_date=datetime.datetime(fiscal_year, 5, 7), + fiscal_year=fiscal_year, + awarding_agency_id=1, + funding_agency_id=1, + original_loan_subsidy_cost=100.0, + face_value_loan_guarantee=100.0, + funding_amount=100.0, + non_federal_funding_amount=100.0, + unique_award_key=i + 100, + ) + i += 1 + monkeypatch.setattr("usaspending_api.settings.MONTHLY_DOWNLOAD_S3_BUCKET_NAME", "whatever") + + +def test_all_agencies(client, monthly_download_data, monkeypatch): + call_command("populate_monthly_files", "--fiscal_year=2020", "--local", "--clobber") + file_list = os.listdir("csv_downloads") + formatted_date = datetime.datetime.strftime(datetime.date.today(), "%Y%m%d") + assert f"FY2020_All_Contracts_Full_{formatted_date}.zip" in file_list + assert f"FY2020_All_Assistance_Full_{formatted_date}.zip" in file_list + delete_files() + + +def test_specific_agency(client, monthly_download_data, monkeypatch): + contract_data = generate_contract_data(2020, 1) + assistance_data = generate_assistance_data + call_command("populate_monthly_files", "--agencies=1", "--fiscal_year=2020", "--local", "--clobber") + file_list = os.listdir("csv_downloads") + formatted_date = datetime.datetime.strftime(datetime.date.today(), "%Y%m%d") + assert f"FY2020_001_Contracts_Full_{formatted_date}.zip" in file_list + assert f"FY2020_001_Assistance_Full_{formatted_date}.zip" in file_list + + with zipfile.ZipFile( + os.path.normpath(f"csv_downloads/FY2020_001_Contracts_Full_{formatted_date}.zip"), "r" + ) as zip_ref: + zip_ref.extractall("csv_downloads") + assert f"FY2020_001_Contracts_Full_{formatted_date}_1.csv" in os.listdir("csv_downloads") + with open( + os.path.normpath(f"csv_downloads/FY2020_001_Contracts_Full_{formatted_date}_1.csv"), "r" + ) as contract_file: + csv_reader = reader(contract_file) + row_count = 0 + for row in csv_reader: + if row_count == 0: + assert row == [s[:63] for s in query_paths["transaction"]["d1"].keys()] + else: + assert row == contract_data + row_count += 1 + assert row_count >= 1 + + with zipfile.ZipFile( + os.path.normpath(f"csv_downloads/FY2020_001_Assistance_Full_{formatted_date}.zip"), "r" + ) as zip_ref: + 
zip_ref.extractall("csv_downloads") + assert f"FY2020_001_Assistance_Full_{formatted_date}_1.csv" in os.listdir("csv_downloads") + with open( + os.path.normpath(f"csv_downloads/FY2020_001_Assistance_Full_{formatted_date}_1.csv"), "r" + ) as assistance_file: + csv_reader = reader(assistance_file) + row_count = 0 + for row in csv_reader: + if row_count == 0: + assert row == [s[:63] for s in query_paths["transaction"]["d2"].keys()] + else: + assert row == assistance_data + row_count += 1 + assert row_count >= 1 + delete_files() + + +def test_agency_no_data(client, monthly_download_data, monkeypatch): + call_command("populate_monthly_files", "--agencies=2", "--fiscal_year=2022", "--local", "--clobber") + file_list = os.listdir("csv_downloads") + formatted_date = datetime.datetime.strftime(datetime.date.today(), "%Y%m%d") + assert f"FY2020_002_Contracts_Full_{formatted_date}.zip" not in file_list + assert f"FY2020_002_Assistance_Full_{formatted_date}.zip" not in file_list + + +def test_fiscal_years(client, monthly_download_data, monkeypatch): + call_command("populate_monthly_files", "--agencies=1", "--fiscal_year=2020", "--local", "--clobber") + call_command("populate_monthly_files", "--agencies=1", "--fiscal_year=2004", "--local", "--clobber") + file_list = os.listdir("csv_downloads") + formatted_date = datetime.datetime.strftime(datetime.date.today(), "%Y%m%d") + assert f"FY2004_001_Contracts_Full_{formatted_date}.zip" in file_list + assert f"FY2004_001_Assistance_Full_{formatted_date}.zip" in file_list + assert f"FY2020_001_Contracts_Full_{formatted_date}.zip" in file_list + assert f"FY2020_001_Assistance_Full_{formatted_date}.zip" in file_list + delete_files() + + +def test_award_type(client, monthly_download_data, monkeypatch): + call_command( + "populate_monthly_files", + "--agencies=1", + "--fiscal_year=2020", + "--award_types=assistance", + "--local", + "--clobber", + ) + file_list = os.listdir("csv_downloads") + formatted_date = datetime.datetime.strftime(datetime.date.today(), "%Y%m%d") + assert f"FY2020_001_Assistance_Full_{formatted_date}.zip" in file_list + assert f"FY2020_001_Contracts_Full_{formatted_date}.zip" not in file_list + delete_files() diff --git a/usaspending_api/etl/broker_etl_helpers.py b/usaspending_api/etl/broker_etl_helpers.py index 826cc1fbd6..7fd627b33b 100644 --- a/usaspending_api/etl/broker_etl_helpers.py +++ b/usaspending_api/etl/broker_etl_helpers.py @@ -7,7 +7,7 @@ from usaspending_api.common.helpers.sql_helpers import ordered_dictionary_fetcher -logger = logging.getLogger("console") +logger = logging.getLogger("script") def dictfetchall(cursor): diff --git a/usaspending_api/etl/elasticsearch_loader_helpers/controller.py b/usaspending_api/etl/elasticsearch_loader_helpers/controller.py index 36a901795a..34e242a064 100644 --- a/usaspending_api/etl/elasticsearch_loader_helpers/controller.py +++ b/usaspending_api/etl/elasticsearch_loader_helpers/controller.py @@ -105,11 +105,11 @@ def complete_process(self) -> None: update_last_load_date(f"{self.config['stored_date_key']}", self.config["processing_start_datetime"]) def determine_partitions(self) -> int: - """Create partition size less than or equal to max_size for more even distribution""" - if self.config["partition_size"] > (self.max_id - self.min_id): + """Simple strategy of partitions that cover the id-range in an even distribution""" + id_range_item_count = self.max_id - self.min_id + 1 # total number or records if all IDs exist in DB + if self.config["partition_size"] > id_range_item_count: return 1 - # 
return ceil(self.record_count / self.config["partition_size"]) - return ceil(max((self.max_id - self.min_id), self.record_count) / (self.config["partition_size"])) + return ceil(id_range_item_count / self.config["partition_size"]) def construct_tasks(self) -> List[TaskSpec]: """Create the Task objects w/ the appropriate configuration""" @@ -142,10 +142,9 @@ def configure_task(self, partition_number: int, name_gen: Generator, is_null_par ) def get_id_range_for_partition(self, partition_number: int) -> Tuple[int, int]: - range_size = ceil((self.max_id - self.min_id) / self.config["partitions"]) - lower_bound = self.min_id + (range_size * partition_number) - upper_bound = min(self.min_id + ((range_size * (partition_number + 1) - 1)), self.max_id) - + partition_size = self.config["partition_size"] + lower_bound = self.min_id + (partition_number * partition_size) + upper_bound = min(lower_bound + partition_size - 1, self.max_id) return lower_bound, upper_bound def run_deletes(self) -> None: @@ -189,7 +188,7 @@ def extract_transform_load(task: TaskSpec) -> None: msg = f"Partition #{task.partition_number} failed after an error was previously encountered" logger.warning(format_log(msg, name=task.name)) else: - logger.error(format_log(f"{task.name} failed!", name=task.name)) + logger.exception(format_log(f"{task.name} failed!", name=task.name)) abort.set() else: msg = f"Partition #{task.partition_number} was successfully processed in {perf_counter() - start:.2f}s" diff --git a/usaspending_api/etl/elasticsearch_loader_helpers/delete_data.py b/usaspending_api/etl/elasticsearch_loader_helpers/delete_data.py index b83199b6b5..7f3d681ee6 100644 --- a/usaspending_api/etl/elasticsearch_loader_helpers/delete_data.py +++ b/usaspending_api/etl/elasticsearch_loader_helpers/delete_data.py @@ -1,14 +1,14 @@ -import json import logging + import pandas as pd -from collections import defaultdict from django.conf import settings from time import perf_counter -from typing import Tuple, List, Optional, Dict, Union, Any +from typing import Optional, Dict, Union, Any from elasticsearch import Elasticsearch -from elasticsearch_dsl import Search, Q as ES_Q +from elasticsearch_dsl import Search +from elasticsearch_dsl.mapping import Mapping from usaspending_api.common.helpers.s3_helpers import retrieve_s3_bucket_object_list, access_s3_object from usaspending_api.etl.elasticsearch_loader_helpers.index_config import ( @@ -19,89 +19,44 @@ execute_sql_statement, format_log, chunks, - filter_query, ) logger = logging.getLogger("script") -def _delete_query(response: dict) -> dict: - return {"query": {"ids": {"values": [i["_id"] for i in response["hits"]["hits"]]}}} - - -def _delete_from_es( +def delete_docs_by_unique_key( client: Elasticsearch, - id_list: List[dict], - index: str, - max_query_size: int, - use_aliases: bool = False, - task_id: Optional[Tuple[int, str]] = None, -) -> None: - """ - id_list = [ - {key: 'key1', col: 'transaction_id'}, - {key: 'key2', col: 'generated_unique_transaction_id'}, - ... - ] - - or - - id_list = [ - {key: 'key1', col: 'award_id'}, - {key: 'key2', col: 'generated_unique_award_id'}, - ... 
- ] - """ - start = perf_counter() - msg = f"Deleting up to {len(id_list):,} document{'s' if len(id_list) != 1 else ''}" - logger.info(format_log(msg, name=task_id, action="Delete")) - - if use_aliases: - index = f"{index}-*" - start_ = client.count(index=index)["count"] - logger.info(format_log(f"Starting amount of indices ----- {start_:,}", name=task_id, action="Delete")) - col_to_items_dict = defaultdict(list) - for line in id_list: - col_to_items_dict[line["col"]].append(line["key"]) - - for column, values in col_to_items_dict.items(): - logger.info(format_log(f"Deleting {len(values):,} of '{column}'", name=task_id, action="Delete")) - values_generator = chunks(values, 1000) - for v in values_generator: - # IMPORTANT: This delete routine looks at just 1 index at a time. If there are duplicate records across - # multiple indexes, those duplicates will not be caught by this routine. It is left as is because at the - # time of this comment, we are migrating to using a single index. - body = filter_query(column, v) - response = client.search(index=index, body=json.dumps(body), size=max_query_size) - delete_body = _delete_query(response) - try: - client.delete_by_query(index=index, body=json.dumps(delete_body), refresh=True, size=max_query_size) - except Exception: - logger.exception(format_log("", name=task_id, action="Delete")) - raise SystemExit(1) - - end_ = client.count(index=index)["count"] - record_count = start_ - end_ - duration = perf_counter() - start - msg = f"Delete operation took {duration:.2f}s. Removed {record_count:,} document{'s' if record_count != 1 else ''}" - logger.info(format_log(msg, name=task_id, action="Delete")) - return - - -def delete_docs_by_unique_key(client: Elasticsearch, key: str, value_list: list, task_id: str, index) -> int: + key: str, + value_list: list, + task_id: str, + index, + refresh_after: bool = True, + delete_chunk_size: int = 10000, +) -> int: """ Bulk delete a batch of documents whose field identified by ``key`` matches any value provided in the ``values_list``. + NOTE: This delete routine looks at just the index name given. If there are duplicate records across + multiple indexes, an alias or wildcard should be provided for ``index`` param that covers multiple + indices, or this will need to be run once per index. + Args: client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster - key (str): name of filed in targeted elasticsearch index that should have a unique value for - every doc in the index. Ideally the field or sub-field provided is of ``keyword`` type. + key (str): name of field in targeted elasticsearch index that should have a unique value for + every doc in the index. The field or sub-field provided MUST be of ``keyword`` type (or ``_id`` meta field) value_list (list): if key field has these values, the document will be deleted task_id (str): name of ES ETL job being run, used in logging index (str): name of index (or alias) to target for the ``_delete_by_query`` ES operation. - - NOTE: This delete routine looks at just the index name given. If there are duplicate records across - multiple indexes, an alias or wildcard should be provided for ``index`` param that covers multiple - indices, or this will need to be run once per index. + refresh_after (bool): Whether to call ``_refresh`` on the index when all of the provided values in + ``value_list`` have been processed for delete; defaults to ``True``. 
If many small deletes happen at a + rapid rate, it may be best to set this to ``False`` and await a deferred refresh afterward in the calling + code. NOTE: This param will be ignored and a refresh will be attempted if this function + errors-out during execution, in order to not leave un-refreshed deletes in the index. + delete_chunk_size (int): the batch-size of terms value-array given to each _delete_by_query call. Needs to be + less than 65536 (max values for any terms query), and less than the index.max_result_window setting. Ideally + use ``config["partition_size"]`` (derived from --partition-size) to set this to a calibrated value. If not + provided, uses 10,000 as the default, which was measured to be a reasonable default. Returns: Number of ES documents deleted """ @@ -115,73 +70,171 @@ def delete_docs_by_unique_key(client: Elasticsearch, key: str, value_list: list, if not index: raise RuntimeError("index name must be provided") + if not _is_allowed_key_field_type(client, key, index): + msg = ( + f'Cannot perform deletes in index "{index}" by key field "{key}" because its type is not one of ' + f"the allowed field types, or the field was not found in that index." + ) + logger.error(format_log(msg=msg, action="Delete", name=task_id)) + raise RuntimeError(msg) + + if delete_chunk_size > 65536: + # 65,536 is max number of terms that can be added to an ES terms filter query + msg = ( + f"{delete_chunk_size} is greater than 65,536, which is the max number of terms that can be added to an ES " + f"terms filter query" + ) + logger.error(format_log(msg=msg, action="Delete")) + raise RuntimeError(msg) + + chunks_processed = 0 deleted = 0 is_error = False try: - # 65,536 is max number of terms that can be added to an ES terms filter query - values_generator = chunks(value_list, 50000) + values_generator = chunks(value_list, delete_chunk_size) for chunk_of_values in values_generator: - # Creates an Elasticsearch query criteria for the _delete_by_query call - q = ES_Q("terms", **{key: chunk_of_values}) # Invoking _delete_by_query as per the elasticsearch-dsl docs: # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html#delete-by-query - response = Search(using=client, index=index).filter(q).delete() - chunk_deletes = response["deleted"] - deleted += chunk_deletes + # _refresh is deferred until the end of chunk processing + q = Search(using=client, index=index).filter("terms", **{key: chunk_of_values}) # type: Search + # params: + # conflicts="proceed": Ignores version conflict errors if a doc delete is attempted more than once + # slices="auto": Will create parallel delete batches per shard + q = q.params(conflicts="proceed", slices="auto") + response = q.delete() + # Some subtle errors come back on the response + if response["timed_out"]: + msg = f"Delete request timed out on cluster after {int(response['took'])/1000:.2f}s" + logger.error(format_log(msg=msg, action="Delete", name=task_id)) + raise RuntimeError(msg) + if response["failures"]: + fail_snippet = "\n\t\t" + "\n\t\t".join(map(str, response["failures"][0:4])) + "\n\t\t" + "..."
+ msg = f"Some docs failed to delete on cluster:{fail_snippet}" + logger.error(format_log(msg=msg, action="Delete", name=task_id)) + raise RuntimeError(msg) + logger.info( + format_log( + f"Deleted {response['deleted']:,} docs in ES from chunk of size {len(chunk_of_values):,} " + f"in {int(response['took'])/1000:.2f}s, " + f"and ignored {response['version_conflicts']:,} version conflicts", + action="Delete", + name=task_id, + ) + ) + deleted += response["deleted"] + chunks_processed += 1 except Exception: is_error = True logger.exception(format_log("", name=task_id, action="Delete")) - raise SystemExit(1) + raise finally: - error_text = " before encountering an error" if is_error else "" - duration = perf_counter() - start - docs = f"document{'s' if deleted != 1 else ''}" - msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} {docs}{error_text}" - logger.info(format_log(msg, action="Delete", name=task_id)) + if deleted > 0 and (refresh_after or is_error): + if not is_error: + refresh_msg = "Refreshing index so deletes take effect" + else: + refresh_msg = "Attempting index refresh while handling error so deletes take effect" + logger.info(format_log(refresh_msg, action="Delete", name=task_id)) + client.indices.refresh(index=index) + if chunks_processed > 1 or is_error: + # This log becomes redundant unless to log the sum of multiple chunks' deletes (or error) + error_text = " before encountering an error" if is_error else "" + duration = perf_counter() - start + docs = f"document{'s' if deleted != 1 else ''}" + msg = f"Delete operation took {duration:.2f}s. Removed {deleted:,} total {docs}{error_text}" + logger.info(format_log(msg, action="Delete", name=task_id)) return deleted -def _lookup_deleted_award_ids(client: Elasticsearch, id_list: list, config: dict, index: Optional[str] = None) -> list: - """Lookup deleted transactions to derive parent awards to be deleted +def _is_allowed_key_field_type(client: Elasticsearch, key_field: str, index: str) -> bool: + """Return ``True`` if the given field's mapping in the given index is in our allowed list of ES types + compatible with term(s) queries - This fetches a list of all unique award keys compiled from the ``ES_AWARDS_UNIQUE_KEY_FIELD`` field of - any document in the transaction index that matches the query, which looks up deleted transaction ES - documents by their ``ES_TRANSACTIONS_UNIQUE_KEY_FIELD`` field. + This is mainly to prevent use of ``text`` fields in terms queries, which give bad results because Elasticsearch + changes the values of text fields during analysis. + """ + if key_field == "_id": + # Special case. It is a reserved field, without a type, but can effectively be treated as a keyword field + return True + + # Get true index name from alias, if provided an alias + response = client.indices.get(index) + aliased_index_name = list(response.keys())[0] + es_field_type = Mapping().from_es(using=client, index=aliased_index_name).resolve_field(key_field) + # This is the allowed types whitelist. More can be added as-needed if compatible with terms(s) queries. 
+ if es_field_type and es_field_type.name in ["keyword", "integer"]: + return True + return False - Args: - client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster - id_list (list): A list of dictionaries, each having two keys, in this format:: - id_list = [ - {key:'', col:''}, - {key:'', col:''}, - ..., - ] +def _lookup_deleted_award_keys( + client: Elasticsearch, + lookup_key: str, + value_list: list, + config: dict, + index: Optional[str] = None, + lookup_chunk_size: int = 50000, +) -> list: + """Derive a list of award keys given a target index, Lookup field, and lookup values + + This returns a list of all unique award keys, which are compiled from the ``ES_AWARDS_UNIQUE_KEY_FIELD`` field of + any document in the given ``index`` that matches the query. The matching query is a terms query that will return + the doc if its ``lookup_key`` field has any value provided in ``value_list``. + Args: + client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster + lookup_key (str): name of field in targeted elasticsearch index by which we are looking up docs. The field or + sub-field provided MUST be of ``keyword`` type (or ``_id`` meta field) + value_list (list): if lookup_key field has any of these values, the document will be returned from the lookup config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task index (str): Optional name, alias, or pattern of index this query will target. Looks up via config if not - provided + provided + lookup_chunk_size (int): the batch-size of terms value-array to be looked-up. Needs to be less + than 65536 (max values for any terms query), and less than config["max_query_size"] - Returns: None + Returns: list of values for the ES_AWARDS_UNIQUE_KEY_FIELD fields in the looked-up documents. """ if index is None: index = f"{config['query_alias_prefix']}-*" - col_to_items_dict = defaultdict(list) - for line in id_list: - col_to_items_dict[line["col"]].append(line["key"]) - awards = [] - for column, values in col_to_items_dict.items(): - values_generator = chunks(values, 1000) - for v in values_generator: - body = filter_query(column, v) - response = client.search(index=index, body=json.dumps(body), size=config["max_query_size"]) - if response["hits"]["total"]["value"] != 0: - awards += [x["_source"][ES_AWARDS_UNIQUE_KEY_FIELD] for x in response["hits"]["hits"]] - return awards - - -def delete_awards(client: Elasticsearch, config: dict) -> None: + + if not _is_allowed_key_field_type(client, lookup_key, index): + msg = ( + f'Cannot perform lookups in index "{index}" with key field "{lookup_key}" because its type is not one of ' + f"the allowed field types, or the field was not found in that index." + ) + logger.error(format_log(msg=msg, action="Delete")) + raise RuntimeError(msg) + + if lookup_chunk_size > 65536: + # 65,536 is max number of terms that can be added to an ES terms filter query + msg = ( + f"{lookup_chunk_size} is greater than 65,536, which is the max number of terms that can be added to an ES " + f"terms filter query" + ) + logger.error(format_log(msg=msg, action="Delete")) + raise RuntimeError(msg) + + if lookup_chunk_size > config["max_query_size"]: + # Some keys would be left undiscovered if our chunk was cut short by the query only returning a lesser subset + msg = ( + f"{lookup_chunk_size} is greater {config['max_query_size']}, which is the max number of query " + f"results returnable from this index. 
Use a smaller chunk or increase max_result_window for this index." + ) + logger.error(format_log(msg=msg, action="Delete")) + raise RuntimeError(msg) + + award_key_list = [] + values_generator = chunks(value_list, lookup_chunk_size) + for chunk_of_values in values_generator: + q = Search(using=client, index=index).filter("terms", **{lookup_key: chunk_of_values}) # type: Search + q.update_from_dict({"size": config["max_query_size"]}) + response = q.execute() + if response["hits"]["total"]["value"] != 0: + award_key_list += [x["_source"][ES_AWARDS_UNIQUE_KEY_FIELD] for x in response["hits"]["hits"]] + return award_key_list + + +def delete_awards(client: Elasticsearch, config: dict, task_id: str = "Sync DB Deletes") -> int: """Delete all awards in the Elasticsearch awards index that were deleted in the source database. This performs the deletes of award documents in ES in a series of batches, as there could be many. Millions of @@ -198,46 +251,95 @@ def delete_awards(client: Elasticsearch, config: dict) -> None: Args: client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task + task_id (str): label for this sub-step of the ETL - Returns: None + Returns: Number of ES docs deleted in the index """ - deleted_ids = _gather_deleted_ids(config) + deleted_tx_keys = _gather_deleted_transaction_keys(config) # While extracting unique award keys, the lookup is on transactions and must match against the unique transaction id - id_list = [{"key": deleted_id, "col": ES_TRANSACTIONS_UNIQUE_KEY_FIELD} for deleted_id in deleted_ids] - award_ids = _lookup_deleted_award_ids(client, id_list, config, settings.ES_TRANSACTIONS_QUERY_ALIAS_PREFIX + "-*") - if (len(award_ids)) == 0: - logger.info(format_log(f"No related awards require deletion", action="Delete")) - return - - deleted_award_ids = _check_awards_for_deletes(award_ids) - if len(deleted_award_ids) == 0: - logger.info(format_log(f"No related awards require deletion", action="Delete")) - return - - award_id_list = [ - {"key": deleted_award[config["unique_key_field"]], "col": config["unique_key_field"]} - for deleted_award in deleted_award_ids - ] - _delete_from_es( + award_keys = _lookup_deleted_award_keys( client, - award_id_list, - index=config["query_alias_prefix"], - max_query_size=config["max_query_size"], - use_aliases=True, + ES_TRANSACTIONS_UNIQUE_KEY_FIELD, + [*deleted_tx_keys], + config, + settings.ES_TRANSACTIONS_QUERY_ALIAS_PREFIX + "-*", + ) + award_keys = list(set(award_keys)) # get unique list of keys + award_keys_len = len(award_keys) + if award_keys_len == 0: + logger.info( + format_log( + f"No related awards found for deletion. Zero transaction docs found from which to derive awards.", + action="Delete", + name=task_id, + ) + ) + return 0 + logger.info( + format_log(f"Derived {award_keys_len} award keys from transactions in ES", action="Delete", name=task_id) ) - return + deleted_award_kvs = _check_awards_for_deletes(award_keys) + deleted_award_kvs_len = len(deleted_award_kvs) + if deleted_award_kvs_len == 0: + # In this case it could be an award's transaction was deleted, but not THE LAST transaction of that award. + # i.e. the deleted transaction's "siblings" are still in the DB and therefore the parent award should remain + logger.info( + format_log( + f"No related awards found will be deleted. 
All derived awards are still in the DB.", + action="Delete", + name=task_id, + ) + ) + return 0 + logger.info( + format_log( + f"{deleted_award_kvs_len} awards no longer in the DB will be removed from ES", action="Delete", name=task_id + ) + ) + values_list = [v for d in deleted_award_kvs for v in d.values()] + return delete_docs_by_unique_key( + client, + key=config["unique_key_field"], + value_list=values_list, + task_id=task_id, + index=config["index_name"], + delete_chunk_size=config["partition_size"], + ) + + +def delete_transactions(client: Elasticsearch, config: dict, task_id: str = "Sync DB Deletes") -> int: + """Delete all transactions in the Elasticsearch transactions index that were deleted in the source database. + + This performs the deletes of transaction documents in ES in a series of batches, as there could be many. Millions of + transactions deleted may take a prohibitively long time, and it could be better to just re-index all documents from + the DB instead. + + Side Effects: + The index from which docs were deleted will be refreshed if the delete was successful + and removed more than 0 docs. -def delete_transactions(client: Elasticsearch, config: dict) -> None: - deleted_ids = _gather_deleted_ids(config) - id_list = [{"key": deleted_id, "col": config["unique_key_field"]} for deleted_id in deleted_ids] - _delete_from_es( - client, id_list, index=config["query_alias_prefix"], max_query_size=config["max_query_size"], use_aliases=True + Args: + client (Elasticsearch): elasticsearch-dsl client for making calls to an ES cluster + config (dict): collection of key-value pairs that encapsulates runtime arguments for this ES management task + task_id (str): label for this sub-step of the ETL + + + Returns: Number of ES docs deleted in the index + """ + deleted_tx_keys = _gather_deleted_transaction_keys(config) + return delete_docs_by_unique_key( + client, + key=config["unique_key_field"], + value_list=[*deleted_tx_keys], + task_id=task_id, + index=config["index_name"], + delete_chunk_size=config["partition_size"], ) -def _gather_deleted_ids(config: dict) -> Dict[Union[str, Any], Dict[str, Any]]: +def _gather_deleted_transaction_keys(config: dict) -> Optional[Dict[Union[str, Any], Dict[str, Any]]]: """ Connect to S3 and gather all of the transaction ids stored in CSV files generated by the broker when transactions are removed from the DB.
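A minimal usage sketch of the new delete_docs_by_unique_key helper follows, assuming a local cluster. The cluster URL, index name, key field, and key values shown are hypothetical stand-ins, not values taken from this changeset; in the real ETL these come from the loader's config dict.

from elasticsearch import Elasticsearch
from usaspending_api.etl.elasticsearch_loader_helpers.delete_data import delete_docs_by_unique_key

# Hypothetical client and targets for illustration only
client = Elasticsearch(["http://localhost:9200"])
deleted_count = delete_docs_by_unique_key(
    client,
    key="generated_unique_transaction_id",  # must be a keyword-type field (or the _id meta field)
    value_list=["CONT_TX_1", "CONT_TX_2"],  # docs whose key matches any of these values are removed
    task_id="Ad hoc delete",
    index="test-transactions",
    refresh_after=True,  # one _refresh after all chunks are processed so the deletes become visible
    delete_chunk_size=10000,  # keep well under the 65,536-term limit of an ES terms query
)
print(f"Removed {deleted_count:,} documents")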
@@ -245,7 +347,7 @@ def _gather_deleted_ids(config: dict) -> Dict[Union[str, Any], Dict[str, Any]]: if not config["process_deletes"]: logger.info(format_log(f"Skipping the S3 CSV fetch for deleted transactions", action="Delete")) - return + return None logger.info(format_log(f"Gathering all deleted transactions from S3", action="Delete")) start = perf_counter() @@ -265,7 +367,7 @@ def _gather_deleted_ids(config: dict) -> Dict[Union[str, Any], Dict[str, Any]]: if config["verbose"]: logger.info(format_log(f"Found {len(filtered_csv_list)} csv files", action="Delete")) - deleted_ids = {} + deleted_keys = {} for obj in filtered_csv_list: object_data = access_s3_object(bucket_name=config["s3_bucket"], obj=obj) @@ -283,25 +385,28 @@ def _gather_deleted_ids(config: dict) -> Dict[Union[str, Any], Dict[str, Any]]: raise RuntimeError(msg) for uid in new_ids: - if uid in deleted_ids: - if deleted_ids[uid]["timestamp"] < obj.last_modified: - deleted_ids[uid]["timestamp"] = obj.last_modified + if uid in deleted_keys: + if deleted_keys[uid]["timestamp"] < obj.last_modified: + deleted_keys[uid]["timestamp"] = obj.last_modified else: - deleted_ids[uid] = {"timestamp": obj.last_modified} + deleted_keys[uid] = {"timestamp": obj.last_modified} if config["verbose"]: - for uid, deleted_dict in deleted_ids.items(): + for uid, deleted_dict in deleted_keys.items(): logger.info(format_log(f"id: {uid} last modified: {deleted_dict['timestamp']}", action="Delete")) logger.info( format_log( - f"Gathering {len(deleted_ids):,} deleted transactions took {perf_counter() - start:.2f}s", action="Delete" + f"Gathered {len(deleted_keys):,} deleted transactions from {len(filtered_csv_list)} files in " + f"increment in {perf_counter() - start:.2f}s", + action="Delete", ) ) - return deleted_ids + return deleted_keys def _check_awards_for_deletes(id_list: list) -> list: + """Takes a list of award key values and returns them if they are NOT found in the awards DB table""" formatted_value_ids = "" for x in id_list: formatted_value_ids += "('" + x + "')," diff --git a/usaspending_api/etl/elasticsearch_loader_helpers/index_config.py b/usaspending_api/etl/elasticsearch_loader_helpers/index_config.py index 2194cb88ec..3fb9cda8a9 100644 --- a/usaspending_api/etl/elasticsearch_loader_helpers/index_config.py +++ b/usaspending_api/etl/elasticsearch_loader_helpers/index_config.py @@ -1,11 +1,11 @@ import json import logging +import time from django.conf import settings from usaspending_api.awards.v2.lookups.elasticsearch_lookups import INDEX_ALIASES_TO_AWARD_TYPES -from usaspending_api.etl.elasticsearch_loader_helpers.utilities import format_log - +from usaspending_api.etl.elasticsearch_loader_helpers.utilities import format_log, is_snapshot_running logger = logging.getLogger("script") @@ -83,7 +83,7 @@ def swap_aliases(client, config): client.indices.delete_alias(old_index, "_all") logger.info(format_log(f"Removing aliases from '{old_index}'", action="ES Alias")) except Exception: - logger.exception(f"No aliases found for {alias_patterns}", action="ES Alias") + logger.exception(format_log(f"No aliases found for {alias_patterns}", action="ES Alias")) if config["create_award_type_aliases"]: create_award_type_aliases(client, config) @@ -94,10 +94,32 @@ def swap_aliases(client, config): try: if old_indexes: - client.indices.delete(index=old_indexes, ignore_unavailable=False) - logger.info(format_log(f"Deleted index(es) '{old_indexes}'", action="ES Alias")) + max_wait_time = 15 # in minutes + start_wait_time = time.time() + is_snapshot_conflict 
= is_snapshot_running(client, old_indexes) + if is_snapshot_conflict: + logger.info( + format_log( + f"Snapshot in-progress prevents delete; waiting up to {max_wait_time} minutes", + action="ES Alias", + ) + ) + while (time.time() - start_wait_time) < (max_wait_time * 60) and is_snapshot_conflict: + logger.info(format_log("Waiting while snapshot is in-progress", action="ES Alias")) + time.sleep(90) + is_snapshot_conflict = is_snapshot_running(client, old_indexes) + if is_snapshot_conflict: + config["raise_status_code_3"] = True + logger.error( + format_log( + f"Unable to delete index(es) '{old_indexes}' due to in-progress snapshot", action="ES Alias" + ) + ) + else: + client.indices.delete(index=old_indexes, ignore_unavailable=False) + logger.info(format_log(f"Deleted index(es) '{old_indexes}'", action="ES Alias")) except Exception: - logger.exception(f"Unable to delete indexes: {old_indexes}", action="ES Alias") + logger.exception(format_log(f"Unable to delete indexes: {old_indexes}", action="ES Alias")) def toggle_refresh_off(client, index): diff --git a/usaspending_api/etl/elasticsearch_loader_helpers/load_data.py b/usaspending_api/etl/elasticsearch_loader_helpers/load_data.py index 8f89c64ab6..c0ad07d9b1 100644 --- a/usaspending_api/etl/elasticsearch_loader_helpers/load_data.py +++ b/usaspending_api/etl/elasticsearch_loader_helpers/load_data.py @@ -19,7 +19,11 @@ # Aiming for a batch that yields each ES cluster data-node handling max 0.3-1MB per vCPU per batch request # Ex: 3-data-node cluster of i3.large.elasticsearch = 2 vCPU * 3 nodes = 6 vCPU: .75MB*6 ~= 4.5MB batches # Ex: 5-data-node cluster of i3.xlarge.elasticsearch = 4 vCPU * 5 nodes = 20 vCPU: .75MB*20 ~= 15MB batches -ES_MAX_BATCH_BYTES = 10 * 1024 * 1024 +# A good test is use the number of desired docs below and pipe a curl result to a text file +# > curl localhost:9200/*awards/_search?size=4000 > out.json -> 15MB +# > curl localhost:9200/*transactions/_search?size=4000 > out.json -> 19.4MB +# Given these, leaning towards the high end of 1MB per vCPU +ES_MAX_BATCH_BYTES = 20 * 1024 * 1024 # Aiming for a batch that yields each ES cluster data-node handling max 100-400 doc entries per vCPU per request # Ex: 3-data-node cluster of i3.large.elasticsearch = 2 vCPU * 3 nodes = 6 vCPU: 300*6 = 1800 doc batches # Ex: 5-data-node cluster of i3.xlarge.elasticsearch = 4 vCPU * 5 nodes = 20 vCPU: 300*20 = 6000 doc batches @@ -72,7 +76,15 @@ def streaming_post_to_es( try: if delete_before_index: value_list = [doc[delete_key] for doc in chunk] - delete_docs_by_unique_key(client, delete_key, value_list, job_name, index_name) + delete_docs_by_unique_key( + client, + delete_key, + value_list, + job_name, + index_name, + refresh_after=False, + delete_chunk_size=len(chunk), # same size as this partition to process + ) for ok, item in helpers.streaming_bulk( client, actions=chunk, diff --git a/usaspending_api/etl/elasticsearch_loader_helpers/utilities.py b/usaspending_api/etl/elasticsearch_loader_helpers/utilities.py index de6baa6ee9..873bf92e7e 100644 --- a/usaspending_api/etl/elasticsearch_loader_helpers/utilities.py +++ b/usaspending_api/etl/elasticsearch_loader_helpers/utilities.py @@ -1,9 +1,11 @@ import json import logging import psycopg2 +import re from dataclasses import dataclass from django.conf import settings +from elasticsearch import Elasticsearch from pathlib import Path from random import choice from typing import Any, Generator, List, Optional @@ -124,3 +126,13 @@ def to_roman_numerals(num: int) -> str: if len(previous_names) 
>= (upper_limit + (upper_limit * full_cycles)): full_cycles += 1 loop = f" {to_roman_numerals(full_cycles)}" + + +def is_snapshot_running(client: Elasticsearch, index_names: List[str]) -> bool: + snapshot_list = client.snapshot.status().get("snapshots", []) + index_names_pattern = f".*({'|'.join(index_names)}).*" + for snapshot in snapshot_list: + indexes = str(snapshot.get("indices", {}).keys()) + if re.match(index_names_pattern, indexes): + return True + return False diff --git a/usaspending_api/etl/es_award_template.json b/usaspending_api/etl/es_award_template.json index 2e6e8e9318..abdf03ceb2 100644 --- a/usaspending_api/etl/es_award_template.json +++ b/usaspending_api/etl/es_award_template.json @@ -38,12 +38,7 @@ "type": "integer" }, "generated_unique_award_id": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword" - } - } + "type": "keyword" }, "display_award_id": { "type": "keyword" diff --git a/usaspending_api/etl/es_config_objects.json b/usaspending_api/etl/es_config_objects.json index 3e41feafbe..2acf4b4dcc 100644 --- a/usaspending_api/etl/es_config_objects.json +++ b/usaspending_api/etl/es_config_objects.json @@ -1,13 +1,5 @@ { - "cluster": { - "transient" : {}, - "persistent": { - "action.destructive_requires_name": true - } - }, - "settings": { - "index.max_result_window" : null - }, + "cluster": {}, "final_index_settings": { "number_of_replicas": 1, "refresh_interval": "1s" diff --git a/usaspending_api/etl/es_transaction_template.json b/usaspending_api/etl/es_transaction_template.json index 901c4fdc59..c7a13d4dff 100644 --- a/usaspending_api/etl/es_transaction_template.json +++ b/usaspending_api/etl/es_transaction_template.json @@ -31,13 +31,13 @@ "type": "integer" }, "detached_award_proc_unique": { - "type": "text" + "type": "keyword" }, "afa_generated_unique": { - "type": "text" + "type": "keyword" }, "generated_unique_transaction_id": { - "type": "text" + "type": "keyword" }, "display_award_id": { "type": "keyword" diff --git a/usaspending_api/etl/management/commands/elasticsearch_indexer.py b/usaspending_api/etl/management/commands/elasticsearch_indexer.py index 4d03a9e2fd..0576d8b3bd 100644 --- a/usaspending_api/etl/management/commands/elasticsearch_indexer.py +++ b/usaspending_api/etl/management/commands/elasticsearch_indexer.py @@ -97,10 +97,9 @@ def add_arguments(self, parser): parser.add_argument( "--partition-size", type=int, - help="Set the target size of a single data partition. 
A partition might be slightly " - "larger or slightly smaller depending on the distribution of the data to process", - default=250000, - metavar="(default: 250,000)", + help="Set the batch-size of a single partition of data to process.", + default=10000, + metavar="(default: 10,000)", ) parser.add_argument( "--drop-db-view", @@ -147,9 +146,13 @@ def handle(self, *args, **options): logger.info(format_log(msg)) logger.info(format_log(headers)) + # Used to help pipeline determine when job passed but needs attention + if config["raise_status_code_3"]: + raise SystemExit(3) + def parse_cli_args(options: dict, es_client) -> dict: - passthrough_values = ( + passthrough_values = [ "create_new_index", "drop_db_view", "index_name", @@ -160,7 +163,7 @@ def parse_cli_args(options: dict, es_client) -> dict: "processes", "skip_counts", "skip_delete_index", - ) + ] config = set_config(passthrough_values, options) if config["create_new_index"] and not config["index_name"]: @@ -183,7 +186,7 @@ def parse_cli_args(options: dict, es_client) -> dict: config["starting_date"] != config["initial_datetime"] and not config["deletes_only"] ) - if config["is_incremental_load"]: + if config["is_incremental_load"] or config["deletes_only"]: if config["index_name"]: logger.info(format_log(f"Ignoring provided index name, using alias '{config['write_alias']}' for safety")) config["index_name"] = config["write_alias"] @@ -278,6 +281,7 @@ def set_config(passthrough_values: list, arg_parse_options: dict) -> dict: "s3_bucket": settings.DELETED_TRANSACTION_JOURNAL_FILES, "processing_start_datetime": datetime.now(timezone.utc), "verbose": arg_parse_options["verbosity"] > 1, # convert command's levels of verbosity to a bool + "raise_status_code_3": False, } ) diff --git a/usaspending_api/etl/management/commands/es_configure.py b/usaspending_api/etl/management/commands/es_configure.py index e141f892fc..9d1fe56e8a 100644 --- a/usaspending_api/etl/management/commands/es_configure.py +++ b/usaspending_api/etl/management/commands/es_configure.py @@ -13,7 +13,6 @@ CURL_COMMANDS = { "template": "{host}/_template/{name}?pretty", "cluster": "{host}/_cluster/settings?pretty", - "settings": "{host}/_settings?pretty", } FILES = { @@ -53,29 +52,26 @@ def handle(self, *args, **options): if options["load_type"] in ("award", "awards"): self.index_pattern = f"*{settings.ES_AWARDS_NAME_SUFFIX}" self.max_result_window = settings.ES_AWARDS_MAX_RESULT_WINDOW - self.template = "award_template" + self.template_name = "award_template" elif options["load_type"] in ("transaction", "transactions"): self.index_pattern = f"*{settings.ES_TRANSACTIONS_NAME_SUFFIX}" self.max_result_window = settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW - self.template = "transaction_template" + self.template_name = "transaction_template" elif options["load_type"] == "covid19-faba": self.index_pattern = f"*{settings.ES_COVID19_FABA_NAME_SUFFIX}" self.max_result_window = settings.ES_COVID19_FABA_MAX_RESULT_WINDOW - self.template = "covid19_faba_template" + self.template_name = "covid19_faba_template" else: raise RuntimeError(f"No config for {options['load_type']}") - cluster, index_settings = self.get_elasticsearch_settings() + cluster = self.get_elasticsearch_settings() template = self.get_index_template() - if not options["template_only"]: + if not options["template_only"] and cluster: self.run_curl_cmd(payload=cluster, url=CURL_COMMANDS["cluster"], host=settings.ES_HOSTNAME) - self.run_curl_cmd(payload=index_settings, url=CURL_COMMANDS["settings"], host=settings.ES_HOSTNAME) - - 
template_name = "{type}_template".format(type=self.load_type[:-1]) self.run_curl_cmd( - payload=template, url=CURL_COMMANDS["template"], host=settings.ES_HOSTNAME, name=template_name + payload=template, url=CURL_COMMANDS["template"], host=settings.ES_HOSTNAME, name=self.template_name ) logger.info(f"ES Configure took {perf_counter() - start:.2f}s") @@ -92,11 +88,10 @@ def run_curl_cmd(self, **kwargs) -> None: def get_elasticsearch_settings(self): es_config = self.return_json_from_file(FILES["settings"]) - es_config["settings"]["index.max_result_window"] = self.max_result_window - return es_config["cluster"], es_config["settings"] + return es_config["cluster"] def get_index_template(self): - template = self.return_json_from_file(FILES[self.template]) + template = self.return_json_from_file(FILES[self.template_name]) template["index_patterns"] = [self.index_pattern] template["settings"]["index.max_result_window"] = self.max_result_window return template @@ -115,12 +110,3 @@ def return_json_from_file(self, path): json_to_dict = json.load(f) return json_to_dict - - -def retrieve_index_template(template): - """This function is used for test configuration""" - with open(str(FILES[template])) as f: - mapping_dict = json.load(f) - template = json.dumps(mapping_dict) - - return template diff --git a/usaspending_api/etl/management/commands/load_budget_authority.py b/usaspending_api/etl/management/commands/load_budget_authority.py index 6b50de0aef..279da47b5e 100644 --- a/usaspending_api/etl/management/commands/load_budget_authority.py +++ b/usaspending_api/etl/management/commands/load_budget_authority.py @@ -13,7 +13,7 @@ from usaspending_api.common.helpers.date_helper import fy from usaspending_api.references.models import OverallTotals, FrecMap -logger = logging.getLogger("console") +logger = logging.getLogger("script") exception_logger = logging.getLogger("exceptions") diff --git a/usaspending_api/etl/management/commands/load_fpds_csv.py b/usaspending_api/etl/management/commands/load_fpds_csv.py index 0adbd81900..cf68985cd6 100644 --- a/usaspending_api/etl/management/commands/load_fpds_csv.py +++ b/usaspending_api/etl/management/commands/load_fpds_csv.py @@ -10,7 +10,7 @@ from usaspending_api.awards.models import TransactionFPDS -logger = logging.getLogger("console") +logger = logging.getLogger("script") class Command(BaseCommand): diff --git a/usaspending_api/etl/management/commands/update_total_funding_amount_sql.py b/usaspending_api/etl/management/commands/update_total_funding_amount_sql.py index 2a4716813f..3e403c0f73 100644 --- a/usaspending_api/etl/management/commands/update_total_funding_amount_sql.py +++ b/usaspending_api/etl/management/commands/update_total_funding_amount_sql.py @@ -10,7 +10,7 @@ from django.db import connection -logger = logging.getLogger("console") +logger = logging.getLogger("script") exception_logger = logging.getLogger("exceptions") diff --git a/usaspending_api/etl/management/sql/dev3627_compare_dabs_row_counts.sql b/usaspending_api/etl/management/sql/dev3627_compare_dabs_row_counts.sql new file mode 100644 index 0000000000..a4378fec00 --- /dev/null +++ b/usaspending_api/etl/management/sql/dev3627_compare_dabs_row_counts.sql @@ -0,0 +1,123 @@ +WITH file_a AS ( + SELECT submission_id, COUNT(aab.*) AS count + FROM submission_attributes AS sa + LEFT OUTER JOIN appropriation_account_balances AS aab USING (submission_id) + GROUP BY submission_id +), file_b AS ( + SELECT submission_id, COUNT(fapaoc.*) AS count + FROM submission_attributes AS sa + LEFT OUTER JOIN 
financial_accounts_by_program_activity_object_class AS fapaoc USING (submission_id) + GROUP BY submission_id +), file_c AS ( + SELECT sa.submission_id, COUNT(faba.*) AS count + FROM submission_attributes AS sa + LEFT OUTER JOIN financial_accounts_by_awards AS faba USING (submission_id) + GROUP BY submission_id +) + +SELECT submission_id, + file_a.count AS usas_file_a, + bs.count_file_a AS broker_file_a, + file_b.count AS usas_file_b, + bs.count_file_b AS broker_file_b, + file_c.count AS usas_file_c, + bs.count_file_c AS broker_file_c, + (bs.count_file_a - file_a.count) + (bs.count_file_b - file_b.count) + (bs.count_file_c - file_c.count) AS discrepency_count +FROM + submission_attributes sa +INNER JOIN file_a USING(submission_id) +INNER JOIN file_b USING(submission_id) +INNER JOIN file_c USING(submission_id) +INNER JOIN dblink( + 'broker_server', + 'WITH subs_with_duplicate_file_b AS ( + SELECT DISTINCT submission_id + FROM certified_object_class_program_activity + WHERE LENGTH(object_class) = 4 + GROUP BY submission_id, tas_id, program_activity_code, object_class, disaster_emergency_fund_code + HAVING COUNT(*) > 1 + ), valid_tas AS ( + SELECT account_num + FROM tas_lookup + WHERE + financial_indicator2 IS DISTINCT FROM ''F'' + ) + SELECT + submission_id, + ( + SELECT COUNT(*) FROM certified_appropriation AS ca + INNER JOIN valid_tas ON valid_tas.account_num = ca.tas_id + WHERE ( + submission_id = s.submission_id + ) + ) AS count_file_a, + CASE + WHEN s.submission_id IN (SELECT submission_id FROM subs_with_duplicate_file_b) + THEN + ( + SELECT COUNT(*) + FROM ( + SELECT 1 + FROM certified_object_class_program_activity AS cocpa + INNER JOIN valid_tas ON valid_tas.account_num = cocpa.tas_id + WHERE ( + submission_id = s.submission_id + ) + GROUP BY + cocpa.submission_id, + cocpa.job_id, + cocpa.agency_identifier, + cocpa.allocation_transfer_agency, + cocpa.availability_type_code, + cocpa.beginning_period_of_availa, + cocpa.ending_period_of_availabil, + cocpa.main_account_code, + RIGHT(cocpa.object_class, 3), + CASE + WHEN length(cocpa.object_class) = 4 AND LEFT(cocpa.object_class, 1) = ''1'' THEN ''D'' + WHEN length(cocpa.object_class) = 4 AND LEFT(cocpa.object_class, 1) = ''2'' THEN ''R'' + ELSE by_direct_reimbursable_fun + END, + cocpa.program_activity_code, + cocpa.program_activity_name, + cocpa.sub_account_code, + cocpa.tas, + cocpa.tas_id, + cocpa.disaster_emergency_fund_code + ) temp_file_b + ) + ELSE + ( + SELECT COUNT(*) + FROM certified_object_class_program_activity AS cocpa + INNER JOIN valid_tas ON valid_tas.account_num = cocpa.tas_id + WHERE ( + submission_id = s.submission_id + ) + ) + END + AS count_file_b, + ( + SELECT COUNT(*) + FROM certified_award_financial AS caf + INNER JOIN valid_tas ON valid_tas.account_num = caf.tas_id + WHERE ( + submission_id = s.submission_id + AND ( + COALESCE(caf.transaction_obligated_amou, 0) != 0 + OR COALESCE(caf.gross_outlay_amount_by_awa_cpe, 0) != 0 + OR COALESCE(caf.ussgl487200_downward_adjus_cpe, 0) != 0 + OR COALESCE(caf.ussgl497200_downward_adjus_cpe, 0) != 0 + ) + ) + ) AS count_file_c + FROM submission AS s + WHERE + s.d2_submission = FALSE + AND s.publish_status_id IN (2, 3)' +) AS bs (submission_id integer, count_file_a integer, count_file_b integer, count_file_c integer) USING (submission_id) +WHERE + file_a.count IS DISTINCT FROM bs.count_file_a + OR file_b.count IS DISTINCT FROM bs.count_file_b + OR file_c.count IS DISTINCT FROM bs.count_file_c +; diff --git a/usaspending_api/etl/submission_loader_helpers/file_a.py 
b/usaspending_api/etl/submission_loader_helpers/file_a.py index f870043c7e..b1dba53cf7 100644 --- a/usaspending_api/etl/submission_loader_helpers/file_a.py +++ b/usaspending_api/etl/submission_loader_helpers/file_a.py @@ -1,9 +1,10 @@ import logging import re +from collections import defaultdict + from usaspending_api.accounts.models import AppropriationAccountBalances from usaspending_api.etl.broker_etl_helpers import dictfetchall -from usaspending_api.etl.submission_loader_helpers.skipped_tas import update_skipped_tas from usaspending_api.etl.management.load_base import load_data_into_model from usaspending_api.etl.submission_loader_helpers.bulk_create_manager import BulkCreateManager from usaspending_api.etl.submission_loader_helpers.treasury_appropriation_account import ( @@ -25,13 +26,7 @@ def get_file_a(submission_attributes, db_cursor): def load_file_a(submission_attributes, appropriation_data, db_cursor): """ Process and load file A broker data (aka TAS balances, aka appropriation account balances). """ reverse = re.compile("gross_outlay_amount_by_tas_cpe") - - # dictionary to capture TAS that were skipped and some metadata - # tas = top-level key - # count = number of rows skipped - # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted - skipped_tas = {} - + skipped_tas = defaultdict(int) # tracks count of rows skipped due to "missing" TAS bulk_treasury_appropriation_account_tas_lookup(appropriation_data, db_cursor) # Create account objects @@ -41,7 +36,7 @@ def load_file_a(submission_attributes, appropriation_data, db_cursor): # Check and see if there is an entry for this TAS treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup(row.get("tas_id")) if treasury_account is None: - update_skipped_tas(row, tas_rendering_label, skipped_tas) + skipped_tas[tas_rendering_label] += 1 continue # Now that we have the account, we can load the appropriation balances @@ -67,11 +62,12 @@ def load_file_a(submission_attributes, appropriation_data, db_cursor): save_manager.save_stragglers() - for key in skipped_tas: - logger.info(f"Skipped {skipped_tas[key]['count']:,} rows due to missing TAS: {key}") + for tas, count in skipped_tas.items(): + logger.info(f"Skipped {count:,} rows due to {tas}") - total_tas_skipped = 0 - for key in skipped_tas: - total_tas_skipped += skipped_tas[key]["count"] + total_tas_skipped = sum([count for count in skipped_tas.values()]) - logger.info(f"Skipped a total of {total_tas_skipped:,} TAS rows for File A") + if total_tas_skipped > 0: + logger.info(f"SKIPPED {total_tas_skipped:,} ROWS of File A (missing TAS)") + else: + logger.info("All File A records in Broker loaded into USAspending") diff --git a/usaspending_api/etl/submission_loader_helpers/file_b.py b/usaspending_api/etl/submission_loader_helpers/file_b.py index 341dc3f05b..512604d06c 100644 --- a/usaspending_api/etl/submission_loader_helpers/file_b.py +++ b/usaspending_api/etl/submission_loader_helpers/file_b.py @@ -1,9 +1,10 @@ import logging import re +from collections import defaultdict + from usaspending_api.accounts.models import AppropriationAccountBalances from usaspending_api.etl.broker_etl_helpers import dictfetchall -from usaspending_api.etl.submission_loader_helpers.skipped_tas import update_skipped_tas from usaspending_api.etl.management.load_base import load_data_into_model from usaspending_api.etl.submission_loader_helpers.bulk_create_manager import BulkCreateManager from 
usaspending_api.etl.submission_loader_helpers.disaster_emergency_fund_codes import get_disaster_emergency_fund @@ -155,13 +156,7 @@ class code but with one of the direct reimbursable flags set to NULL. def load_file_b(submission_attributes, prg_act_obj_cls_data, db_cursor): """ Process and load file B broker data (aka TAS balances by program activity and object class). """ reverse = re.compile(r"(_(cpe|fyb)$)|^transaction_obligated_amount$") - - # dictionary to capture TAS that were skipped and some metadata - # tas = top-level key - # count = number of rows skipped - # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted - skipped_tas = {} - + skipped_tas = defaultdict(int) # tracks count of rows skipped due to "missing" TAS bulk_treasury_appropriation_account_tas_lookup(prg_act_obj_cls_data, db_cursor) save_manager = BulkCreateManager(FinancialAccountsByProgramActivityObjectClass) @@ -169,7 +164,7 @@ def load_file_b(submission_attributes, prg_act_obj_cls_data, db_cursor): # Check and see if there is an entry for this TAS treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup(row.get("tas_id")) if treasury_account is None: - update_skipped_tas(row, tas_rendering_label, skipped_tas) + skipped_tas[tas_rendering_label] += 1 continue # get the corresponding account balances row (aka "File A" record) @@ -196,11 +191,12 @@ def load_file_b(submission_attributes, prg_act_obj_cls_data, db_cursor): save_manager.save_stragglers() - for key in skipped_tas: - logger.info(f"Skipped {skipped_tas[key]['count']:,} rows due to missing TAS: {key}") + for tas, count in skipped_tas.items(): + logger.info(f"Skipped {count:,} rows due to {tas}") - total_tas_skipped = 0 - for key in skipped_tas: - total_tas_skipped += skipped_tas[key]["count"] + total_tas_skipped = sum([count for count in skipped_tas.values()]) - logger.info(f"Skipped a total of {total_tas_skipped:,} TAS rows for File B") + if total_tas_skipped > 0: + logger.info(f"SKIPPED {total_tas_skipped:,} ROWS of File B (missing TAS)") + else: + logger.info("All File B records in Broker loaded into USAspending") diff --git a/usaspending_api/etl/submission_loader_helpers/file_c.py b/usaspending_api/etl/submission_loader_helpers/file_c.py index 6b65805bfe..8a05f83031 100644 --- a/usaspending_api/etl/submission_loader_helpers/file_c.py +++ b/usaspending_api/etl/submission_loader_helpers/file_c.py @@ -3,15 +3,15 @@ import pandas as pd import re -from collections import deque +from collections import deque, defaultdict from datetime import datetime from django.db import connections from django.utils.functional import cached_property + from usaspending_api.awards.models import FinancialAccountsByAwards from usaspending_api.common.helpers.dict_helpers import upper_case_dict_values from usaspending_api.common.helpers.etl_helpers import update_c_to_d_linkages from usaspending_api.etl.broker_etl_helpers import dictfetchall -from usaspending_api.etl.submission_loader_helpers.skipped_tas import update_skipped_tas from usaspending_api.etl.management.load_base import load_data_into_model from usaspending_api.etl.submission_loader_helpers.bulk_create_manager import BulkCreateManager from usaspending_api.etl.submission_loader_helpers.disaster_emergency_fund_codes import get_disaster_emergency_fund @@ -99,13 +99,10 @@ def get_from_where(submission_id): inner join submission s on s.submission_id = c.submission_id where s.submission_id = {submission_id} and ( - ( - c.transaction_obligated_amou is 
not null and - c.transaction_obligated_amou != 0 - ) or ( - c.gross_outlay_amount_by_awa_cpe is not null and - c.gross_outlay_amount_by_awa_cpe != 0 - ) + COALESCE(c.transaction_obligated_amou, 0) != 0 + or COALESCE(c.gross_outlay_amount_by_awa_cpe, 0) != 0 + or COALESCE(c.ussgl487200_downward_adjus_cpe, 0) != 0 + or COALESCE(c.ussgl497200_downward_adjus_cpe, 0) != 0 ) """ @@ -123,20 +120,15 @@ def load_file_c(submission_attributes, db_cursor, certified_award_financial): Note: this should run AFTER the D1 and D2 files are loaded because we try to join to those records to retrieve some additional information about the awarding sub-tier agency. """ - # this matches the file b reverse directive, but am repeating it here to ensure that we don't overwrite it as we - # change up the order of file loading if certified_award_financial.count == 0: logger.warning("No File C (award financial) data found, skipping...") return + # this matches the file b reverse directive, but am repeating it here to ensure that we don't overwrite it as we + # change up the order of file loading reverse = re.compile(r"(_(cpe|fyb)$)|^transaction_obligated_amount$") - - # dictionary to capture TAS that were skipped and some metadata - # tas = top-level key - # count = number of rows skipped - # rows = row numbers skipped, corresponding to the original row numbers in the file that was submitted - skipped_tas = {} + skipped_tas = defaultdict(int) # tracks count of rows skipped due to "missing" TAS total_rows = certified_award_financial.count start_time = datetime.now() @@ -147,14 +139,15 @@ def load_file_c(submission_attributes, db_cursor, certified_award_financial): update_c_to_d_linkages("contract", False, submission_attributes.submission_id) update_c_to_d_linkages("assistance", False, submission_attributes.submission_id) - for key in skipped_tas: - logger.info(f"Skipped {skipped_tas[key]['count']:,} rows due to missing TAS: {key}") + for tas, count in skipped_tas.items(): + logger.info(f"Skipped {count:,} rows due to {tas}") - total_tas_skipped = 0 - for key in skipped_tas: - total_tas_skipped += skipped_tas[key]["count"] + total_tas_skipped = sum([count for count in skipped_tas.values()]) - logger.info(f"Skipped a total of {total_tas_skipped:,} TAS rows for File C") + if total_tas_skipped > 0: + logger.info(f"SKIPPED {total_tas_skipped:,} ROWS of File C (missing TAS)") + else: + logger.info("All File C records in Broker loaded into USAspending") def _save_file_c_rows(certified_award_financial, total_rows, start_time, skipped_tas, submission_attributes, reverse): @@ -168,7 +161,7 @@ def _save_file_c_rows(certified_award_financial, total_rows, start_time, skipped # Check and see if there is an entry for this TAS treasury_account, tas_rendering_label = get_treasury_appropriation_account_tas_lookup(row.get("tas_id")) if treasury_account is None: - update_skipped_tas(row, tas_rendering_label, skipped_tas) + skipped_tas[tas_rendering_label] += 1 continue award_financial_data = FinancialAccountsByAwards() diff --git a/usaspending_api/etl/submission_loader_helpers/skipped_tas.py b/usaspending_api/etl/submission_loader_helpers/skipped_tas.py deleted file mode 100644 index 2ca4c6d77b..0000000000 --- a/usaspending_api/etl/submission_loader_helpers/skipped_tas.py +++ /dev/null @@ -1,8 +0,0 @@ -def update_skipped_tas(row, tas_rendering_label, skipped_tas): - if tas_rendering_label not in skipped_tas: - skipped_tas[tas_rendering_label] = {} - skipped_tas[tas_rendering_label]["count"] = 1 - skipped_tas[tas_rendering_label]["rows"] = 
[row["row_number"]] - else: - skipped_tas[tas_rendering_label]["count"] += 1 - skipped_tas[tas_rendering_label]["rows"] += [row["row_number"]] diff --git a/usaspending_api/etl/submission_loader_helpers/treasury_appropriation_account.py b/usaspending_api/etl/submission_loader_helpers/treasury_appropriation_account.py index 5d8bbcf988..dfcb559fe8 100644 --- a/usaspending_api/etl/submission_loader_helpers/treasury_appropriation_account.py +++ b/usaspending_api/etl/submission_loader_helpers/treasury_appropriation_account.py @@ -28,10 +28,7 @@ def bulk_treasury_appropriation_account_tas_lookup(rows, db_cursor): sub_account_code from tas_lookup where account_num in %s - and ( - financial_indicator2 != 'F' - or financial_indicator2 is null - ) + and financial_indicator2 IS DISTINCT FROM 'F' """, [tas_lookup_ids], ) @@ -63,5 +60,5 @@ def bulk_treasury_appropriation_account_tas_lookup(rows, db_cursor): def get_treasury_appropriation_account_tas_lookup(tas_lookup_id): tas = TAS_ID_TO_ACCOUNT.get(tas_lookup_id) if not tas or not tas[1]: - return None, f"Account number {tas_lookup_id} not found in Broker" + return None, f"TAS Account Number (tas_lookup.account_num) '{tas_lookup_id}' not found in Broker" return tas diff --git a/usaspending_api/etl/tests/data/submission_data.json b/usaspending_api/etl/tests/data/submission_data.json index 5789fb1b9f..2736ace3d2 100644 --- a/usaspending_api/etl/tests/data/submission_data.json +++ b/usaspending_api/etl/tests/data/submission_data.json @@ -13,7 +13,7 @@ "updated_at": "1700-12-31" } ], - "SELECT * FROM tas_lookup WHERE (financial_indicator2 <> 'F' OR financial_indicator2 IS NULL) AND account_num = -99999": [ + "SELECT * FROM tas_lookup WHERE (financial_indicator2 IS DISTINCT FROM 'F') AND account_num = -99999": [ { "allocation_transfer_agency": "999", "agency_identifier": "999", diff --git a/usaspending_api/etl/tests/etl_test_data.json b/usaspending_api/etl/tests/etl_test_data.json index f7df5699c2..d170ec1641 100644 --- a/usaspending_api/etl/tests/etl_test_data.json +++ b/usaspending_api/etl/tests/etl_test_data.json @@ -4137,7 +4137,7 @@ "count": 1 } ], - "SELECT * FROM tas_lookup WHERE (financial_indicator2 <> 'F' OR financial_indicator2 IS NULL) AND account_num = %s": [ + "SELECT * FROM tas_lookup WHERE (financial_indicator2 IS DISTINCT FROM 'F') AND account_num = %s": [ { "sub_account_code": "000", "main_account_code": "0100", diff --git a/usaspending_api/etl/tests/integration/test_elasticsearch_indexer.py b/usaspending_api/etl/tests/integration/test_elasticsearch_indexer.py new file mode 100644 index 0000000000..f53d118590 --- /dev/null +++ b/usaspending_api/etl/tests/integration/test_elasticsearch_indexer.py @@ -0,0 +1,549 @@ +import pytest + +from collections import OrderedDict +from datetime import datetime, timezone, timedelta + +from django.test import override_settings +from elasticsearch import Elasticsearch +from model_mommy import mommy + +from usaspending_api.common.elasticsearch.elasticsearch_sql_helpers import ensure_view_exists +from usaspending_api.conftest_helpers import TestElasticSearchIndex +from usaspending_api.etl.elasticsearch_loader_helpers import set_final_index_config +from usaspending_api.awards.models import Award, TransactionFABS, TransactionFPDS, TransactionNormalized +from usaspending_api.common.helpers.sql_helpers import execute_sql_to_ordered_dictionary +from usaspending_api.etl.elasticsearch_loader_helpers.index_config import ES_AWARDS_UNIQUE_KEY_FIELD +from usaspending_api.etl.management.commands.elasticsearch_indexer 
import ( + Command as ElasticsearchIndexerCommand, + parse_cli_args, +) +from usaspending_api.etl.elasticsearch_loader_helpers import ( + Controller, + delete_awards, + delete_transactions, +) +from usaspending_api.etl.elasticsearch_loader_helpers.delete_data import ( + _check_awards_for_deletes, + _lookup_deleted_award_keys, + delete_docs_by_unique_key, +) + + +@pytest.fixture +def award_data_fixture(db): + fpds_unique_key = "fpds_transaction_id_1".upper() # our ETL UPPERs all these when brought from Broker + mommy.make( + "awards.TransactionNormalized", + id=1, + award_id=1, + action_date="2010-10-01", + is_fpds=True, + type="A", + transaction_unique_id=fpds_unique_key, + ) + mommy.make( + "awards.TransactionFPDS", + detached_award_proc_unique=fpds_unique_key, + transaction_id=1, + legal_entity_zip5="abcde", + piid="IND12PB00323", + legal_entity_county_code="059", + legal_entity_state_code="VA", + legal_entity_congressional="11", + legal_entity_country_code="USA", + place_of_performance_state="VA", + place_of_performance_congr="11", + place_of_perform_country_c="USA", + naics="331122", + product_or_service_code="1510", + type_set_aside="8AN", + type_of_contract_pricing="2", + extent_competed="F", + ) + + fabs_unique_key = "fabs_transaction_id_2".upper() # our ETL UPPERs all these when brought from Broker + mommy.make( + "awards.TransactionNormalized", + id=2, + award_id=2, + action_date="2016-10-01", + is_fpds=False, + type="02", + transaction_unique_id=fabs_unique_key, + ) + mommy.make( + "awards.TransactionFABS", + transaction_id=2, + fain="P063P100612", + cfda_number="84.063", + afa_generated_unique=fabs_unique_key, + ) + + mommy.make("references.ToptierAgency", toptier_agency_id=1, name="Department of Transportation") + mommy.make("references.SubtierAgency", subtier_agency_id=1, name="Department of Transportation") + mommy.make("references.Agency", id=1, toptier_agency_id=1, subtier_agency_id=1) + mommy.make( + "awards.Award", + id=1, + generated_unique_award_id="CONT_AWD_IND12PB00323", + latest_transaction_id=1, + is_fpds=True, + type="A", + piid="IND12PB00323", + description="pop tarts and assorted cereals", + total_obligation=500000.00, + date_signed="2010-10-1", + awarding_agency_id=1, + funding_agency_id=1, + update_date="2012-05-19", + ) + mommy.make( + "awards.Award", + id=2, + generated_unique_award_id="ASST_NON_P063P100612", + latest_transaction_id=2, + is_fpds=False, + type="02", + fain="P063P100612", + total_obligation=1000000.00, + date_signed="2016-10-1", + update_date="2014-07-21", + ) + mommy.make("accounts.FederalAccount", id=1) + mommy.make( + "accounts.TreasuryAppropriationAccount", + treasury_account_identifier=1, + agency_id="097", + main_account_code="4930", + federal_account_id=1, + ) + + mommy.make("awards.FinancialAccountsByAwards", financial_accounts_by_awards_id=1, award_id=1, treasury_account_id=1) + + +def mock_execute_sql(sql, results, verbosity=None): + """SQL method is being mocked here since the `execute_sql_statement` used + doesn't use the same DB connection to avoid multiprocessing errors + """ + return execute_sql_to_ordered_dictionary(sql) + + +def test_create_and_load_new_award_index(award_data_fixture, elasticsearch_award_index, monkeypatch): + """Test the ``elasticsearch_loader`` django management command to create a new awards index and load it + with data from the DB + """ + client = elasticsearch_award_index.client # type: Elasticsearch + + # Ensure index is not yet created + assert not 
client.indices.exists(elasticsearch_award_index.index_name) + original_db_awards_count = Award.objects.count() + + # Inject ETL arg into config for this run, which loads a newly created index + elasticsearch_award_index.etl_config["create_new_index"] = True + es_etl_config = _process_es_etl_test_config(client, elasticsearch_award_index) + + # Must use mock sql function to share test DB conn+transaction in ETL code + # Patching on the module into which it is imported, not the module where it is defined + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.extract_data.execute_sql_statement", mock_execute_sql + ) + # Also override SQL function listed in config object with the mock one + es_etl_config["execute_sql_func"] = mock_execute_sql + loader = Controller(es_etl_config) + assert loader.__class__.__name__ == "Controller" + loader.prepare_for_etl() + loader.dispatch_tasks() + # Along with other things, this will refresh the index, to surface loaded docs + set_final_index_config(client, elasticsearch_award_index.index_name) + + assert client.indices.exists(elasticsearch_award_index.index_name) + es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] + assert es_award_docs == original_db_awards_count + + +def test_create_and_load_new_transaction_index(award_data_fixture, elasticsearch_transaction_index, monkeypatch): + """Test the ``elasticsearch_loader`` django management command to create a new transactions index and load it + with data from the DB + """ + client = elasticsearch_transaction_index.client # type: Elasticsearch + + # Ensure index is not yet created + assert not client.indices.exists(elasticsearch_transaction_index.index_name) + original_db_tx_count = TransactionNormalized.objects.count() + + # Inject ETL arg into config for this run, which loads a newly created index + elasticsearch_transaction_index.etl_config["create_new_index"] = True + es_etl_config = _process_es_etl_test_config(client, elasticsearch_transaction_index) + + # Must use mock sql function to share test DB conn+transaction in ETL code + # Patching on the module into which it is imported, not the module where it is defined + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.extract_data.execute_sql_statement", mock_execute_sql + ) + # Also override SQL function listed in config object with the mock one + es_etl_config["execute_sql_func"] = mock_execute_sql + loader = Controller(es_etl_config) + assert loader.__class__.__name__ == "Controller" + loader.prepare_for_etl() + loader.dispatch_tasks() + # Along with other things, this will refresh the index, to surface loaded docs + set_final_index_config(client, elasticsearch_transaction_index.index_name) + + assert client.indices.exists(elasticsearch_transaction_index.index_name) + es_award_docs = client.count(index=elasticsearch_transaction_index.index_name)["count"] + assert es_award_docs == original_db_tx_count + + +def test_incremental_load_into_award_index(award_data_fixture, elasticsearch_award_index, monkeypatch): + """Test the ``elasticsearch_loader`` django management command to incrementally load updated data into the awards ES + index from the DB, overwriting the doc that was already there + """ + original_db_awards_count = Award.objects.count() + elasticsearch_award_index.update_index() + client = elasticsearch_award_index.client # type: Elasticsearch + assert client.indices.exists(elasticsearch_award_index.index_name) + es_award_docs = 
client.count(index=elasticsearch_award_index.index_name)["count"] + assert es_award_docs == original_db_awards_count + + # Inject ETL arg into config for this run, to suppress processing deletes. Test incremental load only + elasticsearch_award_index.etl_config["process_deletes"] = False + elasticsearch_award_index.etl_config["start_datetime"] = datetime.now(timezone.utc) + es_etl_config = _process_es_etl_test_config(client, elasticsearch_award_index) + + # Now modify one of the DB objects + awd = Award.objects.first() # type: Award + awd.total_obligation = 9999 + awd.save() + + # Must use mock sql function to share test DB conn+transaction in ETL code + # Patching on the module into which it is imported, not the module where it is defined + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.extract_data.execute_sql_statement", mock_execute_sql + ) + # Also override SQL function listed in config object with the mock one + es_etl_config["execute_sql_func"] = mock_execute_sql + ensure_view_exists(es_etl_config["sql_view"], force=True) + loader = Controller(es_etl_config) + assert loader.__class__.__name__ == "Controller" + loader.prepare_for_etl() + loader.dispatch_tasks() + client.indices.refresh(elasticsearch_award_index.index_name) + + assert client.indices.exists(elasticsearch_award_index.index_name) + es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] + assert es_award_docs == original_db_awards_count + es_awards = client.search(index=elasticsearch_award_index.index_name) + updated_award = [a for a in es_awards["hits"]["hits"] if a["_source"]["award_id"] == awd.id][0] + assert int(updated_award["_source"]["total_obligation"]) == 9999 + + +def test_incremental_load_into_transaction_index(award_data_fixture, elasticsearch_transaction_index, monkeypatch): + """Test the ``elasticsearch_loader`` django management command to incrementally load updated data into + the transactions ES index from the DB, overwriting the doc that was already there + """ + original_db_txs_count = TransactionNormalized.objects.count() + elasticsearch_transaction_index.update_index() + client = elasticsearch_transaction_index.client # type: Elasticsearch + assert client.indices.exists(elasticsearch_transaction_index.index_name) + es_tx_docs = client.count(index=elasticsearch_transaction_index.index_name)["count"] + assert es_tx_docs == original_db_txs_count + + # Inject ETL arg into config for this run, to suppress processing deletes. 
Test incremental load only + elasticsearch_transaction_index.etl_config["process_deletes"] = False + elasticsearch_transaction_index.etl_config["start_datetime"] = datetime.now(timezone.utc) + es_etl_config = _process_es_etl_test_config(client, elasticsearch_transaction_index) + + # Now modify one of the DB objects + tx = TransactionNormalized.objects.first() # type: TransactionNormalized + tx.federal_action_obligation = 9999 + tx.save() + + # Must use mock sql function to share test DB conn+transaction in ETL code + # Patching on the module into which it is imported, not the module where it is defined + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.extract_data.execute_sql_statement", mock_execute_sql + ) + # Also override SQL function listed in config object with the mock one + es_etl_config["execute_sql_func"] = mock_execute_sql + ensure_view_exists(es_etl_config["sql_view"], force=True) + loader = Controller(es_etl_config) + assert loader.__class__.__name__ == "Controller" + loader.prepare_for_etl() + loader.dispatch_tasks() + client.indices.refresh(elasticsearch_transaction_index.index_name) + + assert client.indices.exists(elasticsearch_transaction_index.index_name) + es_tx_docs = client.count(index=elasticsearch_transaction_index.index_name)["count"] + assert es_tx_docs == original_db_txs_count + es_txs = client.search(index=elasticsearch_transaction_index.index_name) + updated_tx = [t for t in es_txs["hits"]["hits"] if t["_source"]["transaction_id"] == tx.id][0] + assert int(updated_tx["_source"]["federal_action_obligation"]) == 9999 + + +def test__lookup_deleted_award_keys(award_data_fixture, elasticsearch_transaction_index): + elasticsearch_transaction_index.update_index() + client = elasticsearch_transaction_index.client + ids = _lookup_deleted_award_keys( + client, + "afa_generated_unique", + ["FABS_TRANSACTION_ID_2"], + elasticsearch_transaction_index.etl_config, + index=elasticsearch_transaction_index.index_name, + ) + assert ids == ["ASST_NON_P063P100612"] + + +def test__lookup_deleted_award_keys_multiple_chunks(award_data_fixture, elasticsearch_transaction_index): + """Test that when the lookup requires multiple iterative calls to ES to match values, it still gets all of them""" + elasticsearch_transaction_index.update_index() + client = elasticsearch_transaction_index.client + ids = _lookup_deleted_award_keys( + client, + "generated_unique_transaction_id", + ["CONT_TX_FPDS_TRANSACTION_ID_1", "ASST_TX_FABS_TRANSACTION_ID_2"], + elasticsearch_transaction_index.etl_config, + index=elasticsearch_transaction_index.index_name, + lookup_chunk_size=1, + ) + assert "CONT_AWD_IND12PB00323" in ids and "ASST_NON_P063P100612" in ids + assert len(ids) == 2 + + +def test__lookup_deleted_award_keys_by_int(award_data_fixture, elasticsearch_award_index): + """Looks up awards off of an awards index using award_id as the lookup_key field""" + elasticsearch_award_index.update_index() + client = elasticsearch_award_index.client + ids = _lookup_deleted_award_keys( + client, "award_id", [1], elasticsearch_award_index.etl_config, index=elasticsearch_award_index.index_name + ) + assert ids == ["CONT_AWD_IND12PB00323"] + + +def test_delete_docs_by_unique_key_exceed_max_terms(award_data_fixture, elasticsearch_award_index): + """Verify we restrict attempting to delete more than allowed in a terms query""" + elasticsearch_award_index.update_index() + with pytest.raises(RuntimeError) as exc_info: + delete_docs_by_unique_key( + elasticsearch_award_index.client, + 
ES_AWARDS_UNIQUE_KEY_FIELD, + list(map(str, range(0, 70000))), + "test delete", + elasticsearch_award_index.index_name, + delete_chunk_size=70000, + ) + assert "greater than 65536" in str(exc_info.value) + + +def test_delete_docs_by_unique_key_exceed_max_results_window(award_data_fixture, elasticsearch_award_index): + """Verify that trying to delete more records at once than the index max_results_window will error-out""" + fake_max_results_window = 1 + with override_settings(ES_AWARDS_MAX_RESULT_WINDOW=fake_max_results_window): + elasticsearch_award_index.update_index() + assert elasticsearch_award_index.etl_config["max_query_size"] == fake_max_results_window + from elasticsearch.exceptions import RequestError + + with pytest.raises(RequestError) as exc_info: + delete_docs_by_unique_key( + elasticsearch_award_index.client, + "award_id", + [1, 2], + "test delete", + elasticsearch_award_index.index_name, + delete_chunk_size=10, + ) + assert "Batch size is too large" in str(exc_info.value) + assert "controlled by the [index.max_result_window] index level setting" in str(exc_info.value) + + +def test__check_awards_for_deletes(award_data_fixture, monkeypatch, db): + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.delete_data.execute_sql_statement", mock_execute_sql + ) + id_list = ["CONT_AWD_IND12PB00323"] + awards = _check_awards_for_deletes(id_list) + assert awards == [] + + id_list = ["CONT_AWD_WHATEVER", "CONT_AWD_IND12PB00323"] + awards = _check_awards_for_deletes(id_list) + assert awards == [OrderedDict([("generated_unique_award_id", "CONT_AWD_WHATEVER")])] + + +def test_delete_awards(award_data_fixture, elasticsearch_transaction_index, elasticsearch_award_index, monkeypatch, db): + """Transactions that are logged for delete, that are in the transaction ES index, and their parent awards are NOT + in the DB ... are deleted from the ES awards index + + The current delete approach intakes transaction IDs from the S3 delete log file, looks them up in the transaction + index, gets their unique award key from each transaction, and checks if they were deleted from the DB. If so, THEN + it deletes them from the awards ES index. + """ + elasticsearch_transaction_index.update_index() + elasticsearch_award_index.update_index() + delete_time = datetime.now(timezone.utc) + last_load_time = delete_time - timedelta(hours=12) + + fpds_keys = [ + "CONT_TX_" + key.upper() + for key in TransactionNormalized.objects.filter(is_fpds=True).values_list("transaction_unique_id", flat=True) + ] + fabs_keys = [ + "ASST_TX_" + key.upper() + for key in TransactionNormalized.objects.filter(is_fpds=False).values_list("transaction_unique_id", flat=True) + ] + deleted_tx = {key: {"timestamp": delete_time} for key in fpds_keys + fabs_keys} + # Patch the function that fetches deleted transaction keys from the CSV delete-log file + # in S3, and provide fake transaction keys + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.delete_data._gather_deleted_transaction_keys", + lambda cfg: deleted_tx, + ) + + original_db_awards_count = Award.objects.count() + # Simulate an awards ETL deleting the transactions and awards from the DB. 
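A compressed sketch of the flow the docstring above describes, assembled from helpers that appear elsewhere in this diff (_gather_deleted_transaction_keys, _lookup_deleted_award_keys, _check_awards_for_deletes, delete_docs_by_unique_key); argument lists are abbreviated and approximate, not the exact signatures in delete_data.py:

    from usaspending_api.etl.elasticsearch_loader_helpers.delete_data import (
        _check_awards_for_deletes,
        _gather_deleted_transaction_keys,
        _lookup_deleted_award_keys,
        delete_docs_by_unique_key,
    )
    from usaspending_api.etl.elasticsearch_loader_helpers.index_config import ES_AWARDS_UNIQUE_KEY_FIELD

    # Sketch only -- approximates what delete_awards(client, config) does internally.
    def sketch_delete_awards(client, config, transaction_index, award_index):
        # 1. Keys of deleted transactions, normally read from the S3 CSV delete journal
        deleted_tx = _gather_deleted_transaction_keys(config)
        # 2. Look those transactions up in the transaction index to recover their award keys
        award_keys = _lookup_deleted_award_keys(
            client, "generated_unique_transaction_id", list(deleted_tx), config, index=transaction_index
        )
        # 3. Keep only award keys that no longer exist in the awards table in Postgres
        still_missing = _check_awards_for_deletes(award_keys)
        award_ids = [row["generated_unique_award_id"] for row in still_missing]
        # 4. Remove those award docs from the ES awards index
        return delete_docs_by_unique_key(client, ES_AWARDS_UNIQUE_KEY_FIELD, award_ids, "Delete", award_index)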
+ TransactionNormalized.objects.all().delete() + TransactionFPDS.objects.all().delete() + TransactionFABS.objects.all().delete() + Award.objects.all().delete() + + client = elasticsearch_award_index.client # type: Elasticsearch + es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] + assert es_award_docs == original_db_awards_count + elasticsearch_award_index.etl_config["start_datetime"] = last_load_time + es_etl_config = _process_es_etl_test_config(client, elasticsearch_award_index) + # Must use mock sql function to share test DB conn+transaction in ETL code + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.delete_data.execute_sql_statement", mock_execute_sql + ) + delete_awards(client, es_etl_config) + es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] + assert es_award_docs == 0 + + +def test_delete_awards_zero_for_unmatched_transactions( + award_data_fixture, elasticsearch_transaction_index, elasticsearch_award_index, monkeypatch, db +): + """No awards deleted from the index if their transactions are not found in the transaction index + + If the logged deleted transactions are not found in the transaction index, then there is no way to fetch unique + award keys and remove those from the ES index. + """ + elasticsearch_transaction_index.update_index() + elasticsearch_award_index.update_index() + delete_time = datetime.now(timezone.utc) + last_load_time = delete_time - timedelta(hours=12) + + # Patch the function that fetches deleted transaction keys from the CSV delete-log file + # in S3, and provide fake transaction keys + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.delete_data._gather_deleted_transaction_keys", + lambda cfg: { + "unmatchable_tx_key1": {"timestamp": delete_time}, + "unmatchable_tx_key2": {"timestamp": delete_time}, + "unmatchable_tx_key3": {"timestamp": delete_time}, + }, + ) + + client = elasticsearch_award_index.client # type: Elasticsearch + es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] + assert es_award_docs == Award.objects.count() + elasticsearch_award_index.etl_config["start_datetime"] = last_load_time + es_etl_config = _process_es_etl_test_config(client, elasticsearch_award_index) + delete_count = delete_awards(client, es_etl_config) + assert delete_count == 0 + es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] + assert es_award_docs == Award.objects.count() + + +def test_delete_one_assistance_award( + award_data_fixture, elasticsearch_transaction_index, elasticsearch_award_index, monkeypatch, db +): + """Ensure that transactions not logged for delete don't cause their parent awards to get deleted + + Similar to test that logs all transactions as deleted and deletes all ES awards, but just picking 1 to delete + """ + elasticsearch_transaction_index.update_index() + elasticsearch_award_index.update_index() + delete_time = datetime.now(timezone.utc) + last_load_time = delete_time - timedelta(hours=12) + + # Get FABS transaction with the lowest ID. This ONE will be deleted. 
+ tx = TransactionNormalized.objects.filter(is_fpds=False).order_by("pk").first() # type: TransactionNormalized + fabs_key = "ASST_TX_" + tx.transaction_unique_id.upper() + deleted_tx = {fabs_key: {"timestamp": delete_time}} + # Patch the function that fetches deleted transaction keys from the CSV delete-log file + # in S3, and provide fake transaction keys + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.delete_data._gather_deleted_transaction_keys", + lambda cfg: deleted_tx, + ) + + original_db_awards_count = Award.objects.count() + # Simulate an awards ETL deleting the transactions and awards from the DB. + TransactionFABS.objects.filter(pk=tx.pk).delete() + tx.award.delete() + tx.delete() + + client = elasticsearch_award_index.client # type: Elasticsearch + es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] + assert es_award_docs == original_db_awards_count + elasticsearch_award_index.etl_config["start_datetime"] = last_load_time + es_etl_config = _process_es_etl_test_config(client, elasticsearch_award_index) + # Must use mock sql function to share test DB conn+transaction in ETL code + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.delete_data.execute_sql_statement", mock_execute_sql + ) + delete_count = delete_awards(client, es_etl_config) + assert delete_count == 1 + es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] + assert es_award_docs == original_db_awards_count - 1 + + +def test_delete_one_assistance_transaction(award_data_fixture, elasticsearch_transaction_index, monkeypatch, db): + """Ensure that transactions not logged for delete don't get deleted but those logged for delete do""" + elasticsearch_transaction_index.update_index() + delete_time = datetime.now(timezone.utc) + last_load_time = delete_time - timedelta(hours=12) + + # Get FABS transaction with the lowest ID. This ONE will be deleted. + tx = TransactionNormalized.objects.filter(is_fpds=False).order_by("pk").first() # type: TransactionNormalized + fabs_key = "ASST_TX_" + tx.transaction_unique_id.upper() + deleted_tx = {fabs_key: {"timestamp": delete_time}} + # Patch the function that fetches deleted transaction keys from the CSV delete-log file + # in S3, and provide fake transaction keys + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.delete_data._gather_deleted_transaction_keys", + lambda cfg: deleted_tx, + ) + + original_db_tx_count = TransactionNormalized.objects.count() + # Simulate an awards ETL deleting the transaction and award from the DB. 
+ TransactionFABS.objects.filter(pk=tx.pk).delete() + tx.award.delete() + tx.delete() + + client = elasticsearch_transaction_index.client # type: Elasticsearch + es_award_docs = client.count(index=elasticsearch_transaction_index.index_name)["count"] + assert es_award_docs == original_db_tx_count + elasticsearch_transaction_index.etl_config["start_datetime"] = last_load_time + es_etl_config = _process_es_etl_test_config(client, elasticsearch_transaction_index) + # Must use mock sql function to share test DB conn+transaction in ETL code + monkeypatch.setattr( + "usaspending_api.etl.elasticsearch_loader_helpers.delete_data.execute_sql_statement", mock_execute_sql + ) + delete_count = delete_transactions(client, es_etl_config) + assert delete_count == 1 + es_award_docs = client.count(index=elasticsearch_transaction_index.index_name)["count"] + assert es_award_docs == original_db_tx_count - 1 + + +def _process_es_etl_test_config(client: Elasticsearch, test_es_index: TestElasticSearchIndex): + """Use the Django mgmt cmd to extract args with default values, then update those with test ETL config values""" + cmd = ElasticsearchIndexerCommand() + cmd_name = cmd.__module__.split(".")[-1] # should give "elasticsearch_indexer" unless name changed + parser = cmd.create_parser("", cmd_name) + # Changes dict of arg k-v pairs into a flat list of ordered ["k1", "v1", "k2", "v2" ...] items + list_of_arg_kvs = [["--" + k.replace("_", "-"), str(v)] for k, v in test_es_index.etl_config.items()] + test_args = [arg_item for kvpair in list_of_arg_kvs for arg_item in kvpair] + cli_args, _ = parser.parse_known_args(args=test_args) # parse the known args programmatically + cli_opts = {**vars(cli_args), **test_es_index.etl_config} # update defaults with test config + es_etl_config = parse_cli_args(cli_opts, client) # use command's config parser for final config for testing ETL + return es_etl_config diff --git a/usaspending_api/etl/tests/integration/test_load_multiple_submissions.py b/usaspending_api/etl/tests/integration/test_load_multiple_submissions.py index 71466a8cf3..c67949632e 100644 --- a/usaspending_api/etl/tests/integration/test_load_multiple_submissions.py +++ b/usaspending_api/etl/tests/integration/test_load_multiple_submissions.py @@ -248,25 +248,28 @@ def setUp(self): object_class, gross_outlay_amount_by_awa_cpe, transaction_obligated_amou, + ussgl487200_downward_adjus_cpe, + ussgl497200_downward_adjus_cpe, disaster_emergency_fund_code ) (values - (1, 1, 1, '1101', 11111, 111110, null), - (2, 1, 1, '1101', 22222, 222220, 'B'), - (3, 1, 1, '1101', 33333, 333330, 'L'), - (4, 2, 1, '1101', 44444, 444440, null), - (5, 2, 1, '1101', 55555, 555550, null), - (6, 2, 1, '1101', 66666, 666660, null), - (7, 3, 2, '1101', 77777, 777770, 'L'), - (8, 3, 2, '1101', 88888, 888880, 'L'), - (9, 3, 2, '1101', 99999, 999990, 'L'), - (10, 4, 2, '1101', 10101, 101010, null), - (11, 5, 2, '1101', 11111, 111110, 'B'), - (12, 5, 2, '1101', null, null, 'B'), -- this should not load because of 0/null values - (13, 5, 2, '1101', 0, 0, 'B'), -- this should not load because of 0/null values - (14, 5, 2, '1101', null, 0, 'B'), -- this should not load because of 0/null values - (15, 5, 2, '1101', 0, null, 'B'), -- this should not load because of 0/null values - (16, 6, 2, '1101', 12121, 121210, 'L'), - (17, 7, 2, '1101', 13131, 131310, 'N') + (1, 1, 1, '1101', 11111, 111110, -11, -111, null), + (2, 1, 1, '1101', 22222, 222220, -22, -222, 'B'), + (3, 1, 1, '1101', 33333, 333330, -33, -333, 'L'), + (4, 2, 1, '1101', 44444, 444440, -44, 
-444, null), + (5, 2, 1, '1101', 55555, 555550, -55, -555, null), + (6, 2, 1, '1101', 66666, 666660, -66, -666, null), + (7, 3, 2, '1101', 77777, 777770, -77, -777, 'L'), + (8, 3, 2, '1101', 88888, 888880, -88, -888, 'L'), + (9, 3, 2, '1101', 99999, 999990, -99, -999, 'L'), + (10, 4, 2, '1101', 10101, 101010, -10, -101, null), + (11, 5, 2, '1101', 11111, 111110, 0, 0, 'B'), + (12, 5, 2, '1101', null, null, 0, 0, 'M'), -- this should not load because of 0/null values + (13, 5, 2, '1101', 0, 0, null, 0, 'M'), -- this should not load because of 0/null values + (14, 5, 2, '1101', null, 0, null, 0, 'M'), -- this should not load because of 0/null values + (15, 5, 2, '1101', 0, null, 0, null, 'M'), -- this should not load because of 0/null values + (16, 6, 2, '1101', 12121, 121210, -12, -121, 'L'), + (17, 7, 2, '1101', 13131, 131310, -13, -131, 'N'), + (18, 5, 2, '1101', 0, 0, 0, -1010, 'N') ) """ ) @@ -346,7 +349,7 @@ def test_all_the_things(self): assert SubmissionAttributes.objects.count() == 5 assert AppropriationAccountBalances.objects.count() == 5 assert FinancialAccountsByProgramActivityObjectClass.objects.count() == 7 - assert FinancialAccountsByAwards.objects.count() == 11 + assert FinancialAccountsByAwards.objects.count() == 12 # Now that we have everything loaded, let's make sure our data make sense. with connections[DEFAULT_DB_ALIAS].cursor() as cursor: @@ -395,25 +398,33 @@ def test_all_the_things(self): """ select sum(gross_outlay_amount_by_award_cpe), sum(transaction_obligated_amount), + sum(ussgl487200_down_adj_pri_ppaid_undel_orders_oblig_refund_cpe), + sum(ussgl497200_down_adj_pri_paid_deliv_orders_oblig_refund_cpe), string_agg(disaster_emergency_fund_code, ',' order by disaster_emergency_fund_code) from financial_accounts_by_awards """ ) - assert cursor.fetchone() == (Decimal("-521207.00"), Decimal("-5212070.00"), "B,B,L,L,L,L") + assert cursor.fetchone() == ( + Decimal("-521207.00"), + Decimal("-5212070.00"), + Decimal("505.00"), + Decimal("6106.00"), + "B,B,L,L,L,L,N", + ) # Nuke a submission. SubmissionAttributes.objects.filter(submission_id=1).delete() assert SubmissionAttributes.objects.count() == 4 assert AppropriationAccountBalances.objects.count() == 4 assert FinancialAccountsByProgramActivityObjectClass.objects.count() == 4 - assert FinancialAccountsByAwards.objects.count() == 8 + assert FinancialAccountsByAwards.objects.count() == 9 # Make sure it reloads. call_command("load_multiple_submissions", "--incremental") assert SubmissionAttributes.objects.count() == 5 assert AppropriationAccountBalances.objects.count() == 5 assert FinancialAccountsByProgramActivityObjectClass.objects.count() == 7 - assert FinancialAccountsByAwards.objects.count() == 11 + assert FinancialAccountsByAwards.objects.count() == 12 # Make a change to a submission. SubmissionAttributes.objects.filter(submission_id=1).update(reporting_fiscal_year=1999) @@ -435,7 +446,7 @@ def test_all_the_things(self): assert SubmissionAttributes.objects.count() == 4 assert AppropriationAccountBalances.objects.count() == 4 assert FinancialAccountsByProgramActivityObjectClass.objects.count() == 4 - assert FinancialAccountsByAwards.objects.count() == 8 + assert FinancialAccountsByAwards.objects.count() == 9 # Ok, after all the stuff we just did, let's make sure submissions 2 and 3 never got touched. 
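The File C count changes above (11 to 12, and 8 to 9 after the submission is nuked) follow from the widened filter introduced in file_c.py earlier in this diff: a certified_award_financial row now loads when any of the four amount columns is non-zero after COALESCE, so new row 18 (only ussgl497200 populated) is kept while rows 12-15 (all four amounts 0/null) are still skipped. A small illustration of that inclusion rule, not a new helper in this patch:

    # Illustration only: restates the File C WHERE clause from file_c.py as a row predicate.
    def file_c_row_is_loaded(row: dict) -> bool:
        amount_columns = (
            "transaction_obligated_amou",
            "gross_outlay_amount_by_awa_cpe",
            "ussgl487200_downward_adjus_cpe",
            "ussgl497200_downward_adjus_cpe",
        )
        # COALESCE(col, 0) != 0 for any of the four columns
        return any((row.get(col) or 0) != 0 for col in amount_columns)

    # Row 18 in the test data loads (ussgl497200 = -1010); rows 12-15 do not (all four amounts 0/null).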
assert SubmissionAttributes.objects.get(submission_id=2).update_date == update_date_sub_2 diff --git a/usaspending_api/etl/tests/test_elasticsearch_indexer.py b/usaspending_api/etl/tests/test_elasticsearch_indexer.py deleted file mode 100644 index 8bee318521..0000000000 --- a/usaspending_api/etl/tests/test_elasticsearch_indexer.py +++ /dev/null @@ -1,368 +0,0 @@ -import pytest -from django.conf import settings - -from collections import OrderedDict -from datetime import datetime, timezone -from elasticsearch import Elasticsearch -from model_mommy import mommy -from pathlib import Path - -from usaspending_api.awards.models import Award, TransactionFABS, TransactionFPDS, TransactionNormalized -from usaspending_api.common.elasticsearch.client import instantiate_elasticsearch_client -from usaspending_api.common.helpers.sql_helpers import execute_sql_to_ordered_dictionary -from usaspending_api.common.helpers.text_helpers import generate_random_string - -from usaspending_api.etl.elasticsearch_loader_helpers import ( - Controller, - execute_sql_statement, - transform_award_data, - transform_transaction_data, - delete_awards, - delete_transactions, -) -from usaspending_api.etl.management.commands.elasticsearch_indexer import set_config -from usaspending_api.etl.elasticsearch_loader_helpers.delete_data import ( - _check_awards_for_deletes, - _lookup_deleted_award_ids, -) - - -@pytest.fixture -def award_data_fixture(db): - fpds_unique_key = "fpds_transaction_id_1" - mommy.make( - "awards.TransactionNormalized", - id=1, - award_id=1, - action_date="2010-10-01", - is_fpds=True, - type="A", - transaction_unique_id=fpds_unique_key, - ) - mommy.make( - "awards.TransactionFPDS", - detached_award_proc_unique=fpds_unique_key, - transaction_id=1, - legal_entity_zip5="abcde", - piid="IND12PB00323", - legal_entity_county_code="059", - legal_entity_state_code="VA", - legal_entity_congressional="11", - legal_entity_country_code="USA", - place_of_performance_state="VA", - place_of_performance_congr="11", - place_of_perform_country_c="USA", - naics="331122", - product_or_service_code="1510", - type_set_aside="8AN", - type_of_contract_pricing="2", - extent_competed="F", - ) - - fabs_unique_key = "fabs_transaction_id_2" - mommy.make( - "awards.TransactionNormalized", - id=2, - award_id=2, - action_date="2016-10-01", - is_fpds=False, - type="02", - transaction_unique_id=fabs_unique_key, - ) - mommy.make( - "awards.TransactionFABS", - transaction_id=2, - fain="P063P100612", - cfda_number="84.063", - afa_generated_unique=fabs_unique_key, - ) - - mommy.make("references.ToptierAgency", toptier_agency_id=1, name="Department of Transportation") - mommy.make("references.SubtierAgency", subtier_agency_id=1, name="Department of Transportation") - mommy.make("references.Agency", id=1, toptier_agency_id=1, subtier_agency_id=1) - mommy.make( - "awards.Award", - id=1, - generated_unique_award_id="CONT_AWD_IND12PB00323", - latest_transaction_id=1, - is_fpds=True, - type="A", - piid="IND12PB00323", - description="pop tarts and assorted cereals", - total_obligation=500000.00, - date_signed="2010-10-1", - awarding_agency_id=1, - funding_agency_id=1, - update_date="2012-05-19", - ) - mommy.make( - "awards.Award", - id=2, - generated_unique_award_id="ASST_NON_P063P100612", - latest_transaction_id=2, - is_fpds=False, - type="02", - fain="P063P100612", - total_obligation=1000000.00, - date_signed="2016-10-1", - update_date="2014-07-21", - ) - mommy.make("accounts.FederalAccount", id=1) - mommy.make( - 
"accounts.TreasuryAppropriationAccount", - treasury_account_identifier=1, - agency_id="097", - main_account_code="4930", - federal_account_id=1, - ) - - mommy.make("awards.FinancialAccountsByAwards", financial_accounts_by_awards_id=1, award_id=1, treasury_account_id=1) - - -award_config = { - "create_new_index": True, - "data_type": "award", - "data_transform_func": transform_award_data, - "directory": Path(__file__).resolve().parent, - "fiscal_years": [2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020], - "index_name": f"test-{datetime.now(timezone.utc).strftime('%Y-%m-%d-%H-%M-%S-%f')}-{generate_random_string()}", - "is_incremental_load": False, - "max_query_size": 10000, - "process_deletes": False, - "processing_start_datetime": datetime(2019, 12, 13, 16, 10, 33, 729108, tzinfo=timezone.utc), - "query_alias_prefix": "award-query", - "skip_counts": False, - "snapshot": False, - "starting_date": datetime(2007, 10, 1, 0, 0, tzinfo=timezone.utc), - "unique_key_field": "award_id", - "verbose": False, -} - -transaction_config = { - "base_table": "transaction_normalized", - "base_table_id": "id", - "create_award_type_aliases": True, - "data_transform_func": transform_transaction_data, - "data_type": "transaction", - "execute_sql_func": execute_sql_statement, - "extra_null_partition": False, - "field_for_es_id": "transaction_id", - "initial_datetime": datetime(2019, 12, 13, 16, 10, 33, 729108, tzinfo=timezone.utc), - "max_query_size": 50000, - "optional_predicate": """WHERE "update_date" >= '{starting_date}'""", - "primary_key": "transaction_id", - "query_alias_prefix": "transaction-query", - "required_index_name": settings.ES_TRANSACTIONS_NAME_SUFFIX, - "sql_view": settings.ES_TRANSACTIONS_ETL_VIEW_NAME, - "stored_date_key": "es_transactions", - "unique_key_field": "generated_unique_transaction_id", - "write_alias": settings.ES_TRANSACTIONS_WRITE_ALIAS, -} - -################################################################################ -# Originally the ES ETL would create a new index even if there was no data. -# A few simple changes led to these two tests failing because the entire ETL -# needs to run and that would require monkeypatching the PSQL CSV copy steps -# which would be laborious and fragile. Leaving tests in-place to document this -# testing shortcoming. It may be addressed in the near future (time-permitting) -# if some refactoring occurs and allows more flexibility. 
-- from Aug 2020 -################################################################################ - - -@pytest.mark.skip -def test_es_award_loader_class(award_data_fixture, elasticsearch_award_index, monkeypatch): - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.utilities.execute_sql_statement", mock_execute_sql - ) - elasticsearch_client = instantiate_elasticsearch_client() - loader = Controller(award_config, elasticsearch_client) - assert loader.__class__.__name__ == "Controller" - loader.run_load_steps() - assert elasticsearch_client.indices.exists(award_config["index_name"]) - elasticsearch_client.indices.delete(index=award_config["index_name"], ignore_unavailable=False) - - -@pytest.mark.skip -def test_es_transaction_loader_class(award_data_fixture, elasticsearch_transaction_index, monkeypatch): - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.utilities.execute_sql_statement", mock_execute_sql - ) - elasticsearch_client = instantiate_elasticsearch_client() - loader = Controller(transaction_config, elasticsearch_client) - assert loader.__class__.__name__ == "Controller" - loader.run_load_steps() - assert elasticsearch_client.indices.exists(transaction_config["index_name"]) - elasticsearch_client.indices.delete(index=transaction_config["index_name"], ignore_unavailable=False) - - -# SQL method is being mocked here since the `execute_sql_statement` used -# doesn't use the same DB connection to avoid multiprocessing errors -def mock_execute_sql(sql, results, verbosity=None): - return execute_sql_to_ordered_dictionary(sql) - - -def test__lookup_deleted_award_ids(award_data_fixture, elasticsearch_award_index): - elasticsearch_award_index.update_index() - id_list = [{"key": 1, "col": "award_id"}] - client = elasticsearch_award_index.client - ids = _lookup_deleted_award_ids(client, id_list, award_config, index=elasticsearch_award_index.index_name) - assert ids == ["CONT_AWD_IND12PB00323"] - - -def test__check_awards_for_deletes(award_data_fixture, monkeypatch, db): - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.delete_data.execute_sql_statement", mock_execute_sql - ) - id_list = ["CONT_AWD_IND12PB00323"] - awards = _check_awards_for_deletes(id_list) - assert awards == [] - - id_list = ["CONT_AWD_WHATEVER", "CONT_AWD_IND12PB00323"] - awards = _check_awards_for_deletes(id_list) - assert awards == [OrderedDict([("generated_unique_award_id", "CONT_AWD_WHATEVER")])] - - -def test_delete_awards(award_data_fixture, elasticsearch_transaction_index, elasticsearch_award_index, monkeypatch, db): - """Transactions that are logged for delete, that are in the transaction ES index, and their parent awards are NOT - in the DB ... are deleted from the ES awards index - - The current delete approach intakes transaction IDs from the S3 delete log file, looks them up in the transaction - index, gets their unique award key from each transaction, and checks if they were deleted from the DB. If so, THEN - it deletes them from the awards ES index. 
- """ - elasticsearch_transaction_index.update_index() - elasticsearch_award_index.update_index() - - fpds_keys = [ - "CONT_TX_" + key.upper() - for key in TransactionNormalized.objects.filter(is_fpds=True).values_list("transaction_unique_id", flat=True) - ] - fabs_keys = [ - "ASST_TX_" + key.upper() - for key in TransactionNormalized.objects.filter(is_fpds=False).values_list("transaction_unique_id", flat=True) - ] - deleted_tx = {key: {"timestamp": datetime.now()} for key in fpds_keys + fabs_keys} - # Patch the function that fetches deleted transaction keys from the CSV delete-log file - # in S3, and provide fake transaction keys - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.delete_data._gather_deleted_ids", lambda cfg: deleted_tx - ) - - original_db_awards_count = Award.objects.count() - # Simulate an awards ETL deleting the transactions and awards from the DB. - TransactionNormalized.objects.all().delete() - TransactionFPDS.objects.all().delete() - TransactionFABS.objects.all().delete() - Award.objects.all().delete() - - client = elasticsearch_award_index.client # type: Elasticsearch - es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] - assert es_award_docs == original_db_awards_count - es_etl_config = set_config([], elasticsearch_award_index.etl_config) - # Must use mock sql function to share test DB conn+transaction in ETL code - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.delete_data.execute_sql_statement", mock_execute_sql - ) - delete_awards(client, es_etl_config) - es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] - assert es_award_docs == 0 - - -def test_delete_awards_zero_for_unmatched_transactions(award_data_fixture, elasticsearch_award_index, monkeypatch, db): - """No awards deleted from the index if their transactions are not found in the transaction index - - If the logged deleted transactions are not found in the transaction index, then there is no way to fetch unique - award keys and remove those from the ES index. - """ - elasticsearch_award_index.update_index() - - # Patch the function that fetches deleted transaction keys from the CSV delete-log file - # in S3, and provide fake transaction keys - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.delete_data._gather_deleted_ids", - lambda cfg: { - "unmatchable_tx_key1": {"timestamp": datetime.now()}, - "unmatchable_tx_key2": {"timestamp": datetime.now()}, - "unmatchable_tx_key3": {"timestamp": datetime.now()}, - }, - ) - - client = elasticsearch_award_index.client # type: Elasticsearch - es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] - assert es_award_docs == Award.objects.count() - config = set_config([], elasticsearch_award_index.etl_config) - delete_awards(client, config) - es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] - assert es_award_docs == Award.objects.count() - - -def test_delete_one_assistance_award( - award_data_fixture, elasticsearch_transaction_index, elasticsearch_award_index, monkeypatch, db -): - """Ensure that transactions not logged for delete don't cause their parent awards to get deleted - - Similar to test that logs all transactions as deleted and deletes all ES awards, but just picking 1 to delete - """ - elasticsearch_transaction_index.update_index() - elasticsearch_award_index.update_index() - - # Get FABS transaction with the lowest ID. This ONE will be deleted. 
- tx = TransactionNormalized.objects.filter(is_fpds=False).order_by("pk").first() # type: TransactionNormalized - fabs_key = "ASST_TX_" + tx.transaction_unique_id.upper() - deleted_tx = {fabs_key: {"timestamp": datetime.now()}} - # Patch the function that fetches deleted transaction keys from the CSV delete-log file - # in S3, and provide fake transaction keys - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.delete_data._gather_deleted_ids", lambda cfg: deleted_tx - ) - - original_db_awards_count = Award.objects.count() - # Simulate an awards ETL deleting the transactions and awards from the DB. - TransactionFABS.objects.filter(pk=tx.pk).delete() - tx.award.delete() - tx.delete() - - client = elasticsearch_award_index.client # type: Elasticsearch - es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] - assert es_award_docs == original_db_awards_count - es_etl_config = set_config([], elasticsearch_award_index.etl_config) - # Must use mock sql function to share test DB conn+transaction in ETL code - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.delete_data.execute_sql_statement", mock_execute_sql - ) - delete_awards(client, es_etl_config) - es_award_docs = client.count(index=elasticsearch_award_index.index_name)["count"] - assert es_award_docs == original_db_awards_count - 1 - - -def test_delete_one_assistance_transaction(award_data_fixture, elasticsearch_transaction_index, monkeypatch, db): - """Ensure that transactions not logged for delete don't get deleted but those logged for delete do""" - elasticsearch_transaction_index.update_index() - - # Get FABS transaction with the lowest ID. This ONE will be deleted. - tx = TransactionNormalized.objects.filter(is_fpds=False).order_by("pk").first() # type: TransactionNormalized - fabs_key = "ASST_TX_" + tx.transaction_unique_id.upper() - deleted_tx = {fabs_key: {"timestamp": datetime.now()}} - # Patch the function that fetches deleted transaction keys from the CSV delete-log file - # in S3, and provide fake transaction keys - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.delete_data._gather_deleted_ids", lambda cfg: deleted_tx - ) - - original_db_tx_count = TransactionNormalized.objects.count() - # Simulate an awards ETL deleting the transaction and award from the DB. 
- TransactionFABS.objects.filter(pk=tx.pk).delete() - tx.award.delete() - tx.delete() - - client = elasticsearch_transaction_index.client # type: Elasticsearch - es_award_docs = client.count(index=elasticsearch_transaction_index.index_name)["count"] - assert es_award_docs == original_db_tx_count - es_etl_config = set_config([], elasticsearch_transaction_index.etl_config) - # Must use mock sql function to share test DB conn+transaction in ETL code - monkeypatch.setattr( - "usaspending_api.etl.elasticsearch_loader_helpers.delete_data.execute_sql_statement", mock_execute_sql - ) - delete_transactions(client, es_etl_config) - es_award_docs = client.count(index=elasticsearch_transaction_index.index_name)["count"] - assert es_award_docs == original_db_tx_count - 1 diff --git a/usaspending_api/etl/tests/unit/elasticsearch_loader_helpers/test_controller.py b/usaspending_api/etl/tests/unit/elasticsearch_loader_helpers/test_controller.py new file mode 100644 index 0000000000..55a8a6da27 --- /dev/null +++ b/usaspending_api/etl/tests/unit/elasticsearch_loader_helpers/test_controller.py @@ -0,0 +1,212 @@ +from usaspending_api.etl.elasticsearch_loader_helpers import Controller +from math import ceil + + +def test_get_id_range_for_partition_one_records(): + min_id = 1 + max_id = 1 + id_range_item_count = max_id - min_id + 1 # this many individual IDs should be processed for continuous ID range + etl_config = {"partition_size": 10000} + ctrl = Controller(etl_config) + ctrl.min_id = min_id + ctrl.max_id = max_id + ctrl.record_count = id_range_item_count # assume records exist for each ID in range + ctrl.config["partitions"] = ctrl.determine_partitions() + partition_range = range(0, ctrl.config["partitions"]) + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[0]) + assert lower_bound == min_id + assert upper_bound == max_id + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[-1]) + assert lower_bound == min_id + assert upper_bound == max_id + id_set = set(range(min_id, max_id + 1)) + assert _remove_seen_ids(ctrl, id_set) == set({}) + + +def test_get_id_range_for_partition_two_records(): + min_id = 1 + max_id = 2 + id_range_item_count = max_id - min_id + 1 # this many individual IDs should be processed for continuous ID range + etl_config = {"partition_size": 10000} + ctrl = Controller(etl_config) + ctrl.min_id = min_id + ctrl.max_id = max_id + ctrl.record_count = id_range_item_count # assume records exist for each ID in range + ctrl.config["partitions"] = ctrl.determine_partitions() + partition_range = range(0, ctrl.config["partitions"]) + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[0]) + assert lower_bound == min_id + assert upper_bound == max_id + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[-1]) + assert lower_bound == min_id + assert upper_bound == max_id + id_set = set(range(min_id, max_id + 1)) + assert _remove_seen_ids(ctrl, id_set) == set({}) + + +def test_get_id_range_for_partition_with_evenly_divisible(): + """Check all is good when set of records fit evenly into partitions (each partition full)""" + min_id = 1 + max_id = 100 + partition_size = 20 + id_range_item_count = max_id - min_id + 1 # this many individual IDs should be processed for continuous ID range + assert id_range_item_count % partition_size == 0 # evenly divisible + etl_config = {"partition_size": partition_size} + ctrl = Controller(etl_config) + ctrl.min_id = min_id + ctrl.max_id = max_id + ctrl.record_count = 
id_range_item_count # assume records exist for each ID in range + ctrl.config["partitions"] = ctrl.determine_partitions() + assert ctrl.config["partitions"] == ceil(id_range_item_count / partition_size) + partition_range = range(0, ctrl.config["partitions"]) + # First batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[0]) + assert lower_bound == min_id + assert upper_bound == lower_bound + (partition_size - 1) + # Second batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[1]) + assert lower_bound == min_id + partition_size + assert upper_bound == lower_bound + (partition_size - 1) + # Last batch should go all the way up to max_id + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[-1]) + assert lower_bound == (max_id - partition_size + 1) == (min_id + (partition_size * partition_range[-1])) + assert upper_bound == max_id + id_set = set(range(min_id, max_id + 1)) + assert _remove_seen_ids(ctrl, id_set) == set({}) + + +def test_get_id_range_for_partition_with_one_over(): + """Checks that the proper upper and lower bound are retrieved even when the range of IDs leaves only 1 item + in the last partition. There was a bug here before.""" + min_id = 1 + max_id = 101 + partition_size = 20 + id_range_item_count = max_id - min_id + 1 # this many individual IDs should be processed for continuous ID range + assert id_range_item_count % partition_size == 1 # one over the partition size + etl_config = {"partition_size": partition_size} + ctrl = Controller(etl_config) + ctrl.min_id = min_id + ctrl.max_id = max_id + ctrl.record_count = id_range_item_count # assume records exist for each ID in range + ctrl.config["partitions"] = ctrl.determine_partitions() + assert ctrl.config["partitions"] == ceil(id_range_item_count / partition_size) + partition_range = range(0, ctrl.config["partitions"]) + # First batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[0]) + assert lower_bound == min_id + assert upper_bound == lower_bound + (partition_size - 1) + # Second batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[1]) + assert lower_bound == min_id + partition_size + assert upper_bound == lower_bound + (partition_size - 1) + # Last batch should go all the way up to max_id + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[-1]) + assert lower_bound == (min_id + (partition_size * partition_range[-1])) == 101 + assert upper_bound == max_id == 101 + id_set = set(range(min_id, max_id + 1)) + assert _remove_seen_ids(ctrl, id_set) == set({}) + + +def test_get_id_range_for_partition_with_evenly_divisible_partition_size_offset(): + """Checks that the proper upper and lower bound are retrieved even when the range of IDs is evenly divisible by + the partition size. 
There was a bug here before.""" + min_id = 4 + max_id = 6004 + partition_size = 2000 + id_range_item_count = max_id - min_id + 1 # this many individual IDs should be processed for continuous ID range + etl_config = {"partition_size": partition_size} + ctrl = Controller(etl_config) + ctrl.min_id = min_id + ctrl.max_id = max_id + ctrl.record_count = id_range_item_count # assume records exist for each ID in range + ctrl.config["partitions"] = ctrl.determine_partitions() + assert ctrl.config["partitions"] == ceil(id_range_item_count / partition_size) + partition_range = range(0, ctrl.config["partitions"]) + # First batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[0]) + assert lower_bound == min_id + assert upper_bound == lower_bound + (partition_size - 1) + # Second batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[1]) + assert lower_bound == min_id + partition_size + assert upper_bound == lower_bound + (partition_size - 1) + # Last batch should go all the way up to max_id + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[-1]) + assert lower_bound == (min_id + (partition_size * partition_range[-1])) + assert upper_bound == max_id + id_set = set(range(min_id, max_id + 1)) + assert _remove_seen_ids(ctrl, id_set) == set({}) + + +def test_get_id_range_for_partition_with_sparse_range(): + """Checks that the proper upper and lower bound are retrieved even when the range of IDs is evenly divisible by + the partition size. There was a bug here before.""" + min_id = 4 + max_id = 5999 + partition_size = 2000 + id_range_item_count = max_id - min_id + 1 # this many individual IDs should be processed for continuous ID range + record_ids = {4, 5, 7, 99, 101, 120, 1998, 1999, 2000, 2001, 2002, 4444, 5999} + etl_config = {"partition_size": partition_size} + ctrl = Controller(etl_config) + ctrl.min_id = min_id + ctrl.max_id = max_id + ctrl.record_count = len(record_ids) + ctrl.config["partitions"] = ctrl.determine_partitions() + assert ctrl.config["partitions"] == ceil(id_range_item_count / partition_size) + partition_range = range(0, ctrl.config["partitions"]) + # First batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[0]) + assert lower_bound == min_id + assert upper_bound == lower_bound + (partition_size - 1) + # Second batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[1]) + assert lower_bound == min_id + partition_size + assert upper_bound == lower_bound + (partition_size - 1) + # Last batch should go all the way up to max_id + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[-1]) + assert lower_bound == (min_id + (partition_size * partition_range[-1])) + assert upper_bound == max_id + assert _remove_seen_ids(ctrl, record_ids) == set({}) + + +def test_get_id_range_for_partition_with_empty_partitions(): + """Checks that the proper upper and lower bound are retrieved even when the range of IDs is evenly divisible by + the partition size. 
There was a bug here before.""" + min_id = 1 + max_id = 100 + partition_size = 20 + id_range_item_count = max_id - min_id + 1 # this many individual IDs should be processed for continuous ID range + record_ids = {1, 5, 7, 15, 19, 20, 41, 100} + etl_config = {"partition_size": partition_size} + ctrl = Controller(etl_config) + ctrl.min_id = min_id + ctrl.max_id = max_id + ctrl.record_count = len(record_ids) + ctrl.config["partitions"] = ctrl.determine_partitions() + assert ctrl.config["partitions"] == ceil(id_range_item_count / partition_size) + partition_range = range(0, ctrl.config["partitions"]) + # First batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[0]) + assert lower_bound == min_id + assert upper_bound == lower_bound + (partition_size - 1) + # Second batch + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[1]) + assert lower_bound == min_id + partition_size + assert upper_bound == lower_bound + (partition_size - 1) + # Last batch should go all the way up to max_id + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_range[-1]) + assert lower_bound == (min_id + (partition_size * partition_range[-1])) + assert upper_bound == max_id + assert _remove_seen_ids(ctrl, record_ids) == set({}) + + +def _remove_seen_ids(ctrl, id_set): + """Iterates through each bounded id-range, and removes IDs seen""" + partition_range = range(0, ctrl.config["partitions"]) + unseen_ids = id_set.copy() + for partition_idx in partition_range: + lower_bound, upper_bound = ctrl.get_id_range_for_partition(partition_idx) + for seen_id in id_set: + if lower_bound <= seen_id <= upper_bound: + unseen_ids.remove(seen_id) + return unseen_ids diff --git a/usaspending_api/etl/tests/unit/elasticsearch_loader_helpers/test_utilities.py b/usaspending_api/etl/tests/unit/elasticsearch_loader_helpers/test_utilities.py new file mode 100644 index 0000000000..8a6d43a3d1 --- /dev/null +++ b/usaspending_api/etl/tests/unit/elasticsearch_loader_helpers/test_utilities.py @@ -0,0 +1,36 @@ +from usaspending_api.etl.elasticsearch_loader_helpers.utilities import is_snapshot_running + + +def test_is_snapshot_running(monkeypatch): + class MockSnapshot: + def status(self): + return { + "snapshots": [ + {"snapshot": "test_snapshot", "indices": {"2021-02-12-covid19-faba": {}, "2021-02-12-awards": {}}} + ] + } + + class MockClient: + snapshot = MockSnapshot() + + mock_client = MockClient() + + # snapshot running for index + index_names = ["2021-02-12-covid19-faba"] + result = is_snapshot_running(mock_client, index_names) + assert result + + # snapshot not running for index + index_names = ["2021-02-12-transactions"] + result = is_snapshot_running(mock_client, index_names) + assert not result + + # one of two indexes overlap with snapshot + index_names = ["2021-02-12-awards", "2021-02-12-transactions"] + result = is_snapshot_running(mock_client, index_names) + assert result + + # one of two indexes overlap with snapshot (reverse order) + index_names = ["2021-02-12-transactions", "2021-02-12-awards"] + result = is_snapshot_running(mock_client, index_names) + assert result diff --git a/usaspending_api/etl/transaction_loaders/data_load_helpers.py b/usaspending_api/etl/transaction_loaders/data_load_helpers.py index 32271b7ecf..5cb43233e4 100644 --- a/usaspending_api/etl/transaction_loaders/data_load_helpers.py +++ b/usaspending_api/etl/transaction_loaders/data_load_helpers.py @@ -7,7 +7,7 @@ from django.conf import settings -logger = logging.getLogger("console") +logger = 
logging.getLogger("script") def capitalize_if_string(val): diff --git a/usaspending_api/etl/transaction_loaders/fpds_loader.py b/usaspending_api/etl/transaction_loaders/fpds_loader.py index b9c821da0d..7171a1cd9e 100644 --- a/usaspending_api/etl/transaction_loaders/fpds_loader.py +++ b/usaspending_api/etl/transaction_loaders/fpds_loader.py @@ -24,7 +24,7 @@ from usaspending_api.common.helpers.timing_helpers import ConsoleTimer as Timer -logger = logging.getLogger("console") +logger = logging.getLogger("script") failed_ids = [] diff --git a/usaspending_api/recipient/management/commands/load_state_data.py b/usaspending_api/recipient/management/commands/load_state_data.py index 8538266797..235fe1f654 100644 --- a/usaspending_api/recipient/management/commands/load_state_data.py +++ b/usaspending_api/recipient/management/commands/load_state_data.py @@ -9,7 +9,7 @@ from django.conf import settings from django.core.management.base import BaseCommand -logger = logging.getLogger("console") +logger = logging.getLogger("script") LOCAL_STATE_DATA_FILENAME = "CensusStateData.csv" LOCAL_STATE_DATA = str(settings.APP_DIR / "data" / LOCAL_STATE_DATA_FILENAME) diff --git a/usaspending_api/recipient/management/commands/update_parent_duns_fabs.py b/usaspending_api/recipient/management/commands/update_parent_duns_fabs.py index 3f99a58ed4..9755691aa2 100644 --- a/usaspending_api/recipient/management/commands/update_parent_duns_fabs.py +++ b/usaspending_api/recipient/management/commands/update_parent_duns_fabs.py @@ -4,7 +4,7 @@ from django.db import connection from django.core.management.base import BaseCommand -logger = logging.getLogger("console") +logger = logging.getLogger("script") FABS_PARENT_DUNS_SQL_MATCH = """ WITH joined_historical_fabs AS ( diff --git a/usaspending_api/references/management/commands/load_download_static_data.py b/usaspending_api/references/management/commands/load_download_static_data.py index c1c882d034..b06e25ca66 100644 --- a/usaspending_api/references/management/commands/load_download_static_data.py +++ b/usaspending_api/references/management/commands/load_download_static_data.py @@ -13,7 +13,7 @@ class Command(BaseCommand): @transaction.atomic def handle(self, *args, **options): - logger = logging.getLogger("console") + logger = logging.getLogger("script") JobStatus.objects.all().delete() for status in lookups.JOB_STATUS: diff --git a/usaspending_api/references/management/commands/load_glossary.py b/usaspending_api/references/management/commands/load_glossary.py index 15b5d9aa96..36e49d79b7 100644 --- a/usaspending_api/references/management/commands/load_glossary.py +++ b/usaspending_api/references/management/commands/load_glossary.py @@ -10,7 +10,7 @@ class Command(BaseCommand): help = "Loads an Excel spreadsheet of USAspending terminology definitions into the Glossary model" - logger = logging.getLogger("console") + logger = logging.getLogger("script") default_path = str(settings.APP_DIR / "data" / "USAspendingGlossary.xlsx") @@ -25,7 +25,7 @@ def handle(self, *args, **options): def load_glossary(path, append): - logger = logging.getLogger("console") + logger = logging.getLogger("script") wb = load_workbook(filename=path) ws = wb.active diff --git a/usaspending_api/references/management/commands/load_naics.py b/usaspending_api/references/management/commands/load_naics.py index 7cd0e3a858..0bf030de51 100644 --- a/usaspending_api/references/management/commands/load_naics.py +++ b/usaspending_api/references/management/commands/load_naics.py @@ -13,7 +13,7 @@ class 
Command(BaseCommand): help = "Updates DB from Excel spreadsheets of USAspending terminology definitions into the naics model" - logger = logging.getLogger("console") + logger = logging.getLogger("script") default_path = str(settings.APP_DIR / "data" / "naics_archive") @@ -72,7 +72,7 @@ def load_single_naics(naics_code, naics_year, naics_desc): @transaction.atomic def load_naics(path, append): - logger = logging.getLogger("console") + logger = logging.getLogger("script") if append: logger.info("Appending definitions to existing guide") diff --git a/usaspending_api/references/management/commands/load_object_classes.py b/usaspending_api/references/management/commands/load_object_classes.py index ff3e53b53c..f1a8ba0b78 100644 --- a/usaspending_api/references/management/commands/load_object_classes.py +++ b/usaspending_api/references/management/commands/load_object_classes.py @@ -30,7 +30,7 @@ ); """ -logger = logging.getLogger("console") +logger = logging.getLogger("script") RawObjectClass = namedtuple("RawObjectClass", ["row_number", "object_class", "object_class_name"]) diff --git a/usaspending_api/references/management/commands/load_program_activity.py b/usaspending_api/references/management/commands/load_program_activity.py index 2de59bbe13..b3dc99fa23 100644 --- a/usaspending_api/references/management/commands/load_program_activity.py +++ b/usaspending_api/references/management/commands/load_program_activity.py @@ -16,7 +16,7 @@ class Command(BaseCommand): help = "Loads program activity codes." - logger = logging.getLogger("console") + logger = logging.getLogger("script") def add_arguments(self, parser): parser.add_argument("file", nargs="?", help="the file to load") diff --git a/usaspending_api/references/management/commands/load_psc.py b/usaspending_api/references/management/commands/load_psc.py index 6446e1b119..1672f22045 100644 --- a/usaspending_api/references/management/commands/load_psc.py +++ b/usaspending_api/references/management/commands/load_psc.py @@ -8,7 +8,7 @@ class Command(BaseCommand): help = "Loads program information obtained from Excel file on https://www.acquisition.gov/PSC_Manual" - logger = logging.getLogger("console") + logger = logging.getLogger("script") default_directory = os.path.normpath("usaspending_api/references/management/commands/") default_filepath = os.path.join(default_directory, "PSC_Data_June_2019_Edition_FINAL_6-20-19+DRW.xlsx") @@ -29,7 +29,7 @@ def load_psc(fullpath, update): Create/Update Product or Service Code records from a Excel doc of historical data. 
""" try: - logger = logging.getLogger("console") + logger = logging.getLogger("script") wb = load_workbook(filename=fullpath, data_only=True) ws = wb.active for current_row, row in enumerate(ws.rows): diff --git a/usaspending_api/references/management/commands/load_reference_csv.py b/usaspending_api/references/management/commands/load_reference_csv.py index e02fbe4e87..591bddf451 100644 --- a/usaspending_api/references/management/commands/load_reference_csv.py +++ b/usaspending_api/references/management/commands/load_reference_csv.py @@ -5,7 +5,7 @@ from usaspending_api.references.models import RefCountryCode, ObjectClass, RefProgramActivity -logger = logging.getLogger("console") +logger = logging.getLogger("script") class Command(BaseCommand): diff --git a/usaspending_api/references/management/commands/load_reference_data.py b/usaspending_api/references/management/commands/load_reference_data.py index fdc1fdae70..8af797c8d3 100644 --- a/usaspending_api/references/management/commands/load_reference_data.py +++ b/usaspending_api/references/management/commands/load_reference_data.py @@ -10,7 +10,7 @@ class Command(BaseCommand): creating a new database. This command should be run in the same \ directory where manage.py is located" - logger = logging.getLogger("console") + logger = logging.getLogger("script") def handle(self, *args, **options): self.logger.info("Beginning reference data loading. This may take a few minutes.") diff --git a/usaspending_api/references/management/commands/load_rosetta.py b/usaspending_api/references/management/commands/load_rosetta.py index 117a99891b..4688c5a3bc 100644 --- a/usaspending_api/references/management/commands/load_rosetta.py +++ b/usaspending_api/references/management/commands/load_rosetta.py @@ -12,7 +12,7 @@ from usaspending_api.common.retrieve_file_from_uri import RetrieveFileFromUri from usaspending_api.references.models import Rosetta -logger = logging.getLogger("console") +logger = logging.getLogger("script") EXCEL_COLUMNS = [ diff --git a/usaspending_api/references/management/commands/load_tas.py b/usaspending_api/references/management/commands/load_tas.py index d1bafe172e..1278fab600 100644 --- a/usaspending_api/references/management/commands/load_tas.py +++ b/usaspending_api/references/management/commands/load_tas.py @@ -23,7 +23,7 @@ update_federal_accounts, ) -logger = logging.getLogger("console") +logger = logging.getLogger("script") TAS_SQL_PATH = "usaspending_api/references/management/sql/restock_tas.sql" diff --git a/usaspending_api/references/management/commands/loadcfda.py b/usaspending_api/references/management/commands/loadcfda.py index c8e8bfbafb..455c72a891 100644 --- a/usaspending_api/references/management/commands/loadcfda.py +++ b/usaspending_api/references/management/commands/loadcfda.py @@ -13,7 +13,7 @@ from usaspending_api.references.models import Cfda -logger = logging.getLogger("console") +logger = logging.getLogger("script") Reporter = OpsReporter(iso_start_datetime=datetime.now(timezone.utc).isoformat(), job_name="loadcfda.py") diff --git a/usaspending_api/reporting/tests/integration/test_agencies_overview.py b/usaspending_api/reporting/tests/integration/test_agencies_overview.py index 42e090d6ae..f60e2f955f 100644 --- a/usaspending_api/reporting/tests/integration/test_agencies_overview.py +++ b/usaspending_api/reporting/tests/integration/test_agencies_overview.py @@ -13,7 +13,8 @@ url = "/api/v2/reporting/agencies/overview/" CURRENT_FISCAL_YEAR = current_fiscal_year() -CURRENT_LAST_PERIOD = 
get_final_period_of_quarter(calculate_last_completed_fiscal_quarter(CURRENT_FISCAL_YEAR)) or 3 +CURRENT_LAST_QUARTER = calculate_last_completed_fiscal_quarter(CURRENT_FISCAL_YEAR) or 1 +CURRENT_LAST_PERIOD = get_final_period_of_quarter(CURRENT_LAST_QUARTER) or 3 assurance_statement_1 = ( f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/Raw%20DATA%20Act%20Files/" @@ -26,8 +27,8 @@ ) assurance_statement_3 = ( f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/Raw%20DATA%20Act%20Files/" - f"{CURRENT_FISCAL_YEAR}/P{CURRENT_LAST_PERIOD:02}/001%20-%20Test%20Agency%203%20(AAA)/" - f"{CURRENT_FISCAL_YEAR}-P{CURRENT_LAST_PERIOD:02}-001_Test%20Agency%203%20(AAA)-Assurance_Statement.txt" + f"{CURRENT_FISCAL_YEAR}/Q{CURRENT_LAST_QUARTER}/001%20-%20Test%20Agency%203%20(AAA)/" + f"{CURRENT_FISCAL_YEAR}-Q{CURRENT_LAST_QUARTER}-001_Test%20Agency%203%20(AAA)-Assurance_Statement.txt" ) @@ -35,13 +36,31 @@ def setup_test_data(db): """ Insert data into DB for testing """ sub = mommy.make( - "submissions.SubmissionAttributes", submission_id=1, reporting_fiscal_year=2019, reporting_fiscal_period=6 + "submissions.SubmissionAttributes", + submission_id=1, + toptier_code="123", + quarter_format_flag=False, + reporting_fiscal_year=2019, + reporting_fiscal_period=6, + published_date="2019-07-03", ) sub2 = mommy.make( "submissions.SubmissionAttributes", submission_id=2, + toptier_code="987", + quarter_format_flag=False, + reporting_fiscal_year=CURRENT_FISCAL_YEAR, + reporting_fiscal_period=CURRENT_LAST_PERIOD, + published_date=f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07", + ) + sub3 = mommy.make( + "submissions.SubmissionAttributes", + submission_id=3, + toptier_code="001", + quarter_format_flag=True, reporting_fiscal_year=CURRENT_FISCAL_YEAR, reporting_fiscal_period=CURRENT_LAST_PERIOD, + published_date=f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07", ) mommy.make("references.Agency", id=1, toptier_agency_id=1, toptier_flag=True) mommy.make("references.Agency", id=2, toptier_agency_id=2, toptier_flag=True) @@ -88,8 +107,8 @@ def setup_test_data(db): ] approps = [ {"sub_id": sub.submission_id, "treasury_account": treas_accounts[0], "total_resources": 50}, - {"sub_id": sub.submission_id, "treasury_account": treas_accounts[1], "total_resources": 12}, - {"sub_id": sub2.submission_id, "treasury_account": treas_accounts[1], "total_resources": 29}, + {"sub_id": sub3.submission_id, "treasury_account": treas_accounts[1], "total_resources": 12}, + {"sub_id": sub3.submission_id, "treasury_account": treas_accounts[1], "total_resources": 29}, {"sub_id": sub2.submission_id, "treasury_account": treas_accounts[2], "total_resources": 15.5}, ] for approp in approps: @@ -137,22 +156,30 @@ def setup_test_data(db): mommy.make( "reporting.ReportingAgencyOverview", reporting_agency_overview_id=1, - toptier_code=123, + toptier_code="123", fiscal_year=2019, fiscal_period=6, total_dollars_obligated_gtas=1788370.03, total_budgetary_resources=22478810.97, total_diff_approp_ocpa_obligated_amounts=84931.95, + unlinked_procurement_c_awards=1, + unlinked_assistance_c_awards=2, + unlinked_procurement_d_awards=3, + unlinked_assistance_d_awards=4, ) mommy.make( "reporting.ReportingAgencyOverview", reporting_agency_overview_id=2, - toptier_code=987, + toptier_code="987", fiscal_year=CURRENT_FISCAL_YEAR, fiscal_period=CURRENT_LAST_PERIOD, total_dollars_obligated_gtas=18.6, total_budgetary_resources=100, total_diff_approp_ocpa_obligated_amounts=0, + unlinked_procurement_c_awards=10, + unlinked_assistance_c_awards=20, + 
unlinked_procurement_d_awards=30, + unlinked_assistance_d_awards=40, ) mommy.make( "reporting.ReportingAgencyOverview", @@ -163,10 +190,14 @@ def setup_test_data(db): total_dollars_obligated_gtas=20.0, total_budgetary_resources=10.0, total_diff_approp_ocpa_obligated_amounts=10.0, + unlinked_procurement_c_awards=100, + unlinked_assistance_c_awards=200, + unlinked_procurement_d_awards=300, + unlinked_assistance_d_awards=400, ) mommy.make( "reporting.ReportingAgencyMissingTas", - toptier_code=123, + toptier_code="123", fiscal_year=2019, fiscal_period=6, tas_rendering_label="TAS 1", @@ -174,7 +205,7 @@ def setup_test_data(db): ) mommy.make( "reporting.ReportingAgencyMissingTas", - toptier_code=123, + toptier_code="123", fiscal_year=2019, fiscal_period=6, tas_rendering_label="TAS 2", @@ -182,19 +213,27 @@ def setup_test_data(db): ) mommy.make( "reporting.ReportingAgencyMissingTas", - toptier_code=987, + toptier_code="987", fiscal_year=CURRENT_FISCAL_YEAR, fiscal_period=CURRENT_LAST_PERIOD, tas_rendering_label="TAS 2", obligated_amount=12.0, ) + mommy.make( + "reporting.ReportingAgencyMissingTas", + toptier_code="987", + fiscal_year=current_fiscal_year(), + fiscal_period=get_final_period_of_quarter(calculate_last_completed_fiscal_quarter(current_fiscal_year())) or 3, + tas_rendering_label="TAS 3", + obligated_amount=0, + ) def test_basic_success(setup_test_data, client): resp = client.get(url) assert resp.status_code == status.HTTP_200_OK response = resp.json() - assert len(response["results"]) == 2 + assert len(response["results"]) == 3 expected_results = [ { "agency_name": "Test Agency 2", @@ -202,7 +241,7 @@ def test_basic_success(setup_test_data, client): "toptier_code": "987", "agency_id": 2, "current_total_budget_authority_amount": 100.0, - "recent_publication_date": None, + "recent_publication_date": f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 18.6, @@ -211,8 +250,8 @@ def test_basic_success(setup_test_data, client): "missing_tas_accounts_count": 1, }, "obligation_difference": 0.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, "assurance_statement_url": assurance_statement_2, }, { @@ -221,7 +260,7 @@ def test_basic_success(setup_test_data, client): "toptier_code": "001", "agency_id": 3, "current_total_budget_authority_amount": 10.0, - "recent_publication_date": None, + "recent_publication_date": f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 20.0, @@ -230,10 +269,29 @@ def test_basic_success(setup_test_data, client): "missing_tas_accounts_count": 0, }, "obligation_difference": 10.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, "assurance_statement_url": assurance_statement_3, }, + { + "agency_name": "Test Agency", + "abbreviation": "ABC", + "toptier_code": "123", + "agency_id": 1, + "current_total_budget_authority_amount": None, + "recent_publication_date": None, + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": None, + "tas_accounts_total": None, + "tas_obligation_not_in_gtas_total": 0.0, + "missing_tas_accounts_count": 0, + }, + 
"obligation_difference": None, + "unlinked_contract_award_count": None, + "unlinked_assistance_award_count": None, + "assurance_statement_url": None, + }, ] assert response["results"] == expected_results @@ -246,7 +304,7 @@ def test_filter(setup_test_data, client): "toptier_code": "987", "agency_id": 2, "current_total_budget_authority_amount": 100.0, - "recent_publication_date": None, + "recent_publication_date": f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 18.6, @@ -255,8 +313,8 @@ def test_filter(setup_test_data, client): "missing_tas_accounts_count": 1, }, "obligation_difference": 0.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, "assurance_statement_url": assurance_statement_2, } ] @@ -286,7 +344,7 @@ def test_pagination(setup_test_data, client): "toptier_code": "987", "agency_id": 2, "current_total_budget_authority_amount": 100.0, - "recent_publication_date": None, + "recent_publication_date": f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 18.6, @@ -295,8 +353,8 @@ def test_pagination(setup_test_data, client): "missing_tas_accounts_count": 1, }, "obligation_difference": 0.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, "assurance_statement_url": assurance_statement_2, } ] @@ -313,7 +371,7 @@ def test_pagination(setup_test_data, client): "toptier_code": "001", "agency_id": 3, "current_total_budget_authority_amount": 10.0, - "recent_publication_date": None, + "recent_publication_date": "2021-07-07T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 20.0, @@ -322,17 +380,17 @@ def test_pagination(setup_test_data, client): "missing_tas_accounts_count": 0, }, "obligation_difference": 10.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, "assurance_statement_url": assurance_statement_3, - } + }, ] assert response["results"] == expected_results resp = client.get(url + "?sort=obligation_difference&order=desc") assert resp.status_code == status.HTTP_200_OK response = resp.json() - assert len(response["results"]) == 2 + assert len(response["results"]) == 3 expected_results = [ { "agency_name": "Test Agency 3", @@ -340,7 +398,7 @@ def test_pagination(setup_test_data, client): "toptier_code": "001", "agency_id": 3, "current_total_budget_authority_amount": 10.0, - "recent_publication_date": None, + "recent_publication_date": f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 20.0, @@ -349,8 +407,8 @@ def test_pagination(setup_test_data, client): "missing_tas_accounts_count": 0, }, "obligation_difference": 10.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, "assurance_statement_url": assurance_statement_3, }, { @@ -359,8 +417,54 @@ def test_pagination(setup_test_data, client): "toptier_code": "987", "agency_id": 2, 
"current_total_budget_authority_amount": 100.0, + "recent_publication_date": f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 18.6, + "tas_accounts_total": 100.00, + "tas_obligation_not_in_gtas_total": 12.0, + "missing_tas_accounts_count": 1, + }, + "obligation_difference": 0.0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, + "assurance_statement_url": assurance_statement_2, + }, + { + "agency_name": "Test Agency", + "abbreviation": "ABC", + "toptier_code": "123", + "agency_id": 1, + "current_total_budget_authority_amount": None, "recent_publication_date": None, "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": None, + "tas_accounts_total": None, + "tas_obligation_not_in_gtas_total": 0.0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": None, + "unlinked_contract_award_count": None, + "unlinked_assistance_award_count": None, + "assurance_statement_url": None, + }, + ] + assert response["results"] == expected_results + + resp = client.get(url + "?sort=unlinked_assistance_award_count&order=asc") + assert resp.status_code == status.HTTP_200_OK + response = resp.json() + assert len(response["results"]) == 3 + expected_results = [ + { + "agency_name": "Test Agency 2", + "abbreviation": "XYZ", + "toptier_code": "987", + "agency_id": 2, + "current_total_budget_authority_amount": 100.0, + "recent_publication_date": f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07T00:00:00Z", + "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 18.6, "tas_accounts_total": 100.00, @@ -368,10 +472,48 @@ def test_pagination(setup_test_data, client): "missing_tas_accounts_count": 1, }, "obligation_difference": 0.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, "assurance_statement_url": assurance_statement_2, }, + { + "agency_name": "Test Agency 3", + "abbreviation": "AAA", + "toptier_code": "001", + "agency_id": 3, + "current_total_budget_authority_amount": 10.0, + "recent_publication_date": f"{CURRENT_FISCAL_YEAR}-{CURRENT_LAST_PERIOD+1:02}-07T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 20.0, + "tas_accounts_total": 100.00, + "tas_obligation_not_in_gtas_total": 0.0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": 10.0, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, + "assurance_statement_url": assurance_statement_3, + }, + { + "agency_name": "Test Agency", + "abbreviation": "ABC", + "toptier_code": "123", + "agency_id": 1, + "current_total_budget_authority_amount": None, + "recent_publication_date": None, + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": None, + "tas_accounts_total": None, + "tas_obligation_not_in_gtas_total": 0.0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": None, + "unlinked_contract_award_count": None, + "unlinked_assistance_award_count": None, + "assurance_statement_url": None, + }, ] assert response["results"] == expected_results @@ -380,7 +522,7 @@ def test_fiscal_year_period_selection(setup_test_data, client): resp = client.get(url + "?fiscal_year=2019&fiscal_period=6") 
assert resp.status_code == status.HTTP_200_OK response = resp.json() - assert len(response["results"]) == 1 + assert len(response["results"]) == 3 expected_results = [ { @@ -389,7 +531,7 @@ def test_fiscal_year_period_selection(setup_test_data, client): "toptier_code": "123", "agency_id": 1, "current_total_budget_authority_amount": 22478810.97, - "recent_publication_date": None, + "recent_publication_date": "2019-07-03T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 1788370.03, @@ -398,9 +540,47 @@ def test_fiscal_year_period_selection(setup_test_data, client): "missing_tas_accounts_count": 2, }, "obligation_difference": 84931.95, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, "assurance_statement_url": assurance_statement_1, - } + }, + { + "agency_name": "Test Agency 2", + "abbreviation": "XYZ", + "toptier_code": "987", + "agency_id": 2, + "current_total_budget_authority_amount": None, + "recent_publication_date": None, + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": None, + "tas_accounts_total": None, + "tas_obligation_not_in_gtas_total": 0.0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": None, + "unlinked_contract_award_count": None, + "unlinked_assistance_award_count": None, + "assurance_statement_url": None, + }, + { + "agency_name": "Test Agency 3", + "abbreviation": "AAA", + "toptier_code": "001", + "agency_id": 3, + "current_total_budget_authority_amount": None, + "recent_publication_date": None, + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": None, + "tas_accounts_total": None, + "tas_obligation_not_in_gtas_total": 0.0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": None, + "unlinked_contract_award_count": None, + "unlinked_assistance_award_count": None, + "assurance_statement_url": None, + }, ] assert response["results"] == expected_results diff --git a/usaspending_api/reporting/tests/integration/test_agency_code_overview.py b/usaspending_api/reporting/tests/integration/test_agency_code_overview.py index 83b5095639..afb3390fce 100644 --- a/usaspending_api/reporting/tests/integration/test_agency_code_overview.py +++ b/usaspending_api/reporting/tests/integration/test_agency_code_overview.py @@ -13,10 +13,22 @@ def setup_test_data(db): """ Insert data into DB for testing """ sub = mommy.make( - "submissions.SubmissionAttributes", submission_id=1, reporting_fiscal_year=2019, reporting_fiscal_period=6 + "submissions.SubmissionAttributes", + submission_id=1, + toptier_code="123", + quarter_format_flag=False, + reporting_fiscal_year=2019, + reporting_fiscal_period=6, + published_date="2019-07-03", ) sub2 = mommy.make( - "submissions.SubmissionAttributes", submission_id=2, reporting_fiscal_year=2020, reporting_fiscal_period=12 + "submissions.SubmissionAttributes", + submission_id=2, + toptier_code="123", + quarter_format_flag=False, + reporting_fiscal_year=2020, + reporting_fiscal_period=12, + published_date="2021-02-11", ) agency = mommy.make("references.ToptierAgency", toptier_code="123", abbreviation="ABC", name="Test Agency") @@ -97,6 +109,10 @@ def setup_test_data(db): total_dollars_obligated_gtas=1788370.03, total_budgetary_resources=22478810.97, total_diff_approp_ocpa_obligated_amounts=84931.95, + unlinked_procurement_c_awards=1, + 
unlinked_assistance_c_awards=2, + unlinked_procurement_d_awards=3, + unlinked_assistance_d_awards=4, ) mommy.make( "reporting.ReportingAgencyOverview", @@ -107,6 +123,10 @@ def setup_test_data(db): total_dollars_obligated_gtas=18.6, total_budgetary_resources=100, total_diff_approp_ocpa_obligated_amounts=0, + unlinked_procurement_c_awards=10, + unlinked_assistance_c_awards=20, + unlinked_procurement_d_awards=30, + unlinked_assistance_d_awards=40, ) mommy.make( "reporting.ReportingAgencyOverview", @@ -117,6 +137,10 @@ def setup_test_data(db): total_dollars_obligated_gtas=1788370.04, total_budgetary_resources=22478810.98, total_diff_approp_ocpa_obligated_amounts=84931.96, + unlinked_procurement_c_awards=100, + unlinked_assistance_c_awards=200, + unlinked_procurement_d_awards=300, + unlinked_assistance_d_awards=400, ) mommy.make( "reporting.ReportingAgencyMissingTas", @@ -142,6 +166,14 @@ def setup_test_data(db): tas_rendering_label="TAS 2", obligated_amount=12.0, ) + mommy.make( + "reporting.ReportingAgencyMissingTas", + toptier_code=123, + fiscal_year=2020, + fiscal_period=12, + tas_rendering_label="TAS 3", + obligated_amount=0, + ) mommy.make( "references.GTASSF133Balances", id=1, @@ -163,12 +195,17 @@ def setup_test_data(db): fiscal_period=12, total_budgetary_resources_cpe=100000000, ) + mommy.make( + "references.GTASSF133Balances", + id=4, + fiscal_year=2019, + fiscal_period=10, + total_budgetary_resources_cpe=10, + ) -assurance_statement_2019_9 = f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/Raw%20DATA%20Act%20Files/2019/P09/123%20-%20Test%20Agency%20(ABC)/2019-P09-123_Test%20Agency%20(ABC)-Assurance_Statement.txt" assurance_statement_2019_6 = f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/Raw%20DATA%20Act%20Files/2019/P06/123%20-%20Test%20Agency%20(ABC)/2019-P06-123_Test%20Agency%20(ABC)-Assurance_Statement.txt" assurance_statement_2020_12 = f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/Raw%20DATA%20Act%20Files/2020/P12/123%20-%20Test%20Agency%20(ABC)/2020-P12-123_Test%20Agency%20(ABC)-Assurance_Statement.txt" - assurance_statement_quarter = f"{settings.FILES_SERVER_BASE_URL}/agency_submissions/Raw%20DATA%20Act%20Files/2019/Q3/123%20-%20Quarterly%20Agency%20(QA)/2019-Q3-123_Quarterly%20Agency%20(QA)-Assurance_Statement.txt" @@ -193,9 +230,9 @@ def test_basic_success(setup_test_data, client): "missing_tas_accounts_count": 0, }, "obligation_difference": 84931.96, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, - "assurance_statement_url": assurance_statement_2019_9, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, + "assurance_statement_url": None, }, { "fiscal_year": 2019, @@ -203,7 +240,7 @@ def test_basic_success(setup_test_data, client): "current_total_budget_authority_amount": 22478810.97, "total_budgetary_resources": 200000000, "percent_of_total_budgetary_resources": 11.24, - "recent_publication_date": None, + "recent_publication_date": "2019-07-03T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 1788370.03, @@ -212,8 +249,8 @@ def test_basic_success(setup_test_data, client): "missing_tas_accounts_count": 2, }, "obligation_difference": 84931.95, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, "assurance_statement_url": assurance_statement_2019_6, }, { @@ -222,7 +259,7 @@ def test_basic_success(setup_test_data, client): 
"current_total_budget_authority_amount": 100.0, "total_budgetary_resources": 100000000, "percent_of_total_budgetary_resources": 0, - "recent_publication_date": None, + "recent_publication_date": "2021-02-11T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 18.6, @@ -231,8 +268,8 @@ def test_basic_success(setup_test_data, client): "missing_tas_accounts_count": 1, }, "obligation_difference": 0.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, "assurance_statement_url": assurance_statement_2020_12, }, ] @@ -251,7 +288,7 @@ def test_pagination(setup_test_data, client): "current_total_budget_authority_amount": 100.0, "total_budgetary_resources": 100000000, "percent_of_total_budgetary_resources": 0, - "recent_publication_date": None, + "recent_publication_date": "2021-02-11T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 18.6, @@ -260,8 +297,8 @@ def test_pagination(setup_test_data, client): "missing_tas_accounts_count": 1, }, "obligation_difference": 0.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, "assurance_statement_url": assurance_statement_2020_12, }, { @@ -270,7 +307,7 @@ def test_pagination(setup_test_data, client): "current_total_budget_authority_amount": 22478810.97, "total_budgetary_resources": 200000000, "percent_of_total_budgetary_resources": 11.24, - "recent_publication_date": None, + "recent_publication_date": "2019-07-03T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 1788370.03, @@ -279,8 +316,8 @@ def test_pagination(setup_test_data, client): "missing_tas_accounts_count": 2, }, "obligation_difference": 84931.95, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, "assurance_statement_url": assurance_statement_2019_6, }, { @@ -298,9 +335,9 @@ def test_pagination(setup_test_data, client): "missing_tas_accounts_count": 0, }, "obligation_difference": 84931.96, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, - "assurance_statement_url": assurance_statement_2019_9, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, + "assurance_statement_url": None, }, ] assert response["results"] == expected_results @@ -324,9 +361,9 @@ def test_pagination(setup_test_data, client): "missing_tas_accounts_count": 0, }, "obligation_difference": 84931.96, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, - "assurance_statement_url": assurance_statement_2019_9, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, + "assurance_statement_url": None, } ] assert response["results"] == expected_results @@ -341,7 +378,7 @@ def test_pagination(setup_test_data, client): "current_total_budget_authority_amount": 22478810.97, "total_budgetary_resources": 200000000, "percent_of_total_budgetary_resources": 11.24, - "recent_publication_date": None, + "recent_publication_date": "2019-07-03T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 1788370.03, @@ -350,8 +387,8 @@ def test_pagination(setup_test_data, 
client): "missing_tas_accounts_count": 2, }, "obligation_difference": 84931.95, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, "assurance_statement_url": assurance_statement_2019_6, } ] @@ -370,7 +407,7 @@ def test_secondary_sort(setup_test_data, client): "current_total_budget_authority_amount": 22478810.97, "total_budgetary_resources": 200000000, "percent_of_total_budgetary_resources": 11.24, - "recent_publication_date": None, + "recent_publication_date": "2019-07-03T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 1788370.03, @@ -379,8 +416,8 @@ def test_secondary_sort(setup_test_data, client): "missing_tas_accounts_count": 2, }, "obligation_difference": 84931.95, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, "assurance_statement_url": assurance_statement_2019_6, }, { @@ -398,9 +435,9 @@ def test_secondary_sort(setup_test_data, client): "missing_tas_accounts_count": 0, }, "obligation_difference": 84931.96, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, - "assurance_statement_url": assurance_statement_2019_9, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, + "assurance_statement_url": None, }, { "fiscal_year": 2020, @@ -408,7 +445,7 @@ def test_secondary_sort(setup_test_data, client): "current_total_budget_authority_amount": 100.0, "total_budgetary_resources": 100000000, "percent_of_total_budgetary_resources": 0, - "recent_publication_date": None, + "recent_publication_date": "2021-02-11T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 18.6, @@ -417,8 +454,8 @@ def test_secondary_sort(setup_test_data, client): "missing_tas_accounts_count": 1, }, "obligation_difference": 0.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, "assurance_statement_url": assurance_statement_2020_12, }, ] @@ -444,9 +481,9 @@ def test_secondary_sort(setup_test_data, client): "missing_tas_accounts_count": 0, }, "obligation_difference": 84931.96, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, - "assurance_statement_url": assurance_statement_2019_9, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, + "assurance_statement_url": None, }, { "fiscal_year": 2019, @@ -454,7 +491,7 @@ def test_secondary_sort(setup_test_data, client): "current_total_budget_authority_amount": 22478810.97, "total_budgetary_resources": 200000000, "percent_of_total_budgetary_resources": 11.24, - "recent_publication_date": None, + "recent_publication_date": "2019-07-03T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 1788370.03, @@ -463,8 +500,8 @@ def test_secondary_sort(setup_test_data, client): "missing_tas_accounts_count": 2, }, "obligation_difference": 84931.95, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, "assurance_statement_url": assurance_statement_2019_6, }, { @@ -473,7 +510,7 @@ def test_secondary_sort(setup_test_data, client): "current_total_budget_authority_amount": 
100.0, "total_budgetary_resources": 100000000, "percent_of_total_budgetary_resources": 0, - "recent_publication_date": None, + "recent_publication_date": "2021-02-11T00:00:00Z", "recent_publication_date_certified": False, "tas_account_discrepancies_totals": { "gtas_obligation_total": 18.6, @@ -482,8 +519,8 @@ def test_secondary_sort(setup_test_data, client): "missing_tas_accounts_count": 1, }, "obligation_difference": 0.0, - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, "assurance_statement_url": assurance_statement_2020_12, }, ] @@ -503,3 +540,195 @@ def test_quarterly_assurance_statements(): assurance_statement = AgencyBase.create_assurance_statement_url(results) assert assurance_statement == assurance_statement_quarter + + +def test_secondary_period_sort(setup_test_data, client): + mommy.make( + "reporting.ReportingAgencyOverview", + reporting_agency_overview_id=4, + toptier_code=123, + fiscal_year=2019, + fiscal_period=10, + total_dollars_obligated_gtas=0.0, + total_budgetary_resources=0.0, + total_diff_approp_ocpa_obligated_amounts=0.0, + unlinked_procurement_c_awards=1, + unlinked_assistance_c_awards=2, + unlinked_procurement_d_awards=3, + unlinked_assistance_d_awards=4, + ) + mommy.make( + "reporting.ReportingAgencyMissingTas", + toptier_code=123, + fiscal_year=2019, + fiscal_period=10, + tas_rendering_label="TAS 2", + obligated_amount=1000, + ) + resp = client.get(url + "?sort=fiscal_year&order=asc") + assert resp.status_code == status.HTTP_200_OK + response = resp.json() + assert len(response["results"]) == 4 + expected_results = [ + { + "fiscal_year": 2019, + "fiscal_period": 6, + "current_total_budget_authority_amount": 22478810.97, + "total_budgetary_resources": 200000000.0, + "percent_of_total_budgetary_resources": 11.24, + "recent_publication_date": "2019-07-03T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 1788370.03, + "tas_accounts_total": 200.00, + "tas_obligation_not_in_gtas_total": 11.0, + "missing_tas_accounts_count": 2, + }, + "obligation_difference": 84931.95, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, + "assurance_statement_url": assurance_statement_2019_6, + }, + { + "fiscal_year": 2019, + "fiscal_period": 9, + "current_total_budget_authority_amount": 22478810.98, + "total_budgetary_resources": 150000000.0, + "percent_of_total_budgetary_resources": 14.99, + "recent_publication_date": None, + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 1788370.04, + "tas_accounts_total": None, + "tas_obligation_not_in_gtas_total": 0.0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": 84931.96, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, + "assurance_statement_url": None, + }, + { + "fiscal_year": 2019, + "fiscal_period": 10, + "current_total_budget_authority_amount": 0.0, + "total_budgetary_resources": 10.0, + "percent_of_total_budgetary_resources": 0.0, + "recent_publication_date": None, + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 0.0, + "tas_accounts_total": None, + "tas_obligation_not_in_gtas_total": 1000, + "missing_tas_accounts_count": 1, + }, + "obligation_difference": 0.0, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, + 
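test_secondary_period_sort here depends on the overview endpoint tie-breaking a fiscal_year sort with fiscal_period (the secondary_sort_key change appears later in this diff). Assuming ordinary tuple ordering, the expected sequence falls out directly:

periods = [(2020, 12), (2019, 6), (2019, 10), (2019, 9)]
# fiscal_year is the primary key, fiscal_period the tie-breaker
assert sorted(periods) == [(2019, 6), (2019, 9), (2019, 10), (2020, 12)]
assert sorted(periods, reverse=True) == [(2020, 12), (2019, 10), (2019, 9), (2019, 6)]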
"assurance_statement_url": None, + }, + { + "fiscal_year": 2020, + "fiscal_period": 12, + "current_total_budget_authority_amount": 100.0, + "total_budgetary_resources": 100000000.0, + "percent_of_total_budgetary_resources": 0.0, + "recent_publication_date": "2021-02-11T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 18.6, + "tas_accounts_total": 100.00, + "tas_obligation_not_in_gtas_total": 12.0, + "missing_tas_accounts_count": 1, + }, + "obligation_difference": 0.0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, + "assurance_statement_url": assurance_statement_2020_12, + }, + ] + assert response["results"] == expected_results + + resp = client.get(url + "?sort=fiscal_year&order=desc") + assert resp.status_code == status.HTTP_200_OK + response = resp.json() + assert len(response["results"]) == 4 + expected_results = [ + { + "fiscal_year": 2020, + "fiscal_period": 12, + "current_total_budget_authority_amount": 100.0, + "total_budgetary_resources": 100000000.0, + "percent_of_total_budgetary_resources": 0.0, + "recent_publication_date": "2021-02-11T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 18.6, + "tas_accounts_total": 100.00, + "tas_obligation_not_in_gtas_total": 12.0, + "missing_tas_accounts_count": 1, + }, + "obligation_difference": 0.0, + "unlinked_contract_award_count": 40, + "unlinked_assistance_award_count": 60, + "assurance_statement_url": assurance_statement_2020_12, + }, + { + "fiscal_year": 2019, + "fiscal_period": 10, + "current_total_budget_authority_amount": 0.0, + "total_budgetary_resources": 10.0, + "percent_of_total_budgetary_resources": 0.0, + "recent_publication_date": None, + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 0.0, + "tas_accounts_total": None, + "tas_obligation_not_in_gtas_total": 1000, + "missing_tas_accounts_count": 1, + }, + "obligation_difference": 0.0, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, + "assurance_statement_url": None, + }, + { + "fiscal_year": 2019, + "fiscal_period": 9, + "current_total_budget_authority_amount": 22478810.98, + "total_budgetary_resources": 150000000.0, + "percent_of_total_budgetary_resources": 14.99, + "recent_publication_date": None, + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 1788370.04, + "tas_accounts_total": None, + "tas_obligation_not_in_gtas_total": 0.0, + "missing_tas_accounts_count": 0, + }, + "obligation_difference": 84931.96, + "unlinked_contract_award_count": 400, + "unlinked_assistance_award_count": 600, + "assurance_statement_url": None, + }, + { + "fiscal_year": 2019, + "fiscal_period": 6, + "current_total_budget_authority_amount": 22478810.97, + "total_budgetary_resources": 200000000.0, + "percent_of_total_budgetary_resources": 11.24, + "recent_publication_date": "2019-07-03T00:00:00Z", + "recent_publication_date_certified": False, + "tas_account_discrepancies_totals": { + "gtas_obligation_total": 1788370.03, + "tas_accounts_total": 200.00, + "tas_obligation_not_in_gtas_total": 11.0, + "missing_tas_accounts_count": 2, + }, + "obligation_difference": 84931.95, + "unlinked_contract_award_count": 4, + "unlinked_assistance_award_count": 6, + "assurance_statement_url": assurance_statement_2019_6, + }, + ] + assert response["results"] == expected_results diff 
--git a/usaspending_api/reporting/tests/integration/test_unlinked_awards.py b/usaspending_api/reporting/tests/integration/test_unlinked_awards.py new file mode 100644 index 0000000000..2b5c1f4c0b --- /dev/null +++ b/usaspending_api/reporting/tests/integration/test_unlinked_awards.py @@ -0,0 +1,95 @@ +import pytest +from model_mommy import mommy +from rest_framework import status + + +url = "/api/v2/reporting/agencies/{toptier_code}/{fiscal_year}/{fiscal_period}/unlinked_awards/{type}/" + + +@pytest.fixture +def setup_test_data(db): + mommy.make( + "reporting.ReportingAgencyOverview", + toptier_code="043", + fiscal_year=2020, + fiscal_period=8, + unlinked_assistance_c_awards=12, + unlinked_assistance_d_awards=24, + unlinked_procurement_c_awards=14, + unlinked_procurement_d_awards=28, + linked_assistance_awards=6, + linked_procurement_awards=7, + ) + + +def test_assistance_success(setup_test_data, client): + resp = client.get(url.format(toptier_code="043", fiscal_year=2020, fiscal_period=8, type="assistance")) + assert resp.status_code == status.HTTP_200_OK + response = resp.json() + + expected_results = { + "unlinked_file_c_award_count": 12, + "unlinked_file_d_award_count": 24, + "total_linked_award_count": 6, + } + + assert expected_results == response + + +def test_procurement_success(setup_test_data, client): + resp = client.get(url.format(toptier_code="043", fiscal_year=2020, fiscal_period=8, type="procurement")) + assert resp.status_code == status.HTTP_200_OK + response = resp.json() + + expected_results = { + "unlinked_file_c_award_count": 14, + "unlinked_file_d_award_count": 28, + "total_linked_award_count": 7, + } + + assert expected_results == response + + +def test_no_result_found(setup_test_data, client): + resp = client.get(url.format(toptier_code="045", fiscal_year=2020, fiscal_period=8, type="procurement")) + assert resp.status_code == status.HTTP_200_OK + response = resp.json() + + expected_results = { + "unlinked_file_c_award_count": 0, + "unlinked_file_d_award_count": 0, + "total_linked_award_count": 0, + } + + assert expected_results == response + + +def test_invalid_type(client): + # trailing S on procurement + resp = client.get(url.format(toptier_code="043", fiscal_year=2020, fiscal_period=8, type="procurementS")) + assert resp.status_code == status.HTTP_400_BAD_REQUEST + + response = resp.json() + detail = response["detail"] + + assert detail == "Field 'type' is outside valid values ['assistance', 'procurement']" + + +def test_too_high_year(client): + resp = client.get(url.format(toptier_code="043", fiscal_year=2100, fiscal_period=8, type="procurement")) + assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + response = resp.json() + detail = response["detail"] + + assert "Field 'fiscal_year' value '2100' is above max" in detail + + +def test_too_high_period(client): + resp = client.get(url.format(toptier_code="043", fiscal_year=2020, fiscal_period=13, type="procurement")) + assert resp.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + response = resp.json() + detail = response["detail"] + + assert detail == "Field 'fiscal_period' value '13' is above max '12'" diff --git a/usaspending_api/reporting/v2/views/agencies/overview.py b/usaspending_api/reporting/v2/views/agencies/overview.py index 1cf85f065a..2151c9ac91 100644 --- a/usaspending_api/reporting/v2/views/agencies/overview.py +++ b/usaspending_api/reporting/v2/views/agencies/overview.py @@ -1,4 +1,4 @@ -from django.db.models import Subquery, OuterRef, DecimalField, Func, F, Q, IntegerField +from 
django.db.models import Subquery, OuterRef, DecimalField, Func, F, Q, IntegerField, Value from rest_framework.response import Response from usaspending_api.agency.v2.views.agency_base import AgencyBase, PaginationMixin from django.utils.functional import cached_property @@ -23,7 +23,7 @@ def get(self, request): "toptier_code", "current_total_budget_authority_amount", "missing_tas_accounts_count", - "missing_tas_accounts_total", + "tas_accounts_total", "agency_name", "obligation_difference", "recent_publication_date", @@ -41,41 +41,75 @@ def get(self, request): ) def get_agency_overview(self): - agency_filters = [Q(toptier_code=OuterRef("toptier_code"))] + agency_filters = [] if self.filter is not None: agency_filters.append(Q(name__icontains=self.filter) | Q(abbreviation__icontains=self.filter)) + reporting_filters = [ + Q(toptier_code=OuterRef("toptier_code")), + Q(fiscal_year=self.fiscal_year), + Q(fiscal_period=self.fiscal_period), + ] result_list = ( - ReportingAgencyOverview.objects.filter(fiscal_year=self.fiscal_year, fiscal_period=self.fiscal_period) + ToptierAgency.objects.account_agencies() + .filter(*agency_filters) .annotate( - current_total_budget_authority_amount=F("total_budgetary_resources"), - obligation_difference=F("total_diff_approp_ocpa_obligated_amounts"), - agency_name=Subquery(ToptierAgency.objects.filter(*agency_filters).values("name")), - abbreviation=Subquery(ToptierAgency.objects.filter(*agency_filters).values("abbreviation")), + agency_name=F("name"), + fiscal_year=Value(self.fiscal_year, output_field=IntegerField()), + fiscal_period=Value(self.fiscal_period, output_field=IntegerField()), + current_total_budget_authority_amount=Subquery( + ReportingAgencyOverview.objects.filter(*reporting_filters).values("total_budgetary_resources") + ), + obligation_difference=Subquery( + ReportingAgencyOverview.objects.filter(*reporting_filters).values( + "total_diff_approp_ocpa_obligated_amounts" + ) + ), + total_dollars_obligated_gtas=Subquery( + ReportingAgencyOverview.objects.filter(*reporting_filters).values("total_dollars_obligated_gtas") + ), + unlinked_contract_award_count=Subquery( + ReportingAgencyOverview.objects.filter(*reporting_filters) + .annotate( + unlinked_contract_award_count=F("unlinked_procurement_c_awards") + + F("unlinked_procurement_d_awards") + ) + .values("unlinked_contract_award_count"), + output_field=IntegerField(), + ), + unlinked_assistance_award_count=Subquery( + ReportingAgencyOverview.objects.filter(*reporting_filters) + .annotate( + unlinked_assistance_award_count=F("unlinked_assistance_c_awards") + + F("unlinked_assistance_d_awards") + ) + .values("unlinked_assistance_award_count"), + output_field=IntegerField(), + ), recent_publication_date=Subquery( SubmissionAttributes.objects.filter( - reporting_fiscal_year=OuterRef("fiscal_year"), - reporting_fiscal_period=OuterRef("fiscal_period"), + reporting_fiscal_year=self.fiscal_year, + reporting_fiscal_period=self.fiscal_period, toptier_code=OuterRef("toptier_code"), ).values("published_date") ), recent_publication_date_certified=Subquery( SubmissionAttributes.objects.filter( - reporting_fiscal_year=OuterRef("fiscal_year"), - reporting_fiscal_period=OuterRef("fiscal_period"), + reporting_fiscal_year=self.fiscal_year, + reporting_fiscal_period=self.fiscal_period, toptier_code=OuterRef("toptier_code"), ).values("certified_date") ), submission_is_quarter=Subquery( SubmissionAttributes.objects.filter( - reporting_fiscal_year=OuterRef("fiscal_year"), - 
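The rewritten get_agency_overview drives the query off ToptierAgency and pulls each metric in through a correlated Subquery, so agencies with no ReportingAgencyOverview row for the period surface with null values rather than disappearing. A reduced sketch of the pattern, with assumed import paths and only the contract count shown:

from django.db.models import F, IntegerField, OuterRef, Subquery
from usaspending_api.references.models import ToptierAgency
from usaspending_api.reporting.models import ReportingAgencyOverview

def agencies_with_unlinked_contract_counts(fiscal_year, fiscal_period, descending=False):
    correlated = (
        ReportingAgencyOverview.objects.filter(
            toptier_code=OuterRef("toptier_code"),
            fiscal_year=fiscal_year,
            fiscal_period=fiscal_period,
        )
        .annotate(total=F("unlinked_procurement_c_awards") + F("unlinked_procurement_d_awards"))
        .values("total")
    )
    queryset = ToptierAgency.objects.annotate(
        unlinked_contract_award_count=Subquery(correlated, output_field=IntegerField())
    )
    # Agencies without a reporting row annotate to NULL; pushing nulls last mirrors the
    # order_by(...asc/desc(nulls_last=True)) change later in this file.
    ordering = F("unlinked_contract_award_count")
    return queryset.order_by(ordering.desc(nulls_last=True) if descending else ordering.asc(nulls_last=True))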
reporting_fiscal_period=OuterRef("fiscal_period"), + reporting_fiscal_year=self.fiscal_year, + reporting_fiscal_period=self.fiscal_period, toptier_code=OuterRef("toptier_code"), ).values("quarter_format_flag") ), - missing_tas_accounts_total=Subquery( + tas_accounts_total=Subquery( ReportingAgencyTas.objects.filter( - fiscal_year=OuterRef("fiscal_year"), - fiscal_period=OuterRef("fiscal_period"), + fiscal_year=self.fiscal_year, + fiscal_period=self.fiscal_period, toptier_code=OuterRef("toptier_code"), ) .annotate(the_sum=Func(F("appropriation_obligated_amount"), function="SUM")) @@ -84,8 +118,8 @@ def get_agency_overview(self): ), tas_obligation_not_in_gtas_total=Subquery( ReportingAgencyMissingTas.objects.filter( - fiscal_year=OuterRef("fiscal_year"), - fiscal_period=OuterRef("fiscal_period"), + fiscal_year=self.fiscal_year, + fiscal_period=self.fiscal_period, toptier_code=OuterRef("toptier_code"), ) .annotate(the_sum=Func(F("obligated_amount"), function="SUM")) @@ -94,16 +128,16 @@ def get_agency_overview(self): ), missing_tas_accounts_count=Subquery( ReportingAgencyMissingTas.objects.filter( - fiscal_year=OuterRef("fiscal_year"), - fiscal_period=OuterRef("fiscal_period"), + fiscal_year=self.fiscal_year, + fiscal_period=self.fiscal_period, toptier_code=OuterRef("toptier_code"), ) + .exclude(obligated_amount=0) .annotate(count=Func(F("tas_rendering_label"), function="COUNT")) .values("count"), output_field=IntegerField(), ), ) - .exclude(agency_name__isnull=True) .values( "agency_name", "abbreviation", @@ -113,18 +147,22 @@ def get_agency_overview(self): "obligation_difference", "recent_publication_date", "recent_publication_date_certified", - "missing_tas_accounts_total", + "tas_accounts_total", "tas_obligation_not_in_gtas_total", "missing_tas_accounts_count", "fiscal_year", "fiscal_period", "submission_is_quarter", + "unlinked_contract_award_count", + "unlinked_assistance_award_count", ) - .order_by( - f"{'-' if self.pagination.sort_order == 'desc' else ''}{self.pagination.sort_key if self.pagination.sort_key not in ['unlinked_contract_award_count','unlinked_assistance_award_count'] else self.default_sort_column}" - ) - # currently we are just returning 0 for the unlinked awards, once this is removed, we should be able to remove this conditional ) + + if self.pagination.sort_order == "desc": + result_list = result_list.order_by(F(self.pagination.sort_key).desc(nulls_last=True)) + else: + result_list = result_list.order_by(F(self.pagination.sort_key).asc(nulls_last=True)) + return self.format_results(result_list) def format_results(self, result_list): @@ -143,14 +181,16 @@ def format_results(self, result_list): "recent_publication_date_certified": result["recent_publication_date_certified"] is not None, "tas_account_discrepancies_totals": { "gtas_obligation_total": result["total_dollars_obligated_gtas"], - "tas_accounts_total": result["missing_tas_accounts_total"], + "tas_accounts_total": result["tas_accounts_total"], "tas_obligation_not_in_gtas_total": result["tas_obligation_not_in_gtas_total"] or 0.0, "missing_tas_accounts_count": result["missing_tas_accounts_count"], }, "obligation_difference": result["obligation_difference"], - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, - "assurance_statement_url": self.create_assurance_statement_url(result), + "unlinked_contract_award_count": result["unlinked_contract_award_count"], + "unlinked_assistance_award_count": result["unlinked_assistance_award_count"], + "assurance_statement_url": 
self.create_assurance_statement_url(result) + if result["recent_publication_date"] + else None, } for result in result_list ] diff --git a/usaspending_api/reporting/v2/views/agencies/toptier_code/fiscal_year/fiscal_period/unlinked_awards.py b/usaspending_api/reporting/v2/views/agencies/toptier_code/fiscal_year/fiscal_period/unlinked_awards.py new file mode 100644 index 0000000000..a24842112e --- /dev/null +++ b/usaspending_api/reporting/v2/views/agencies/toptier_code/fiscal_year/fiscal_period/unlinked_awards.py @@ -0,0 +1,95 @@ +from django.conf import settings +from django.db.models import F +from rest_framework.response import Response + +from usaspending_api.agency.v2.views.agency_base import AgencyBase +from usaspending_api.common.cache_decorator import cache_response +from usaspending_api.common.helpers.date_helper import fy +from usaspending_api.common.helpers.fiscal_year_helpers import current_fiscal_year +from usaspending_api.common.validator.tinyshield import TinyShield +from usaspending_api.reporting.models import ReportingAgencyOverview + + +class UnlinkedAwards(AgencyBase): + """Returns submission history of the specified agency for the specified fiscal year and period""" + + endpoint_doc = "usaspending_api/api_contracts/contracts/v2/reporting/agencies/toptier_code/fiscal_year/fiscal_period/unlinked_awards/type.md" + + annotation_options = { + "assistance": { + "unlinked_file_c_award_count": F("unlinked_assistance_c_awards"), + "unlinked_file_d_award_count": F("unlinked_assistance_d_awards"), + "total_linked_award_count": F("linked_assistance_awards"), + }, + "procurement": { + "unlinked_file_c_award_count": F("unlinked_procurement_c_awards"), + "unlinked_file_d_award_count": F("unlinked_procurement_d_awards"), + "total_linked_award_count": F("linked_procurement_awards"), + }, + } + + tinyshield_model = [ + { + "key": "type", + "name": "type", + "type": "enum", + "enum_values": ["assistance", "procurement"], + "optional": False, + "default": None, + "allow_nulls": False, + }, + { + "key": "fiscal_year", + "name": "fiscal_year", + "type": "integer", + "min": fy(settings.API_SEARCH_MIN_DATE), + "max": current_fiscal_year(), + "optional": False, + "default": None, + "allow_nulls": False, + }, + { + "key": "fiscal_period", + "name": "fiscal_period", + "type": "integer", + "min": 2, + "max": 12, + "optional": False, + "default": None, + "allow_nulls": False, + }, + ] + + @cache_response() + def get(self, request, toptier_code, fiscal_year, fiscal_period, type): + my_request = {"type": type, "fiscal_year": fiscal_year, "fiscal_period": fiscal_period} + validated = TinyShield(self.tinyshield_model).block(my_request) + + self.annotations = self.annotation_options[validated["type"]] + self.fiscal_year = validated["fiscal_year"] + self.fiscal_period = validated["fiscal_period"] + + return Response(self.get_unlinked_awards()) + + def get_unlinked_awards(self): + result = ( + ReportingAgencyOverview.objects.filter( + toptier_code=self.toptier_code, fiscal_year=self.fiscal_year, fiscal_period=self.fiscal_period + ) + .annotate(**self.annotations) + .values( + "unlinked_file_c_award_count", + "unlinked_file_d_award_count", + "total_linked_award_count", + ) + .first() + ) + + if not result: + result = { + "unlinked_file_c_award_count": 0, + "unlinked_file_d_award_count": 0, + "total_linked_award_count": 0, + } + + return result diff --git a/usaspending_api/reporting/v2/views/agencies/toptier_code/overview.py b/usaspending_api/reporting/v2/views/agencies/toptier_code/overview.py index 
acd7bbe863..937419b675 100644 --- a/usaspending_api/reporting/v2/views/agencies/toptier_code/overview.py +++ b/usaspending_api/reporting/v2/views/agencies/toptier_code/overview.py @@ -19,7 +19,7 @@ def get(self, request, toptier_code): "current_total_budget_authority_amount", "fiscal_year", "missing_tas_accounts_count", - "missing_tas_accounts_total", + "tas_accounts_total", "obligation_difference", "percent_of_total_budgetary_resources", "recent_publication_date", @@ -84,6 +84,7 @@ def get_agency_overview(self): fiscal_period=OuterRef("fiscal_period"), toptier_code=OuterRef("toptier_code"), ) + .exclude(obligated_amount=0) .annotate(count=Func(F("tas_rendering_label"), function="COUNT")) .values("count"), output_field=IntegerField(), @@ -113,6 +114,10 @@ def get_agency_overview(self): "tas_obligations", "tas_obligation_not_in_gtas_total", "missing_tas_accounts", + "unlinked_procurement_c_awards", + "unlinked_assistance_c_awards", + "unlinked_procurement_d_awards", + "unlinked_assistance_d_awards", ) ) return self.format_results(result_list) @@ -136,18 +141,24 @@ def format_results(self, result_list): "missing_tas_accounts_count": result["missing_tas_accounts"], }, "obligation_difference": result["total_diff_approp_ocpa_obligated_amounts"], - "unlinked_contract_award_count": 0, - "unlinked_assistance_award_count": 0, - "assurance_statement_url": self.create_assurance_statement_url(result), + "unlinked_contract_award_count": result["unlinked_procurement_c_awards"] + + result["unlinked_procurement_d_awards"], + "unlinked_assistance_award_count": result["unlinked_assistance_c_awards"] + + result["unlinked_assistance_d_awards"], + "assurance_statement_url": self.create_assurance_statement_url(result) + if result["recent_publication_date"] + else None, } for result in result_list ] + if self.pagination.sort_key == "fiscal_year": + self.pagination.secondary_sort_key = "fiscal_period" results = sorted( results, key=lambda x: x["tas_account_discrepancies_totals"][self.pagination.sort_key] if ( self.pagination.sort_key == "missing_tas_accounts_count" - or self.pagination.sort_key == "missing_tas_accounts_total" + or self.pagination.sort_key == "tas_accounts_total" or self.pagination.sort_key == "tas_obligation_not_in_gtas_total" ) else (x[self.pagination.sort_key], x[self.pagination.secondary_sort_key]) diff --git a/usaspending_api/reporting/v2/views/agencies/urls.py b/usaspending_api/reporting/v2/views/agencies/urls.py index 5afbd49aa9..46a55b1a59 100644 --- a/usaspending_api/reporting/v2/views/agencies/urls.py +++ b/usaspending_api/reporting/v2/views/agencies/urls.py @@ -4,6 +4,9 @@ from usaspending_api.reporting.v2.views.agencies.overview import AgenciesOverview from usaspending_api.reporting.v2.views.agencies.toptier_code.differences import Differences from usaspending_api.reporting.v2.views.submission_history import SubmissionHistory +from usaspending_api.reporting.v2.views.agencies.toptier_code.fiscal_year.fiscal_period.unlinked_awards import ( + UnlinkedAwards, +) from usaspending_api.reporting.v2.views.agencies.publish_dates import PublishDates @@ -16,5 +19,9 @@ r"^(?P[0-9]{3,4})/(?P[0-9]{4})/(?P[0-9]{1,2})/submission_history/$", SubmissionHistory.as_view(), ), + url( + r"^(?P[0-9]{3,4})/(?P[0-9]{4})/(?P[0-9]{1,2})/unlinked_awards/(?P[\w]+)/$", + UnlinkedAwards.as_view(), + ), url(r"^publish_dates/$", PublishDates.as_view()), ] diff --git a/usaspending_api/reporting/v2/views/placeholder.py b/usaspending_api/reporting/v2/views/placeholder.py deleted file mode 100644 index 
9c4c3101f0..0000000000 --- a/usaspending_api/reporting/v2/views/placeholder.py +++ /dev/null @@ -1,11 +0,0 @@ -from rest_framework.response import Response -from rest_framework.views import APIView - - -class Placeholder(APIView): - """Placeholder""" - - endpoint_doc = "usaspending_api/api_contracts/contracts/v2/reporting/placeholder.md" - - def get(self, request): - return Response({"status": "success"}) diff --git a/usaspending_api/static_doc_files/css/main.css b/usaspending_api/static_doc_files/css/main.css index a3ac839c54..539ae9a945 100644 --- a/usaspending_api/static_doc_files/css/main.css +++ b/usaspending_api/static_doc_files/css/main.css @@ -8210,6 +8210,9 @@ table thead { color: #ffffff; background: #5b616b; } table thead tr th { border-bottom: none; vertical-align: middle; } table tbody tr:last-child { border-bottom: 1px solid #ddd; } table tbody tr td { vertical-align: middle; } +tr td:first-child { + word-break: break-all; +} .list-group-item { padding: 15px; } .list-group-item:first-child { border-top-right-radius: 0; border-top-left-radius: 0; } diff --git a/usaspending_api/submissions/management/commands/rm_submission.py b/usaspending_api/submissions/management/commands/rm_submission.py index 7bd21ca178..569448d161 100644 --- a/usaspending_api/submissions/management/commands/rm_submission.py +++ b/usaspending_api/submissions/management/commands/rm_submission.py @@ -8,22 +8,20 @@ from usaspending_api.submissions.models import SubmissionAttributes from usaspending_api.awards.models import FinancialAccountsByAwards, Award +logger = logging.getLogger("script") + class Command(BaseCommand): - """ - This command will remove a submission and all associated data with it from the - database - """ + """Remove a DABS record and all associated data with it from the database""" - help = "Removes a single submission from the configured data broker database" - logger = logging.getLogger("console") + help = "Removes a single submission from the database" def add_arguments(self, parser): - parser.add_argument("submission_id", help="the broker submission id to delete", type=int) + parser.add_argument("submission_id", help="the Broker submission ID to delete", type=int) @transaction.atomic def handle(self, *args, **options): - self.logger.info("Starting rm_submissions management command") + logger.info("Starting rm_submissions management command") def signal_handler(signal, frame): transaction.set_rollback(True) @@ -37,7 +35,7 @@ def signal_handler(signal, frame): try: submission = SubmissionAttributes.objects.get(submission_id=submission_id) except ObjectDoesNotExist: - raise RuntimeError(f"Broker submission id {submission_id} does not exist") + raise RuntimeError(f"Broker submission ID {submission_id} does not exist") # Mark associated Accounts as updated, so they will be reloaded in ES nightly load Award.objects.filter( @@ -46,10 +44,22 @@ def signal_handler(signal, frame): deleted_stats = submission.delete() - self.logger.info("Finished deletions.") + models = { + "accounts.AppropriationAccountBalances": {"name": "File A", "count": 0}, + "financial_activities.FinancialAccountsByProgramActivityObjectClass": {"name": "File B", "count": 0}, + "awards.FinancialAccountsByAwards": {"name": "File C", "count": 0}, + "submissions.SubmissionAttributes": {"name": "Submission", "count": 0}, + "Total Rows": {"name": "DABS", "count": 0}, + } # Using a Dict to set the logging order below - statistics = f"Statistics:\n Total objects removed: {deleted_stats[0]:,}" for (model, count) in 
deleted_stats[1].items(): - statistics += f"\n {model}: {count:,}" + models[str(model)]["count"] = count + models["Total Rows"]["count"] += count + + if deleted_stats[0] != models["Total Rows"]["count"]: + logger.error(f"Delete records mismatch!! Check for unknown FK relationships!") + raise RuntimeError(f"ORM deletes {deleted_stats[0]:,} != expected {models['Total Rows']['count']:,}") - self.logger.info(f"Deleted broker submission id {submission_id}. {statistics}") + statistics = "\n\t".join([f"{m} ({x['name']}): {x['count']:,}" for m, x in models.items()]) + logger.info(f"Deleted Broker submission ID {submission_id}:\n\t{statistics}") + logger.info("Finished deletions by rm_submissions") diff --git a/usaspending_api/transactions/agnostic_transaction_loader.py b/usaspending_api/transactions/agnostic_transaction_loader.py index 01a405ba3c..fcec8bed36 100644 --- a/usaspending_api/transactions/agnostic_transaction_loader.py +++ b/usaspending_api/transactions/agnostic_transaction_loader.py @@ -200,4 +200,9 @@ def copy_broker_table_data(self, source_tablename, dest_tablename, primary_key): else: transactions_remaining_count = 0 self.upsert_records += record_count - logger.info(f"{self.upsert_records:,} successful upserts, {transactions_remaining_count:,} remaining.") + percentage = self.upsert_records * 100 / self.total_ids_to_process if self.total_ids_to_process != 0 else 0 + logger.info( + f"{self.upsert_records:,} successful upserts, " + f"{transactions_remaining_count:,} remaining. " + f"[{percentage:.2f}%]" + )
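The loader change directly above reports progress as upserts-so-far over the total IDs queued, guarded against a zero denominator. A worked example with assumed counts:

upsert_records = 7_500
transactions_remaining_count = 2_500
total_ids_to_process = 10_000

percentage = upsert_records * 100 / total_ids_to_process if total_ids_to_process != 0 else 0
print(f"{upsert_records:,} successful upserts, {transactions_remaining_count:,} remaining. [{percentage:.2f}%]")
# -> 7,500 successful upserts, 2,500 remaining. [75.00%]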
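Looking back at the rm_submission rewrite, the per-model roll-up tallies the second element of Django's delete() return value into the File A/B/C/Submission labels and refuses to finish quietly if the grand total does not reconcile, which would indicate an unexpected FK cascade. A compact walk-through with assumed delete() output:

# Model.delete() returns (total_rows, {model_label: rows_deleted}); the numbers here are assumed.
deleted_stats = (
    6,
    {
        "accounts.AppropriationAccountBalances": 1,
        "financial_activities.FinancialAccountsByProgramActivityObjectClass": 2,
        "awards.FinancialAccountsByAwards": 2,
        "submissions.SubmissionAttributes": 1,
    },
)

models = {
    "accounts.AppropriationAccountBalances": {"name": "File A", "count": 0},
    "financial_activities.FinancialAccountsByProgramActivityObjectClass": {"name": "File B", "count": 0},
    "awards.FinancialAccountsByAwards": {"name": "File C", "count": 0},
    "submissions.SubmissionAttributes": {"name": "Submission", "count": 0},
    "Total Rows": {"name": "DABS", "count": 0},
}

for model, count in deleted_stats[1].items():
    models[str(model)]["count"] = count
    models["Total Rows"]["count"] += count

# 1 + 2 + 2 + 1 == 6, so the reconciliation check passes; any mismatch raises in the command.
assert deleted_stats[0] == models["Total Rows"]["count"]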