From 2a2137fe596956fd34308cf9af5c4f6f330d581f Mon Sep 17 00:00:00 2001 From: Pavlo Mokiichuk Date: Fri, 18 Oct 2024 16:26:45 +0200 Subject: [PATCH] Optimization script remove data with `is_original=True` v3 (#4345) * upd remove_migrated_data_is_original() & add get_statistic_is_original() * black * add transaction on batch * upd * upd test * upd ::fire:: * cleanup :fire: --- .../remove_migrated_data_is_original.py | 87 +++++++++++++------ .../test_remove_migrated_data_is_original.py | 2 + 2 files changed, 63 insertions(+), 26 deletions(-) diff --git a/src/hct_mis_api/one_time_scripts/remove_migrated_data_is_original.py b/src/hct_mis_api/one_time_scripts/remove_migrated_data_is_original.py index d42edec2c9..bed5769143 100644 --- a/src/hct_mis_api/one_time_scripts/remove_migrated_data_is_original.py +++ b/src/hct_mis_api/one_time_scripts/remove_migrated_data_is_original.py @@ -1,36 +1,71 @@ +from typing import Any, List + from django.apps import apps +from django.db import transaction from django.utils import timezone -def remove_migrated_data_is_original(batch_size: int = 1000) -> None: - start_time = timezone.now() +def _get_model_list_is_original() -> List[Any]: all_models = apps.get_models() + all_models_with_is_original_field = [] for model in all_models: if hasattr(model, "is_original"): - if model.__name__ == "GrievanceTicket": - queryset_all = model.default_for_migrations_fix.all().only("is_original", "id") - queryset_is_original = queryset_all.filter(is_original=True) - elif model.__name__ in ["HouseholdSelection", "EntitlementCard", "Feedback", "Message"]: - queryset_all = model.original_and_repr_objects.all().only("is_original", "id") - queryset_is_original = queryset_all.filter(is_original=True) - else: - queryset_all = model.all_objects.all().only("is_original", "id") - queryset_is_original = queryset_all.filter(is_original=True) - - print( - f"*** {model.__name__} All objects: {queryset_all.count()}. " - f"Removing objects with 'is_original=True': {queryset_is_original.count()}" - ) - - deleted_count = 0 - ids_to_delete = list(queryset_is_original.values_list("id", flat=True).iterator(chunk_size=batch_size)) - - for i in range(0, len(ids_to_delete), batch_size): - batch_pks = ids_to_delete[i : i + batch_size] - count, _ = queryset_all.filter(pk__in=batch_pks).delete() - deleted_count += count - - print(f"Deleted {model.__name__} and related objects {deleted_count}.\n") + all_models_with_is_original_field.append(model) + + return all_models_with_is_original_field + + +def remove_migrated_data_is_original(batch_size: int = 1000) -> None: + start_time = timezone.now() + for model in _get_model_list_is_original(): + if model.__name__ == "GrievanceTicket": + model_qs = model.default_for_migrations_fix + elif model.__name__ in ["HouseholdSelection", "EntitlementCard", "Feedback", "Message"]: + model_qs = model.original_and_repr_objects + else: + model_qs = model.all_objects + + queryset_is_original = model_qs.filter(is_original=True).only("id") + print(f"Removing objects with 'is_original=True': {model.__name__}.") + + ids_to_delete = [ + str(obj_id) for obj_id in queryset_is_original.values_list("id", flat=True).iterator(chunk_size=batch_size) + ] + deleted_count = 0 + total_to_delete = len(ids_to_delete) + + for i in range(0, total_to_delete, batch_size): + batch_ids = ids_to_delete[i : i + batch_size] + # batch processing atomically + with transaction.atomic(): + deleted, _ = queryset_is_original.filter(id__in=batch_ids).delete() + deleted_count += deleted + + if i % (batch_size * 10) == 0: + print( + f"Progress: Deleted {deleted_count:,} {model.__name__} and related objects. " + f"{model.__name__} list contains {total_to_delete:,} records." + ) + print(f"Deleted {model.__name__} and related objects: {deleted_count:,}.\n") + print(f"Completed in {timezone.now() - start_time}\n", "*" * 60) + + +def get_statistic_is_original() -> None: + start_time = timezone.now() + for model in _get_model_list_is_original(): + if model.__name__ == "GrievanceTicket": + queryset_all = model.default_for_migrations_fix.all().only("is_original", "id") + queryset_is_original = queryset_all.filter(is_original=True) + elif model.__name__ in ["HouseholdSelection", "EntitlementCard", "Feedback", "Message"]: + queryset_all = model.original_and_repr_objects.all().only("is_original", "id") + queryset_is_original = queryset_all.filter(is_original=True) + else: + queryset_all = model.all_objects.all().only("is_original", "id") + queryset_is_original = queryset_all.filter(is_original=True) + print( + f"*** {model.__name__} All objects: {queryset_all.count():,}. " + f"Will remove objects with 'is_original=True': {queryset_is_original.count():,}" + ) print(f"Completed in {timezone.now() - start_time}\n", "*" * 55) diff --git a/tests/unit/one_time_scripts/test_remove_migrated_data_is_original.py b/tests/unit/one_time_scripts/test_remove_migrated_data_is_original.py index 35a057688c..6f5b21af97 100644 --- a/tests/unit/one_time_scripts/test_remove_migrated_data_is_original.py +++ b/tests/unit/one_time_scripts/test_remove_migrated_data_is_original.py @@ -48,6 +48,7 @@ from hct_mis_api.apps.targeting.fixtures import HouseholdSelectionFactory from hct_mis_api.apps.targeting.models import HouseholdSelection from hct_mis_api.one_time_scripts.remove_migrated_data_is_original import ( + get_statistic_is_original, remove_migrated_data_is_original, ) @@ -132,6 +133,7 @@ def test_run_remove_migrated_data_is_original(self) -> None: self.assertEqual(GrievanceTicket.default_for_migrations_fix.count(), 2) self.assertEqual(TicketNeedsAdjudicationDetails.objects.count(), 2) + get_statistic_is_original() remove_migrated_data_is_original() # check count after