Skip to content

Commit

Permalink
#3748 Added logic to delete unnecessary workbook artifacts via django… (
Browse files Browse the repository at this point in the history
#4193)

* #3748 Added logic to delete unnecessary workbook artifacts via django command

* #3748 Updated django command to remove files from S3 only

* Bug fix

* Fixed linting

* Fixed unit tests
  • Loading branch information
sambodeme authored Sep 6, 2024
1 parent 672b7e2 commit 5dc66d2
Show file tree
Hide file tree
Showing 3 changed files with 425 additions and 114 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from django.core.management.base import BaseCommand
from dissemination.remove_workbook_artifacts import delete_workbooks

import logging


logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Management command that removes workbook artifacts for one partition
    of the disseminated reports. Thin CLI wrapper around delete_workbooks."""

    help = "Delete workbook artifacts for a specific partition of disseminated reports."

    # (flag, required, default, help) specs for the four integer options.
    _ARG_SPECS = (
        (
            "--partition_number",
            True,
            None,
            "The partition number to process (e.g., 1, 2, 3).",
        ),
        (
            "--total_partitions",
            True,
            None,
            "The total number of partitions (e.g., 4 if splitting the load into four parts).",
        ),
        (
            "--page_size",
            False,
            10,
            "Number of items to process per page",
        ),
        (
            "--pages",
            False,
            None,
            "Maximum number of pages to process",
        ),
    )

    def add_arguments(self, parser):
        """Register the partitioning and paging options."""
        for flag, is_required, default_value, help_text in self._ARG_SPECS:
            parser.add_argument(
                flag,
                type=int,
                required=is_required,
                default=default_value,
                help=help_text,
            )

    def handle(self, *args, **options):
        """Announce the partition being processed, then delegate the work."""
        partition = options["partition_number"]
        partitions_total = options["total_partitions"]

        self.stdout.write(
            self.style.SUCCESS(
                f"Processing partition {partition} of {partitions_total}"
            )
        )
        delete_workbooks(
            partition,
            partitions_total,
            page_size=options["page_size"],
            pages=options["pages"],
        )
283 changes: 216 additions & 67 deletions backend/dissemination/remove_workbook_artifacts.py
Original file line number Diff line number Diff line change
@@ -1,67 +1,216 @@
import logging

from django.conf import settings
from audit.models.models import ExcelFile
from boto3 import client as boto3_client
from botocore.client import ClientError, Config

logger = logging.getLogger(__name__)


def remove_workbook_artifacts(sac):
    """
    Remove all workbook artifacts associated with the given sac.

    Deletes the ``excel/<filename>`` S3 objects for every ExcelFile record
    linked to ``sac``; the ExcelFile database rows are left untouched.
    Failures are logged, never raised.
    """
    try:
        excel_files = ExcelFile.objects.filter(sac=sac)
        files = [f"excel/{excel_file.filename}" for excel_file in excel_files]

        if files:
            # Delete the files from S3 in bulk
            delete_files_in_bulk(files, sac)
        else:
            # BUG FIX: QuerySet.filter() never raises DoesNotExist, so the
            # previous `except ExcelFile.DoesNotExist` branch was dead code
            # and the empty case was silently skipped. Log it explicitly.
            logger.info(f"No files found to delete for report: {sac.report_id}")

    except Exception as e:
        logger.error(
            f"Failed to delete files from S3 for report: {sac.report_id}. Error: {e}"
        )


def delete_files_in_bulk(filenames, sac):
    """Delete files from S3 in bulk.

    Args:
        filenames: list of S3 object keys (e.g. ``excel/<name>``) to delete.
        sac: the report the files belong to; only its ``report_id`` is used,
            for log messages.
    """
    # This client uses the internal endpoint URL because we're making a request
    # to S3 from within the app.
    s3_client = boto3_client(
        service_name="s3",
        region_name=settings.AWS_S3_PRIVATE_REGION_NAME,
        aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY,
        endpoint_url=settings.AWS_S3_PRIVATE_INTERNAL_ENDPOINT,
        config=Config(signature_version="s3v4"),
    )

    try:
        # BUG FIX: the DeleteObjects API rejects requests with more than 1000
        # keys, so large deletions must be chunked.
        for start in range(0, len(filenames), 1000):
            delete_objects = [
                {"Key": filename} for filename in filenames[start : start + 1000]
            ]

            response = s3_client.delete_objects(
                Bucket=settings.AWS_PRIVATE_STORAGE_BUCKET_NAME,
                Delete={"Objects": delete_objects},
            )

            for deleted in response.get("Deleted", []):
                logger.info(
                    f"Successfully deleted {deleted['Key']} from S3 for report: {sac.report_id}"
                )

            for error in response.get("Errors", []):
                logger.error(
                    f"Failed to delete {error['Key']} from S3 for report: {sac.report_id}. Error: {error['Message']}"  # nosec B608
                )

    except ClientError as e:
        logger.error(
            f"Failed to delete files from S3 for report: {sac.report_id}. Error: {e}"
        )
import logging
import math

from django.conf import settings
from audit.models.models import ExcelFile, SingleAuditChecklist
from boto3 import client as boto3_client
from botocore.client import ClientError, Config
from django.core.paginator import Paginator
from django.core.paginator import PageNotAnInteger, EmptyPage


logger = logging.getLogger(__name__)


def remove_workbook_artifacts(sac):
    """
    Remove all workbook artifacts associated with the given sac.

    Deletes the ``excel/<filename>`` S3 objects for every ExcelFile record
    linked to ``sac``; the ExcelFile database rows are left untouched.
    Failures are logged, never raised.
    """
    try:
        excel_files = ExcelFile.objects.filter(sac=sac)
        files = [f"excel/{excel_file.filename}" for excel_file in excel_files]

        if files:
            # Delete the files from S3 in bulk
            delete_files_in_bulk(files, sac)
        else:
            # BUG FIX: QuerySet.filter() never raises DoesNotExist, so the
            # previous `except ExcelFile.DoesNotExist` branch was dead code
            # and the empty case was silently skipped. Log it explicitly.
            logger.info(f"No files found to delete for report: {sac.report_id}")

    except Exception as e:
        logger.error(
            f"Failed to delete files from S3 for report: {sac.report_id}. Error: {e}"
        )


def delete_files_in_bulk(filenames, sac):
    """Delete files from S3 in bulk.

    Args:
        filenames: list of S3 object keys (e.g. ``excel/<name>``) to delete.
        sac: the report the files belong to; only its ``report_id`` is used,
            for log messages.
    """
    # This client uses the internal endpoint URL because we're making a request
    # to S3 from within the app.
    s3_client = boto3_client(
        service_name="s3",
        region_name=settings.AWS_S3_PRIVATE_REGION_NAME,
        aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY,
        endpoint_url=settings.AWS_S3_PRIVATE_INTERNAL_ENDPOINT,
        config=Config(signature_version="s3v4"),
    )

    try:
        # BUG FIX: the DeleteObjects API rejects requests with more than 1000
        # keys, so large deletions must be chunked.
        for start in range(0, len(filenames), 1000):
            delete_objects = [
                {"Key": filename} for filename in filenames[start : start + 1000]
            ]

            response = s3_client.delete_objects(
                Bucket=settings.AWS_PRIVATE_STORAGE_BUCKET_NAME,
                Delete={"Objects": delete_objects},
            )

            for deleted in response.get("Deleted", []):
                logger.info(
                    f"Successfully deleted {deleted['Key']} from S3 for report: {sac.report_id}"
                )

            for error in response.get("Errors", []):
                logger.error(
                    f"Failed to delete {error['Key']} from S3 for report: {sac.report_id}. Error: {error['Message']}"  # nosec B608
                )

    except ClientError as e:
        logger.error(
            f"Failed to delete files from S3 for report: {sac.report_id}. Error: {e}"
        )


def clean_artifacts(sac_list):
    """
    Perform necessary cleanup associated with the given list of sac values.

    Finds the ExcelFile records for every sac in ``sac_list``, deletes the
    corresponding S3 objects via ``batch_removal``, and logs the per-file
    outcome. The ExcelFile database rows are intentionally not deleted.
    """
    try:
        # PERF FIX: the original code iterated the queryset twice (one list
        # comprehension and one dict comprehension => two DB queries) and hit
        # `excel_file.sac` per row (N+1 queries). Build the key -> report_id
        # map in a single pass with the sac rows joined in.
        excel_files = ExcelFile.objects.filter(sac__in=sac_list).select_related("sac")
        file_report_map = {
            f"excel/{excel_file.filename}": excel_file.sac.report_id
            for excel_file in excel_files
        }
        # Note: duplicate filenames collapse to one S3 key here, which matches
        # what a bulk S3 delete acts on anyway.
        files = list(file_report_map)

        if files:
            logger.info(
                f"Found {len(files)} ExcelFile records for reports: {[sac.report_id for sac in sac_list]}"
            )

            # Track results but do not delete the ExcelFile records from the database
            successful_deletes, failed_deletes = batch_removal(
                files,
                sac_list,
                file_report_map,
            )

            if failed_deletes:
                logger.error(
                    f"Failed to delete the following files from S3: {failed_deletes}"
                )
            if successful_deletes:
                logger.info(
                    f"Successfully deleted the following files from S3: {successful_deletes}"
                )

    except Exception as e:
        logger.error(f"Failed to process files for the provided sac values. Error: {e}")


def batch_removal(filenames, sac_list, sac_to_report_id_map):
    """Delete files from S3 in bulk and return the results.

    Args:
        filenames: S3 object keys to delete.
        sac_list: the reports being processed; used only in log messages.
        sac_to_report_id_map: maps each S3 key back to its owning report_id.

    Returns:
        Tuple ``(successful_deletes, failed_deletes)`` of per-file result
        dicts. On a client-level failure, returns ``([], [{"error_message": ...}])``.
    """
    # This client uses the internal endpoint URL because we're making a request
    # to S3 from within the app.
    s3_client = boto3_client(
        service_name="s3",
        region_name=settings.AWS_S3_PRIVATE_REGION_NAME,
        aws_access_key_id=settings.AWS_PRIVATE_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_PRIVATE_SECRET_ACCESS_KEY,
        endpoint_url=settings.AWS_S3_PRIVATE_INTERNAL_ENDPOINT,
        config=Config(signature_version="s3v4"),
    )

    successful_deletes = []
    failed_deletes = []
    try:
        # BUG FIX: the DeleteObjects API rejects requests with more than 1000
        # keys, so chunk the workload and accumulate results across chunks.
        for start in range(0, len(filenames), 1000):
            delete_objects = [
                {"Key": filename} for filename in filenames[start : start + 1000]
            ]
            response = s3_client.delete_objects(
                Bucket=settings.AWS_PRIVATE_STORAGE_BUCKET_NAME,
                Delete={"Objects": delete_objects},
            )

            for deleted in response.get("Deleted", []):
                filename = deleted["Key"]
                successful_deletes.append(
                    {
                        "filename": filename,
                        "sac_report_id": sac_to_report_id_map[filename],
                    }
                )

            for error in response.get("Errors", []):
                filename = error["Key"]
                failed_deletes.append(
                    {
                        "filename": filename,
                        "sac_report_id": sac_to_report_id_map[filename],
                        "error_message": error["Message"],
                    }
                )

        return successful_deletes, failed_deletes

    except ClientError as e:
        logger.error(
            f"Failed to delete files from S3 for sac values: {[sac.report_id for sac in sac_list]}. Error: {e}"
        )
        return [], [{"error_message": str(e)}]
    except Exception as e:
        logger.error(f"Failed to delete files from S3. Error: {e}")
        return [], [{"error_message": str(e)}]


def delete_workbooks(partition_number, total_partitions, page_size=10, pages=None):
    """Iterates over disseminated reports for the specified partition.

    Splits the ordered set of disseminated SingleAuditChecklist ids into
    ``total_partitions`` contiguous slices, then processes slice
    ``partition_number`` (1-based) page by page, handing each page of
    records to ``clean_artifacts``.

    Args:
        partition_number: 1-based index of the slice to process.
        total_partitions: total number of slices the workload is split into.
        page_size: number of reports per page (default 10).
        pages: optional cap on how many pages to process; ``None`` means all.

    Raises:
        ValueError: if ``partition_number`` is outside ``[1, total_partitions]``.
    """
    if partition_number < 1 or partition_number > total_partitions:
        raise ValueError(
            "Invalid partition number. It must be between 1 and the total number of partitions."
        )

    all_ids = (
        SingleAuditChecklist.objects.filter(
            submission_status=SingleAuditChecklist.STATUS.DISSEMINATED
        )
        .values_list("id", flat=True)
        .order_by("id")
    )

    total_ids = len(all_ids)
    ids_per_partition = math.ceil(total_ids / total_partitions)

    start_index = (partition_number - 1) * ids_per_partition
    end_index = min(partition_number * ids_per_partition, total_ids)

    ids_to_process = all_ids[start_index:end_index]

    sacs = SingleAuditChecklist.objects.filter(id__in=ids_to_process).order_by("id")

    paginator = Paginator(sacs, page_size)
    total_pages = (
        paginator.num_pages if pages is None else min(pages, paginator.num_pages)
    )

    # PERF FIX: paginator.count reuses the row count already computed for
    # num_pages instead of issuing a second COUNT(*) via sacs.count().
    logger.info(
        f"Retrieving {paginator.count} reports for partition {partition_number} of {total_partitions}"
    )

    for page_number in range(1, total_pages + 1):
        try:
            page = paginator.page(page_number)
            # len() counts the already-fetched page instead of a COUNT query.
            # Dead-code fix: the former `except PageNotAnInteger` branch was
            # unreachable because range() only yields ints.
            logger.info(
                f"Processing page {page_number} with {len(page.object_list)} reports."
            )

            # Extract sac values from the current page
            sac_list = list(page.object_list)
            clean_artifacts(sac_list)

        except EmptyPage:
            logger.info(f"No more pages to process after page {page_number}.")
            break
        except Exception as e:
            logger.error(f"An error occurred while processing page {page_number}: {e}")
Loading

0 comments on commit 5dc66d2

Please sign in to comment.