From 72293cc586dab877b4cc4652a54c5c9e2f8a45b4 Mon Sep 17 00:00:00 2001 From: Dave Lawrence Date: Tue, 8 Oct 2024 16:58:17 +1030 Subject: [PATCH] issue #1178 - properly quote CSVs (handle commas) --- analysis/tasks/analysis_grid_export_tasks.py | 2 +- library/django_utils/jqgrid_view.py | 2 +- .../commands/one_off_fix_csv_quoting.py | 52 +++++++++++++------ 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/analysis/tasks/analysis_grid_export_tasks.py b/analysis/tasks/analysis_grid_export_tasks.py index e56d1308d..f175f56cd 100644 --- a/analysis/tasks/analysis_grid_export_tasks.py +++ b/analysis/tasks/analysis_grid_export_tasks.py @@ -79,7 +79,7 @@ def _write_node_to_cached_generated_file(cgf, analysis, node, name, export_type) if export_type == 'csv': original_filename = media_root_filename zip_file_path = media_root_filename + ".zip" - with zipfile.ZipFile(zip_file_path, 'w') as zipf: + with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: zipf.write(original_filename, arcname=os.path.basename(original_filename)) os.unlink(original_filename) media_root_filename = zip_file_path diff --git a/library/django_utils/jqgrid_view.py b/library/django_utils/jqgrid_view.py index c03c05ad8..90784f729 100644 --- a/library/django_utils/jqgrid_view.py +++ b/library/django_utils/jqgrid_view.py @@ -136,7 +136,7 @@ def grid_export_csv(colmodels, items) -> Iterator[str]: pseudo_buffer = StashFile() header, labels = colmodel_header_labels(colmodels, label_overrides=label_overrides) # Don't use dictwriter as some sample names may be the same - writer = csv.writer(pseudo_buffer, dialect='excel', escapechar='\\', quoting=csv.QUOTE_NONE) + writer = csv.writer(pseudo_buffer, dialect='excel', quoting=csv.QUOTE_MINIMAL) writer.writerow(header) def iter_row_writer(): diff --git a/snpdb/management/commands/one_off_fix_csv_quoting.py b/snpdb/management/commands/one_off_fix_csv_quoting.py index 6c64f2e7d..cdd14cf96 100644 --- a/snpdb/management/commands/one_off_fix_csv_quoting.py +++ b/snpdb/management/commands/one_off_fix_csv_quoting.py @@ -1,6 +1,8 @@ import csv +import logging import os import zipfile +from zipfile import ZipFile from django.core.management.base import BaseCommand from snpdb.models import CachedGeneratedFile @@ -11,25 +13,43 @@ class Command(BaseCommand): """ @staticmethod - def fix_csv_zip(zip_filename): - # Move the old one to a backup - backup_name = zip_filename + ".backup" - os.rename(zip_filename, backup_name) + def fix_csv(csv_file, fixed_csv_filename): + reader = csv.reader(csv_file.read().decode('utf-8').splitlines(), dialect='excel', escapechar='\\', + quoting=csv.QUOTE_NONE) - with zipfile.ZipFile(zip_filename, 'r') as zip_file: - csv_filename = os.path.splitext(os.path.basename(zip_filename))[0] - with zip_file.open(csv_filename) as csv_file: - reader = csv.reader(csv_file.read().decode('utf-8').splitlines(), dialect='excel', escapechar='\\', - quoting=csv.QUOTE_NONE) + logging.info("Fixing in file: %s", fixed_csv_filename) + with open(fixed_csv_filename, "wt") as out_csv_file: + writer = csv.writer(out_csv_file, dialect='excel', quoting=csv.QUOTE_MINIMAL) + for r in reader: + writer.writerow(r) + @staticmethod + def fix_csv_zip(zip_file, zip_filename): + csv_filename = os.path.splitext(os.path.basename(zip_filename))[0] + with zip_file.open(csv_filename) as csv_file: fixed_csv_filename = os.path.splitext(zip_filename)[0] - with open(fixed_csv_filename, "wt") as out_csv_file: - writer = csv.writer(out_csv_file, dialect='excel', quoting=csv.QUOTE_MINIMAL) - for r in reader: - writer.writerow(r) + Command.fix_csv(csv_file, fixed_csv_filename) + + zip_file_path = fixed_csv_filename + ".zip" + logging.info("Putting into zip: %s", zip_file_path) + with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: + zipf.write(fixed_csv_filename, arcname=os.path.basename(fixed_csv_filename)) + os.unlink(fixed_csv_filename) def handle(self, *args, **options): AFFECTED_GENERATORS = ["export_sample_to_downloadable_file", "export_cohort_to_downloadable_file"] - for cgf in CachedGeneratedFile.objects.filter(generator__in=AFFECTED_GENERATORS): - self.fix_csv_zip(cgf.filename) - + for cgf in CachedGeneratedFile.objects.filter(generator__in=AFFECTED_GENERATORS, filename__isnull=False): + if not (cgf.filename.endswith(".csv") or cgf.filename.endswith(".csv.zip")): + continue + + if os.path.exists(cgf.filename): + logging.info("Need to fix: %s", cgf.filename) + # Move the old one to a backup + backup_name = cgf.filename + ".backup" + os.rename(cgf.filename, backup_name) + if cgf.filename.endswith(".zip"): + with zipfile.ZipFile(backup_name, 'r') as zip_file: + self.fix_csv_zip(zip_file, cgf.filename) + else: + with open(backup_name, "rb") as csv_file: + Command.fix_csv(csv_file, cgf.filename)