From 9cc85486c884ea35a630953706e20784dd2680ec Mon Sep 17 00:00:00 2001 From: Ewen Corre Date: Mon, 6 Jan 2025 12:06:47 +0100 Subject: [PATCH 1/2] import_geiq: add tests --- itou/utils/faker_providers.py | 7 + .../test_management_command_import_geiq.py | 157 ++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 tests/companies/test_management_command_import_geiq.py diff --git a/itou/utils/faker_providers.py b/itou/utils/faker_providers.py index 7fa682ed76..6f60f46360 100644 --- a/itou/utils/faker_providers.py +++ b/itou/utils/faker_providers.py @@ -2,6 +2,7 @@ import random from django.contrib.gis.geos import Point +from django.utils import timezone from faker.providers import BaseProvider @@ -13,6 +14,12 @@ def asp_ea2_filename(self, date: datetime.date = None) -> str: date_part = random.randint(0, 99999999) if date is None else date.strftime("%Y%m%d") return f"FLUX_EA2_ITOU_{date_part}.zip" + def geiq_filename(self, date: datetime.date = None) -> str: + if date is None: + date = timezone.localdate() + date_part = date.strftime("%Y-%m-%d") + return f"{date_part} - Export BDD FFGEIQ.xls" + def geopoint(self) -> Point: return Point( [float(coord) for coord in self.generator.format("local_latlng", country_code="FR", coords_only=True)] diff --git a/tests/companies/test_management_command_import_geiq.py b/tests/companies/test_management_command_import_geiq.py new file mode 100644 index 0000000000..93c85cc0fb --- /dev/null +++ b/tests/companies/test_management_command_import_geiq.py @@ -0,0 +1,157 @@ +import pytest +from faker import Faker + +from itou.companies.management.commands.import_geiq import get_geiq_df +from itou.utils.export import generate_excel_sheet +from tests.utils.test import create_fake_postcode + + +faker = Faker() + +FILE_HEADERS = ["Nom", "Rue", "Rue (suite)", "Code Postal", "Ville", "SIRET", "e-mail"] + + +def generate_data(rows=185, rows_with_empty_siret=0, rows_with_empty_email=0, duplicated_sirets=0): + data = [] + rows_count = 
0 + duplicated_sirets_count = 0 + while rows_count < rows: + if rows_with_empty_siret > 0: + siret = "" + rows_with_empty_siret -= 1 + else: + siret = faker.numerify("1#############") + + if rows_with_empty_email > 0: + email = "" + rows_with_empty_email -= 1 + else: + email = faker.email() + + row = [ + faker.name(), + faker.street_address(), + "Sous l'escalier", + create_fake_postcode(), + faker.city(), + siret, + email, + ] + + data.append(row) + + if duplicated_sirets_count < duplicated_sirets: + data.append(row) + rows_count += 1 + duplicated_sirets_count += 1 + + rows_count += 1 + return data + + +def test_get_geiq_df(sftp_directory, faker): + # Correct data + rows = 185 + rows_with_empty_siret = 0 + rows_with_empty_email = 0 + data = generate_data( + rows=rows, rows_with_empty_siret=rows_with_empty_siret, rows_with_empty_email=rows_with_empty_email + ) + file_path = sftp_directory.joinpath(faker.geiq_filename()) + with open(file_path, "wb") as xlsxfile: + workbook = generate_excel_sheet(FILE_HEADERS, data) + workbook.save(xlsxfile) + df, info_stats = get_geiq_df(file_path) + assert df.shape == (rows, 8) + assert info_stats == { + "rows_in_file": rows, + "rows_with_a_siret": rows, + "rows_after_deduplication": rows, + "rows_with_empty_email": rows_with_empty_email, + } + + # File too small, need at least 150 rows + rows = 140 + rows_with_empty_siret = 0 + rows_with_empty_email = 0 + data = generate_data( + rows=rows, rows_with_empty_siret=rows_with_empty_siret, rows_with_empty_email=rows_with_empty_email + ) + file_path = sftp_directory.joinpath(faker.geiq_filename()) + with open(file_path, "wb") as xlsxfile: + workbook = generate_excel_sheet(FILE_HEADERS, data) + workbook.save(xlsxfile) + with pytest.raises(AssertionError): + df, info_stats = get_geiq_df(file_path) + + # Too many missing emails + rows = 185 + rows_with_empty_siret = 0 + rows_with_empty_email = 100 + data = generate_data( + rows=rows, rows_with_empty_siret=rows_with_empty_siret, 
rows_with_empty_email=rows_with_empty_email + ) + file_path = sftp_directory.joinpath(faker.geiq_filename()) + with open(file_path, "wb") as xlsxfile: + workbook = generate_excel_sheet(FILE_HEADERS, data) + workbook.save(xlsxfile) + with pytest.raises(AssertionError): + df, info_stats = get_geiq_df(file_path) + + # Some missing emails + rows = 185 + rows_with_empty_siret = 0 + rows_with_empty_email = 20 + data = generate_data( + rows=rows, rows_with_empty_siret=rows_with_empty_siret, rows_with_empty_email=rows_with_empty_email + ) + file_path = sftp_directory.joinpath(faker.geiq_filename()) + with open(file_path, "wb") as xlsxfile: + workbook = generate_excel_sheet(FILE_HEADERS, data) + workbook.save(xlsxfile) + df, info_stats = get_geiq_df(file_path) + assert df.shape == (rows - rows_with_empty_email, 8) + assert info_stats == { + "rows_in_file": rows, + "rows_with_a_siret": rows, + "rows_after_deduplication": rows, + "rows_with_empty_email": rows_with_empty_email, + } + + # Too many missing sirets + rows = 185 + rows_with_empty_siret = 100 + rows_with_empty_email = 0 + data = generate_data( + rows=rows, rows_with_empty_siret=rows_with_empty_siret, rows_with_empty_email=rows_with_empty_email + ) + file_path = sftp_directory.joinpath(faker.geiq_filename()) + with open(file_path, "wb") as xlsxfile: + workbook = generate_excel_sheet(FILE_HEADERS, data) + workbook.save(xlsxfile) + with pytest.raises(AssertionError): + df, info_stats = get_geiq_df(file_path) + + # Duplicated rows + rows = 250 + rows_with_empty_siret = 0 + rows_with_empty_email = 0 + duplicated_sirets = 20 + data = generate_data( + rows=rows, + rows_with_empty_siret=rows_with_empty_siret, + rows_with_empty_email=rows_with_empty_email, + duplicated_sirets=duplicated_sirets, + ) + file_path = sftp_directory.joinpath(faker.geiq_filename()) + with open(file_path, "wb") as xlsxfile: + workbook = generate_excel_sheet(FILE_HEADERS, data) + workbook.save(xlsxfile) + df, info_stats = get_geiq_df(file_path) + 
assert df.shape == (rows - duplicated_sirets, 8) + assert info_stats == { + "rows_in_file": rows, + "rows_with_a_siret": rows, + "rows_after_deduplication": rows - duplicated_sirets, + "rows_with_empty_email": rows_with_empty_email, + } From 0b84479da1a7f8f4f9a49f6eed7b4943904a1465 Mon Sep 17 00:00:00 2001 From: Ewen Corre Date: Mon, 6 Jan 2025 11:31:00 +0100 Subject: [PATCH 2/2] import_geiq: force SIRET type to integer When replacing NaN elements with None, if the siret column is not explicitly defined as integer, it is converted to float. These values, which are later converted to strings, need to be integers; otherwise they are suffixed with `.0`. --- .../management/commands/import_geiq.py | 3 +++ .../test_management_command_import_geiq.py | 20 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/itou/companies/management/commands/import_geiq.py b/itou/companies/management/commands/import_geiq.py index 8c1811d1f5..a80e2bcb7f 100755 --- a/itou/companies/management/commands/import_geiq.py +++ b/itou/companies/management/commands/import_geiq.py @@ -34,6 +34,9 @@ def get_geiq_df(filename): } df = remap_columns(df, column_mapping=column_mapping) + # Force siret type to integer, otherwise replacing NaN elements to None blindly converts them to float. + df["siret"] = df["siret"].astype("Int64") + # Replace NaN elements with None.
df = df.replace({np.nan: None}) diff --git a/tests/companies/test_management_command_import_geiq.py b/tests/companies/test_management_command_import_geiq.py index 93c85cc0fb..4e6e33a65f 100644 --- a/tests/companies/test_management_command_import_geiq.py +++ b/tests/companies/test_management_command_import_geiq.py @@ -132,6 +132,26 @@ def test_get_geiq_df(sftp_directory, faker): with pytest.raises(AssertionError): df, info_stats = get_geiq_df(file_path) + # Missing some sirets + rows = 185 + rows_with_empty_siret = 20 + rows_with_empty_email = 0 + data = generate_data( + rows=rows, rows_with_empty_siret=rows_with_empty_siret, rows_with_empty_email=rows_with_empty_email + ) + file_path = sftp_directory.joinpath(faker.geiq_filename()) + with open(file_path, "wb") as xlsxfile: + workbook = generate_excel_sheet(FILE_HEADERS, data) + workbook.save(xlsxfile) + df, info_stats = get_geiq_df(file_path) + assert df.shape == (rows - rows_with_empty_siret, 8) + assert info_stats == { + "rows_in_file": rows, + "rows_with_a_siret": rows - rows_with_empty_siret, + "rows_after_deduplication": rows - rows_with_empty_siret, + "rows_with_empty_email": 0, + } + # Duplicated rows rows = 250 rows_with_empty_siret = 0