From 120e6c170c61afe06515d6ca6fc90b81f39369e4 Mon Sep 17 00:00:00 2001 From: Yunlong Li Date: Thu, 22 Aug 2024 11:03:21 +0800 Subject: [PATCH] validate sample name uniqueness and fix date format --- .../ckanext/igsn_theme/logic/batch_process.py | 19 +++++++++++++++++-- .../igsn_theme/logic/batch_validation.py | 19 +++++++++++++------ .../ckanext/igsn_theme/views.py | 8 +++++--- 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_process.py b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_process.py index c77fcc70..3ae2356e 100644 --- a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_process.py +++ b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_process.py @@ -64,6 +64,8 @@ def process_author_emails(sample, authors_df): def prepare_samples_data(samples_df, authors_df, related_resources_df, funding_df, org_id): samples_data = [] + existing_names = set() + errors = [] for _, row in samples_df.iterrows(): sample = row.to_dict() sample["author"] = process_author_emails(sample, authors_df) @@ -99,10 +101,23 @@ def prepare_samples_data(samples_df, authors_df, related_resources_df, funding_d sample["name"] = generate_sample_name(org_id, sample['sample_type'], sample['sample_number']) sample["title"] = generate_sample_title(org_id, sample['sample_type'], sample['sample_number']) + # Check for uniqueness + if sample["name"] in existing_names: + errors.append(f"Duplicate sample name: {sample['name']}") + else: + existing_names.add(sample["name"]) samples_data.append(sample) - - return samples_data + try: + package_list = toolkit.get_action('package_list')({}, {}) + for package in package_list: + package_data = toolkit.get_action('package_show')({}, {'id': package}) + existing_name = package_data.get('name') + if existing_name in existing_names: + errors.append(f"Sample name {existing_name} already exists in CKAN") + except Exception as e: + errors.append(f"Error fetching CKAN data: {str(e)}") + return samples_data, errors def process_related_resources(sample, related_resources_df): related_resources_urls = sample.get("related_resources_urls") diff --git a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_validation.py b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_validation.py index cde35b22..bed15c65 100644 --- a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_validation.py +++ b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_validation.py @@ -135,26 +135,32 @@ def validate_acquisition_date(sample_df): errors = [] for column in ['acquisition_start_date', 'acquisition_end_date']: + # Check for missing dates + missing_dates = sample_df[column].isna() | (sample_df[column] == '') + for index in missing_dates[missing_dates].index: + errors.append(f"Row {index}: '{column}' is missing") + # Check for non-empty cells that are not valid dates non_empty = sample_df[column].notna() & (sample_df[column] != '') - non_date = sample_df[non_empty & pd.to_datetime(sample_df[column], errors='coerce').isna()] + non_date = sample_df[non_empty & pd.to_datetime(sample_df[column], errors='coerce', format='%Y-%m-%d').isna()] for index, value in non_date[column].items(): errors.append(f"Row {index}: '{column}' value '{value}' is not a valid date") - # Convert to datetime, coerce errors to NaT - sample_df['acquisition_start_date'] = pd.to_datetime(sample_df['acquisition_start_date'], errors='coerce') - sample_df['acquisition_end_date'] = pd.to_datetime(sample_df['acquisition_end_date'], errors='coerce') + # Convert to datetime, coerce errors to NaT, and format as 'YYYY-MM-DD' + for column in ['acquisition_start_date', 'acquisition_end_date']: + sample_df[column] = pd.to_datetime(sample_df[column], errors='coerce', format='%Y-%m-%d').dt.strftime('%Y-%m-%d') # Filter for rows where both dates are not null valid_df = sample_df.dropna(subset=['acquisition_start_date', 'acquisition_end_date']) - # Compare dates only for non-null pairs - invalid_mask = valid_df["acquisition_start_date"] >= valid_df["acquisition_end_date"] + # Compare dates + invalid_mask = pd.to_datetime(valid_df["acquisition_start_date"]) >= pd.to_datetime(valid_df["acquisition_end_date"]) invalid_df = valid_df[invalid_mask] for index, row in invalid_df.iterrows(): errors.append(f"Row {index}: acquisition_start_date {row['acquisition_start_date']} is not before acquisition_end_date {row['acquisition_end_date']}") + return errors def validate_parent_samples(df): @@ -438,6 +444,7 @@ def validate_authors(authors_df): errors.extend(validate_author_identifier(authors_df, valid_identifier_types)) return errors + def validate_samples(samples_df, related_resources_df, authors_df, funding_df): errors = [] samples_columns_to_check = ['sample_number', 'description', 'user_keywords', 'sample_type', 'author_emails'] diff --git a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/views.py b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/views.py index ad73f81b..8071d132 100644 --- a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/views.py +++ b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/views.py @@ -85,6 +85,11 @@ def process_excel(self, uploaded_file, org_id): all_errors.extend(validate_related_resources(related_resources_df)) all_errors.extend(validate_parent_samples(samples_df)) + + + samples_data, errors = prepare_samples_data(samples_df, authors_df, related_resources_df, funding_df, org_id) + all_errors.extend(errors) + if all_errors: error_list = "\n".join(f"Error {i+1}. {error}. " for i, error in enumerate(all_errors)) # format the error list to be displayed in human readable format @@ -92,9 +97,6 @@ def process_excel(self, uploaded_file, org_id): raise ValueError(f"""The following errors were found: {formatted_errors}""") - samples_data = prepare_samples_data(samples_df, authors_df, related_resources_df, funding_df, org_id) - - return_value = { "samples": samples_data, "authors": authors_df.to_dict("records"),