Skip to content

Commit

Permalink
validate sample name uniqueness and fix date format
Browse files: browse the repository at this point in the history
  • Loading branch information
Yunlong Li committed Aug 22, 2024
1 parent 1f677e9 commit 120e6c1
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def process_author_emails(sample, authors_df):

def prepare_samples_data(samples_df, authors_df, related_resources_df, funding_df, org_id):
samples_data = []
existing_names = set()
errors = []
for _, row in samples_df.iterrows():
sample = row.to_dict()
sample["author"] = process_author_emails(sample, authors_df)
Expand Down Expand Up @@ -99,10 +101,23 @@ def prepare_samples_data(samples_df, authors_df, related_resources_df, funding_d

sample["name"] = generate_sample_name(org_id, sample['sample_type'], sample['sample_number'])
sample["title"] = generate_sample_title(org_id, sample['sample_type'], sample['sample_number'])
# Check for uniqueness
if sample["name"] in existing_names:
errors.append(f"Duplicate sample name: {sample['name']}")
else:
existing_names.add(sample["name"])

samples_data.append(sample)

return samples_data
try:
package_list = toolkit.get_action('package_list')({}, {})
for package in package_list:
package_data = toolkit.get_action('package_show')({}, {'id': package})
existing_name = package_data.get('name')
if existing_name in existing_names:
errors.append(f"Sample name {existing_name} already exists in CKAN")
except Exception as e:
errors.append(f"Error fetching CKAN data: {str(e)}")
return samples_data, errors

def process_related_resources(sample, related_resources_df):
related_resources_urls = sample.get("related_resources_urls")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -135,26 +135,32 @@ def validate_acquisition_date(sample_df):
errors = []

for column in ['acquisition_start_date', 'acquisition_end_date']:
# Check for missing dates
missing_dates = sample_df[column].isna() | (sample_df[column] == '')
for index in missing_dates[missing_dates].index:
errors.append(f"Row {index}: '{column}' is missing")

# Check for non-empty cells that are not valid dates
non_empty = sample_df[column].notna() & (sample_df[column] != '')
non_date = sample_df[non_empty & pd.to_datetime(sample_df[column], errors='coerce').isna()]
non_date = sample_df[non_empty & pd.to_datetime(sample_df[column], errors='coerce', format='%Y-%m-%d').isna()]

for index, value in non_date[column].items():
errors.append(f"Row {index}: '{column}' value '{value}' is not a valid date")

# Convert to datetime, coerce errors to NaT
sample_df['acquisition_start_date'] = pd.to_datetime(sample_df['acquisition_start_date'], errors='coerce')
sample_df['acquisition_end_date'] = pd.to_datetime(sample_df['acquisition_end_date'], errors='coerce')
# Convert to datetime, coerce errors to NaT, and format as 'YYYY-MM-DD'
for column in ['acquisition_start_date', 'acquisition_end_date']:
sample_df[column] = pd.to_datetime(sample_df[column], errors='coerce', format='%Y-%m-%d').dt.strftime('%Y-%m-%d')

# Filter for rows where both dates are not null
valid_df = sample_df.dropna(subset=['acquisition_start_date', 'acquisition_end_date'])

# Compare dates only for non-null pairs
invalid_mask = valid_df["acquisition_start_date"] >= valid_df["acquisition_end_date"]
# Compare dates
invalid_mask = pd.to_datetime(valid_df["acquisition_start_date"]) >= pd.to_datetime(valid_df["acquisition_end_date"])
invalid_df = valid_df[invalid_mask]

for index, row in invalid_df.iterrows():
errors.append(f"Row {index}: acquisition_start_date {row['acquisition_start_date']} is not before acquisition_end_date {row['acquisition_end_date']}")

return errors

def validate_parent_samples(df):
Expand Down Expand Up @@ -438,6 +444,7 @@ def validate_authors(authors_df):
errors.extend(validate_author_identifier(authors_df, valid_identifier_types))
return errors


def validate_samples(samples_df, related_resources_df, authors_df, funding_df):
errors = []
samples_columns_to_check = ['sample_number', 'description', 'user_keywords', 'sample_type', 'author_emails']
Expand Down
8 changes: 5 additions & 3 deletions ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,18 @@ def process_excel(self, uploaded_file, org_id):
all_errors.extend(validate_related_resources(related_resources_df))
all_errors.extend(validate_parent_samples(samples_df))



samples_data, errors = prepare_samples_data(samples_df, authors_df, related_resources_df, funding_df, org_id)
all_errors.extend(errors)

if all_errors:
error_list = "\n".join(f"Error {i+1}. {error}. " for i, error in enumerate(all_errors))
# format the error list to be displayed in human readable format
formatted_errors = f"<pre style='white-space: pre-wrap;'>{error_list}</pre>"
raise ValueError(f"""The following errors were found:
{formatted_errors}""")

samples_data = prepare_samples_data(samples_df, authors_df, related_resources_df, funding_df, org_id)


return_value = {
"samples": samples_data,
"authors": authors_df.to_dict("records"),
Expand Down

0 comments on commit 120e6c1

Please sign in to comment.