diff --git a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_process.py b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_process.py index fca3d04e..f2fc9757 100644 --- a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_process.py +++ b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_process.py @@ -64,6 +64,8 @@ def process_author_emails(sample, authors_df): def prepare_samples_data(samples_df, authors_df, related_resources_df, funding_df, org_id): samples_data = [] + existing_names = set() + errors = [] for _, row in samples_df.iterrows(): sample = row.to_dict() sample["author"] = process_author_emails(sample, authors_df) @@ -75,6 +77,10 @@ def prepare_samples_data(samples_df, authors_df, related_resources_df, funding_d sample['notes'] = sample['description'] sample['location_choice'] = 'noLocation' sample['parent_sample'] = sample['parent_sample'] + sample['parent'] = '' + + sample['acquisition_start_date'] = row['acquisition_start_date'].strftime('%Y-%m-%d') if pd.notnull(row['acquisition_start_date']) else None + sample['acquisition_end_date'] = row['acquisition_end_date'].strftime('%Y-%m-%d') if pd.notnull(row['acquisition_end_date']) else None org = toolkit.get_action('organization_show')({}, {'id': org_id}) sample['owner_org'] = org_id @@ -98,10 +104,23 @@ def prepare_samples_data(samples_df, authors_df, related_resources_df, funding_d sample["name"] = generate_sample_name(org_id, sample['sample_type'], sample['sample_number']) sample["title"] = generate_sample_title(org_id, sample['sample_type'], sample['sample_number']) + # Check for uniqueness + if sample["name"] in existing_names: + errors.append(f"Duplicate sample name: {sample['name']}") + else: + existing_names.add(sample["name"]) samples_data.append(sample) - - return samples_data + try: + package_list = toolkit.get_action('package_list')({}, {}) + for package in package_list: + package_data = toolkit.get_action('package_show')({}, {'id': package}) + 
existing_name = package_data.get('name') + if existing_name in existing_names: + errors.append(f"Sample name {existing_name} already exists in CKAN") + except Exception as e: + errors.append(f"Error fetching CKAN data: {str(e)}") + return samples_data, errors def process_related_resources(sample, related_resources_df): related_resources_urls = sample.get("related_resources_urls") diff --git a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_validation.py b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_validation.py index 82e5af27..bed15c65 100644 --- a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_validation.py +++ b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/logic/batch_validation.py @@ -2,15 +2,21 @@ import re import pandas as pd -# TODO this function also has some issues, when its empty: still return Invalid rows (depth_from >= depth_to): 0 + def validate_sample_depth(sample_df): errors = [] filtered_df = sample_df.dropna(subset=["depth_from", "depth_to"]) - + # Check if depth_from and depth_to are valid numbers + for column in ["depth_from", "depth_to"]: + non_empty = sample_df[column].notna() & (sample_df[column].astype(str).str.strip() != '') + non_numeric = sample_df[non_empty & pd.to_numeric(sample_df[column].astype(str).str.strip(), errors='coerce').isna()] + for index, value in non_numeric[column].items(): + errors.append(f"Row {index}: '{column}' value '{value.strip()}' is not a valid number") + # Convert 'depth_from' and 'depth_to' to numeric, treating them as strings initially - filtered_df["depth_from"] = pd.to_numeric(filtered_df["depth_from"], errors="coerce") - filtered_df["depth_to"] = pd.to_numeric(filtered_df["depth_to"], errors="coerce") + filtered_df["depth_from"] = pd.to_numeric(filtered_df["depth_from"].astype(str).str.strip(), errors="coerce") + filtered_df["depth_to"] = pd.to_numeric(filtered_df["depth_to"].astype(str).str.strip(), errors="coerce") # Remove rows where conversion to numeric failed 
filtered_df = filtered_df.dropna(subset=["depth_from", "depth_to"]) @@ -29,85 +35,132 @@ def validate_elevation(sample_df): non_empty_elevations = sample_df["elevation"].dropna().replace('', pd.NA).dropna() # Attempt to convert non-empty values to numeric - numeric_result = pd.to_numeric(non_empty_elevations, errors='coerce') + numeric_result = pd.to_numeric(non_empty_elevations.astype(str).str.strip(), errors='coerce') # Identify rows where conversion failed (resulting in NaN) invalid_rows = numeric_result[numeric_result.isna()] for index, value in invalid_rows.items(): - original_value = sample_df.loc[index, "elevation"] + original_value = sample_df.loc[index, "elevation"].strip() errors.append(f"Row {index}: Elevation '{original_value}' is not a valid number") return errors def validate_match_related_resource_url(sample_df, resource_df): errors = [] + + # Extract unique URLs from sample_df sample_urls = set() for cell in sample_df['related_resources_urls']: urls = set(url.strip() for url in cell.split(';') if url.strip()) sample_urls.update(urls) - # Check each URL in resource_df against the set of sample URLs - missing_urls = [] - for url in resource_df['related_resource_url']: - if url not in sample_urls: - missing_urls.append(url) - - # If there are any missing URLs, raise an error - if missing_urls: - errors.append("The following related resource URLs are not in the sample sheet, please double check your related resource URLs:\n" + ', '.join(missing_urls)) + + # Extract unique URLs from resource_df + resource_urls = set(url.strip() for url in resource_df['related_resource_url'] if url.strip()) + + # Check if the URL sets in both dataframes match exactly + if sample_urls != resource_urls: + missing_in_sample = resource_urls - sample_urls + missing_in_resource = sample_urls - resource_urls + + if missing_in_sample: + errors.append("The following related resource URLs are not in the sample sheet:\n" + ', '.join(missing_in_sample)) + + if missing_in_resource: + 
errors.append("The following sample URLs are not in the related resource sheet:\n" + ', '.join(missing_in_resource)) + return errors def validate_match_user_email(sample_df, author_df): errors = [] sample_emails = set() for cell in sample_df['author_emails']: - emails = set(email.strip() for email in cell.split(';') if email.strip()) - sample_emails.update(emails) - # Check each email in author_df against the set of sample emails - missing_emails = [] + if isinstance(cell, str): + emails = set(email.strip().lower() for email in cell.split(';') if email.strip()) + sample_emails.update(emails) + + # Extract unique emails from author_df + author_emails = set() for email in author_df['author_email']: - if email not in sample_emails: - missing_emails.append(email) - - # If there are any missing emails, raise an error - if missing_emails: - errors.append("The following author emails are not in the sample sheet, please double check your author emails:\n" + ', '.join(missing_emails)) + if isinstance(email, str): + stripped_email = email.strip().lower() + if stripped_email: + author_emails.add(stripped_email) + + # Check if the email sets in both dataframes match exactly + if sample_emails != author_emails: + missing_in_sample = author_emails - sample_emails + missing_in_author = sample_emails - author_emails + + if missing_in_sample: + errors.append("The following author emails are not in the sample sheet:\n" + ', '.join(missing_in_sample)) + + if missing_in_author: + errors.append("The following sample emails are not in the author sheet:\n" + ', '.join(missing_in_author)) + return errors def validate_match_project_identifier(sample_df, project_df): errors = [] - sample_project_identifiers = set() + + # Extract unique project IDs from sample_df + sample_project_ids = set() for cell in sample_df['project_ids']: - # Skip if the cell is empty or NaN - if pd.isna(cell) or cell == '': - continue - project_identifiers = set(project_identifier.strip() for project_identifier in 
cell.split(';') if project_identifier.strip()) - sample_project_identifiers.update(project_identifiers) - # Check each project identifier in project_df against the set of sample project identifiers - missing_project_identifiers = [] - for project_identifier in project_df['project_identifier']: - if project_identifier not in sample_project_identifiers and project_identifier != '': - missing_project_identifiers.append(project_identifier) - - # If there are any missing project identifiers, raise an error - if missing_project_identifiers: - errors.append("The following project identifiers are not in the sample sheet, please double check your project identifiers:\n" + ', '.join(missing_project_identifiers)) + if pd.notna(cell) and cell.strip() != '': + ids = set(id.strip() for id in cell.split(';') if id.strip()) + sample_project_ids.update(ids) + + # Extract unique project identifiers from project_df + project_identifiers = set(id.strip() for id in project_df['project_identifier'] if pd.notna(id) and id.strip()) + + # Check if the project ID/identifier sets in both dataframes match exactly + if sample_project_ids != project_identifiers: + missing_in_sample = project_identifiers - sample_project_ids + missing_in_project = sample_project_ids - project_identifiers + + if missing_in_sample: + errors.append("The following project identifiers are not in the sample sheet:\n" + ', '.join(missing_in_sample)) + + if missing_in_project: + errors.append("The following project IDs from the sample sheet are not in the project sheet:\n" + ', '.join(missing_in_project)) + + # Optionally, you can add a check to ensure there's at least one project ID/identifier in each dataframe + if not sample_project_ids: + errors.append("The sample sheet does not contain any valid project IDs.") + if not project_identifiers: + errors.append("The project sheet does not contain any valid project identifiers.") + return errors def validate_acquisition_date(sample_df): errors = [] - # Convert to 
datetime, coerce errors to NaT - sample_df['acquisition_start_date'] = pd.to_datetime(sample_df['acquisition_start_date'], errors='coerce') - sample_df['acquisition_end_date'] = pd.to_datetime(sample_df['acquisition_end_date'], errors='coerce') + + for column in ['acquisition_start_date', 'acquisition_end_date']: + # Check for missing dates + missing_dates = sample_df[column].isna() | (sample_df[column] == '') + for index in missing_dates[missing_dates].index: + errors.append(f"Row {index}: '{column}' is missing") + + # Check for non-empty cells that are not valid dates + non_empty = sample_df[column].notna() & (sample_df[column] != '') + non_date = sample_df[non_empty & pd.to_datetime(sample_df[column], errors='coerce', format='%Y-%m-%d').isna()] + + for index, value in non_date[column].items(): + errors.append(f"Row {index}: '{column}' value '{value}' is not a valid date") + + # Convert to datetime, coerce errors to NaT, and format as 'YYYY-MM-DD' + for column in ['acquisition_start_date', 'acquisition_end_date']: + sample_df[column] = pd.to_datetime(sample_df[column], errors='coerce', format='%Y-%m-%d').dt.strftime('%Y-%m-%d') # Filter for rows where both dates are not null valid_df = sample_df.dropna(subset=['acquisition_start_date', 'acquisition_end_date']) - # Compare dates only for non-null pairs - invalid_mask = valid_df["acquisition_start_date"] >= valid_df["acquisition_end_date"] + # Compare dates + invalid_mask = pd.to_datetime(valid_df["acquisition_start_date"]) >= pd.to_datetime(valid_df["acquisition_end_date"]) invalid_df = valid_df[invalid_mask] for index, row in invalid_df.iterrows(): errors.append(f"Row {index}: acquisition_start_date {row['acquisition_start_date']} is not before acquisition_end_date {row['acquisition_end_date']}") + return errors def validate_parent_samples(df): @@ -149,7 +202,7 @@ def validate_sample(row): elif row['parent_sample'] in ckan_samples: parent_start_date = ckan_samples[row['parent_sample']] else: - 
errors.append(f"Parent sample {row['parent_sample']} for sample {row['sample_number']} not found in DataFrame or CKAN") + errors.append(f"Parent sample {row['parent_sample']} for sample {row['sample_number']} not found in the sample repository") return False if pd.isnull(parent_start_date): @@ -206,8 +259,9 @@ def validate_epsg(samples_df): ] # Identify rows with non-numeric latitude or longitude - invalid_latitudes = samples_df['point_latitude'].apply(lambda x: not is_cell_empty(x) and not is_numeric(x)) - invalid_longitudes = samples_df['point_longitude'].apply(lambda x: not is_cell_empty(x) and not is_numeric(x)) + invalid_latitudes = samples_df['point_latitude'].apply(lambda x: not is_cell_empty(x.strip() if isinstance(x, str) else x) and not is_numeric(x.strip() if isinstance(x, str) else x)) + invalid_longitudes = samples_df['point_longitude'].apply(lambda x: not is_cell_empty(x.strip() if isinstance(x, str) else x) and not is_numeric(x.strip() if isinstance(x, str) else x)) + # Raise an error if any invalid rows are found if invalid_latitudes.any(): @@ -266,8 +320,8 @@ def is_url(url: str) -> bool: def validate_affiliation_identifier(authors_df, valid_affiliation_identifier_types): errors = [] if 'author_affiliation_identifier' in authors_df.columns: - authors_df['author_affiliation_identifier'] = authors_df['author_affiliation_identifier'].fillna('') - authors_df['author_affiliation_identifier_type'] = authors_df['author_affiliation_identifier_type'].fillna('') + authors_df['author_affiliation_identifier'] = authors_df['author_affiliation_identifier'].fillna('').astype(str).str.strip() + authors_df['author_affiliation_identifier_type'] = authors_df['author_affiliation_identifier_type'].fillna('').astype(str).str.strip() missing_affil_type = authors_df[ authors_df['author_affiliation_identifier'].apply(lambda x: not is_cell_empty(x)) & @@ -304,15 +358,15 @@ def validate_related_resources(related_resources_df): ] # Check for any missing required fields in any 
of the related resources entries - if related_resources_df[required_fields].map(is_cell_empty).any().any(): + if related_resources_df[required_fields].applymap(lambda x: is_cell_empty(x.strip() if isinstance(x, str) else x)).any().any(): errors.append("Missing required fields in related resources entries.") # make sure the related_resource_url is a valid URL - invalid_entries = related_resources_df[related_resources_df['related_resource_url'].apply(lambda x: not is_url(str(x)))] + invalid_entries = related_resources_df[related_resources_df['related_resource_url'].apply(lambda x: not is_url(str(x).strip()))] if not invalid_entries.empty: - error_message = "Invalid URLs found at the following entries:\n" + error_message = "Invalid related resources URLs found at the following entries:\n" for idx, value in invalid_entries['related_resource_url'].items(): - error_message += f"Index {idx}: {value}\n" + error_message += f"Index {idx}: {value.strip()}\n" errors.append(error_message) if not related_resources_df['related_resource_type'].isin(valid_resource_types).all(): @@ -328,23 +382,25 @@ def validate_related_resources(related_resources_df): def validate_author_identifier(authors_df, valid_identifier_types): errors = [] if 'author_identifier' in authors_df.columns: - valid_identifiers = authors_df[~authors_df['author_identifier'].apply(is_cell_empty)] + valid_identifiers = authors_df[~authors_df['author_identifier'].apply(lambda x: is_cell_empty(x.strip() if isinstance(x, str) else x))] if not valid_identifiers.empty: - missing_identifier_type = valid_identifiers[valid_identifiers['author_identifier_type'].apply(is_cell_empty)] + missing_identifier_type = valid_identifiers[valid_identifiers['author_identifier_type'].apply(lambda x: is_cell_empty(x.strip() if isinstance(x, str) else x))] if not missing_identifier_type.empty: - errors.append("author_identifier_type is required when author_identifier is provided for the following rows:") - 
errors.extend(missing_identifier_type.index.tolist()) + missing_rows = missing_identifier_type.index.tolist() + errors.append(f"author_identifier_type is required when author_identifier is provided for the following: rows [{', '.join(map(str, missing_rows))}]") - invalid_identifier_types = valid_identifiers[~valid_identifiers['author_identifier_type'].isin(valid_identifier_types)] + invalid_identifier_types = valid_identifiers[~valid_identifiers['author_identifier_type'].apply(lambda x: x.strip() if isinstance(x, str) else x).isin(valid_identifier_types)] if not invalid_identifier_types.empty: - errors.append(f"author_identifier_type is invalid. Must be one of: {', '.join(valid_identifier_types)}. Invalid types found in rows:") - errors.append(invalid_identifier_types.index.tolist()) + invalid_rows = invalid_identifier_types.index.tolist() + errors.append(f"author_identifier_type is invalid. Must be one of: {', '.join(valid_identifier_types)}. Invalid types found in: rows [{', '.join(map(str, invalid_rows))}]") + # make sure the author_identifier is a valid URL - invalid_url = valid_identifiers[~valid_identifiers['author_identifier'].apply(is_url)] + invalid_url = valid_identifiers[~valid_identifiers['author_identifier'].apply(lambda x: is_url(str(x).strip()))] if not invalid_url.empty: - errors.append("Invalid URLs found in the following rows:") - errors.append(invalid_url.index.tolist()) + invalid_url_rows = invalid_url.index.tolist() + errors.append(f"Invalid author identifier URLs found in the following: rows [{', '.join(map(str, invalid_url_rows))}]") + return errors def validate_sample_type(sample_df): errors = [] @@ -357,9 +413,14 @@ def validate_sample_type(sample_df): "Termite Mound", "Vegetation", "Water", "Pisolite", "Hardpan soil", "Thin Section", "Polished Block", "Polished Round" ] - invalid_types = sample_df[~sample_df['sample_type'].isin(sample_types)] + # Filter out empty or null values + valid_samples = sample_df[sample_df['sample_type'].notna() & 
(sample_df['sample_type'] != '')] + + invalid_types = valid_samples[~valid_samples['sample_type'].isin(sample_types)] if not invalid_types.empty: - errors.append(f"Invalid sample types found: {invalid_types['sample_type'].unique().tolist()}. Must be one of: {', '.join(sample_types)}") + invalid_type_list = invalid_types['sample_type'].unique().tolist() + errors.append(f"Invalid sample types found: {invalid_type_list}. Must be one of: {', '.join(sample_types)}") + return errors def validate_user_keywords(user_keywords): @@ -383,6 +444,7 @@ def validate_authors(authors_df): errors.extend(validate_author_identifier(authors_df, valid_identifier_types)) return errors + def validate_samples(samples_df, related_resources_df, authors_df, funding_df): errors = [] samples_columns_to_check = ['sample_number', 'description', 'user_keywords', 'sample_type', 'author_emails'] diff --git a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/batch/create_batch_dataset.html b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/batch/create_batch_dataset.html index 293039cf..ae5eba4b 100644 --- a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/batch/create_batch_dataset.html +++ b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/batch/create_batch_dataset.html @@ -70,7 +70,7 @@

Upload batch sample metadata

- + Download Template
diff --git a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/home/snippets/about.html b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/home/snippets/about.html index 2013ca97..72dfcb21 100644 --- a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/home/snippets/about.html +++ b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/home/snippets/about.html @@ -1,8 +1,24 @@ {% trans %} -

AuScope focuses on delivering data, services, and tools to support future research in the Australian geoscience research community. As a component of the “Data Lens” of the AuScope Downward-Looking Telescope, the AuScope Discovery Portal harvests metadata from affiliated data catalogues to support more comprehensive access. Over time, it has become apparent that the data repositories, and more specifically sample data repositories, offered by the AuScope partners and universities need to be improved for curating sample data from the AuScope projects.

-

AuScope is committed to providing a trusted digital repository for its communities (e.g., NCRIS-funded data projects and Australian Geoscience research communities) to curate and publish their sample data following the FAIR Data Guiding Principles. Sample metadata and links to related datasets will be available openly through the repository with appropriate attributions to promote open science. The repository is essential for geoscience research innovation in support of the AuScope 5-Year Investment Plan and Australian Academy of Science Decadal plan for Australian Geoscience: Our Planet, Australia's Future.

+

The AuScope Virtual Research Environment (AVRE) is an AuScope program that focuses on delivering data, services, and tools to support the future research of the Australian geoscience research community. AVRE provides a rich ecosystem of Findable, Accessible, Interoperable and Reusable (FAIR) data and tools to a diverse range of Australian research organisations, government geological surveys and the international community. The AVRE program has contributed to the development of, and spearheaded the adoption of, the International Generic Sample Number (IGSN) system in Australia in collaboration with the IGSN e.V. Organisation, DataCite, and the Australian Research Data Commons (ARDC).

+

+The AuScope Sample Repository aims to offer a digital repository for the AuScope community to register persistent identifiers for specimen data and publish specimen metadata following the + FAIR Data Guiding Principles. +The AuScope communities comprise NCRIS-funded data projects and Australian geoscience research communities. The AuScope Sample Repository is operated as a self-service facility for the storage, dissemination, and publication of metadata from AuScope-funded projects and instruments. +

+{% endtrans %} -

This work is supported by AuScope and funded by the Australian Government through the National Collaborative Research Infrastructure Strategy, NCRIS.

+ \ No newline at end of file diff --git a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/home/snippets/instructions.html b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/home/snippets/instructions.html index c5868a91..c2c81a67 100644 --- a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/home/snippets/instructions.html +++ b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/templates/home/snippets/instructions.html @@ -17,7 +17,7 @@

{{ _("Quick Start") }}

  • Start Batch Upload (register multiple samples by uploading a - spreadsheet + spreadsheet )
  • @@ -34,7 +34,7 @@

    {{ _("Quick Start") }}

  • Start Batch Upload (register multiple samples by uploading a - spreadsheet + spreadsheet )
  • diff --git a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/views.py b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/views.py index ad73f81b..8071d132 100644 --- a/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/views.py +++ b/ckan/src/ckanext-igsn-theme/ckanext/igsn_theme/views.py @@ -85,6 +85,11 @@ def process_excel(self, uploaded_file, org_id): all_errors.extend(validate_related_resources(related_resources_df)) all_errors.extend(validate_parent_samples(samples_df)) + + + samples_data, errors = prepare_samples_data(samples_df, authors_df, related_resources_df, funding_df, org_id) + all_errors.extend(errors) + if all_errors: error_list = "\n".join(f"Error {i+1}. {error}. " for i, error in enumerate(all_errors)) # format the error list to be displayed in human readable format @@ -92,9 +97,6 @@ def process_excel(self, uploaded_file, org_id): raise ValueError(f"""The following errors were found: {formatted_errors}""") - samples_data = prepare_samples_data(samples_df, authors_df, related_resources_df, funding_df, org_id) - - return_value = { "samples": samples_data, "authors": authors_df.to_dict("records"), diff --git a/ckan/src/shared/public/base/files/auscope-sample-template-v3.xlsx b/ckan/src/shared/public/base/files/auscope-sample-template-v3.xlsx new file mode 100644 index 00000000..2d876774 Binary files /dev/null and b/ckan/src/shared/public/base/files/auscope-sample-template-v3.xlsx differ diff --git a/ckan/src/shared/public/base/files/auscope-sample-template.xlsx b/ckan/src/shared/public/base/files/auscope-sample-template.xlsx deleted file mode 100644 index ff7eeb88..00000000 Binary files a/ckan/src/shared/public/base/files/auscope-sample-template.xlsx and /dev/null differ