diff --git a/schema/Makefile b/schema/Makefile
index 0f03aa524..cc646af77 100644
--- a/schema/Makefile
+++ b/schema/Makefile
@@ -85,4 +85,4 @@ validate-configs:
 
 .PHONY: validate-configs-with-network
 validate-configs-with-network:
-	python $(VALIDATE_CONFIGS) --network-validation
+	python $(VALIDATE_CONFIGS) --network-validation --verbose
diff --git a/schema/ingestion_config/v1.0.0/ingestion_config_models_extended.py b/schema/ingestion_config/v1.0.0/ingestion_config_models_extended.py
index 9c048a003..fdca228f8 100644
--- a/schema/ingestion_config/v1.0.0/ingestion_config_models_extended.py
+++ b/schema/ingestion_config/v1.0.0/ingestion_config_models_extended.py
@@ -17,7 +17,6 @@
     AnnotationEntity,
     AnnotationObject,
     AnnotationSource,
-    AnnotationSourceFile,
     Author,
     CameraDetails,
     CellComponent,
@@ -31,7 +30,6 @@
     DatasetKeyPhotoSource,
     DatasetSource,
     DateStamp,
-    DefaultSource,
     Deposition,
     DepositionEntity,
     DepositionKeyPhotoEntity,
@@ -51,6 +49,7 @@
     RunEntity,
     RunSource,
     SampleTypeEnum,
+    StandardSource,
     TiltRange,
     TiltSeries,
     TiltSeriesEntity,
@@ -77,7 +76,9 @@
 CAMERA_MANUFACTURER_TO_MODEL = {
     ("FEI", "TFS"): ["FALCON IV", "Falcon4i"],
     ("Gatan", "GATAN"): ["K2", "K2 SUMMIT", "K3", "K3 BIOQUANTUM", "UltraCam", "UltraScan"],
+    ("simulated"): ["simulated"],
 }
+SOURCE_FILTER_FIELDS = {"parent_filters", "exclude"}
 
 # Flag to determine if network validation should be run (set by provided Container arg)
 running_network_validation = False
@@ -302,7 +303,7 @@ async def validate_uniprot_id(id: str) -> Tuple[List[str], bool]:
             return [], False
         data = await response.json()
         name = data["proteinDescription"]["recommendedName"]["fullName"]["value"]
-        return name, True
+        return [name], True
 
 
 def validate_id_name_object(
@@ -431,6 +432,7 @@ class ExtendedValidationDepositionKeyPhotoSource(DepositionKeyPhotoSource):
 async def lookup_doi(doi: str) -> Tuple[str, bool]:
     doi = doi.replace("doi:", "")
     url = f"https://api.crossref.org/works/{doi}/agency"
+    logger.debug("Checking DOI %s at %s", doi, url)
     async with aiohttp.ClientSession() as session, session.head(url) as response:
         return doi, response.status == 200
 
@@ -438,6 +440,7 @@ async def lookup_doi(doi: str) -> Tuple[str, bool]:
 @alru_cache
 async def lookup_empiar(empiar_id: str) -> Tuple[str, bool]:
     url = f"https://www.ebi.ac.uk/empiar/api/entry/{empiar_id}/"
+    logger.debug("Checking EMPIAR ID %s at %s", empiar_id, url)
     async with aiohttp.ClientSession() as session, session.head(url) as response:
         return empiar_id, response.status == 200
 
@@ -445,6 +448,7 @@ async def lookup_empiar(empiar_id: str) -> Tuple[str, bool]:
 @alru_cache
 async def lookup_emdb(emdb_id: str) -> Tuple[str, bool]:
     url = f"https://www.ebi.ac.uk/emdb/api/entry/{emdb_id}"
+    logger.debug("Checking EMDB ID %s at %s", emdb_id, url)
     async with aiohttp.ClientSession() as session, session.head(url) as response:
         return emdb_id, response.status == 200
 
@@ -453,7 +457,8 @@ async def lookup_emdb(emdb_id: str) -> Tuple[str, bool]:
 @alru_cache
 async def lookup_pdb(pdb_id: str) -> Tuple[str, bool]:
     pdb_id = pdb_id.replace("PDB-", "")
     url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
-    async with aiohttp.ClientSession() as session, session.head(url) as response:
+    logger.debug("Checking PDB ID %s at %s", pdb_id, url)
+    async with aiohttp.ClientSession() as session, session.get(url) as response:
         return pdb_id, response.status == 200
 
@@ -496,39 +501,29 @@ def validate_publications(publications: Optional[str]) -> Optional[str]:
 # ==============================================================================
 # Sources Validation
 # ==============================================================================
-def validate_sources(source_list: List[DefaultSource] | List[VoxelSpacingSource]) -> None:
+def validate_sources(source_list: List[StandardSource] | List[VoxelSpacingSource]) -> None:
     total_errors = []
-    # For verifying that all source entries each only have one finder type
-    multiple_finders_in_each_source_entries_errors = []
-    standalone_finders_in_each_source_entries_errors = []
-
-    for i, source_element in enumerate(source_list):
+    for index, source_element in enumerate(source_list):
         finders_in_source_entry = []
-        has_parent_filters = False
         for finder in source_element.model_fields:
-            # If the finder is not None, add it to the list of finders in the source entry
-            if getattr(source_element, finder) is None:
-                continue
-            if finder == "exclude":
-                continue
-            if finder == "parent_filters":
-                has_parent_filters = True
-            else:
+            # If the finder is not None and an actual finder, add it to the list of finders in the source entry
+            if getattr(source_element, finder) is not None and finder not in SOURCE_FILTER_FIELDS:
                 finders_in_source_entry.append(finder)
 
         # If there are more than one finder in the source entry, add the source entry index and the finders to the error set
         if len(finders_in_source_entry) > 1:
-            multiple_finders_in_each_source_entries_errors.append((i, finders_in_source_entry))
-        elif len(finders_in_source_entry) == 0 and has_parent_filters:
-            # means there's only a parent_filters entry
-            standalone_finders_in_each_source_entries_errors.append(i)
-
-    for i, finders in multiple_finders_in_each_source_entries_errors:
-        total_errors.append(
-            ValueError(f"Source entry {i} cannot have multiple finders (split into multiple entries): {finders}"),
-        )
-    for index in standalone_finders_in_each_source_entries_errors:
-        total_errors.append(ValueError(f"Source entry {index} cannot only have a parent_filters entry"))
+            total_errors.append(
+                ValueError(
+                    f"Source entry {index} cannot have multiple finders (split into multiple entries): {finders_in_source_entry}",
+                ),
+            )
+        if len(finders_in_source_entry) == 0:
+            for source_child in SOURCE_FILTER_FIELDS:
+                if getattr(source_element, source_child) is None:
+                    continue
+                total_errors.append(
+                    ValueError(f"Source entry {index} cannot have a {source_child} entry without a finder"),
+                )
 
     if total_errors:
         raise ValueError(total_errors)
@@ -620,49 +615,44 @@ def valid_sources(cls: Self, source_list: List[AnnotationSource]) -> List[Annota
             else:
                 used_shapes.add(shape)
 
-        # For verifying that all source entries each only have one shape entry
-        multiple_shapes_in_each_source_entries_errors = []
-
+        # For verifying that all source entries each only have one shape entry and that there is only one of glob_string and glob_strings
        for i, source_element in enumerate(source_list):
             shapes_in_source_entry = []
             for shape in source_element.model_fields:
-                if getattr(source_element, shape) is None:
-                    continue
-                if shape in {"parent_filters", "exclude"}:
+                # If the shape is not None and an actual shape, add it to the list of shapes in the source entry
+                shape_entry = getattr(source_element, shape)
+                if shape_entry is None or shape in SOURCE_FILTER_FIELDS:
                     continue
-                # If the shape is not None, add it to the list of shapes in the source entry
                 shapes_in_source_entry.append(shape)
 
+                has_glob_string = getattr(shape_entry, "glob_string", None) is not None
+                has_glob_strings = getattr(shape_entry, "glob_strings", None) is not None and shape_entry.glob_strings
+                if has_glob_string and has_glob_strings:
+                    total_errors.append(
+                        ValueError(
+                            f"Source entry {i} shape {shape} must have exactly one of glob_string or glob_strings",
+                        ),
+                    )
+
             # If there are more than one shape in the source entry, add the source entry index and the shapes to the error set
             if len(shapes_in_source_entry) > 1:
-                multiple_shapes_in_each_source_entries_errors.append((i, shapes_in_source_entry))
-
-        # For verifying that all source entries have exactly one of glob_string and glob_strings
-        multiple_glob_strings_errors = []
-
-        for i, source_element in enumerate(source_list):
-            for shape in source_element.model_fields:
-                source_entry: AnnotationSourceFile | None = getattr(source_element, shape)
-                if source_entry is None:
-                    continue
-
-                has_glob_string = getattr(source_entry, "glob_string", None) is not None
-                has_glob_strings = getattr(source_entry, "glob_strings", None) is not None and source_entry.glob_strings
-                if has_glob_string and has_glob_strings:
-                    multiple_glob_strings_errors.append((i, shape))
+                total_errors.append(
+                    ValueError(
+                        f"Source entry {i} cannot have multiple shapes (split into multiple entries): {shapes_in_source_entry}",
+                    ),
+                )
+            if len(shapes_in_source_entry) == 0:
+                for source_child in SOURCE_FILTER_FIELDS:
+                    if getattr(source_element, source_child) is None:
+                        continue
+                    total_errors.append(
+                        ValueError(f"Source entry {i} cannot have a {source_child} entry without a shape"),
+                    )
 
         if shapes_used_multiple_times_errors:
             total_errors.append(
                 ValueError(f"Annotation cannot have multiple same-shape sources: {shapes_used_multiple_times_errors}"),
             )
-        for i, shapes in multiple_shapes_in_each_source_entries_errors:
-            total_errors.append(
-                ValueError(f"Source entry {i} cannot have multiple shapes (split into multiple entries): {shapes}"),
-            )
-        for i, shape in multiple_glob_strings_errors:
-            total_errors.append(
-                ValueError(f"Source entry {i} shape {shape} must have exactly one of glob_string or glob_strings"),
-            )
 
         if total_errors:
             raise ValueError(total_errors)
diff --git a/schema/ingestion_config/v1.0.0/ingestion_config_validate.py b/schema/ingestion_config/v1.0.0/ingestion_config_validate.py
index 2ebf91e97..65a4807c7 100644
--- a/schema/ingestion_config/v1.0.0/ingestion_config_validate.py
+++ b/schema/ingestion_config/v1.0.0/ingestion_config_validate.py
@@ -12,6 +12,7 @@
 from ingestion_config_models_extended import ExtendedValidationContainer
 from pydantic import ValidationError
 
+ORIGINAL_DIR = os.getcwd()
 os.chdir(os.path.dirname(os.path.abspath(__file__)))
 ROOT_DIR = "../../../"
 sys.path.append(ROOT_DIR)  # To import the helper function from common.py
@@ -151,6 +152,10 @@ def main(
         logger.error("Provide input files or --input-dir, not both.")
         exit(1)
 
+    # if input files are relative paths, make them absolute
+    if input_files:
+        input_files = [os.path.join(ORIGINAL_DIR, file) if not os.path.isabs(file) else file for file in input_files]
+
     files_to_validate = get_yaml_config_files(
         input_files=input_files,
         include_glob=include_glob,
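
For context, a minimal sketch of the cached network-lookup pattern that the new `logger.debug` lines slot into, assuming the packages the module already imports (aiohttp and async-lru); the logger setup and the placeholder ID here are illustrative, not taken from the repo:

```python
# Illustrative sketch, not the module itself: one cached lookup helper plus the
# debug logging added in this patch. Assumes aiohttp and async-lru are installed.
import asyncio
import logging
from typing import Tuple

import aiohttp
from async_lru import alru_cache

logger = logging.getLogger(__name__)


@alru_cache
async def lookup_pdb(pdb_id: str) -> Tuple[str, bool]:
    pdb_id = pdb_id.replace("PDB-", "")
    url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
    logger.debug("Checking PDB ID %s at %s", pdb_id, url)
    # GET rather than HEAD, mirroring the change above (HEAD support can differ per endpoint).
    async with aiohttp.ClientSession() as session, session.get(url) as response:
        return pdb_id, response.status == 200


async def main() -> None:
    print(await lookup_pdb("PDB-XXXX"))  # placeholder ID; real configs supply actual accessions
    print(await lookup_pdb("PDB-XXXX"))  # second call is answered from the alru_cache


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    asyncio.run(main())
```

Because the helpers are `alru_cache`d, a config that references the same accession many times triggers at most one request (and one debug line) per unique ID.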
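The consolidated `validate_sources` check can be exercised in isolation. Below is a small self-contained sketch using an illustrative stand-in for `StandardSource`; the real pydantic models and their finder field names live in the generated schema and may differ:

```python
# Standalone sketch of the consolidated finder check (illustrative stand-in types,
# not the portal's pydantic models). An entry is invalid if it carries more than
# one finder, or a filter-only field without any finder at all.
from dataclasses import dataclass, fields
from typing import List, Optional

SOURCE_FILTER_FIELDS = {"parent_filters", "exclude"}


@dataclass
class FakeSource:  # hypothetical stand-in for StandardSource
    source_glob: Optional[dict] = None
    literal: Optional[dict] = None
    parent_filters: Optional[dict] = None
    exclude: Optional[list] = None

    @property
    def model_fields(self) -> List[str]:  # mimic pydantic's field listing
        return [f.name for f in fields(self)]


def check(source_list: List[FakeSource]) -> List[str]:
    errors = []
    for index, source_element in enumerate(source_list):
        # Everything that is set and not a filter field counts as a finder.
        finders = [
            name
            for name in source_element.model_fields
            if getattr(source_element, name) is not None and name not in SOURCE_FILTER_FIELDS
        ]
        if len(finders) > 1:
            errors.append(f"Source entry {index} cannot have multiple finders: {finders}")
        if not finders:
            for child in SOURCE_FILTER_FIELDS:
                if getattr(source_element, child) is not None:
                    errors.append(f"Source entry {index} cannot have a {child} entry without a finder")
    return errors


print(check([FakeSource(source_glob={}, literal={})]))          # two finders -> error
print(check([FakeSource(parent_filters={"dataset": ["10001"]})]))  # filter without finder -> error
print(check([FakeSource(source_glob={"list_glob": "*.mrc"})]))     # one finder -> []
```

The same `SOURCE_FILTER_FIELDS` constant drives both this check and the per-shape check in `valid_sources`, which additionally rejects a shape entry that sets both `glob_string` and `glob_strings`.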
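The `ORIGINAL_DIR` capture matters because the validator immediately chdirs into its own schema directory, which would otherwise silently re-root user-supplied relative paths. A sketch of the resulting path handling, with made-up paths:

```python
# Illustrative sketch of the relative-path handling added to main(); the example
# paths are made up. Run as a script so __file__ is defined.
import os

ORIGINAL_DIR = os.getcwd()  # wherever the user invoked the validator, e.g. /home/user/my-configs
os.chdir(os.path.dirname(os.path.abspath(__file__)))  # script now works from its own directory

input_files = ["dataset1.yaml", "/data/configs/dataset2.yaml"]
# Relative paths are re-anchored to the original working directory; absolute paths pass through.
input_files = [
    os.path.join(ORIGINAL_DIR, file) if not os.path.isabs(file) else file
    for file in input_files
]
print(input_files)  # e.g. ['/home/user/my-configs/dataset1.yaml', '/data/configs/dataset2.yaml']
```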