Skip to content

Commit

Permalink
ingestion config fixes (#220)
Browse files Browse the repository at this point in the history
* fixes

* restore method type - ground truth status validator

* fixes

* fixes

* more fixes

* fixes
  • Loading branch information
daniel-ji authored Aug 26, 2024
1 parent cff1fbe commit 6f3f7e7
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 61 deletions.
2 changes: 1 addition & 1 deletion schema/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -85,4 +85,4 @@ validate-configs:

.PHONY: validate-configs-with-network
validate-configs-with-network:
python $(VALIDATE_CONFIGS) --network-validation
python $(VALIDATE_CONFIGS) --network-validation --verbose
110 changes: 50 additions & 60 deletions schema/ingestion_config/v1.0.0/ingestion_config_models_extended.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
AnnotationEntity,
AnnotationObject,
AnnotationSource,
AnnotationSourceFile,
Author,
CameraDetails,
CellComponent,
Expand All @@ -31,7 +30,6 @@
DatasetKeyPhotoSource,
DatasetSource,
DateStamp,
DefaultSource,
Deposition,
DepositionEntity,
DepositionKeyPhotoEntity,
Expand All @@ -51,6 +49,7 @@
RunEntity,
RunSource,
SampleTypeEnum,
StandardSource,
TiltRange,
TiltSeries,
TiltSeriesEntity,
Expand All @@ -77,7 +76,9 @@
CAMERA_MANUFACTURER_TO_MODEL = {
("FEI", "TFS"): ["FALCON IV", "Falcon4i"],
("Gatan", "GATAN"): ["K2", "K2 SUMMIT", "K3", "K3 BIOQUANTUM", "UltraCam", "UltraScan"],
("simulated"): ["simulated"],
}
SOURCE_FILTER_FIELDS = {"parent_filters", "exclude"}

# Flag to determine if network validation should be run (set by provided Container arg)
running_network_validation = False
Expand Down Expand Up @@ -302,7 +303,7 @@ async def validate_uniprot_id(id: str) -> Tuple[List[str], bool]:
return [], False
data = await response.json()
name = data["proteinDescription"]["recommendedName"]["fullName"]["value"]
return name, True
return [name], True


def validate_id_name_object(
Expand Down Expand Up @@ -431,20 +432,23 @@ class ExtendedValidationDepositionKeyPhotoSource(DepositionKeyPhotoSource):
async def lookup_doi(doi: str) -> Tuple[str, bool]:
doi = doi.replace("doi:", "")
url = f"https://api.crossref.org/works/{doi}/agency"
logger.debug("Checking DOI %s at %s", doi, url)
async with aiohttp.ClientSession() as session, session.head(url) as response:
return doi, response.status == 200


@alru_cache
async def lookup_empiar(empiar_id: str) -> Tuple[str, bool]:
url = f"https://www.ebi.ac.uk/empiar/api/entry/{empiar_id}/"
logger.debug("Checking EMPIAR ID %s at %s", empiar_id, url)
async with aiohttp.ClientSession() as session, session.head(url) as response:
return empiar_id, response.status == 200


@alru_cache
async def lookup_emdb(emdb_id: str) -> Tuple[str, bool]:
url = f"https://www.ebi.ac.uk/emdb/api/entry/{emdb_id}"
logger.debug("Checking EMDB ID %s at %s", emdb_id, url)
async with aiohttp.ClientSession() as session, session.head(url) as response:
return emdb_id, response.status == 200

Expand All @@ -453,7 +457,8 @@ async def lookup_emdb(emdb_id: str) -> Tuple[str, bool]:
async def lookup_pdb(pdb_id: str) -> Tuple[str, bool]:
pdb_id = pdb_id.replace("PDB-", "")
url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}"
async with aiohttp.ClientSession() as session, session.head(url) as response:
logger.debug("Checking PDB ID %s at %s", pdb_id, url)
async with aiohttp.ClientSession() as session, session.get(url) as response:
return pdb_id, response.status == 200


Expand Down Expand Up @@ -496,39 +501,29 @@ def validate_publications(publications: Optional[str]) -> Optional[str]:
# ==============================================================================
# Sources Validation
# ==============================================================================
def validate_sources(source_list: List[DefaultSource] | List[VoxelSpacingSource]) -> None:
def validate_sources(source_list: List[StandardSource] | List[VoxelSpacingSource]) -> None:
total_errors = []

# For verifying that all source entries each only have one finder type
multiple_finders_in_each_source_entries_errors = []
standalone_finders_in_each_source_entries_errors = []

for i, source_element in enumerate(source_list):
for index, source_element in enumerate(source_list):
finders_in_source_entry = []
has_parent_filters = False
for finder in source_element.model_fields:
# If the finder is not None, add it to the list of finders in the source entry
if getattr(source_element, finder) is None:
continue
if finder == "exclude":
continue
if finder == "parent_filters":
has_parent_filters = True
else:
# If the finder is not None and an actual finder, add it to the list of finders in the source entry
if getattr(source_element, finder) is not None and finder not in SOURCE_FILTER_FIELDS:
finders_in_source_entry.append(finder)
# If there are more than one finder in the source entry, add the source entry index and the finders to the error set
if len(finders_in_source_entry) > 1:
multiple_finders_in_each_source_entries_errors.append((i, finders_in_source_entry))
elif len(finders_in_source_entry) == 0 and has_parent_filters:
# means there's only a parent_filters entry
standalone_finders_in_each_source_entries_errors.append(i)

for i, finders in multiple_finders_in_each_source_entries_errors:
total_errors.append(
ValueError(f"Source entry {i} cannot have multiple finders (split into multiple entries): {finders}"),
)
for index in standalone_finders_in_each_source_entries_errors:
total_errors.append(ValueError(f"Source entry {index} cannot only have a parent_filters entry"))
total_errors.append(
ValueError(
f"Source entry {index} cannot have multiple finders (split into multiple entries): {finders_in_source_entry}",
),
)
if len(finders_in_source_entry) == 0:
for source_child in SOURCE_FILTER_FIELDS:
if getattr(source_element, source_child) is None:
continue
total_errors.append(
ValueError(f"Source entry {index} cannot have a {source_child} entry without a finder"),
)

if total_errors:
raise ValueError(total_errors)
Expand Down Expand Up @@ -620,49 +615,44 @@ def valid_sources(cls: Self, source_list: List[AnnotationSource]) -> List[Annota
else:
used_shapes.add(shape)

# For verifying that all source entries each only have one shape entry
multiple_shapes_in_each_source_entries_errors = []

# For verifying that all source entries each only have one shape entry and that there is only one of glob_string and glob_strings
for i, source_element in enumerate(source_list):
shapes_in_source_entry = []
for shape in source_element.model_fields:
if getattr(source_element, shape) is None:
continue
if shape in {"parent_filters", "exclude"}:
# If the shape is not None and an actual shape, add it to the list of shapes in the source entry
shape_entry = getattr(source_element, shape)
if shape_entry is None or shape in SOURCE_FILTER_FIELDS:
continue

# If the shape is not None, add it to the list of shapes in the source entry
shapes_in_source_entry.append(shape)
has_glob_string = getattr(shape_entry, "glob_string", None) is not None
has_glob_strings = getattr(shape_entry, "glob_strings", None) is not None and shape_entry.glob_strings
if has_glob_string and has_glob_strings:
total_errors.append(
ValueError(
f"Source entry {i} shape {shape} must have exactly one of glob_string or glob_strings",
),
)

# If there are more than one shape in the source entry, add the source entry index and the shapes to the error set
if len(shapes_in_source_entry) > 1:
multiple_shapes_in_each_source_entries_errors.append((i, shapes_in_source_entry))

# For verifying that all source entries have exactly one of glob_string and glob_strings
multiple_glob_strings_errors = []

for i, source_element in enumerate(source_list):
for shape in source_element.model_fields:
source_entry: AnnotationSourceFile | None = getattr(source_element, shape)
if source_entry is None:
continue

has_glob_string = getattr(source_entry, "glob_string", None) is not None
has_glob_strings = getattr(source_entry, "glob_strings", None) is not None and source_entry.glob_strings
if has_glob_string and has_glob_strings:
multiple_glob_strings_errors.append((i, shape))
total_errors.append(
ValueError(
f"Source entry {i} cannot have multiple shapes (split into multiple entries): {shapes_in_source_entry}",
),
)
if len(shapes_in_source_entry) == 0:
for source_child in SOURCE_FILTER_FIELDS:
if getattr(source_element, source_child) is None:
continue
total_errors.append(
ValueError(f"Source entry {i} cannot have a {source_child} entry without a shape"),
)

if shapes_used_multiple_times_errors:
total_errors.append(
ValueError(f"Annotation cannot have multiple same-shape sources: {shapes_used_multiple_times_errors}"),
)
for i, shapes in multiple_shapes_in_each_source_entries_errors:
total_errors.append(
ValueError(f"Source entry {i} cannot have multiple shapes (split into multiple entries): {shapes}"),
)
for i, shape in multiple_glob_strings_errors:
total_errors.append(
ValueError(f"Source entry {i} shape {shape} must have exactly one of glob_string or glob_strings"),
)

if total_errors:
raise ValueError(total_errors)
Expand Down
5 changes: 5 additions & 0 deletions schema/ingestion_config/v1.0.0/ingestion_config_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ingestion_config_models_extended import ExtendedValidationContainer
from pydantic import ValidationError

ORIGINAL_DIR = os.getcwd()
os.chdir(os.path.dirname(os.path.abspath(__file__)))
ROOT_DIR = "../../../"
sys.path.append(ROOT_DIR) # To import the helper function from common.py
Expand Down Expand Up @@ -151,6 +152,10 @@ def main(
logger.error("Provide input files or --input-dir, not both.")
exit(1)

# if input files are relative paths, make them absolute
if input_files:
input_files = [os.path.join(ORIGINAL_DIR, file) if not os.path.isabs(file) else file for file in input_files]

files_to_validate = get_yaml_config_files(
input_files=input_files,
include_glob=include_glob,
Expand Down

0 comments on commit 6f3f7e7

Please sign in to comment.