diff --git a/ingestion_tools/dataset_configs/template.yaml b/ingestion_tools/dataset_configs/template.yaml index 45960821d..c412f8e34 100644 --- a/ingestion_tools/dataset_configs/template.yaml +++ b/ingestion_tools/dataset_configs/template.yaml @@ -8,7 +8,7 @@ annotations: OPTIONAL annotation_method: REQUIRED, STRING annotation_object: REQUIRED description: OPTIONAL, STRING - id: REQUIRED, STRING (GO_ID) + id: REQUIRED, STRING (GO_ID / UNIPROT_ID) name: REQUIRED, STRING state: OPTIONAL, STRING annotation_publications: OPTIONAL, STRING (DOI / EMPIAR / EMDB / PDB IDs) diff --git a/schema/api/v1.0.0/codegen/api_models_materialized.yaml b/schema/api/v1.0.0/codegen/api_models_materialized.yaml index 80f7f904f..378051d4e 100644 --- a/schema/api/v1.0.0/codegen/api_models_materialized.yaml +++ b/schema/api/v1.0.0/codegen/api_models_materialized.yaml @@ -295,6 +295,12 @@ types: from_schema: cdp-api-models base: str pattern: ^GO:[0-9]{7}$ + UNIPROT_ID: + name: UNIPROT_ID + description: A UniProt identifier + from_schema: cdp-api-models + base: str + pattern: ^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$ WORMBASE_ID: name: WORMBASE_ID description: A WormBase identifier @@ -336,19 +342,19 @@ types: description: A Protein Data Bank identifier from_schema: cdp-api-models base: str - pattern: ^pdb[0-9a-zA-Z]{4,8}$ + pattern: ^PDB-[0-9a-zA-Z]{4,8}$ EMPIAR_EMDB_PDB_LIST: name: EMPIAR_EMDB_PDB_LIST description: A list of EMPIAR, EMDB, and PDB identifiers from_schema: cdp-api-models base: str - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$ EMPIAR_EMDB_DOI_PDB_LIST: name: EMPIAR_EMDB_DOI_PDB_LIST description: A list of EMPIAR, EMDB, DOI, and PDB identifiers from_schema: cdp-api-models base: str - pattern: 
^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$ enums: tomogram_type: name: tomogram_type @@ -489,6 +495,9 @@ enums: JEOL: text: JEOL description: JEOL Ltd. + SIMULATED: + text: SIMULATED + description: Simulated data fiducial_alignment_status_enum: name: fiducial_alignment_status_enum description: Fiducial Alignment method @@ -1118,8 +1127,8 @@ classes: inlined_as_list: true annotation_publication: name: annotation_publication - description: List of publication IDs (EMPIAR, EMDB, DOI) that describe this - annotation method. Comma separated. + description: List of publication IDs (EMPIAR, EMDB, DOI, PDB) that describe + this annotation method. Comma separated. from_schema: cdp-api-models exact_mappings: - cdp-common:annotation_publications @@ -1130,7 +1139,7 @@ classes: range: EMPIAR_EMDB_DOI_PDB_LIST inlined: true inlined_as_list: true - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$ annotation_method: name: annotation_method description: Describe how the annotation is made (e.g. 
Manual, crYoLO, Positive @@ -1173,11 +1182,13 @@ classes: owner: Annotation domain_of: - Annotation - range: GO_ID required: true inlined: true inlined_as_list: true - pattern: ^GO:[0-9]{7}$ + pattern: (^GO:[0-9]{7}$)|(^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$) + any_of: + - range: GO_ID + - range: UNIPROT_ID object_name: name: object_name description: Name of the object being annotated (e.g. ribosome, nuclear pore @@ -2055,7 +2066,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. 
@@ -2487,7 +2498,7 @@ classes: required: true inlined: true inlined_as_list: true - pattern: (^FEI$)|(^TFS$)|(^JEOL$) + pattern: (^FEI$)|(^TFS$)|(^JEOL$)|(^SIMULATED$) microscope_model: name: microscope_model description: Microscope model name @@ -3682,7 +3693,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. diff --git a/schema/api/v2.0.0/codegen/api_models_materialized.yaml b/schema/api/v2.0.0/codegen/api_models_materialized.yaml index 634848f5e..2461a9b70 100644 --- a/schema/api/v2.0.0/codegen/api_models_materialized.yaml +++ b/schema/api/v2.0.0/codegen/api_models_materialized.yaml @@ -295,6 +295,12 @@ types: from_schema: cdp-api base: string pattern: ^GO:[0-9]{7}$ + UNIPROT_ID: + name: UNIPROT_ID + description: A UniProt identifier + from_schema: cdp-api + base: str + pattern: ^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$ WORMBASE_ID: name: WORMBASE_ID description: A WormBase identifier @@ -336,19 +342,19 @@ types: description: A Protein Data Bank identifier from_schema: cdp-api base: string - pattern: ^pdb[0-9a-zA-Z]{4,8}$ + pattern: ^PDB-[0-9a-zA-Z]{4,8}$ EMPIAR_EMDB_PDB_LIST: name: EMPIAR_EMDB_PDB_LIST description: A list of EMPIAR, EMDB, and PDB identifiers from_schema: cdp-api base: string - pattern: 
^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$ EMPIAR_EMDB_DOI_PDB_LIST: name: EMPIAR_EMDB_DOI_PDB_LIST description: A list of EMPIAR, EMDB, DOI, and PDB identifiers from_schema: cdp-api base: string - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$ enums: annotation_file_source_enum: name: annotation_file_source_enum @@ -492,6 +498,9 @@ enums: JEOL: text: JEOL description: JEOL Ltd. + SIMULATED: + text: SIMULATED + description: Simulated data fiducial_alignment_status_enum: name: fiducial_alignment_status_enum description: Fiducial Alignment method @@ -737,7 +746,7 @@ classes: owner: AuthorEntityMixin domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true primary_author_status: @@ -751,7 +760,7 @@ classes: owner: AuthorEntityMixin domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true APIDateStampedEntityMixin: @@ -1232,7 +1241,7 @@ classes: owner: AnnotationAuthor domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true primary_author_status: @@ -1246,7 +1255,7 @@ classes: owner: AnnotationAuthor domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true AnnotationFile: @@ -1537,8 +1546,8 @@ classes: inlined_as_list: true annotation_publication: name: annotation_publication - description: List of publication IDs (EMPIAR, EMDB, DOI) that describe 
this - annotation method. Comma separated. + description: List of publication IDs (EMPIAR, EMDB, DOI, PDB) that describe + this annotation method. Comma separated. from_schema: cdp-api exact_mappings: - cdp-common:annotation_publications @@ -1549,7 +1558,7 @@ classes: range: EMPIAR_EMDB_DOI_PDB_LIST inlined: true inlined_as_list: true - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$ annotation_method: name: annotation_method description: Describe how the annotation is made (e.g. Manual, crYoLO, Positive @@ -1592,11 +1601,13 @@ classes: owner: Annotation domain_of: - Annotation - range: GO_ID required: true inlined: true inlined_as_list: true - pattern: ^GO:[0-9]{7}$ + pattern: (^GO:[0-9]{7}$)|(^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$) + any_of: + - range: GO_ID + - range: UNIPROT_ID object_name: name: object_name description: Name of the object being annotated (e.g. 
ribosome, nuclear pore @@ -1957,7 +1968,7 @@ classes: owner: DatasetAuthor domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true primary_author_status: @@ -1971,7 +1982,7 @@ classes: owner: DatasetAuthor domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true DatasetFunding: @@ -2458,7 +2469,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. 
@@ -2663,7 +2674,7 @@ classes: owner: DepositionAuthor domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true primary_author_status: @@ -2677,7 +2688,7 @@ classes: owner: DepositionAuthor domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true Deposition: @@ -2861,7 +2872,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. 
@@ -3828,7 +3839,7 @@ classes: required: true inlined: true inlined_as_list: true - pattern: (^FEI$)|(^TFS$)|(^JEOL$) + pattern: (^FEI$)|(^TFS$)|(^JEOL$)|(^SIMULATED$) microscope_model: name: microscope_model description: Microscope model name @@ -4333,7 +4344,7 @@ classes: owner: TomogramAuthor domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true primary_author_status: @@ -4347,7 +4358,7 @@ classes: owner: TomogramAuthor domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true TomogramVoxelSpacing: @@ -5036,7 +5047,7 @@ classes: domain_of: - DateStampedEntityMixin - APIDateStampedEntityMixin - range: string + range: date required: true inlined: true inlined_as_list: true @@ -5051,7 +5062,7 @@ classes: domain_of: - DateStampedEntityMixin - APIDateStampedEntityMixin - range: string + range: date required: true inlined: true inlined_as_list: true @@ -5067,7 +5078,7 @@ classes: domain_of: - DateStampedEntityMixin - APIDateStampedEntityMixin - range: string + range: date required: true inlined: true inlined_as_list: true @@ -5103,7 +5114,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. 
@@ -5212,7 +5223,7 @@ classes: owner: AuthorMixin domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true primary_author_status: @@ -5226,7 +5237,7 @@ classes: owner: AuthorMixin domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true source_file: api/v2.0.0/api_models.yaml diff --git a/schema/core/v1.1.0/codegen/metadata_materialized.yaml b/schema/core/v1.1.0/codegen/metadata_materialized.yaml index 167a50bf9..d65b7f263 100644 --- a/schema/core/v1.1.0/codegen/metadata_materialized.yaml +++ b/schema/core/v1.1.0/codegen/metadata_materialized.yaml @@ -312,6 +312,12 @@ types: from_schema: metadata base: str pattern: ^GO:[0-9]{7}$ + UNIPROT_ID: + name: UNIPROT_ID + description: A UniProt identifier + from_schema: metadata + base: str + pattern: ^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$ WORMBASE_ID: name: WORMBASE_ID description: A WormBase identifier @@ -353,19 +359,19 @@ types: description: A Protein Data Bank identifier from_schema: metadata base: str - pattern: ^pdb[0-9a-zA-Z]{4,8}$ + pattern: ^PDB-[0-9a-zA-Z]{4,8}$ EMPIAR_EMDB_PDB_LIST: name: EMPIAR_EMDB_PDB_LIST description: A list of EMPIAR, EMDB, and PDB identifiers from_schema: metadata base: str - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$ EMPIAR_EMDB_DOI_PDB_LIST: name: EMPIAR_EMDB_DOI_PDB_LIST description: A list of EMPIAR, EMDB, DOI, and PDB identifiers from_schema: metadata base: str - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: 
^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$ enums: annotation_method_type_enum: name: annotation_method_type_enum @@ -495,6 +501,9 @@ enums: JEOL: text: JEOL description: JEOL Ltd. + SIMULATED: + text: SIMULATED + description: Simulated data fiducial_alignment_status_enum: name: fiducial_alignment_status_enum description: Fiducial Alignment method @@ -886,7 +895,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. @@ -1705,7 +1714,8 @@ classes: required: true inlined: true inlined_as_list: true - pattern: (^FEI$)|(^TFS$)|(^JEOL$)|(^[ ]*\{[a-zA-Z0-9_-]+\}[ ]*$) + pattern: (^FEI$)|(^TFS$)|(^JEOL$)|(^SIMULATED$)|(^[ ]*\{[a-zA-Z0-9_-]+\}[ + ]*$) any_of: - description: Name of the microscope manufacturer exact_mappings: @@ -2624,11 +2634,14 @@ classes: - CellStrain - CellComponent - AnnotationObject - range: GO_ID + range: Any required: true inlined: true inlined_as_list: true - pattern: ^GO:[0-9]{7}$ + pattern: (^GO:[0-9]{7}$)|(^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$) + any_of: + - range: GO_ID + - range: UNIPROT_ID name: name: name description: Name of the object being annotated (e.g. 
ribosome, nuclear pore @@ -3272,8 +3285,8 @@ classes: inlined_as_list: true annotation_publications: name: annotation_publications - description: List of publication IDs (EMPIAR, EMDB, DOI) that describe this - annotation method. Comma separated. + description: List of publication IDs (EMPIAR, EMDB, DOI, PDB) that describe + this annotation method. Comma separated. from_schema: metadata exact_mappings: - cdp-common:annotation_publications @@ -3284,7 +3297,7 @@ classes: range: EMPIAR_EMDB_DOI_PDB_LIST inlined: true inlined_as_list: true - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$ annotation_software: name: annotation_software description: Software used for generating this annotation @@ -3510,7 +3523,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. 
diff --git a/schema/core/v1.1.0/codegen/metadata_models.py b/schema/core/v1.1.0/codegen/metadata_models.py index eecce1ec8..6797660cb 100644 --- a/schema/core/v1.1.0/codegen/metadata_models.py +++ b/schema/core/v1.1.0/codegen/metadata_models.py @@ -98,14 +98,14 @@ def __contains__(self, key: str) -> bool: "description": "A list of EMPIAR, " "EMDB, DOI, and PDB " "identifiers", "from_schema": "metadata", "name": "EMPIAR_EMDB_DOI_PDB_LIST", - "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$", + "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$", }, "EMPIAR_EMDB_PDB_LIST": { "base": "str", "description": "A list of EMPIAR, EMDB, " "and PDB identifiers", "from_schema": "metadata", "name": "EMPIAR_EMDB_PDB_LIST", - "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$", + "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$", }, "EMPIAR_ID": { "base": "str", @@ -154,7 +154,7 @@ def __contains__(self, key: str) -> bool: "description": "A Protein Data Bank identifier", "from_schema": "metadata", "name": "PDB_ID", - "pattern": "^pdb[0-9a-zA-Z]{4,8}$", + "pattern": "^PDB-[0-9a-zA-Z]{4,8}$", }, "StringFormattedString": { "base": "str", @@ -163,6 +163,13 @@ def __contains__(self, key: str) -> bool: "name": "StringFormattedString", "pattern": "^[ ]*\\{[a-zA-Z0-9_-]+\\}[ " "]*$", }, + "UNIPROT_ID": { + "base": "str", + "description": "A UniProt identifier", + "from_schema": "metadata", + "name": "UNIPROT_ID", + "pattern": 
"^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$", + }, "URLorS3URI": { "base": "str", "description": "A URL or S3 URI", @@ -587,6 +594,8 @@ class TiltseriesMicroscopeManufacturerEnum(str, Enum): TFS = "TFS" # JEOL Ltd. JEOL = "JEOL" + # Simulated data + SIMULATED = "SIMULATED" class FiducialAlignmentStatusEnum(str, Enum): @@ -1558,7 +1567,7 @@ class MicroscopeDetails(ConfiguredBaseModel): @field_validator("manufacturer") def pattern_manufacturer(cls, v): - pattern = re.compile(r"(^FEI$)|(^TFS$)|(^JEOL$)|(^[ ]*\{[a-zA-Z0-9_-]+\}[ ]*$)") + pattern = re.compile(r"(^FEI$)|(^TFS$)|(^JEOL$)|(^SIMULATED$)|(^[ ]*\{[a-zA-Z0-9_-]+\}[ ]*$)") if isinstance(v, list): for element in v: if not pattern.match(element): @@ -2507,6 +2516,7 @@ class AnnotationObject(ConfiguredBaseModel): json_schema_extra={ "linkml_meta": { "alias": "id", + "any_of": [{"range": "GO_ID"}, {"range": "UNIPROT_ID"}], "domain_of": ["TissueDetails", "CellType", "CellStrain", "CellComponent", "AnnotationObject"], "exact_mappings": ["cdp-common:annotation_object_id"], } @@ -2558,7 +2568,9 @@ class AnnotationObject(ConfiguredBaseModel): @field_validator("id") def pattern_id(cls, v): - pattern = re.compile(r"^GO:[0-9]{7}$") + pattern = re.compile( + r"(^GO:[0-9]{7}$)|(^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$)" + ) if isinstance(v, list): for element in v: if not pattern.match(element): @@ -3221,7 +3233,7 @@ class Annotation(AuthoredEntity, DateStampedEntity): ) annotation_publications: Optional[str] = Field( None, - description="""List of publication IDs (EMPIAR, EMDB, DOI) that describe this annotation method. Comma separated.""", + description="""List of publication IDs (EMPIAR, EMDB, DOI, PDB) that describe this annotation method. 
Comma separated.""", json_schema_extra={ "linkml_meta": { "alias": "annotation_publications", @@ -3335,7 +3347,7 @@ class Annotation(AuthoredEntity, DateStampedEntity): @field_validator("annotation_publications") def pattern_annotation_publications(cls, v): pattern = re.compile( - r"^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$" + r"^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$" ) if isinstance(v, list): for element in v: @@ -3504,7 +3516,7 @@ def pattern_publications(cls, v): @field_validator("related_database_entries") def pattern_related_database_entries(cls, v): pattern = re.compile( - r"(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)" + r"(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)" ) if isinstance(v, list): for element in v: @@ -3577,7 +3589,7 @@ def pattern_publications(cls, v): @field_validator("related_database_entries") def pattern_related_database_entries(cls, v): pattern = re.compile( - r"(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)" + 
r"(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)" ) if isinstance(v, list): for element in v: diff --git a/schema/core/v1.1.0/common.yaml b/schema/core/v1.1.0/common.yaml index 0f6fa4849..cb5c0c268 100644 --- a/schema/core/v1.1.0/common.yaml +++ b/schema/core/v1.1.0/common.yaml @@ -577,7 +577,7 @@ slots: description: Classification of the annotation method based on supervision. annotation_publications: range: EMPIAR_EMDB_DOI_PDB_LIST - description: List of publication IDs (EMPIAR, EMDB, DOI) that describe this annotation method. Comma separated. + description: List of publication IDs (EMPIAR, EMDB, DOI, PDB) that describe this annotation method. Comma separated. annotation_software: range: string description: Software used for generating this annotation @@ -619,7 +619,9 @@ slots: range: string description: A textual description of the annotation object, can be a longer description to include additional information not covered by the Annotation object name and state. annotation_object_id: - range: GO_ID + any_of: + - range: GO_ID + - range: UNIPROT_ID required: true description: Gene Ontology Cellular Component identifier for the annotation object annotation_object_name: @@ -872,6 +874,8 @@ enums: description: Thermo Fisher Scientific JEOL: description: JEOL Ltd. 
+ SIMULATED: + description: Simulated data fiducial_alignment_status_enum: description: Fiducial Alignment method @@ -958,6 +962,11 @@ types: base: str pattern: '^GO:[0-9]{7}$' + UNIPROT_ID: + description: A UniProt identifier + base: str + pattern: '^UniProtKB:([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$' + WORMBASE_ID: description: A WormBase identifier base: str @@ -991,17 +1000,17 @@ types: PDB_ID: description: A Protein Data Bank identifier base: str - pattern: '^pdb[0-9a-zA-Z]{4,8}$' + pattern: '^PDB-[0-9a-zA-Z]{4,8}$' EMPIAR_EMDB_PDB_LIST: description: A list of EMPIAR, EMDB, and PDB identifiers base: str - pattern: '^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$' + pattern: '^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$' EMPIAR_EMDB_DOI_PDB_LIST: description: A list of EMPIAR, EMDB, DOI, and PDB identifiers base: str - pattern: '^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$' + pattern: '^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$' classes: Any: diff --git a/schema/core/v2.0.0/codegen/metadata_materialized.yaml b/schema/core/v2.0.0/codegen/metadata_materialized.yaml index 2ca196933..1335b5a1d 100644 --- a/schema/core/v2.0.0/codegen/metadata_materialized.yaml +++ b/schema/core/v2.0.0/codegen/metadata_materialized.yaml @@ -312,6 +312,12 @@ types: from_schema: metadata base: string pattern: ^GO:[0-9]{7}$ + UNIPROT_ID: + name: UNIPROT_ID + description: A UniProt identifier + from_schema: metadata + base: str + pattern: 
^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$ WORMBASE_ID: name: WORMBASE_ID description: A WormBase identifier @@ -353,19 +359,19 @@ types: description: A Protein Data Bank identifier from_schema: metadata base: string - pattern: ^pdb[0-9a-zA-Z]{4,8}$ + pattern: ^PDB-[0-9a-zA-Z]{4,8}$ EMPIAR_EMDB_PDB_LIST: name: EMPIAR_EMDB_PDB_LIST description: A list of EMPIAR, EMDB, and PDB identifiers from_schema: metadata base: string - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$ EMPIAR_EMDB_DOI_PDB_LIST: name: EMPIAR_EMDB_DOI_PDB_LIST description: A list of EMPIAR, EMDB, DOI, and PDB identifiers from_schema: metadata base: string - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$ enums: annotation_file_source_enum: name: annotation_file_source_enum @@ -509,6 +515,9 @@ enums: JEOL: text: JEOL description: JEOL Ltd. 
+ SIMULATED: + text: SIMULATED + description: Simulated data fiducial_alignment_status_enum: name: fiducial_alignment_status_enum description: Fiducial Alignment method @@ -720,7 +729,7 @@ classes: owner: Author domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true primary_author_status: @@ -734,7 +743,7 @@ classes: owner: Author domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true FundingDetails: @@ -789,7 +798,7 @@ classes: owner: DateStamp domain_of: - DateStampedEntityMixin - range: string + range: date required: true inlined: true inlined_as_list: true @@ -803,7 +812,7 @@ classes: owner: DateStamp domain_of: - DateStampedEntityMixin - range: string + range: date required: true inlined: true inlined_as_list: true @@ -818,7 +827,7 @@ classes: owner: DateStamp domain_of: - DateStampedEntityMixin - range: string + range: date required: true inlined: true inlined_as_list: true @@ -913,7 +922,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. @@ -2468,8 +2477,7 @@ classes: attributes: id: name: id - description: Gene Ontology Cellular Component identifier for the annotation - object + description: A placeholder for any type of data. 
from_schema: metadata exact_mappings: - cdp-common:annotation_object_id @@ -2481,11 +2489,14 @@ classes: - CellStrain - CellComponent - AnnotationObject - range: GO_ID + range: Any required: true inlined: true inlined_as_list: true - pattern: ^GO:[0-9]{7}$ + pattern: (^GO:[0-9]{7}$)|(^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$) + any_of: + - range: GO_ID + - range: UNIPROT_ID name: name: name description: Name of the object being annotated (e.g. ribosome, nuclear pore @@ -3128,8 +3139,8 @@ classes: inlined_as_list: true annotation_publications: name: annotation_publications - description: List of publication IDs (EMPIAR, EMDB, DOI) that describe this - annotation method. Comma separated. + description: List of publication IDs (EMPIAR, EMDB, DOI, PDB) that describe + this annotation method. Comma separated. from_schema: metadata exact_mappings: - cdp-common:annotation_publications @@ -3140,7 +3151,7 @@ classes: range: EMPIAR_EMDB_DOI_PDB_LIST inlined: true inlined_as_list: true - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$ annotation_software: name: annotation_software description: Software used for generating this annotation @@ -3302,7 +3313,7 @@ classes: owner: DateStampedEntityMixin domain_of: - DateStampedEntityMixin - range: string + range: date required: true inlined: true inlined_as_list: true @@ -3316,7 +3327,7 @@ classes: owner: DateStampedEntityMixin domain_of: - DateStampedEntityMixin - range: string + range: date required: true inlined: true inlined_as_list: true @@ -3331,7 +3342,7 @@ classes: owner: DateStampedEntityMixin domain_of: - 
DateStampedEntityMixin - range: string + range: date required: true inlined: true inlined_as_list: true @@ -3367,7 +3378,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. @@ -3480,7 +3491,7 @@ classes: owner: AuthorMixin domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true primary_author_status: @@ -3494,7 +3505,7 @@ classes: owner: AuthorMixin domain_of: - AuthorMixin - range: string + range: boolean inlined: true inlined_as_list: true source_file: core/v2.0.0/metadata.yaml diff --git a/schema/core/v2.0.0/common.yaml b/schema/core/v2.0.0/common.yaml index 46618809b..b4162bcfb 100644 --- a/schema/core/v2.0.0/common.yaml +++ b/schema/core/v2.0.0/common.yaml @@ -13,7 +13,7 @@ prefixes: imports: - linkml:types default_prefix: cdp-common -default_range: string +default_range: Any slots: @@ -580,7 +580,7 @@ slots: description: Classification of the annotation method based on supervision. annotation_publications: range: EMPIAR_EMDB_DOI_PDB_LIST - description: List of publication IDs (EMPIAR, EMDB, DOI) that describe this annotation method. Comma separated. + description: List of publication IDs (EMPIAR, EMDB, DOI, PDB) that describe this annotation method. Comma separated. 
annotation_software: range: string description: Software used for generating this annotation @@ -622,7 +622,9 @@ slots: range: string description: A textual description of the annotation object, can be a longer description to include additional information not covered by the Annotation object name and state. annotation_object_id: - range: GO_ID + any_of: + - range: GO_ID + - range: UNIPROT_ID required: true description: Gene Ontology Cellular Component identifier for the annotation object annotation_object_name: @@ -1051,6 +1053,8 @@ enums: description: Thermo Fisher Scientific JEOL: description: JEOL Ltd. + SIMULATED: + description: Simulated data fiducial_alignment_status_enum: description: Fiducial Alignment method @@ -1147,6 +1151,11 @@ types: base: string pattern: '^GO:[0-9]{7}$' + UNIPROT_ID: + description: A UniProt identifier + base: str + pattern: '^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$' + WORMBASE_ID: description: A WormBase identifier base: string @@ -1180,17 +1189,17 @@ types: PDB_ID: description: A Protein Data Bank identifier base: string - pattern: '^pdb[0-9a-zA-Z]{4,8}$' + pattern: '^PDB-[0-9a-zA-Z]{4,8}$' EMPIAR_EMDB_PDB_LIST: description: A list of EMPIAR, EMDB, and PDB identifiers base: string - pattern: '^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$' + pattern: '^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$' EMPIAR_EMDB_DOI_PDB_LIST: description: A list of EMPIAR, EMDB, DOI, and PDB identifiers base: string - pattern: '^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$' + pattern: 
'^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$' classes: Any: diff --git a/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models.py b/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models.py index 4dbbd6175..4e26bf268 100644 --- a/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models.py +++ b/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models.py @@ -94,14 +94,14 @@ def __contains__(self, key: str) -> bool: "description": "A list of EMPIAR, " "EMDB, DOI, and PDB " "identifiers", "from_schema": "cdp-ingestion-config", "name": "EMPIAR_EMDB_DOI_PDB_LIST", - "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$", + "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$", }, "EMPIAR_EMDB_PDB_LIST": { "base": "str", "description": "A list of EMPIAR, EMDB, " "and PDB identifiers", "from_schema": "cdp-ingestion-config", "name": "EMPIAR_EMDB_PDB_LIST", - "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$", + "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$", }, "EMPIAR_ID": { "base": "str", @@ -150,7 +150,7 @@ def __contains__(self, key: str) -> bool: "description": "A Protein Data Bank identifier", "from_schema": "cdp-ingestion-config", "name": "PDB_ID", - "pattern": "^pdb[0-9a-zA-Z]{4,8}$", + "pattern": "^PDB-[0-9a-zA-Z]{4,8}$", }, "StringFormattedString": { "base": "str", @@ -159,6 +159,13 @@ 
def __contains__(self, key: str) -> bool: "name": "StringFormattedString", "pattern": "^[ ]*\\{[a-zA-Z0-9_-]+\\}[ " "]*$", }, + "UNIPROT_ID": { + "base": "str", + "description": "A UniProt identifier", + "from_schema": "cdp-ingestion-config", + "name": "UNIPROT_ID", + "pattern": "^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$", + }, "URLorS3URI": { "base": "str", "description": "A URL or S3 URI", @@ -583,6 +590,8 @@ class TiltseriesMicroscopeManufacturerEnum(str, Enum): TFS = "TFS" # JEOL Ltd. JEOL = "JEOL" + # Simulated data + SIMULATED = "SIMULATED" class FiducialAlignmentStatusEnum(str, Enum): @@ -1554,7 +1563,7 @@ class MicroscopeDetails(ConfiguredBaseModel): @field_validator("manufacturer") def pattern_manufacturer(cls, v): - pattern = re.compile(r"(^FEI$)|(^TFS$)|(^JEOL$)|(^[ ]*\{[a-zA-Z0-9_-]+\}[ ]*$)") + pattern = re.compile(r"(^FEI$)|(^TFS$)|(^JEOL$)|(^SIMULATED$)|(^[ ]*\{[a-zA-Z0-9_-]+\}[ ]*$)") if isinstance(v, list): for element in v: if not pattern.match(element): @@ -2503,6 +2512,7 @@ class AnnotationObject(ConfiguredBaseModel): json_schema_extra={ "linkml_meta": { "alias": "id", + "any_of": [{"range": "GO_ID"}, {"range": "UNIPROT_ID"}], "domain_of": ["TissueDetails", "CellType", "CellStrain", "CellComponent", "AnnotationObject"], "exact_mappings": ["cdp-common:annotation_object_id"], } @@ -2554,7 +2564,9 @@ class AnnotationObject(ConfiguredBaseModel): @field_validator("id") def pattern_id(cls, v): - pattern = re.compile(r"^GO:[0-9]{7}$") + pattern = re.compile( + r"(^GO:[0-9]{7}$)|(^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$)" + ) if isinstance(v, list): for element in v: if not pattern.match(element): @@ -3217,7 +3229,7 @@ class Annotation(AuthoredEntity, DateStampedEntity): ) annotation_publications: Optional[str] = Field( None, - description="""List of publication IDs (EMPIAR, EMDB, DOI) that describe this annotation method. 
Comma separated.""", + description="""List of publication IDs (EMPIAR, EMDB, DOI, PDB) that describe this annotation method. Comma separated.""", json_schema_extra={ "linkml_meta": { "alias": "annotation_publications", @@ -3331,7 +3343,7 @@ class Annotation(AuthoredEntity, DateStampedEntity): @field_validator("annotation_publications") def pattern_annotation_publications(cls, v): pattern = re.compile( - r"^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$" + r"^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$" ) if isinstance(v, list): for element in v: @@ -3500,7 +3512,7 @@ def pattern_publications(cls, v): @field_validator("related_database_entries") def pattern_related_database_entries(cls, v): pattern = re.compile( - r"(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)" + r"(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)" ) if isinstance(v, list): for element in v: @@ -3573,7 +3585,7 @@ def pattern_publications(cls, v): @field_validator("related_database_entries") def pattern_related_database_entries(cls, v): pattern = re.compile( - r"(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)" + 
r"(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)" ) if isinstance(v, list): for element in v: diff --git a/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models.schema.json b/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models.schema.json index f4057fea0..10d4fcca4 100644 --- a/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models.schema.json +++ b/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models.schema.json @@ -13,8 +13,8 @@ "description": "Metadata describing the object being annotated." }, "annotation_publications": { - "description": "List of publication IDs (EMPIAR, EMDB, DOI) that describe this annotation method. Comma separated.", - "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$", + "description": "List of publication IDs (EMPIAR, EMDB, DOI, PDB) that describe this annotation method. 
Comma separated.", + "pattern": "^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$", "type": [ "string", "null" @@ -296,9 +296,19 @@ ] }, "id": { + "$ref": "#/$defs/Any", + "anyOf": [ + { + "pattern": "^GO:[0-9]{7}$", + "type": "string" + }, + { + "pattern": "^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$", + "type": "string" + } + ], "description": "Gene Ontology Cellular Component identifier for the annotation object", - "pattern": "^GO:[0-9]{7}$", - "type": "string" + "pattern": "(^GO:[0-9]{7}$)|(^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$)" }, "name": { "description": "Name of the object being annotated (e.g. ribosome, nuclear pore complex, actin filament, membrane)", @@ -1089,7 +1099,7 @@ }, "related_database_entries": { "description": "Comma-separated list of related database entries for the dataset.", - "pattern": "(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)", + "pattern": "(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\\s*,\\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)", "type": [ "string", "null" @@ -2536,7 +2546,7 @@ } ], "description": "Name of the microscope manufacturer", - "pattern": "(^FEI$)|(^TFS$)|(^JEOL$)|(^[ ]*\\{[a-zA-Z0-9_-]+\\}[ ]*$)" + "pattern": "(^FEI$)|(^TFS$)|(^JEOL$)|(^SIMULATED$)|(^[ ]*\\{[a-zA-Z0-9_-]+\\}[ ]*$)" }, "model": { "description": "Microscope model name", @@ -3494,7 +3504,8 @@ "enum": [ "FEI", "TFS", - "JEOL" + "JEOL", + "SIMULATED" ], "title": 
"TiltseriesMicroscopeManufacturerEnum", "type": "string" diff --git a/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models_materialized.yaml b/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models_materialized.yaml index a1d86f1ea..8d05673bc 100644 --- a/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models_materialized.yaml +++ b/schema/ingestion_config/v1.0.0/codegen/ingestion_config_models_materialized.yaml @@ -296,6 +296,12 @@ types: from_schema: cdp-ingestion-config base: str pattern: ^GO:[0-9]{7}$ + UNIPROT_ID: + name: UNIPROT_ID + description: A UniProt identifier + from_schema: cdp-ingestion-config + base: str + pattern: ^UniProtKB:[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}+$ WORMBASE_ID: name: WORMBASE_ID description: A WormBase identifier @@ -337,19 +343,19 @@ types: description: A Protein Data Bank identifier from_schema: cdp-ingestion-config base: str - pattern: ^pdb[0-9a-zA-Z]{4,8}$ + pattern: ^PDB-[0-9a-zA-Z]{4,8}$ EMPIAR_EMDB_PDB_LIST: name: EMPIAR_EMDB_PDB_LIST description: A list of EMPIAR, EMDB, and PDB identifiers from_schema: cdp-ingestion-config base: str - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$ EMPIAR_EMDB_DOI_PDB_LIST: name: EMPIAR_EMDB_DOI_PDB_LIST description: A list of EMPIAR, EMDB, DOI, and PDB identifiers from_schema: cdp-ingestion-config base: str - pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|pdb[0-9a-zA-Z]{4,8}))*$ + pattern: ^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+|PDB-[0-9a-zA-Z]{4,8}))*$ 
enums: annotation_method_type_enum: name: annotation_method_type_enum @@ -479,6 +485,9 @@ enums: JEOL: text: JEOL description: JEOL Ltd. + SIMULATED: + text: SIMULATED + description: Simulated data fiducial_alignment_status_enum: name: fiducial_alignment_status_enum description: Fiducial Alignment method @@ -4106,7 +4115,7 @@ classes: recommended: true inlined: true inlined_as_list: true - pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|pdb[0-9a-zA-Z]{4,8}))*$) + pattern: (^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$)|(^(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8})(\s*,\s*(EMPIAR-[0-9]{5}|EMD-[0-9]{4,5}|PDB-[0-9a-zA-Z]{4,8}))*$) related_database_links: name: related_database_links description: Comma-separated list of related database links for the dataset. 
diff --git a/schema/ingestion_config/v1.0.0/ingestion_config_models_extended.py b/schema/ingestion_config/v1.0.0/ingestion_config_models_extended.py index 3fd1a2786..04240178d 100644 --- a/schema/ingestion_config/v1.0.0/ingestion_config_models_extended.py +++ b/schema/ingestion_config/v1.0.0/ingestion_config_models_extended.py @@ -69,6 +69,8 @@ validation_exclusions = {} CELLULAR_COMPONENT_GO_ID = "GO:0005575" +GO_ID_REGEX = r"^GO:[0-9]{7}$" +UNIPROT_ID_REGEX = r"^UniProtKB:[A-Z0-9]+$" STRING_FORMATTED_STRING_REGEX = r"^[ ]*\{[a-zA-Z0-9_-]+\}[ ]*$" VALID_IMAGE_FORMATS = ("image/png", "image/jpeg", "image/jpg", "image/gif") # Note that model namees should all be uppercase or pascal case @@ -275,7 +277,7 @@ async def validate_wormbase_id(id: str) -> Tuple[List[str], bool]: async with aiohttp.ClientSession() as session, session.get(names_url) as response: if response.status >= 400: - return names, True + return [], True data = await response.json() if other_names := data.get("other_names", {}).get("data", []): names += other_names @@ -283,6 +285,29 @@ async def validate_wormbase_id(id: str) -> Tuple[List[str], bool]: return names, True +@alru_cache +async def validate_uniprot_id(id: str) -> Tuple[List[str], bool]: + """ + Returns a tuple of the names retrieved for the UniProtKB ID and whether or not the ID is valid.
+ """ + + # Strip the leading UniProtKB: prefix (anchored, so only the prefix is removed) + id = id.removeprefix("UniProtKB:") + url = f"https://rest.uniprot.org/uniprotkb/{id}" + + logger.debug("Getting ID %s at %s", id, url) + + async with aiohttp.ClientSession() as session, session.get(url) as response: + if response.status >= 400: + return [], False + data = await response.json() + try: + name = data["proteinDescription"]["recommendedName"]["fullName"]["value"] + return [name], True + except KeyError: + return [], True + + def validate_id_name_object( self: Union[AnnotationObject, CellComponent, CellStrain, CellType, OrganismDetails, TissueDetails], id: str, @@ -321,7 +346,8 @@ def validate_id_name_object( logger.debug("Valid ID, now checking if name '%s' matches ID: %s", name, id) - valid_name = any(name == retrieved_name for retrieved_name in retrieved_names) + # if the retrieved names list is empty there is nothing to compare against, so the name is accepted + valid_name = retrieved_names == [] or any(name == retrieved_name for retrieved_name in retrieved_names) if not valid_name: raise ValueError(f"name '{name}' does not match id: {id}") @@ -406,6 +432,8 @@ class ExtendedValidationDepositionKeyPhotoSource(DepositionKeyPhotoSource): # ============================================================================== @alru_cache async def lookup_doi(doi: str) -> Tuple[str, bool]: + # Remove only a leading doi: prefix; the DOI suffix may itself legally contain "doi:" + doi = doi.removeprefix("doi:") url = f"https://api.crossref.org/works/{doi}/agency" async with aiohttp.ClientSession() as session, session.head(url) as response: return doi, response.status == 200 @@ -427,6 +455,8 @@ async def lookup_emdb(emdb_id: str) -> Tuple[str, bool]: @alru_cache async def lookup_pdb(pdb_id: str) -> Tuple[str, bool]: + # Strip the leading PDB- prefix + pdb_id = pdb_id.removeprefix("PDB-") url = f"https://data.rcsb.org/rest/v1/core/entry/{pdb_id}" async with aiohttp.ClientSession() as session, session.head(url) as response: return pdb_id, response.status == 200 @@ -436,7 +466,7 @@ async def lookup_pdb(pdb_id: str) ->
Tuple[str, bool]: "doi": (r"^(doi:)?10\.[0-9]{4,9}/[-._;()/:a-zA-Z0-9]+$", lookup_doi), "empiar": (r"^EMPIAR-[0-9]{5}$", lookup_empiar), "emdb": (r"^EMD-[0-9]{4,5}$", lookup_emdb), - "pdb": (r"^pdb[0-9a-zA-Z]{4,8}$", lookup_pdb), + "pdb": (r"^PDB-[0-9a-zA-Z]{4,8}$", lookup_pdb), } @@ -444,12 +474,10 @@ async def validate_publication_lists(publication_list: List[str]) -> List[str]: tasks = [] for publication in publication_list: - for publication_type, (regex, validate_function) in PUBLICATION_REGEXES_AND_FUNCTIONS.items(): + for regex, validate_function in PUBLICATION_REGEXES_AND_FUNCTIONS.values(): if not re.match(regex, publication): continue - # edge case for DOI, remove the doi: prefix - updated_publication = publication.replace("doi:", "") if publication_type == "doi" else publication - tasks.append(validate_function(updated_publication)) + tasks.append(validate_function(publication)) break results = await asyncio.gather(*tasks) @@ -516,7 +544,10 @@ def validate_sources(source_list: List[DefaultSource] | List[VoxelSpacingSource] class ExtendedValidationAnnotationObject(AnnotationObject): @model_validator(mode="after") def validate_annotation_object(self) -> Self: - validate_id_name_object(self, self.id, self.name, ancestor=CELLULAR_COMPONENT_GO_ID) + if re.match(GO_ID_REGEX, self.id): + validate_id_name_object(self, self.id, self.name, ancestor=CELLULAR_COMPONENT_GO_ID) + elif re.match(UNIPROT_ID_REGEX, self.id): + validate_id_name_object(self, self.id, self.name, validate_id_function=validate_uniprot_id) return self