diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py index 9b5967d0..cef4ceae 100644 --- a/src/gene/database/postgresql.py +++ b/src/gene/database/postgresql.py @@ -569,7 +569,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: :param src_name: name of source for record. Not used by PostgreSQL instance. """ concept_id = record["concept_id"] - locations = [json.dumps(loc) for loc in record.get("locations", [])] + locations = [loc.model_dump_json() for loc in record.get("locations", [])] if not locations: locations = None with self.conn.cursor() as cur: diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py index 93804f35..768165ae 100644 --- a/src/gene/etl/base.py +++ b/src/gene/etl/base.py @@ -12,7 +12,7 @@ from wags_tails import EnsemblData, HgncData, NcbiGeneData from gene.database import AbstractDatabase -from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName +from gene.schemas import ITEM_TYPES, Gene, MatchType, SourceName, StoredSequenceLocation _logger = logging.getLogger(__name__) @@ -122,11 +122,6 @@ def _load_gene(self, gene: Dict) -> None: except pydantic.ValidationError as e: _logger.warning(f"Unable to load {gene} due to validation error: " f"{e}") else: - concept_id = gene["concept_id"] - gene["label_and_type"] = f"{concept_id.lower()}##identity" - gene["src_name"] = self._src_name.value - gene["item_type"] = "identity" - for attr_type in ITEM_TYPES: if attr_type in gene: value = gene[attr_type] @@ -137,7 +132,7 @@ def _load_gene(self, gene: Dict) -> None: gene[attr_type] = list(set(value)) self._database.add_record(gene, self._src_name) - self._processed_ids.append(concept_id) + self._processed_ids.append(gene["concept_id"]) def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo: """Return SeqRepo instance if seqrepo_dir exists. 
@@ -224,32 +219,33 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]:
             _logger.warning(f"SeqRepo raised KeyError: {e}")
         return aliases
 
-    def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict:
-        """Get a gene's GeneSequenceLocation.
+    def _build_sequence_location(
+        self, seq_id: str, gene: Feature, concept_id: str
+    ) -> Optional[StoredSequenceLocation]:
+        """Construct a sequence location for storing in a DB.
 
         :param seq_id: The sequence ID.
         :param gene: A gene from the source file.
-        :param params: The transformed gene record.
-        :return: A dictionary of a GA4GH VRS SequenceLocation, if seq_id alias found.
-            Else, empty dictionary
+        :param concept_id: record ID from source
+        :return: A storable SequenceLocation containing relevant params for returning a
+            VRS SequenceLocation, or None if unable to retrieve valid parameters
         """
-        location = {}
         aliases = self._get_seq_id_aliases(seq_id)
-        if not aliases:
-            return location
+        if not aliases or gene.start is None or gene.end is None:
+            return None
 
         sequence = aliases[0]
 
         if gene.start != "." and gene.end != "." and sequence:
-            if 0 <= gene.start <= gene.end:  # type: ignore
-                location = GeneSequenceLocation(
-                    start=gene.start - 1,  # type: ignore
-                    end=gene.end,  # type: ignore
+            if 0 <= gene.start <= gene.end:
+                return StoredSequenceLocation(
+                    start=gene.start - 1,
+                    end=gene.end,
                     sequence_id=sequence,
-                ).model_dump()  # type: ignore
+                )
             else:
                 _logger.warning(
-                    f"{params['concept_id']} has invalid interval:"
-                    f"start={gene.start - 1} end={gene.end}"
-                )  # type: ignore
-        return location
+                    f"{concept_id} has invalid interval: start={gene.start - 1} end={gene.end}"
+                )
+        # Reached for "." coordinates, an empty alias, or an invalid interval.
+        return None
diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py
index c640b6ac..bb590047 100644
--- a/src/gene/etl/ensembl.py
+++ b/src/gene/etl/ensembl.py
@@ -7,7 +7,12 @@
 from gffutils.feature import Feature
 
 from gene.etl.base import Base, GeneNormalizerEtlError
-from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand
+from gene.schemas import (
+    DataLicenseAttributes,
+    NamespacePrefix,
+    SourceMeta,
+    Strand,
+)
 
 _logger = logging.getLogger(__name__)
 
@@ -66,22 +71,23 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
         :param accession_numbers: Accession numbers for each chromosome and scaffold
         :return: A gene dictionary containing data if the ID attribute exists.
         """
-        gene = dict()
+        gene_params = dict()
         if f.strand == "-":
-            gene["strand"] = Strand.REVERSE.value
+            gene_params["strand"] = Strand.REVERSE.value
         elif f.strand == "+":
-            gene["strand"] = Strand.FORWARD.value
-        gene["src_name"] = SourceName.ENSEMBL.value
+            gene_params["strand"] = Strand.FORWARD.value
 
-        self._add_attributes(f, gene)
-        location = self._add_location(f, gene, accession_numbers)
+        self._add_attributes(f, gene_params)
+        location = self._build_sequence_location(
+            accession_numbers[f.seqid], f, gene_params["concept_id"]
+        )
         if location:
-            gene["locations"] = [location]
+            gene_params["locations"] = [location]
 
-        gene["label_and_type"] = f"{gene['concept_id'].lower()}##identity"
-        gene["item_type"] = "identity"
+        gene_params["label_and_type"] = f"{gene_params['concept_id'].lower()}##identity"
+        gene_params["item_type"] = "identity"
 
-        return gene
+        return gene_params
 
     def _add_attributes(self, f: Feature, gene: Dict) -> None:
         """Add concept_id, symbol, xrefs, and associated_with to a gene record.
@@ -132,17 +138,6 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None:
 
                 gene[attributes[key]] = val
 
-    def _add_location(self, f: Feature, gene: Dict, accession_numbers: Dict) -> Dict:
-        """Add GA4GH SequenceLocation to a gene record.
-        https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation
-
-        :param f: A gene from the data
-        :param gene: A transformed gene record
-        :param accession_numbers: Accession numbers for each chromosome and scaffold
-        :return: gene record dictionary with location added
-        """
-        return self._get_sequence_location(accession_numbers[f.seqid], f, gene)
-
     def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
         """Get xref or associated_with concept.
 
@@ -181,11 +176,11 @@ def _add_meta(self) -> None:
                 "genome_annotations": f"ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz"
             },
             rdp_url=None,
-            data_license_attributes={
-                "non_commercial": False,
-                "share_alike": False,
-                "attribution": False,
-            },
+            data_license_attributes=DataLicenseAttributes(
+                non_commercial=False,
+                share_alike=False,
+                attribution=False,
+            ),
             genome_assemblies=[self._assembly],
         )
 
diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py
index 39e20ffc..2a61fc48 100644
--- a/src/gene/etl/hgnc.py
+++ b/src/gene/etl/hgnc.py
@@ -30,19 +30,16 @@ def _transform_data(self) -> None:
         records = data["response"]["docs"]
 
         for r in records:
-            gene = dict()
-            gene["concept_id"] = r["hgnc_id"].lower()
-            gene["label_and_type"] = f"{gene['concept_id']}##identity"
-            gene["item_type"] = "identity"
-            gene["symbol"] = r["symbol"]
-            gene["label"] = r["name"]
-            gene["src_name"] = SourceName.HGNC.value
+            gene = {
+                "concept_id": r["hgnc_id"].lower(),
+                "symbol": r["symbol"],
+                "label": r["name"],
+            }
             if r["status"]:
                 if r["status"] == "Approved":
                     gene["symbol_status"] = SymbolStatus.APPROVED.value
                 elif r["status"] == "Entry Withdrawn":
                     gene["symbol_status"] = SymbolStatus.WITHDRAWN.value
-            gene["src_name"] = SourceName.HGNC.value
 
             # store alias, xref, associated_with, prev_symbols, location
             self._get_aliases(r, gene)
@@ -83,7 +80,7 @@ def _get_previous_symbols(self, r: Dict, gene: Dict) -> None:
         if prev_symbols:
             gene["previous_symbols"] = list(set(prev_symbols))
 
-    def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
+    def _get_xrefs_associated_with(self, record: Dict, gene: Dict) -> None:
         """Store xrefs and/or associated_with refs in a gene record.
 
         :param r: A gene record in the HGNC data file
@@ -119,7 +116,7 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
         ]
 
         for src in sources:
-            if src in r:
+            if src in record:
                 if "-" in src:
                     key = src.split("-")[0]
                 elif "." in src:
@@ -131,9 +128,11 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
 
                 if key.upper() in NamespacePrefix.__members__:
                     if NamespacePrefix[key.upper()].value in PREFIX_LOOKUP.keys():
-                        self._get_xref_associated_with(key, src, r, xrefs)
+                        self._get_xref_associated_with(key, src, record, xrefs)
                     else:
-                        self._get_xref_associated_with(key, src, r, associated_with)
+                        self._get_xref_associated_with(
+                            key, src, record, associated_with
+                        )
                 else:
                     _logger.warning(f"{key} not in schemas.py")
 
diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py
index da5d97e8..6e13da85 100644
--- a/src/gene/etl/ncbi.py
+++ b/src/gene/etl/ncbi.py
@@ -18,6 +18,7 @@
     NamespacePrefix,
     SourceMeta,
     SourceName,
+    StoredSequenceLocation,
     SymbolStatus,
 )
 
@@ -191,11 +192,10 @@ def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None:
                 if f_id.startswith("gene"):
                     symbol = f.attributes["Name"][0]
                     if symbol in info_genes:
-                        # Just need to add SequenceLocation
-                        params = info_genes.get(symbol)
+                        params: Dict = info_genes[symbol]
                         vrs_sq_location = self._get_vrs_sq_location(db, params, f_id)
                         if vrs_sq_location:
-                            params["locations"].append(vrs_sq_location)  # type: ignore
+                            params["locations"].append(vrs_sq_location)
                     else:
                         # Need to add entire gene
                         gene = self._add_gff_gene(db, f, f_id)
@@ -212,7 +212,6 @@ def _add_gff_gene(
         :return: A gene dictionary if the ID attribute exists. Else return None.
         """
         params = dict()
-        params["src_name"] = SourceName.NCBI.value
         self._add_attributes(f, params)
         sq_loc = self._get_vrs_sq_location(db, params, f_id)
         if sq_loc:
@@ -245,18 +244,18 @@ def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None:
 
     def _get_vrs_sq_location(
         self, db: gffutils.FeatureDB, params: Dict, f_id: str
-    ) -> Dict:
+    ) -> Optional[StoredSequenceLocation]:
         """Store GA4GH VRS SequenceLocation in a gene record.
         https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation
 
         :param db: GFF database
         :param params: A transformed gene record
         :param f_id: The feature's ID
-        :return: A GA4GH VRS SequenceLocation
+        :return: A storable set of SequenceLocation params
         """
         gene = db[f_id]
         params["strand"] = gene.strand
-        return self._get_sequence_location(gene.seqid, gene, params)
+        return self._build_sequence_location(gene.seqid, gene, params["concept_id"])
 
     def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
         """Get xref or associated_with ref.
diff --git a/src/gene/schemas.py b/src/gene/schemas.py index e6cb5183..9e0965d4 100644 --- a/src/gene/schemas.py +++ b/src/gene/schemas.py @@ -58,13 +58,12 @@ class MatchType(IntEnum): PREV_SYMBOL = 80 ALIAS = 60 XREF = 60 - ASSOCIATED_WITH = 60 FUZZY_MATCH = 20 NO_MATCH = 0 -class GeneSequenceLocation(BaseModel): - """Sequence Location model when storing in DynamoDB.""" +class StoredSequenceLocation(BaseModel): + """Sequence Location model when storing in database.""" type: Literal["SequenceLocation"] = "SequenceLocation" start: StrictInt @@ -73,7 +72,7 @@ class GeneSequenceLocation(BaseModel): # class GeneChromosomeLocation(BaseModel): -# """Chromosome Location model when storing in DynamDB.""" +# """Chromosome Location model when storing in database.""" # type: Literal["ChromosomeLocation"] = "ChromosomeLocation" # species_id: Literal["taxonomy:9606"] = "taxonomy:9606" @@ -94,14 +93,13 @@ class BaseGene(BaseModel): strand: Optional[Strand] = None location_annotations: List[StrictStr] = [] locations: Union[ - List[models.SequenceLocation], List[GeneSequenceLocation] + List[models.SequenceLocation], List[StoredSequenceLocation] # List[Union[SequenceLocation, ChromosomeLocation]], # List[Union[GeneSequenceLocation, GeneChromosomeLocation]] # dynamodb ] = [] aliases: List[StrictStr] = [] previous_symbols: List[StrictStr] = [] xrefs: List[CURIE] = [] - associated_with: List[CURIE] = [] gene_type: Optional[StrictStr] = None @@ -123,7 +121,6 @@ class Gene(BaseGene): "strand": "-", "locations": [], "location_annotations": [], - "associated_with": [], "gene_type": None, "match_type": 100, } @@ -242,7 +239,6 @@ class RefType(str, Enum): PREVIOUS_SYMBOLS = "prev_symbol" ALIASES = "alias" XREFS = "xref" - ASSOCIATED_WITH = "associated_with" # collective name to singular name, e.g. {"previous_symbols": "prev_symbol"} @@ -257,7 +253,7 @@ class SourceMeta(BaseModel): version: StrictStr data_url: Dict[StrictStr, StrictStr] # TODO strictness necessary? 
rdp_url: Optional[StrictStr] = None - data_license_attributes: Dict[StrictStr, StrictBool] + data_license_attributes: DataLicenseAttributes genome_assemblies: List[StrictStr] = [] model_config = ConfigDict( @@ -561,8 +557,9 @@ class UnmergedNormalizationService(BaseNormalizationService): ], "aliases": ["3.1.1.7"], "previous_symbols": ["YT"], - "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"], - "associated_with": [ + "xrefs": [ + "ncbigene:43", + "ensembl:ENSG00000087085", "ucsc:uc003uxi.4", "vega:OTTHUMG00000157033", "merops:S09.979", @@ -618,7 +615,6 @@ class UnmergedNormalizationService(BaseNormalizationService): "aliases": [], "previous_symbols": [], "xrefs": ["hgnc:108"], - "associated_with": [], "gene_type": "protein_coding", } ], @@ -669,8 +665,11 @@ class UnmergedNormalizationService(BaseNormalizationService): ], "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"], "previous_symbols": ["ACEE"], - "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"], - "associated_with": ["omim:100740"], + "xrefs": [ + "hgnc:108", + "ensembl:ENSG00000087085", + "omim:100740", + ], "gene_type": "protein-coding", } ],