Skip to content

Commit

Permalink
test
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Jan 2, 2024
1 parent 305359b commit 0c1303f
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 85 deletions.
2 changes: 1 addition & 1 deletion src/gene/database/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
:param src_name: name of source for record. Not used by PostgreSQL instance.
"""
concept_id = record["concept_id"]
locations = [json.dumps(loc) for loc in record.get("locations", [])]
locations = [loc.model_dump_json() for loc in record.get("locations", [])]
if not locations:
locations = None
with self.conn.cursor() as cur:
Expand Down
42 changes: 18 additions & 24 deletions src/gene/etl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from wags_tails import EnsemblData, HgncData, NcbiGeneData

from gene.database import AbstractDatabase
from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName
from gene.schemas import ITEM_TYPES, Gene, MatchType, SourceName, StoredSequenceLocation

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -122,11 +122,6 @@ def _load_gene(self, gene: Dict) -> None:
except pydantic.ValidationError as e:
_logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
else:
concept_id = gene["concept_id"]
gene["label_and_type"] = f"{concept_id.lower()}##identity"
gene["src_name"] = self._src_name.value
gene["item_type"] = "identity"

for attr_type in ITEM_TYPES:
if attr_type in gene:
value = gene[attr_type]
Expand All @@ -137,7 +132,7 @@ def _load_gene(self, gene: Dict) -> None:
gene[attr_type] = list(set(value))

self._database.add_record(gene, self._src_name)
self._processed_ids.append(concept_id)
self._processed_ids.append(gene["concept_id"])

def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo:
"""Return SeqRepo instance if seqrepo_dir exists.
Expand Down Expand Up @@ -224,32 +219,31 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]:
_logger.warning(f"SeqRepo raised KeyError: {e}")
return aliases

def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict:
"""Get a gene's GeneSequenceLocation.
def _build_sequence_location(
self, seq_id: str, gene: Feature, concept_id: str
) -> Optional[StoredSequenceLocation]:
"""Construct a sequence location for storing in a DB.
:param seq_id: The sequence ID.
:param gene: A gene from the source file.
:param params: The transformed gene record.
:return: A dictionary of a GA4GH VRS SequenceLocation, if seq_id alias found.
Else, empty dictionary
:param concept_id: record ID from source
:return: A storable SequenceLocation containing relevant params for returning a
VRS SequenceLocation, or None if unable to retrieve valid parameters
"""
location = {}
aliases = self._get_seq_id_aliases(seq_id)
if not aliases:
return location
if not aliases or gene.start is None or gene.end is None:
return None

sequence = aliases[0]

if gene.start != "." and gene.end != "." and sequence:
if 0 <= gene.start <= gene.end: # type: ignore
location = GeneSequenceLocation(
start=gene.start - 1, # type: ignore
end=gene.end, # type: ignore
if 0 <= gene.start <= gene.end:
return StoredSequenceLocation(
start=gene.start - 1,
end=gene.end,
sequence_id=sequence,
).model_dump() # type: ignore
)
else:
_logger.warning(
f"{params['concept_id']} has invalid interval:"
f"start={gene.start - 1} end={gene.end}"
) # type: ignore
return location
f"{concept_id} has invalid interval: start={gene.start - 1} end={gene.end}"
)
49 changes: 22 additions & 27 deletions src/gene/etl/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,12 @@
from gffutils.feature import Feature

from gene.etl.base import Base, GeneNormalizerEtlError
from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand
from gene.schemas import (
DataLicenseAttributes,
NamespacePrefix,
SourceMeta,
Strand,
)

_logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -66,22 +71,23 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
:param accession_numbers: Accession numbers for each chromosome and scaffold
:return: A gene dictionary containing data if the ID attribute exists.
"""
gene = dict()
gene_params = dict()
if f.strand == "-":
gene["strand"] = Strand.REVERSE.value
gene_params["strand"] = Strand.REVERSE.value
elif f.strand == "+":
gene["strand"] = Strand.FORWARD.value
gene["src_name"] = SourceName.ENSEMBL.value
gene_params["strand"] = Strand.FORWARD.value

self._add_attributes(f, gene)
location = self._add_location(f, gene, accession_numbers)
self._add_attributes(f, gene_params)
location = self._build_sequence_location(
accession_numbers[f.seqid], f, gene_params["concept_id"]
)
if location:
gene["locations"] = [location]
gene_params["locations"] = [location]

gene["label_and_type"] = f"{gene['concept_id'].lower()}##identity"
gene["item_type"] = "identity"
gene_params["label_and_type"] = f"{gene_params['concept_id'].lower()}##identity"
gene_params["item_type"] = "identity"

return gene
return gene_params

def _add_attributes(self, f: Feature, gene: Dict) -> None:
"""Add concept_id, symbol, xrefs, and associated_with to a gene record.
Expand Down Expand Up @@ -132,17 +138,6 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None:

gene[attributes[key]] = val

def _add_location(self, f: Feature, gene: Dict, accession_numbers: Dict) -> Dict:
"""Add GA4GH SequenceLocation to a gene record.
https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation
:param f: A gene from the data
:param gene: A transformed gene record
:param accession_numbers: Accession numbers for each chromosome and scaffold
:return: gene record dictionary with location added
"""
return self._get_sequence_location(accession_numbers[f.seqid], f, gene)

def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
"""Get xref or associated_with concept.
Expand Down Expand Up @@ -181,11 +176,11 @@ def _add_meta(self) -> None:
"genome_annotations": f"ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz"
},
rdp_url=None,
data_license_attributes={
"non_commercial": False,
"share_alike": False,
"attribution": False,
},
data_license_attributes=DataLicenseAttributes(
non_commercial=False,
share_alike=False,
attribution=False,
),
genome_assemblies=[self._assembly],
)

Expand Down
24 changes: 12 additions & 12 deletions src/gene/etl/hgnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,19 +30,16 @@ def _transform_data(self) -> None:
records = data["response"]["docs"]

for r in records:
gene = dict()
gene["concept_id"] = r["hgnc_id"].lower()
gene["label_and_type"] = f"{gene['concept_id']}##identity"
gene["item_type"] = "identity"
gene["symbol"] = r["symbol"]
gene["label"] = r["name"]
gene["src_name"] = SourceName.HGNC.value
gene = {
"concept_id": r["hgnc_id"].lower(),
"symbol": r["symbol"],
"label": r["name"],
}
if r["status"]:
if r["status"] == "Approved":
gene["symbol_status"] = SymbolStatus.APPROVED.value
elif r["status"] == "Entry Withdrawn":
gene["symbol_status"] = SymbolStatus.WITHDRAWN.value
gene["src_name"] = SourceName.HGNC.value

# store alias, xref, associated_with, prev_symbols, location
self._get_aliases(r, gene)
Expand Down Expand Up @@ -83,7 +80,7 @@ def _get_previous_symbols(self, r: Dict, gene: Dict) -> None:
if prev_symbols:
gene["previous_symbols"] = list(set(prev_symbols))

def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
def _get_xrefs_associated_with(self, record: Dict, gene: Dict) -> None:
"""Store xrefs and/or associated_with refs in a gene record.
:param record: A gene record in the HGNC data file
Expand Down Expand Up @@ -119,7 +116,7 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
]

for src in sources:
if src in r:
if src in record:
if "-" in src:
key = src.split("-")[0]
elif "." in src:
Expand All @@ -131,9 +128,11 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:

if key.upper() in NamespacePrefix.__members__:
if NamespacePrefix[key.upper()].value in PREFIX_LOOKUP.keys():
self._get_xref_associated_with(key, src, r, xrefs)
self._get_xref_associated_with(key, src, record, xrefs)
else:
self._get_xref_associated_with(key, src, r, associated_with)
self._get_xref_associated_with(
key, src, record, associated_with
)
else:
_logger.warning(f"{key} not in schemas.py")

Expand Down Expand Up @@ -202,6 +201,7 @@ def _set_annotation(self, loc: str, gene: Dict) -> None:
:param gene: in-progress gene record
:return: A bool whether or not a gene map location is provided
"""
breakpoint()
annotations = {v.value for v in Annotation.__members__.values()}

for annotation in annotations:
Expand Down
13 changes: 6 additions & 7 deletions src/gene/etl/ncbi.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
NamespacePrefix,
SourceMeta,
SourceName,
StoredSequenceLocation,
SymbolStatus,
)

Expand Down Expand Up @@ -191,11 +192,10 @@ def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None:
if f_id.startswith("gene"):
symbol = f.attributes["Name"][0]
if symbol in info_genes:
# Just need to add SequenceLocation
params = info_genes.get(symbol)
params: Dict = info_genes.get(symbol) # type: ignore
vrs_sq_location = self._get_vrs_sq_location(db, params, f_id)
if vrs_sq_location:
params["locations"].append(vrs_sq_location) # type: ignore
params["locations"].append(vrs_sq_location)
else:
# Need to add entire gene
gene = self._add_gff_gene(db, f, f_id)
Expand All @@ -212,7 +212,6 @@ def _add_gff_gene(
:return: A gene dictionary if the ID attribute exists. Else return None.
"""
params = dict()
params["src_name"] = SourceName.NCBI.value
self._add_attributes(f, params)
sq_loc = self._get_vrs_sq_location(db, params, f_id)
if sq_loc:
Expand Down Expand Up @@ -245,18 +244,18 @@ def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None:

def _get_vrs_sq_location(
self, db: gffutils.FeatureDB, params: Dict, f_id: str
) -> Dict:
) -> Optional[StoredSequenceLocation]:
"""Store GA4GH VRS SequenceLocation in a gene record.
https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation
:param db: GFF database
:param params: A transformed gene record
:param f_id: The feature's ID
:return: A GA4GH VRS SequenceLocation
:return: A storable set of SequenceLocation params, or None if a location could not be built
"""
gene = db[f_id]
params["strand"] = gene.strand
return self._get_sequence_location(gene.seqid, gene, params)
return self._build_sequence_location(gene.seqid, gene, params["concept_id"])

def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
"""Get xref or associated_with ref.
Expand Down
27 changes: 13 additions & 14 deletions src/gene/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,12 @@ class MatchType(IntEnum):
PREV_SYMBOL = 80
ALIAS = 60
XREF = 60
ASSOCIATED_WITH = 60
FUZZY_MATCH = 20
NO_MATCH = 0


class GeneSequenceLocation(BaseModel):
"""Sequence Location model when storing in DynamoDB."""
class StoredSequenceLocation(BaseModel):
"""Sequence Location model when storing in database."""

type: Literal["SequenceLocation"] = "SequenceLocation"
start: StrictInt
Expand All @@ -73,7 +72,7 @@ class GeneSequenceLocation(BaseModel):


# class GeneChromosomeLocation(BaseModel):
# """Chromosome Location model when storing in DynamDB."""
# """Chromosome Location model when storing in database."""

# type: Literal["ChromosomeLocation"] = "ChromosomeLocation"
# species_id: Literal["taxonomy:9606"] = "taxonomy:9606"
Expand All @@ -94,14 +93,13 @@ class BaseGene(BaseModel):
strand: Optional[Strand] = None
location_annotations: List[StrictStr] = []
locations: Union[
List[models.SequenceLocation], List[GeneSequenceLocation]
List[models.SequenceLocation], List[StoredSequenceLocation]
# List[Union[SequenceLocation, ChromosomeLocation]],
# List[Union[GeneSequenceLocation, GeneChromosomeLocation]] # dynamodb
] = []
aliases: List[StrictStr] = []
previous_symbols: List[StrictStr] = []
xrefs: List[CURIE] = []
associated_with: List[CURIE] = []
gene_type: Optional[StrictStr] = None


Expand All @@ -123,7 +121,6 @@ class Gene(BaseGene):
"strand": "-",
"locations": [],
"location_annotations": [],
"associated_with": [],
"gene_type": None,
"match_type": 100,
}
Expand Down Expand Up @@ -242,7 +239,6 @@ class RefType(str, Enum):
PREVIOUS_SYMBOLS = "prev_symbol"
ALIASES = "alias"
XREFS = "xref"
ASSOCIATED_WITH = "associated_with"


# collective name to singular name, e.g. {"previous_symbols": "prev_symbol"}
Expand All @@ -257,7 +253,7 @@ class SourceMeta(BaseModel):
version: StrictStr
data_url: Dict[StrictStr, StrictStr] # TODO strictness necessary?
rdp_url: Optional[StrictStr] = None
data_license_attributes: Dict[StrictStr, StrictBool]
data_license_attributes: DataLicenseAttributes
genome_assemblies: List[StrictStr] = []

model_config = ConfigDict(
Expand Down Expand Up @@ -561,8 +557,9 @@ class UnmergedNormalizationService(BaseNormalizationService):
],
"aliases": ["3.1.1.7"],
"previous_symbols": ["YT"],
"xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"],
"associated_with": [
"xrefs": [
"ncbigene:43",
"ensembl:ENSG00000087085",
"ucsc:uc003uxi.4",
"vega:OTTHUMG00000157033",
"merops:S09.979",
Expand Down Expand Up @@ -618,7 +615,6 @@ class UnmergedNormalizationService(BaseNormalizationService):
"aliases": [],
"previous_symbols": [],
"xrefs": ["hgnc:108"],
"associated_with": [],
"gene_type": "protein_coding",
}
],
Expand Down Expand Up @@ -669,8 +665,11 @@ class UnmergedNormalizationService(BaseNormalizationService):
],
"aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"],
"previous_symbols": ["ACEE"],
"xrefs": ["hgnc:108", "ensembl:ENSG00000087085"],
"associated_with": ["omim:100740"],
"xrefs": [
"hgnc:108",
"ensembl:ENSG00000087085",
"omim:100740",
],
"gene_type": "protein-coding",
}
],
Expand Down

0 comments on commit 0c1303f

Please sign in to comment.