Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix NCBI SRA/Andersen ingest #130

Merged
merged 3 commits into from
Feb 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 40 additions & 13 deletions ingest/build-configs/ncbi/bin/curate-andersen-lab-data
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,35 @@ def create_new_record(anderson_record: dict) -> dict:

def use_date_when_available(andersen_record: dict) -> str:
"""
Give the old date field `Date` precedence since they are more specific
Give the old date field `Date` precedence since it is more specific.

If using the new date field `Collection_Date`, then verify that it's a
valid date.
"""
old_date_field = andersen_record.get("Date", "")
old_date_uncertain = "NA" in old_date_field or "?" in old_date_field

if old_date_field and not old_date_uncertain:
return old_date_field

return andersen_record["Collection_Date"]
new_date = andersen_record["Collection_Date"]
if new_date.lower() == "missing":
new_date = "XXXX-XX-XX"

# Certain dates are date ranges, e.g. "2022-04-22/2022-04-24"
# Only keep the first date for our metadata
date_range_pattern = r"([\d]{4}-[\d]{2}-[\d]{2})\/[\d]{4}-[\d]{2}-[\d]{2}"
matches = re.match(date_range_pattern, new_date)
if matches:
new_date = matches.group(1)

try:
parse_date(new_date)
except ValueError as err:
print(f"WARNING: {err}", file=stderr)
new_date = "XXXX-XX-XX"

return new_date


def parse_center_name(center_name: str) -> str:
Expand All @@ -88,8 +108,9 @@ def parse_center_name(center_name: str) -> str:

def construct_strain_name(record: dict, sample_id: str) -> str:
"""
Construct a strain name for the *sample_id* using metadata from the *record*
to include host, country, and year.
Check if the *sample_id* follows a strain name pattern, otherwise
construct a strain name for the *sample_id* using metadata from the
*record* to include host, country, and year.

Removes all spaces in the constructed strain name because they are not
allowed in the downstream phylogenetic workflow. Also replaces invalid
Expand All @@ -99,26 +120,32 @@ def construct_strain_name(record: dict, sample_id: str) -> str:
¹ <https://github.com/iqtree/iqtree2/blob/74da454bbd98d6ecb8cb955975a50de59785fbde/utils/tools.cpp#L607>
² <https://github.com/nextstrain/avian-flu/issues/113>
"""
host = record['host']
country = record['country']
year = parse_year(record['date'])
strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
# A/<host>/<location>/<sample_id>/<year>
strain_pattern = r"(A\/.+\/.+\/.+\/[\d]{4})"
matches = re.search(strain_pattern, sample_id)
if matches:
strain = matches.group(1)
else:
host = record['host']
country = record['country']
year = str(parse_date(record['date']).year)
strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
return re.sub(r'[^\w\_\-\.\|\/]', '_', strain)


def parse_year(date_string: str) -> str:
def parse_date(date_string: str) -> datetime:
"""
Parse the year from the provided `date_string`
Parse the provided `date_string` as a datetime object.
"""
date_formats = ['%Y-%m-%d', '%Y']
date_formats = ['%Y-%m-%d', '%Y', 'XXXX-XX-XX']
for date_format in date_formats:
try:
parsed_date = datetime.strptime(date_string, date_format)
return str(parsed_date.year)
return parsed_date
except ValueError:
continue

raise ValueError(f"Could not parse year from date string {date_string!r}")
raise ValueError(f"Could not parse date from date string {date_string!r}")


if __name__ == '__main__':
Expand Down
91 changes: 90 additions & 1 deletion ingest/build-configs/ncbi/defaults/host-map.tsv
Original file line number Diff line number Diff line change
@@ -1,114 +1,201 @@
# Check for bird codes in https://www.carolinabirdclub.org/bandcodes.html
# Note that the first column in this TSV is case insensitive
Accipiter cooperii Avian
Accipiter striatus Avian
Accipitridae Avian
Accipitridae sp. Avian
Aix sponsa Avian
alpaca Nonhuman Mammal
american crow Avian
AMERICAN ROBIN Avian
Turdus migratorius Avian
american wigeon Avian
Anas acuta Avian
Anas crecca Avian
Anas rubripes Avian
Anas platyrhynchos Avian
Anas sp. Avian
Anatidae Avian
Anser albifrons Avian
Anser anser Avian
Anser caerulescens Avian
Anser rossii Avian
Anser sp. Avian
Antigone canadensis Avian
Ardea herodias Avian
Arenaria interpres Avian
Arinae sp. Avian
Aves sp. Avian
Aythya affinis Avian
Aythya collaris Avian
Aythya americana Avian
bald eagle Avian
black billed magpie Avian
blackbird Avian
black scoter Avian
Black vulture Avian
BLUE-WINGED TEAL Avian
BOBCAT Nonhuman Mammal
Bos taurus Cattle
BOTTLENOSE DOLPHIN Nonhuman Mammal
brandt goose Avian
Branta canadensis Avian
Bubo scandiacus Avian
Bubo virginianus Avian
Bucephala clangula Avian
Buteo jamaicensis Avian
Buteo lineatus Avian
CACKLING GOOSE Avian
cago Avian
Cairina moschata Avian
Calidris alba Avian
Callipepla sp. Avian
canada goose Avian
Capra hircus Nonhuman Mammal
cat Nonhuman Mammal
Cathartes aura Avian
Cathartidae sp. Avian
cattle Cattle
CATTLE MILK PRODUCT Cattle
CHEETAH Nonhuman Mammal
Chenonetta jubata Avian
chicken Avian
Columba livia Avian
Columbidae Avian
Common Eider Avian
common loon Avian
common raven Avian
comon-grackle Avian
Coragyps atratus Avian
Corvidae sp. Avian
Corvus Avian
Corvus brachyrhynchos Avian
Corvus corax Avian
Corvus ossifragus Avian
Corvus sp. Avian
CRANE Avian
CRESTED CARACARA Avian
Cygnus atratus Avian
Cygnus buccinator Avian
Cygnus columbianus Avian
Cygnus olor Avian
Cygnus sp. Avian
Dairy cattle Cattle
domestic cat Nonhuman Mammal
domestic-cat Nonhuman Mammal
Dove Avian
Dromaius novaehollandiae Avian
duck Avian
EMU Avian
Emu Avian
environment Environment
ERMINE Nonhuman Mammal
EURASIAN COLLARED DOVE Avian
Falco peregrinus Avian
Feliformia Nonhuman Mammal
feline Nonhuman Mammal
Felis catus Nonhuman Mammal
FLAMINGO Avian
FOX Nonhuman Mammal
GADWALL Avian
Gallus gallus Avian
ganada goose Avian
Gavia immer Avian
GEOFFROY'S CAT Nonhuman Mammal
goat Nonhuman Mammal
goose Avian
grackle Avian
Great Black-Backed Gull Avian
Great Black-backed Gulll Avian
great horned owl Avian
GREEN-WINGED TEAL Avian
Guinea Fowl Avian
GUINEAFOWL Avian
gull Avian
Haliaeetus leucocephalus Avian
harris hawk Avian
harris-hawk Avian
hawk Avian
Herring Gull Avian
Homo sapiens Human
HOSP Avian
HOUSE SPARROW Avian
House-Mouse Nonhuman Mammal
Icteridae Avian
Laridae sp. Avian
Larus argentatus Avian
Larus delawarensis Avian
Larus occidentalis Avian
Leopardus geoffroyi Nonhuman Mammal
Leptailurus serval Nonhuman Mammal
lesser scaup Avian
LION Nonhuman Mammal
Lophodytes cucullatus Avian
LYNX Nonhuman Mammal
Lynx rufus Nonhuman Mammal
mallard Avian
Mareca americana Avian
Mareca strepera Avian
Meleagris gallopavo Avian
Melegris gallopavo Avian
Mephitidae Nonhuman Mammal
Mephitis mephitis Nonhuman Mammal
Mergus merganser Avian
mountain lion Nonhuman Mammal
mountain_lion Nonhuman Mammal
Mus musculus Nonhuman Mammal
mute swan Avian
NORTHERN SHOVELER Avian
Numididae Avian
Numididae sp. Avian
Panthera leo Nonhuman Mammal
Panthera tigris Nonhuman Mammal
Passer domesticus Avian
Pavo Avian
PEAFOWL Avian
pefa Avian
Pelecanidae Avian
Pelecanidae sp. Avian
Pelecanus erythrorhynchos Avian
Pelecanus occidentalis Avian
PELICAN Avian
pigeon Avian
Phalacrocoracidae sp. Avian
Phasianidae sp. Avian
Phasianinae Avian
Phasianus colchicus Avian
PHEASANT Avian
Pig Nonhuman Mammal
Podiceps auritus Avian
Procyon lotor Nonhuman Mammal
Puma concolor Nonhuman Mammal
Quiscalus quiscula Avian
raccoon Nonhuman Mammal
red fox Nonhuman Mammal
red tailed hawk Avian
Red-Tailed Hawk Avian
Rock Pigeon Avian
Sanderling Avian
SAVANNAH CAT Nonhuman Mammal
Scoter Avian
SERVAL Nonhuman Mammal
Sibirionetta formosa Avian
skunk Nonhuman Mammal
snow goose Avian
SNOWY OWL Avian
Spatula clypeata Avian
Spatula discors Avian
Streptopelia decaocto Avian
Strigiformes sp. Avian
Strix varia Avian
surf scoter Avian
Sus scrofa Nonhuman Mammal
SWALLOW Avian
SWAN Avian
tiger Nonhuman Mammal
Thalasseus maximus Avian
TRUMPETER SWAN Avian
Turdus merula Avian
Turdus migratorius Avian
turkey Avian
turkey vulture Avian
Vicugna pacos Nonhuman Mammal
Expand All @@ -118,5 +205,7 @@ western kingbird Avian
western sandpiper Avian
WHITE-FACED IBIS Avian
White-winged Dove Avian
White-winged Scoter Avian
Wild-Bird Avian
WOOD DUCK Avian
Zenaida asiatica Avian