diff --git a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data index d847e09..135e175 100755 --- a/ingest/build-configs/ncbi/bin/curate-andersen-lab-data +++ b/ingest/build-configs/ncbi/bin/curate-andersen-lab-data @@ -68,7 +68,10 @@ def create_new_record(anderson_record: dict) -> dict: def use_date_when_available(andersen_record: dict) -> str: """ - Give the old date field `Date` precedence since they are more specific + Give the old date field `Date` precedence since they are more specific. + + If using the new date field `Collection_Date`, then verify that it's a + valid date. """ old_date_field = andersen_record.get("Date", "") old_date_uncertain = "NA" in old_date_field or "?" in old_date_field @@ -76,7 +79,24 @@ def use_date_when_available(andersen_record: dict) -> str: if old_date_field and not old_date_uncertain: return old_date_field - return andersen_record["Collection_Date"] + new_date = andersen_record["Collection_Date"] + if new_date.lower() == "missing": + new_date = "XXXX-XX-XX" + + # Certain dates are date ranges, e.g. "2022-04-22/2022-04-24" + # Only keep the first date for our metadata + date_range_pattern = r"([\d]{4}-[\d]{2}-[\d]{2})\/[\d]{4}-[\d]{2}-[\d]{2}" + matches = re.match(date_range_pattern, new_date) + if matches: + new_date = matches.group(1) + + try: + parse_date(new_date) + except ValueError as err: + print(f"WARNING: {err}", file=stderr) + new_date = "XXXX-XX-XX" + + return new_date def parse_center_name(center_name: str) -> str: @@ -88,8 +108,9 @@ def parse_center_name(center_name: str) -> str: def construct_strain_name(record: dict, sample_id: str) -> str: """ - Construct a strain name for the *sample_id* using metadata from the *record* - to include host, country, and year. + Check if the *sample_id* is follows a strain name pattern, otherwise + construct a strain name for the *sample_id* using metadata from the + *record* to include host, country, and year. Removes all spaces in the constructed strain name because they are not allowed in the downstream phylogenetic workflow. Also replaces invalid @@ -99,26 +120,32 @@ def construct_strain_name(record: dict, sample_id: str) -> str: ¹ ² """ - host = record['host'] - country = record['country'] - year = parse_year(record['date']) - strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "") + # A//// + strain_pattern = r"(A\/.+\/.+\/.+\/[\d]{4})" + matches = re.search(strain_pattern, sample_id) + if matches: + strain = matches.group(1) + else: + host = record['host'] + country = record['country'] + year = str(parse_date(record['date']).year) + strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "") return re.sub(r'[^\w\_\-\.\|\/]', '_', strain) -def parse_year(date_string: str) -> str: +def parse_date(date_string: str) -> datetime: """ - Parse the year from the provided `date_string` + Parse the provided `date_string` as a datetime object. """ - date_formats = ['%Y-%m-%d', '%Y'] + date_formats = ['%Y-%m-%d', '%Y', 'XXXX-XX-XX'] for date_format in date_formats: try: parsed_date = datetime.strptime(date_string, date_format) - return str(parsed_date.year) + return parsed_date except ValueError: continue - raise ValueError(f"Could not parse year from date string {date_string!r}") + raise ValueError(f"Could not parse date from date string {date_string!r}") if __name__ == '__main__': diff --git a/ingest/build-configs/ncbi/defaults/host-map.tsv b/ingest/build-configs/ncbi/defaults/host-map.tsv index 5208ebf..0855b4b 100644 --- a/ingest/build-configs/ncbi/defaults/host-map.tsv +++ b/ingest/build-configs/ncbi/defaults/host-map.tsv @@ -1,98 +1,170 @@ # Check for bird codes in https://www.carolinabirdclub.org/bandcodes.html # Note that the first column in this TSV is case insensitive +Accipiter cooperii Avian +Accipiter striatus Avian Accipitridae Avian +Accipitridae sp. Avian +Aix sponsa Avian alpaca Nonhuman Mammal american crow Avian AMERICAN ROBIN Avian -Turdus migratorius Avian american wigeon Avian +Anas acuta Avian +Anas crecca Avian +Anas rubripes Avian Anas platyrhynchos Avian +Anas sp. Avian Anatidae Avian +Anser albifrons Avian +Anser anser Avian Anser caerulescens Avian +Anser rossii Avian +Anser sp. Avian +Antigone canadensis Avian +Ardea herodias Avian Arenaria interpres Avian +Arinae sp. Avian +Aves sp. Avian +Aythya affinis Avian +Aythya collaris Avian Aythya americana Avian bald eagle Avian black billed magpie Avian blackbird Avian +black scoter Avian +Black vulture Avian +BLUE-WINGED TEAL Avian BOBCAT Nonhuman Mammal Bos taurus Cattle +BOTTLENOSE DOLPHIN Nonhuman Mammal +brandt goose Avian Branta canadensis Avian +Bubo scandiacus Avian Bubo virginianus Avian +Bucephala clangula Avian Buteo jamaicensis Avian +Buteo lineatus Avian +CACKLING GOOSE Avian cago Avian +Cairina moschata Avian Calidris alba Avian +Callipepla sp. Avian canada goose Avian Capra hircus Nonhuman Mammal cat Nonhuman Mammal Cathartes aura Avian +Cathartidae sp. Avian cattle Cattle CATTLE MILK PRODUCT Cattle +CHEETAH Nonhuman Mammal Chenonetta jubata Avian chicken Avian Columba livia Avian Columbidae Avian +Common Eider Avian +common loon Avian common raven Avian comon-grackle Avian +Coragyps atratus Avian +Corvidae sp. Avian Corvus Avian Corvus brachyrhynchos Avian Corvus corax Avian +Corvus ossifragus Avian +Corvus sp. Avian +CRANE Avian +CRESTED CARACARA Avian +Cygnus atratus Avian +Cygnus buccinator Avian +Cygnus columbianus Avian Cygnus olor Avian +Cygnus sp. Avian Dairy cattle Cattle domestic cat Nonhuman Mammal domestic-cat Nonhuman Mammal +Dove Avian Dromaius novaehollandiae Avian duck Avian EMU Avian Emu Avian environment Environment +ERMINE Nonhuman Mammal EURASIAN COLLARED DOVE Avian Falco peregrinus Avian Feliformia Nonhuman Mammal feline Nonhuman Mammal Felis catus Nonhuman Mammal +FLAMINGO Avian FOX Nonhuman Mammal +GADWALL Avian Gallus gallus Avian ganada goose Avian +Gavia immer Avian GEOFFROY'S CAT Nonhuman Mammal goat Nonhuman Mammal goose Avian grackle Avian +Great Black-Backed Gull Avian +Great Black-backed Gulll Avian great horned owl Avian +GREEN-WINGED TEAL Avian +Guinea Fowl Avian +GUINEAFOWL Avian +gull Avian Haliaeetus leucocephalus Avian harris hawk Avian harris-hawk Avian hawk Avian +Herring Gull Avian Homo sapiens Human HOSP Avian HOUSE SPARROW Avian House-Mouse Nonhuman Mammal Icteridae Avian +Laridae sp. Avian +Larus argentatus Avian +Larus delawarensis Avian Larus occidentalis Avian Leopardus geoffroyi Nonhuman Mammal Leptailurus serval Nonhuman Mammal +lesser scaup Avian LION Nonhuman Mammal Lophodytes cucullatus Avian LYNX Nonhuman Mammal Lynx rufus Nonhuman Mammal mallard Avian +Mareca americana Avian +Mareca strepera Avian Meleagris gallopavo Avian +Melegris gallopavo Avian Mephitidae Nonhuman Mammal Mephitis mephitis Avian +Mergus merganser Avian mountain lion Nonhuman Mammal mountain_lion Nonhuman Mammal Mus musculus Nonhuman Mammal mute swan Avian +NORTHERN SHOVELER Avian Numididae Avian Numididae sp. Avian Panthera leo Nonhuman Mammal Panthera tigris Nonhuman Mammal Passer domesticus Avian Pavo Avian +PEAFOWL Avian pefa Avian Pelecanidae Avian +Pelecanidae sp. Avian Pelecanus erythrorhynchos Avian +Pelecanus occidentalis Avian +PELICAN Avian pigeon Avian +Phalacrocoracidae sp. Avian +Phasianidae sp. Avian Phasianinae Avian +Phasianus colchicus Avian +PHEASANT Avian +Pig Nonhuman Mammal Podiceps auritus Avian Procyon lotor Nonhuman Mammal Puma concolor Nonhuman Mammal @@ -100,15 +172,30 @@ Quiscalus quiscula Avian raccoon Nonhuman Mammal red fox Nonhuman Mammal red tailed hawk Avian +Red-Tailed Hawk Avian Rock Pigeon Avian +Sanderling Avian SAVANNAH CAT Nonhuman Mammal +Scoter Avian SERVAL Nonhuman Mammal +Sibirionetta formosa Avian skunk Nonhuman Mammal snow goose Avian +SNOWY OWL Avian +Spatula clypeata Avian +Spatula discors Avian Streptopelia decaocto Avian +Strigiformes sp. Avian +Strix varia Avian +surf scoter Avian Sus scrofa Nonhuman Mammal +SWALLOW Avian +SWAN Avian tiger Nonhuman Mammal +Thalasseus maximus Avian +TRUMPETER SWAN Avian Turdus merula Avian +Turdus migratorius Avian turkey Avian turkey vulture Avian Vicugna pacos Nonhuman Mammal @@ -118,5 +205,7 @@ western kingbird Avian western sandpiper Avian WHITE-FACED IBIS Avian White-winged Dove Avian +White-winged Scoter Avian Wild-Bird Avian +WOOD DUCK Avian Zenaida asiatica Avian