Skip to content

Commit

Permalink
curate-andersen-lab-data: Ensure dates are valid
Browse files Browse the repository at this point in the history
New data now include collection dates that are the string
"missing" or date ranges (e.g. "2022-04-22/2022-04-24").
Ensure that the returned date is a valid date that can be used to
construct the strain name.

Resolves <#129>
  • Loading branch information
joverlee521 committed Feb 18, 2025
1 parent cec595d commit 636a680
Showing 1 changed file with 28 additions and 8 deletions.
36 changes: 28 additions & 8 deletions ingest/build-configs/ncbi/bin/curate-andersen-lab-data
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,35 @@ def create_new_record(anderson_record: dict) -> dict:

def use_date_when_available(andersen_record: dict) -> str:
    """
    Pick the collection date to use for an Andersen lab record.

    The old date field `Date` takes precedence since it is more specific;
    it is returned unchanged whenever it is non-empty and contains neither
    "NA" nor "?".

    Otherwise the new date field `Collection_Date` is used after verifying
    that it is a valid date:

    * the string "missing" (any letter case) becomes "XXXX-XX-XX"
    * a date range, e.g. "2022-04-22/2022-04-24", is reduced to its first date
    * any value rejected by `parse_date` is reported on stderr and replaced
      with "XXXX-XX-XX"

    Raises `KeyError` if the fallback is needed and the record has no
    `Collection_Date` key.
    """
    placeholder = "XXXX-XX-XX"

    # `or ""` guards against a key that is present but holds None.
    old_date_field = andersen_record.get("Date") or ""
    old_date_uncertain = "NA" in old_date_field or "?" in old_date_field
    if old_date_field and not old_date_uncertain:
        return old_date_field

    # Tolerate a None value and stray whitespace so that an otherwise valid
    # date is not discarded by the validation below.
    new_date = (andersen_record["Collection_Date"] or "").strip()
    if new_date.lower() == "missing":
        return placeholder

    # Certain dates are date ranges, e.g. "2022-04-22/2022-04-24"
    # Only keep the first date for our metadata
    date_range_pattern = r"(\d{4}-\d{2}-\d{2})/\d{4}-\d{2}-\d{2}"
    matches = re.match(date_range_pattern, new_date)
    if matches:
        new_date = matches.group(1)

    # The date is later used to construct the strain name, so anything that
    # cannot be parsed is downgraded to the placeholder instead of passed on.
    try:
        parse_date(new_date)
    except ValueError as err:
        print(f"WARNING: {err}", file=stderr)
        return placeholder

    return new_date


def parse_center_name(center_name: str) -> str:
Expand All @@ -101,24 +121,24 @@ def construct_strain_name(record: dict, sample_id: str) -> str:
"""
host = record['host']
country = record['country']
year = parse_year(record['date'])
year = str(parse_date(record['date']).year)
strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
return re.sub(r'[^\w\_\-\.\|\/]', '_', strain)


def parse_date(date_string: str) -> datetime:
    """
    Parse the provided `date_string` as a datetime object.

    Accepted inputs are a full date (`%Y-%m-%d`), a bare year (`%Y`) and the
    masked placeholder `XXXX-XX-XX`. The placeholder is passed to `strptime`
    as a literal format with no directives, so it parses successfully to
    `strptime`'s default of 1900-01-01; callers reading `.year` from the
    result will see 1900 for placeholder dates.

    Raises `ValueError` if `date_string` matches none of the accepted formats.
    """
    date_formats = ('%Y-%m-%d', '%Y', 'XXXX-XX-XX')
    for date_format in date_formats:
        try:
            return datetime.strptime(date_string, date_format)
        except ValueError:
            # Not this format; try the next one.
            continue

    raise ValueError(f"Could not parse date from date string {date_string!r}")


if __name__ == '__main__':
Expand Down

0 comments on commit 636a680

Please sign in to comment.