Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work around failures in ENA XML retrieval to still produce useful info #60

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 49 additions & 34 deletions ffq/ffq.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,59 +81,67 @@ def validate_accessions(accessions, search_types):
return IDs


def parse_run(soup):
def parse_run(soup, accession = None):
"""Given a BeautifulSoup object representing a run, parse out relevant
information.

:param soup: a BeautifulSoup object representing a run
:type soup: bs4.BeautifulSoup
:type accession: str

:return: a dictionary containing run information
:rtype: dict
"""
accession = soup.find("PRIMARY_ID", text=RUN_PARSER).text
experiment = (
soup.find("PRIMARY_ID", text=EXPERIMENT_PARSER).text
if soup.find("PRIMARY_ID", text=EXPERIMENT_PARSER)
else soup.find("EXPERIMENT_REF")["accession"]
)

title = None
sample = None
study = None
experiment = None
ftp_files = None
attributes = {}

study_parsed = soup.find("ID", text=PROJECT_PARSER)
if study_parsed:
study = study_parsed.text
if soup is None:
accession = accession
else:
# logger.warning(
# 'Failed to parse study information from ENA XML. Falling back to '
# 'ENA search...'
# )
accession = soup.find("PRIMARY_ID", text=RUN_PARSER).text
experiment = (
soup.find("PRIMARY_ID", text=EXPERIMENT_PARSER).text
if soup.find("PRIMARY_ID", text=EXPERIMENT_PARSER)
else soup.find("EXPERIMENT_REF")["accession"]
)

study_parsed = soup.find("ID", text=PROJECT_PARSER)
if study_parsed:
study = study_parsed.text

sample_parsed = soup.find("ID", text=SAMPLE_PARSER)
if sample_parsed:
sample = sample_parsed.text

title = soup.find("TITLE").text

for attr in soup.find_all("RUN_ATTRIBUTE"):
try:
tag = attr.find("TAG").text
value = attr.find("VALUE").text
attributes[tag] = value
except: # noqa
pass
ftp_files = get_files_metadata_from_run(soup)

if study is None:
study = search_ena_run_study(accession)
sample_parsed = soup.find("ID", text=SAMPLE_PARSER)
if sample_parsed:
sample = sample_parsed.text
else:
# logger.warning(
# 'Failed to parse sample information from ENA XML. Falling back to '
# 'ENA search...'
# )

if sample is None:
sample = search_ena_run_sample(accession)
title = soup.find("TITLE").text

attributes = {}

for attr in soup.find_all("RUN_ATTRIBUTE"):
try:
tag = attr.find("TAG").text
value = attr.find("VALUE").text
attributes[tag] = value
except: # noqa
pass
if attributes:
try:
attributes["ENA-SPOT-COUNT"] = int(attributes["ENA-SPOT-COUNT"])
attributes["ENA-BASE-COUNT"] = int(attributes["ENA-BASE-COUNT"])
except: # noqa
pass
ftp_files = get_files_metadata_from_run(soup)

# print(ftp_files)
# ftp_files = [file for file in ftp_files if accession in file['url']]
# print(ftp_files)
Expand Down Expand Up @@ -389,8 +397,15 @@ def ffq_run(accession, level=0): # noqa
:return: dictionary of run information
:rtype: dict
"""

logger.info(f"Parsing run {accession}")
run = parse_run(get_xml(accession))
try:
soup = get_xml(accession)
except InvalidAccession:
logger.error(f"Bad response for {accession} from ENA, proceeding without ENA data...")
soup = None

run = parse_run(soup = soup, accession = accession)
return run


Expand Down