From 36b122c67d31d83875a4144edac75193c865212f Mon Sep 17 00:00:00 2001
From: Marco Giulini <54807167+mgiulini@users.noreply.github.com>
Date: Fri, 17 May 2024 09:49:47 +0200
Subject: [PATCH] removed defusedxml (#424)

* removed defusedxml

* refactor: Add temporary file handling to blast_remote function

move `parse_xml`

* add test_blast

* add golden data

* Add pytest-mock to dev dependencies

---------

Co-authored-by: Rodrigo V Honorato
Co-authored-by: Rodrigo Vargas Honorato
---
 pyproject.toml                |  1 -
 src/arctic3d/modules/blast.py | 24 ++++++---
 tests/golden_data/1crn.fasta  |  2 +
 tests/golden_data/blast.xml   | 97 +++++++++++++++++++++++++++++++++++
 tests/test_blast.py           | 43 ++++++++++++++++
 5 files changed, 160 insertions(+), 7 deletions(-)
 create mode 100644 tests/golden_data/1crn.fasta
 create mode 100755 tests/golden_data/blast.xml
 create mode 100644 tests/test_blast.py

diff --git a/pyproject.toml b/pyproject.toml
index f15d4a5..d678bf7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,6 @@ classifiers = [
 [tool.poetry.dependencies]
 python = ">=3.9,<3.12"
 bio = "1.6.2"
-defusedxml = "0.7.1"
 lxml = "5.2.1"
 mdanalysis = "2.7.0"
 requests = "2.31.0"
diff --git a/src/arctic3d/modules/blast.py b/src/arctic3d/modules/blast.py
index cfc0b09..cd94c89 100644
--- a/src/arctic3d/modules/blast.py
+++ b/src/arctic3d/modules/blast.py
@@ -1,13 +1,15 @@
 """Function to BLAST input sequence and return accession id."""
+
 import logging
 import os
 import shlex
 import shutil
 import subprocess
+import tempfile
 from pathlib import Path
 
 from Bio.Blast import NCBIWWW
-from defusedxml import lxml as ET
+from lxml import etree as ET
 
 log = logging.getLogger("arctic3d.log")
 
@@ -87,7 +89,7 @@ def blast_local(fasta_file, db):
     return uniprot_id
 
 
-def blast_remote(fasta_file):
+def blast_remote(fasta_file: str) -> str:
     """
     Blast sequence.
 
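@@ -106,12 +108,22 @@ def blast_remote(fasta_file):
         "blastp", "swissprot", fasta_file, hitlist_size=50
     )
 
-    # temp file for storing results
-    with open("blast_res.xml", "w") as save_output:
+    # TODO: Handle scenario in which the `qblast` call fails
+
+    with tempfile.NamedTemporaryFile(
+        mode="w+", delete=True, suffix=".xml"
+    ) as temp:
         blast_res = blast_res_handle.read()
-        save_output.write(blast_res)
+        temp.write(blast_res)
+        temp.flush()
+        accession_id = parse_xml(temp.name)
+
+    return accession_id
+
 
-    tree = ET.parse("blast_res.xml")
+def parse_xml(xml_file: str) -> str:
+    """Parse the BLAST XML file and return the first (?) accession ID."""
+    tree = ET.parse(source=xml_file, parser=ET.XMLParser(encoding="utf-8"))
     root = tree.getroot()
 
     # root [BlastOutput_iterations] [Iteration] [Iteration_hits] \
diff --git a/tests/golden_data/1crn.fasta b/tests/golden_data/1crn.fasta
new file mode 100644
index 0000000..a1fc338
--- /dev/null
+++ b/tests/golden_data/1crn.fasta
@@ -0,0 +1,2 @@
+>1CRN_1|Chain A|CRAMBIN|Crambe hispanica subsp. abyssinica (3721)
+TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN
\ No newline at end of file
diff --git a/tests/golden_data/blast.xml b/tests/golden_data/blast.xml
new file mode 100755
index 0000000..3fb0524
--- /dev/null
+++ b/tests/golden_data/blast.xml
@@ -0,0 +1,97 @@
+<?xml version="1.0"?>
+<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
+<BlastOutput>
+  <BlastOutput_program>blastp</BlastOutput_program>
+  <BlastOutput_version>BLASTP 2.15.0+</BlastOutput_version>
+  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
+  <BlastOutput_db>swissprot</BlastOutput_db>
+  <BlastOutput_query-ID>Query_544091</BlastOutput_query-ID>
+  <BlastOutput_query-def>1CRN_1|Chain A|CRAMBIN|Crambe hispanica subsp. abyssinica (3721)</BlastOutput_query-def>
+  <BlastOutput_query-len>46</BlastOutput_query-len>
+  <BlastOutput_param>
+    <Parameters>
+      <Parameters_matrix>BLOSUM62</Parameters_matrix>
+      <Parameters_expect>0.05</Parameters_expect>
+      <Parameters_gap-open>11</Parameters_gap-open>
+      <Parameters_gap-extend>1</Parameters_gap-extend>
+      <Parameters_filter>F</Parameters_filter>
+    </Parameters>
+  </BlastOutput_param>
+<BlastOutput_iterations>
+<Iteration>
+  <Iteration_iter-num>1</Iteration_iter-num>
+  <Iteration_query-ID>Query_544091</Iteration_query-ID>
+  <Iteration_query-def>1CRN_1|Chain A|CRAMBIN|Crambe hispanica subsp. abyssinica (3721)</Iteration_query-def>
+  <Iteration_query-len>46</Iteration_query-len>
+<Iteration_hits>
+<Hit>
+  <Hit_num>1</Hit_num>
+  <Hit_id>sp|P01542.2|</Hit_id>
+  <Hit_def>RecName: Full=Crambin [Crambe hispanica subsp. abyssinica]</Hit_def>
+  <Hit_accession>P01542</Hit_accession>
+  <Hit_len>46</Hit_len>
+  <Hit_hsps>
+    <Hsp>
+      <Hsp_num>1</Hsp_num>
+      <Hsp_bit-score>92.4337</Hsp_bit-score>
+      <Hsp_score>228</Hsp_score>
+      <Hsp_evalue>2.38732e-26</Hsp_evalue>
+      <Hsp_query-from>1</Hsp_query-from>
+      <Hsp_query-to>46</Hsp_query-to>
+      <Hsp_hit-from>1</Hsp_hit-from>
+      <Hsp_hit-to>46</Hsp_hit-to>
+      <Hsp_query-frame>0</Hsp_query-frame>
+      <Hsp_hit-frame>0</Hsp_hit-frame>
+      <Hsp_identity>45</Hsp_identity>
+      <Hsp_positive>46</Hsp_positive>
+      <Hsp_gaps>0</Hsp_gaps>
+      <Hsp_align-len>46</Hsp_align-len>
+      <Hsp_qseq>TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN</Hsp_qseq>
+      <Hsp_hseq>TTCCPSIVARSNFNVCRLPGTPEALCATYTGCIIIPGATCPGDYAN</Hsp_hseq>
+      <Hsp_midline>TTCCPSIVARSNFNVCRLPGTPEA+CATYTGCIIIPGATCPGDYAN</Hsp_midline>
+    </Hsp>
+  </Hit_hsps>
+</Hit>
+<Hit>
+  <Hit_num>2</Hit_num>
+  <Hit_id>sp|P01541.1|</Hit_id>
+  <Hit_def>RecName: Full=Denclatoxin-B [Dendrophthora clavata]</Hit_def>
+  <Hit_accession>P01541</Hit_accession>
+  <Hit_len>46</Hit_len>
+  <Hit_hsps>
+    <Hsp>
+      <Hsp_num>1</Hsp_num>
+      <Hsp_bit-score>53.9138</Hsp_bit-score>
+      <Hsp_score>128</Hsp_score>
+      <Hsp_evalue>3.34186e-11</Hsp_evalue>
+      <Hsp_query-from>2</Hsp_query-from>
+      <Hsp_query-to>46</Hsp_query-to>
+      <Hsp_hit-from>2</Hsp_hit-from>
+      <Hsp_hit-to>46</Hsp_hit-to>
+      <Hsp_query-frame>0</Hsp_query-frame>
+      <Hsp_hit-frame>0</Hsp_hit-frame>
+      <Hsp_identity>23</Hsp_identity>
+      <Hsp_positive>31</Hsp_positive>
+      <Hsp_gaps>0</Hsp_gaps>
+      <Hsp_align-len>45</Hsp_align-len>
+      <Hsp_qseq>TCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN</Hsp_qseq>
+      <Hsp_hseq>SCCPTTAARNQYNICRLPGTPRPVCAALSGCKIISGTGCPPGYRH</Hsp_hseq>
+      <Hsp_midline>+CCP+  AR+ +N+CRLPGTP  +CA  +GC II G  CP  Y +</Hsp_midline>
+    </Hsp>
+  </Hit_hsps>
+</Hit>
+</Iteration_hits>
+  <Iteration_stat>
+    <Statistics>
+      <Statistics_db-num>483204</Statistics_db-num>
+      <Statistics_db-len>183688421</Statistics_db-len>
+      <Statistics_hsp-len>0</Statistics_hsp-len>
+      <Statistics_eff-space>0</Statistics_eff-space>
+      <Statistics_kappa>0.041</Statistics_kappa>
+      <Statistics_lambda>0.267</Statistics_lambda>
+      <Statistics_entropy>0.14</Statistics_entropy>
+    </Statistics>
+  </Iteration_stat>
+</Iteration>
+</BlastOutput_iterations>
+</BlastOutput>
diff --git a/tests/test_blast.py b/tests/test_blast.py
new file mode 100644
index 0000000..721f7dd
--- /dev/null
+++ b/tests/test_blast.py
@@ -0,0 +1,43 @@
+from io import StringIO
+from pathlib import Path
+
+import pytest
+
+from arctic3d.modules.blast import blast_remote, parse_xml
+
+from . import golden_data
+
+
+@pytest.fixture
+def fasta_file():
+    return Path(golden_data, "1crn.fasta")
+
+
+@pytest.fixture
+def xml_file():
+    return Path(golden_data, "blast.xml")
+
+
+def test_blast_remote(mocker, fasta_file, xml_file):
+
+    # Mock the remote blast call by passing the xml file as stringIo as return value
+    with open(xml_file, "r") as f:
+        xml = f.read()
+        xml_string_io = StringIO(xml)
+
+    mock_qbplast = mocker.patch("Bio.Blast.NCBIWWW.qblast")
+    mock_qbplast.return_value = xml_string_io
+
+    accession_id = blast_remote(fasta_file)
+
+    assert accession_id == "P01541"
+
+
+@pytest.mark.skip(reason="Not implemented")
+def test_blast_local():
+    pass
+
+
+def test_parse_xml(xml_file):
+    accession_id = parse_xml(xml_file)
+    assert accession_id == "P01541"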