Skip to content

Commit

Permalink
removed defusedxml (#424)
Browse files Browse the repository at this point in the history
* removed defusedxml

* refactor: Add temporary file handling to the `blast_remote` function and move the XML parsing into `parse_xml`

* add test_blast

* add golden data

* Add pytest-mock to dev dependencies

---------

Co-authored-by: Rodrigo V Honorato <[email protected]>
Co-authored-by: Rodrigo Vargas Honorato <[email protected]>
  • Loading branch information
3 people authored May 17, 2024
1 parent 570baf1 commit 36b122c
Show file tree
Hide file tree
Showing 5 changed files with 160 additions and 7 deletions.
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ classifiers = [
[tool.poetry.dependencies]
python = ">=3.9,<3.12"
bio = "1.6.2"
defusedxml = "0.7.1"
lxml = "5.2.1"
mdanalysis = "2.7.0"
requests = "2.31.0"
Expand Down
24 changes: 18 additions & 6 deletions src/arctic3d/modules/blast.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
"""Function to BLAST input sequence and return accession id."""

import logging
import os
import shlex
import shutil
import subprocess
import tempfile
from pathlib import Path

from Bio.Blast import NCBIWWW
from defusedxml import lxml as ET
from lxml import etree as ET

log = logging.getLogger("arctic3d.log")

Expand Down Expand Up @@ -87,7 +89,7 @@ def blast_local(fasta_file, db):
return uniprot_id


def blast_remote(fasta_file):
def blast_remote(fasta_file: str) -> str:
"""
Blast sequence.
Expand All @@ -106,12 +108,22 @@ def blast_remote(fasta_file):
"blastp", "swissprot", fasta_file, hitlist_size=50
)

# temp file for storing results
with open("blast_res.xml", "w") as save_output:
# TODO: Handle scenario in which the `qblast` call fails

with tempfile.NamedTemporaryFile(
mode="w+", delete=True, suffix=".xml"
) as temp:
blast_res = blast_res_handle.read()
save_output.write(blast_res)
temp.write(blast_res)
temp.flush()
accession_id = parse_xml(temp.name)

return accession_id


tree = ET.parse("blast_res.xml")
def parse_xml(xml_file: str) -> str:
"""Parse the BLAST XML file and return the first (?) accession ID."""
tree = ET.parse(source=xml_file, parser=ET.XMLParser(encoding="utf-8"))
root = tree.getroot()

# root [BlastOutput_iterations] [Iteration] [Iteration_hits] \
Expand Down
2 changes: 2 additions & 0 deletions tests/golden_data/1crn.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>1CRN_1|Chain A|CRAMBIN|Crambe hispanica subsp. abyssinica (3721)
TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN
97 changes: 97 additions & 0 deletions tests/golden_data/blast.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
<?xml version="1.0"?>
<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">
<BlastOutput>
<BlastOutput_program>blastp</BlastOutput_program>
<BlastOutput_version>BLASTP 2.15.0+</BlastOutput_version>
<BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>
<BlastOutput_db>swissprot</BlastOutput_db>
<BlastOutput_query-ID>Query_544091</BlastOutput_query-ID>
<BlastOutput_query-def>1CRN_1|Chain A|CRAMBIN|Crambe hispanica subsp. abyssinica (3721)</BlastOutput_query-def>
<BlastOutput_query-len>46</BlastOutput_query-len>
<BlastOutput_param>
<Parameters>
<Parameters_matrix>BLOSUM62</Parameters_matrix>
<Parameters_expect>0.05</Parameters_expect>
<Parameters_gap-open>11</Parameters_gap-open>
<Parameters_gap-extend>1</Parameters_gap-extend>
<Parameters_filter>F</Parameters_filter>
</Parameters>
</BlastOutput_param>
<BlastOutput_iterations>
<Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>Query_544091</Iteration_query-ID>
<Iteration_query-def>1CRN_1|Chain A|CRAMBIN|Crambe hispanica subsp. abyssinica (3721)</Iteration_query-def>
<Iteration_query-len>46</Iteration_query-len>
<Iteration_hits>
<Hit>
<Hit_num>1</Hit_num>
<Hit_id>sp|P01542.2|</Hit_id>
<Hit_def>RecName: Full=Crambin [Crambe hispanica subsp. abyssinica]</Hit_def>
<Hit_accession>P01542</Hit_accession>
<Hit_len>46</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>92.4337</Hsp_bit-score>
<Hsp_score>228</Hsp_score>
<Hsp_evalue>2.38732e-26</Hsp_evalue>
<Hsp_query-from>1</Hsp_query-from>
<Hsp_query-to>46</Hsp_query-to>
<Hsp_hit-from>1</Hsp_hit-from>
<Hsp_hit-to>46</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>45</Hsp_identity>
<Hsp_positive>46</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>46</Hsp_align-len>
<Hsp_qseq>TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN</Hsp_qseq>
<Hsp_hseq>TTCCPSIVARSNFNVCRLPGTPEALCATYTGCIIIPGATCPGDYAN</Hsp_hseq>
<Hsp_midline>TTCCPSIVARSNFNVCRLPGTPEA+CATYTGCIIIPGATCPGDYAN</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
<Hit>
<Hit_num>2</Hit_num>
<Hit_id>sp|P01541.1|</Hit_id>
<Hit_def>RecName: Full=Denclatoxin-B [Dendrophthora clavata]</Hit_def>
<Hit_accession>P01541</Hit_accession>
<Hit_len>46</Hit_len>
<Hit_hsps>
<Hsp>
<Hsp_num>1</Hsp_num>
<Hsp_bit-score>53.9138</Hsp_bit-score>
<Hsp_score>128</Hsp_score>
<Hsp_evalue>3.34186e-11</Hsp_evalue>
<Hsp_query-from>2</Hsp_query-from>
<Hsp_query-to>46</Hsp_query-to>
<Hsp_hit-from>2</Hsp_hit-from>
<Hsp_hit-to>46</Hsp_hit-to>
<Hsp_query-frame>0</Hsp_query-frame>
<Hsp_hit-frame>0</Hsp_hit-frame>
<Hsp_identity>23</Hsp_identity>
<Hsp_positive>31</Hsp_positive>
<Hsp_gaps>0</Hsp_gaps>
<Hsp_align-len>45</Hsp_align-len>
<Hsp_qseq>TCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN</Hsp_qseq>
<Hsp_hseq>SCCPTTAARNQYNICRLPGTPRPVCAALSGCKIISGTGCPPGYRH</Hsp_hseq>
<Hsp_midline>+CCP+ AR+ +N+CRLPGTP +CA +GC II G CP Y +</Hsp_midline>
</Hsp>
</Hit_hsps>
</Hit>
</Iteration_hits>
<Iteration_stat>
<Statistics>
<Statistics_db-num>483204</Statistics_db-num>
<Statistics_db-len>183688421</Statistics_db-len>
<Statistics_hsp-len>0</Statistics_hsp-len>
<Statistics_eff-space>0</Statistics_eff-space>
<Statistics_kappa>0.041</Statistics_kappa>
<Statistics_lambda>0.267</Statistics_lambda>
<Statistics_entropy>0.14</Statistics_entropy>
</Statistics>
</Iteration_stat>
</Iteration>
</BlastOutput_iterations>
</BlastOutput>
43 changes: 43 additions & 0 deletions tests/test_blast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from io import StringIO
from pathlib import Path

import pytest

from arctic3d.modules.blast import blast_remote, parse_xml

from . import golden_data


@pytest.fixture
def fasta_file():
    """Return the path to the `1crn.fasta` file inside the golden data."""
    data_dir = Path(golden_data)
    return data_dir.joinpath("1crn.fasta")


@pytest.fixture
def xml_file():
    """Return the path to the `blast.xml` file inside the golden data."""
    data_dir = Path(golden_data)
    return data_dir.joinpath("blast.xml")


def test_blast_remote(mocker, fasta_file, xml_file):
    """Check the accession ID returned by `blast_remote` for the golden input.

    `Bio.Blast.NCBIWWW.qblast` is patched so that it returns the golden
    BLAST XML wrapped in a `StringIO` handle instead of performing the
    real call.
    """
    # Load the stored BLAST output that the patched `qblast` will hand back.
    golden_xml = Path(xml_file).read_text()

    mocker.patch(
        "Bio.Blast.NCBIWWW.qblast",
        return_value=StringIO(golden_xml),
    )

    # NOTE(review): "P01541" is the accession of the second <Hit> in
    # tests/golden_data/blast.xml (the first one is "P01542") - confirm this
    # matches the hit-selection logic of `parse_xml`.
    assert blast_remote(fasta_file) == "P01541"


@pytest.mark.skip(reason="Not implemented")
def test_blast_local():
    """Placeholder for the `blast_local` test; skipped until implemented."""


def test_parse_xml(xml_file):
    """Check the accession ID that `parse_xml` extracts from the golden XML."""
    # NOTE(review): "P01541" is the accession of the second <Hit> in
    # tests/golden_data/blast.xml (the first one is "P01542") - confirm this
    # is the intended selection.
    assert parse_xml(xml_file) == "P01541"

0 comments on commit 36b122c

Please sign in to comment.