Skip to content

Commit 49c175f

Browse files
author
gitliver
committed
add (sloppy-ish) shell scripts to create Step 7: blast unassembled reads
1 parent d02d7ed commit 49c175f

File tree

5 files changed

+93
-4
lines changed

5 files changed

+93
-4
lines changed

pandora.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,8 @@ def get_arg():
7878
step 2: assembly, \
7979
step 3: blast contigs, \
8080
step 4: orf discovery, \
81-
step 5: reporting (default: 12345 - i.e, steps 1 through 5).')
81+
step 5: reporting (default: 12345 - i.e, steps 1 through 5), \
82+
step 7: blast unassembled reads.')
8283

8384
# Trinity default contig length is 200
8485
# Ioan: for detection of species, impose no bound on the contig length in assembly
@@ -230,7 +231,8 @@ def scan_main(args):
230231
'3': '-S {mypython} -N blst_{args.identifier} -V -cwd -o log.out -e log.err'.format(mypython=sys.executable, args=args),
231232
'4': '-S {mypython} -N orf_{args.identifier} -V -cwd -o log.out -e log.err'.format(mypython=sys.executable, args=args),
232233
'5': '-S {mypython} -N rep_{args.identifier} -V -cwd -o log.out -e log.err'.format(mypython=sys.executable, args=args),
233-
'6': '-S {mypython} -N rep2_{args.identifier} -V -cwd -o log.out -e log.err'.format(mypython=sys.executable, args=args)
234+
'6': '-S {mypython} -N rep2_{args.identifier} -V -cwd -o log.out -e log.err'.format(mypython=sys.executable, args=args),
235+
'7': '-S /bin/bash -N blst_unass_{args.identifier} -V -cwd -o log.out -e log.err'.format(args=args)
234236
}
235237

236238
# dict which maps each step to extra qsub params for the CUMC cluster
@@ -240,7 +242,8 @@ def scan_main(args):
240242
'3': ' -l mem=4G,time=8::',
241243
'4': ' -l mem=2G,time=2::',
242244
'5': ' -l mem=1G,time=1::',
243-
'6': ' -l mem=1G,time=1::'
245+
'6': ' -l mem=1G,time=1::',
246+
'7': ' -l mem=1G,time=12::'
244247
}
245248

246249
# dict which maps each step to the shell part of the command
@@ -250,7 +253,8 @@ def scan_main(args):
250253
'3': '{args.scripts}/scripts/blast_wrapper.py --scripts {args.scripts} --threshold {args.contigthreshold} --db {args.blastdb} --threads {args.blast_threads} --id {args.identifier} --filelength {args.blastchunk} --verbose {args.verbose} --noclean {args.noclean} --nosge {args.noSGE} --hpc {args.hpc} --btime {args.btime} --bmem {args.bmem}'.format(args=args),
251254
'4': '{args.scripts}/scripts/orf_discovery.py --scripts {args.scripts} --id {args.identifier} --threshold {args.orfthreshold} --db {args.pblastdb} --blast {args.orfblast} --verbose {args.verbose} --noclean {args.noclean}'.format(args=args),
252255
'5': '{args.scripts}/scripts/makereport.py --scripts {args.scripts} --id {args.identifier} --verbose {args.verbose} --blacklist {args.blacklist} --taxid2names {args.taxid2names} --hpc {args.hpc}'.format(args=args),
253-
'6': '{args.scripts}/scripts/makereport.py --outputdir report_ifilter --input blast/ifilter.concat.txt --scripts {args.scripts} --id {args.identifier} --verbose {args.verbose} --blacklist {args.blacklist} --taxid2names {args.taxid2names} --hpc {args.hpc}'.format(args=args)
256+
'6': '{args.scripts}/scripts/makereport.py --outputdir report_ifilter --input blast/ifilter.concat.txt --scripts {args.scripts} --id {args.identifier} --verbose {args.verbose} --blacklist {args.blacklist} --taxid2names {args.taxid2names} --hpc {args.hpc}'.format(args=args),
257+
'7': '{args.scripts}/scripts/blast_unassembled_reads.sh assembly/reads2contigs.bam blast_unassembled_reads {args.scripts} {args.blastdb} {args.blacklist} {args.taxid2names} {args.scripts}/resources/blast.header'.format(args=args)
254258
}
255259

256260
# start with job id set to zero string

resources/blast.header

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
qseqid sseqid saccver staxids pident nident length mismatch gapopen gaps qstart qend qlen qframe qcovs sstart send slen sframe sstrand evalue bitscore stitle

scripts/blast.sh

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash

# Simple wrapper for blast, written to run as one task of an SGE array job.
#
# Usage: blast.sh <db>
#   db  BLAST database, passed straight to "blastn -db"
#
# Reads ${SGE_TASK_ID}.fasta from the current directory and writes the
# tabular hits to ${SGE_TASK_ID}.result. Exits with the status of blastn so
# that a failed search is visible to the scheduler.

# Quote the bracketed tags: unquoted, [start] etc. are glob patterns and
# would be replaced by any matching one-letter file name in the directory.
echo "[start]"
echo "[pwd] $(pwd)"
echo "[date] $(date)"

db=$1

if [ -z "${db}" ]; then
    echo "[error] usage: $0 <db>" >&2
    exit 1
fi

# Input and output names are derived from the array task id only.
if [ -z "${SGE_TASK_ID}" ]; then
    echo "[error] SGE_TASK_ID is not set; run this script as an SGE array task" >&2
    exit 1
fi

input="${SGE_TASK_ID}.fasta"
output="${SGE_TASK_ID}.result"

echo "input ${input}"
echo "output ${output}"
echo "db ${db}"

# restrict to top 10 hits
blastn -num_alignments 10 -outfmt "6 qseqid sseqid saccver staxids pident nident length mismatch gapopen gaps qstart qend qlen qframe qcovs sstart send slen sframe sstrand evalue bitscore stitle" -query "${input}" -db "${db}" > "${output}"
# Remember the blastn status; the echo calls below would otherwise mask it.
status=$?

echo "[finish]"
echo "[date] $(date)"

exit ${status}

scripts/blast_unassembled_reads.sh

+47
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/bin/bash

# Blast the reads flagged as unmapped in a bam file and build a report from
# the top hits.
#
# Usage: blast_unassembled_reads.sh <bamfile> <outputdir> <scripts> <blastdb> <blacklist> <taxid2names> <header>
#   bamfile      bam file to pull the unmapped reads from
#   outputdir    working/output directory (created if missing)
#   scripts      directory that contains scripts/blast.sh, scripts/filterblast.py
#                and scripts/makereport.py
#   blastdb      BLAST database handed to scripts/blast.sh
#   blacklist    passed to makereport.py --blacklist
#   taxid2names  passed to makereport.py --taxid2names
#   header       passed to makereport.py --header
#
# Requires samtools and an SGE-style qsub on the PATH.

if [ $# -lt 7 ]; then
    echo "usage: $0 <bamfile> <outputdir> <scripts> <blastdb> <blacklist> <taxid2names> <header>" >&2
    exit 1
fi

bamfile=$1
outputdir=$2
scripts=$3
blastdb=$4
blacklist=$5
taxid2names=$6
header=$7

# The script works from inside the output directory, so anchor a relative bam
# path to the starting directory first. (A "../" prefix only works when the
# output directory is exactly one level below the starting directory.)
case "${bamfile}" in
    /*) ;;
    *) bamfile="$(pwd)/${bamfile}" ;;
esac

# output directory
mkdir -p "${outputdir}"
if ! cd "${outputdir}"; then
    echo "[error] cannot change into ${outputdir}" >&2
    # explicit status: a bare "exit" here would report success
    exit 1
fi
mkdir -p logs

# get unmapped reads (SAM flag 4)
if ! samtools view -b -f 4 "${bamfile}" > unassembled.bam; then
    echo "[error] samtools could not extract unmapped reads from ${bamfile}" >&2
    exit 1
fi

# transform to fasta and split
# tmp holds one "name<TAB>sequence" line per read (SAM columns 1 and 10)
samtools view unassembled.bam | cut -f1,10 > tmp
if [ ! -s tmp ]; then
    # Nothing to split: without this guard the loop below would run once on
    # the unexpanded glob and a bogus one-task array job would be submitted.
    echo "no unassembled reads found - nothing to blast"
    rm -f tmp
    exit 0
fi

# NOTE(review): -a 3 caps the split at 1000 chunks of 1000 reads each.
split -d -l 1000 -a 3 tmp x
num=0
for chunk in x[0-9][0-9][0-9]; do
    num=$((num + 1))
    # chunks are renumbered 1..num so they line up with the SGE task ids
    tr ":" "_" < "${chunk}" | awk '{print ">"$1; print $2;}' > "${num}.fasta"
done
echo "num: ${num}"

# blast
# message looks like this: Your job-array 3147839.1-2:1 ("wait") has been submitted
jid=$( qsub -V -N bunassembled -e logs -o logs -cwd -l mem=10G,time=6:: -t 1-${num} "${scripts}/scripts/blast.sh" "${blastdb}" | cut -f3 -d' ' | cut -f1 -d'.' )
echo "job id: ${jid}"
if [ -z "${jid}" ]; then
    echo "[error] could not submit the blast array job" >&2
    exit 1
fi

# pause: a synchronous dummy job held on the array job blocks until it is done
qsub -V -b y -o logs -e logs -cwd -N wait -hold_jid "${jid}" -sync y echo wait_here

# get top hit, concatenate
for result in *.result; do
    # skip the unexpanded pattern when no result files exist
    [ -e "${result}" ] || continue
    "${scripts}/scripts/filterblast.py" "${result}" "${result}.tophit" "${scripts}"
done
cat *.result > all.concat.txt
cat *.tophit > top.concat.txt
head -1000 logs/* > all.logs.txt

# clean up
echo 'clean up'
rm -f tmp x[0-9][0-9][0-9] *.fasta *.result *.tophit
rm -rf logs

# generate report
"${scripts}/scripts/makereport.py" --header "${header}" --scripts "${scripts}" -i top.concat.txt --outputdir . --id 1 --verbose 1 --blacklist "${blacklist}" --taxid2names "${taxid2names}" --hpc 1

scripts/filterblast.py

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#!/usr/bin/env python
"""Command-line wrapper around ``helpers.tophitsfilter``.

Usage: filterblast.py <input> <output> <scripts_dir>

``scripts_dir`` is the directory that contains the local ``helpers`` package.
"""

import sys
import os


def main(argv=None):
    """Run ``helpers.tophitsfilter`` on the files named on the command line.

    Args:
        argv: full argument vector (program name first); defaults to
            ``sys.argv``.

    Exits with a usage message when fewer than three arguments are given,
    instead of failing with a bare IndexError.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4:
        sys.exit("usage: filterblast.py <input> <output> <scripts_dir>")

    inp = argv[1]
    oup = argv[2]
    scripts = argv[3]

    # need this to get local modules; the import has to wait until the
    # scripts directory is on the path
    sys.path.append(scripts)
    from helpers import helpers as hp

    # filter top hit
    hp.tophitsfilter(inp, oup)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)