Skip to content

Commit

Permalink
Merge pull request #6 from BIONF/fdog_assembly
Browse files Browse the repository at this point in the history
Merge master into fdog_goes_assembly
  • Loading branch information
HannahBioI authored Sep 9, 2021
2 parents 7ef3a05 + 97dcf81 commit 087cae2
Show file tree
Hide file tree
Showing 13 changed files with 271 additions and 206 deletions.
15 changes: 6 additions & 9 deletions fdog/addTaxa.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import re
import shutil
from tqdm import tqdm
from datetime import datetime

def checkFileExist(file):
if not os.path.exists(os.path.abspath(file)):
Expand Down Expand Up @@ -68,20 +69,18 @@ def parseMapFile(mappingFile):
try:
ver = tmp[3].strip()
except:
ver = 1
ver = datetime.today().strftime('%y%m%d') #1
# print(taxName+"@"+str(taxId)+"@"+str(ver))
nameDict[fileName] = (taxName, str(taxId), str(ver))
return(nameDict)

def runAddTaxon(args):
(f,n,i,o,c,v,a,cpus,replace,delete,oldFAS) = args
(f,n,i,o,c,v,a,cpus,replace,delete) = args
cmd = 'fdog.addTaxon -f %s -n %s -i %s -o %s -v %s --cpus %s' % (f,n,i,o,v,cpus)
if c == True:
cmd = cmd + ' -c'
if a == True:
cmd = cmd + ' -a'
if oldFAS == True:
cmd = cmd + ' --oldFAS'
if replace == True:
cmd = cmd + ' --replace'
if delete == True:
Expand All @@ -95,7 +94,7 @@ def runAddTaxon(args):
sys.exit('Problem running\n%s' % (cmd))

def main():
version = '0.0.5'
version = '0.0.9'
parser = argparse.ArgumentParser(description='You are running fdog.addTaxa version ' + str(version) + '.')
required = parser.add_argument_group('required arguments')
optional = parser.add_argument_group('optional arguments')
Expand All @@ -105,8 +104,7 @@ def main():
action='store', default='', required=True)
optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='')
optional.add_argument('-c', '--coreTaxa', help='Include these taxa to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False)
optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using annoFAS', action='store_true', default=False)
optional.add_argument('--oldFAS', help='Use old verion of FAS (annoFAS ≤ 1.2.0)', action='store_true', default=False)
optional.add_argument('-a', '--noAnno', help='Do NOT annotate these taxa using fas.doAnno', action='store_true', default=False)
optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int)
optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False)
optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False)
Expand All @@ -128,7 +126,6 @@ def main():
outPath = os.path.abspath(outPath)
noAnno = args.noAnno
coreTaxa = args.coreTaxa
oldFAS = args.oldFAS
cpus = args.cpus
if cpus == 0:
cpus = mp.cpu_count()-2
Expand Down Expand Up @@ -171,7 +168,7 @@ def main():
verProt = nameDict[f][2]
jobs.append([
folIn + '/' + f, nameDict[f][0], nameDict[f][1],
outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete, oldFAS
outPath, coreTaxa, nameDict[f][2], noAnno, cpus, replace, delete
])

if len(dupList) > 0:
Expand Down
62 changes: 36 additions & 26 deletions fdog/addTaxon.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import multiprocessing as mp
from ete3 import NCBITaxa
import re
import shutil
from datetime import datetime

def checkFileExist(file):
Expand Down Expand Up @@ -83,18 +84,17 @@ def runBlast(args):
os.symlink(fileInGenome, fileInBlast)

def main():
version = '0.0.5'
version = '0.0.10'
parser = argparse.ArgumentParser(description='You are running fdog.addTaxon version ' + str(version) + '.')
required = parser.add_argument_group('required arguments')
optional = parser.add_argument_group('optional arguments')
required.add_argument('-f', '--fasta', help='FASTA file of input taxon', action='store', default='', required=True)
required.add_argument('-i', '--taxid', help='Taxonomy ID of input taxon', action='store', default='', required=True, type=int)
optional.add_argument('-o', '--outPath', help='Path to output directory', action='store', default='')
optional.add_argument('-n', '--name', help='Acronym name of input taxon', action='store', default='', type=str)
optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default=1, type=str)
optional.add_argument('-v', '--verProt', help='Proteome version', action='store', default='', type=str)
optional.add_argument('-c', '--coreTaxa', help='Include this taxon to core taxa (i.e. taxa in blast_dir folder)', action='store_true', default=False)
optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using annoFAS', action='store_true', default=False)
optional.add_argument('--oldFAS', help='Use old verion of FAS (annoFAS ≤ 1.2.0)', action='store_true', default=False)
optional.add_argument('-a', '--noAnno', help='Do NOT annotate this taxon using fas.doAnno', action='store_true', default=False)
optional.add_argument('--cpus', help='Number of CPUs used for annotation. Default = available cores - 1', action='store', default=0, type=int)
optional.add_argument('--replace', help='Replace special characters in sequences by "X"', action='store_true', default=False)
optional.add_argument('--delete', help='Delete special characters in sequences', action='store_true', default=False)
Expand All @@ -119,7 +119,8 @@ def main():
noAnno = args.noAnno
coreTaxa = args.coreTaxa
ver = str(args.verProt)
oldFAS = args.oldFAS
if ver == '':
ver = datetime.today().strftime('%y%m%d')
cpus = args.cpus
if cpus == 0:
cpus = mp.cpu_count()-2
Expand All @@ -135,6 +136,13 @@ def main():
specName = name+'@'+taxId+'@'+ver
print('Species name\t%s' % specName)

### remove old folder if force is set
if force:
if os.path.exists(outPath + '/genome_dir/' + specName):
shutil.rmtree(outPath + '/genome_dir/' + specName)
if os.path.exists(outPath + '/blast_dir/' + specName):
shutil.rmtree(outPath + '/blast_dir/' + specName)

### create file in genome_dir
print('Parsing FASTA file...')
Path(outPath + '/genome_dir').mkdir(parents = True, exist_ok = True)
Expand All @@ -147,25 +155,30 @@ def main():
f = open(specFile, 'w')
index = 0
modIdIndex = 0
longId = 'no'
# longId = 'no'
tmpDict = {}
# with open(specFile + '.mapping', 'a') as mappingFile:
for id in inSeq:
seq = str(inSeq[id].seq)
# check ID
id = re.sub('\|', '_', id)
oriId = id
if len(id) > 30:
modIdIndex = modIdIndex + 1
id = specName + "_" + str(modIdIndex)
longId = 'yes'
with open(specFile + '.mapping', 'a') as mappingFile:
mappingFile.write('%s\t%s\n' % (id, oriId))
if not id in tmpDict:
tmpDict[id] = 1
# oriId = id
if ' ' in id:
sys.exit('\033[91mERROR: Sequence IDs (e.g. %s) must not contain space(s)!\033[0m' % id)
else:
index = index + 1
id = str(id) + '_' + str(index)
tmpDict[id] = 1
if '\|' in id:
print('\033[91mWARNING: Sequence IDs contain pipe(s). They will be replaced by "_"!\033[0m')
id = re.sub('\|', '_', id)
# if len(id) > 20:
# modIdIndex = modIdIndex + 1
# id = modIdIndex
# longId = 'yes'
# if not id in tmpDict:
# tmpDict[id] = 1
# else:
# index = index + 1
# id = str(index)
# tmpDict[id] = 1
# mappingFile.write('%s\t%s\n' % (id, oriId))
# check seq
if seq[-1] == '*':
seq = seq[:-1]
Expand All @@ -187,8 +200,8 @@ def main():
cf.write(str(datetime.now()))
cf.close()
# warning about long header
if longId == 'yes':
print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile)
# if longId == 'yes':
# print('\033[91mWARNING: Some headers longer than 80 characters have been automatically shortened. PLease check the %s.mapping file for details!\033[0m' % specFile)
else:
print(genomePath + '/' + specName + '.fa already exists!')

Expand All @@ -207,16 +220,13 @@ def main():
### create annotation
if not noAnno:
Path(outPath + '/weight_dir').mkdir(parents = True, exist_ok = True)
annoCmd = 'annoFAS -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus)
annoCmd = 'fas.doAnno -i %s/%s.fa -o %s --cpus %s' % (genomePath, specName, outPath+'/weight_dir', cpus)
if force:
annoCmd = annoCmd + " --force"
if oldFAS:
print("running old version of FAS...")
annoCmd = 'annoFAS -i %s/%s.fa -o %s -n %s --cores %s' % (genomePath, specName, outPath+'/weight_dir', specName, cpus)
try:
subprocess.call([annoCmd], shell = True)
except:
print('\033[91mProblem with running annoFAS. You can check it with this command:\n%s\033[0m' % annoCmd)
print('\033[91mProblem with running fas.doAnno. You can check it with this command:\n%s\033[0m' % annoCmd)

print('Output for %s can be found in %s within genome_dir [and blast_dir, weight_dir] folder[s]' % (specName, outPath))

Expand Down
141 changes: 71 additions & 70 deletions fdog/bin/hamstr.pl
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,10 @@
## 01.12.2020 (v13.4.1 - vinh) add silent option to muscle for checkCoOrthologsRef
## 21.01.2021 (v13.4.2 - vinh) fixed bug when refspec has "dot" in its name
## 19.03.2021 (v13.4.3 - vinh) changed $path to current directory
## 19.03.2021 (v13.4.5 - vinh) do not replace space by @ for hmm output in parseHmmer4pm

######################## start main ###########################################
my $version = "HaMStR v.13.4.4";
my $version = "HaMStR v.13.4.5";
######################## checking whether the configure script has been run ###
my $configure = 0;
if ($configure == 0){
Expand Down Expand Up @@ -315,7 +316,7 @@
my $ublast = 0;
my $accel = 0.8;
#####determine the hostname#######
push @log, "VERSION:\t$version\n";
# push @log, "VERSION:\t$version\n";
my $hostname = `hostname`;
chomp $hostname;
push @log, "HOSTNAME\t$hostname\n";
Expand Down Expand Up @@ -520,7 +521,7 @@
exit;
}
else {
open (OUT, ">$outpath/hamstrsearch.log") or die "could not open logfile\n";
open (OUT, ">$outpath/fdog.log") or die "could not open logfile\n";
print OUT join "\n", @log;
close OUT;
}
Expand Down Expand Up @@ -1059,7 +1060,7 @@ sub checkInput {
}
}
} else {
push @log, "\trunning HaMStR with all hmms in $hmm_dir";
push @log, "\trunning fDOG with all hmms in $hmm_dir";
my $hmm_dir_tmp = $hmm_dir; $hmm_dir_tmp =~ s/\|/\\\|/g;
@hmms = `ls $hmm_dir_tmp`;
}
Expand Down Expand Up @@ -1299,10 +1300,10 @@ sub checkInput {
}
## 14) determine whether or not the -representative flag has been set
if (defined $rep) {
push @log, "\tHaMStR will run with the -representative option";
push @log, "\tfDOG will run with the -representative option";
}
else {
push @log, "\tHaMStR was called without the -representative option. More than one ortholog may be identified per core-ortholog group!";
push @log, "\tfDOG was called without the -representative option. More than one ortholog may be identified per core-ortholog group!";
}

## check further options
Expand Down Expand Up @@ -1854,68 +1855,68 @@ sub revComp {
return($seq);
}
##############################
## parseHmmer3pm: parse a HMMER report with Bio::SearchIO and collect its hits.
## Arguments:
##   $file - name of the HMMER output file
##   $path - directory containing $file (falls back to '.' when undefined)
## Returns:
##   ($query, $hits, $hitLimitLoc, $criticalValue) when at least one hit was stored,
##   where $hits is an array ref of hashes with the keys 'id' and 'hmmscore';
##   ($query) alone when the first result has no hits.
## NOTE: both branches at the end of the outer while loop return, so only the
## first result of the report is ever processed. If the report yields no result
## at all, the loop body never runs and the sub falls off its end.
## NOTE(review): $bhh, $hitlimit, $autoLimit, $scoreThreshold and $scoreCutoff are
## not declared in this sub; presumably file-level option variables - confirm.
sub parseHmmer3pm {
my ($file, $path) = @_;
my $hits;
my $query;
# remembers hit names already stored, so that each name is kept only once
my %tmphash;
if (!defined $path){
$path = '.';
}
$file = $path . '/' . $file;
my $in = Bio::SearchIO->new(
-format => 'hmmer',
-file => $file
);
while( my $result = $in->next_result ) {
# this is a Bio::Search::Result::HMMERResult object
if (!defined $query){
$query = $result->query_name();
printOUT("query is $query\n");
}
my $hitcount = 0;
while( my $hit = $result->next_hit ) {
my $tmp = $hit->name();
my $tmpscore = $hit->score();
# strip the '_RF' suffix and everything after it from the hit name
$tmp =~ s/_RF.*//;
if (!defined $tmphash{$tmp}){
$hits->[$hitcount]->{id} = $tmp;
$hits->[$hitcount]->{hmmscore} = $tmpscore;
$hitcount++;
$tmphash{$tmp}=1;
# when $bhh is defined, stop after the first stored hit
if (defined $bhh){
last;
}
}
}

if (defined $hits->[0]) {
####### a quick hack to obtain the lagPhase value
my $criticalValue; # takes the value used for candidate discrimination
# start from $hitlimit; may be overwritten by getLag/getHitLimit below
my $hitLimitLoc = $hitlimit;
if (defined $autoLimit) {
printDebug("Entering getLag Routine\n");
## the user has invoked the automated inference of a hit limit
($hitLimitLoc, $criticalValue) = getLag($hits, $hitcount);
if (!defined $criticalValue) {
## there was a problem in the computation of the lagPhase
print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n";
($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount);
}
}
elsif (defined $scoreThreshold) {
printDebug("entering the scoreThreshold routine");
($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount);
printDebug("hitlimitloc is now $hitLimitLoc");
}

# $criticalValue stays undefined when neither $autoLimit nor $scoreThreshold is defined
return ($query, $hits, $hitLimitLoc, $criticalValue);
}
else {
# no hit was stored for this result: only the query name is returned
return ($query);
}
}
}
# sub parseHmmer3pm {
# my ($file, $path) = @_;
# my $hits;
# my $query;
# my %tmphash;
# if (!defined $path){
# $path = '.';
# }
# $file = $path . '/' . $file;
# my $in = Bio::SearchIO->new(
# -format => 'hmmer',
# -file => $file
# );
# while( my $result = $in->next_result ) {
# # this is a Bio::Search::Result::HMMERResult object
# if (!defined $query){
# $query = $result->query_name();
# printOUT("query is $query\n");
# }
# my $hitcount = 0;
# while( my $hit = $result->next_hit ) {
# my $tmp = $hit->name();
# my $tmpscore = $hit->score();
# $tmp =~ s/_RF.*//;
# if (!defined $tmphash{$tmp}){
# $hits->[$hitcount]->{id} = $tmp;
# $hits->[$hitcount]->{hmmscore} = $tmpscore;
# $hitcount++;
# $tmphash{$tmp}=1;
# if (defined $bhh){
# last;
# }
# }
# }
#
# if (defined $hits->[0]) {
# ####### a quick hack to obtain the lagPhase value
# my $criticalValue; # takes the value used for candidate discrimination
# my $hitLimitLoc = $hitlimit;
# if (defined $autoLimit) {
# printDebug("Entering getLag Routine\n");
# ## the user has invoked the autmated inference of a hit limit
# ($hitLimitLoc, $criticalValue) = getLag($hits, $hitcount);
# if (!defined $criticalValue) {
# ## there was a problem in the computatation of the lagPhase
# print "Computation of lagPhase did not succeed, switching to score threshold using a default cutoff of $scoreCutoff\n";
# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount);
# }
# }
# elsif (defined $scoreThreshold) {
# printDebug("entering the scoreThreshold routine");
# ($hitLimitLoc, $criticalValue) = getHitLimit($hits, $hitcount);
# printDebug("hitlimitloc is now $hitLimitLoc");
# }
#
# return ($query, $hits, $hitLimitLoc, $criticalValue);
# }
# else {
# return ($query);
# }
# }
# }
##############################
sub parseHmmer4pm {
my ($file, $path) = @_;
Expand All @@ -1931,9 +1932,9 @@ sub parseHmmer4pm {
$file = $path . '/' . $file;

$file =~ s/\|/\\\|/g;
my @hmmout = `$grepprog -v '#' $file |sort -rnk 9 |sed -e 's/ /@/g'`;
my @hmmout = `$grepprog -v '#' $file |sort -rnk 9`;
for (my $i = 0; $i < @hmmout; $i++) {
($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession}, $hmmhits->[$i]->{total_evalue}, $hmmhits->[$i]->{total_score}, $hmmhits->[$i]->{total_bias}, $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score}, $hmmhits->[$i]->{domain_bias}, @rest) = split(/@+/, $hmmout[$i]);
($hmmhits->[$i]->{target_name}, $hmmhits->[$i]->{target_accession}, $hmmhits->[$i]->{query_name}, $hmmhits->[$i]->{query_accession}, $hmmhits->[$i]->{total_evalue}, $hmmhits->[$i]->{total_score}, $hmmhits->[$i]->{total_bias}, $hmmhits->[$i]->{domain_evalue}, $hmmhits->[$i]->{domain_score}, $hmmhits->[$i]->{domain_bias}, @rest) = split(/\s+/, $hmmout[$i]);

if (!defined $query){
$query = $hmmhits->[$i]->{query_name};
Expand Down
Loading

0 comments on commit 087cae2

Please sign in to comment.