Skip to content

Commit

Permalink
Merge pull request #7 from gregvonkuster/pipes7
Browse files Browse the repository at this point in the history
Various pipeline fixes
  • Loading branch information
ewafula authored Feb 16, 2017
2 parents c94f109 + fb46bbe commit 317e65f
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 26 deletions.
9 changes: 8 additions & 1 deletion config/plantTribes.config
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ transdecoder=TransDecoder.LongOrfs
# in your PATH environment variable
genometools=gt
#
# Path to HMMSearch (HMMER >=3) executables if not in your PATH
# environment variable
hmmsearch=hmmsearch
#
# Path to cap3 executables if not in your PATH environment variable
cap3=cap3
#
###########################################################################
## GeneFamilyClassifier ##
###########################################################################
Expand Down Expand Up @@ -57,4 +64,4 @@ raxml=raxmlHPC-PTHREADS-SSE3
# variable
fasttree=FastTreeMP
#
###########################################################################
###########################################################################
21 changes: 14 additions & 7 deletions pipelines/AssemblyPostProcesser
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ my $usage = <<__EOUSAGE__;
#
# --scaffold <string> : Orthogroups or gene families proteins scaffold. This can either be an absolute
# path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1)
# or just the scaffold (e.g., 22Gv1.1). If the latter, $home/data is prepended to
# or just the scaffold (e.g., 22Gv1.1). If the latter, ~home/data is prepended to
# the scaffold to create the absolute path.
# If Angiosperms clusters (version 1.0): 22Gv1.0
# If Angiosperms clusters (version 1.1): 22Gv1.1
Expand All @@ -57,7 +57,7 @@ my $usage = <<__EOUSAGE__;
# --config_dir <string> : (Optional) Absolute path to the directory containing the default configuration files
# for the selected scaffold defined by the value of the --scaffold parameter (e.g.,
# /home/configs/22Gv1.1). If this parameter is not used, the directory containing the
# default configuration files is set to $home/config/$scaffold.
# default configuration files is set to ~home/config/~scaffold.
#
# --strand_specific : If de novo transcriptome assembly was performed with strand-specific library
# Default: not strand-specific
Expand Down Expand Up @@ -111,7 +111,7 @@ my $options = GetOptions ( 'transcripts=s' => \$transcripts,
);

if ($scaffold) {
if File::Spec->file_name_is_absolute($scaffold)) {
if (File::Spec->file_name_is_absolute($scaffold)) {
$scaffold_dir = $scaffold;
$scaffold = basename($scaffold);
} else {
Expand All @@ -120,7 +120,7 @@ if ($scaffold) {
}

if (!$config_dir || !File::Spec->file_name_is_absolute($config_dir)) {
$config_dir = $home/config;
$config_dir = "$home/config";
}

my %utilies;
Expand Down Expand Up @@ -162,7 +162,7 @@ if ($gene_family_search) {
# create output directory
my $dirname ="./assemblyPostProcessing_dir";
if (-d $dirname) { die "Exiting...!\nAssembly post processing output directory ($dirname) already exists!\n\n"; }
mkdir ($dirname, 0755);
make_directory($dirname);

if ( $prediction_method eq "estscan" ) {
unless ( $score_matrices ) { die "Reference score matrices required with ESTScan.\n\n$usage;"; }
Expand All @@ -188,6 +188,13 @@ exit(0);

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub-routines # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Create the directory $new_dir with mode 0755 (subject to the process umask)
# unless it already exists.
#
# Parameters:
#   $new_dir - path of the directory to create; the parent directory must
#              already exist (no recursive creation is attempted).
#
# Dies with the system error message when the directory does not exist and
# cannot be created, so that a failure is reported here rather than by a
# later, unrelated "can't open" error.
sub make_directory {
    my ( $new_dir ) = @_;
    return if -d $new_dir;
    unless ( mkdir($new_dir, 0755) ) {
        # Save errno before the -d test below overwrites it.
        my $error = $!;
        # The directory may have been created by another process between the
        # -d test above and the mkdir call; that is not a failure.
        die "can't create $new_dir directory: $error\n" unless -d $new_dir;
    }
    return;
}

sub run_estscan {
my ($estscan, $transcripts, $matrices, $stranded, $length, $gene_family_search, $out_dir ) = @_;
print localtime()." - Predicting coding regions with ESTScan\n\n";
Expand Down Expand Up @@ -572,7 +579,7 @@ sub targeted_gene_family_assembly {
$genometools, $mafft, $trimal, $transcripts, $score_matrices, $stranded, $dereplicate, $length, $gap_trimming, $num_threads, $out_dir, $scaffold_dir ) = @_;
print localtime()." - Starting targeted gene family assembly\n\n";
my $targeted_gene_families = "$out_dir/targeted_gene_families";
mkdir ($targeted_gene_families, 0755);
make_directory($targeted_gene_families);
my (%target_ids, %contigs, $contig_id);
open (IN, "$target_orthogroups") or die "can't open $target_orthogroups file\n";
while(<IN>){ chomp; if ($_ =~ /^(\d+)/) { $target_ids{$1} = $1; } else { next; } }
Expand All @@ -588,7 +595,7 @@ sub targeted_gene_family_assembly {
next;
}
my $target_out_dir = "$targeted_gene_families/$ortho_id";
mkdir ($target_out_dir, 0755);
make_directory($target_out_dir);
# hmmsearch target profile using post-processed predicted proteins - default parameters
if ($dereplicate) {
system "$hmmsearch -E 10 --cpu $num_threads --noali --tblout $target_out_dir/temp.1.hmm -o $target_out_dir/temp.1.hmm.log $scaffold_dir/hmms/$clustering_method/$ortho_id.hmm $out_dir/transcripts.cleaned.nr.pep >/dev/null 2>/dev/null";
Expand Down
19 changes: 13 additions & 6 deletions pipelines/GeneFamilyClassifier
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ my $usage = <<__EOUSAGE__;
#
# --scaffold <string> : Orthogroups or gene families proteins scaffold. This can either be an absolute
# path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1)
# or just the scaffold (e.g., 22Gv1.1). If the latter, $home/data is prepended to
# or just the scaffold (e.g., 22Gv1.1). If the latter, ~home/data is prepended to
# the scaffold to create the absolute path.
# If Angiosperms clusters (version 1.0): 22Gv1.0
# If Angiosperms clusters (version 1.1): 22Gv1.1
Expand All @@ -47,7 +47,7 @@ my $usage = <<__EOUSAGE__;
# --config_dir <string> : (Optional) Absolute path to the directory containing the default configuration files
# for the selected scaffold defined by the value of the --scaffold parameter (e.g.,
# /home/configs/22Gv1.1). If this parameter is not used, the directory containing the
# default configuration files is set to $home/config.
# default configuration files is set to ~home/config.
#
# --num_threads <int> : number of threads (CPUs) to used for HMMScan, BLASTP, and MAFFT
# Default: 1
Expand Down Expand Up @@ -116,7 +116,7 @@ if (File::Spec->file_name_is_absolute($scaffold)) {
}

if (!$config_dir || !File::Spec->file_name_is_absolute($config_dir)) {
$config_dir = $home/config;
$config_dir = "$home/config";
}

my %utilies;
Expand Down Expand Up @@ -161,7 +161,7 @@ print "-- Scaffold: $scaffolds{$scaffold}\n-- Method: $methods{$method}\n-- Clas
# create output directory
my $dirname ="./geneFamilyClassification_dir";
if (-d $dirname) { die "Exiting...!\nGene family classification output directory ($dirname) already exists!\n\n"; }
mkdir ($dirname, 0755);
make_directory($dirname);

if ( $classifier eq "blastp" ) {
sort_sequences ( $classifier, $blastp, $hmmscan, $proteins, $scaffold, $method, $num_threads, $super_orthogroups, $dirname, $scaffold_dir );
Expand Down Expand Up @@ -189,6 +189,13 @@ exit(0);

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub-routines # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Ensure that the directory $new_dir exists, creating it with mode 0755
# (subject to the process umask) when it is absent.
#
# Parameters:
#   $new_dir - path of the directory to create; only the last path component
#              is created, so the parent directory must already exist.
#
# Dies with the system error message if the directory is absent and cannot
# be created, instead of silently continuing without an output directory.
sub make_directory {
    my ( $new_dir ) = @_;
    if ( !-d $new_dir ) {
        my $created = mkdir($new_dir, 0755);
        # Capture errno now; the -d test below would overwrite it.
        my $error = $!;
        # A concurrent process may have created the directory after the -d
        # test above, in which case mkdir fails but the goal is met.
        if ( !$created && !-d $new_dir ) {
            die "can't create $new_dir directory: $error\n";
        }
    }
    return;
}

sub sort_sequences {
my ( $classifier, $blastp, $hmmscan, $proteins, $scaffold, $method, $num_threads, $super_orthogroups, $dirname, $scaffold_dir ) = @_;
print localtime()." - Sorting protein sequences\n";
Expand Down Expand Up @@ -416,7 +423,7 @@ sub get_orthogroup_fasta {
print "-- ".localtime()." - Retrieving orthogroup fasta files\n\n";
my (%orthos, %pep, %cds);
my $orthogroups = "$dirname/orthogroups_fasta";
mkdir ($orthogroups, 0755);
make_directory($orthogroups);
my $orthogroup_assignment = "proteins.$classifier.$scaffold.bestOrthos";
open (IN, "$dirname/$orthogroup_assignment") or die "can't open $dirname/$orthogroup_assignment file\n";
while (<IN>) {
Expand Down Expand Up @@ -445,7 +452,7 @@ sub get_orthogroup_fasta {
}
if ($single_copy_custom or $single_copy_taxa) {
my $single_copy = "$dirname/single_copy_fasta";
mkdir ($single_copy, 0755);
make_directory($single_copy);
my $single_copy_selection = "proteins.$classifier.$scaffold.bestOrthos.summary.singleCopy";
my (%single_copy_orthos, $single_copy_ortho_id);
open (IN, "$dirname/$single_copy_selection") or die "can't open $dirname/$single_copy_selection file\n";
Expand Down
43 changes: 31 additions & 12 deletions pipelines/PhylogenomicsAnalysis
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ my $usage = <<__EOUSAGE__;
#
# --scaffold <string> : Orthogroups or gene families proteins scaffold. This can either be an absolute
# path to the directory containing the scaffolds (e.g., /home/scaffolds/22Gv1.1)
# or just the scaffold (e.g., 22Gv1.1). If the latter, $home/data is prepended to
# or just the scaffold (e.g., 22Gv1.1). If the latter, ~home/data is prepended to
# the scaffold to create the absolute path.
#
# If Angiosperms clusters (version 1.0): 22Gv1.0
Expand Down Expand Up @@ -99,7 +99,7 @@ my $usage = <<__EOUSAGE__;
# --config_dir <string> : (Optional) Absolute path to the directory containing the default configuration files
# for the selected scaffold defined by the value of the --scaffold parameter (e.g.,
# /home/configs/22Gv1.1). If this parameter is not used, the directory containing the
# default configuration files is set to $home/config.
# default configuration files is set to ~home/config.
#
# --num_threads <int> : number of threads (CPUs) to assign to external utilities (MAFFT, PASTA, and RAxML)
# Default: 1
Expand All @@ -112,7 +112,7 @@ my $usage = <<__EOUSAGE__;
#
# --pasta_script_path <string> : Optional path to the location of the run_pasta.py script. which is used for running PASTA
# from the command line (useful since the script is a .py file). Using this will override
# the default defined in $home/config/plantTribes.
# the default defined in ~home/config/plantTribes.
#
# --orthogroup_fna : Corresponding gene family classification orthogroups CDS fasta files. Files should be in the
# same directory with input orthogroups protein fasta files.
Expand Down Expand Up @@ -181,7 +181,7 @@ my $options = GetOptions ( 'orthogroup_faa=s' => \$orthogroup_faa,
);

if (!$config_dir || !File::Spec->file_name_is_absolute($config_dir)) {
$config_dir = $home/config;
$config_dir = "$home/config";
}

my %utilies;
Expand Down Expand Up @@ -261,7 +261,7 @@ print "\n\n";
# create output directory
my $dirname ="./phylogenomicsAnalysis_dir";
if (-d $dirname) { die "Exiting...!\nPhylogenomics analysis output directory ($dirname) already exists!\n\n"; }
mkdir ($dirname, 0755);
make_directory($dirname);

get_orthogroup_fasta ( $orthogroup_faa, $orthogroup_fna, $scaffold, $method, $dirname, $scaffold_dir );

Expand Down Expand Up @@ -290,12 +290,19 @@ exit(0);

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # sub-routines # # # # # # # # # # # # # # # # # # # # # # # # # # # # #

# Create directory $new_dir (mode 0755, subject to the process umask) if it
# is not already present.
#
# Parameters:
#   $new_dir - path of the directory to create; its parent directory must
#              already exist because creation is not recursive.
#
# Dies with the system error message when the directory is missing and the
# mkdir call fails, so the pipeline stops at the real point of failure.
sub make_directory {
    my ( $new_dir ) = @_;
    return if -d $new_dir;
    return if mkdir($new_dir, 0755);
    # Keep mkdir's errno; the -d test below would replace it.
    my $error = $!;
    # mkdir also fails when another process created the directory after the
    # first -d test; treat that case as success.
    return if -d $new_dir;
    die "can't create $new_dir directory: $error\n";
}

sub get_orthogroup_fasta {
my ( $orthogroup_faa, $orthogroup_fna, $scaffold, $method, $dirname, $scaffold_dir ) = @_;
print localtime()." - Creating orthogroup fasta\n\n";
my (%pep, %cds);
my $orthogroups_fasta = "$dirname/orthogroups_fasta";
mkdir ($orthogroups_fasta, 0755);
make_directory($orthogroups_fasta);
opendir (DIR, "$orthogroup_faa") or die "can't open $orthogroup_faa file\n";
while ( my $filename = readdir(DIR) ) {
if ($filename =~ /^(\d+)\.faa$/) { $pep{$1} = $1; }
Expand All @@ -318,8 +325,9 @@ sub create_orthogroup_alignments {
$scaffold, $method, $num_threads, $max_memory, $pasta_iter_limit, $dirname, $scaffold_dir ) = @_;
print localtime()." - Creating orthogroup alignments\n\n";
my $orthogroups_fasta = "$dirname/orthogroups_fasta";
make_directory($orthogroups_fasta);
my $orthogroups_aln = "$dirname/orthogroups_aln";
mkdir ($orthogroups_aln, 0755);
make_directory($orthogroups_aln);
my %pep;
opendir (DIR, "$orthogroups_fasta") or die "can't open $orthogroups_fasta file\n";
while ( my $filename = readdir(DIR) ) { if ($filename =~ /^(\d+)\.faa$/) { $pep{$1} = $1; } }
Expand All @@ -338,7 +346,7 @@ sub create_orthogroup_alignments {
}
if ($pasta_alignments) {
my $pasta_temp = "$dirname/pasta_temp";
mkdir $pasta_temp, 0755;
make_directory($pasta_temp);
system "python $pasta -d Protein -i $orthogroups_fasta/$ortho_id.faa -o $pasta_temp --max-mem-mb=$max_memory --num-cpus=$num_threads --iter-limit=$pasta_iter_limit >/dev/null";
my $pasta_aln = 0;
opendir (DIR, "$pasta_temp") or die "can't $pasta_temp directory\n";
Expand Down Expand Up @@ -404,7 +412,8 @@ sub trim_orthogroup_alignments {
if ( keys(%unfiltered_aln) != keys(%filtered_aln) ) {
my $orthogroups_fasta = "$dirname/orthogroups_fasta";
my $temp_dir = "$dirname/temp_dir";
mkdir ($temp_dir, 0755);
make_directory($temp_dir);

get_sequences ( $orthogroups_fasta, $temp_dir, $ortho_id, \%filtered_aln, $codon_alignments );
}
}
Expand Down Expand Up @@ -439,8 +448,9 @@ sub build_orthogroup_trees {
$gap_trimming, $remove_sequences, $raxml, $fasttree, $num_threads, $scaffold, $dirname, $config_dir ) = @_;
print localtime()." - Building orthogroup phylogenetic trees\n\n";
my $orthogroups_aln = "$dirname/orthogroups_aln";
make_directory($orthogroups_aln);
my $orthogroups_tree = "$dirname/orthogroups_tree";
mkdir ($orthogroups_tree, 0755);
make_directory($orthogroups_tree);
my ($seq_type, $aln_type, $max_seq, $min_seq);
if ($sequence_type and ($sequence_type eq "dna")) { $seq_type = "fna"; } else { $seq_type = "faa"; }
if ($remove_sequences) { $aln_type = "filter"; }
Expand Down Expand Up @@ -595,7 +605,7 @@ sub prepare_alignments {
$gap_trimming, $remove_sequences, $num_threads, $max_memory, $pasta_iter_limit, $dirname) = @_;
my $orthogroups_aln = "$dirname/orthogroups_aln";
my $temp_aln = "$temp_dir/aln_dir";
mkdir $temp_aln, 0755;
make_directory($temp_aln);
my %pep;
opendir (DIR, "$temp_dir") or die "can't open $temp_dir file\n";
while ( my $filename = readdir(DIR) ) { if ($filename =~ /^(\d+)\.faa$/) { $pep{$1} = $1; } }
Expand All @@ -609,8 +619,17 @@ sub prepare_alignments {
if ($seq_count < 3) { next; }
if ($pasta_alignments){
my $pasta_temp = "$dirname/pasta_temp";
mkdir $pasta_temp, 0755;
make_directory($pasta_temp);
system "python $pasta -d Protein -i $temp_dir/$ortho_id.faa -o $pasta_temp --max-mem-mb=$max_memory --num-cpus=$num_threads --iter-limit=$pasta_iter_limit >/dev/null 2>/dev/null";
my $pasta_aln = 0;
opendir (DIR, "$pasta_temp") or die "can't $pasta_temp directory\n";
while ( my $filename = readdir(DIR)) {
if ($filename =~ /\.aln$/){ $pasta_aln++; }
}
if($pasta_aln == 0) {
print "PASTA multiple alignments was not successful.\nEither PASTA is properly installed or your input fasta file is not in the required format.\nTerminating...\n\n";
exit(0);
}
system "mv $pasta_temp/*.$ortho_id.faa.aln $temp_aln/$ortho_id.faa.aln";
system "rm -r $pasta_temp";
}
Expand Down

0 comments on commit 317e65f

Please sign in to comment.