reslp · Jun 16, 2020
diff --git a/‎Snakefile_CH
+17 b/‎Snakefile_CH
+17
diff --git a/‎bin/augustus.PASS2.sh
+73 b/‎bin/augustus.PASS2.sh
+73
diff --git a/‎bin/cleanup.sh
+17 b/‎bin/cleanup.sh
+17
diff --git a/‎bin/convert_repeatmasker_gff_to_MAKER_compatible_gff.sh
+4 b/‎bin/convert_repeatmasker_gff_to_MAKER_compatible_gff.sh
+4
diff --git a/‎bin/count_length.sh
+38 b/‎bin/count_length.sh
+38
diff --git a/‎bin/gff_to_gbk.py
+65 b/‎bin/gff_to_gbk.py
+65
diff --git a/‎bin/merging.sh
+45 b/‎bin/merging.sh
+45
diff --git a/‎bin/setup_Repeatmasker.sh
+26 b/‎bin/setup_Repeatmasker.sh
+26
diff --git a/‎bin/snap.p1.sh
+34 b/‎bin/snap.p1.sh
+34
diff --git a/‎bin/snap.p2.sh
+28 b/‎bin/snap.p2.sh
+28
diff --git a/‎data/cluster_config_vsc4_CH.yaml
+123 b/‎data/cluster_config_vsc4_CH.yaml
+123
diff --git a/‎data/config_CH.yaml
+31 b/‎data/config_CH.yaml
+31
diff --git a/‎data/data_CH.csv
+2 b/‎data/data_CH.csv
+2
diff --git a/‎rules/functions.smk
+112 b/‎rules/functions.smk
+112
diff --git a/‎rules/maker_part_one.smk
+243 b/‎rules/maker_part_one.smk
+243
diff --git a/‎rules/maker_post_repeats.smk
+521 b/‎rules/maker_post_repeats.smk
+521
diff --git a/‎rules/repeats.smk
+165 b/‎rules/repeats.smk
+165
diff --git a/‎rules/setup_maker.smk
+35 b/‎rules/setup_maker.smk
+35
@@ -0,0 +1,17 @@
+# Entry point of the workflow: loads the configuration, pulls in the rule
+# modules and declares the final targets.
+# NOTE(review): this diff adds data/config_CH.yaml, yet the path below is
+# data/config.yaml - confirm this is intended (or that the config file is
+# supplied on the command line).
+configfile: "data/config.yaml"
+
+# Rules listed here are declared as local rules.
+localrules: initiate, all
+
+# rules/functions.smk builds the 'samples' and 'units' tables that are used
+# by 'rule all' below.
+include: "rules/setup_maker.smk"
+include: "rules/functions.smk"
+include: "rules/maker_part_one.smk"
+include: "rules/repeats.smk"
+include: "rules/maker_post_repeats.smk"
+
+# Final targets:
+#  - one MAKER output tarball per (sample, unit) for PASS1 and PASS2
+#  - the RepeatModeler cleanup flag per sample
+#  - the merged PASS2 annotation (gff) per sample
+rule all:
+	input:
+		expand("results/{unit.sample}/MAKER.PASS1/{unit.unit}/{unit.sample}.{unit.unit}.maker.output.tar.gz", unit=units.itertuples()),
+		expand("results/{unit.sample}/MAKER.PASS2/{unit.unit}/{unit.sample}.{unit.unit}.maker.output.tar.gz", unit=units.itertuples()),
+		expand("results/{name}/REPEATMODELER/repeatmodeler.cleanup.ok", name=samples.index.tolist()),
+		expand("results/{name}/MAKER.PASS2/{name}.all.maker.gff", name=samples.index.tolist())
+
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+threads=$1
+prefix=$2
+fasta=$3
+proteins=$4
+aed=$5
+local_config=$6
+training_params=$7
+cdna=$8
+
+AUGUSTUS_CONFIG_PATH=$local_config
+
+basedir=$(pwd)
+#prepare training parameters from BUSCO
+if [ ! -z $training_params ]
+then
+	echo -e "[$(date)]\tPreparing training set from previous Augustus run"
+	#get local copy of Augustus parameters from previous training round
+	cp -fr $training_params $local_config/species/$prefix
+
+	#rename files to current prefix
+	cd $local_config/species/$prefix
+	base=$(ls *weightmatrix.txt | sed 's/_weightmatrix.txt//')
+	#rename files
+	for file in $(ls -1); do new=$(echo -e "$file" | sed "s/$base/$prefix/g"); mv $file $new; done
+	#rename the files cited within certain HMM configuration files
+	sed -i "s/$base/$prefix/g" $prefix\_parameters.cfg
+	sed -i "s/$base/$prefix/g" $prefix\_parameters.cfg.orig1
+
+	cd $basedir
+fi
+
+if [ ! -z $aed ]
+then
+	echo -e "[$(date)]\tFiltering proteins with AED > $aed"
+	#extract only proteins with AED < x
+	cat <(echo -e "$aed") <(cat $proteins | perl -ne 'chomp; if ($_ =~ /^>/){print "\n$_\n"}else{print "$_"}' | grep -v "^$") | \
+perl -ne 'chomp; if ($. == 1){$AED = $_}else{$h=$_; $s=<>; @a=split(" "); $a[2] =~ s/AED://; if ($a[2] < $AED){print "$h\n$s"}}' | sed 's/ .*//' > $prefix.AED-st$aed.maker.proteins.fasta
+	proteins=$prefix.AED-st$aed.maker.proteins.fasta
+fi
+
+if [ -f "$cdna" ]
+then
+	cmd="autoAug.pl --genome=$fasta --species=$prefix --trainingset=$proteins --cdna=$cdna --singleCPU --threads $threads -v --useexisting"
+	echo -e "[$(date)]\tRunning autoAug.pl with cdna evidence:\n$cmd"
+	$cmd
+else
+	cmd="autoAug.pl --genome=$fasta --species=$prefix --trainingset=$proteins --singleCPU --threads $threads -v --useexisting"
+	echo -e "[$(date)]\tRunning autoAug.pl without cdna evidence:\n$cmd"
+	$cmd
+fi
+retVal=$?
+
+if [ ! $retVal -eq 0 ]
+then
+	if [ -s "$(pwd)/autoAug/autoAugPred_abinitio/predictions/augustus.gff" ]
+	then
+		>&2 echo "Augustus ended in an error, but abinitio predictions are there - continuing .."
+	else
+		>&2 echo "Augustus ended in an error"
+		exit $retVal
+	fi
+fi
+
+#copy the training set that was produced
+cp -rf $local_config/species/$prefix .
+
+echo -e "[$(date)]Reformatting to $(pwd)/autoAug/autoAugPred_abinitio/predictions/augustus.gff to GFF3 -> $(pwd)/augustus.gff3"
+cat autoAug/autoAugPred_abinitio/predictions/augustus.gff | perl -ne 'chomp; @a=split(/\t/); if ($a[2] eq 'gene'){$id=$a[-1]; $a[-1] =~ s/^/ID=/; print join("\t", @a)."\n"}else{if ($_ =~ /;$/){print "$_ Parent=$id\n"}else{print "$_; Parent=$id\n"}}' | sed 's/; /;/g' | sed 's/ /=/g' > augustus.gff3
+
+#cat $(pwd)/autoAug/autoAugPred_abinitio/predictions/augustus.gff | perl -ne 'chomp; @a=split(/\t/); if ($a[2] eq 'gene'){$id=$a[-1]; $a[-1] =~ s/^/ID=/; print join("\t", @a)."\n"}else{if ($_ =~ /;$/){print "$_ Parent=$id\n"}else{print "$_; Parent=$id\n"}}' | sed 's/; /;/g' | sed 's/ /=/g' > $(pwd)/autoAug/autoAugPred_abinitio/predictions/augustus.gff3 
+
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+dir=$1
+
+echo -e "\n$(date)\tStarting ...\n"
+
+echo -e "[$(date)]\t$dir -> $dir.tar.gz"
+tar cfz $dir.tar.gz $dir
+if [ $? -eq 0 ]
+then
+	rm -rf $dir
+else
+	echo -e "Some problem with $dir"
+fi
+
+echo -e "\n$(date)\tFinished!\n"
+
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+cat $1 | \
+grep -v -e "Satellite" -e ")n" -e "-rich" | perl -ne '$id; if(!/^\#/){chomp; $_ =~ s/\r//g; $id++; print "$_;ID=$id\n"}else{print "$_"}'
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+#./bin/count_length.sh test.fa.gz 100000 5000 count > check
+#./bin/count_length.sh test.fa.gz 100000 5000 split > check
+#check
+#sha256sum -c <(cut -f 1 check) 
+#only check for the file in question
+#sha256sum -c <(grep -P "test.fa.gz\t" check| cut -f 1)
+
+f=$1
+l=$2
+m=$3
+mode=$4 #could be either 'count' or 'split'
+
+if [ $(echo $f | rev | cut -c 1-3 | rev) == ".gz" ]
+then
+#	echo gzipped
+	if [ "$mode" == "count" ]
+	then
+		paste <(sha256sum $f) <(echo -e "$l\t$m\t$(cat <(echo -e "$l\t$m") <(zcat $f) | perl -ne 'chomp; if ($. == 1){@a=split("\t"); $cutoff=$a[0]; $minlen=$a[-1]; $counter=1; }else{if ($_ =~ /^>/){if ($. > 2){if ($length >= $minlen){$cum_length+=$length; if ($cum_length >= $cutoff){$counter++; $cum_length=0; }}} $length = 0}else{$length+=length($_)}}}; if (eof()){print "$counter\n"')")
+	fi
+	if [ "$mode" == "split" ]
+	then
+		paste <(sha256sum $f) <(echo -e "$l\t$m\t$(cat <(echo -e "$l\t$m") <(zcat $f) | perl -ne 'chomp; if ($. == 1){@a=split("\t"); $cutoff=$a[0]; $minlen=$a[-1]; $counter=1; open(FH, ">", sprintf("%04d", $counter).".fasta")}else{if ($_ =~ /^>/){if ($. > 2){if ($length >= $minlen){$cum_length+=$length; print FH "$header\n$seq\n"; $header = $_; $seq = ""; if ($cum_length >= $cutoff){close FH; $counter++; open(FH, ">", sprintf("%04d", $counter).".fasta"); $cum_length=0; }}}; $header = $_; $seq = ""; $length = 0}else{$length+=length($_); $seq.=$_}}}; if (eof()){print "$counter\n"; if ($length >= $minlen){print FH "$header\n$seq\n"}')")
+	fi
+else
+#	echo not gzipped
+	if [ "$mode" == "count" ]
+	then
+		paste <(sha256sum $f) <(echo -e "$l\t$m\t$(cat <(echo -e "$l\t$m") $f | perl -ne 'chomp; if ($. == 1){@a=split("\t"); $cutoff=$a[0]; $minlen=$a[-1]; $counter=1; }else{if ($_ =~ /^>/){if ($. > 2){if ($length >= $minlen){$cum_length+=$length; if ($cum_length >= $cutoff){$counter++; $cum_length=0; }}} $length = 0}else{$length+=length($_)}}}; if (eof()){print "$counter\n"')")
+	fi
+	if [ "$mode" == "split" ]
+	then
+		paste <(sha256sum $f) <(echo -e "$l\t$m\t$(cat <(echo -e "$l\t$m") $f | perl -ne 'chomp; if ($. == 1){@a=split("\t"); $cutoff=$a[0]; $minlen=$a[-1]; $counter=1; open(FH, ">", sprintf("%04d", $counter).".fasta")}else{if ($_ =~ /^>/){if ($. > 2){if ($length >= $minlen){$cum_length+=$length; print FH "$header\n$seq\n"; $header = $_; $seq = ""; if ($cum_length >= $cutoff){close FH; $counter++; open(FH, ">", sprintf("%04d", $counter).".fasta"); $cum_length=0; }}}; $header = $_; $seq = ""; $length = 0}else{$length+=length($_); $seq.=$_}}}; if (eof()){print "$counter\n"; if ($length >= $minlen){print FH "$header\n$seq\n"}')")
+	fi
+fi
+
+#cat <(echo -e ">\t$l\t$m") <(zcat $f) | perl -ne 'if ($_ =~ /^>/){$header = $_; $seq = ""; if ($. > 1){if ($length >= $minlen){$cum_length+=$length; print FH "$header\n$seq\n"} if ($cum_length >= $cutoff){$counter++; close FH; open(FH, '>', $counter.".fasta"); $cum_length=0}}else{chomp; @a=split("\t"); $cutoff=$a[-2]; $minlen=$a[-1]}; $length = 0}else{$length+=(length($_)-1); $seq+=$_}; if (eof()){$counter++; print "$counter\n"}'
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+"""Convert a GFF and associated FASTA file into GenBank format.
+Usage:
+    gff_to_genbank.py <GFF annotation file> <FASTA sequence file>
+"""
+from __future__ import print_function
+
+import sys
+import os
+
+from Bio import SeqIO
+from Bio.Alphabet import generic_dna
+import Bio.Alphabet as ab
+from Bio import Seq
+import random as rand
+
+from BCBio import GFF
+
+def main(gff_file, fasta_file):
+    out_file = "%s.gb" % os.path.splitext(gff_file)[0]
+    fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
+    gff_iter = GFF.parse(gff_file, fasta_input)
+    SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
+
+def _fix_ncbi_id(fasta_iter):
+    """GenBank identifiers can only be 16 characters; try to shorten NCBI.
+    """
+    for rec in fasta_iter:
+        if len(rec.name) > 16: 
+            new_id = rec.name[:8] + str(rand.randint(1,8000000))
+            print("Warning: shortening NCBI name %s to %s" % (rec.id, new_id))
+            rec.id = new_id
+            rec.name = new_id
+        yield rec
+
+def _check_gff(gff_iterator):
+    """Check GFF files before feeding to SeqIO to be sure they have sequences.
+    """
+    for rec in gff_iterator:
+        if isinstance(rec.seq, Seq.UnknownSeq):
+            print("Warning: FASTA sequence not found for '%s' in GFF file" % (
+                    rec.id))
+        rec.seq.alphabet = generic_dna
+        yield _flatten_features(rec)
+
+def _flatten_features(rec):
+    """Make sub_features in an input rec flat for output.
+    GenBank does not handle nested features, so we want to make
+    everything top level.
+    """
+    out = []
+    for f in rec.features:
+        cur = [f]
+        while len(cur) > 0:
+            nextf = []
+            for curf in cur:
+                out.append(curf)
+                if len(curf.sub_features) > 0:
+                    nextf.extend(curf.sub_features)
+            cur = nextf
+    rec.features = out
+    return rec
+
+if __name__ == "__main__":
+    main(*sys.argv[1:])
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Merge the per-partition MAKER results found below the current directory
+# into per-sample files and split the merged gff by evidence source.
+# Usage: merging.sh <prefix>
+# All files are expected to be named <prefix>.<something>.<suffix>.
+
+prefix=$1
+
+#get going
+echo -e "\n$(date)\tStarting ...\n"
+
+#combine results (this could be incorporated in the previous script)
+#combine all gffs without fasta sequences
+cat $(find ./ -name "$prefix.*.noseq.maker.gff" | sort) > $prefix.noseq.maker.gff
+#combine all gffs and add FASTA sequences from all
+#(the merged annotation, one ##FASTA marker, then everything that follows the
+# ##FASTA marker in each partition gff, with the markers themselves removed)
+cat $prefix.noseq.maker.gff <(echo -e "##FASTA") <(for f in $(find ./ -name "$prefix.*.all.maker.gff" | sort); do cat $f | perl -ne 'chomp; if ($_ =~ /^##FASTA/){$ok=1}; if ($ok){print "$_\n"}'; done | grep -v "^##FASTA") > $prefix.all.maker.gff
+#combine all proteins
+cat $(find ./ -name "$prefix.*.all.maker.proteins.fasta" | sort) > $prefix.all.maker.proteins.fasta
+#combine all transcripts
+cat $(find ./ -name "$prefix.*.all.maker.transcripts.fasta" | sort) > $prefix.all.maker.transcripts.fasta
+
+#extract gff by evidence (selection on the second column of the gff)
+# transcript alignments
+awk '{ if ($2 ~ "est2genome") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.est2genome.gff
+awk '{ if ($2 ~ "cdna2genome") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.cdna2genome.gff
+# protein alignments
+awk '{ if ($2 ~ "protein2genome") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.protein2genome.gff
+# repeat alignments
+awk '{ if ($2 ~ "repeat") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.repeats.gff
+
+#genes predicted by snap
+awk '{ if ($2 ~ "snap") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.snap.gff
+#genes predicted by augustus
+awk '{ if ($2 ~ "augustus") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.augustus.gff
+#genes predicted by maker
+awk '{ if ($2 ~ "maker") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.maker.gff
+
+#rename genes/transcripts
+#create backups
+cp $prefix.all.maker.gff $refix.all.maker.backup.gff
+cp $prefix.all.maker.proteins.fasta $prefix.all.maker.proteins.backup.fasta
+cp $prefix.all.maker.transcripts.fasta $prefix.all.maker.transcripts.backup.fasta
+
+maker_map_ids --prefix $prefix --justify 5 --suffix - --iterate 1 $prefix.all.maker.gff > $prefix.makerID2short.map
+map_gff_ids $prefix.makerID2short.map $prefix.all.maker.gff
+map_fasta_ids $prefix.makerID2short.map $prefix.all.maker.transcripts.fasta
+map_fasta_ids $prefix.makerID2short.map $prefix.all.maker.proteins.fasta
+
+echo -e "\n$(date)\tFinished!\n"
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+#use as ./setup_Repeatmasker.sh destination/ external/RepBaseRepeatMaskerEdition-20181026.tar.gz
+
+destination=$1
+repeattarball=$2
+
+#take care of RepeatMasker
+cp -pfr /usr/local/RepeatMasker $destination
+#cd $destination/RepeatMasker
+#perl ./rebuild
+#cd -
+tar xvfz $repeattarball -C $destination/RepeatMasker/
+
+#This is a custom step that checks the metadata against the actual sequence data and removes those that are not present in the metadata
+cd $destination/RepeatMasker
+echo "Checking Repbase metadata against sequence data"
+cat Libraries/RMRBSeqs.embl | grep "^ID " | sed 's/^ID   //' | cut -d " " -f 1 | sort -n | uniq | perl -ne 'chomp; print "$_\n$_\n"' > comp1
+cat Libraries/RMRBMeta.embl | grep "^ID " | sed 's/^ID   //' | cut -d " " -f 1 | sort -n | uniq > comp2
+cat comp1 comp2 | sort -n | uniq -c | grep "   1 " | sed 's/^ .*1 //' > missing
+for m in $(cat missing); do sed -i "/$m/,/\/\//d" Libraries/RMRBMeta.embl; done
+rm comp1 comp2 missing
+
+#BUild the repeatdatabases
+perl ./rebuild
+
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+prefix=$1
+gff=$2
+fasta=$3
+
+
+echo -e "[$(date)]\tConvert CEGMA gff to SNAP input"
+cegma2zff $gff $fasta
+retVal=$(( retVal + $? ))
+
+echo -e "[$(date)]\tgather some stats and validate"
+fathom genome.ann genome.dna -gene-stats > gene-stats.log 2>&1
+fathom genome.ann genome.dna -validate > validate.log 2>&1
+retVal=$(( retVal + $? ))
+
+echo -e "[$(date)]\tcollect the training sequences and annotations, plus 1000 surrounding bp for training"
+fathom genome.ann genome.dna -categorize 1000
+fathom -export 1000 -plus uni.ann uni.dna
+retVal=$(( retVal + $? ))
+
+echo -e "[$(date)]\tcreate the training parameters"
+forge export.ann export.dna
+retVal=$(( retVal + $? ))
+
+echo -e "[$(date)]\tassemble the HMMs"
+hmm-assembler.pl $prefix . > $prefix.cegma.snap.hmm
+retVal=$(( retVal + $? ))
+
+if [ ! $retVal -eq 0 ]
+then
+	echo "There was some error" 1>&2
+	exit $retVal
+fi
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+prefix=$1
+gff=$2
+aed=$3
+
+#Extract gene models with AED <= x
+echo -e "[$(date)]\tIdentify gene models with AED score > $aed"
+cat <(echo -e "$aed") <(cat $gff | grep -P "\tmRNA") | perl -ne 'chomp; if ($. == 1){$aed_max=$_}else{@a=split("\t"); @b=split(";",$a[8]);for (@b){if ($_ =~ /_AED/){$_ =~ s/_AED=//; $AED=$_; if ($AED > $aed_max){$b[0] =~ s/ID=//; print "$b[0]\n"; $b[0] =~ s/-mRNA.*//; print "$b[0];\n"}}}}' > gt.$aed.ids.txt
+
+echo -e "[$(date)]\tExclude these gene models from gff -> remainder written to file: MAKER.st$aed.maker.gff"
+grep -v -f gt.$aed.ids.txt $gff > MAKER.st$aed.maker.gff
+
+echo -e "[$(date)]\tConvert MAKER gff to SNAP input"
+maker2zff -n MAKER.st$aed.maker.gff
+
+echo -e "[$(date)]\tgather some stats and validate"
+fathom genome.ann genome.dna -gene-stats > gene-stats.log 2>&1
+fathom genome.ann genome.dna -validate > validate.log 2>&1
+echo -e "[$(date)]\tcollect the training sequences and annotations, plus 1000 surrounding bp for training"
+fathom genome.ann genome.dna -categorize 1000
+fathom -export 1000 -plus uni.ann uni.dna
+echo -e "[$(date)]\tcreate the training parameters"
+forge export.ann export.dna
+echo -e "[$(date)]\tassemble the HMMs"
+hmm-assembler.pl $prefix . > $prefix.MAKER.st$aed.snap.hmm
+
+ln -s $prefix.MAKER.st$aed.snap.hmm $prefix.MAKER_PASS1.snap.hmm
@@ -0,0 +1,123 @@
+__default__:
+   J: sm
+   n: 1
+   ntasks: 1
+   ntasks-per-node: 1
+   mem: 4G
+   hint: memory_bound
+   qos: mem_0096
+   partition: mem_0096
+   time: "00:10:00"
+   output: $(pwd)/log/%x-%j.stdout.txt
+   error: $(pwd)/log/%x-%j.stderr.txt
+all:
+   J: SnaMaall
+setup_maker:
+   J: SETUP
+initiate:
+   J: INIT
+genemark:
+   J: GEM
+   mem: 10G
+   time: "70:00:00"
+busco:
+   J: BUS1
+#   ntasks: 10
+#   ntasks-per-node: 10
+   mem: 20G
+   time: "70:00:00"
+cegma:
+   J: CEG1
+#   ntasks: 10
+#   ntasks-per-node: 10
+   mem: 10G
+   time: "70:00:00"
+snap_pass1:
+   J: SNA1
+#   ntasks: 1
+#   ntasks-per-node: 1
+repeatmodeler:
+   J: RMO1
+#   ntasks: 10
+#   ntasks-per-node: 10
+   mem: 10G
+   time: "70:00:00"
+cleanup_repeatmodeler:
+   J: cRMO
+   time: "70:00:00"
+repeatmasker:
+   J: RMA1
+#   ntasks: 10
+#   ntasks-per-node: 10
+   mem: 20G
+   time: "70:00:00"
+prepare_protein_evidence:
+   J: CDH0
+#   ntasks: 8
+#   ntasks-per-node: 8
+   mem: 20G
+   time: "01:00:00"
+split:
+   J: SPL0
+#   ntasks: 1
+#   ntasks-per-node: 1
+   mem: 10G
+   time: "01:00:00"
+initiate_MAKER_PASS1:
+   J: iMP1
+#   ntasks: 2
+#   ntasks-per-node: 2
+run_MAKER_PASS1:
+   J: rMP1
+#   ntasks: 20
+#   ntasks-per-node: 20
+   mem: 20G
+   time: "70:00:00"
+cleanup_MAKER_PASS1:
+   J: cMP1
+#   ntasks: 1
+#   ntasks-per-node: 1
+   mem: 4G
+   time: "70:00:00"
+merge_MAKER_PASS1:
+   J: mMP1
+#   ntasks: 1
+#   ntasks-per-node: 1
+   mem: 4G
+   time: "01:00:00"
+snap_pass2:
+   J: SNA2
+#   ntasks: 1
+#   ntasks-per-node: 1
+   mem: 4G
+   time: "00:30:00"
+AUGUSTUS_PASS2:
+   J: AUG2
+#   ntasks: 2
+#   ntasks-per-node: 2
+   mem: 20G
+   time: "70:00:00"
+pick_augustus_training_set:
+   J: PAM
+initiate_MAKER_PASS2:
+   J: iMP2
+#   ntasks: 2
+#   ntasks-per-node: 2
+run_MAKER_PASS2:
+   J: rMP2
+#   ntasks: 20
+#   ntasks-per-node: 20
+   mem: 20G
+   time: "70:00:00"
+cleanup_MAKER_PASS2:
+   J: cMP2
+#   ntasks: 1
+#   ntasks-per-node: 1
+   mem: 4G
+   time: "70:00:00"
+merge_MAKER_PASS2:
+   J: mMP2
+#   ntasks: 2
+#   ntasks-per-node: 2
+   mem: 4G
+   time: "01:00:00"
@@ -0,0 +1,31 @@
+samples: "data/data.csv"
+
+split_batch_length: 3000000
+split_min_length: 1000
+maker_tarball: data/external/maker-2.31.10.tgz
+RepbaseRepeatMaskerEdition: data/external/RepBaseRepeatMaskerEdition-20181026.tar.gz
+
+genemark:
+   genemark_dir: data/external/gmes_linux_64
+   gmes_petap_params:
+busco:
+   set: arthropoda_odb9
+   species: fly
+
+aed:
+   snap_pass2: "0.2"
+   AUGUSTUS_PASS2: ["0.0", "0.1", "0.2"]
+
+cdhit:
+   similarity: "0.98"
+
+threads:
+   genemark: 8
+   busco: 8
+   cegma: 8
+   repeatmodeler: 8
+   repeatmasker: 8
+   prepare_protein_evidence: 8
+   run_MAKER_PASS1: 8
+   AUGUSTUS_PASS2: 10
+   run_MAKER_PASS2: 8
@@ -0,0 +1,2 @@
+sample	fasta	species
+T1	data/genomes/Testspecies.fasta	Testus testus
@@ -0,0 +1,112 @@
+
+import glob
+import os
+import subprocess
+from math import ceil
+from pathlib import Path
+from subprocess import call
+
+import pandas as pd
+
+
+# partitioning parameters taken from the workflow configuration
+# (note: 'min' shadows the builtin of the same name from here on)
+n=int(config["split_batch_length"])
+min=int(config["split_min_length"])
+
+# sample sheet: tab separated, indexed by the 'sample' column (which is also
+# kept as a regular column)
+samples = pd.read_csv(config["samples"], sep="\t").set_index("sample", drop=False)
+samples.index.names = ["sample_id"]
+
+# collects one (sample, unit) pair per genome partition
+dic = {'sample': [], 'unit': []}
+
+def get_assembly_path(wildcards):
+# this is to get the path to the assembly from the CSV file
+	return samples.loc[wildcards.sample, ["fasta"]].to_list()
+
+def get_transcripts_path(wildcards, p="data/transcripts/*"):
+	#get paths to fasta transcript fasta files - if file has prefix identical to sample prefix in data.csv -> assume it's a transcriptome of this species -> MAKER 'est' option
+	dic = {'alt_ests': [], 'ests': []}
+	for f in glob.glob(p):
+		if f.split("/")[-1].startswith(wildcards.sample):
+			dic['ests'].append(os.path.abspath(f))
+		else:
+			dic['alt_ests'].append(os.path.abspath(f))
+	return dic
+
+
+def partition_by_length(fasta, max_length=n, min_length=min, pr=0, outdir="./"):
+#function that partitions the fasta file
+	headers = []
+	seqs = []
+	i=0
+	cum_length=0
+	printcount=1
+	for line in open(str(fasta)).readlines():
+		if line.strip().startswith(">"):
+			headers.append(line.strip())
+			seqs.append("")
+			if i >= 1:
+				if len(seqs[-2]) >= min_length:
+					cum_length+=len(seqs[-2])
+#					print("%s\t%s\t%s" %(headers[-2], len(seqs[-2]), cum_length))
+				else:
+					del headers[-2]
+					del seqs[-2]
+			if cum_length >= max_length:
+				if pr:
+					if not os.path.exists(outdir+"/"+str(printcount).zfill(4)):
+						os.mkdir(outdir+"/"+str(printcount).zfill(4))
+					fh = open(outdir+"/"+str(printcount).zfill(4)+"/p0001", 'w')
+#					print("%s\t%s" %(str(printcount).zfill(4), cum_length)) #"{:04d}".format(printcount))
+					for j in range(len(headers)-1):
+						fh.write("%s\n%s\n" %(headers[j],seqs[j]))
+					fh.close()
+				for j in reversed(range(len(headers)-1)):
+					del headers[j]
+					del seqs[j]
+					cum_length=len(seqs[-1])
+#				print("the lenght is again: %s" %len(headers))
+				printcount+=1
+			i+=1
+		else:
+			seqs[-1] = seqs[-1]+line.strip()
+
+	if pr:
+		if not os.path.exists(outdir+"/"+str(printcount).zfill(4)):
+			os.mkdir(outdir+"/"+str(printcount).zfill(4))
+		fh = open(outdir+"/"+str(printcount).zfill(4)+"/p0001", 'w')
+
+#		print("%s\t%s" %(str(printcount).zfill(4), cum_length+len(seqs[-1])))
+		for j in range(len(headers)):
+			fh.write("%s\n%s\n" %(headers[j],seqs[j]))
+		fh.close()
+
+	if not pr:
+		return printcount
+
+unitdict = {}
+print("Counting partitions (batchsize >= "+str(n)+"bp, minimum length = "+str(min)+"bp) ..")
+for sample in samples.index.values.tolist():
+    print("\t"+sample+" - n=", end='')
+    count = subprocess.run("bash ./bin/count_length.sh %s %i %i count" %(samples.fasta[sample], n, min), shell=True, stdout=subprocess.PIPE)
+    counter = int(count.stdout.decode('utf-8').split("\t")[-1])
+
+
+#    counter=partition_by_length(str(samples.fasta[sample]), max_length=n, min_length=min, pr=0) 
+    print(counter)
+    print("\t"+count.stdout.decode('utf-8').split("\t")[0])
+    unitdict[sample] = []
+    for i in range(1,counter+1):
+        dic['sample'].append(sample)
+        dic['unit'].append(str(i).zfill(4))
+	unitdict[sample].append(str(i).zfill(4))	
+	
+#print(unitdict)
+##print dic
+
+# One row per (sample, unit) pair, indexed by both columns; the columns are
+# kept so that rows can be accessed as unit.sample / unit.unit via itertuples().
+units = pd.DataFrame(dic).set_index(['sample','unit'], drop=False)
+# debugging aids:
+#print(units)
+#print(units.index.tolist())
+#for row in units.itertuples():
+#    print(row)
+
+units.index.names = ["sample_id", "unit_id"]
+units.index = units.index.set_levels(
+    [i.astype(str) for i in units.index.levels])  # enforce str in index
@@ -0,0 +1,243 @@
+# Creates the result directory of a sample (if necessary) and the flag file
+# results/<sample>/<sample>.ok once the MAKER setup is available.
+rule initiate:
+	input:
+		rules.setup_maker.output
+	output:
+		"results/{sample}/{sample}.ok"
+	params:
+		prefix = "{sample}"
+	shell:
+		"""
+		[[ -d results/{params.prefix} ]] || mkdir results/{params.prefix}
+		touch {output}
+		"""
+
+# Splits the assembly of a sample into partitions with bin/count_length.sh
+# ('split' mode, run inside results/<sample>/GENOME_PARTITIONS) and
+# concatenates all resulting *.fasta files into the length filtered assembly
+# <sample>.min<minimum length>.fasta.
+# NOTE(review): the declared log files are not written by the shell block,
+# and input.fasta is addressed relative to the working directory
+# (../../../) - confirm that absolute assembly paths are not expected.
+rule split:
+	input:
+		fasta = get_assembly_path,
+		ok = rules.initiate.output
+	params:
+		prefix = "{sample}",
+		len = n,
+		min = min
+	log:
+		stdout = "results/{sample}/logs/split.{sample}.stdout.txt",
+		stderr = "results/{sample}/logs/split.{sample}.stderr.txt"
+	output:
+		ok = "results/{sample}/GENOME_PARTITIONS/splitting.ok",
+		fasta = "results/{sample}/GENOME_PARTITIONS/{sample}.min"+str(min)+".fasta"
+	shell:
+		"""
+		echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
+		basedir=$(pwd)
+
+		cd results/{params.prefix}/GENOME_PARTITIONS/
+		bash $basedir/bin/count_length.sh ../../../{input.fasta} {params.len} {params.min} split
+
+		retVal=$?
+
+                if [ ! $retVal -eq 0 ]
+                then
+                        echo "Splitting ended in an error"
+                        exit $retVal
+                else
+                        touch ../../../{output.ok}
+			cat *.fasta > ../../../{output.fasta}
+                fi
+
+		echo -e "\n$(date)\tFinished!\n"
+		"""
+
+# Runs GeneMark-ES (gmes_petap.pl -ES) on the length filtered assembly inside
+# results/<sample>/GENEMARK. Additional gmes_petap.pl options can be given via
+# genemark/gmes_petap_params in the configuration (left empty -> "None").
+# The license key is linked from <genemark_dir>/gm_key.
+rule genemark:
+	input:
+		ok = rules.initiate.output,
+		fasta = rules.split.output.fasta
+	params:
+		prefix = "{sample}",
+		genemark_dir = config["genemark"]["genemark_dir"],
+		gmes_petap_params = config["genemark"]["gmes_petap_params"]
+	threads: config["threads"]["genemark"]
+	singularity:
+		"docker://chrishah/premaker-plus:18"
+	log:
+		stdout = "results/{sample}/logs/GENEMARK.{sample}.stdout.txt",
+		stderr = "results/{sample}/logs/GENEMARK.{sample}.stderr.txt"
+	output:
+		ok = "results/{sample}/GENEMARK/genemark.status.ok",
+		model = "results/{sample}/GENEMARK/gmhmm.mod"
+	shell:
+		"""
+		echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
+		basedir=$(pwd)
+
+		if [[ ! -d results/{params.prefix}/GENEMARK ]]
+		then
+			mkdir results/{params.prefix}/GENEMARK
+		else
+			if [ "$(ls -1 results/{params.prefix}/GENEMARK/ | wc -l)" -gt 0 ]
+			then
+				echo -e "Cleaning up remnants of previous run first" 1> {log.stdout} 2> {log.stderr}
+				# a plain rm refuses to remove a directory - use rm -rf
+				rm -rf results/{params.prefix}/GENEMARK
+				mkdir results/{params.prefix}/GENEMARK
+			fi
+		fi
+		cd results/{params.prefix}/GENEMARK
+
+		ln -sf $basedir/{params.genemark_dir}/gm_key .gm_key
+
+		if [ "{params.gmes_petap_params}" == "None" ]
+		then
+			gmes_petap.pl -ES -cores {threads} -sequence ../../../{input.fasta} 1> ../../../{log.stdout} 2> ../../../{log.stderr}
+		else
+			gmes_petap.pl -ES {params.gmes_petap_params} -cores {threads} -sequence ../../../{input.fasta} 1> ../../../{log.stdout} 2> ../../../{log.stderr}
+		fi
+
+		retVal=$?
+
+		if [ ! $retVal -eq 0 ]
+		then
+			echo "Genemark ended in an error"
+			exit $retVal
+		else
+			touch ../../../{output.ok}
+		fi
+		echo -e "\n$(date)\tFinished!\n"
+
+		"""
+		
+# Runs BUSCO (genome mode, --long) on the length filtered assembly inside
+# results/<sample>/BUSCO and concatenates the single copy BUSCO protein
+# sequences into <sample>.BUSCOs.fasta. Augustus works on a private copy of
+# its config directory (tmp/config).
+rule busco:
+	input:
+		ok = rules.initiate.output,
+		fasta = rules.split.output.fasta
+	params:
+		prefix = "{sample}",
+		busco_path = "data/BUSCO",
+		busco_set = config["busco"]["set"],
+		augustus_species = config["busco"]["species"]
+	threads: config["threads"]["busco"]
+	singularity:
+		"docker://chrishah/busco-docker:v3.1.0"
+	log:
+		stdout = "results/{sample}/logs/BUSCO.{sample}.stdout.txt",
+		stderr = "results/{sample}/logs/BUSCO.{sample}.stderr.txt"
+	output:
+		"results/{sample}/BUSCO/run_{sample}/single_copy_busco_sequences/{sample}.BUSCOs.fasta"
+	shell:
+		"""
+		echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
+
+		if [[ ! -d results/{params.prefix}/BUSCO ]]
+		then
+			mkdir results/{params.prefix}/BUSCO
+		fi
+		cd results/{params.prefix}/BUSCO
+
+		if [[ ! -d tmp ]]
+		then
+			mkdir tmp
+		fi
+
+		# start from a fresh copy - copying onto an existing tmp/config would
+		# create tmp/config/config and keep the files of the previous run
+		rm -rf tmp/config
+		cp -rf /usr/share/augustus/config tmp/config
+		# exported so that the setting reliably reaches BUSCO and Augustus
+		export AUGUSTUS_CONFIG_PATH=$(pwd)/tmp/config
+
+		#run BUSCO
+		run_BUSCO.py \
+		--in ../../../{input.fasta} --out {params.prefix} -l ../../../{params.busco_path}/{params.busco_set} --mode genome -c {threads} -f \
+		-sp {params.augustus_species} --long --augustus_parameters='--progress=true' 1> ../../../{log.stdout} 2> ../../../{log.stderr}
+
+		#collect predicted BUSCOs
+		cat run_{params.prefix}/single_copy_busco_sequences/*.faa | sed 's/:.*//' > run_{params.prefix}/single_copy_busco_sequences/{params.prefix}.BUSCOs.fasta
+
+		echo -e "\n$(date)\tFinished!\n"
+		"""
+
+# Runs CEGMA on the length filtered assembly inside results/<sample>/CEGMA
+# (output prefix = sample name) and touches cegma.status.ok on success.
+# stdout/stderr of cegma go to the rule's log files.
+rule cegma:
+	input:
+		ok = rules.initiate.output,
+		fasta = rules.split.output.fasta
+	params:
+		prefix = "{sample}"
+	threads: config["threads"]["cegma"]
+	singularity:
+		"docker://chrishah/cegma:2.5"
+	log:
+		stdout = "results/{sample}/logs/CEGMA.{sample}.stdout.txt",
+		stderr = "results/{sample}/logs/CEGMA.{sample}.stderr.txt"
+	output:
+		ok = "results/{sample}/CEGMA/cegma.status.ok",
+		cegma_gff = "results/{sample}/CEGMA/{sample}.cegma.gff"
+	shell:
+		"""
+		echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
+
+		if [[ ! -d results/{params.prefix}/CEGMA ]]
+		then
+			mkdir results/{params.prefix}/CEGMA
+		fi
+		cd results/{params.prefix}/CEGMA
+
+		#run CEGMA
+		cegma -g ../../../{input.fasta} -T {threads} -o {params.prefix} 1> ../../../{log.stdout} 2> ../../../{log.stderr}
+
+		retVal=$?
+
+		if [ ! $retVal -eq 0 ]
+		then
+			echo "Cegma ended in an error"
+			exit $retVal
+		else
+			touch ../../../{output.ok}
+		fi
+		echo -e "\n$(date)\tFinished!\n"
+
+		"""
+
+# First SNAP training round: runs bin/snap.p1.sh on the CEGMA gene models and
+# the length filtered assembly inside results/<sample>/SNAP.PASS1. The script
+# output goes to the rule's log files; snap.status.ok is touched on success.
+# bin/maker/bin is prepended to PATH before the script is called.
+rule snap_pass1:
+	input:
+		ok = rules.cegma.output.ok,
+		cegma_gff = rules.cegma.output.cegma_gff,
+		fasta = rules.split.output.fasta
+	params:
+		prefix = "{sample}",
+		script = "bin/snap.p1.sh"
+	singularity:
+		"docker://chrishah/premaker-plus:18"
+	log:
+		stdout = "results/{sample}/logs/SNAP.PASS1.{sample}.stdout.txt",
+		stderr = "results/{sample}/logs/SNAP.PASS1.{sample}.stderr.txt"
+	output:
+		ok = "results/{sample}/SNAP.PASS1/snap.status.ok",
+		hmm = "results/{sample}/SNAP.PASS1/{sample}.cegma.snap.hmm"
+	shell:
+		"""
+		echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
+		basedir=$(pwd)
+		
+		export PATH="$(pwd)/bin/maker/bin:$PATH"
+
+		if [[ ! -d results/{params.prefix}/SNAP.PASS1 ]]
+		then
+			mkdir results/{params.prefix}/SNAP.PASS1
+		fi
+		cd results/{params.prefix}/SNAP.PASS1
+
+		bash $basedir/{params.script} \
+		{params.prefix} \
+		$basedir/{input.cegma_gff} \
+		$basedir/{input.fasta} \
+		1> $basedir/{log.stdout} 2> $basedir/{log.stderr}
+
+		retVal=$?
+
+		if [ ! $retVal -eq 0 ]
+		then
+			echo "SNAP ended in an error"
+			exit $retVal
+		else
+			touch $basedir/{output.ok}
+		fi
+		echo -e "\n$(date)\tFinished!\n"
+		"""
+
@@ -0,0 +1,165 @@
+# Builds a RepeatModeler database (BuildDatabase) from the assembly fasta of a sample and
+# runs RepeatModeler on it inside results/{sample}/REPEATMODELER.
+# Anything left in that directory from a previous run is removed first.
+# The {sample}-families.fa output is not created explicitly by this recipe; it is expected
+# to be written by RepeatModeler. The status file is only touched when both tools exit with 0.
+rule repeatmodeler:
+	input:
+		ok = rules.initiate.output,
+		fasta = rules.split.output.fasta
+	params:
+		prefix = "{sample}",
+	threads: config["threads"]["repeatmodeler"]
+	singularity:
+		"docker://chrishah/premaker-plus:18"
+	log:
+		stdout = "results/{sample}/logs/REPEATMODELER.{sample}.stdout.txt",
+		stderr = "results/{sample}/logs/REPEATMODELER.{sample}.stderr.txt"
+	output:
+		ok = "results/{sample}/REPEATMODELER/repeatmodeler.status.ok",
+		fasta = "results/{sample}/REPEATMODELER/{sample}-families.fa"
+	shell:
+		"""
+		echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
+
+		if [[ ! -d results/{params.prefix}/REPEATMODELER ]]
+		then
+			mkdir -p results/{params.prefix}/REPEATMODELER
+		else
+			if [ "$(ls -1 results/{params.prefix}/REPEATMODELER/ | wc -l)" -gt 0 ]
+			then
+				echo -e "Cleaning up remnants of previous run first"
+				#the target is a directory - a plain 'rm' refuses to remove it, so -r is required
+				rm -rf results/{params.prefix}/REPEATMODELER
+				mkdir results/{params.prefix}/REPEATMODELER
+			fi
+		fi
+		cd results/{params.prefix}/REPEATMODELER
+
+		#Snakemake runs this recipe in bash strict mode (set -e), so exit codes are captured
+		#with '||' - otherwise the job aborts before the error message below can be printed
+		retVal=0
+
+		#run REPEATMODELER
+		BuildDatabase -name {params.prefix} -engine ncbi ../../../{input.fasta} 1> ../../../{log.stdout} 2> ../../../{log.stderr} || retVal=$?
+
+		#only start RepeatModeler if the database could be built
+		if [ $retVal -eq 0 ]
+		then
+			RepeatModeler -pa {threads} -engine ncbi -database {params.prefix} 1>> ../../../{log.stdout} 2>> ../../../{log.stderr} || retVal=$?
+		fi
+
+		if [ ! $retVal -eq 0 ]
+		then
+			echo "REPEATMODELER ended in an error"
+			exit $retVal
+		else
+			touch ../../../{output.ok}
+		fi
+		echo -e "\n$(date)\tFinished!\n"
+		"""
+
+# Compresses every directory named RM_* below results/{sample}/REPEATMODELER into
+# <directory>.tar.gz and removes the directory once the archive was written successfully.
+# Depends on all outputs of rule repeatmodeler, so it only runs after RepeatModeler finished.
+# NOTE: the log files declared below are not written by this recipe - all messages go to the
+# job's standard output.
+rule cleanup_repeatmodeler:
+	input:
+		rules.repeatmodeler.output
+	params:
+		prefix = "{sample}"
+	log:
+		stdout = "results/{sample}/logs/REPEATMODELER.cleanup.{sample}.stdout.txt",
+		stderr = "results/{sample}/logs/REPEATMODELER.cleanup.{sample}.stderr.txt"
+	output:
+		ok = "results/{sample}/REPEATMODELER/repeatmodeler.cleanup.ok"
+	shell:
+		"""
+		echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
+		basedir=$(pwd)
+		failed=0
+
+		cd results/{params.prefix}/REPEATMODELER
+		#-prune keeps find from descending into a directory that is about to be archived and removed
+		for f in $(find ./ -type d -name "RM_*" -prune)
+		do
+			echo -e "\nCompressing $f\n"
+
+			#tar is tested directly in the 'if': Snakemake runs recipes in bash strict mode (set -e),
+			#so checking the exit code afterwards would never be reached when tar fails
+			if tar cfz $f.tar.gz $f
+			then
+				rm -rf $f
+			else
+				echo -e "Some problem with $f"
+				failed=1
+			fi
+		done
+
+		cd $basedir
+
+		#a failed archive still fails the job (and leaves the status file untouched),
+		#but only after all directories were tried and every problem was reported
+		if [ $failed -ne 0 ]
+		then
+			exit 1
+		fi
+		touch {output.ok}
+
+		echo -e "\n$(date)\tFinished!\n"
+
+		"""
+
+
+# Masks the assembly of a sample with RepeatMasker in two passes inside results/{sample}/REPEATMASKER:
+#   1. 'denovo' - against the de novo library produced by rule repeatmodeler
+#   2. 'full'   - against the RepeatMasker library for params.repeat_taxon, using the assembly
+#                 that was already masked in pass 1
+# The results of pass 2 are linked into 'final', converted to gff3 (rmOutToGFF3.pl) and then
+# passed through params.conversion_script to obtain the gff declared as output.
+# The status file is only touched if every captured step exited with 0.
+rule repeatmasker:
+	input:
+		fasta = rules.split.output.fasta,
+		repmod = rules.repeatmodeler.output.fasta
+	params:
+		prefix = "{sample}",
+		repeat_taxon = "eukaryota",
+		conversion_script = "bin/convert_repeatmasker_gff_to_MAKER_compatible_gff.sh"
+	threads: config["threads"]["repeatmasker"]
+	singularity:
+		"docker://chrishah/premaker-plus:18"
+	log:
+		stdout = "results/{sample}/logs/REPEATMASKER.{sample}.stdout.txt",
+		stderr = "results/{sample}/logs/REPEATMASKER.{sample}.stderr.txt"
+	output:
+		ok = "results/{sample}/REPEATMASKER/repeatmasker.status.ok",
+		gff = "results/{sample}/REPEATMASKER/{sample}.masked.final.out.reformated.gff",
+		masked = "results/{sample}/REPEATMASKER/{sample}.masked.final.fasta"
+	shell:
+		"""
+		echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
+		basedir=$(pwd)
+
+		mkdir -p results/{params.prefix}/REPEATMASKER
+		cd results/{params.prefix}/REPEATMASKER
+
+		#this is a bit of a hack, but since singularity does not allow to directly write to images ('no space left') and RepeatMasker in the
+		#process needs to produce some files, I need to first get the RepeatMasker directory out of the image.
+		#Then use the executable in the directory in my writable environment
+		#apparently I could also use singularity's '--sandbox' option, but from what I can see this would write the content of the entire image to a
+		#directory, so it would take much longer
+
+		#Copy the RepeatMasker directory from the image
+		cp -pfr /usr/local/RepeatMasker .
+
+		#Snakemake runs this recipe in bash strict mode (set -e), so exit codes are captured with '||'.
+		#The code of the last failing step is kept (adding codes up could wrap around to 0).
+		#mkdir -p / ln -sf / rm -f are used throughout so that leftovers of an aborted run do not break a rerun.
+		retVal=0
+
+		#Do RepeatMasking with denovo library
+		mkdir -p denovo
+		./RepeatMasker/RepeatMasker -engine ncbi -s -pa {threads} -lib $basedir/{input.repmod} -noisy -dir denovo -gff $basedir/{input.fasta} 1> $basedir/{log.stdout} 2> $basedir/{log.stderr} || retVal=$?
+
+		#run REPEATMASKER against full repeat library, but use only the assembly that is already masked based on the denovo library
+		#(skipped if the denovo pass failed, because its masked fasta would be missing)
+		if [ $retVal -eq 0 ]
+		then
+			mkdir -p full
+			ln -sf $(find ./denovo -name '*fasta.masked') {params.prefix}.masked.denovo.fasta
+			./RepeatMasker/RepeatMasker -engine ncbi -s -pa {threads} -species {params.repeat_taxon} -noisy -dir full -gff {params.prefix}.masked.denovo.fasta 1>> $basedir/{log.stdout} 2>> $basedir/{log.stderr} || retVal=$?
+		fi
+
+		#cleanup - remove the RepeatMasker directory (done in any case, also after a failed pass)
+		rm -rf RepeatMasker
+		rm -f {params.prefix}.masked.denovo.fasta
+
+		if [ $retVal -eq 0 ]
+		then
+			#produce the final repeat annotation
+			#link the final masked fasta and out files from the last Repeatmasker run
+			mkdir -p final
+			cd final
+			ln -sf ../full/{params.prefix}.masked.denovo.fasta.masked {params.prefix}.masked.final.fasta
+			ln -sf ../full/{params.prefix}.masked.denovo.fasta.out {params.prefix}.masked.final.out
+
+			#produce gff3 file from the final RepeatMasker output (this gff3 seems to work well with MAKER after some conversion - see below)
+			/usr/local/RepeatMasker/util/rmOutToGFF3.pl {params.prefix}.masked.final.out > {params.prefix}.masked.final.out.gff3 || retVal=$?
+
+			if [ $retVal -eq 0 ]
+			then
+				#modify gff3 file so MAKER accepts it down the line
+				#(not captured - a failure here aborts the job directly through strict mode)
+				$basedir/{params.conversion_script} {params.prefix}.masked.final.out.gff3 > $basedir/{output.gff}
+			fi
+
+			cd ..
+		fi
+
+		if [ ! $retVal -eq 0 ]
+		then
+			echo "There was some error"
+			exit $retVal
+		else
+			#the link target is relative to the location of the link (the REPEATMASKER directory)
+			ln -sf final/{params.prefix}.masked.final.fasta $basedir/{output.masked}
+			touch $basedir/{output.ok}
+		fi
+		echo -e "\n$(date)\tFinished!\n"
+		"""
@@ -0,0 +1,35 @@
+
+# Prepares the local tool installation below bin/:
+#   - copies the RepeatMasker directory out of the container image into bin/
+#   - if a Repbase RepeatMasker edition is configured, hands it to bin/setup_Repeatmasker.sh
+#   - runs bin/setup_maker.sh on the configured maker tarball, which is expected to create
+#     the declared output directory bin/maker/bin (the script itself is not part of this file)
+rule setup_maker:
+	input:
+		maker_tarball = config["maker_tarball"],
+	params:
+		repbase = config["RepbaseRepeatMaskerEdition"]
+	singularity:
+		"docker://chrishah/premaker-plus:18"
+	output:
+		bin = directory("bin/maker/bin")
+	shell:
+		"""
+		echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
+
+		#the maker tarball is mandatory - check this first, before any time is spent on copying RepeatMasker
+		if [ "{input.maker_tarball}" == "None" ]
+		then
+			echo -e "Providing a maker tarball is mandatory"
+			exit 1
+		fi
+
+		#Copy the RepeatMasker directory from the image
+		cp -pfrv /usr/local/RepeatMasker bin/
+
+		#an additional repeat library is optional
+		if [ "{params.repbase}" == "None" ]
+		then
+			echo -e "No additional Repeatlibrary provided - ok"
+		else
+			bin/setup_Repeatmasker.sh bin/ {params.repbase}
+		fi
+
+		bash bin/setup_maker.sh {input.maker_tarball} bin
+		"""
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+sample fasta species`
	`2`	`+T1 data/genomes/Testspecies.fasta Testus testus`