Skip to content

Commit 86a30a7

Browse files
committedJun 16, 2020
add rules, scripts, configs
1 parent ce05906 commit 86a30a7

18 files changed

+1579
-0
lines changed
 

‎Snakefile_CH

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
configfile: "data/config.yaml"
2+
3+
localrules: initiate, all
4+
5+
include: "rules/setup_maker.smk"
6+
include: "rules/functions.smk"
7+
include: "rules/maker_part_one.smk"
8+
include: "rules/repeats.smk"
9+
include: "rules/maker_post_repeats.smk"
10+
11+
rule all:
12+
input:
13+
expand("results/{unit.sample}/MAKER.PASS1/{unit.unit}/{unit.sample}.{unit.unit}.maker.output.tar.gz", unit=units.itertuples()),
14+
expand("results/{unit.sample}/MAKER.PASS2/{unit.unit}/{unit.sample}.{unit.unit}.maker.output.tar.gz", unit=units.itertuples()),
15+
expand("results/{name}/REPEATMODELER/repeatmodeler.cleanup.ok", name=samples.index.tolist()),
16+
expand("results/{name}/MAKER.PASS2/{name}.all.maker.gff", name=samples.index.tolist())
17+

‎bin/augustus.PASS2.sh

+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/bin/bash
2+
3+
threads=$1
4+
prefix=$2
5+
fasta=$3
6+
proteins=$4
7+
aed=$5
8+
local_config=$6
9+
training_params=$7
10+
cdna=$8
11+
12+
#export, so that the autoAug.pl child process started below is guaranteed to see the local config directory
export AUGUSTUS_CONFIG_PATH=$local_config
13+
14+
basedir=$(pwd)
15+
#prepare training parameters from BUSCO
16+
if [ ! -z $training_params ]
17+
then
18+
echo -e "[$(date)]\tPreparing training set from previous Augustus run"
19+
#get local copy of Augustus parameters from previous training round
20+
cp -fr $training_params $local_config/species/$prefix
21+
22+
#rename files to current prefix
23+
cd $local_config/species/$prefix
24+
base=$(ls *weightmatrix.txt | sed 's/_weightmatrix.txt//')
25+
#rename files
26+
for file in $(ls -1); do new=$(echo -e "$file" | sed "s/$base/$prefix/g"); mv $file $new; done
27+
#rename the files cited within certain HMM configuration files
28+
sed -i "s/$base/$prefix/g" $prefix\_parameters.cfg
29+
sed -i "s/$base/$prefix/g" $prefix\_parameters.cfg.orig1
30+
31+
cd $basedir
32+
fi
33+
34+
if [ ! -z $aed ]
35+
then
36+
echo -e "[$(date)]\tFiltering proteins with AED > $aed"
37+
#extract only proteins with AED < x
38+
cat <(echo -e "$aed") <(cat $proteins | perl -ne 'chomp; if ($_ =~ /^>/){print "\n$_\n"}else{print "$_"}' | grep -v "^$") | \
39+
perl -ne 'chomp; if ($. == 1){$AED = $_}else{$h=$_; $s=<>; @a=split(" "); $a[2] =~ s/AED://; if ($a[2] < $AED){print "$h\n$s"}}' | sed 's/ .*//' > $prefix.AED-st$aed.maker.proteins.fasta
40+
proteins=$prefix.AED-st$aed.maker.proteins.fasta
41+
fi
42+
43+
if [ -f "$cdna" ]
44+
then
45+
cmd="autoAug.pl --genome=$fasta --species=$prefix --trainingset=$proteins --cdna=$cdna --singleCPU --threads $threads -v --useexisting"
46+
echo -e "[$(date)]\tRunning autoAug.pl with cdna evidence:\n$cmd"
47+
$cmd
48+
else
49+
cmd="autoAug.pl --genome=$fasta --species=$prefix --trainingset=$proteins --singleCPU --threads $threads -v --useexisting"
50+
echo -e "[$(date)]\tRunning autoAug.pl without cdna evidence:\n$cmd"
51+
$cmd
52+
fi
53+
retVal=$?
54+
55+
if [ ! $retVal -eq 0 ]
56+
then
57+
if [ -s "$(pwd)/autoAug/autoAugPred_abinitio/predictions/augustus.gff" ]
58+
then
59+
>&2 echo "Augustus ended in an error, but abinitio predictions are there - continuing .."
60+
else
61+
>&2 echo "Augustus ended in an error"
62+
exit $retVal
63+
fi
64+
fi
65+
66+
#copy the training set that was produced
67+
cp -rf $local_config/species/$prefix .
68+
69+
echo -e "[$(date)]Reformatting to $(pwd)/autoAug/autoAugPred_abinitio/predictions/augustus.gff to GFF3 -> $(pwd)/augustus.gff3"
70+
#note: "gene" is double-quoted inside the perl code - single quotes would terminate the surrounding shell quoting and leave perl with a bareword
cat autoAug/autoAugPred_abinitio/predictions/augustus.gff | perl -ne 'chomp; @a=split(/\t/); if ($a[2] eq "gene"){$id=$a[-1]; $a[-1] =~ s/^/ID=/; print join("\t", @a)."\n"}else{if ($_ =~ /;$/){print "$_ Parent=$id\n"}else{print "$_; Parent=$id\n"}}' | sed 's/; /;/g' | sed 's/ /=/g' > augustus.gff3
71+
72+
#cat $(pwd)/autoAug/autoAugPred_abinitio/predictions/augustus.gff | perl -ne 'chomp; @a=split(/\t/); if ($a[2] eq 'gene'){$id=$a[-1]; $a[-1] =~ s/^/ID=/; print join("\t", @a)."\n"}else{if ($_ =~ /;$/){print "$_ Parent=$id\n"}else{print "$_; Parent=$id\n"}}' | sed 's/; /;/g' | sed 's/ /=/g' > $(pwd)/autoAug/autoAugPred_abinitio/predictions/augustus.gff3
73+

‎bin/cleanup.sh

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/bin/bash
2+
3+
dir=$1
4+
5+
echo -e "\n$(date)\tStarting ...\n"
6+
7+
echo -e "[$(date)]\t$dir -> $dir.tar.gz"
8+
tar cfz $dir.tar.gz $dir
9+
if [ $? -eq 0 ]
10+
then
11+
rm -rf $dir
12+
else
13+
echo -e "Some problem with $dir"
14+
fi
15+
16+
echo -e "\n$(date)\tFinished!\n"
17+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/bash
2+
3+
cat $1 | \
4+
grep -v -e "Satellite" -e ")n" -e "-rich" | perl -ne '$id; if(!/^\#/){chomp; $_ =~ s/\r//g; $id++; print "$_;ID=$id\n"}else{print "$_"}'

‎bin/count_length.sh

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/bin/bash
2+
3+
#./bin/count_length.sh test.fa.gz 100000 5000 count > check
4+
#./bin/count_length.sh test.fa.gz 100000 5000 split > check
5+
#check
6+
#sha256sum -c <(cut -f 1 check)
7+
#only check for the file in question
8+
#sha256sum -c <(grep -P "test.fa.gz\t" check| cut -f 1)
9+
10+
f=$1
11+
l=$2
12+
m=$3
13+
mode=$4 #could be either 'count' or 'split'
14+
15+
if [ $(echo $f | rev | cut -c 1-3 | rev) == ".gz" ]
16+
then
17+
# echo gzipped
18+
if [ "$mode" == "count" ]
19+
then
20+
paste <(sha256sum $f) <(echo -e "$l\t$m\t$(cat <(echo -e "$l\t$m") <(zcat $f) | perl -ne 'chomp; if ($. == 1){@a=split("\t"); $cutoff=$a[0]; $minlen=$a[-1]; $counter=1; }else{if ($_ =~ /^>/){if ($. > 2){if ($length >= $minlen){$cum_length+=$length; if ($cum_length >= $cutoff){$counter++; $cum_length=0; }}} $length = 0}else{$length+=length($_)}}}; if (eof()){print "$counter\n"')")
21+
fi
22+
if [ "$mode" == "split" ]
23+
then
24+
paste <(sha256sum $f) <(echo -e "$l\t$m\t$(cat <(echo -e "$l\t$m") <(zcat $f) | perl -ne 'chomp; if ($. == 1){@a=split("\t"); $cutoff=$a[0]; $minlen=$a[-1]; $counter=1; open(FH, ">", sprintf("%04d", $counter).".fasta")}else{if ($_ =~ /^>/){if ($. > 2){if ($length >= $minlen){$cum_length+=$length; print FH "$header\n$seq\n"; $header = $_; $seq = ""; if ($cum_length >= $cutoff){close FH; $counter++; open(FH, ">", sprintf("%04d", $counter).".fasta"); $cum_length=0; }}}; $header = $_; $seq = ""; $length = 0}else{$length+=length($_); $seq.=$_}}}; if (eof()){print "$counter\n"; if ($length >= $minlen){print FH "$header\n$seq\n"}')")
25+
fi
26+
else
27+
# echo not gzipped
28+
if [ "$mode" == "count" ]
29+
then
30+
paste <(sha256sum $f) <(echo -e "$l\t$m\t$(cat <(echo -e "$l\t$m") $f | perl -ne 'chomp; if ($. == 1){@a=split("\t"); $cutoff=$a[0]; $minlen=$a[-1]; $counter=1; }else{if ($_ =~ /^>/){if ($. > 2){if ($length >= $minlen){$cum_length+=$length; if ($cum_length >= $cutoff){$counter++; $cum_length=0; }}} $length = 0}else{$length+=length($_)}}}; if (eof()){print "$counter\n"')")
31+
fi
32+
if [ "$mode" == "split" ]
33+
then
34+
paste <(sha256sum $f) <(echo -e "$l\t$m\t$(cat <(echo -e "$l\t$m") $f | perl -ne 'chomp; if ($. == 1){@a=split("\t"); $cutoff=$a[0]; $minlen=$a[-1]; $counter=1; open(FH, ">", sprintf("%04d", $counter).".fasta")}else{if ($_ =~ /^>/){if ($. > 2){if ($length >= $minlen){$cum_length+=$length; print FH "$header\n$seq\n"; $header = $_; $seq = ""; if ($cum_length >= $cutoff){close FH; $counter++; open(FH, ">", sprintf("%04d", $counter).".fasta"); $cum_length=0; }}}; $header = $_; $seq = ""; $length = 0}else{$length+=length($_); $seq.=$_}}}; if (eof()){print "$counter\n"; if ($length >= $minlen){print FH "$header\n$seq\n"}')")
35+
fi
36+
fi
37+
38+
#cat <(echo -e ">\t$l\t$m") <(zcat $f) | perl -ne 'if ($_ =~ /^>/){$header = $_; $seq = ""; if ($. > 1){if ($length >= $minlen){$cum_length+=$length; print FH "$header\n$seq\n"} if ($cum_length >= $cutoff){$counter++; close FH; open(FH, '>', $counter.".fasta"); $cum_length=0}}else{chomp; @a=split("\t"); $cutoff=$a[-2]; $minlen=$a[-1]}; $length = 0}else{$length+=(length($_)-1); $seq+=$_}; if (eof()){$counter++; print "$counter\n"}'

‎bin/gff_to_gbk.py

+65
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/usr/bin/env python
2+
"""Convert a GFF and associated FASTA file into GenBank format.
3+
Usage:
4+
gff_to_genbank.py <GFF annotation file> <FASTA sequence file>
5+
"""
6+
from __future__ import print_function
7+
8+
import sys
9+
import os
10+
11+
from Bio import SeqIO
12+
from Bio.Alphabet import generic_dna
13+
import Bio.Alphabet as ab
14+
from Bio import Seq
15+
import random as rand
16+
17+
from BCBio import GFF
18+
19+
def main(gff_file, fasta_file):
20+
out_file = "%s.gb" % os.path.splitext(gff_file)[0]
21+
fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
22+
gff_iter = GFF.parse(gff_file, fasta_input)
23+
SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
24+
25+
def _fix_ncbi_id(fasta_iter):
26+
"""GenBank identifiers can only be 16 characters; try to shorten NCBI.
27+
"""
28+
for rec in fasta_iter:
29+
if len(rec.name) > 16:
30+
new_id = rec.name[:8] + str(rand.randint(1,8000000))
31+
print("Warning: shortening NCBI name %s to %s" % (rec.id, new_id))
32+
rec.id = new_id
33+
rec.name = new_id
34+
yield rec
35+
36+
def _check_gff(gff_iterator):
37+
"""Check GFF files before feeding to SeqIO to be sure they have sequences.
38+
"""
39+
for rec in gff_iterator:
40+
if isinstance(rec.seq, Seq.UnknownSeq):
41+
print("Warning: FASTA sequence not found for '%s' in GFF file" % (
42+
rec.id))
43+
rec.seq.alphabet = generic_dna
44+
yield _flatten_features(rec)
45+
46+
def _flatten_features(rec):
47+
"""Make sub_features in an input rec flat for output.
48+
GenBank does not handle nested features, so we want to make
49+
everything top level.
50+
"""
51+
out = []
52+
for f in rec.features:
53+
cur = [f]
54+
while len(cur) > 0:
55+
nextf = []
56+
for curf in cur:
57+
out.append(curf)
58+
if len(curf.sub_features) > 0:
59+
nextf.extend(curf.sub_features)
60+
cur = nextf
61+
rec.features = out
62+
return rec
63+
64+
if __name__ == "__main__":
65+
main(*sys.argv[1:])

‎bin/merging.sh

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/bin/bash
2+
3+
prefix=$1
4+
5+
#get going
6+
echo -e "\n$(date)\tStarting ...\n"
7+
8+
#combine results (this could be incorporated in the previous script)
9+
#combine all gffs without fasta sequences
10+
cat $(find ./ -name "$prefix.*.noseq.maker.gff" | sort) > $prefix.noseq.maker.gff
11+
#combine all gffs and add FASTA sequences from all
12+
cat $prefix.noseq.maker.gff <(echo -e "##FASTA") <(for f in $(find ./ -name "$prefix.*.all.maker.gff" | sort); do cat $f | perl -ne 'chomp; if ($_ =~ /^##FASTA/){$ok=1}; if ($ok){print "$_\n"}'; done | grep -v "^##FASTA") > $prefix.all.maker.gff
13+
#combine all proteins
14+
cat $(find ./ -name "$prefix.*.all.maker.proteins.fasta" | sort) > $prefix.all.maker.proteins.fasta
15+
#combine all transcripts
16+
cat $(find ./ -name "$prefix.*.all.maker.transcripts.fasta" | sort) > $prefix.all.maker.transcripts.fasta
17+
18+
#extract gff by evidence
19+
# transcript alignments
20+
awk '{ if ($2 ~ "est2genome") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.est2genome.gff
21+
awk '{ if ($2 ~ "cdna2genome") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.cdna2genome.gff
22+
# protein alignments
23+
awk '{ if ($2 ~ "protein2genome") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.protein2genome.gff
24+
# repeat alignments
25+
awk '{ if ($2 ~ "repeat") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.repeats.gff
26+
27+
#genes predicted by snap
28+
awk '{ if ($2 ~ "snap") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.snap.gff
29+
#genes predicted by augustus
30+
awk '{ if ($2 ~ "augustus") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.augustus.gff
31+
#genes predicted by maker
32+
awk '{ if ($2 ~ "maker") print $0 }' $prefix.noseq.maker.gff > $prefix.noseq.maker.maker.gff
33+
34+
#rename genes/transcripts
35+
#create backups
36+
#backup of the merged gff, taken before the ID mapping steps below are run on it
cp $prefix.all.maker.gff $prefix.all.maker.backup.gff
37+
cp $prefix.all.maker.proteins.fasta $prefix.all.maker.proteins.backup.fasta
38+
cp $prefix.all.maker.transcripts.fasta $prefix.all.maker.transcripts.backup.fasta
39+
40+
maker_map_ids --prefix $prefix --justify 5 --suffix - --iterate 1 $prefix.all.maker.gff > $prefix.makerID2short.map
41+
map_gff_ids $prefix.makerID2short.map $prefix.all.maker.gff
42+
map_fasta_ids $prefix.makerID2short.map $prefix.all.maker.transcripts.fasta
43+
map_fasta_ids $prefix.makerID2short.map $prefix.all.maker.proteins.fasta
44+
45+
echo -e "\n$(date)\tFinished!\n"

‎bin/setup_Repeatmasker.sh

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
2+
3+
#use as ./setup_Repeatmasker.sh destination/ external/RepBaseRepeatMaskerEdition-20181026.tar.gz
4+
5+
destination=$1
6+
repeattarball=$2
7+
8+
#take care of RepeatMasker
9+
cp -pfr /usr/local/RepeatMasker $destination
10+
#cd $destination/RepeatMasker
11+
#perl ./rebuild
12+
#cd -
13+
tar xvfz $repeattarball -C $destination/RepeatMasker/
14+
15+
#This is a custom step that checks the metadata against the actual sequence data and removes those that are not present in the metadata
16+
cd $destination/RepeatMasker
17+
echo "Checking Repbase metadata against sequence data"
18+
cat Libraries/RMRBSeqs.embl | grep "^ID " | sed 's/^ID //' | cut -d " " -f 1 | sort -n | uniq | perl -ne 'chomp; print "$_\n$_\n"' > comp1
19+
cat Libraries/RMRBMeta.embl | grep "^ID " | sed 's/^ID //' | cut -d " " -f 1 | sort -n | uniq > comp2
20+
cat comp1 comp2 | sort -n | uniq -c | grep " 1 " | sed 's/^ .*1 //' > missing
21+
for m in $(cat missing); do sed -i "/$m/,/\/\//d" Libraries/RMRBMeta.embl; done
22+
rm comp1 comp2 missing
23+
24+
#Build the repeat databases
25+
perl ./rebuild
26+

‎bin/snap.p1.sh

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#!/bin/bash
2+
3+
prefix=$1
4+
gff=$2
5+
fasta=$3
6+
7+
8+
echo -e "[$(date)]\tConvert CEGMA gff to SNAP input"
9+
cegma2zff $gff $fasta
10+
retVal=$(( retVal + $? ))
11+
12+
echo -e "[$(date)]\tgather some stats and validate"
13+
fathom genome.ann genome.dna -gene-stats > gene-stats.log 2>&1
14+
fathom genome.ann genome.dna -validate > validate.log 2>&1
15+
retVal=$(( retVal + $? ))
16+
17+
echo -e "[$(date)]\tcollect the training sequences and annotations, plus 1000 surrounding bp for training"
18+
fathom genome.ann genome.dna -categorize 1000
19+
fathom -export 1000 -plus uni.ann uni.dna
20+
retVal=$(( retVal + $? ))
21+
22+
echo -e "[$(date)]\tcreate the training parameters"
23+
forge export.ann export.dna
24+
retVal=$(( retVal + $? ))
25+
26+
echo -e "[$(date)]\tassemble the HMMs"
27+
hmm-assembler.pl $prefix . > $prefix.cegma.snap.hmm
28+
retVal=$(( retVal + $? ))
29+
30+
if [ ! $retVal -eq 0 ]
31+
then
32+
echo "There was some error" 1>&2
33+
exit $retVal
34+
fi

‎bin/snap.p2.sh

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
#!/bin/bash
2+
3+
prefix=$1
4+
gff=$2
5+
aed=$3
6+
7+
#Extract gene models with AED <= x
8+
echo -e "[$(date)]\tIdentify gene models with AED score > $aed"
9+
cat <(echo -e "$aed") <(cat $gff | grep -P "\tmRNA") | perl -ne 'chomp; if ($. == 1){$aed_max=$_}else{@a=split("\t"); @b=split(";",$a[8]);for (@b){if ($_ =~ /_AED/){$_ =~ s/_AED=//; $AED=$_; if ($AED > $aed_max){$b[0] =~ s/ID=//; print "$b[0]\n"; $b[0] =~ s/-mRNA.*//; print "$b[0];\n"}}}}' > gt.$aed.ids.txt
10+
11+
echo -e "[$(date)]\tExclude these gene models from gff -> remainder written to file: MAKER.st$aed.maker.gff"
12+
grep -v -f gt.$aed.ids.txt $gff > MAKER.st$aed.maker.gff
13+
14+
echo -e "[$(date)]\tConvert MAKER gff to SNAP input"
15+
maker2zff -n MAKER.st$aed.maker.gff
16+
17+
echo -e "[$(date)]\tgather some stats and validate"
18+
fathom genome.ann genome.dna -gene-stats > gene-stats.log 2>&1
19+
fathom genome.ann genome.dna -validate > validate.log 2>&1
20+
echo -e "[$(date)]\tcollect the training sequences and annotations, plus 1000 surrounding bp for training"
21+
fathom genome.ann genome.dna -categorize 1000
22+
fathom -export 1000 -plus uni.ann uni.dna
23+
echo -e "[$(date)]\tcreate the training parameters"
24+
forge export.ann export.dna
25+
echo -e "[$(date)]\tassemble the HMMs"
26+
hmm-assembler.pl $prefix . > $prefix.MAKER.st$aed.snap.hmm
27+
28+
ln -s $prefix.MAKER.st$aed.snap.hmm $prefix.MAKER_PASS1.snap.hmm

‎data/cluster_config_vsc4_CH.yaml

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
__default__:
2+
J: sm
3+
n: 1
4+
ntasks: 1
5+
ntasks-per-node: 1
6+
mem: 4G
7+
hint: memory_bound
8+
qos: mem_0096
9+
partition: mem_0096
10+
time: "00:10:00"
11+
output: $(pwd)/log/%x-%j.stdout.txt
12+
error: $(pwd)/log/%x-%j.stderr.txt
13+
all:
14+
J: SnaMaall
15+
setup_maker:
16+
J: SETUP
17+
initiate:
18+
J: INIT
19+
genemark:
20+
J: GEM
21+
mem: 10G
22+
time: "70:00:00"
23+
busco:
24+
J: BUS1
25+
# ntasks: 10
26+
# ntasks-per-node: 10
27+
mem: 20G
28+
time: "70:00:00"
29+
cegma:
30+
J: CEG1
31+
# ntasks: 10
32+
# ntasks-per-node: 10
33+
mem: 10G
34+
time: "70:00:00"
35+
snap_pass1:
36+
J: SNA1
37+
# ntasks: 1
38+
# ntasks-per-node: 1
39+
repeatmodeler:
40+
J: RMO1
41+
# ntasks: 10
42+
# ntasks-per-node: 10
43+
mem: 10G
44+
time: "70:00:00"
45+
cleanup_repeatmodeler:
46+
J: cRMO
47+
time: "70:00:00"
48+
repeatmasker:
49+
J: RMA1
50+
# ntasks: 10
51+
# ntasks-per-node: 10
52+
mem: 20G
53+
time: "70:00:00"
54+
prepare_protein_evidence:
55+
J: CDH0
56+
# ntasks: 8
57+
# ntasks-per-node: 8
58+
mem: 20G
59+
time: "01:00:00"
60+
split:
61+
J: SPL0
62+
# ntasks: 1
63+
# ntasks-per-node: 1
64+
mem: 10G
65+
time: "01:00:00"
66+
initiate_MAKER_PASS1:
67+
J: iMP1
68+
# ntasks: 2
69+
# ntasks-per-node: 2
70+
run_MAKER_PASS1:
71+
J: rMP1
72+
# ntasks: 20
73+
# ntasks-per-node: 20
74+
mem: 20G
75+
time: "70:00:00"
76+
cleanup_MAKER_PASS1:
77+
J: cMP1
78+
# ntasks: 1
79+
# ntasks-per-node: 1
80+
mem: 4G
81+
time: "70:00:00"
82+
merge_MAKER_PASS1:
83+
J: mMP1
84+
# ntasks: 1
85+
# ntasks-per-node: 1
86+
mem: 4G
87+
time: "01:00:00"
88+
snap_pass2:
89+
J: SNA2
90+
# ntasks: 1
91+
# ntasks-per-node: 1
92+
mem: 4G
93+
time: "00:30:00"
94+
AUGUSTUS_PASS2:
95+
J: AUG2
96+
# ntasks: 2
97+
# ntasks-per-node: 2
98+
mem: 20G
99+
time: "70:00:00"
100+
pick_augustus_training_set:
101+
J: PAM
102+
initiate_MAKER_PASS2:
103+
J: iMP2
104+
# ntasks: 2
105+
# ntasks-per-node: 2
106+
run_MAKER_PASS2:
107+
J: rMP2
108+
# ntasks: 20
109+
# ntasks-per-node: 20
110+
mem: 20G
111+
time: "70:00:00"
112+
cleanup_MAKER_PASS2:
113+
J: cMP2
114+
# ntasks: 1
115+
# ntasks-per-node: 1
116+
mem: 4G
117+
time: "70:00:00"
118+
merge_MAKER_PASS2:
119+
J: mMP2
120+
# ntasks: 2
121+
# ntasks-per-node: 2
122+
mem: 4G
123+
time: "01:00:00"

‎data/config_CH.yaml

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
samples: "data/data.csv"
2+
3+
split_batch_length: 3000000
4+
split_min_length: 1000
5+
maker_tarball: data/external/maker-2.31.10.tgz
6+
RepbaseRepeatMaskerEdition: data/external/RepBaseRepeatMaskerEdition-20181026.tar.gz
7+
8+
genemark:
9+
genemark_dir: data/external/gmes_linux_64
10+
gmes_petap_params:
11+
busco:
12+
set: arthropoda_odb9
13+
species: fly
14+
15+
aed:
16+
snap_pass2: "0.2"
17+
AUGUSTUS_PASS2: ["0.0", "0.1", "0.2"]
18+
19+
cdhit:
20+
similarity: "0.98"
21+
22+
threads:
23+
genemark: 8
24+
busco: 8
25+
cegma: 8
26+
repeatmodeler: 8
27+
repeatmasker: 8
28+
prepare_protein_evidence: 8
29+
run_MAKER_PASS1: 8
30+
AUGUSTUS_PASS2: 10
31+
run_MAKER_PASS2: 8

‎data/data_CH.csv

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
sample fasta species
2+
T1 data/genomes/Testspecies.fasta Testus testus

‎rules/functions.smk

+112
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
2+
import pandas as pd
3+
import os
4+
import glob
5+
from math import ceil
6+
from pathlib import Path
7+
import subprocess
from subprocess import call
8+
9+
10+
n=int(config["split_batch_length"])
11+
min=int(config["split_min_length"])
12+
13+
samples = pd.read_csv(config["samples"], sep="\t").set_index("sample", drop=False)
14+
samples.index.names = ["sample_id"]
15+
16+
dic = {'sample': [], 'unit': []}
17+
18+
def get_assembly_path(wildcards):
19+
# this is to get the path to the assembly from the CSV file
20+
return samples.loc[wildcards.sample, ["fasta"]].to_list()
21+
22+
def get_transcripts_path(wildcards, p="data/transcripts/*"):
23+
#get paths to fasta transcript fasta files - if file has prefix identical to sample prefix in data.csv -> assume it's a transcriptome of this species -> MAKER 'est' option
24+
dic = {'alt_ests': [], 'ests': []}
25+
for f in glob.glob(p):
26+
if f.split("/")[-1].startswith(wildcards.sample):
27+
dic['ests'].append(os.path.abspath(f))
28+
else:
29+
dic['alt_ests'].append(os.path.abspath(f))
30+
return dic
31+
32+
33+
def partition_by_length(fasta, max_length=n, min_length=min, pr=0, outdir="./"):
34+
#function that partitions the fasta file
35+
headers = []
36+
seqs = []
37+
i=0
38+
cum_length=0
39+
printcount=1
40+
for line in open(str(fasta)).readlines():
41+
if line.strip().startswith(">"):
42+
headers.append(line.strip())
43+
seqs.append("")
44+
if i >= 1:
45+
if len(seqs[-2]) >= min_length:
46+
cum_length+=len(seqs[-2])
47+
# print("%s\t%s\t%s" %(headers[-2], len(seqs[-2]), cum_length))
48+
else:
49+
del headers[-2]
50+
del seqs[-2]
51+
if cum_length >= max_length:
52+
if pr:
53+
if not os.path.exists(outdir+"/"+str(printcount).zfill(4)):
54+
os.mkdir(outdir+"/"+str(printcount).zfill(4))
55+
fh = open(outdir+"/"+str(printcount).zfill(4)+"/p0001", 'w')
56+
# print("%s\t%s" %(str(printcount).zfill(4), cum_length)) #"{:04d}".format(printcount))
57+
for j in range(len(headers)-1):
58+
fh.write("%s\n%s\n" %(headers[j],seqs[j]))
59+
fh.close()
60+
for j in reversed(range(len(headers)-1)):
61+
del headers[j]
62+
del seqs[j]
63+
cum_length=len(seqs[-1])
64+
# print("the lenght is again: %s" %len(headers))
65+
printcount+=1
66+
i+=1
67+
else:
68+
seqs[-1] = seqs[-1]+line.strip()
69+
70+
if pr:
71+
if not os.path.exists(outdir+"/"+str(printcount).zfill(4)):
72+
os.mkdir(outdir+"/"+str(printcount).zfill(4))
73+
fh = open(outdir+"/"+str(printcount).zfill(4)+"/p0001", 'w')
74+
75+
# print("%s\t%s" %(str(printcount).zfill(4), cum_length+len(seqs[-1])))
76+
for j in range(len(headers)):
77+
fh.write("%s\n%s\n" %(headers[j],seqs[j]))
78+
fh.close()
79+
80+
if not pr:
81+
return printcount
82+
83+
unitdict = {}
84+
print("Counting partitions (batchsize >= "+str(n)+"bp, minimum length = "+str(min)+"bp) ..")
85+
for sample in samples.index.values.tolist():
86+
print("\t"+sample+" - n=", end='')
87+
count = subprocess.run("bash ./bin/count_length.sh %s %i %i count" %(samples.fasta[sample], n, min), shell=True, stdout=subprocess.PIPE)
88+
counter = int(count.stdout.decode('utf-8').split("\t")[-1])
89+
90+
91+
# counter=partition_by_length(str(samples.fasta[sample]), max_length=n, min_length=min, pr=0)
92+
print(counter)
93+
print("\t"+count.stdout.decode('utf-8').split("\t")[0])
94+
unitdict[sample] = []
95+
for i in range(1,counter+1):
96+
dic['sample'].append(sample)
97+
dic['unit'].append(str(i).zfill(4))
98+
unitdict[sample].append(str(i).zfill(4))
99+
100+
#print(unitdict)
101+
##print dic
102+
103+
units = pd.DataFrame(dic).set_index(['sample','unit'], drop=False)
104+
#print(units)
105+
#print(units.index.tolist())
106+
#print units
107+
#for row in units.itertuples():
108+
# print(row)
109+
110+
units.index.names = ["sample_id", "unit_id"]
111+
units.index = units.index.set_levels(
112+
[i.astype(str) for i in units.index.levels]) # enforce str in index

‎rules/maker_part_one.smk

+243
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
rule initiate:
2+
input:
3+
rules.setup_maker.output
4+
params:
5+
prefix = "{sample}"
6+
output:
7+
"results/{sample}/{sample}.ok"
8+
shell:
9+
"""
10+
if [[ ! -d results/{params.prefix} ]]
11+
then
12+
mkdir results/{params.prefix}
13+
fi
14+
touch {output}
15+
"""
16+
17+
rule split:
18+
input:
19+
fasta = get_assembly_path,
20+
ok = rules.initiate.output
21+
params:
22+
prefix = "{sample}",
23+
len = n,
24+
min = min
25+
log:
26+
stdout = "results/{sample}/logs/split.{sample}.stdout.txt",
27+
stderr = "results/{sample}/logs/split.{sample}.stderr.txt"
28+
output:
29+
ok = "results/{sample}/GENOME_PARTITIONS/splitting.ok",
30+
fasta = "results/{sample}/GENOME_PARTITIONS/{sample}.min"+str(min)+".fasta"
31+
shell:
32+
"""
33+
echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
34+
basedir=$(pwd)
35+
36+
cd results/{params.prefix}/GENOME_PARTITIONS/
37+
bash $basedir/bin/count_length.sh ../../../{input.fasta} {params.len} {params.min} split
38+
39+
retVal=$?
40+
41+
if [ ! $retVal -eq 0 ]
42+
then
43+
echo "Splitting ended in an error"
44+
exit $retVal
45+
else
46+
touch ../../../{output.ok}
47+
cat *.fasta > ../../../{output.fasta}
48+
fi
49+
50+
echo -e "\n$(date)\tFinished!\n"
51+
"""
52+
53+
rule genemark:
54+
input:
55+
ok = rules.initiate.output,
56+
fasta = rules.split.output.fasta
57+
params:
58+
prefix = "{sample}",
59+
genemark_dir = config["genemark"]["genemark_dir"],
60+
gmes_petap_params = config["genemark"]["gmes_petap_params"]
61+
threads: config["threads"]["genemark"]
62+
singularity:
63+
"docker://chrishah/premaker-plus:18"
64+
log:
65+
stdout = "results/{sample}/logs/GENEMARK.{sample}.stdout.txt",
66+
stderr = "results/{sample}/logs/GENEMARK.{sample}.stderr.txt"
67+
output:
68+
ok = "results/{sample}/GENEMARK/genemark.status.ok",
69+
model = "results/{sample}/GENEMARK/gmhmm.mod"
70+
shell:
71+
"""
72+
echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
73+
basedir=$(pwd)
74+
75+
if [[ ! -d results/{params.prefix}/GENEMARK ]]
76+
then
77+
mkdir results/{params.prefix}/GENEMARK
78+
else
79+
if [ "$(ls -1 results/{params.prefix}/GENEMARK/ | wc -l)" -gt 0 ]
80+
then
81+
echo -e "Cleaning up remnants of previous run first" 1> {log.stdout} 2> {log.stderr}
82+
#the target is a non-empty directory, so a recursive removal is required before it is recreated
rm -rf results/{params.prefix}/GENEMARK
83+
mkdir results/{params.prefix}/GENEMARK
84+
fi
85+
fi
86+
cd results/{params.prefix}/GENEMARK
87+
88+
ln -sf $basedir/{params.genemark_dir}/gm_key .gm_key
89+
90+
if [ "{params.gmes_petap_params}" == "None" ]
91+
then
92+
gmes_petap.pl -ES -cores {threads} -sequence ../../../{input.fasta} 1> ../../../{log.stdout} 2> ../../../{log.stderr}
93+
else
94+
gmes_petap.pl -ES {params.gmes_petap_params} -cores {threads} -sequence ../../../{input.fasta} 1> ../../../{log.stdout} 2> ../../../{log.stderr}
95+
fi
96+
97+
retVal=$?
98+
99+
if [ ! $retVal -eq 0 ]
100+
then
101+
echo "Genemark ended in an error"
102+
exit $retVal
103+
else
104+
touch ../../../{output.ok}
105+
fi
106+
echo -e "\n$(date)\tFinished!\n"
107+
108+
"""
109+
110+
rule busco:
111+
input:
112+
ok = rules.initiate.output,
113+
fasta = rules.split.output.fasta
114+
params:
115+
prefix = "{sample}",
116+
busco_path = "data/BUSCO",
117+
busco_set = config["busco"]["set"],
118+
augustus_species = config["busco"]["species"]
119+
threads: config["threads"]["busco"]
120+
singularity:
121+
"docker://chrishah/busco-docker:v3.1.0"
122+
log:
123+
stdout = "results/{sample}/logs/BUSCO.{sample}.stdout.txt",
124+
stderr = "results/{sample}/logs/BUSCO.{sample}.stderr.txt"
125+
output:
126+
"results/{sample}/BUSCO/run_{sample}/single_copy_busco_sequences/{sample}.BUSCOs.fasta"
127+
shell:
128+
"""
129+
echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
130+
131+
if [[ ! -d results/{params.prefix}/BUSCO ]]
132+
then
133+
mkdir results/{params.prefix}/BUSCO
134+
fi
135+
cd results/{params.prefix}/BUSCO
136+
137+
if [[ ! -d tmp ]]
138+
then
139+
mkdir tmp
140+
fi
141+
142+
cp -rf /usr/share/augustus/config tmp/config
143+
#export, so that run_BUSCO.py (and whatever it launches) is guaranteed to see the writable local copy of the config
export AUGUSTUS_CONFIG_PATH=$(pwd)/tmp/config
144+
145+
#run BUSCO
146+
run_BUSCO.py \
147+
--in ../../../{input.fasta} --out {params.prefix} -l ../../../{params.busco_path}/{params.busco_set} --mode genome -c {threads} -f \
148+
-sp {params.augustus_species} --long --augustus_parameters='--progress=true' 1> ../../../{log.stdout} 2> ../../../{log.stderr}
149+
150+
#collect predicted BUSCOs
151+
cat run_{params.prefix}/single_copy_busco_sequences/*.faa | sed 's/:.*//' > run_{params.prefix}/single_copy_busco_sequences/{params.prefix}.BUSCOs.fasta
152+
153+
echo -e "\n$(date)\tFinished!\n"
154+
"""
155+
156+
rule cegma:
157+
input:
158+
ok = rules.initiate.output,
159+
fasta = rules.split.output.fasta
160+
params:
161+
prefix = "{sample}"
162+
threads: config["threads"]["cegma"]
163+
singularity:
164+
"docker://chrishah/cegma:2.5"
165+
log:
166+
stdout = "results/{sample}/logs/CEGMA.{sample}.stdout.txt",
167+
stderr = "results/{sample}/logs/CEGMA.{sample}.stderr.txt"
168+
output:
169+
ok = "results/{sample}/CEGMA/cegma.status.ok",
170+
cegma_gff = "results/{sample}/CEGMA/{sample}.cegma.gff"
171+
shell:
172+
"""
173+
echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
174+
175+
if [[ ! -d results/{params.prefix}/CEGMA ]]
176+
then
177+
mkdir results/{params.prefix}/CEGMA
178+
fi
179+
cd results/{params.prefix}/CEGMA
180+
181+
#run CEGMA
182+
cegma -g ../../../{input.fasta} -T {threads} -o {params.prefix} 1> ../../../{log.stdout} 2> ../../../{log.stderr}
183+
184+
retVal=$?
185+
186+
if [ ! $retVal -eq 0 ]
187+
then
188+
echo "Cegma ended in an error"
189+
exit $retVal
190+
else
191+
touch ../../../{output.ok}
192+
fi
193+
echo -e "\n$(date)\tFinished!\n"
194+
195+
"""
196+
197+
rule snap_pass1:
198+
input:
199+
ok = rules.cegma.output.ok,
200+
cegma_gff = rules.cegma.output.cegma_gff,
201+
fasta = rules.split.output.fasta
202+
params:
203+
prefix = "{sample}",
204+
script = "bin/snap.p1.sh"
205+
singularity:
206+
"docker://chrishah/premaker-plus:18"
207+
log:
208+
stdout = "results/{sample}/logs/SNAP.PASS1.{sample}.stdout.txt",
209+
stderr = "results/{sample}/logs/SNAP.PASS1.{sample}.stderr.txt"
210+
output:
211+
ok = "results/{sample}/SNAP.PASS1/snap.status.ok",
212+
hmm = "results/{sample}/SNAP.PASS1/{sample}.cegma.snap.hmm"
213+
shell:
214+
"""
215+
echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
216+
basedir=$(pwd)
217+
218+
export PATH="$(pwd)/bin/maker/bin:$PATH"
219+
220+
if [[ ! -d results/{params.prefix}/SNAP.PASS1 ]]
221+
then
222+
mkdir results/{params.prefix}/SNAP.PASS1
223+
fi
224+
cd results/{params.prefix}/SNAP.PASS1
225+
226+
bash $basedir/{params.script} \
227+
{params.prefix} \
228+
$basedir/{input.cegma_gff} \
229+
$basedir/{input.fasta} \
230+
1> $basedir/{log.stdout} 2> $basedir/{log.stderr}
231+
232+
retVal=$?
233+
234+
if [ ! $retVal -eq 0 ]
235+
then
236+
echo "SNAP ended in an error"
237+
exit $retVal
238+
else
239+
touch $basedir/{output.ok}
240+
fi
241+
echo -e "\n$(date)\tFinished!\n"
242+
"""
243+

‎rules/maker_post_repeats.smk

+521
Large diffs are not rendered by default.

‎rules/repeats.smk

+165
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
rule repeatmodeler:
2+
input:
3+
ok = rules.initiate.output,
4+
fasta = rules.split.output.fasta
5+
params:
6+
prefix = "{sample}",
7+
threads: config["threads"]["repeatmodeler"]
8+
singularity:
9+
"docker://chrishah/premaker-plus:18"
10+
log:
11+
stdout = "results/{sample}/logs/REPEATMODELER.{sample}.stdout.txt",
12+
stderr = "results/{sample}/logs/REPEATMODELER.{sample}.stderr.txt"
13+
output:
14+
ok = "results/{sample}/REPEATMODELER/repeatmodeler.status.ok",
15+
fasta = "results/{sample}/REPEATMODELER/{sample}-families.fa"
16+
shell:
17+
"""
18+
echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
19+
20+
if [[ ! -d results/{params.prefix}/REPEATMODELER ]]
21+
then
22+
mkdir results/{params.prefix}/REPEATMODELER
23+
else
24+
if [ "$(ls -1 results/{params.prefix}/REPEATMODELER/ | wc -l)" -gt 0 ]
25+
then
26+
echo -e "Cleaning up remnants of previous run first"
27+
#the target is a non-empty directory, so a recursive removal is required before it is recreated
rm -rf results/{params.prefix}/REPEATMODELER
28+
mkdir results/{params.prefix}/REPEATMODELER
29+
fi
30+
fi
31+
cd results/{params.prefix}/REPEATMODELER
32+
33+
#run REPEATMODELER
34+
BuildDatabase -name {params.prefix} -engine ncbi ../../../{input.fasta} 1> ../../../{log.stdout} 2> ../../../{log.stderr}
35+
36+
RepeatModeler -pa {threads} -engine ncbi -database {params.prefix} 1>> ../../../{log.stdout} 2>> ../../../{log.stderr}
37+
38+
retVal=$?
39+
40+
if [ ! $retVal -eq 0 ]
41+
then
42+
echo "REPEATMODELER ended in an error"
43+
exit $retVal
44+
else
45+
touch ../../../{output.ok}
46+
fi
47+
echo -e "\n$(date)\tFinished!\n"
48+
"""
49+
50+
rule cleanup_repeatmodeler:
    # Compress RepeatModeler's RM_* working directories of one sample.
    #
    # Every directory named RM_* below results/<sample>/REPEATMODELER is packed
    # into "<dir>.tar.gz"; the directory is removed only if tar succeeded.
    # A failed compression is reported and the directory is kept (best effort),
    # the marker output.ok is touched in either case.
    # NOTE(review): log.stdout / log.stderr are declared but the shell block
    # does not write to them - confirm whether that is intended.
    input:
        rules.repeatmodeler.output
    params:
        prefix = "{sample}"
    log:
        stdout = "results/{sample}/logs/REPEATMODELER.cleanup.{sample}.stdout.txt",
        stderr = "results/{sample}/logs/REPEATMODELER.cleanup.{sample}.stderr.txt"
    output:
        ok = "results/{sample}/REPEATMODELER/repeatmodeler.cleanup.ok"
    shell:
        """
        echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
        basedir=$(pwd)

        cd results/{params.prefix}/REPEATMODELER
        for f in $(find ./ -type d -name "RM_*")
        do
            echo -e "\nCompressing $f\n"

            #test tar directly: with bash strict mode (errexit) a separate
            #'tar ...' followed by a check of $? would abort the job on failure
            #and the error branch below could never be reached
            if tar cfz "$f.tar.gz" "$f"
            then
                rm -rf "$f"
            else
                echo -e "Some problem with $f"
            fi
        done

        cd $basedir
        touch {output.ok}

        echo -e "\n$(date)\tFinished!\n"

        """
85+
86+
87+
rule repeatmasker:
    # Mask repeats in one sample's assembly in two RepeatMasker passes:
    #   1. against the de novo library from rule `repeatmodeler` (-> denovo/)
    #   2. against the library selected by -species <repeat_taxon>, run on the
    #      assembly that was already masked in pass 1 (-> full/)
    # The result of pass 2 is linked into final/, converted to gff3 and passed
    # through params.conversion_script to obtain output.gff.
    #
    # output.ok     - status marker, touched only if the summed exit codes are 0
    # output.gff    - gff written by the conversion script
    # output.masked - symlink to final/<sample>.masked.final.fasta
    #
    # Directory creation and symlinking are idempotent (mkdir -p / ln -sf) so
    # that the rule can be rerun after a partial run: only the declared outputs
    # are removed before a rerun, leftovers such as denovo/ or final/ are not.
    input:
        fasta = rules.split.output.fasta,
        repmod = rules.repeatmodeler.output.fasta
    params:
        prefix = "{sample}",
        repeat_taxon = "eukaryota",
        conversion_script = "bin/convert_repeatmasker_gff_to_MAKER_compatible_gff.sh"
    threads: config["threads"]["repeatmasker"]
    singularity:
        "docker://chrishah/premaker-plus:18"
    log:
        stdout = "results/{sample}/logs/REPEATMASKER.{sample}.stdout.txt",
        stderr = "results/{sample}/logs/REPEATMASKER.{sample}.stderr.txt"
    output:
        ok = "results/{sample}/REPEATMASKER/repeatmasker.status.ok",
        gff = "results/{sample}/REPEATMASKER/{sample}.masked.final.out.reformated.gff",
        masked = "results/{sample}/REPEATMASKER/{sample}.masked.final.fasta"
    shell:
        """
        echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
        basedir=$(pwd)

        mkdir -p results/{params.prefix}/REPEATMASKER
        cd results/{params.prefix}/REPEATMASKER

        #This is a bit of a hack: singularity does not allow writing into the image ('no space left'),
        #but RepeatMasker needs to produce some files next to its executable while running.
        #Therefore the RepeatMasker directory is first copied out of the image and the executable
        #from that writable copy is used.
        #Singularity's '--sandbox' option might be an alternative, but it appears to write the content
        #of the entire image to a directory, which would take much longer.

        #Copy the RepeatMasker directory from the image
        cp -pfr /usr/local/RepeatMasker .

        #Do RepeatMasking with denovo library
        mkdir -p denovo
        ./RepeatMasker/RepeatMasker -engine ncbi -s -pa {threads} -lib $basedir/{input.repmod} -noisy -dir denovo -gff $basedir/{input.fasta} 1> $basedir/{log.stdout} 2> $basedir/{log.stderr}
        retVal=$?

        #run REPEATMASKER against full repeat library, but use only the assembly that is already masked based on the denovo library
        mkdir -p full
        #temporary link that gives the masked assembly of pass 1 a predictable name
        ln -sf $(find ./denovo -name '*fasta.masked') {params.prefix}.masked.denovo.fasta
        ./RepeatMasker/RepeatMasker -engine ncbi -s -pa {threads} -species {params.repeat_taxon} -noisy -dir full -gff {params.prefix}.masked.denovo.fasta 1>> $basedir/{log.stdout} 2>> $basedir/{log.stderr}
        retVal=$(( retVal + $? ))

        #cleanup - remove the RepeatMasker directory and the temporary link
        rm -rf RepeatMasker
        rm -f {params.prefix}.masked.denovo.fasta

        #produce the final repeat annotation
        #link the final masked fasta and out files from the last Repeatmasker run
        mkdir -p final
        cd final
        ln -sf ../full/{params.prefix}.masked.denovo.fasta.masked {params.prefix}.masked.final.fasta
        ln -sf ../full/{params.prefix}.masked.denovo.fasta.out {params.prefix}.masked.final.out

        #produce gff3 file from the final RepeatMasker output (this gff3 seems to work well with MAKER after some conversion - see below)
        /usr/local/RepeatMasker/util/rmOutToGFF3.pl {params.prefix}.masked.final.out > {params.prefix}.masked.final.out.gff3
        retVal=$(( retVal + $? ))

        #modify gff3 file so MAKER accepts it down the line
        $basedir/{params.conversion_script} {params.prefix}.masked.final.out.gff3 > $basedir/{output.gff}

        cd ..
        #link target is relative to the directory that holds the link (REPEATMASKER/)
        ln -sf final/{params.prefix}.masked.final.fasta $basedir/{output.masked}

        if [ ! $retVal -eq 0 ]
        then
            echo "There was some error"
            exit $retVal
        else
            touch $basedir/{output.ok}
        fi
        echo -e "\n$(date)\tFinished!\n"
        """

‎rules/setup_maker.smk

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
2+
rule setup_maker:
    # Prepare the local tool directory below bin/:
    #   - copy RepeatMasker out of the container image into bin/
    #   - if a Repbase library is configured (params.repbase is not "None"),
    #     hand it to bin/setup_Repeatmasker.sh
    #   - hand the MAKER tarball to bin/setup_maker.sh; the job exits with 1
    #     if the tarball is given as "None"
    # output.bin is the directory bin/maker/bin; the shell block does not
    # create it explicitly, it is presumably produced by bin/setup_maker.sh.
    output:
        bin = directory("bin/maker/bin")
    input:
        maker_tarball = config["maker_tarball"],
    params:
        repbase = config["RepbaseRepeatMaskerEdition"]
    singularity:
        "docker://chrishah/premaker-plus:18"
    shell:
        """
        echo -e "\n$(date)\tStarting on host: $(hostname) ...\n"
        basedir=$(pwd)

        #Copy the RepeatMasker directory from the image
        cp -pfrv /usr/local/RepeatMasker bin/

        #optional: extra repeat library
        if [[ "{params.repbase}" != "None" ]]
        then
            bin/setup_Repeatmasker.sh bin/ {params.repbase}
#           tar xvfz {params.repbase} -C bin/RepeatMasker/
#           perl bin/RepeatMasker/rebuild
        else
            echo -e "No additional Repeatlibrary provided - ok"
        fi

        #mandatory: MAKER tarball
        if [[ "{input.maker_tarball}" == "None" ]]
        then
            echo -e "Providing a maker tarball is mandatory"
            exit 1
        fi
        bash bin/setup_maker.sh {input.maker_tarball} bin
        """

0 commit comments

Comments
 (0)
Please sign in to comment.