-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathprocessInputGenome.smk
99 lines (80 loc) · 3.12 KB
/
processInputGenome.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
rule sortIndexGenome:
input: config["GENOMESDIR"] +"{genome}.fa"
output:
sorted=config["GENOMESDIR"] +"{genome}.sorted.fa",
bioperlindex=config["GENOMESDIR"] +"{genome}.sorted.fa.index"
conda: "envs/perl_env.yml"
shell:
'''
uuid=$(uuidgen)
#check for duplicate sequences:
count=$(cat {input} | fgrep ">" | sort|uniq -d | wc -l)
if [ $count -gt 0 ]; then echo "$count duplicate sequence IDs found"; exit 1; fi
FastaToTbl {input} | sort -T {TMPDIR} -k1,1 | TblToFasta > {TMPDIR}/$uuid
mv {TMPDIR}/$uuid {output.sorted}
perl -e 'use Bio::DB::Fasta; my $chrdb = Bio::DB::Fasta->new("{output.sorted}");'
'''
rule makeGenomeFile:
input: config["GENOMESDIR"] +"{genome}.sorted.fa"
output: config["GENOMESDIR"] +"{genome}.sorted.genome"
shell:
'''
uuid=$(uuidgen)
FastaToTbl {input} | awk '{{print $1"\\t"length($2)}}' | sort -k1,1 > {TMPDIR}/$uuid
mv {TMPDIR}/$uuid {output}
'''
rule makeGencodePartition:
input:
gtf=lambda wildcards: GENOMETOANNOTGTF[CAPDESIGNTOGENOME[wildcards.capDesign]],
genome = lambda wildcards: config["GENOMESDIR"] + CAPDESIGNTOGENOME[wildcards.capDesign] + ".sorted.genome"
conda: "envs/xtools_env.yml"
output: "output/annotations/{capDesign}.partition.gff"
shell:
'''
partitionAnnotation.sh {input.gtf} {input.genome} | sortgff > {output}
# QC:
genomeSize=$(cat {input.genome} | cut -f2|sum.sh)
testSize=$(cat {output} | awk '{{print ($5-$4)+1}}' | sum.sh)
if [ $testSize -ne $genomeSize ]; then
echoerr "ERROR: sum of feature sizes in output gff is not equal to genome size. The output is probably bogus."
exit 1
fi
'''
rule makeSirvGff:
input: lambda wildcards: GENOMETOANNOTGTF[CAPDESIGNTOGENOME[wildcards.capDesign]]
output: "output/annotations/{capDesign}.SIRVs.gff"
shell:
'''
cat {input} | awk '$1 ~ /SIRV/' | sortgff > {output}
'''
rule makeGencodeBed:
input: lambda wildcards: GENOMETOANNOTGTF[CAPDESIGNTOGENOME[wildcards.capDesign]]
output: "output/annotations/{capDesign}.bed"
shell:
'''
cat {input} | gff2bed_full.pl - |sortbed > {output}
'''
rule simplifyGencode:
input: lambda wildcards: GENOMETOANNOTGTF[CAPDESIGNTOGENOME[wildcards.capDesign]]
output: "output/annotations/simplified/{capDesign}.gencode.simplified_biotypes.gtf"
shell:
'''
uuidTmpOut=$(uuidgen)
cat {input} | simplifyGencodeGeneTypes.pl - | sort -T {TMPDIR} -k1,1 -k4,4n -k5,5n > {TMPDIR}/$uuidTmpOut
mv {TMPDIR}/$uuidTmpOut {output}
'''
rule collapseGencode:
input: "output/annotations/simplified/{capDesign}.gencode.simplified_biotypes.gtf"
output: "output/annotations/simplified/{capDesign}.gencode.collapsed.simplified_biotypes.gtf"
threads:1
conda: "envs/xtools_env.yml"
shell:
'''
uuid=$(uuidgen)
uuidTmpOut=$(uuidgen)
cat {input} | skipcomments | sort -T {TMPDIR} -k1,1 -k4,4n -k5,5n | tmerge --exonOverhangTolerance {ExonOverhangTolerance} - |sort -T {TMPDIR} -k1,1 -k4,4n -k5,5n > {TMPDIR}/$uuid
uuidL=$(uuidgen)
bedtools intersect -s -wao -a {TMPDIR}/$uuid -b {TMPDIR}/$uuid | buildLoci.pl - |sort -T {TMPDIR} -k1,1 -k4,4n -k5,5n > {TMPDIR}/$uuidL
mergeToRef.pl {input} {TMPDIR}/$uuidL | sort -T {TMPDIR} -k1,1 -k4,4n -k5,5n > {TMPDIR}/$uuidTmpOut
mv {TMPDIR}/$uuidTmpOut {output}
'''