-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgetKaKs.sh
283 lines (264 loc) · 9.66 KB
/
getKaKs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/usr/bin/bash
####This script only defines functions. The main driver script is KK4D.sh
##Read the configuration file
#source config.ini
# Map the per-species arrays (abbr, gff3, sample, protein, cds, key, type,
# chrnum) onto the scalar *1 / *2 variables used by the functions below.
# The arrays and `group` are not defined in this file; presumably they come
# from config.ini sourced by the caller -- TODO confirm.
#
#   group == 2      -> two species: species 2 is array element 1
#   anything else   -> one species compared with itself: species 2 reuses
#                      element 0
#
# `group` is quoted and defaulted so that an unset or empty value selects the
# single-species layout instead of making `[` fail with
# "unary operator expected".
if [ "${group:-1}" -eq 2 ]; then
  _kk_second_idx=1
else
  _kk_second_idx=0
fi

# Species 1 always comes from element 0.
prefix1=${abbr[0]}
gff3file1=${gff3[0]}
latin1=${sample[0]}
protein1=${protein[0]}
cds1=${cds[0]}
key1=${key[0]}
type1=${type[0]}
chrnum1=${chrnum[0]}

# Species 2 comes from the element selected above.
prefix2=${abbr[$_kk_second_idx]}
gff3file2=${gff3[$_kk_second_idx]}
latin2=${sample[$_kk_second_idx]}
protein2=${protein[$_kk_second_idx]}
cds2=${cds[$_kk_second_idx]}
key2=${key[$_kk_second_idx]}
type2=${type[$_kk_second_idx]}
chrnum2=${chrnum[$_kk_second_idx]}

# The index is scratch state only; do not leak it to the sourcing script.
unset _kk_second_idx
##Input files
#test.cds #DNA (CDS) sequence of the longest transcript of each gene
#test.pep #longest protein sequence of each gene
# Build <outputprefix>.bed from a GFF3 file.
# Usage: getbed inputgff3 outputprefix [type [key]]
#   inputgff3     GFF3 file (the usage text says .gz is accepted as well)
#   outputprefix  prefix of the resulting <outputprefix>.bed
#   type          value passed to jcvi as --type (3rd GFF column)
#   key           value passed to jcvi as --key (attribute prefix in column 9)
# Globals written: inputgff3, prefix, type, key.
# type/key are intentionally not local: `${var:=default}` below reads and
# assigns the caller-visible variables, so an omitted argument reuses a value
# that is already set and only falls back to mRNA / ID when there is none.
# Exits the shell with status 1 on a wrong argument count (fewer than 2 or
# more than 4); returns non-zero when one of the jcvi steps fails.
function getbed(){
  if [ $# -lt 2 ]; then
    echo "usage:
getbed inputgff3 outputprefix type key
inputgff3 can be gff3 or gff3.gz .(Required)
outputprefix is the output file prefix.Preferably a 3-character abbr.(Required)
type is gfffile the 3rd cloumn string (Value:mRNA ,gene or other,Default:mRNA)
key is gfffile the Prefix for column 9.(Value:ID or other,Default:ID)
"
    exit 1
  fi
  if [ $# -gt 4 ]; then
    echo "Usage: -h /-help "
    exit 1
  fi

  inputgff3=$1
  prefix=$2
  # type and key are optional; a 2-argument call is valid and uses defaults.
  if [ $# -ge 3 ]; then
    type=$3
  fi
  if [ $# -eq 4 ]; then
    key=$4
  fi

  # Convert the GFF3 records to BED, then let jcvi write the de-duplicated
  # <prefix>.uniq.bed and move it over <prefix>.bed.
  python3 -m jcvi.formats.gff bed --type="${type:=mRNA}" --key="${key:=ID}" "${inputgff3}" -o "${prefix}.bed" || return
  python3 -m jcvi.formats.bed uniq "${prefix}.bed" || return
  mv -- "${prefix}.uniq.bed" "${prefix}.bed"
}
# Write <prefix>.cds: the records of input_cdsfa whose IDs are listed in
# column 4 of <prefix>.bed.
# Usage: getcds input_cdsfa prefix
#   input_cdsfa  CDS FASTA (the usage text says .gz is accepted as well)
#   prefix       prefix of the existing <prefix>.bed and of the output file
# Globals written: input_cdsfa, prefix.
# Exits the shell with status 1 when fewer than two arguments are given;
# returns 1 (leaving no output and no temp file) when seqkit fails.
function getcds(){
  if [ $# -lt 2 ]; then
    echo "usage:
getcds input_cdsfa prefix
input_cdsfa can be fa or fa.gz .(Required)
prefix is the input bed file prefix.Preferably a 3-character abbr.(Required)
"
    exit 1
  fi
  input_cdsfa=$1
  prefix=$2

  # Write to a temp file first: if the output name equals the input name,
  # redirecting straight to it would truncate the input and yield an empty
  # file.
  local temp_cds
  local -a stage_status
  temp_cds=$(mktemp "${prefix}.XXXXXXXX") || return

  # `seqkit grep -f` takes the ID list; `seqkit seq -i` post-processes it.
  seqkit grep -f <(cut -f4 "${prefix}.bed") "${input_cdsfa}" | seqkit seq -i >"${temp_cds}"
  stage_status=("${PIPESTATUS[@]}")

  # Without pipefail only the last stage would be checked, so inspect both.
  if (( stage_status[0] != 0 || stage_status[1] != 0 )); then
    rm -f -- "${temp_cds}"
    return 1
  fi
  mv -- "${temp_cds}" "${prefix}.cds"
}
# Write <prefix>.pep: the records of input_proteinfa whose IDs are listed in
# column 4 of <prefix>.bed.
# Usage: getpep input_proteinfa prefix
#   input_proteinfa  protein FASTA (the usage text says .gz is accepted too)
#   prefix           prefix of the existing <prefix>.bed and of the output
# Globals written: input_proteinfa, prefix.
# Exits the shell with status 1 when fewer than two arguments are given;
# returns 1 (leaving no output and no temp file) when seqkit fails.
function getpep(){
  if [ $# -lt 2 ]; then
    echo "usage:
getpep input_proteinfa prefix
input_proteinfa can be fa or fa.gz .(Required)
prefix is the input bed file prefix.Preferably a 3-character abbr.(Required)
"
    exit 1
  fi
  input_proteinfa=$1
  prefix=$2

  # Stage the result in a temp file so the output can never clobber an input
  # of the same name while it is still being read.
  local temp_pep
  local -a stage_status
  temp_pep=$(mktemp "${prefix}.XXXXXXXX") || return

  seqkit grep -f <(cut -f4 "${prefix}.bed") "${input_proteinfa}" | seqkit seq -i >"${temp_pep}"
  stage_status=("${PIPESTATUS[@]}")

  # Without pipefail only the last stage would be checked, so inspect both.
  if (( stage_status[0] != 0 || stage_status[1] != 0 )); then
    rm -f -- "${temp_pep}"
    return 1
  fi
  mv -- "${temp_pep}" "${prefix}.pep"
}
# Run the jcvi synteny steps for a species pair.
# Usage: getcoline abbr1 abbr2
# Steps, in order:
#   1. jcvi.compara.catalog ortholog  on abbr1 / abbr2
#   2. jcvi.compara.synteny screen    abbr1.abbr2.anchors -> .anchors.new
#   3. jcvi.graphics.dotplot          abbr1.abbr2.anchors -> .dotplot.pdf
# Globals written: species1, species2.
# Exits the shell with status 1 when fewer than two arguments are given.
# Returns the ortholog step's status if it fails (steps 2 and 3 both read
# its anchors file, so they are skipped); otherwise returns non-zero if
# step 2 or step 3 failed.
function getcoline(){
  if [ $# -lt 2 ]; then
    echo "usage:
getcoline abbr1 abbr2
"
    exit 1
  fi
  species1="$1"
  species2="$2"

  local anchors="${species1}.${species2}.anchors"
  local rc=0

  python3 -m jcvi.compara.catalog ortholog --dbtype prot --no_strip_names "${species1}" "${species2}" || return

  # Steps 2 and 3 are independent of each other: run both, remember failures.
  python3 -m jcvi.compara.synteny screen --minspan=30 --simple "${anchors}" "${anchors}.new" || rc=$?

  # Dot plot of the collinearity anchors.
  python3 -m jcvi.graphics.dotplot "${anchors}" --nosep --nochpf --colororientation --dpi=300 --font=Arial -o "${species1}.${species2}.dotplot.pdf" || rc=$?

  return "${rc}"
}
# Print the comma-separated names of the N "longest" sequences of a BED file.
# $1 = BED file, $2 = N (empty -> 10, which is what `head -` used to give).
# Length is approximated by the largest end coordinate (column 3) seen per
# sequence; the N largest are kept and then listed in sorted name order.
function _VisualColine_top_ids(){
  awk '{if($3 > max[$1]) max[$1] = $3} END{for(chr in max) print chr, max[chr]}' "$1" \
    | sort -rn -k2 \
    | head -n "${2:-10}" \
    | sort \
    | cut -d " " -f1 \
    | tr "\n" "," \
    | sed 's/,$/\n/'
}

# Print a karyotype layout for the pair. $1 = abbr1, $2 = abbr2,
# $3 = suffix of the anchors file referenced by the edge line.
# Reads the globals latin1 / latin2 for the track labels.
# The line format is consumed by jcvi; keep the text exactly as is.
function _VisualColine_layout(){
  printf '%s\n' \
    '# y, xstart, xend, rotation, color, label, va, bed' \
    ".6, .1, .8, 0, red, ${latin1}, top, ${1}.bed" \
    ".4, .1, .8, 0, blue, ${latin2}, top, ${2}.bed" \
    '# edges' \
    "e, 0, 1, ${1}.${2}.anchors.${3}"
}

# Print the rows of a tab-separated table ($3) whose first column equals
# <prefix><id> for one of the comma-separated ids in file $2 ($1 = prefix).
# Exact matching: "chr1" must not select "chr10".
function _VisualColine_filter_gff(){
  awk -F '\t' -v pre="$1" '
    FILENAME == ARGV[1] {
      n = split($0, id, ",")
      for (i = 1; i <= n; i++) if (id[i] != "") keep[pre id[i]] = 1
      next
    }
    ($1 in keep)
  ' "$2" "$3"
}

# Draw block-level and gene-level collinearity pictures for a species pair
# and prepare the tables used for the bar / sankey plots.
# Usage: VisualColine abbr1 abbr2 chrnum1 chrnum2
#   abbr1, abbr2      prefixes of existing <abbr>.bed and abbr1.abbr2.anchors*
#   chrnum1, chrnum2  how many of the longest sequences to show per species
# Globals read: latin1, latin2. Globals written: abbr1, abbr2, chrnum1,
# chrnum2.
# Exits the shell with status 1 when fewer than two arguments are given.
function VisualColine(){
  if [ $# -lt 2 ]; then
    echo "usage:
VisualColine abbr1 abbr2 chrnum1 chrnum2
"
    exit 1
  fi
  abbr1=$1
  abbr2=$2
  chrnum1=$3
  chrnum2=$4

  local pair="${abbr1}.${abbr2}"

  ## Visualisation
  # One line of sequence ids per species, stacked into the .seqids file.
  _VisualColine_top_ids "${abbr1}.bed" "${chrnum1}" >"${abbr1}.ids"
  _VisualColine_top_ids "${abbr2}.bed" "${chrnum2}" >"${abbr2}.ids"
  cat -- "${abbr1}.ids" "${abbr2}.ids" >"${pair}.seqids"

  # The layout is written directly instead of being templated through sed,
  # so names containing "/" or "&" cannot corrupt it.
  _VisualColine_layout "${abbr1}" "${abbr2}" simple >"${pair}.layout"

  # Block-based picture. This step may well fail; it is best effort and the
  # remaining steps are still attempted.
  python3 -m jcvi.graphics.karyotype "${pair}.seqids" "${pair}.layout" --font=Arial
  mv karyotype.pdf "${abbr1}_${abbr2}.block.coline.pdf"
  echo "${abbr1}_${abbr2}.block.coline.pdf is the coline picture!"

  # Gene-pair based picture: every anchor line becomes a one-gene block.
  awk '!/^#/ {print $1"\t"$1"\t"$2"\t"$2"\t"$3"\t""+"}' "${pair}.anchors" >"${pair}.anchors.gene"
  _VisualColine_layout "${abbr1}" "${abbr2}" gene >"${pair}.layout.gene"
  python3 -m jcvi.graphics.karyotype "${pair}.seqids" "${pair}.layout.gene" --font=Arial --nocircles
  mv karyotype.pdf "${abbr1}_${abbr2}.gene.coline.pdf"
  echo "${abbr1}_${abbr2}.gene.coline.pdf is the coline picture!"

  # Data for the bar plot and the sankey plot.
  # Table columns: <abbr><seq>, gene id, start, end (tab separated).
  {
    awk -v pre="${abbr1}" 'BEGIN{OFS="\t"} {print pre $1, $4, $2, $3}' "${abbr1}.bed"
    awk -v pre="${abbr2}" 'BEGIN{OFS="\t"} {print pre $1, $4, $2, $3}' "${abbr2}.bed"
  } >"${pair}.gff"

  # Keep only the selected sequences of each species. The second file is
  # filtered with abbr2's own ids and prefix.
  _VisualColine_filter_gff "${abbr1}" "${abbr1}.ids" "${pair}.gff" >"${abbr1}.coline.gff"
  _VisualColine_filter_gff "${abbr2}" "${abbr2}.ids" "${pair}.gff" >"${abbr2}.coline.gff"
  cat -- "${abbr1}.coline.gff" "${abbr2}.coline.gff" >"${pair}.gff"

  grep -v '^#' "${pair}.anchors" | cut -f1-2 >"${pair}.bar.coline"
}
# Prepare the input files for the Ka/Ks and 4DTv calculations and run
# ParaAT.pl on them.
# Usage: prepareResult abbr1 abbr2 [threads]
#   abbr1, abbr2  prefixes of existing abbr1.abbr2.anchors, <abbr>.cds and
#                 <abbr>.pep
#   threads       number written to the abbr1.abbr2.proc file handed to
#                 ParaAT.pl via -p; when omitted the current value of the
#                 global `thread` is used, or 32 if it is unset
# Globals read: latin1, latin2. Globals written: abbr1, abbr2, thread.
# Reads one character from stdin when <abbr1>_<abbr2>.result_dir already
# exists: Y/y deletes it, anything else renames it to *.result_dir.old.
# Exits the shell with status 1 when fewer than two arguments are given.
function prepareResult(){
  if [ $# -lt 2 ]; then
    echo "Usage:prepareResult abbr1 abbr2 threads
threads should be a number 32 or 64 or other
"
    exit 1
  fi
  abbr1=$1
  abbr2=$2
  if [ $# -ge 3 ]; then
    thread=$3
  fi

  local result_dir="${abbr1}_${abbr2}.result_dir"
  local answer

  # Deal with the output directory of a previous run.
  if [ -d "${result_dir}" ]; then
    read -r -p "The fold ${abbr1}_${abbr2}.result_dir is exist.Delete the old version ?(Y/N):" -n 1 answer
    case "${answer}" in
      Y|y)
        echo -e "\n ok!Delete the fold ${abbr1}_${abbr2}.result_dir!"
        # Actually remove it; ":?" guards against an empty path.
        rm -rf -- "${result_dir:?}"
        ;;
      *)
        # N, n and any other answer (including no input) keep the old data.
        echo -e "\n The old version will be rename ${abbr1}_${abbr2}.result_dir.old!"
        mv -- "${result_dir}" "${result_dir}.old"
        ;;
    esac
  fi

  printf '%s\n' "${thread:=32}" >"${abbr1}.${abbr2}.proc"
  # Homolog pairs: the first two columns of every non-comment anchors line.
  grep -v '^#' "${abbr1}.${abbr2}.anchors" | cut -f 1-2 >"${abbr1}_${abbr2}.homolog"
  cat -- "${abbr1}.cds" "${abbr2}.cds" >"${abbr1}_${abbr2}.cds"
  cat -- "${abbr1}.pep" "${abbr2}.pep" >"${abbr1}_${abbr2}.pep"

  # ParaAT.pl has many dependencies of its own.
  ParaAT.pl -h "${abbr1}_${abbr2}.homolog" -n "${abbr1}_${abbr2}.cds" -a "${abbr1}_${abbr2}.pep" -p "${abbr1}.${abbr2}.proc" -m mafft -f axt -g -k -o "${result_dir}"

  # Config file for the visualisation step: abbreviation <TAB> species name.
  printf '%s\t%s\n' "${abbr1}" "${latin1}" "${abbr2}" "${latin2}" >"${abbr1}.${abbr2}.config.tsv"
}
# The results are written to the <abbr1>_<abbr2>.result_dir directory.
# Collect the Ka/Ks values of all homolog pairs of a species pair.
# Usage: getkaks abbr1 abbr2
# Reads every *.axt.kaks file below <abbr1>_<abbr2>.result_dir and writes
#   <abbr1>_<abbr2>.all-kaks.results  columns 1,3,4,5 of those files, without
#                                     lines containing "Sequence", sorted and
#                                     de-duplicated (tab separated)
#   <abbr1>_<abbr2>.all-kaks.csv      the same rows, comma separated, below
#                                     the header Seq,Ka,Ks,Ka/Ks
# Globals written: abbr1, abbr2.
# Exits the shell with status 1 on missing arguments or when the result
# directory does not exist (prepareResult must be run first).
function getkaks(){
  if [ $# -lt 2 ]; then
    echo "usage:
getkaks abbr1 abbr2
"
    exit 1
  fi
  abbr1=$1
  abbr2=$2

  local pair="${abbr1}_${abbr2}"

  # Without the result directory there is nothing to merge.
  if [ ! -d "${pair}.result_dir" ]; then
    echo "请先运行prepareResult函数,以生成准备文件!"
    exit 1
  fi

  # Merge the per-pair files. `find -exec ... +` passes the file names
  # verbatim, so names containing whitespace are not split.
  find "${pair}.result_dir" -name "*.axt.kaks" -exec cat {} + \
    | cut -f 1,3,4,5 \
    | grep -v 'Sequence' \
    | sort \
    | uniq >"${pair}.all-kaks.results"

  # The header is printed explicitly, so it is present even when there are
  # no result rows and does not depend on sed's escape handling.
  {
    printf 'Seq,Ka,Ks,Ka/Ks\n'
    tr "\t" "," <"${pair}.all-kaks.results"
  } >"${pair}.all-kaks.csv"
}
function get4DTv(){
if [ $# -lt 2 ];then
echo "usage:
get4DTv abbr1 abbr2
"
exit 1
fi
abbr1=$1
abbr2=$2
#判断是否存在result_dir,不存在则需要先运行prepareResult
if [ ! -d ${abbr1}_${abbr2}.result_dir ];then
echo "请先运行prepareResult函数,以生成准备文件!"
exit 1
fi
##获取4DTv的值
#将多行axt文件转换成单行
for i in `find ${abbr1}_${abbr2}.result_dir -name "*.axt"`;do axt2one-line.py $i ${i}.one-line;done
#使用calculate_4DTV_correction.pl脚本计算4dtv值
find ${abbr1}_${abbr2}.result_dir -name "*.axt.one-line"|while read id;do calculate_4DTV_correction.pl $id >${id%%one-line}4dtv;done
#合并所有同源基因对的4dtv
find ${abbr1}_${abbr2}.result_dir -name "*.4dtv" |xargs cat| cut -f 1,3| grep -v '4dtv_raw'|sort|uniq >${abbr1}_${abbr2}.all-4dtv.results
cat ${abbr1}_${abbr2}.all-4dtv.results| sed '1i\Seq\t4dtv_corrected'|tr "\t" "," >${abbr1}_${abbr2}.all-4dtv.csv
}
# Merge the 4DTv and Ka/Ks tables of a species pair into one table.
# Usage: getkaks4DTv abbr1 abbr2
# Reads  <abbr1>_<abbr2>.all-4dtv.results and <abbr1>_<abbr2>.all-kaks.results
# Writes <abbr1>_<abbr2>.all-results.txt   (space separated, with header)
#        <abbr1>_<abbr2>.kaks4DTv.csv      (same content, comma separated)
# Rows are joined on the first column. Pairs present in only one input are
# kept, and their missing values are written as NA so that every row has the
# five columns Seq, 4dtv_corrected, Ka, Ks, Ka/Ks in that order.
# Globals written: abbr1, abbr2.
# Exits the shell with status 1 on missing arguments or missing input files.
function getkaks4DTv(){
  if [ $# -lt 2 ]; then
    echo "usage:
getkaks4DTv abbr1 abbr2
"
    exit 1
  fi
  abbr1=$1
  abbr2=$2

  local pair="${abbr1}_${abbr2}"

  # Both merged result files must exist: run get4DTv and getkaks first.
  if [ ! -e "${pair}.all-4dtv.results" ]; then
    echo "请先运行get4DTv函数,以生成4DTv!"
    exit 1
  fi
  if [ ! -e "${pair}.all-kaks.results" ]; then
    echo "请先运行getkaks函数,以生成kaks!"
    exit 1
  fi

  # Header line followed by the full outer join of both tables.
  # -o fixes the output columns and -e fills the absent ones; without them an
  # unpaired row would be shorter and its values would slide under the wrong
  # header.
  {
    printf 'Seq 4dtv_corrected Ka Ks Ka/Ks\n'
    join -a 1 -a 2 -1 1 -2 1 -e NA -o 0,1.2,2.2,2.3,2.4 \
      "${pair}.all-4dtv.results" "${pair}.all-kaks.results"
  } >"${pair}.all-results.txt"

  # Comma-separated copy of the merged table.
  sed 's/ /,/g' "${pair}.all-results.txt" >"${pair}.kaks4DTv.csv"
}