-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathvcf2fineRADstructure.sh
137 lines (110 loc) · 4.8 KB
/
vcf2fineRADstructure.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/bin/bash
# Shell script to convert the vcf file to an input format for fineRADstructure
# Written by Joana Meier, 2018
# Sorry, this is a very slow script as it extracts each RAD locus from the vcf file with vcftools
# usage: vcf2fineRADstructure.sh <vcf file> <path to bam files>
# This script requires the createRADmappingReport.sh script which requires the folder with the bam files
# It gets the RADloci present in at least 10 individuals with at least 10 reads each
# It splits the RAD loci into files of 10000 lines and processes them in parallel (increase the number of lines per file if it uses too many CPUs).
# Define variables (modify as needed):
prefix="X" # fineRADstructure does not allow sample names to start with a number, if they do, add X here
minSites=10 # minimum number of sites that a RADlocus needs to contain to be considered
# (if monomorphic sites are included, this number should be higher)
minInds=10 # minimum number of individuals that must be sequenced for each RAD locus to be considered

# Both positional arguments are mandatory. Without them the later steps would
# operate on empty paths (e.g. "cd" with no operand changes to $HOME), so stop early.
if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
  echo "usage: vcf2fineRADstructure.sh <vcf file> <path to bam files>" >&2
  exit 1
fi

# get the vcf file name (without suffix): strip a trailing ".gz" and then a trailing ".vcf"
file=$1
file=${file%.gz}
file=${file%.vcf}
# folder holding the bam files, without a trailing slash
bamfilesFolder=$2
bamfilesFolder=${bamfilesFolder%/}
# Make sure the mapping report (seq_depth_min10.txt) exists in the directory containing the bam files.
# If it does not exist yet (or is empty), run the mapping report script inside that directory.
currentDir=$(pwd)
reportFile="$bamfilesFolder/seq_depth_min10.txt"
if [[ -s "$reportFile" ]]; then
  echo "Mapping report exists already. I will not regenerate it."
else
  # The comments of this script name the helper createRADmappingReport.sh, whereas
  # the command used so far was createMappingReport.sh. Prefer the documented name
  # when it is on the PATH and fall back to the previously used one otherwise.
  if command -v createRADmappingReport.sh >/dev/null 2>&1; then
    reportScript=createRADmappingReport.sh
  else
    reportScript=createMappingReport.sh
  fi
  # Do not run the report script in the wrong directory if the folder is not accessible
  cd "$bamfilesFolder" || { echo "Error: cannot change into bam folder '$bamfilesFolder'" >&2; exit 1; }
  "$reportScript"
  cd "$currentDir" || { echo "Error: cannot return to '$currentDir'" >&2; exit 1; }
fi
# All later steps read the report; without it they would silently produce empty output
if [[ ! -s "$reportFile" ]]; then
  echo "Error: mapping report $reportFile is missing or empty!" >&2
  exit 1
fi
# Extract RADloci with at least minInds individuals from seq_depth_min10.txt file.
# Fields 3 and 4 of the (space-delimited) report are counted per unique pair; pairs
# seen at least minInds times are kept, version-sorted, and each position P is
# written to the file RADloci as two windows: "<field3> P-100 P" and "<field3> P P+100".
# Lines containing "locus" are dropped.
# Everything runs in one pipeline, so no intermediate files (formerly RADpos,
# RADpos.c, RADpos.sorted) are created or deleted in the working directory.
cut -d" " -f 3,4 "$bamfilesFolder/seq_depth_min10.txt" \
  | sort \
  | uniq -c \
  | awk -v minInds="$minInds" '$1 >= minInds {print $2, $3}' \
  | sort -V \
  | awk '{print $1" "$2-100" "$2"\n"$1" "$2" "$2+100}' \
  | grep -v "locus" > RADloci
# Get the number of individuals (number of columns of the "#CH..." header line minus
# the 9 leading columns) and remember whether the vcf file is gzipped:
#   suff  - suffix appended to "$file.vcf" ("" or ".gz")
#   vcfgz - prefix of the vcftools input option ("--vcf" or "--gzvcf")
if [[ -s "$file.vcf.gz" ]]; then
  nind=$(zgrep '^#CH' "${file}.vcf.gz" | awk '{print NF-9; exit}')
  suff=".gz" # suffix
  vcfgz="gz" # for vcftools
elif [[ -s "$file.vcf" ]]; then
  nind=$(grep '^#CH' "${file}.vcf" | awk '{print NF-9; exit}')
  suff=""
  vcfgz=""
else
  echo -e "Error: file $file.vcf[.gz] not found!\nexiting..." >&2
  exit 1
fi
# nind drives the column arithmetic further down; stop here if no usable
# header line was found instead of failing later inside the loops.
if [[ ! "$nind" =~ ^[0-9]+$ ]] || (( nind < 1 )); then
  echo "Error: could not determine the number of individuals from the header of $file.vcf$suff" >&2
  exit 1
fi
# Split the RADloci file into files of 10000 lines for parallelisation.
# Files named RADloci.* left over from an earlier run in this directory (chunks and
# their *.file outputs) are removed first: otherwise they would be picked up as
# chunks by the RADloci.* glob below and their old contents would be appended to.
rm -f RADloci.*
split -l 10000 RADloci RADloci.
#######################################
# Function to generate the <chunk>.file for one chunk of RAD loci.
# For every line of the chunk, the SNPs inside the given window are extracted
# from the vcf file with vcftools (tped format). If SNPs are found, one line is
# appended to "<chunk>.file":
#   locus<N> TAB <tped col 1> <tped col 2> TAB <hap1>/<hap2> TAB ... (one pair per individual)
# where <hap1>/<hap2> are the alleles of tped columns 5,6 / 7,8 / ... concatenated
# over all SNPs of the locus, and <N> is the line number within the chunk.
# Globals (read): file, suff, vcfgz, nind
# Arguments:      $1 - chunk file with lines "<chr> <from-bp> <to-bp>"
# Outputs:        appends to "$1.file"; vcftools temporary files are removed
#######################################
generateFile() {
  # Work on the file passed as argument rather than on a global variable
  local chunk=$1
  local idx=0
  local chrom fromBp toBp _rest locusOut

  while read -r chrom fromBp toBp _rest; do
    # increment the index
    idx=$((idx + 1))
    locusOut="${chunk}.${idx}"

    # get the SNPs of the RADlocus
    vcftools "--${vcfgz}vcf" "$file.vcf$suff" --chr "$chrom" --plink-tped \
      --from-bp "$fromBp" --to-bp "$toBp" \
      --out "$locusOut"

    # if the RADlocus contains SNPs: append the info in the correct format.
    # A single awk pass concatenates the alleles per haplotype column instead of
    # starting two cut processes per individual.
    if [[ -s "$locusOut.tped" ]]; then
      awk -F'\t' -v nind="$nind" -v idx="$idx" '
        NR == 1 { head = $1 " " $2 }
        {
          for (k = 0; k < 2 * nind; k++) hap[k] = hap[k] $(k + 5)
        }
        END {
          printf "locus%s\t%s\t", idx, head
          for (j = 0; j < nind; j++) printf "%s/%s\t", hap[2 * j], hap[2 * j + 1]
          printf "\n"
        }' "$locusOut.tped" >> "$chunk.file"
    fi

    # remove temporary files (-f: vcftools may not have written all of them)
    rm -f -- "$locusOut.log" "$locusOut.tfam" "$locusOut.tped"
  # Read in the RADlocifile given as argument when running the function
  done < "$chunk"
}
# run each file separately (in parallel) with the function above
# Note: the loop variable must keep the name RADlocifile, since generateFile may
# read it as a global variable.
for RADlocifile in RADloci.*; do
  # an unmatched glob stays literal; only pass existing files on
  [[ -f "$RADlocifile" ]] || continue
  # names with a further extension (e.g. RADloci.aa.file) are outputs, not chunks
  if [[ "$RADlocifile" == RADloci.*.* ]]; then
    continue
  fi
  generateFile "$RADlocifile" &
done
# wait until all background jobs have finished
wait
# Compute missing data for each individual (can later also be used to filter out bad individuals)
vcftools --missing-indv "--${vcfgz}vcf" "$file.vcf$suff" --out "$file"
# Get the number of individuals in the file (lines of the imiss file not containing "INDV"):
nind=$(grep -v -c INDV "$file.imiss")
# Generate the final input file for fineRADstructure (this overwrites an existing file):
# header line "Chr", followed by the first column of every non-INDV line of the imiss file.
# Note: If samples start with a number, add X to the beginning of each individual name by specifying prefix above
# The names are passed to printf as arguments, not as the format string, so that
# a "%" in a sample name or in the prefix is printed literally.
awk -v prefix="$prefix" '
  BEGIN { printf "Chr\t" }
  !/INDV/ { printf "%s%s\t", prefix, $1 }
  END { print "" }
' "$file.imiss" > "${file}_fineRADstructure"
# Add the RADloci data (only RADtags longer than $minSites)
# Collapse the two haplotypes of an individual if they are identical
# Remove both haplotypes if they contain missing data
for RADlociFile in RADloci*file; do
  # an unmatched glob stays literal; skip it instead of handing it to cut
  [[ -f "$RADlociFile" ]] || continue
  # Fields 2..nind+2 of each line: "<chr> <snp id>" followed by the genotypes.
  # 1st awk: keep only the part of the SNP id before the first ":" as first column.
  # 2nd awk: keep loci whose genotype string in column 4 has at least minSites sites
  #          (length of "hap1/hap2" is 2*sites+1), collapse identical haplotypes and
  #          blank genotypes containing a 0.
  # NOTE(review): the length test uses column 4 (third individual) - confirm this is intended.
  cut -f "2-$((nind + 2))" "$RADlociFile" \
    | awk '{split($2,cont,"[:]"); $1=""; $2=cont[1]; print $0}' \
    | awk -v minSites="$minSites" '{
        if (length($4) >= (minSites * 2 + 1)) {
          for (i = 2; i <= NF; i++) {
            split($i, genot, "/")
            if (genot[1] == genot[2]) $i = genot[1]
            if ($i ~ /0/) $i = ""
          }
          # Force the record to be rebuilt: if no field was changed above, the line
          # would keep the leading blank from the first awk and end up with a
          # leading tab, i.e. with shifted columns.
          $1 = $1
          print $0
        }
      }' \
    | sed 's/ /\t/g' >> "${file}_fineRADstructure"
done
# Replace blanks by tabs (fineRADstructure prefers that)
sed -i 's/ /\t/g' "${file}_fineRADstructure"