-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path8_filter_entirelyunmappedreads.sh
48 lines (34 loc) · 1.39 KB
/
8_filter_entirelyunmappedreads.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/bin/bash
# Author: Ulrich Lautenschlager
#
# Purpose: for every individual, collect the reads that are flagged as
#   unmapped in ALL of its SAM files and copy their FASTQ records to
#   reads_unmapped/<individual>_unmapped.fastq.
#
# Usage: execute in the folder that contains the folders "reads" and "results"
#   from the previous mapping step.
#   Inputs : results/<individual>_*      (one SAM file per reference)
#            reads/<individual>.fastq    (4-line FASTQ records)
#   Outputs: <individual>_unmapped.txt   (IDs of the entirely unmapped reads)
#            reads_unmapped/<individual>_unmapped.fastq
#
# The names of all individuals/sequenced samples have to be given below. They
# must be identical to the file names in the "reads" folder, without the
# .fastq suffix.
individuals=(individual_1 individual_2 individual3)
####

#######################################
# Print the IDs of reads that are flagged as unmapped in every given SAM file.
# Arguments: $@ - SAM files belonging to one individual
# Outputs:   one read ID per line on stdout (order unspecified)
# Returns:   exit status of awk
#######################################
list_entirely_unmapped_ids() {
  # The unmapped bit (0x4) of the FLAG column is tested with plain arithmetic
  # because the bitwise and() function only exists in gawk.
  # NOTE(review): a read is reported when its number of unmapped records equals
  # the number of SAM files, i.e. this presumes one record per read and file
  # (single-end data) - confirm before using it on paired-end SAM files.
  awk -v n="$#" '
    !/^@/ && int($2 / 4) % 2 == 1 {
      unmapped[$1]++
    }
    END {
      for (read_id in unmapped) {
        if (unmapped[read_id] == n) {
          print read_id
        }
      }
    }
  ' "$@"
}

#######################################
# Print the FASTQ records whose read ID is listed in an ID file.
# IDs are compared as exact strings against the first whitespace-delimited
# word of each header line (every 4th line, starting with the first), so
# regex metacharacters in IDs, headers without a description and quality
# strings starting with "@" are all handled correctly.
# Arguments: $1 - file with one read ID per line (without leading "@")
#            $2 - FASTQ file with strictly 4 lines per record
# Outputs:   matching records on stdout, in FASTQ order
# Returns:   exit status of awk
#######################################
extract_fastq_records() {
  local id_file=$1
  local fastq=$2
  awk -v id_file="$id_file" '
    BEGIN {
      while ((getline id < id_file) > 0) {
        wanted["@" id] = 1
      }
      close(id_file)
    }
    FNR % 4 == 1 {
      keep = ($1 in wanted)
    }
    keep
  ' "$fastq"
}

#######################################
# Process all configured individuals; an individual with missing input is
# reported on stderr and skipped so that the remaining ones are still handled.
# Globals:   individuals (read)
# Returns:   1 if the output folder cannot be created, 0 otherwise
#######################################
main() {
  local ind sam fastq id_list
  local -a sam_files

  mkdir -p reads_unmapped || {
    echo "ERROR: cannot create folder reads_unmapped!" >&2
    return 1
  }

  for ind in "${individuals[@]}"; do
    # collect SAM files (an unmatched glob stays literal and fails the -f test)
    sam_files=()
    for sam in results/"${ind}"_*; do
      if [[ -f "$sam" ]]; then
        sam_files+=("$sam")
      fi
    done
    if (( ${#sam_files[@]} == 0 )); then
      echo "ERROR: no SAM file found for individual ${ind}! Individual skipped." >&2
      continue
    fi

    fastq="reads/${ind}.fastq"
    if [[ ! -r "$fastq" ]]; then
      echo "ERROR: cannot read ${fastq}! Individual skipped." >&2
      continue
    fi

    # get IDs of reads without any map
    id_list="${ind}_unmapped.txt"
    if ! list_entirely_unmapped_ids "${sam_files[@]}" > "$id_list"; then
      echo "ERROR: could not evaluate SAM files of individual ${ind}! Individual skipped." >&2
      continue
    fi

    # extract fastq entries
    if ! extract_fastq_records "$id_list" "$fastq" \
      > "reads_unmapped/${ind}_unmapped.fastq"; then
      echo "ERROR: could not filter ${fastq}!" >&2
    fi
  done
  return 0
}

main "$@"