-
Notifications
You must be signed in to change notification settings - Fork 0
/
strictMaskFilter.R
executable file
·98 lines (75 loc) · 3.15 KB
/
strictMaskFilter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env Rscript
#SBATCH --partition=fn_long
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=8G
#SBATCH --time=10-00:00:00
#SBATCH --mail-type=BEGIN,FAIL,END
#SBATCH [email protected]
#SBATCH --output=strictMaskFilter-%x.log
# Announce the start of the run in the job log. Each line goes through
# print(quote = FALSE) so the log keeps its usual "[1] ..." formatting.
bannerRule <- "###############################################################"
for (bannerLine in c(bannerRule, "# Strict Mask Filter #", bannerRule)) {
  print(bannerLine, quote = FALSE)
}
# Load the required packages from the project-local R library
library(dplyr, lib.loc = "/gpfs/scratch/blanep01/chipAnalysis/lib/R/library")
library(readr, lib.loc = "/gpfs/scratch/blanep01/chipAnalysis/lib/R/library")
library(spatstat.utils, lib.loc = "/gpfs/scratch/blanep01/chipAnalysis/lib/R/library")
# Command-line arguments (positional, all four required):
#   1. path to the tab-delimited variant call file
#   2. name of the filter (used as the prefix of the output file name)
#   3. path to the tab-delimited filter (region) file
#   4. chromosome used to subset both the calls and the filter regions
inputArgs <- commandArgs(trailingOnly = TRUE)

# Fail early with a usage message. Without this check a missing argument
# silently becomes NA and only surfaces later as an unrelated-looking error
# (an NA file path, or an NA chromosome comparison).
if (length(inputArgs) < 4) {
  stop(
    "Expected 4 arguments but received ", length(inputArgs), ".\n",
    "Usage: strictMaskFilter.R <variantCalls> <filterName> <filterFile> ",
    "<queryChrom>",
    call. = FALSE
  )
}

variantCalls <- inputArgs[1]
filterName <- inputArgs[2]
filterFile <- inputArgs[3]
queryChrom <- inputArgs[4]
# Load the variant calls (described by the original author as a LoFreq VCF
# file) as a tab-delimited table whose first line is the header. Note that
# `variantCalls` holds the file path on entry and the parsed table afterwards.
variantCalls <- read_delim(variantCalls, delim = "\t", col_names = TRUE)

# Load the region filter: one character column followed by two numeric
# columns, which are then renamed to chr / start / end.
# NOTE(review): col_names is left at its default here, so the first line of
# the filter file is presumably consumed as a header row - confirm the file
# really has one, otherwise its first region is lost.
regionFilter <- read_delim(filterFile, delim = "\t", col_types = "cdd")
names(regionFilter) <- c("chr", "start", "end")
# Restrict the calls to the requested chromosome, then collect every distinct
# call position once so a repeated position is only tested against the filter
# a single time.
callChromSubset <- variantCalls %>%
  filter(CHR == queryChrom)
queryCalls <- callChromSubset %>%
  distinct(POS)

# Bring the filter's chromosome names into the same format as `queryChrom`
# ("chr1" vs "1"). The check is done per row and in both directions: the
# prefix is added when the query uses it and stripped when the query does
# not, so the comparison below can match in either naming scheme.
filterChrom <- regionFilter$chr
filterHasPrefix <- !is.na(filterChrom) & startsWith(filterChrom, "chr")
if (isTRUE(startsWith(queryChrom, "chr"))) {
  needsPrefix <- !is.na(filterChrom) & !filterHasPrefix
  filterChrom[needsPrefix] <- paste0("chr", filterChrom[needsPrefix])
} else {
  filterChrom[filterHasPrefix] <- sub("^chr", "", filterChrom[filterHasPrefix])
}
regionFilter$chr <- filterChrom

# Keep only the start/end coordinates of the regions on that chromosome
regionSubset <- regionFilter %>%
  filter(chr == queryChrom) %>%
  select(start, end)
# Determine which query positions fall inside at least one filter region.
# A position passes when min(start, end) <= position <= max(start, end) for
# some region, i.e. both ends are inclusive.
# NOTE(review): if the filter file uses 0-based, half-open BED coordinates the
# inclusive comparison may be off by one at region boundaries - confirm.
candidatePositions <- queryCalls$POS

# Normalise each region to (lower, upper) and drop regions with a missing bound
regionLower <- pmin(regionSubset$start, regionSubset$end)
regionUpper <- pmax(regionSubset$start, regionSubset$end)
regionIsComplete <- !is.na(regionLower) & !is.na(regionUpper)
regionLower <- regionLower[regionIsComplete]
regionUpper <- regionUpper[regionIsComplete]

# Sort the regions by lower bound. furthestUpper[i] is then the largest upper
# bound among the first i regions, so a position is covered exactly when it
# does not exceed furthestUpper at the index of the last region starting at or
# before it. This stays correct when regions overlap or are nested.
regionOrder <- order(regionLower)
regionLower <- regionLower[regionOrder]
furthestUpper <- cummax(regionUpper[regionOrder])

# Number of regions starting at or before each position (0 = none, NA = the
# position itself is missing); both of those cases fail the filter.
regionsStartedBefore <- findInterval(candidatePositions, regionLower)
isInsideRegion <- !is.na(regionsStartedBefore) & regionsStartedBefore > 0L
isInsideRegion[isInsideRegion] <-
  candidatePositions[isInsideRegion] <=
  furthestUpper[regionsStartedBefore[isInsideRegion]]

# Positions that pass the filter and should be kept. NULL (rather than an
# empty vector) signals that nothing passed, which the check that follows
# relies on.
positionsToKeep <- candidatePositions[isInsideRegion]
if (length(positionsToKeep) == 0) {
  positionsToKeep <- NULL
}
# If no position passed the filter something is wrong: report it in the log
# and stop with an error, so the job fails cleanly instead of crashing later
# on an undefined `filteredCalls`. No output file is written in that case.
if (length(positionsToKeep) == 0) {
  print("ERROR: NO CALLS PASSED FILTER!", quote = FALSE)
  print("###############################################################", quote = FALSE)
  stop(
    "No variant calls on ", queryChrom, " passed the ", filterName,
    " filter; no output file was written.",
    call. = FALSE
  )
}

# Keep only the calls whose position fell inside a filter region. All rows
# sharing a passing position are kept, since positions were de-duplicated
# only for the lookup.
filteredCalls <- callChromSubset %>%
  filter(POS %in% positionsToKeep)

# Write the filtered variant call list to <filterName><queryChrom>FilteredCalls.txt.
# The output path is passed positionally because that argument is named `path`
# in older readr releases and `file` in newer ones.
fileName <- sprintf("%s%sFilteredCalls.txt", filterName, queryChrom)
write_delim(filteredCalls, fileName, delim = "\t", col_names = TRUE)