Skip to content

Commit

Permalink
new options for the algorithm, maybe this will make things possible
Browse files Browse the repository at this point in the history
  • Loading branch information
JokingHero committed Apr 19, 2024
1 parent f481ee5 commit 18a0ded
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 24 deletions.
11 changes: 10 additions & 1 deletion src/CHOPOFF.jl
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,13 @@ function parse_commandline(args::Array{String})
help = "Defines length of the hash. "
arg_type = Int
required = false
"--reuse_saved_not"
help = "Whether to reuse paths that were saved for Cas9 distance 4 and prefix 16."
action = :store_true
"--variant_overlaps"
help = "Whether to check for all potential combinations of alternate alleles for nearby variants. " *
"Only use with small VCF files! Preferably only run for specific variants."
action = :store_true
end

@add_arg_table! s["build"]["pamDB"] begin
Expand Down Expand Up @@ -524,7 +531,9 @@ function main(args::Array{String})
if hash_len === nothing
hash_len = min(length_noPAM(motif) - (motif.distance), 16)
end
build_vcfDB(args["name"], args["genome"], args["vcfDB"]["vcf"], motif, args["output"], hash_len)
build_vcfDB(args["name"], args["genome"], args["vcfDB"]["vcf"], motif, args["output"], hash_len;
reuse_saved = !args["vcfDB"]["reuse_saved_not"],
variant_overlaps = args["vcfDB"]["variant_overlaps"])
elseif args["%COMMAND%"] == "fmi"
build_fmiDB(args["genome"], args["output"])
elseif args["%COMMAND%"] == "pamDB"
Expand Down
55 changes: 35 additions & 20 deletions src/db_vcf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ function parse_vcf(vcf_filepath::String)
recordNum += 1
end
close(reader)
return rs_ids, rs_chroms, rs_ref, rs_ranges, rs_alt
order = sortperm(collect(zip(rs_chroms, rs_ranges)))
return rs_ids[order], rs_chroms[order], rs_ref[order], rs_ranges[order], rs_alt[order]
end


Expand All @@ -48,12 +49,15 @@ build_vcfDB(
motif::Motif,
storage_path::String,
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16);
reuse_saved::Bool = true)
reuse_saved::Bool = true,
variant_overlaps = false)
```
Builds a database of all potential off-targets that overlap any of the variants in the VCF file.
It supports combinations of variants that are close to each other and will report all possible combinations of
variants. This database uses similar principles to `prefixHashDB` and also utilizes a hashed prefix of a specific length.
In case of trouble loading VCF files, note that the only fields we use are ID, CHROM, POS, REF and ALT, so it's
often possible to remove the INFO field and other unnecessary fields which may cause trouble.
# Arguments
Expand All @@ -72,6 +76,8 @@ variants. This database uses simialr principles to `prefixHashDB`, also utilizes
`reuse_saved` - Whether to reuse paths that were saved for Cas9 distance 4 and prefix 16.
`variant_overlaps` - Whether to check for all potential combinations of alternate alleles for nearby variants.
Only use with small VCF files! Preferably only run for specific variants.
# Examples
```julia
Expand All @@ -84,19 +90,21 @@ function build_vcfDB(
vcfpath::String,
motif::Motif,
storage_path::String,
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16); reuse_saved = true)
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16);
reuse_saved = true,
variant_overlaps = false)

dbi = DBInfo(genomepath, name, motif; vcf_filepath = vcfpath)
hash_type = CHOPOFF.smallestutype(parse(UInt, repeat("1", hash_len * 2); base = 2))
suffix_type = CHOPOFF.smallestutype(parse(UInt, repeat("1", (CHOPOFF.length_noPAM(motif) - hash_len + motif.distance) * 2); base = 2))
ot_type = CHOPOFF.smallestutype(parse(UInt, repeat("1", (CHOPOFF.length_noPAM(motif) + motif.distance) * 2); base = 2))


@info "Step 1: Reading VCF file."
rs_ids, rs_chroms, rs_ref, rs_ranges, rs_alt = CHOPOFF.parse_vcf(vcfpath)
l = length_noPAM(motif) + motif.distance
lp = length(motif) + motif.distance # with PAM

@info "Step 1: Parsing the genomic relation to the VCF file."
# For each chromosome parallelized we build database
@info "Step 2: Parsing the genomic relation to the VCF file."
ref = open(dbi.gi.filepath, "r")
reader = dbi.gi.is_fa ? FASTA.Reader(ref, index = dbi.gi.filepath * ".fai") : TwoBit.Reader(ref)

Expand All @@ -107,27 +115,34 @@ function build_vcfDB(
ambig_annot = Vector{String}() # change to InlineStrings

for ch in unique(rs_chroms)
#ch = first(unique(rs_chroms)) # REMOVE
ch_ = convert(dbi.gi.chrom_type, findfirst(isequal(ch), dbi.gi.chrom))
ch_numeric = findfirst(isequal(ch), dbi.gi.chrom)
if isnothing(ch_numeric)
@warn("Chromosome " * string(ch) * " is not indexed for specified genome, skipping it.")
continue
end
ch_ = convert(dbi.gi.chrom_type, ch_numeric)
@info "Working on " * string(ch)

chrom_seq = CHOPOFF.getchromseq(dbi.gi.is_fa, reader[ch])
chrom_idx = findall(isequal(ch), rs_chroms)
# now for every snp (we assume they are sorted)

# group snps by proximity - as we have to enumerate all permutations of rs_alt for them
grouping = 1
grouping_idx = ones(Int, length(chrom_idx))
for i in 1:(length(chrom_idx) - 1)
x = (rs_ranges[chrom_idx[i]].start - l):(rs_ranges[chrom_idx[i]].stop + l)
if length(intersect(x, rs_ranges[chrom_idx[i+1]])) > 0 # rs overlaps
grouping_idx[i + 1] = grouping
else
grouping += 1
grouping_idx[i + 1] = grouping
grouping_idx = collect(1:length(chrom_idx))
if variant_overlaps # we need to correct grouping, otherwise each variant is in its own group
for i in 1:(length(chrom_idx) - 1)
x = (rs_ranges[chrom_idx[i]].start - l):(rs_ranges[chrom_idx[i]].stop + l)
if length(intersect(x, rs_ranges[chrom_idx[i+1]])) > 0 # rs overlaps
grouping_idx[i + 1] = grouping
else
grouping += 1
grouping_idx[i + 1] = grouping
end
end
end

# for each group we analyze these snps together
for group in unique(grouping_idx)
@showprogress dt=60 for group in unique(grouping_idx)
# group = first(unique(grouping_idx))
idxs = chrom_idx[grouping_idx .== group]
if length(idxs) == 1 # simple case - singular snp - potentially many alternate alleles
Expand Down Expand Up @@ -237,7 +252,7 @@ function build_vcfDB(
end
close(ref)

@info "Step 2: Constructing Paths for hashes"
@info "Step 3: Constructing Paths for hashes"
paths = nothing
if (reuse_saved && (motif.distance <= 4) && (hash_len <= 16))
m2 = Motif("Cas9")
Expand Down Expand Up @@ -276,7 +291,7 @@ function build_vcfDB(
end

if length(ambig_guides) > 0
@info "Step 3: Constructing DB for ambigous gRNAs."
@info "Step 4: Constructing DB for ambigous gRNAs."
order = sortperm(ambig_guides)
ambig_guides = ambig_guides[order]
ambig_chrom = ambig_chrom[order]
Expand Down
5 changes: 3 additions & 2 deletions test/src/db.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ using CSV
using DataFrames

## SET WD when debugging
# cd("test")
cd("test")

## CRISPRitz compare functions - we test with up to 4 distance
function asguide(x::String)
Expand Down Expand Up @@ -94,7 +94,8 @@ end
vcf_storage_path = joinpath(vcf_path, "vcfDB.bin")
build_vcfDB(
"samirandom", genome, vcf,
Motif("Cas9"; distance = 2, ambig_max = 3), vcf_storage_path)
Motif("Cas9"; distance = 2, ambig_max = 3), vcf_storage_path;
variant_overlaps = true)

detail_path_vcf = joinpath(vcf_path, "output.csv")
search_vcfDB(vcf_storage_path, guides, detail_path_vcf; distance = 2,
Expand Down
2 changes: 1 addition & 1 deletion test/src/db_extends5_false.jl
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ end
vcf_storage_path = joinpath(vcf_path, "vcfDB.bin")
build_vcfDB(
"samirandom", genome, vcf,
Motif("Cas12a"; distance = 1, ambig_max = 3), vcf_storage_path)
Motif("Cas12a"; distance = 1, ambig_max = 3), vcf_storage_path; variant_overlaps = true)

detail_path_vcf = joinpath(vcf_path, "output.csv")
search_vcfDB(vcf_storage_path, guides, detail_path_vcf; distance = 1,
Expand Down

0 comments on commit 18a0ded

Please sign in to comment.