Commit 6da3e41: fixes for preloading of Cas9
JokingHero committed Apr 9, 2024
1 parent bd98d88

Showing 7 changed files with 102 additions and 49 deletions.
Binary file added data/Cas9_d4_p16_distances.bin
Binary file not shown.
Binary file added data/Cas9_d4_p16_paths_part1.bin
Binary file not shown.
Binary file added data/Cas9_d4_p16_paths_part2.bin
Binary file not shown.
19 changes: 15 additions & 4 deletions data/compute_default_path_templates.jl
@@ -1,15 +1,26 @@
using CHOPOFF
using DataFrames

motif = Motif("Cas9"; distance = 4)
mpt = build_PathTemplates(motif; restrict_to_len = 16)
paths = UInt8.(mpt.paths)
distances = UInt8.(mpt.distances)

paths = mpt.paths[:, 1:16]
not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto))))
paths = paths[not_dups, :]
distances = mpt.distances[not_dups]

split = Int(floor(length(distances) / 2))
paths1 = paths[1:split, :]
paths2 = paths[(split + 1):end, :]

CHOPOFF.save(distances, "./data/Cas9_d4_p16_distances.bin")
CHOPOFF.save(paths, "./data/Cas9_d4_p16_paths.bin")
CHOPOFF.save(paths1, "./data/Cas9_d4_p16_paths_part1.bin")
CHOPOFF.save(paths2, "./data/Cas9_d4_p16_paths_part2.bin")

d2 = CHOPOFF.load("./data/Cas9_d4_p16_distances.bin")
p2 = CHOPOFF.load("./data/Cas9_d4_p16_paths.bin")
p1 = CHOPOFF.load("./data/Cas9_d4_p16_paths_part1.bin")
p2 = CHOPOFF.load("./data/Cas9_d4_p16_paths_part2.bin")
p2 = vcat(p1, p2)
if (paths != p2) | (distances != d2)
@warn "Failed to sucessfully save the path templates. CHOPOFF will still work, but will be slower in some cases."
rm("./data/Cas9_d4_p16_paths.bin")
50 changes: 44 additions & 6 deletions src/db_prefix_hash.jl
@@ -141,7 +141,8 @@ name::String,
genomepath::String,
motif::Motif,
storage_dir::String,
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16))
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16);
reuse_saved::Bool = true)
```
Prepare prefixHashDB index for future searches using `search_prefixHashDB`.
@@ -162,6 +163,8 @@ You can also play with `hash_len` parameter, but keeping it at 16 should be clos
`hash_len` - Length of the hash in bp. At maximum 16.
`reuse_saved` - Whether to reuse the precomputed paths shipped with CHOPOFF for Cas9 with distance 4 and prefix length 16.
# Examples
```julia
$(make_example_doc("prefixHashDB"))
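
# For reference, a minimal call sketch of the new keyword (a sketch only: the
# name, genome path, and output directory below are placeholders, not files
# from this repository):
motif = Motif("Cas9"; distance = 4)
build_prefixHashDB("example_db", "genome.fa", motif, "phdb_dir/";
    reuse_saved = false)  # false forces rebuilding instead of loading the Cas9 d4/p16 files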
@@ -172,7 +175,8 @@ function build_prefixHashDB(
genomepath::String,
motif::Motif,
storage_dir::String,
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16))
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16);
reuse_saved::Bool = true)

if hash_len > 16
throw("hash_len $hash_len is more than 16")
@@ -228,11 +232,45 @@ function build_prefixHashDB(
isplus = BitVector(isplus[order])

@info "Step 3: Constructing Paths for hashes"
mpt = build_PathTemplates(motif; restrict_to_len = hash_len, withPAM = false)
paths = mpt.paths[:, 1:hash_len]
not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) # how can there be no duplicated function?!
# trying to preload paths for most common use case
paths = nothing
if (reuse_saved && (motif.distance <= 4) && (hash_len <= 16))
m2 = Motif("Cas9")
if (motif.fwd == m2.fwd &&
motif.rve == m2.rve &&
motif.pam_loci_fwd == m2.pam_loci_fwd &&
motif.pam_loci_rve == m2.pam_loci_rve)

dir = joinpath(dirname(pathof(CHOPOFF)), "..", "data")
pfile1 = joinpath(dir, "Cas9_d4_p16_paths_part1.bin")
pfile2 = joinpath(dir, "Cas9_d4_p16_paths_part2.bin")
dfile = joinpath(dir, "Cas9_d4_p16_distances.bin")
if (isfile(pfile1) && isfile(pfile2) && isfile(dfile))
@info "Reusing precomputed alignments."
paths = CHOPOFF.load(pfile1)
paths2 = CHOPOFF.load(pfile2)
paths = vcat(paths, paths2)
distances = CHOPOFF.load(joinpath(dir, "Cas9_d4_p16_distances.bin"))
paths = paths[:, 1:hash_len]
not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto))))
not_over_dist = BitVector(distances .<= motif.distance)
not = not_dups .& not_over_dist
paths = paths[not, :]
distances = distances[not]
paths = convert.(smallestutype(maximum(paths)), paths)
end
end
end

if isnothing(paths)
mpt = build_PathTemplates(motif; restrict_to_len = hash_len, withPAM = false)
paths = mpt.paths[:, 1:hash_len]
not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) # how can there be no duplicated function?!
paths = paths[not_dups, :]
distances = mpt.distances[not_dups]
end
mkpath(storage_dir)
save(PrefixHashDB(SymbolicAlignments(dbi, paths[not_dups, :], mpt.distances[not_dups], hash_len),
save(PrefixHashDB(SymbolicAlignments(dbi, paths, distances, hash_len),
prefixes, suffixes, chrom, pos, isplus), joinpath(storage_dir, "prefixHashDB.bin"))
@info "Finished constructing prefixHashDB in " * storage_dir

48 changes: 42 additions & 6 deletions src/db_vcf.jl
@@ -47,7 +47,8 @@ build_vcfDB(
vcfpath::String,
motif::Motif,
storage_path::String,
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16))
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16);
reuse_saved::Bool = true)
```
Builds a database of all potential off-targets that overlap any of the variants in the VCF file.
@@ -69,6 +70,8 @@ variants. This database uses similar principles to `prefixHashDB`, also utilizes
`hash_len` - length of the prefix that is stored inside the hash
`reuse_saved` - Whether to reuse the precomputed paths shipped with CHOPOFF for Cas9 with distance 4 and prefix length 16.
# Examples
```julia
@@ -81,7 +84,7 @@ function build_vcfDB(
vcfpath::String,
motif::Motif,
storage_path::String,
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16))
hash_len::Int = min(length_noPAM(motif) - motif.distance, 16); reuse_saved = true)

dbi = DBInfo(genomepath, name, motif; vcf_filepath = vcfpath)
hash_type = CHOPOFF.smallestutype(parse(UInt, repeat("1", hash_len * 2); base = 2))
@@ -235,9 +238,42 @@ function build_vcfDB(
close(ref)

@info "Step 2: Constructing Paths for hashes"
mpt = build_PathTemplates(motif; restrict_to_len = hash_len, withPAM = false)
paths = mpt.paths[:, 1:hash_len]
not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) # how can there be no duplicated function?!
paths = nothing
if (reuse_saved && (motif.distance <= 4) && (hash_len <= 16))
m2 = Motif("Cas9")
if (motif.fwd == m2.fwd &&
motif.rve == m2.rve &&
motif.pam_loci_fwd == m2.pam_loci_fwd &&
motif.pam_loci_rve == m2.pam_loci_rve)

dir = joinpath(dirname(pathof(CHOPOFF)), "..", "data")
pfile1 = joinpath(dir, "Cas9_d4_p16_paths_part1.bin")
pfile2 = joinpath(dir, "Cas9_d4_p16_paths_part2.bin")
dfile = joinpath(dir, "Cas9_d4_p16_distances.bin")
if (isfile(pfile1) && isfile(pfile2) && isfile(dfile))
@info "Reusing precomputed alignments."
paths = CHOPOFF.load(pfile1)
paths2 = CHOPOFF.load(pfile2)
paths = vcat(paths, paths2)
distances = CHOPOFF.load(joinpath(dir, "Cas9_d4_p16_distances.bin"))
paths = paths[:, 1:hash_len]
not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto))))
not_over_dist = BitVector(distances .<= motif.distance)
not = not_dups .& not_over_dist
paths = paths[not, :]
distances = distances[not]
paths = convert.(smallestutype(maximum(paths)), paths)
end
end
end

if isnothing(paths)
mpt = build_PathTemplates(motif; restrict_to_len = hash_len, withPAM = false)
paths = mpt.paths[:, 1:hash_len]
not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) # how can there be no duplicated function?!
paths = paths[not_dups, :]
distances = mpt.distances[not_dups]
end
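
The preload block above mirrors the one in `build_prefixHashDB`; as a rough sketch only (the helper name and signature are hypothetical, not CHOPOFF API, and it assumes `DataFrames`, `nonunique`, and `smallestutype` are in scope as in the surrounding module), the shared logic could be factored out like this:

```julia
# Return filtered (paths, distances) from the precomputed Cas9 d4/p16 files,
# or nothing when the motif differs or the files are missing.
function try_load_precomputed_cas9(motif::Motif, hash_len::Int)
    (motif.distance <= 4 && hash_len <= 16) || return nothing
    m2 = Motif("Cas9")
    (motif.fwd == m2.fwd && motif.rve == m2.rve &&
        motif.pam_loci_fwd == m2.pam_loci_fwd &&
        motif.pam_loci_rve == m2.pam_loci_rve) || return nothing
    dir = joinpath(dirname(pathof(CHOPOFF)), "..", "data")
    p1, p2, d = joinpath.(dir, ("Cas9_d4_p16_paths_part1.bin",
                                "Cas9_d4_p16_paths_part2.bin",
                                "Cas9_d4_p16_distances.bin"))
    (isfile(p1) && isfile(p2) && isfile(d)) || return nothing
    paths = vcat(CHOPOFF.load(p1), CHOPOFF.load(p2))[:, 1:hash_len]
    distances = CHOPOFF.load(d)
    keep = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) .&
           BitVector(distances .<= motif.distance)
    paths = paths[keep, :]
    return convert.(smallestutype(maximum(paths)), paths), distances[keep]
end
```

A `nothing` return would then trigger the same `build_PathTemplates` fallback used in both builders.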

if length(ambig_guides) > 0
@info "Step 3: Constructing DB for ambigous gRNAs."
@@ -250,7 +286,7 @@
save(CHOPOFF.build_ambigPrefixHashDB(
ambig_guides, ambig_chrom, ambig_pos, ambig_isplus,
l, hash_len, ot_type, hash_type, suffix_type, ambig_annot,
CHOPOFF.SymbolicAlignments(dbi, paths[not_dups, :], mpt.distances[not_dups], hash_len)),
CHOPOFF.SymbolicAlignments(dbi, paths, distances, hash_len)),
storage_path)
@info "Finished constructing vcfDB in " * storage_path
else
34 changes: 1 addition & 33 deletions src/motif_path_templates.jl
@@ -310,44 +310,12 @@ function build_PathTemplates(
storagepath::String = "",
mismatch_only::Bool = false,
restrict_to_len::Int = (length_noPAM(motif) + motif.distance),
withPAM::Bool = false,
reuse_saved::Bool = true)
withPAM::Bool = false)

len = length_noPAM(motif)
d = motif.distance
length_of_paths = (withPAM ? length(motif) : length_noPAM(motif)) + motif.distance
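
As a quick check of the lengths computed just above (the 20 bp protospacer and 3 bp NGG PAM for the default Cas9 motif are assumptions, not stated in this diff):

```julia
motif = Motif("Cas9"; distance = 4)
length_noPAM(motif)                   # 20, under the assumption above
length_noPAM(motif) + motif.distance  # 24: length_of_paths when withPAM = false
length(motif) + motif.distance        # 27: length_of_paths when withPAM = true
```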

# trying to preload paths for most common use case
if (reuse_saved && (motif.distance <= 4) && (restrict_to_len <= 16) && !withPAM && !mismatch_only)
m2 = Motif("Cas9")
if (motif.fwd == m2.fwd &&
motif.rve == m2.rve &&
motif.pam_loci_fwd == m2.pam_loci_fwd &&
motif.pam_loci_rve == m2.pam_loci_rve)

dir = joinpath(dirname(pathof(CHOPOFF)), "..", "data")
pfile = joinpath(dir, "Cas9_d4_p16_paths.bin")
dfile = joinpath(dir, "Cas9_d4_p16_distances.bin")
if (isfile(pfile) && isfile(dfile))
@info "Reusing precomputed alignments."
paths = CHOPOFF.load(pfile)
distances = CHOPOFF.load(joinpath(dir, "Cas9_d4_p16_distances.bin"))
paths = paths[:, 1:restrict_to_len]
not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto))))
not_over_dist = BitVector(distances .<= d)
not = not_dups .& not_over_dist
paths = paths[not, :]
distances = distances[not]
paths = convert.(smallestutype(maximum(paths)), paths)
paths = PathTemplates(paths, distances, mismatch_only, motif, withPAM, restrict_to_len)
if storagepath != ""
save(paths, storagepath)
end
return paths
end
end
end

# path is mapped to these numbers, path numbers are
# (len + end) * (dist + 1) and
# (Ins (N) + Gap + MM) * len * dist
