diff --git a/data/Cas9_d4_p16_distances.bin b/data/Cas9_d4_p16_distances.bin new file mode 100644 index 00000000..e149b36c Binary files /dev/null and b/data/Cas9_d4_p16_distances.bin differ diff --git a/data/Cas9_d4_p16_paths_part1.bin b/data/Cas9_d4_p16_paths_part1.bin new file mode 100644 index 00000000..896a1921 Binary files /dev/null and b/data/Cas9_d4_p16_paths_part1.bin differ diff --git a/data/Cas9_d4_p16_paths_part2.bin b/data/Cas9_d4_p16_paths_part2.bin new file mode 100644 index 00000000..53305379 Binary files /dev/null and b/data/Cas9_d4_p16_paths_part2.bin differ diff --git a/data/compute_default_path_templates.jl b/data/compute_default_path_templates.jl index 75581279..90d7a7f4 100644 --- a/data/compute_default_path_templates.jl +++ b/data/compute_default_path_templates.jl @@ -1,15 +1,26 @@ using CHOPOFF +using DataFrames motif = Motif("Cas9"; distance = 4) mpt = build_PathTemplates(motif; restrict_to_len = 16) -paths = UInt8.(mpt.paths) -distances = UInt8.(mpt.distances) + +paths = mpt.paths[:, 1:16] +not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) +paths = paths[not_dups, :] +distances = mpt.distances[not_dups] + +split = Int(floor(length(distances) / 2)) +paths1 = paths[1:split, :] +paths2 = paths[(split + 1):end, :] CHOPOFF.save(distances, "./data/Cas9_d4_p16_distances.bin") -CHOPOFF.save(paths, "./data/Cas9_d4_p16_paths.bin") +CHOPOFF.save(paths1, "./data/Cas9_d4_p16_paths_part1.bin") +CHOPOFF.save(paths2, "./data/Cas9_d4_p16_paths_part2.bin") d2 = CHOPOFF.load("./data/Cas9_d4_p16_distances.bin") -p2 = CHOPOFF.load("./data/Cas9_d4_p16_paths.bin") +p1 = CHOPOFF.load("./data/Cas9_d4_p16_paths_part1.bin") +p2 = CHOPOFF.load("./data/Cas9_d4_p16_paths_part2.bin") +p2 = vcat(p1, p2) if (paths != p2) | (distances != d2) @warn "Failed to sucessfully save the path templates. CHOPOFF will still work, but will be slower in some cases." 
- rm("./data/Cas9_d4_p16_paths.bin") + foreach(rm, ("./data/Cas9_d4_p16_paths_part1.bin", "./data/Cas9_d4_p16_paths_part2.bin")) diff --git a/src/db_prefix_hash.jl b/src/db_prefix_hash.jl index 3514dbdc..4a88018e 100644 --- a/src/db_prefix_hash.jl +++ b/src/db_prefix_hash.jl @@ -141,7 +141,8 @@ name::String, genomepath::String, motif::Motif, storage_dir::String, - hash_len::Int = min(length_noPAM(motif) - motif.distance, 16)) + hash_len::Int = min(length_noPAM(motif) - motif.distance, 16); + reuse_saved::Bool = true) ``` Prepare prefixHashDB index for future searches using `search_prefixHashDB`. @@ -162,6 +163,8 @@ You can also play with `hash_len` parameter, but keeping it at 16 should be clos `hash_len` - Length of the hash in bp. At maximum 16. +`reuse_saved` - Whether to reuse paths that were saved for Cas9 distance 4 and prefix 16. + # Examples ```julia $(make_example_doc("prefixHashDB")) @@ -172,7 +175,8 @@ function build_prefixHashDB( genomepath::String, motif::Motif, storage_dir::String, - hash_len::Int = min(length_noPAM(motif) - motif.distance, 16)) + hash_len::Int = min(length_noPAM(motif) - motif.distance, 16); + reuse_saved::Bool = true) if hash_len > 16 throw("hash_len $hash_len is more than 16") @@ -228,11 +232,45 @@ function build_prefixHashDB( isplus = BitVector(isplus[order]) @info "Step 3: Constructing Paths for hashes" - mpt = build_PathTemplates(motif; restrict_to_len = hash_len, withPAM = false) - paths = mpt.paths[:, 1:hash_len] - not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) # how can there be no duplicated function?! 
+ # trying to preload paths for most common use case + paths = nothing + if (reuse_saved && (motif.distance <= 4) && (hash_len <= 16)) + m2 = Motif("Cas9") + if (motif.fwd == m2.fwd && + motif.rve == m2.rve && + motif.pam_loci_fwd == m2.pam_loci_fwd && + motif.pam_loci_rve == m2.pam_loci_rve) + + dir = joinpath(dirname(pathof(CHOPOFF)), "..", "data") + pfile1 = joinpath(dir, "Cas9_d4_p16_paths_part1.bin") + pfile2 = joinpath(dir, "Cas9_d4_p16_paths_part2.bin") + dfile = joinpath(dir, "Cas9_d4_p16_distances.bin") + if (isfile(pfile1) && isfile(pfile2) && isfile(dfile)) + @info "Reusing precomputed alignments." + paths = CHOPOFF.load(pfile1) + paths2 = CHOPOFF.load(pfile2) + paths = vcat(paths, paths2) + distances = CHOPOFF.load(dfile) + paths = paths[:, 1:hash_len] + not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) + not_over_dist = BitVector(distances .<= motif.distance) + not = not_dups .& not_over_dist + paths = paths[not, :] + distances = distances[not] + paths = convert.(smallestutype(maximum(paths)), paths) + end + end + end + + if isnothing(paths) + mpt = build_PathTemplates(motif; restrict_to_len = hash_len, withPAM = false) + paths = mpt.paths[:, 1:hash_len] + not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) # how can there be no duplicated function?! 
+ paths = paths[not_dups, :] + distances = mpt.distances[not_dups] + end mkpath(storage_dir) - save(PrefixHashDB(SymbolicAlignments(dbi, paths[not_dups, :], mpt.distances[not_dups], hash_len), + save(PrefixHashDB(SymbolicAlignments(dbi, paths, distances, hash_len), prefixes, suffixes, chrom, pos, isplus), joinpath(storage_dir, "prefixHashDB.bin")) @info "Finished constructing prefixHashDB in " * storage_dir diff --git a/src/db_vcf.jl b/src/db_vcf.jl index 4da2ed8d..7cd5b453 100644 --- a/src/db_vcf.jl +++ b/src/db_vcf.jl @@ -47,7 +47,8 @@ build_vcfDB( vcfpath::String, motif::Motif, storage_path::String, - hash_len::Int = min(length_noPAM(motif) - motif.distance, 16)) + hash_len::Int = min(length_noPAM(motif) - motif.distance, 16); + reuse_saved::Bool = true) ``` Builds a database of all potential off-targets that overlap any of the variants in the VCF file. @@ -69,6 +70,8 @@ variants. This database uses simialr principles to `prefixHashDB`, also utilizes `hash_len` - length of the prefix that is stored inside the hash +`reuse_saved` - Whether to reuse paths that were saved for Cas9 distance 4 and prefix 16. + # Examples ```julia @@ -81,7 +84,7 @@ function build_vcfDB( vcfpath::String, motif::Motif, storage_path::String, - hash_len::Int = min(length_noPAM(motif) - motif.distance, 16)) + hash_len::Int = min(length_noPAM(motif) - motif.distance, 16); reuse_saved::Bool = true) dbi = DBInfo(genomepath, name, motif; vcf_filepath = vcfpath) hash_type = CHOPOFF.smallestutype(parse(UInt, repeat("1", hash_len * 2); base = 2)) @@ -235,9 +238,42 @@ function build_vcfDB( close(ref) @info "Step 2: Constructing Paths for hashes" - mpt = build_PathTemplates(motif; restrict_to_len = hash_len, withPAM = false) - paths = mpt.paths[:, 1:hash_len] - not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) # how can there be no duplicated function?! 
+ paths = nothing + if (reuse_saved && (motif.distance <= 4) && (hash_len <= 16)) + m2 = Motif("Cas9") + if (motif.fwd == m2.fwd && + motif.rve == m2.rve && + motif.pam_loci_fwd == m2.pam_loci_fwd && + motif.pam_loci_rve == m2.pam_loci_rve) + + dir = joinpath(dirname(pathof(CHOPOFF)), "..", "data") + pfile1 = joinpath(dir, "Cas9_d4_p16_paths_part1.bin") + pfile2 = joinpath(dir, "Cas9_d4_p16_paths_part2.bin") + dfile = joinpath(dir, "Cas9_d4_p16_distances.bin") + if (isfile(pfile1) && isfile(pfile2) && isfile(dfile)) + @info "Reusing precomputed alignments." + paths = CHOPOFF.load(pfile1) + paths2 = CHOPOFF.load(pfile2) + paths = vcat(paths, paths2) + distances = CHOPOFF.load(dfile) + paths = paths[:, 1:hash_len] + not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) + not_over_dist = BitVector(distances .<= motif.distance) + not = not_dups .& not_over_dist + paths = paths[not, :] + distances = distances[not] + paths = convert.(smallestutype(maximum(paths)), paths) + end + end + end + + if isnothing(paths) + mpt = build_PathTemplates(motif; restrict_to_len = hash_len, withPAM = false) + paths = mpt.paths[:, 1:hash_len] + not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) # how can there be no duplicated function?! + paths = paths[not_dups, :] + distances = mpt.distances[not_dups] + end if length(ambig_guides) > 0 @info "Step 3: Constructing DB for ambigous gRNAs." 
@@ -250,7 +286,7 @@ function build_vcfDB( save(CHOPOFF.build_ambigPrefixHashDB( ambig_guides, ambig_chrom, ambig_pos, ambig_isplus, l, hash_len, ot_type, hash_type, suffix_type, ambig_annot, - CHOPOFF.SymbolicAlignments(dbi, paths[not_dups, :], mpt.distances[not_dups], hash_len)), + CHOPOFF.SymbolicAlignments(dbi, paths, distances, hash_len)), storage_path) @info "Finished constructing vcfDB in " * storage_path else diff --git a/src/motif_path_templates.jl b/src/motif_path_templates.jl index ca9a9a4f..90920bfa 100644 --- a/src/motif_path_templates.jl +++ b/src/motif_path_templates.jl @@ -310,44 +310,12 @@ function build_PathTemplates( storagepath::String = "", mismatch_only::Bool = false, restrict_to_len::Int = (length_noPAM(motif) + motif.distance), - withPAM::Bool = false, - reuse_saved::Bool = true) + withPAM::Bool = false) len = length_noPAM(motif) d = motif.distance length_of_paths = (withPAM ? length(motif) : length_noPAM(motif)) + motif.distance - # trying to preload paths for most common use case - if (reuse_saved && (motif.distance <= 4) && (restrict_to_len <= 16) && !withPAM && !mismatch_only) - m2 = Motif("Cas9") - if (motif.fwd == m2.fwd && - motif.rve == m2.rve && - motif.pam_loci_fwd == m2.pam_loci_fwd && - motif.pam_loci_rve == m2.pam_loci_rve) - - dir = joinpath(dirname(pathof(CHOPOFF)), "..", "data") - pfile = joinpath(dir, "Cas9_d4_p16_paths.bin") - dfile = joinpath(dir, "Cas9_d4_p16_distances.bin") - if (isfile(pfile) && isfile(dfile)) - @info "Reusing precomputed alignments." 
- paths = CHOPOFF.load(pfile) - distances = CHOPOFF.load(joinpath(dir, "Cas9_d4_p16_distances.bin")) - paths = paths[:, 1:restrict_to_len] - not_dups = map(!, BitVector(nonunique(DataFrame(paths, :auto)))) - not_over_dist = BitVector(distances .<= d) - not = not_dups .& not_over_dist - paths = paths[not, :] - distances = distances[not] - paths = convert.(smallestutype(maximum(paths)), paths) - paths = PathTemplates(paths, distances, mismatch_only, motif, withPAM, restrict_to_len) - if storagepath != "" - save(paths, storagepath) - end - return paths - end - end - end - # path is mapped to these numbers, path numbers are # (len + end) * (dist + 1) and # (Ins (N) + Gap + MM) * len * dist