diff --git a/src/db_linear_hash.jl b/src/db_linear_hash.jl index 34be4de6..c3a20f33 100644 --- a/src/db_linear_hash.jl +++ b/src/db_linear_hash.jl @@ -8,7 +8,7 @@ struct SuffixHashDB suffix::Vector{LongDNA{4}} suffix_loci_idx::Vector{LociRange} loci::Vector{Loc} - hash::Vector{UInt64} + hash::Vector{UInt32} end @@ -55,7 +55,7 @@ if interested with searches within distance 4, preferably use prefix length of `prefix_len` - Size of the prefix by which off-targets are indexed. Prefix of 8 or larger will be the fastest, however it will also result in large number of files. -`hash_len` - Length of the hash in bp. Should be below 20bp - maximum alignment distance. +`hash_len` - Length of the hash in bp. Must be at most 16, because each hash is stored in a single UInt32. # Examples ```julia-repl @@ -68,14 +68,14 @@ function build_linearHashDB( motif::Motif, storage_dir::String, prefix_len::Int = 7, - hash_len::Int = (length_noPAM(motif) - motif.distance)) + hash_len::Int = min(length_noPAM(motif) - motif.distance, 16)) if prefix_len <= motif.distance throw("prefix_len $prefix_len is <= " * string(motif.distance)) end - if hash_len > (length_noPAM(motif) - motif.distance) - throw("prefix_len $hash_len is > " * string((length_noPAM(motif) - motif.distance))) + if hash_len > 16 + throw("hash_len $hash_len is more than 16") end dbi = DBInfo(genomepath, name, motif) @@ -113,7 +113,7 @@ function build_linearHashDB( (guides, loci_range, loci) = unique_guides(guides, loci) # hash part guides = prefix + guides hashes = ThreadsX.map(guides) do guide - convert(UInt64, (prefix * guide)[1:hash_len]) + convert(UInt32, (prefix * guide)[1:hash_len]) end sdb = SuffixHashDB(prefix, guides, loci_range, loci, hashes) save(sdb, joinpath(storage_dir, string(prefix) * ".bin")) @@ -132,7 +132,7 @@ end function search_prefix_hash( prefix::LongDNA{4}, - paths_set::Vector{Set{UInt64}}, + paths_set::Vector{Set{UInt32}}, dist::Int, dbi::DBInfo, detail::String, @@ -245,11 +245,11 @@ function search_linearHashDB( paths = ldb.paths[ldb.paths_distances .<= 
distance, :] paths_set = ThreadsX.map(copy(guides_)) do g - guides_uint64 = guide_to_template_format(g; alphabet = ALPHABET_TWOBIT) - ot_uint64 = guides_uint64[paths] - ot_uint64 = map(ARTEMIS.asUInt64, eachrow(ot_uint64)) + guides_formated = guide_to_template_format(g; alphabet = ALPHABET_TWOBIT) + ot_uint32 = guides_formated[paths] + ot_uint32 = map(ARTEMIS.asUInt32, eachrow(ot_uint32)) # BinaryFuseFilter{UInt32}(unique(ot_uint64)) # very space efficient!!! - return Set(ot_uint64) + return Set(ot_uint32) end mkpath(dirname(output_file)) diff --git a/src/motif_path_templates.jl b/src/motif_path_templates.jl index 1923bc1c..41a8f81a 100644 --- a/src/motif_path_templates.jl +++ b/src/motif_path_templates.jl @@ -504,6 +504,27 @@ function asUInt64(x::AbstractVecOrMat) end +""" +``` +asUInt32(x::AbstractVecOrMat) +``` + +Helper that packs a DNA string of at most 16 bp into a single UInt32. +The size of the vector is not checked, so make sure it is <= 16 bp. + + matp = guide_in_template_format[pathTemplate] + map(asUInt32, eachrow(matp)) +""" +function asUInt32(x::AbstractVecOrMat) + y = zero(UInt32) + for c in x + y = (y << 2) | UInt32(c) + end + mask = (one(UInt32) << (2 * length(x))) - UInt32(1) + return reinterpret(UInt32, y & mask) +end + + """ ``` duplicated(x::Vector{UInt64}) diff --git a/src/utils.jl b/src/utils.jl index e59ca3ee..58a17c85 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -162,6 +162,28 @@ function Base.convert(::Type{UInt64}, x::LongDNA{4}) end +function Base.convert(::Type{UInt32}, x::LongDNA{4}) + x = LongDNA{2}(x) + if (length(x) > 16) + throw("Sequence too long to save as UInt32.") + end + + y = zero(UInt32) + for c in x + nt = convert(DNA, c) + if isambiguous(nt) + throw(ArgumentError("cannot create a mer with ambiguous nucleotides")) + elseif isgap(nt) + throw(ArgumentError("cannot create a mer with gaps")) + end + y = (y << 2) | UInt32(BioSequences.twobitnucs[reinterpret(UInt8, nt) + 0x01]) + end + + mask = (one(UInt32) 
<< (2 * length(x))) - one(UInt32) + return reinterpret(UInt32, y & mask) # encoded_data +end + + # this is needed inside saca.jl # TODO make sure the queried sequences are from the same type # or conform to the same encoding as the reference @@ -181,7 +203,7 @@ end end -@inline function BioSequences.LongDNA{4}(x::UInt64, len::Int) +@inline function BioSequences.LongDNA{4}(x::Union{UInt64, UInt32}, len::Int) y = [] for i in 1:len push!(y, reinterpret(DNA, 0x01 << ((x >> 2(len - i)) & 0b11))) diff --git a/test/src/utils.jl b/test/src/utils.jl index 8d79860f..49ac5870 100644 --- a/test/src/utils.jl +++ b/test/src/utils.jl @@ -73,6 +73,21 @@ using Combinatorics @test isempty(findall(dna"AAANN", dna"ACTGAAANACTG"; ambig_max = 0)) end + + @testset "UInt32 conversion" begin + x = dna"AAAAAATGCTACTGCG" + @test LongDNA{4}(convert(UInt32, x), length(x)) == x + @test_throws BioSequences.EncodeError convert(UInt32, dna"A-A") + @test_throws String convert(UInt32, dna"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA") + + for i in 1:10000 + for j in 1:16 + x = getseq(ceil(Int, rand()*j), ['A', 'C', 'G', 'T']) + @test String(LongDNA{4}(convert(UInt32, x), length(x))) == String(copy(x)) + end + end + end + @testset "UInt64 conversion" begin x = dna"AAAAAATGCTACTG" @test LongDNA{4}(convert(UInt64, x), length(x)) == x