From cbbddb6e0a430a77ca7bca780d8ec8c1f5e646eb Mon Sep 17 00:00:00 2001 From: Milos Maric Date: Mon, 23 Mar 2020 17:39:13 +0100 Subject: [PATCH 1/7] [cudamapper] Moved helper functions before main --- cudamapper/src/main.cu | 183 ++++++++++++++++++++--------------------- 1 file changed, 90 insertions(+), 93 deletions(-) diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu index 8602ed0ad..312274869 100644 --- a/cudamapper/src/main.cu +++ b/cudamapper/src/main.cu @@ -42,8 +42,96 @@ static struct option options[] = { {"help", no_argument, 0, 'h'}, }; -void help(int32_t exit_code); -std::size_t find_largest_contiguous_device_memory_section(); +void help(int32_t exit_code = 0) +{ + std::cerr << + R"(Usage: cudamapper [options ...] + + Input file in FASTA/FASTQ format (can be compressed with gzip) + containing sequences used for all-to-all overlapping + options: + -k, --kmer-size + length of kmer to use for minimizers [15] (Max=)" + << claragenomics::cudamapper::Index::maximum_kmer_size() << ")" + << R"( + -w, --window-size + length of window to use for minimizers [15])" + << R"( + -d, --num-devices + number of GPUs to use [1])" + << R"( + -c, --max-index-device-cache + number of indices to keep in GPU memory [100])" + << R"( + -C, --max-index-host-cache + number of indices to keep in host memory [0])" + << R"( + -m, --max-cached-memory + maximum aggregate cached memory per device in GiB, if 0 program tries to allocate as much memory as possible [0])" + << R"( + -i, --index-size + length of batch size used for query in MB [30])" + << R"( + -t, --target-index-size + length of batch sized used for target in MB [30])" + << R"( + -F, --filtering-parameter + filter all representations for which sketch_elements_with_that_representation/total_sketch_elements >= filtering_parameter), filtering disabled if filtering_parameter == 1.0 [1'000'000'001] (Min = 0.0, Max = 1.0))" + << R"( + -a, --alignment-engines + Number of alignment engines to use (per device) for generating CIGAR strings for overlap alignments. Default value 0 = no alignment to be performed. Typically 2-4 engines per device gives best perf.)" + << std::endl; + + exit(exit_code); +} + +/// @brief finds largest section of contiguous memory on device +/// @return number of bytes +std::size_t find_largest_contiguous_device_memory_section() +{ + // find the largest block of contiguous memory + size_t free; + size_t total; + cudaMemGetInfo(&free, &total); + const size_t memory_decrement = free / 100; // decrease requested memory one by one percent + size_t size_to_try = free - memory_decrement; // do not go for all memory + while (true) + { + void* dummy_ptr = nullptr; + cudaError_t status = cudaMalloc(&dummy_ptr, size_to_try); + // if it was able to allocate memory free the memory and return the size + if (status == cudaSuccess) + { + cudaFree(dummy_ptr); + return size_to_try; + } + + if (status == cudaErrorMemoryAllocation) + { + // if it was not possible to allocate the memory because there was not enough of it + // try allocating less memory in next iteration + if (size_to_try > memory_decrement) + { + size_to_try -= memory_decrement; + } + else + { // a very small amount of memory left, report an error + CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation); + return 0; + } + } + else + { + // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error + CGA_CU_CHECK_ERR(status); + } + } + + // this point should actually never be reached (loop either finds memory or causes an error) + assert(false); + CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation); + return 0; +} int main(int argc, char* argv[]) { @@ -494,94 +582,3 @@ int main(int argc, char* argv[]) return 0; } - -void help(int32_t exit_code = 0) -{ - std::cerr << - R"(Usage: cudamapper [options ...] - - Input file in FASTA/FASTQ format (can be compressed with gzip) - containing sequences used for all-to-all overlapping - options: - -k, --kmer-size - length of kmer to use for minimizers [15] (Max=)" - << claragenomics::cudamapper::Index::maximum_kmer_size() << ")" - << R"( - -w, --window-size - length of window to use for minimizers [15])" - << R"( - -d, --num-devices - number of GPUs to use [1])" - << R"( - -c, --max-index-device-cache - number of indices to keep in GPU memory [100])" - << R"( - -C, --max-index-host-cache - number of indices to keep in host memory [0])" - << R"( - -m, --max-cached-memory - maximum aggregate cached memory per device in GiB, if 0 program tries to allocate as much memory as possible [0])" - << R"( - -i, --index-size - length of batch size used for query in MB [30])" - << R"( - -t, --target-index-size - length of batch sized used for target in MB [30])" - << R"( - -F, --filtering-parameter - filter all representations for which sketch_elements_with_that_representation/total_sketch_elements >= filtering_parameter), filtering disabled if filtering_parameter == 1.0 [1'000'000'001] (Min = 0.0, Max = 1.0))" - << R"( - -a, --alignment-engines - Number of alignment engines to use (per device) for generating CIGAR strings for overlap alignments. Default value 0 = no alignment to be performed. Typically 2-4 engines per device gives best perf.)" - << std::endl; - - exit(exit_code); -} - -/// @brief finds largest section of contiguous memory on device -/// @return number of bytes -std::size_t find_largest_contiguous_device_memory_section() -{ - // find the largest block of contiguous memory - size_t free; - size_t total; - cudaMemGetInfo(&free, &total); - const size_t memory_decrement = free / 100; // decrease requested memory one by one percent - size_t size_to_try = free - memory_decrement; // do not go for all memory - while (true) - { - void* dummy_ptr = nullptr; - cudaError_t status = cudaMalloc(&dummy_ptr, size_to_try); - // if it was able to allocate memory free the memory and return the size - if (status == cudaSuccess) - { - cudaFree(dummy_ptr); - return size_to_try; - } - - if (status == cudaErrorMemoryAllocation) - { - // if it was not possible to allocate the memory because there was not enough of it - // try allocating less memory in next iteration - if (size_to_try > memory_decrement) - { - size_to_try -= memory_decrement; - } - else - { // a very small amount of memory left, report an error - CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation); - return 0; - } - } - else - { - // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error - CGA_CU_CHECK_ERR(status); - } - } - - // this point should actually never be reached (loop either finds memory or causes an error) - assert(false); - CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation); - return 0; -} From 2bfbdc5882f7de9aa9f645b2fdba4e878deeb029 Mon Sep 17 00:00:00 2001 From: Milos Maric Date: Mon, 23 Mar 2020 18:01:39 +0100 Subject: [PATCH 2/7] [cudamapper] Return from find_largest_contiguous_device_memory_section if there was an error --- cudamapper/src/main.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu index 312274869..37c7a1ced 100644 --- a/cudamapper/src/main.cu +++ b/cudamapper/src/main.cu @@ -124,6 +124,7 @@ std::size_t find_largest_contiguous_device_memory_section() { // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error CGA_CU_CHECK_ERR(status); + return 0; } } From 9d526500ef65dfb6eab088d4568a474c45ab98b5 Mon Sep 17 00:00:00 2001 From: Milos Maric Date: Mon, 23 Mar 2020 18:19:48 +0100 Subject: [PATCH 3/7] [cudamapper] extracted parameters parsing into a separate function --- cudamapper/src/main.cu | 318 ++++++++++++++++++++++------------------- 1 file changed, 168 insertions(+), 150 deletions(-) diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu index 37c7a1ced..531e84330 100644 --- a/cudamapper/src/main.cu +++ b/cudamapper/src/main.cu @@ -28,20 +28,8 @@ #include #include "overlapper_triggered.hpp" -static struct option options[] = { - {"kmer-size", required_argument, 0, 'k'}, - {"window-size", required_argument, 0, 'w'}, - {"num-devices", required_argument, 0, 'd'}, - {"max-index-device-cache", required_argument, 0, 'c'}, - {"max-index-host-cache", required_argument, 0, 'C'}, - {"max-cached-memory", required_argument, 0, 'm'}, - {"index-size", required_argument, 0, 'i'}, - {"target-index-size", required_argument, 0, 't'}, - {"filtering-parameter", required_argument, 0, 'F'}, - {"alignment-engines", required_argument, 0, 'a'}, - {"help", no_argument, 0, 'h'}, -}; - +/// @brief prints help message +/// @param exit_code void help(int32_t exit_code = 0) { std::cerr << @@ -85,111 +73,86 @@ void help(int32_t exit_code = 0) exit(exit_code); } -/// @brief finds largest section of contiguous memory on device -/// @return number of bytes -std::size_t find_largest_contiguous_device_memory_section() +/// @brief application parameteres, default or passed through command line +struct ApplicationParameteres { - // find the largest block of contiguous memory - size_t free; - size_t total; - cudaMemGetInfo(&free, &total); - const size_t memory_decrement = free / 100; // decrease requested memory one by one percent - size_t size_to_try = free - memory_decrement; // do not go for all memory - while (true) - { - void* dummy_ptr = nullptr; - cudaError_t status = cudaMalloc(&dummy_ptr, size_to_try); - // if it was able to allocate memory free the memory and return the size - if (status == cudaSuccess) - { - cudaFree(dummy_ptr); - return size_to_try; - } - - if (status == cudaErrorMemoryAllocation) - { - // if it was not possible to allocate the memory because there was not enough of it - // try allocating less memory in next iteration - if (size_to_try > memory_decrement) - { - size_to_try -= memory_decrement; - } - else - { // a very small amount of memory left, report an error - CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation); - return 0; - } - } - else - { - // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error - CGA_CU_CHECK_ERR(status); - return 0; - } - } - - // this point should actually never be reached (loop either finds memory or causes an error) - assert(false); - CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation); - return 0; -} - -int main(int argc, char* argv[]) -{ - using claragenomics::get_size; - claragenomics::logging::Init(); - uint32_t k = 15; // k uint32_t w = 15; // w std::int32_t num_devices = 1; // d std::int32_t max_index_cache_size_on_device = 100; // c - // ToDo: come up with a good heuristic to choose C and c - std::int32_t max_index_cache_size_on_host = 0; // C - std::int32_t max_cached_memory = 0; // m - std::int32_t index_size = 30; // i - std::int32_t target_index_size = 30; // t - double filtering_parameter = 1.0; // F - std::int32_t alignment_engines = 0; // a - std::string optstring = "k:w:d:c:C:m:i:t:F:h:a:"; - int32_t argument = 0; + std::int32_t max_index_cache_size_on_host = 0; // C + std::int32_t max_cached_memory = 0; // m + std::int32_t index_size = 30; // i + std::int32_t target_index_size = 30; // t + double filtering_parameter = 1.0; // F + std::int32_t alignment_engines = 0; // a + bool all_to_all = false; + std::string query_filepath; + std::string target_filepath; +}; + +/// @brief reads input from command line +/// @param argc +/// @param argv +/// @return application parameters passed through command line, default otherwise +ApplicationParameteres read_input(int argc, char* argv[]) +{ + ApplicationParameteres parameters; + + struct option options[] = { + {"kmer-size", required_argument, 0, 'k'}, + {"window-size", required_argument, 0, 'w'}, + {"num-devices", required_argument, 0, 'd'}, + {"max-index-device-cache", required_argument, 0, 'c'}, + {"max-index-host-cache", required_argument, 0, 'C'}, + {"max-cached-memory", required_argument, 0, 'm'}, + {"index-size", required_argument, 0, 'i'}, + {"target-index-size", required_argument, 0, 't'}, + {"filtering-parameter", required_argument, 0, 'F'}, + {"alignment-engines", required_argument, 0, 'a'}, + {"help", no_argument, 0, 'h'}, + }; + + std::string optstring = "k:w:d:c:C:m:i:t:F:h:a:"; + int32_t argument = 0; while ((argument = getopt_long(argc, argv, optstring.c_str(), options, nullptr)) != -1) { switch (argument) { case 'k': - k = atoi(optarg); + parameters.k = atoi(optarg); break; case 'w': - w = atoi(optarg); + parameters.w = atoi(optarg); break; case 'd': - num_devices = atoi(optarg); + parameters.num_devices = atoi(optarg); break; case 'c': - max_index_cache_size_on_device = atoi(optarg); + parameters.max_index_cache_size_on_device = atoi(optarg); break; case 'C': - max_index_cache_size_on_host = atoi(optarg); + parameters.max_index_cache_size_on_host = atoi(optarg); break; case 'm': #ifndef CGA_ENABLE_CACHING_ALLOCATOR std::cerr << "ERROR: Argument -m / --max-cached-memory cannot be used without caching allocator" << std::endl; exit(1); #endif - max_cached_memory = atoi(optarg); + parameters.max_cached_memory = atoi(optarg); break; case 'i': - index_size = atoi(optarg); + parameters.index_size = atoi(optarg); break; case 't': - target_index_size = atoi(optarg); + parameters.target_index_size = atoi(optarg); break; case 'F': - filtering_parameter = atof(optarg); + parameters.filtering_parameter = atof(optarg); break; case 'a': - alignment_engines = atoi(optarg); - claragenomics::throw_on_negative(alignment_engines, "Number of alignment engines should be non-negative"); + parameters.alignment_engines = atoi(optarg); + claragenomics::throw_on_negative(parameters.alignment_engines, "Number of alignment engines should be non-negative"); break; case 'h': help(0); @@ -198,19 +161,19 @@ int main(int argc, char* argv[]) } } - if (k > claragenomics::cudamapper::Index::maximum_kmer_size()) + if (parameters.k > claragenomics::cudamapper::Index::maximum_kmer_size()) { - std::cerr << "kmer of size " << k << " is not allowed, maximum k = " << claragenomics::cudamapper::Index::maximum_kmer_size() << std::endl; + std::cerr << "kmer of size " << parameters.k << " is not allowed, maximum k = " << claragenomics::cudamapper::Index::maximum_kmer_size() << std::endl; exit(1); } - if (filtering_parameter > 1.0 || filtering_parameter < 0.0) + if (parameters.filtering_parameter > 1.0 || parameters.filtering_parameter < 0.0) { std::cerr << "-F / --filtering-parameter must be in range [0.0, 1.0]" << std::endl; exit(1); } - if (max_cached_memory < 0) + if (parameters.max_cached_memory < 0) { std::cerr << "-m / --max-cached-memory must not be negative" << std::endl; exit(1); @@ -223,34 +186,89 @@ int main(int argc, char* argv[]) help(1); } - std::string query_filepath = std::string(argv[optind++]); - std::string target_filepath = std::string(argv[optind++]); + parameters.query_filepath = std::string(argv[optind++]); + parameters.target_filepath = std::string(argv[optind++]); - bool all_to_all = false; - if (query_filepath == target_filepath) + if (parameters.query_filepath == parameters.target_filepath) { - all_to_all = true; - target_index_size = index_size; + parameters.all_to_all = true; + parameters.target_index_size = parameters.index_size; std::cerr << "NOTE - Since query and target files are same, activating all_to_all mode. Query index size used for both files." << std::endl; } - std::shared_ptr query_parser = claragenomics::io::create_kseq_fasta_parser(query_filepath, k + w - 1); - int32_t queries = query_parser->get_num_seqences(); + return parameters; +} + +/// @brief finds largest section of contiguous memory on device +/// @return number of bytes +std::size_t find_largest_contiguous_device_memory_section() +{ + // find the largest block of contiguous memory + size_t free; + size_t total; + cudaMemGetInfo(&free, &total); + const size_t memory_decrement = free / 100; // decrease requested memory one by one percent + size_t size_to_try = free - memory_decrement; // do not go for all memory + while (true) + { + void* dummy_ptr = nullptr; + cudaError_t status = cudaMalloc(&dummy_ptr, size_to_try); + // if it was able to allocate memory free the memory and return the size + if (status == cudaSuccess) + { + cudaFree(dummy_ptr); + return size_to_try; + } + + if (status == cudaErrorMemoryAllocation) + { + // if it was not possible to allocate the memory because there was not enough of it + // try allocating less memory in next iteration + if (size_to_try > memory_decrement) + { + size_to_try -= memory_decrement; + } + else + { // a very small amount of memory left, report an error + CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation); + return 0; + } + } + else + { + // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error + CGA_CU_CHECK_ERR(status); + return 0; + } + } + + // this point should actually never be reached (loop either finds memory or causes an error) + assert(false); + CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation); + return 0; +} + +int main(int argc, char* argv[]) +{ + using claragenomics::get_size; + claragenomics::logging::Init(); + + const ApplicationParameteres parameters = read_input(argc, argv); + + std::shared_ptr query_parser = claragenomics::io::create_kseq_fasta_parser(parameters.query_filepath, parameters.k + parameters.w - 1); std::shared_ptr target_parser; - if (all_to_all) + if (parameters.all_to_all) { target_parser = query_parser; } else { - target_parser = claragenomics::io::create_kseq_fasta_parser(target_filepath, k + w - 1); + target_parser = claragenomics::io::create_kseq_fasta_parser(parameters.target_filepath, parameters.k + parameters.w - 1); } - int32_t targets = target_parser->get_num_seqences(); - - std::cerr << "Query " << query_filepath << " index " << queries << std::endl; - std::cerr << "Target " << target_filepath << " index " << targets << std::endl; + std::cerr << "Query " << parameters.query_filepath << " index " << query_parser->get_num_seqences() << std::endl; + std::cerr << "Target " << parameters.target_filepath << " index " << target_parser->get_num_seqences() << std::endl; // Data structure for holding overlaps to be written out std::mutex overlaps_writer_mtx; @@ -262,8 +280,8 @@ int main(int argc, char* argv[]) }; ///Factor of 1000000 to make max cache size in MiB - auto query_chunks = query_parser->get_read_chunks(index_size * 1000000); - auto target_chunks = target_parser->get_read_chunks(target_index_size * 1000000); + auto query_chunks = query_parser->get_read_chunks(parameters.index_size * 1000000); + auto target_chunks = target_parser->get_read_chunks(parameters.target_index_size * 1000000); //First generate all the ranges independently, then loop over them. std::vector query_target_ranges; @@ -281,7 +299,7 @@ int main(int argc, char* argv[]) // in all-to-all, for query chunk 0, we go through target chunks [target_idx = 0 , n = target_chunks.size()) // for query chunk 1, we only need target chunks [target_idx = 1 , n), and in general for query_chunk i, we need target chunks [target_idx = i , n) // therefore as we're looping through query chunks, in all-to-all, will increment target_idx - if (all_to_all) + if (parameters.all_to_all) { target_idx++; } @@ -291,21 +309,21 @@ int main(int argc, char* argv[]) std::map, std::shared_ptr> host_index_cache; // This is a per-device cache, if it has the index it will return it, if not it will generate it, store and return it. - std::vector, std::shared_ptr>> device_index_cache(num_devices); + std::vector, std::shared_ptr>> device_index_cache(parameters.num_devices); // The number of overlap chunks which are to be computed std::atomic num_overlap_chunks_to_print(0); - auto get_index = [&device_index_cache, &host_index_cache, max_index_cache_size_on_device, max_index_cache_size_on_host](claragenomics::DefaultDeviceAllocator allocator, - claragenomics::io::FastaParser& parser, - const claragenomics::cudamapper::read_id_t start_index, - const claragenomics::cudamapper::read_id_t end_index, - const std::uint64_t k, - const std::uint64_t w, - const int device_id, - const bool allow_cache_index, - const double filtering_parameter, - const cudaStream_t cuda_stream) { + auto get_index = [&device_index_cache, &host_index_cache, ¶meters](claragenomics::DefaultDeviceAllocator allocator, + claragenomics::io::FastaParser& parser, + const claragenomics::cudamapper::read_id_t start_index, + const claragenomics::cudamapper::read_id_t end_index, + const std::uint64_t k, + const std::uint64_t w, + const int device_id, + const bool allow_cache_index, + const double filtering_parameter, + const cudaStream_t cuda_stream) { CGA_NVTX_RANGE(profiler, "get index"); std::pair key; key.first = start_index; @@ -326,31 +344,31 @@ int main(int argc, char* argv[]) else { //create an index, with hashed representations (minimizers) - index = claragenomics::cudamapper::Index::create_index(allocator, - parser, - start_index, - end_index, - k, - w, - true, // hash_representations - filtering_parameter, - cuda_stream); + index = std::move(claragenomics::cudamapper::Index::create_index(allocator, + parser, + start_index, + end_index, + k, + w, + true, // hash_representations + filtering_parameter, + cuda_stream)); // If in all-to-all mode, put this query in the cache for later use. // Cache eviction is handled later on by the calling thread // using the evict_index function. - if (get_size(device_index_cache[device_id]) < max_index_cache_size_on_device && allow_cache_index) + if (get_size(device_index_cache[device_id]) < parameters.max_index_cache_size_on_device && allow_cache_index) { device_index_cache[device_id][key] = index; } - else if (get_size(host_index_cache) < max_index_cache_size_on_host && allow_cache_index && device_id == 0) + else if (get_size(host_index_cache) < parameters.max_index_cache_size_on_host && allow_cache_index && device_id == 0) { // if not cached on device, update host cache; only done on device 0 to avoid any race conditions in updating the host cache - host_index_cache[key] = claragenomics::cudamapper::IndexHostCopy::create_cache(*index, - start_index, - k, - w, - cuda_stream); + host_index_cache[key] = std::move(claragenomics::cudamapper::IndexHostCopy::create_cache(*index, + start_index, + k, + w, + cuda_stream)); } } return index; @@ -385,7 +403,7 @@ int main(int argc, char* argv[]) #ifdef CGA_ENABLE_CACHING_ALLOCATOR // uses CachingDeviceAllocator std::size_t max_cached_bytes = 0; - if (max_cached_memory == 0) + if (parameters.max_cached_memory == 0) { std::cerr << "Programmatically looking for max cached memory" << std::endl; max_cached_bytes = find_largest_contiguous_device_memory_section(); @@ -397,7 +415,7 @@ int main(int argc, char* argv[]) } else { - max_cached_bytes = max_cached_memory * 1024ull * 1024ull * 1024ull; // max_cached_memory is in GiB + max_cached_bytes = parameters.max_cached_memory * 1024ull * 1024ull * 1024ull; // max_cached_memory is in GiB } std::cerr << "Using device memory cache of " << max_cached_bytes << " bytes" << std::endl; @@ -426,11 +444,11 @@ int main(int argc, char* argv[]) *query_parser, query_start_index, query_end_index, - k, - w, + parameters.k, + parameters.w, device_id, - all_to_all, - filtering_parameter, + parameters.all_to_all, + parameters.filtering_parameter, cuda_stream); } @@ -446,11 +464,11 @@ int main(int argc, char* argv[]) *target_parser, target_start_index, target_end_index, - k, - w, + parameters.k, + parameters.w, device_id, true, - filtering_parameter, + parameters.filtering_parameter, cuda_stream); } { @@ -472,11 +490,11 @@ int main(int argc, char* argv[]) std::vector cigar; // Align overlaps - if (alignment_engines > 0) + if (parameters.alignment_engines > 0) { cigar.resize(overlaps_to_add->size()); CGA_NVTX_RANGE(profiler, "align_overlaps"); - claragenomics::cudamapper::Overlapper::Overlapper::align_overlaps(*overlaps_to_add, *query_parser, *target_parser, alignment_engines, cigar); + claragenomics::cudamapper::Overlapper::Overlapper::align_overlaps(*overlaps_to_add, *query_parser, *target_parser, parameters.alignment_engines, cigar); } //Increment counter which tracks number of overlap chunks to be filtered and printed @@ -513,9 +531,9 @@ int main(int argc, char* argv[]) } // If all-to-all mapping query will no longer be needed on device, remove it from the cache - if (all_to_all) + if (parameters.all_to_all) { - evict_index(query_start_index, query_end_index, device_id, num_devices); + evict_index(query_start_index, query_end_index, device_id, parameters.num_devices); } }; @@ -530,13 +548,13 @@ int main(int argc, char* argv[]) // Each worker thread gets its own CUDA stream to work on. Currently there is only one worker thread per GPU, // but it is still necessary assign streams to each of then explicitly. --default-stream per-thread could // cause problems beacuse there are subthreads for worker threads - std::vector cuda_streams(num_devices); + std::vector cuda_streams(parameters.num_devices); // Launch worker threads to enable multi-GPU. // One worker thread is responsible for one GPU so the number // of worker threads launched is equal to the number of devices specified // by the user - for (int device_id = 0; device_id < num_devices; ++device_id) + for (int device_id = 0; device_id < parameters.num_devices; ++device_id) { CGA_CU_CHECK_ERR(cudaStreamCreate(&cuda_streams[device_id])); //Worker thread consumes query-target ranges off a queue From 96afe4f3d8bfb5110f5e789c1303e4b0935f68ea Mon Sep 17 00:00:00 2001 From: Milos Maric Date: Mon, 23 Mar 2020 18:22:23 +0100 Subject: [PATCH 4/7] [cudamapper] Better output of input name and number of reads --- cudamapper/src/main.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu index 531e84330..1d686e87c 100644 --- a/cudamapper/src/main.cu +++ b/cudamapper/src/main.cu @@ -267,8 +267,8 @@ int main(int argc, char* argv[]) target_parser = claragenomics::io::create_kseq_fasta_parser(parameters.target_filepath, parameters.k + parameters.w - 1); } - std::cerr << "Query " << parameters.query_filepath << " index " << query_parser->get_num_seqences() << std::endl; - std::cerr << "Target " << parameters.target_filepath << " index " << target_parser->get_num_seqences() << std::endl; + std::cerr << "Query file: " << parameters.query_filepath << ", number of reads: " << query_parser->get_num_seqences() << std::endl; + std::cerr << "Target file: " << parameters.target_filepath << ", number of reads: " << target_parser->get_num_seqences() << std::endl; // Data structure for holding overlaps to be written out std::mutex overlaps_writer_mtx; From ead3a1b62fb4faca51b5bd0003bd1b1bdfd5eae4 Mon Sep 17 00:00:00 2001 From: Milos Maric Date: Mon, 23 Mar 2020 18:32:10 +0100 Subject: [PATCH 5/7] [cudamapper] Extracted writter thread --- cudamapper/src/main.cu | 68 +++++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 24 deletions(-) diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu index 1d686e87c..858d8b4ca 100644 --- a/cudamapper/src/main.cu +++ b/cudamapper/src/main.cu @@ -199,6 +199,41 @@ ApplicationParameteres read_input(int argc, char* argv[]) return parameters; } +/// @brief adds read names to overlaps and writes them to output +/// This function is expected to be executed async to matcher + overlapper +/// @param overlaps_writer_mtx locked while writing the output +/// @param num_overlap_chunks_to_print increased before the function is called, decreased right before the function finishes // TODO: improve this design +/// @param filtered_overlaps overlaps to be written out, on input without read names, on output cleared +/// @param query_index needed for read names // TODO: consider only passing vector of names, not whole indices +/// @param target_index needed for read names // TODO: consider only passing vector of names, not whole indices +/// @param cigar +/// @param device_id id of device on which query and target indices were created +void writer_thread_function(std::mutex& overlaps_writer_mtx, + std::atomic& num_overlap_chunks_to_print, + std::shared_ptr> filtered_overlaps, + std::shared_ptr query_index, + std::shared_ptr target_index, + const std::vector cigar, + const int device_id) +{ + // This function is expected to run in a separate thread so set current device in order to avoid problems + // with deallocating indices with different current device than the one on which they were created + cudaSetDevice(device_id); + + // parallel update of the query/target read names for filtered overlaps [parallel on host] + claragenomics::cudamapper::Overlapper::update_read_names(*filtered_overlaps, *query_index, *target_index); + std::lock_guard lck(overlaps_writer_mtx); + claragenomics::cudamapper::Overlapper::print_paf(*filtered_overlaps, cigar); + + //clear data + for (auto o : *filtered_overlaps) + { + o.clear(); + } + //Decrement counter which tracks number of overlap chunks to be filtered and printed + num_overlap_chunks_to_print--; +}; + /// @brief finds largest section of contiguous memory on device /// @return number of bytes std::size_t find_largest_contiguous_device_memory_section() @@ -499,30 +534,15 @@ int main(int argc, char* argv[]) //Increment counter which tracks number of overlap chunks to be filtered and printed num_overlap_chunks_to_print++; - auto print_overlaps = [&overlaps_writer_mtx, &num_overlap_chunks_to_print](std::shared_ptr> filtered_overlaps, - std::shared_ptr query_index, - std::shared_ptr target_index, - const std::vector& cigar, - const int device_id) { - // This lambda is expected to run in a separate thread so set current device in order to avoid problems - // with deallocating indices with different current device then the one on which they were created - cudaSetDevice(device_id); - - // parallel update of the query/target read names for filtered overlaps [parallel on host] - claragenomics::cudamapper::Overlapper::update_read_names(*filtered_overlaps, *query_index, *target_index); - std::lock_guard lck(overlaps_writer_mtx); - claragenomics::cudamapper::Overlapper::print_paf(*filtered_overlaps, cigar); - - //clear data - for (auto o : *filtered_overlaps) - { - o.clear(); - } - //Decrement counter which tracks number of overlap chunks to be filtered and printed - num_overlap_chunks_to_print--; - }; - std::thread t(print_overlaps, overlaps_to_add, query_index, target_index, cigar, device_id); + std::thread t(writer_thread_function, + std::ref(overlaps_writer_mtx), + std::ref(num_overlap_chunks_to_print), + overlaps_to_add, + query_index, + target_index, + std::move(cigar), + device_id); t.detach(); } @@ -600,4 +620,4 @@ int main(int argc, char* argv[]) } return 0; -} +} \ No newline at end of file From b9fb040ac6dea5053724481b9b7b7ee3f13c3700 Mon Sep 17 00:00:00 2001 From: Milos Maric Date: Mon, 23 Mar 2020 18:35:10 +0100 Subject: [PATCH 6/7] [cudamapper] Index size is in MB, not MiB --- cudamapper/src/main.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu index 858d8b4ca..abf6b995b 100644 --- a/cudamapper/src/main.cu +++ b/cudamapper/src/main.cu @@ -314,7 +314,7 @@ int main(int argc, char* argv[]) std::vector> target_ranges; }; - ///Factor of 1000000 to make max cache size in MiB + ///Factor of 1000000 to make max cache size in MB auto query_chunks = query_parser->get_read_chunks(parameters.index_size * 1000000); auto target_chunks = target_parser->get_read_chunks(parameters.target_index_size * 1000000); From 389d94c0c4567afdb783e9d1ec374d5e6259b3d2 Mon Sep 17 00:00:00 2001 From: Milos Maric Date: Tue, 24 Mar 2020 19:02:37 +0100 Subject: [PATCH 7/7] [cudamapper] removed unnecessary std::move --- cudamapper/src/main.cu | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu index abf6b995b..1051b9ea4 100644 --- a/cudamapper/src/main.cu +++ b/cudamapper/src/main.cu @@ -379,15 +379,15 @@ int main(int argc, char* argv[]) else { //create an index, with hashed representations (minimizers) - index = std::move(claragenomics::cudamapper::Index::create_index(allocator, - parser, - start_index, - end_index, - k, - w, - true, // hash_representations - filtering_parameter, - cuda_stream)); + index = claragenomics::cudamapper::Index::create_index(allocator, + parser, + start_index, + end_index, + k, + w, + true, // hash_representations + filtering_parameter, + cuda_stream); // If in all-to-all mode, put this query in the cache for later use. // Cache eviction is handled later on by the calling thread @@ -399,11 +399,11 @@ int main(int argc, char* argv[]) else if (get_size(host_index_cache) < parameters.max_index_cache_size_on_host && allow_cache_index && device_id == 0) { // if not cached on device, update host cache; only done on device 0 to avoid any race conditions in updating the host cache - host_index_cache[key] = std::move(claragenomics::cudamapper::IndexHostCopy::create_cache(*index, - start_index, - k, - w, - cuda_stream)); + host_index_cache[key] = claragenomics::cudamapper::IndexHostCopy::create_cache(*index, + start_index, + k, + w, + cuda_stream); } } return index;