From cbbddb6e0a430a77ca7bca780d8ec8c1f5e646eb Mon Sep 17 00:00:00 2001
From: Milos Maric <mmaric@nvidia.com>
Date: Mon, 23 Mar 2020 17:39:13 +0100
Subject: [PATCH 1/7] [cudamapper] Moved helper functions before main

---
 cudamapper/src/main.cu | 183 ++++++++++++++++++++---------------------
 1 file changed, 90 insertions(+), 93 deletions(-)
diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu
index 8602ed0ad..312274869 100644
--- a/cudamapper/src/main.cu
+++ b/cudamapper/src/main.cu
@@ -42,8 +42,96 @@ static struct option options[] = {
     {"help", no_argument, 0, 'h'},
 };
 
-void help(int32_t exit_code);
-std::size_t find_largest_contiguous_device_memory_section();
+void help(int32_t exit_code = 0)
+{
+    std::cerr <<
+        R"(Usage: cudamapper [options ...] <query_sequences> <target_sequences>
+     <sequences>
+        Input file in FASTA/FASTQ format (can be compressed with gzip)
+        containing sequences used for all-to-all overlapping
+     options:
+        -k, --kmer-size
+            length of kmer to use for minimizers [15] (Max=)"
+              << claragenomics::cudamapper::Index::maximum_kmer_size() << ")"
+              << R"(
+        -w, --window-size
+            length of window to use for minimizers [15])"
+              << R"(
+        -d, --num-devices
+            number of GPUs to use [1])"
+              << R"(
+        -c, --max-index-device-cache
+            number of indices to keep in GPU memory [100])"
+              << R"(
+        -C, --max-index-host-cache
+            number of indices to keep in host memory [0])"
+              << R"(
+        -m, --max-cached-memory
+            maximum aggregate cached memory per device in GiB, if 0 program tries to allocate as much memory as possible [0])"
+              << R"(
+        -i, --index-size
+            length of batch size used for query in MB [30])"
+              << R"(
+        -t, --target-index-size
+            length of batch sized used for target in MB [30])"
+              << R"(
+        -F, --filtering-parameter
+            filter all representations for which sketch_elements_with_that_representation/total_sketch_elements >= filtering_parameter), filtering disabled if filtering_parameter == 1.0 [1'000'000'001] (Min = 0.0, Max = 1.0))"
+              << R"(
+        -a, --alignment-engines
+            Number of alignment engines to use (per device) for generating CIGAR strings for overlap alignments. Default value 0 = no alignment to be performed. Typically 2-4 engines per device gives best perf.)"
+              << std::endl;
+
+    exit(exit_code);
+}
+
+/// @brief finds largest section of contiguous memory on device
+/// @return number of bytes
+std::size_t find_largest_contiguous_device_memory_section()
+{
+    // find the largest block of contiguous memory
+    size_t free;
+    size_t total;
+    cudaMemGetInfo(&free, &total);
+    const size_t memory_decrement = free / 100;              // decrease requested memory one by one percent
+    size_t size_to_try            = free - memory_decrement; // do not go for all memory
+    while (true)
+    {
+        void* dummy_ptr    = nullptr;
+        cudaError_t status = cudaMalloc(&dummy_ptr, size_to_try);
+        // if it was able to allocate memory free the memory and return the size
+        if (status == cudaSuccess)
+        {
+            cudaFree(dummy_ptr);
+            return size_to_try;
+        }
+
+        if (status == cudaErrorMemoryAllocation)
+        {
+            // if it was not possible to allocate the memory because there was not enough of it
+            // try allocating less memory in next iteration
+            if (size_to_try > memory_decrement)
+            {
+                size_to_try -= memory_decrement;
+            }
+            else
+            { // a very small amount of memory left, report an error
+                CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation);
+                return 0;
+            }
+        }
+        else
+        {
+            // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error
+            CGA_CU_CHECK_ERR(status);
+        }
+    }
+
+    // this point should actually never be reached (loop either finds memory or causes an error)
+    assert(false);
+    CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation);
+    return 0;
+}
 
 int main(int argc, char* argv[])
 {
@@ -494,94 +582,3 @@ int main(int argc, char* argv[])
 
     return 0;
 }
-
-void help(int32_t exit_code = 0)
-{
-    std::cerr <<
-        R"(Usage: cudamapper [options ...] <query_sequences> <target_sequences>
-     <sequences>
-        Input file in FASTA/FASTQ format (can be compressed with gzip)
-        containing sequences used for all-to-all overlapping
-     options:
-        -k, --kmer-size
-            length of kmer to use for minimizers [15] (Max=)"
-              << claragenomics::cudamapper::Index::maximum_kmer_size() << ")"
-              << R"(
-        -w, --window-size
-            length of window to use for minimizers [15])"
-              << R"(
-        -d, --num-devices
-            number of GPUs to use [1])"
-              << R"(
-        -c, --max-index-device-cache
-            number of indices to keep in GPU memory [100])"
-              << R"(
-        -C, --max-index-host-cache
-            number of indices to keep in host memory [0])"
-              << R"(
-        -m, --max-cached-memory
-            maximum aggregate cached memory per device in GiB, if 0 program tries to allocate as much memory as possible [0])"
-              << R"(
-        -i, --index-size
-            length of batch size used for query in MB [30])"
-              << R"(
-        -t, --target-index-size
-            length of batch sized used for target in MB [30])"
-              << R"(
-        -F, --filtering-parameter
-            filter all representations for which sketch_elements_with_that_representation/total_sketch_elements >= filtering_parameter), filtering disabled if filtering_parameter == 1.0 [1'000'000'001] (Min = 0.0, Max = 1.0))"
-              << R"(
-        -a, --alignment-engines
-            Number of alignment engines to use (per device) for generating CIGAR strings for overlap alignments. Default value 0 = no alignment to be performed. Typically 2-4 engines per device gives best perf.)"
-              << std::endl;
-
-    exit(exit_code);
-}
-
-/// @brief finds largest section of contiguous memory on device
-/// @return number of bytes
-std::size_t find_largest_contiguous_device_memory_section()
-{
-    // find the largest block of contiguous memory
-    size_t free;
-    size_t total;
-    cudaMemGetInfo(&free, &total);
-    const size_t memory_decrement = free / 100;              // decrease requested memory one by one percent
-    size_t size_to_try            = free - memory_decrement; // do not go for all memory
-    while (true)
-    {
-        void* dummy_ptr    = nullptr;
-        cudaError_t status = cudaMalloc(&dummy_ptr, size_to_try);
-        // if it was able to allocate memory free the memory and return the size
-        if (status == cudaSuccess)
-        {
-            cudaFree(dummy_ptr);
-            return size_to_try;
-        }
-
-        if (status == cudaErrorMemoryAllocation)
-        {
-            // if it was not possible to allocate the memory because there was not enough of it
-            // try allocating less memory in next iteration
-            if (size_to_try > memory_decrement)
-            {
-                size_to_try -= memory_decrement;
-            }
-            else
-            { // a very small amount of memory left, report an error
-                CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation);
-                return 0;
-            }
-        }
-        else
-        {
-            // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error
-            CGA_CU_CHECK_ERR(status);
-        }
-    }
-
-    // this point should actually never be reached (loop either finds memory or causes an error)
-    assert(false);
-    CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation);
-    return 0;
-}

From 2bfbdc5882f7de9aa9f645b2fdba4e878deeb029 Mon Sep 17 00:00:00 2001
From: Milos Maric <mmaric@nvidia.com>
Date: Mon, 23 Mar 2020 18:01:39 +0100
Subject: [PATCH 2/7] [cudamapper] Return from
 find_largest_contiguous_device_memory_section if there was an error

---
 cudamapper/src/main.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu
index 312274869..37c7a1ced 100644
--- a/cudamapper/src/main.cu
+++ b/cudamapper/src/main.cu
@@ -124,6 +124,7 @@ std::size_t find_largest_contiguous_device_memory_section()
         {
             // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error
             CGA_CU_CHECK_ERR(status);
+            return 0;
         }
     }
 

From 9d526500ef65dfb6eab088d4568a474c45ab98b5 Mon Sep 17 00:00:00 2001
From: Milos Maric <mmaric@nvidia.com>
Date: Mon, 23 Mar 2020 18:19:48 +0100
Subject: [PATCH 3/7] [cudamapper] extracted parameters parsing into a separate
 function

---
 cudamapper/src/main.cu | 318 ++++++++++++++++++++++-------------------
 1 file changed, 168 insertions(+), 150 deletions(-)

diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu
index 37c7a1ced..531e84330 100644
--- a/cudamapper/src/main.cu
+++ b/cudamapper/src/main.cu
@@ -28,20 +28,8 @@
 #include <claragenomics/cudamapper/overlapper.hpp>
 #include "overlapper_triggered.hpp"
 
-static struct option options[] = {
-    {"kmer-size", required_argument, 0, 'k'},
-    {"window-size", required_argument, 0, 'w'},
-    {"num-devices", required_argument, 0, 'd'},
-    {"max-index-device-cache", required_argument, 0, 'c'},
-    {"max-index-host-cache", required_argument, 0, 'C'},
-    {"max-cached-memory", required_argument, 0, 'm'},
-    {"index-size", required_argument, 0, 'i'},
-    {"target-index-size", required_argument, 0, 't'},
-    {"filtering-parameter", required_argument, 0, 'F'},
-    {"alignment-engines", required_argument, 0, 'a'},
-    {"help", no_argument, 0, 'h'},
-};
-
+/// @brief prints help message
+/// @param exit_code
 void help(int32_t exit_code = 0)
 {
     std::cerr <<
@@ -85,111 +73,86 @@ void help(int32_t exit_code = 0)
     exit(exit_code);
 }
 
-/// @brief finds largest section of contiguous memory on device
-/// @return number of bytes
-std::size_t find_largest_contiguous_device_memory_section()
+/// @brief application parameteres, default or passed through command line
+struct ApplicationParameteres
 {
-    // find the largest block of contiguous memory
-    size_t free;
-    size_t total;
-    cudaMemGetInfo(&free, &total);
-    const size_t memory_decrement = free / 100;              // decrease requested memory one by one percent
-    size_t size_to_try            = free - memory_decrement; // do not go for all memory
-    while (true)
-    {
-        void* dummy_ptr    = nullptr;
-        cudaError_t status = cudaMalloc(&dummy_ptr, size_to_try);
-        // if it was able to allocate memory free the memory and return the size
-        if (status == cudaSuccess)
-        {
-            cudaFree(dummy_ptr);
-            return size_to_try;
-        }
-
-        if (status == cudaErrorMemoryAllocation)
-        {
-            // if it was not possible to allocate the memory because there was not enough of it
-            // try allocating less memory in next iteration
-            if (size_to_try > memory_decrement)
-            {
-                size_to_try -= memory_decrement;
-            }
-            else
-            { // a very small amount of memory left, report an error
-                CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation);
-                return 0;
-            }
-        }
-        else
-        {
-            // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error
-            CGA_CU_CHECK_ERR(status);
-            return 0;
-        }
-    }
-
-    // this point should actually never be reached (loop either finds memory or causes an error)
-    assert(false);
-    CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation);
-    return 0;
-}
-
-int main(int argc, char* argv[])
-{
-    using claragenomics::get_size;
-    claragenomics::logging::Init();
-
     uint32_t k                                  = 15;  // k
     uint32_t w                                  = 15;  // w
     std::int32_t num_devices                    = 1;   // d
     std::int32_t max_index_cache_size_on_device = 100; // c
-    // ToDo: come up with a good heuristic to choose C and c
-    std::int32_t max_index_cache_size_on_host = 0;   // C
-    std::int32_t max_cached_memory            = 0;   // m
-    std::int32_t index_size                   = 30;  // i
-    std::int32_t target_index_size            = 30;  // t
-    double filtering_parameter                = 1.0; // F
-    std::int32_t alignment_engines            = 0;   // a
-    std::string optstring                     = "k:w:d:c:C:m:i:t:F:h:a:";
-    int32_t argument                          = 0;
+    std::int32_t max_index_cache_size_on_host   = 0;   // C
+    std::int32_t max_cached_memory              = 0;   // m
+    std::int32_t index_size                     = 30;  // i
+    std::int32_t target_index_size              = 30;  // t
+    double filtering_parameter                  = 1.0; // F
+    std::int32_t alignment_engines              = 0;   // a
+    bool all_to_all                             = false;
+    std::string query_filepath;
+    std::string target_filepath;
+};
+
+/// @brief reads input from command line
+/// @param argc
+/// @param argv
+/// @return application parameters passed through command line, default otherwise
+ApplicationParameteres read_input(int argc, char* argv[])
+{
+    ApplicationParameteres parameters;
+
+    struct option options[] = {
+        {"kmer-size", required_argument, 0, 'k'},
+        {"window-size", required_argument, 0, 'w'},
+        {"num-devices", required_argument, 0, 'd'},
+        {"max-index-device-cache", required_argument, 0, 'c'},
+        {"max-index-host-cache", required_argument, 0, 'C'},
+        {"max-cached-memory", required_argument, 0, 'm'},
+        {"index-size", required_argument, 0, 'i'},
+        {"target-index-size", required_argument, 0, 't'},
+        {"filtering-parameter", required_argument, 0, 'F'},
+        {"alignment-engines", required_argument, 0, 'a'},
+        {"help", no_argument, 0, 'h'},
+    };
+
+    std::string optstring = "k:w:d:c:C:m:i:t:F:h:a:";
+    int32_t argument      = 0;
     while ((argument = getopt_long(argc, argv, optstring.c_str(), options, nullptr)) != -1)
     {
         switch (argument)
         {
         case 'k':
-            k = atoi(optarg);
+            parameters.k = atoi(optarg);
             break;
         case 'w':
-            w = atoi(optarg);
+            parameters.w = atoi(optarg);
             break;
         case 'd':
-            num_devices = atoi(optarg);
+            parameters.num_devices = atoi(optarg);
             break;
         case 'c':
-            max_index_cache_size_on_device = atoi(optarg);
+            parameters.max_index_cache_size_on_device = atoi(optarg);
             break;
         case 'C':
-            max_index_cache_size_on_host = atoi(optarg);
+            parameters.max_index_cache_size_on_host = atoi(optarg);
             break;
         case 'm':
 #ifndef CGA_ENABLE_CACHING_ALLOCATOR
             std::cerr << "ERROR: Argument -m / --max-cached-memory cannot be used without caching allocator" << std::endl;
             exit(1);
 #endif
-            max_cached_memory = atoi(optarg);
+            parameters.max_cached_memory = atoi(optarg);
             break;
         case 'i':
-            index_size = atoi(optarg);
+            parameters.index_size = atoi(optarg);
             break;
         case 't':
-            target_index_size = atoi(optarg);
+            parameters.target_index_size = atoi(optarg);
             break;
         case 'F':
-            filtering_parameter = atof(optarg);
+            parameters.filtering_parameter = atof(optarg);
             break;
         case 'a':
-            alignment_engines = atoi(optarg);
-            claragenomics::throw_on_negative(alignment_engines, "Number of alignment engines should be non-negative");
+            parameters.alignment_engines = atoi(optarg);
+            claragenomics::throw_on_negative(parameters.alignment_engines, "Number of alignment engines should be non-negative");
             break;
         case 'h':
             help(0);
@@ -198,19 +161,19 @@ int main(int argc, char* argv[])
         }
     }
 
-    if (k > claragenomics::cudamapper::Index::maximum_kmer_size())
+    if (parameters.k > claragenomics::cudamapper::Index::maximum_kmer_size())
     {
-        std::cerr << "kmer of size " << k << " is not allowed, maximum k = " << claragenomics::cudamapper::Index::maximum_kmer_size() << std::endl;
+        std::cerr << "kmer of size " << parameters.k << " is not allowed, maximum k = " << claragenomics::cudamapper::Index::maximum_kmer_size() << std::endl;
         exit(1);
     }
 
-    if (filtering_parameter > 1.0 || filtering_parameter < 0.0)
+    if (parameters.filtering_parameter > 1.0 || parameters.filtering_parameter < 0.0)
     {
         std::cerr << "-F / --filtering-parameter must be in range [0.0, 1.0]" << std::endl;
         exit(1);
     }
 
-    if (max_cached_memory < 0)
+    if (parameters.max_cached_memory < 0)
     {
         std::cerr << "-m / --max-cached-memory must not be negative" << std::endl;
         exit(1);
@@ -223,34 +186,89 @@ int main(int argc, char* argv[])
         help(1);
     }
 
-    std::string query_filepath  = std::string(argv[optind++]);
-    std::string target_filepath = std::string(argv[optind++]);
+    parameters.query_filepath  = std::string(argv[optind++]);
+    parameters.target_filepath = std::string(argv[optind++]);
 
-    bool all_to_all = false;
-    if (query_filepath == target_filepath)
+    if (parameters.query_filepath == parameters.target_filepath)
     {
-        all_to_all        = true;
-        target_index_size = index_size;
+        parameters.all_to_all        = true;
+        parameters.target_index_size = parameters.index_size;
         std::cerr << "NOTE - Since query and target files are same, activating all_to_all mode. Query index size used for both files." << std::endl;
     }
 
-    std::shared_ptr<claragenomics::io::FastaParser> query_parser = claragenomics::io::create_kseq_fasta_parser(query_filepath, k + w - 1);
-    int32_t queries                                              = query_parser->get_num_seqences();
+    return parameters;
+}
+
+/// @brief finds largest section of contiguous memory on device
+/// @return number of bytes
+std::size_t find_largest_contiguous_device_memory_section()
+{
+    // find the largest block of contiguous memory
+    size_t free;
+    size_t total;
+    cudaMemGetInfo(&free, &total);
+    const size_t memory_decrement = free / 100;              // decrease requested memory one by one percent
+    size_t size_to_try            = free - memory_decrement; // do not go for all memory
+    while (true)
+    {
+        void* dummy_ptr    = nullptr;
+        cudaError_t status = cudaMalloc(&dummy_ptr, size_to_try);
+        // if it was able to allocate memory free the memory and return the size
+        if (status == cudaSuccess)
+        {
+            cudaFree(dummy_ptr);
+            return size_to_try;
+        }
+
+        if (status == cudaErrorMemoryAllocation)
+        {
+            // if it was not possible to allocate the memory because there was not enough of it
+            // try allocating less memory in next iteration
+            if (size_to_try > memory_decrement)
+            {
+                size_to_try -= memory_decrement;
+            }
+            else
+            { // a very small amount of memory left, report an error
+                CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation);
+                return 0;
+            }
+        }
+        else
+        {
+            // if cudaMalloc failed because of error other than cudaErrorMemoryAllocation process the error
+            CGA_CU_CHECK_ERR(status);
+            return 0;
+        }
+    }
+
+    // this point should actually never be reached (loop either finds memory or causes an error)
+    assert(false);
+    CGA_CU_CHECK_ERR(cudaErrorMemoryAllocation);
+    return 0;
+}
+
+int main(int argc, char* argv[])
+{
+    using claragenomics::get_size;
+    claragenomics::logging::Init();
+
+    const ApplicationParameteres parameters = read_input(argc, argv);
+
+    std::shared_ptr<claragenomics::io::FastaParser> query_parser = claragenomics::io::create_kseq_fasta_parser(parameters.query_filepath, parameters.k + parameters.w - 1);
 
     std::shared_ptr<claragenomics::io::FastaParser> target_parser;
-    if (all_to_all)
+    if (parameters.all_to_all)
     {
         target_parser = query_parser;
     }
     else
     {
-        target_parser = claragenomics::io::create_kseq_fasta_parser(target_filepath, k + w - 1);
+        target_parser = claragenomics::io::create_kseq_fasta_parser(parameters.target_filepath, parameters.k + parameters.w - 1);
     }
 
-    int32_t targets = target_parser->get_num_seqences();
-
-    std::cerr << "Query " << query_filepath << " index " << queries << std::endl;
-    std::cerr << "Target " << target_filepath << " index " << targets << std::endl;
+    std::cerr << "Query " << parameters.query_filepath << " index " << query_parser->get_num_seqences() << std::endl;
+    std::cerr << "Target " << parameters.target_filepath << " index " << target_parser->get_num_seqences() << std::endl;
 
     // Data structure for holding overlaps to be written out
     std::mutex overlaps_writer_mtx;
@@ -262,8 +280,8 @@ int main(int argc, char* argv[])
     };
 
     ///Factor of 1000000 to make max cache size in MiB
-    auto query_chunks  = query_parser->get_read_chunks(index_size * 1000000);
-    auto target_chunks = target_parser->get_read_chunks(target_index_size * 1000000);
+    auto query_chunks  = query_parser->get_read_chunks(parameters.index_size * 1000000);
+    auto target_chunks = target_parser->get_read_chunks(parameters.target_index_size * 1000000);
 
     //First generate all the ranges independently, then loop over them.
     std::vector<QueryTargetsRange> query_target_ranges;
@@ -281,7 +299,7 @@ int main(int argc, char* argv[])
         // in all-to-all, for query chunk 0, we go through target chunks [target_idx = 0 , n = target_chunks.size())
         // for query chunk 1, we only need target chunks [target_idx = 1 , n), and in general for query_chunk i, we need target chunks [target_idx = i , n)
         // therefore as we're looping through query chunks, in all-to-all, will increment target_idx
-        if (all_to_all)
+        if (parameters.all_to_all)
         {
             target_idx++;
         }
@@ -291,21 +309,21 @@ int main(int argc, char* argv[])
     std::map<std::pair<uint64_t, uint64_t>, std::shared_ptr<claragenomics::cudamapper::IndexHostCopy>> host_index_cache;
 
     // This is a per-device cache, if it has the index it will return it, if not it will generate it, store and return it.
-    std::vector<std::map<std::pair<uint64_t, uint64_t>, std::shared_ptr<claragenomics::cudamapper::Index>>> device_index_cache(num_devices);
+    std::vector<std::map<std::pair<uint64_t, uint64_t>, std::shared_ptr<claragenomics::cudamapper::Index>>> device_index_cache(parameters.num_devices);
 
     // The number of overlap chunks which are to be computed
     std::atomic<int> num_overlap_chunks_to_print(0);
 
-    auto get_index = [&device_index_cache, &host_index_cache, max_index_cache_size_on_device, max_index_cache_size_on_host](claragenomics::DefaultDeviceAllocator allocator,
-                                                                                                                            claragenomics::io::FastaParser& parser,
-                                                                                                                            const claragenomics::cudamapper::read_id_t start_index,
-                                                                                                                            const claragenomics::cudamapper::read_id_t end_index,
-                                                                                                                            const std::uint64_t k,
-                                                                                                                            const std::uint64_t w,
-                                                                                                                            const int device_id,
-                                                                                                                            const bool allow_cache_index,
-                                                                                                                            const double filtering_parameter,
-                                                                                                                            const cudaStream_t cuda_stream) {
+    auto get_index = [&device_index_cache, &host_index_cache, &parameters](claragenomics::DefaultDeviceAllocator allocator,
+                                                                           claragenomics::io::FastaParser& parser,
+                                                                           const claragenomics::cudamapper::read_id_t start_index,
+                                                                           const claragenomics::cudamapper::read_id_t end_index,
+                                                                           const std::uint64_t k,
+                                                                           const std::uint64_t w,
+                                                                           const int device_id,
+                                                                           const bool allow_cache_index,
+                                                                           const double filtering_parameter,
+                                                                           const cudaStream_t cuda_stream) {
         CGA_NVTX_RANGE(profiler, "get index");
         std::pair<uint64_t, uint64_t> key;
         key.first  = start_index;
@@ -326,31 +344,31 @@ int main(int argc, char* argv[])
         else
         {
             //create an index, with hashed representations (minimizers)
-            index = claragenomics::cudamapper::Index::create_index(allocator,
-                                                                   parser,
-                                                                   start_index,
-                                                                   end_index,
-                                                                   k,
-                                                                   w,
-                                                                   true, // hash_representations
-                                                                   filtering_parameter,
-                                                                   cuda_stream);
+            index = std::move(claragenomics::cudamapper::Index::create_index(allocator,
+                                                                             parser,
+                                                                             start_index,
+                                                                             end_index,
+                                                                             k,
+                                                                             w,
+                                                                             true, // hash_representations
+                                                                             filtering_parameter,
+                                                                             cuda_stream));
 
             // If in all-to-all mode, put this query in the cache for later use.
             // Cache eviction is handled later on by the calling thread
             // using the evict_index function.
-            if (get_size<int32_t>(device_index_cache[device_id]) < max_index_cache_size_on_device && allow_cache_index)
+            if (get_size<int32_t>(device_index_cache[device_id]) < parameters.max_index_cache_size_on_device && allow_cache_index)
             {
                 device_index_cache[device_id][key] = index;
             }
-            else if (get_size<int32_t>(host_index_cache) < max_index_cache_size_on_host && allow_cache_index && device_id == 0)
+            else if (get_size<int32_t>(host_index_cache) < parameters.max_index_cache_size_on_host && allow_cache_index && device_id == 0)
             {
                 // if not cached on device, update host cache; only done on device 0 to avoid any race conditions in updating the host cache
-                host_index_cache[key] = claragenomics::cudamapper::IndexHostCopy::create_cache(*index,
-                                                                                               start_index,
-                                                                                               k,
-                                                                                               w,
-                                                                                               cuda_stream);
+                host_index_cache[key] = std::move(claragenomics::cudamapper::IndexHostCopy::create_cache(*index,
+                                                                                                         start_index,
+                                                                                                         k,
+                                                                                                         w,
+                                                                                                         cuda_stream));
             }
         }
         return index;
@@ -385,7 +403,7 @@ int main(int argc, char* argv[])
 #ifdef CGA_ENABLE_CACHING_ALLOCATOR
     // uses CachingDeviceAllocator
     std::size_t max_cached_bytes = 0;
-    if (max_cached_memory == 0)
+    if (parameters.max_cached_memory == 0)
     {
         std::cerr << "Programmatically looking for max cached memory" << std::endl;
         max_cached_bytes = find_largest_contiguous_device_memory_section();
@@ -397,7 +415,7 @@ int main(int argc, char* argv[])
     }
     else
     {
-        max_cached_bytes = max_cached_memory * 1024ull * 1024ull * 1024ull; // max_cached_memory is in GiB
+        max_cached_bytes = parameters.max_cached_memory * 1024ull * 1024ull * 1024ull; // max_cached_memory is in GiB
     }
 
     std::cerr << "Using device memory cache of " << max_cached_bytes << " bytes" << std::endl;
@@ -426,11 +444,11 @@ int main(int argc, char* argv[])
                                     *query_parser,
                                     query_start_index,
                                     query_end_index,
-                                    k,
-                                    w,
+                                    parameters.k,
+                                    parameters.w,
                                     device_id,
-                                    all_to_all,
-                                    filtering_parameter,
+                                    parameters.all_to_all,
+                                    parameters.filtering_parameter,
                                     cuda_stream);
         }
 
@@ -446,11 +464,11 @@ int main(int argc, char* argv[])
                                          *target_parser,
                                          target_start_index,
                                          target_end_index,
-                                         k,
-                                         w,
+                                         parameters.k,
+                                         parameters.w,
                                          device_id,
                                          true,
-                                         filtering_parameter,
+                                         parameters.filtering_parameter,
                                          cuda_stream);
             }
             {
@@ -472,11 +490,11 @@ int main(int argc, char* argv[])
 
                 std::vector<std::string> cigar;
                 // Align overlaps
-                if (alignment_engines > 0)
+                if (parameters.alignment_engines > 0)
                 {
                     cigar.resize(overlaps_to_add->size());
                     CGA_NVTX_RANGE(profiler, "align_overlaps");
-                    claragenomics::cudamapper::Overlapper::Overlapper::align_overlaps(*overlaps_to_add, *query_parser, *target_parser, alignment_engines, cigar);
+                    claragenomics::cudamapper::Overlapper::Overlapper::align_overlaps(*overlaps_to_add, *query_parser, *target_parser, parameters.alignment_engines, cigar);
                 }
 
                 //Increment counter which tracks number of overlap chunks to be filtered and printed
@@ -513,9 +531,9 @@ int main(int argc, char* argv[])
         }
 
         // If all-to-all mapping query will no longer be needed on device, remove it from the cache
-        if (all_to_all)
+        if (parameters.all_to_all)
         {
-            evict_index(query_start_index, query_end_index, device_id, num_devices);
+            evict_index(query_start_index, query_end_index, device_id, parameters.num_devices);
         }
     };
 
@@ -530,13 +548,13 @@ int main(int argc, char* argv[])
     // Each worker thread gets its own CUDA stream to work on. Currently there is only one worker thread per GPU,
     // but it is still necessary assign streams to each of then explicitly. --default-stream per-thread could
     // cause problems beacuse there are subthreads for worker threads
-    std::vector<cudaStream_t> cuda_streams(num_devices);
+    std::vector<cudaStream_t> cuda_streams(parameters.num_devices);
 
     // Launch worker threads to enable multi-GPU.
     // One worker thread is responsible for one GPU so the number
     // of worker threads launched is equal to the number of devices specified
     // by the user
-    for (int device_id = 0; device_id < num_devices; ++device_id)
+    for (int device_id = 0; device_id < parameters.num_devices; ++device_id)
     {
         CGA_CU_CHECK_ERR(cudaStreamCreate(&cuda_streams[device_id]));
         //Worker thread consumes query-target ranges off a queue

From 96afe4f3d8bfb5110f5e789c1303e4b0935f68ea Mon Sep 17 00:00:00 2001
From: Milos Maric <mmaric@nvidia.com>
Date: Mon, 23 Mar 2020 18:22:23 +0100
Subject: [PATCH 4/7] [cudamapper] Better output of input name and number of
 reads

---
 cudamapper/src/main.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu
index 531e84330..1d686e87c 100644
--- a/cudamapper/src/main.cu
+++ b/cudamapper/src/main.cu
@@ -267,8 +267,8 @@ int main(int argc, char* argv[])
         target_parser = claragenomics::io::create_kseq_fasta_parser(parameters.target_filepath, parameters.k + parameters.w - 1);
     }
 
-    std::cerr << "Query " << parameters.query_filepath << " index " << query_parser->get_num_seqences() << std::endl;
-    std::cerr << "Target " << parameters.target_filepath << " index " << target_parser->get_num_seqences() << std::endl;
+    std::cerr << "Query file: " << parameters.query_filepath << ", number of reads: " << query_parser->get_num_seqences() << std::endl;
+    std::cerr << "Target file: " << parameters.target_filepath << ", number of reads: " << target_parser->get_num_seqences() << std::endl;
 
     // Data structure for holding overlaps to be written out
     std::mutex overlaps_writer_mtx;

From ead3a1b62fb4faca51b5bd0003bd1b1bdfd5eae4 Mon Sep 17 00:00:00 2001
From: Milos Maric <mmaric@nvidia.com>
Date: Mon, 23 Mar 2020 18:32:10 +0100
Subject: [PATCH 5/7] [cudamapper] Extracted writter thread

---
 cudamapper/src/main.cu | 68 +++++++++++++++++++++++++++---------------
 1 file changed, 44 insertions(+), 24 deletions(-)

diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu
index 1d686e87c..858d8b4ca 100644
--- a/cudamapper/src/main.cu
+++ b/cudamapper/src/main.cu
@@ -199,6 +199,41 @@ ApplicationParameteres read_input(int argc, char* argv[])
     return parameters;
 }
 
+/// @brief adds read names to overlaps and writes them to output
+/// This function is expected to be executed async to matcher + overlapper
+/// @param overlaps_writer_mtx locked while writing the output
+/// @param num_overlap_chunks_to_print increased before the function is called, decreased right before the function finishes // TODO: improve this design
+/// @param filtered_overlaps overlaps to be written out, on input without read names, on output cleared
+/// @param query_index needed for read names // TODO: consider only passing vector of names, not whole indices
+/// @param target_index needed for read names // TODO: consider only passing vector of names, not whole indices
+/// @param cigar
+/// @param device_id id of device on which query and target indices were created
+void writer_thread_function(std::mutex& overlaps_writer_mtx,
+                            std::atomic<int>& num_overlap_chunks_to_print,
+                            std::shared_ptr<std::vector<claragenomics::cudamapper::Overlap>> filtered_overlaps,
+                            std::shared_ptr<claragenomics::cudamapper::Index> query_index,
+                            std::shared_ptr<claragenomics::cudamapper::Index> target_index,
+                            const std::vector<std::string> cigar,
+                            const int device_id)
+{
+    // This function is expected to run in a separate thread so set current device in order to avoid problems
+    // with deallocating indices with different current device than the one on which they were created
+    cudaSetDevice(device_id);
+
+    // parallel update of the query/target read names for filtered overlaps [parallel on host]
+    claragenomics::cudamapper::Overlapper::update_read_names(*filtered_overlaps, *query_index, *target_index);
+    std::lock_guard<std::mutex> lck(overlaps_writer_mtx);
+    claragenomics::cudamapper::Overlapper::print_paf(*filtered_overlaps, cigar);
+
+    //clear data
+    for (auto o : *filtered_overlaps)
+    {
+        o.clear();
+    }
+    //Decrement counter which tracks number of overlap chunks to be filtered and printed
+    num_overlap_chunks_to_print--;
+};
+
 /// @brief finds largest section of contiguous memory on device
 /// @return number of bytes
 std::size_t find_largest_contiguous_device_memory_section()
@@ -499,30 +534,15 @@ int main(int argc, char* argv[])
 
                 //Increment counter which tracks number of overlap chunks to be filtered and printed
                 num_overlap_chunks_to_print++;
-                auto print_overlaps = [&overlaps_writer_mtx, &num_overlap_chunks_to_print](std::shared_ptr<std::vector<claragenomics::cudamapper::Overlap>> filtered_overlaps,
-                                                                                           std::shared_ptr<claragenomics::cudamapper::Index> query_index,
-                                                                                           std::shared_ptr<claragenomics::cudamapper::Index> target_index,
-                                                                                           const std::vector<std::string>& cigar,
-                                                                                           const int device_id) {
-                    // This lambda is expected to run in a separate thread so set current device in order to avoid problems
-                    // with deallocating indices with different current device then the one on which they were created
-                    cudaSetDevice(device_id);
-
-                    // parallel update of the query/target read names for filtered overlaps [parallel on host]
-                    claragenomics::cudamapper::Overlapper::update_read_names(*filtered_overlaps, *query_index, *target_index);
-                    std::lock_guard<std::mutex> lck(overlaps_writer_mtx);
-                    claragenomics::cudamapper::Overlapper::print_paf(*filtered_overlaps, cigar);
-
-                    //clear data
-                    for (auto o : *filtered_overlaps)
-                    {
-                        o.clear();
-                    }
-                    //Decrement counter which tracks number of overlap chunks to be filtered and printed
-                    num_overlap_chunks_to_print--;
-                };
 
-                std::thread t(print_overlaps, overlaps_to_add, query_index, target_index, cigar, device_id);
+                std::thread t(writer_thread_function,
+                              std::ref(overlaps_writer_mtx),
+                              std::ref(num_overlap_chunks_to_print),
+                              overlaps_to_add,
+                              query_index,
+                              target_index,
+                              std::move(cigar),
+                              device_id);
                 t.detach();
             }
 
@@ -600,4 +620,4 @@ int main(int argc, char* argv[])
     }
 
     return 0;
-}
+}
\ No newline at end of file

From b9fb040ac6dea5053724481b9b7b7ee3f13c3700 Mon Sep 17 00:00:00 2001
From: Milos Maric <mmaric@nvidia.com>
Date: Mon, 23 Mar 2020 18:35:10 +0100
Subject: [PATCH 6/7] [cudamapper] Index size is in MB, not MiB

---
 cudamapper/src/main.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu
index 858d8b4ca..abf6b995b 100644
--- a/cudamapper/src/main.cu
+++ b/cudamapper/src/main.cu
@@ -314,7 +314,7 @@ int main(int argc, char* argv[])
         std::vector<std::pair<std::int32_t, int32_t>> target_ranges;
     };
 
-    ///Factor of 1000000 to make max cache size in MiB
+    ///Factor of 1000000 to make max cache size in MB
     auto query_chunks  = query_parser->get_read_chunks(parameters.index_size * 1000000);
     auto target_chunks = target_parser->get_read_chunks(parameters.target_index_size * 1000000);
 

From 389d94c0c4567afdb783e9d1ec374d5e6259b3d2 Mon Sep 17 00:00:00 2001
From: Milos Maric <mmaric@nvidia.com>
Date: Tue, 24 Mar 2020 19:02:37 +0100
Subject: [PATCH 7/7] [cudamapper] removed unnecessary std::move

---
 cudamapper/src/main.cu | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/cudamapper/src/main.cu b/cudamapper/src/main.cu
index abf6b995b..1051b9ea4 100644
--- a/cudamapper/src/main.cu
+++ b/cudamapper/src/main.cu
@@ -379,15 +379,15 @@ int main(int argc, char* argv[])
         else
         {
             //create an index, with hashed representations (minimizers)
-            index = std::move(claragenomics::cudamapper::Index::create_index(allocator,
-                                                                             parser,
-                                                                             start_index,
-                                                                             end_index,
-                                                                             k,
-                                                                             w,
-                                                                             true, // hash_representations
-                                                                             filtering_parameter,
-                                                                             cuda_stream));
+            index = claragenomics::cudamapper::Index::create_index(allocator,
+                                                                   parser,
+                                                                   start_index,
+                                                                   end_index,
+                                                                   k,
+                                                                   w,
+                                                                   true, // hash_representations
+                                                                   filtering_parameter,
+                                                                   cuda_stream);
 
             // If in all-to-all mode, put this query in the cache for later use.
             // Cache eviction is handled later on by the calling thread
@@ -399,11 +399,11 @@ int main(int argc, char* argv[])
             else if (get_size<int32_t>(host_index_cache) < parameters.max_index_cache_size_on_host && allow_cache_index && device_id == 0)
             {
                 // if not cached on device, update host cache; only done on device 0 to avoid any race conditions in updating the host cache
-                host_index_cache[key] = std::move(claragenomics::cudamapper::IndexHostCopy::create_cache(*index,
-                                                                                                         start_index,
-                                                                                                         k,
-                                                                                                         w,
-                                                                                                         cuda_stream));
+                host_index_cache[key] = claragenomics::cudamapper::IndexHostCopy::create_cache(*index,
+                                                                                               start_index,
+                                                                                               k,
+                                                                                               w,
+                                                                                               cuda_stream);
             }
         }
         return index;