From 3bbf2e05cf7669c69f5b305dcc7ebfcf858c6910 Mon Sep 17 00:00:00 2001
From: Prabindh Sundareson
Date: Tue, 9 Jan 2024 18:11:00 +0530
Subject: [PATCH] [Enhancement][1:1 docking] Add CUDA stream-based acceleration
 for 1:1 docking

- Add argument paired_batch_size for specifying fast 1:1 pairing
- Add custom handlers for paired_batch_size
- Define JSON schema for specifying 1:1 combinations and config
- Add use_fast_math support in cmake build (enabled by default)
- Fix memory leaks in AutoDock Vina scoring_function reported by Valgrind
- Fix missing CUDA memory initialisation that caused crashes

The original code flow is retained when paired_batch_size is not specified.
Please refer to the README for details on using the paired_batch_size argument.
---
 unidock/CMakeLists.txt                       |   9 +-
 unidock/README.md                            |  25 +
 unidock/src/cuda/monte_carlo.cu              | 618 +++++++++++++++++++
 unidock/src/lib/monte_carlo.h                |   6 +
 unidock/src/lib/scoring_function.h           |  26 +-
 unidock/src/lib/vina.cpp                     |  66 +-
 unidock/src/lib/vina.h                       |   7 +-
 unidock/src/main/complex_property.h          | 111 ++++
 unidock/src/main/main.cpp                    |  53 +-
 unidock/src/main/paired_batching.schema.json |  28 +
 unidock/src/main/simulation_container.h      | 517 ++++++++++++++++
 unidock/src/main/vina_cuda_worker.h          | 191 ++++++
 unidock/src/rocm/main.cu.hip                 |   2 +-
 13 files changed, 1616 insertions(+), 43 deletions(-)
 create mode 100644 unidock/src/main/complex_property.h
 create mode 100644 unidock/src/main/paired_batching.schema.json
 create mode 100644 unidock/src/main/simulation_container.h
 create mode 100644 unidock/src/main/vina_cuda_worker.h

diff --git a/unidock/CMakeLists.txt b/unidock/CMakeLists.txt
index f920aca..ff2b616 100644
--- a/unidock/CMakeLists.txt
+++ b/unidock/CMakeLists.txt
@@ -26,6 +26,13 @@ if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
     )
 endif()
 
+# Add fast math
+option(BUILD_FAST_MATH "Build in fast math mode" ON)
+if(BUILD_FAST_MATH)
+    set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} --use_fast_math")
+    message("CMAKE_CUDA_FLAGS_RELEASE updated for fast_math: ${CMAKE_CUDA_FLAGS_RELEASE}")
+endif()
+
 set(VINA_BIN_NAME unidock)
 add_compile_definitions(ENABLE_CUDA)
 add_compile_definitions(VERSION="v${PROJECT_VERSION}")
@@ -61,4 +68,4 @@ add_custom_target(
     COMMAND ${CLANG_FORMAT} -i ${sources}
     COMMENT "Running clang-format"
     VERBATIM
-)
\ No newline at end of file
+)
diff --git a/unidock/README.md b/unidock/README.md
index 3fb097f..efc925a 100644
--- a/unidock/README.md
+++ b/unidock/README.md
@@ -235,3 +235,28 @@ DOI 10.1002/jcc.21334
 3. Uni-Dock computes slowly for few (<10) ligands.
    The optimal application of Uni-Dock occurs in scenarios where one binding pocket interacts with numerous (on the order of 1000) ligands. As the number of ligands within a single computational batch increases, the average processing speed improves. In instances where only a few ligands are present for one binding pocket, the overhead proportion becomes considerably large, leading to slower computational performance.
+
+### Addendum to FAQ 3 ("Uni-Dock computes slowly for few (<10) ligands")
+
+The `paired_batch` mode provides a mechanism to accelerate simultaneous 1:1 docking in batches with Vina scoring, using CUDA streams. To run docking in this mode, invoke unidock with `--paired_batch_size` set to a value >0, and pass the protein:ligand configurations in JSON form via `--ligand_index`. The JSON file should follow the schema defined in paired_batching.schema.json.
+
+A sample input data.json, conforming to the schema, is shown below:
+```
+{
+    "7LCU": {
+        "protein": "molecular_docking/PoseBuster/7LCU/7LCU_receptor.pdbqt",
+        "ligand": "molecular_docking/PoseBuster/7LCU/7LCU_ligand_prep.sdf",
+        "ligand_config": "molecular_docking/PoseBuster/7LCU/docking_grid.json"
+    },
+    "7KZ9": {
+        "protein": "molecular_docking/PoseBuster/7KZ9/7KZ9_receptor.pdbqt",
+        "ligand": "molecular_docking/PoseBuster/7KZ9/7KZ9_ligand_prep.sdf",
+        "ligand_config": "molecular_docking/PoseBuster/7KZ9/docking_grid.json"
+    }
+}
+```
+
+Typical usage of paired_batch mode, with a batch size of 10, is shown below:
+
+`build/unidock --paired_batch_size 10 --ligand_index data_pb1.json --size_x 25 --size_y 25 --size_z 25 --dir test/prof_25_1024_80 --exhaustiveness 1024 --max_step 80 --seed 5`
+
diff --git a/unidock/src/cuda/monte_carlo.cu b/unidock/src/cuda/monte_carlo.cu
index bb2c408..b0ad2e0 100644
--- a/unidock/src/cuda/monte_carlo.cu
+++ b/unidock/src/cuda/monte_carlo.cu
@@ -212,6 +212,604 @@ __global__ __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) void kern
 
 /* Above based on kernel.cl */
 
+
+__host__ void monte_carlo::mc_stream(
+    std::vector<model> &m_gpu, std::vector<output_container> &out_gpu,
+    std::vector<precalculate_byatom> &p_gpu, triangular_matrix_cuda_t *m_data_list_gpu,
+    const igrid &ig, const vec &corner1, const vec &corner2, rng &generator, int verbosity,
+    unsigned long long seed, std::vector<std::vector<bias_element>> &bias_batch_list) const {
+    /* Definitions from vina1.2 */
+    DEBUG_PRINTF("entering CUDA monte_carlo search\n");  // debug
+
+    cudaStream_t curr_stream = 0;
+    checkCUDA(cudaStreamCreate(&curr_stream));
+    DEBUG_PRINTF("Stream created [0x%p]\n", curr_stream);
+
+    vec authentic_v(1000, 1000,
+                    1000);  // FIXME? this is here to avoid max_fl/max_fl
+
+    quasi_newton quasi_newton_par;
+    const int quasi_newton_par_max_steps = local_steps;  // no need to decrease step
+
+    /* Allocate CPU memory and define new data structure */
+    DEBUG_PRINTF("Allocating CPU memory\n");  // debug
+    m_cuda_t *m_cuda;
+    checkCUDA(cudaMallocHost(&m_cuda, sizeof(m_cuda_t)));
+    memset(m_cuda, 0, sizeof(m_cuda_t));
+
+    output_type_cuda_t *rand_molec_struc_tmp;
+    checkCUDA(cudaMallocHost(&rand_molec_struc_tmp, sizeof(output_type_cuda_t)));
+    memset(rand_molec_struc_tmp, 0, sizeof(output_type_cuda_t));
+
+    ig_cuda_t *ig_cuda_ptr;
+    checkCUDA(cudaMallocHost(&ig_cuda_ptr, sizeof(ig_cuda_t)));
+    memset(ig_cuda_ptr, 0, sizeof(ig_cuda_t));
+
+    p_cuda_t_cpu *p_cuda;
+    checkCUDA(cudaMallocHost(&p_cuda, sizeof(p_cuda_t_cpu)));
+    memset(p_cuda, 0, sizeof(p_cuda_t_cpu));
+
+    /* End CPU allocation */
+
+    /* Allocate GPU memory */
+    DEBUG_PRINTF("Allocating GPU memory\n");
+    size_t m_cuda_size = sizeof(m_cuda_t);
+    DEBUG_PRINTF("m_cuda_size=%lu\n", m_cuda_size);
+    size_t ig_cuda_size = sizeof(ig_cuda_t);
+    DEBUG_PRINTF("ig_cuda_size=%lu\n", ig_cuda_size);
+    DEBUG_PRINTF("p_cuda_size_cpu=%lu\n", sizeof(p_cuda_t_cpu));
+
+    size_t p_cuda_size_gpu = sizeof(p_cuda_t);
+    DEBUG_PRINTF("p_cuda_size_gpu=%lu\n", p_cuda_size_gpu);
+
+    // rand_molec_struc_gpu
+    float *rand_molec_struc_gpu;
+    checkCUDA(cudaMalloc(&rand_molec_struc_gpu, thread * SIZE_OF_MOLEC_STRUC));
+    checkCUDA(cudaMemsetAsync(rand_molec_struc_gpu, 0, thread * SIZE_OF_MOLEC_STRUC, curr_stream));
+
+    float epsilon_fl_float = static_cast<float>(epsilon_fl);
+
+    // use cuRand to generate random values on GPU
+    curandStatePhilox4_32_10_t *states;
+    DEBUG_PRINTF("random states size=%lu\n", sizeof(curandStatePhilox4_32_10_t) * thread);
+    checkCUDA(cudaMalloc(&states, sizeof(curandStatePhilox4_32_10_t) * thread));
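+    // Defensive zero-initialisation: per the commit notes, missing CUDA
+    // memory initialisation caused crashes, so every cudaMalloc in this
+    // function is paired with a cudaMemsetAsync on the worker stream.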
checkCUDA(cudaMemsetAsync(states, 0, sizeof(curandStatePhilox4_32_10_t) * thread, curr_stream)); + + // hunt_cap_gpu + float *hunt_cap_gpu; + float hunt_cap_float[3] = {static_cast(hunt_cap[0]), static_cast(hunt_cap[1]), + static_cast(hunt_cap[2])}; + + checkCUDA(cudaMalloc(&hunt_cap_gpu, 3 * sizeof(float))); + checkCUDA(cudaMemsetAsync(hunt_cap_gpu, 0, 3 * sizeof(float), curr_stream)); + // Preparing m related data + m_cuda_t *m_cuda_gpu; + DEBUG_PRINTF("m_cuda_size=%lu", m_cuda_size); + checkCUDA(cudaMalloc(&m_cuda_gpu, num_of_ligands * m_cuda_size)); + checkCUDA(cudaMemsetAsync(m_cuda_gpu, 0, num_of_ligands * m_cuda_size, curr_stream)); + // Preparing p related data + + p_cuda_t *p_cuda_gpu; + checkCUDA(cudaMalloc(&p_cuda_gpu, num_of_ligands * p_cuda_size_gpu)); + checkCUDA(cudaMemsetAsync(p_cuda_gpu, 0, num_of_ligands * p_cuda_size_gpu, curr_stream)); + DEBUG_PRINTF("p_cuda_gpu=%p\n", p_cuda_gpu); + // Preparing ig related data (cache related data) + ig_cuda_t *ig_cuda_gpu; + + float *authentic_v_gpu; + float authentic_v_float[3] + = {static_cast(authentic_v[0]), static_cast(authentic_v[1]), + static_cast(authentic_v[2])}; + + checkCUDA(cudaMalloc(&authentic_v_gpu, sizeof(authentic_v_float))); + checkCUDA(cudaMemsetAsync(authentic_v_gpu, 0, sizeof(authentic_v_float), curr_stream)); + // Preparing result data + output_type_cuda_t *results_gpu; + checkCUDA(cudaMalloc(&results_gpu, thread * sizeof(output_type_cuda_t))); + checkCUDA(cudaMemsetAsync(results_gpu, 0, thread * sizeof(output_type_cuda_t), curr_stream)); + + m_cuda_t *m_cuda_global; + checkCUDA(cudaMalloc(&m_cuda_global, thread * sizeof(m_cuda_t))); + checkCUDA(cudaMemsetAsync(m_cuda_global, 0, thread * sizeof(m_cuda_t), curr_stream)); + + matrix_d *h_cuda_global; + checkCUDA(cudaMalloc(&h_cuda_global, thread * sizeof(matrix_d))); + checkCUDA(cudaMemsetAsync(h_cuda_global, 0, thread * sizeof(matrix_d), curr_stream)); + + /* End Allocating GPU Memory */ + + assert(num_of_ligands <= MAX_LIGAND_NUM); + assert(thread <= MAX_THREAD); + + struct tmp_struct { + int start_index = 0; + int parent_index = 0; + void store_node(tree &child_ptr, rigid_cuda_t &rigid) { + start_index++; // start with index 1, index 0 is root node + rigid.parent[start_index] = parent_index; + rigid.atom_range[start_index][0] = child_ptr.node.begin; + rigid.atom_range[start_index][1] = child_ptr.node.end; + for (int i = 0; i < 9; i++) + rigid.orientation_m[start_index][i] = child_ptr.node.get_orientation_m().data[i]; + rigid.orientation_q[start_index][0] = child_ptr.node.orientation().R_component_1(); + rigid.orientation_q[start_index][1] = child_ptr.node.orientation().R_component_2(); + rigid.orientation_q[start_index][2] = child_ptr.node.orientation().R_component_3(); + rigid.orientation_q[start_index][3] = child_ptr.node.orientation().R_component_4(); + for (int i = 0; i < 3; i++) { + rigid.origin[start_index][i] = child_ptr.node.get_origin()[i]; + rigid.axis[start_index][i] = child_ptr.node.get_axis()[i]; + rigid.relative_axis[start_index][i] = child_ptr.node.relative_axis[i]; + rigid.relative_origin[start_index][i] = child_ptr.node.relative_origin[i]; + } + if (child_ptr.children.size() == 0) + return; + else { + assert(start_index < MAX_NUM_OF_RIGID); + int parent_index_tmp = start_index; + for (int i = 0; i < child_ptr.children.size(); i++) { + this->parent_index = parent_index_tmp; // Update parent index + this->store_node(child_ptr.children[i], rigid); + } + } + } + }; + + for (int l = 0; l < num_of_ligands; ++l) { + model &m = m_gpu[l]; + const 
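+        // Stage this ligand on the host: the model is flattened into the
+        // pinned m_cuda struct and its precalculated pair potentials into
+        // p_cuda, then both are uploaded with cudaMemcpyAsync on curr_stream.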
precalculate_byatom &p = p_gpu[l]; + + /* Prepare m related data */ + conf_size s = m.get_size(); + change g(s); + output_type tmp(s, 0); + tmp.c = m.get_initial_conf(); + + assert(m.atoms.size() < MAX_NUM_OF_ATOMS); + + // Preparing ligand data + DEBUG_PRINTF("prepare ligand data\n"); + assert(m.num_other_pairs() == 0); // m.other_pairs is not supported! + assert(m.ligands.size() <= 1); // Only one ligand supported! + + if (m.ligands.size() == 0) { // ligand parsing error + m_cuda->m_num_movable_atoms = -1; + DEBUG_PRINTF("copy m_cuda to gpu, size=%lu\n", sizeof(m_cuda_t)); + checkCUDA(cudaMemcpyAsync(m_cuda_gpu + l, m_cuda, sizeof(m_cuda_t), cudaMemcpyHostToDevice, curr_stream)); + } else { + for (int i = 0; i < m.atoms.size(); i++) { + m_cuda->atoms[i].types[0] + = m.atoms[i].el; // To store 4 atoms types (el, ad, xs, sy) + m_cuda->atoms[i].types[1] = m.atoms[i].ad; + m_cuda->atoms[i].types[2] = m.atoms[i].xs; + m_cuda->atoms[i].types[3] = m.atoms[i].sy; + for (int j = 0; j < 3; j++) { + m_cuda->atoms[i].coords[j] = m.atoms[i].coords[j]; // To store atom coords + } + } + + // To store atoms coords + for (int i = 0; i < m.coords.size(); i++) { + for (int j = 0; j < 3; j++) { + m_cuda->m_coords.coords[i][j] = m.coords[i].data[j]; + } + } + + // To store minus forces + for (int i = 0; i < m.coords.size(); i++) { + for (int j = 0; j < 3; j++) { + m_cuda->minus_forces.coords[i][j] = m.minus_forces[i].data[j]; + } + } + + m_cuda->ligand.pairs.num_pairs = m.ligands[0].pairs.size(); + for (int i = 0; i < m_cuda->ligand.pairs.num_pairs; i++) { + m_cuda->ligand.pairs.type_pair_index[i] = m.ligands[0].pairs[i].type_pair_index; + m_cuda->ligand.pairs.a[i] = m.ligands[0].pairs[i].a; + m_cuda->ligand.pairs.b[i] = m.ligands[0].pairs[i].b; + } + m_cuda->ligand.begin = m.ligands[0].begin; // 0 + m_cuda->ligand.end = m.ligands[0].end; // 29 + ligand &m_ligand = m.ligands[0]; // Only support one ligand + DEBUG_PRINTF("m_ligand.end=%lu, MAX_NUM_OF_ATOMS=%d\n", m_ligand.end, MAX_NUM_OF_ATOMS); + assert(m_ligand.end < MAX_NUM_OF_ATOMS); + + // Store root node + m_cuda->ligand.rigid.atom_range[0][0] = m_ligand.node.begin; + m_cuda->ligand.rigid.atom_range[0][1] = m_ligand.node.end; + for (int i = 0; i < 3; i++) + m_cuda->ligand.rigid.origin[0][i] = m_ligand.node.get_origin()[i]; + for (int i = 0; i < 9; i++) + m_cuda->ligand.rigid.orientation_m[0][i] + = m_ligand.node.get_orientation_m().data[i]; + m_cuda->ligand.rigid.orientation_q[0][0] = m_ligand.node.orientation().R_component_1(); + m_cuda->ligand.rigid.orientation_q[0][1] = m_ligand.node.orientation().R_component_2(); + m_cuda->ligand.rigid.orientation_q[0][2] = m_ligand.node.orientation().R_component_3(); + m_cuda->ligand.rigid.orientation_q[0][3] = m_ligand.node.orientation().R_component_4(); + for (int i = 0; i < 3; i++) { + m_cuda->ligand.rigid.axis[0][i] = 0; + m_cuda->ligand.rigid.relative_axis[0][i] = 0; + m_cuda->ligand.rigid.relative_origin[0][i] = 0; + } + + // Store children nodes (in depth-first order) + DEBUG_PRINTF("store children nodes\n"); + + tmp_struct ts; + for (int i = 0; i < m_ligand.children.size(); i++) { + ts.parent_index = 0; // Start a new branch, whose parent is 0 + ts.store_node(m_ligand.children[i], m_cuda->ligand.rigid); + } + m_cuda->ligand.rigid.num_children = ts.start_index; + + // set children map + DEBUG_PRINTF("set children map\n"); + for (int i = 0; i < MAX_NUM_OF_RIGID; i++) + for (int j = 0; j < MAX_NUM_OF_RIGID; j++) { + m_cuda->ligand.rigid.children_map[i][j] = false; + m_cuda->ligand.rigid.descendant_map[i][j] = 
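+                    // children_map[i][j] marks j as a direct child of i;
+                    // descendant_map is then closed transitively in the loop
+                    // that follows, by propagating each node's descendants up
+                    // through parent[].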
false; + } + + for (int i = MAX_NUM_OF_RIGID - 1; i >= 0; i--) { + if (i > 0) { + m_cuda->ligand.rigid.children_map[m_cuda->ligand.rigid.parent[i]][i] = true; + m_cuda->ligand.rigid.descendant_map[m_cuda->ligand.rigid.parent[i]][i] = true; + } + for (int j = i + 1; j < MAX_NUM_OF_RIGID; j++) { + if (m_cuda->ligand.rigid.descendant_map[i][j]) + m_cuda->ligand.rigid.descendant_map[m_cuda->ligand.rigid.parent[i]][j] + = true; + } + } + m_cuda->m_num_movable_atoms = m.num_movable_atoms(); + + DEBUG_PRINTF("copy m_cuda to gpu, size=%lu\n", sizeof(m_cuda_t)); + checkCUDA(cudaMemcpyAsync(m_cuda_gpu + l, m_cuda, sizeof(m_cuda_t), cudaMemcpyHostToDevice, curr_stream)); + + /* Prepare rand_molec_struc data */ + int lig_torsion_size = tmp.c.ligands[0].torsions.size(); + DEBUG_PRINTF("lig_torsion_size=%d\n", lig_torsion_size); + int flex_torsion_size; + if (tmp.c.flex.size() != 0) + flex_torsion_size = tmp.c.flex[0].torsions.size(); + else + flex_torsion_size = 0; + // std::vector uniform_data; + // uniform_data.resize(thread); + + for (int i = 0; i < threads_per_ligand; ++i) { + if (!local_only) { + tmp.c.randomize(corner1, corner2, + generator); // generate a random structure, + // can move to GPU if necessary + } + for (int j = 0; j < 3; j++) + rand_molec_struc_tmp->position[j] = tmp.c.ligands[0].rigid.position[j]; + assert(lig_torsion_size <= MAX_NUM_OF_LIG_TORSION); + for (int j = 0; j < lig_torsion_size; j++) + rand_molec_struc_tmp->lig_torsion[j] + = tmp.c.ligands[0].torsions[j]; // Only support one ligand + assert(flex_torsion_size <= MAX_NUM_OF_FLEX_TORSION); + for (int j = 0; j < flex_torsion_size; j++) + rand_molec_struc_tmp->flex_torsion[j] + = tmp.c.flex[0].torsions[j]; // Only support one flex + + rand_molec_struc_tmp->orientation[0] + = (float)tmp.c.ligands[0].rigid.orientation.R_component_1(); + rand_molec_struc_tmp->orientation[1] + = (float)tmp.c.ligands[0].rigid.orientation.R_component_2(); + rand_molec_struc_tmp->orientation[2] + = (float)tmp.c.ligands[0].rigid.orientation.R_component_3(); + rand_molec_struc_tmp->orientation[3] + = (float)tmp.c.ligands[0].rigid.orientation.R_component_4(); + + rand_molec_struc_tmp->lig_torsion_size = lig_torsion_size; + + float *rand_molec_struc_gpu_tmp + = rand_molec_struc_gpu + + (l * threads_per_ligand + i) * SIZE_OF_MOLEC_STRUC / sizeof(float); + checkCUDA(cudaMemcpyAsync(rand_molec_struc_gpu_tmp, rand_molec_struc_tmp, + SIZE_OF_MOLEC_STRUC, cudaMemcpyHostToDevice, curr_stream)); + } + + /* Preparing p related data */ + DEBUG_PRINTF("Preaparing p related data\n"); // debug + + // copy pointer instead of data + p_cuda->m_cutoff_sqr = p.m_cutoff_sqr; + p_cuda->factor = p.m_factor; + p_cuda->n = p.m_n; + p_cuda->m_data_size = p.m_data.m_data.size(); + checkCUDA(cudaMemcpyAsync(p_cuda_gpu + l, p_cuda, sizeof(p_cuda_t), cudaMemcpyHostToDevice, curr_stream)); + checkCUDA(cudaMemcpyAsync(&(p_cuda_gpu[l].m_data), &(m_data_list_gpu[l].p_data), + sizeof(p_m_data_cuda_t *), + cudaMemcpyHostToDevice, curr_stream)); // check if fl == float + } + } + + /* Prepare data only concerns rigid receptor */ + + // Preparing igrid related data + DEBUG_PRINTF("Preparing ig related data\n"); // debug + + bool multi_bias = (bias_batch_list.size() == num_of_ligands); + if (multi_bias) { + // multi bias mode + std::cout << "with multi bias "; + + checkCUDA(cudaMalloc(&ig_cuda_gpu, ig_cuda_size * num_of_ligands)); + checkCUDA(cudaMemsetAsync(ig_cuda_gpu, 0, ig_cuda_size * num_of_ligands, curr_stream)); + for (int l = 0; l < num_of_ligands; ++l) { + if (ig.get_atu() == 
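+                // XS atom typing selects the Vina/Vinardo grid cache (the
+                // per-ligand bias is applied through compute_bias); any other
+                // typing is handled as AD4 maps through ad4cache::set_bias.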
atom_type::XS) { + cache ig_tmp(ig.get_gd(), ig.get_slope()); + ig_tmp.m_grids = ig.get_grids(); + // // debug + // if (l == 1){ + // std::cout << "writing original grid map\n"; + // ig_tmp.write(std::string("./ori"), szv(1,0)); + // } + ig_tmp.compute_bias(m_gpu[l], bias_batch_list[l]); + // // debug + // std::cout << "writing bias\n"; + // ig_tmp.write(std::string("./")+std::to_string(l), szv(1,0)); + ig_cuda_ptr->atu = ig.get_atu(); // atu + DEBUG_PRINTF("ig_cuda_ptr->atu=%d\n", ig_cuda_ptr->atu); + ig_cuda_ptr->slope = ig.get_slope(); // slope + std::vector tmp_grids = ig.get_grids(); + int grid_size = tmp_grids.size(); + DEBUG_PRINTF("ig.size()=%d, GRIDS_SIZE=%d, should be 33\n", grid_size, GRIDS_SIZE); + + for (int i = 0; i < grid_size; i++) { + // DEBUG_PRINTF("i=%d\n",i); //debug + for (int j = 0; j < 3; j++) { + ig_cuda_ptr->grids[i].m_init[j] = tmp_grids[i].m_init[j]; + ig_cuda_ptr->grids[i].m_factor[j] = tmp_grids[i].m_factor[j]; + ig_cuda_ptr->grids[i].m_dim_fl_minus_1[j] + = tmp_grids[i].m_dim_fl_minus_1[j]; + ig_cuda_ptr->grids[i].m_factor_inv[j] = tmp_grids[i].m_factor_inv[j]; + } + if (tmp_grids[i].m_data.dim0() != 0) { + ig_cuda_ptr->grids[i].m_i = tmp_grids[i].m_data.dim0(); + assert(MAX_NUM_OF_GRID_MI >= ig_cuda_ptr->grids[i].m_i); + ig_cuda_ptr->grids[i].m_j = tmp_grids[i].m_data.dim1(); + assert(MAX_NUM_OF_GRID_MJ >= ig_cuda_ptr->grids[i].m_j); + ig_cuda_ptr->grids[i].m_k = tmp_grids[i].m_data.dim2(); + assert(MAX_NUM_OF_GRID_MK >= ig_cuda_ptr->grids[i].m_k); + + assert(tmp_grids[i].m_data.m_data.size() + == ig_cuda_ptr->grids[i].m_i * ig_cuda_ptr->grids[i].m_j + * ig_cuda_ptr->grids[i].m_k); + assert(tmp_grids[i].m_data.m_data.size() <= MAX_NUM_OF_GRID_POINT); + memcpy(ig_cuda_ptr->grids[i].m_data, tmp_grids[i].m_data.m_data.data(), + tmp_grids[i].m_data.m_data.size() * sizeof(fl)); + } else { + ig_cuda_ptr->grids[i].m_i = 0; + ig_cuda_ptr->grids[i].m_j = 0; + ig_cuda_ptr->grids[i].m_k = 0; + } + } + } else { + ad4cache ig_tmp(ig.get_slope()); + ig_tmp.m_grids = ig.get_grids(); + // // debug + // if (l == 1){ + // std::cout << "writing original grid map\n"; + // ig_tmp.write(std::string("./ori"), szv(1,0)); + // } + ig_tmp.set_bias(bias_batch_list[l]); + // // debug + // std::cout << "writing bias\n"; + // ig_tmp.write(std::string("./")+std::to_string(l), szv(1,0)); + ig_cuda_ptr->atu = ig.get_atu(); // atu + DEBUG_PRINTF("ig_cuda_ptr->atu=%d\n", ig_cuda_ptr->atu); + ig_cuda_ptr->slope = ig.get_slope(); // slope + std::vector tmp_grids = ig.get_grids(); + int grid_size = tmp_grids.size(); + DEBUG_PRINTF("ig.size()=%d, GRIDS_SIZE=%d, should be 33\n", grid_size, GRIDS_SIZE); + + for (int i = 0; i < grid_size; i++) { + // DEBUG_PRINTF("i=%d\n",i); //debug + for (int j = 0; j < 3; j++) { + ig_cuda_ptr->grids[i].m_init[j] = tmp_grids[i].m_init[j]; + ig_cuda_ptr->grids[i].m_factor[j] = tmp_grids[i].m_factor[j]; + ig_cuda_ptr->grids[i].m_dim_fl_minus_1[j] + = tmp_grids[i].m_dim_fl_minus_1[j]; + ig_cuda_ptr->grids[i].m_factor_inv[j] = tmp_grids[i].m_factor_inv[j]; + } + if (tmp_grids[i].m_data.dim0() != 0) { + ig_cuda_ptr->grids[i].m_i = tmp_grids[i].m_data.dim0(); + assert(MAX_NUM_OF_GRID_MI >= ig_cuda_ptr->grids[i].m_i); + ig_cuda_ptr->grids[i].m_j = tmp_grids[i].m_data.dim1(); + assert(MAX_NUM_OF_GRID_MJ >= ig_cuda_ptr->grids[i].m_j); + ig_cuda_ptr->grids[i].m_k = tmp_grids[i].m_data.dim2(); + assert(MAX_NUM_OF_GRID_MK >= ig_cuda_ptr->grids[i].m_k); + + assert(tmp_grids[i].m_data.m_data.size() + == ig_cuda_ptr->grids[i].m_i * ig_cuda_ptr->grids[i].m_j + * 
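+                        /* the flattened grid data must contain exactly
+                           m_i * m_j * m_k points to be memcpy'd below */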
ig_cuda_ptr->grids[i].m_k); + memcpy(ig_cuda_ptr->grids[i].m_data, tmp_grids[i].m_data.m_data.data(), + tmp_grids[i].m_data.m_data.size() * sizeof(fl)); + } else { + ig_cuda_ptr->grids[i].m_i = 0; + ig_cuda_ptr->grids[i].m_j = 0; + ig_cuda_ptr->grids[i].m_k = 0; + } + } + } + + checkCUDA( + cudaMemcpyAsync(ig_cuda_gpu + l, ig_cuda_ptr, ig_cuda_size, cudaMemcpyHostToDevice, curr_stream)); + } + std::cout << "set\n"; + } else { + ig_cuda_ptr->atu = ig.get_atu(); // atu + DEBUG_PRINTF("ig_cuda_ptr->atu=%d\n", ig_cuda_ptr->atu); + ig_cuda_ptr->slope = ig.get_slope(); // slope + std::vector tmp_grids = ig.get_grids(); + int grid_size = tmp_grids.size(); + DEBUG_PRINTF("ig.size()=%d, GRIDS_SIZE=%d, should be 33\n", grid_size, GRIDS_SIZE); + + for (int i = 0; i < grid_size; i++) { + // DEBUG_PRINTF("i=%d\n",i); //debug + for (int j = 0; j < 3; j++) { + ig_cuda_ptr->grids[i].m_init[j] = tmp_grids[i].m_init[j]; + ig_cuda_ptr->grids[i].m_factor[j] = tmp_grids[i].m_factor[j]; + ig_cuda_ptr->grids[i].m_dim_fl_minus_1[j] = tmp_grids[i].m_dim_fl_minus_1[j]; + ig_cuda_ptr->grids[i].m_factor_inv[j] = tmp_grids[i].m_factor_inv[j]; + } + if (tmp_grids[i].m_data.dim0() != 0) { + ig_cuda_ptr->grids[i].m_i = tmp_grids[i].m_data.dim0(); + assert(MAX_NUM_OF_GRID_MI >= ig_cuda_ptr->grids[i].m_i); + ig_cuda_ptr->grids[i].m_j = tmp_grids[i].m_data.dim1(); + assert(MAX_NUM_OF_GRID_MJ >= ig_cuda_ptr->grids[i].m_j); + ig_cuda_ptr->grids[i].m_k = tmp_grids[i].m_data.dim2(); + assert(MAX_NUM_OF_GRID_MK >= ig_cuda_ptr->grids[i].m_k); + + assert(tmp_grids[i].m_data.m_data.size() + == ig_cuda_ptr->grids[i].m_i * ig_cuda_ptr->grids[i].m_j + * ig_cuda_ptr->grids[i].m_k); + memcpy(ig_cuda_ptr->grids[i].m_data, tmp_grids[i].m_data.m_data.data(), + tmp_grids[i].m_data.m_data.size() * sizeof(fl)); + } else { + ig_cuda_ptr->grids[i].m_i = 0; + ig_cuda_ptr->grids[i].m_j = 0; + ig_cuda_ptr->grids[i].m_k = 0; + } + } + DEBUG_PRINTF("memcpy ig_cuda, ig_cuda_size=%lu\n", ig_cuda_size); + checkCUDA(cudaMalloc(&ig_cuda_gpu, ig_cuda_size)); + checkCUDA(cudaMemcpyAsync(ig_cuda_gpu, ig_cuda_ptr, ig_cuda_size, cudaMemcpyHostToDevice, curr_stream)); + } + + float mutation_amplitude_float = static_cast(mutation_amplitude); + + checkCUDA(cudaMemcpyAsync(hunt_cap_gpu, hunt_cap_float, 3 * sizeof(float), cudaMemcpyHostToDevice, curr_stream)); + + checkCUDA(cudaMemcpyAsync(authentic_v_gpu, authentic_v_float, sizeof(authentic_v_float), + cudaMemcpyHostToDevice, curr_stream)); + + /* Add timing */ + cudaEvent_t start, stop; + checkCUDA(cudaEventCreate(&start)); + checkCUDA(cudaEventCreate(&stop)); + checkCUDA(cudaEventRecord(start, curr_stream)); + + /* Launch kernel */ + DEBUG_PRINTF("launch kernel, global_steps=%d, thread=%d, num_of_ligands=%d\n", global_steps, + thread, num_of_ligands); + + output_type_cuda_t *results_aux; + checkCUDA(cudaMalloc(&results_aux, 5 * thread * sizeof(output_type_cuda_t))); + checkCUDA(cudaMemsetAsync(results_aux, 0, 5 * thread * sizeof(output_type_cuda_t), curr_stream)); + change_cuda_t *change_aux; + checkCUDA(cudaMalloc(&change_aux, 6 * thread * sizeof(change_cuda_t))); + checkCUDA(cudaMemsetAsync(change_aux, 0, 6 * thread * sizeof(change_cuda_t), curr_stream)); + pot_cuda_t *pot_aux; + checkCUDA(cudaMalloc(&pot_aux, thread * sizeof(pot_cuda_t))); + checkCUDA(cudaMemsetAsync(pot_aux, 0, thread * sizeof(pot_cuda_t), curr_stream)); + + kernel<32><<>>(m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu, + quasi_newton_par_max_steps, mutation_amplitude_float, states, seed, + epsilon_fl_float, hunt_cap_gpu, 
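+        /* this launch is issued into curr_stream, so the kernel is ordered
+           after every cudaMemcpyAsync/cudaMemsetAsync queued above, and the
+           cudaStreamSynchronize below fences its completion */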
authentic_v_gpu, results_gpu, + results_aux, change_aux, pot_aux, h_cuda_global, m_cuda_global, + global_steps, num_of_ligands, threads_per_ligand, multi_bias); + + + // Wait for stream operations to complete + checkCUDA(cudaStreamSynchronize(curr_stream)); + + // Device to Host memcpy of precalculated_byatom, copy back data to p_gpu + p_m_data_cuda_t *p_data; + checkCUDA(cudaMallocHost(&p_data, sizeof(p_m_data_cuda_t) * MAX_P_DATA_M_DATA_SIZE)); + memset(p_data, 0, sizeof(p_m_data_cuda_t) * MAX_P_DATA_M_DATA_SIZE); + output_type_cuda_t *results; + checkCUDA(cudaMallocHost(&results, thread * sizeof(output_type_cuda_t))); + memset(results, 0, thread * sizeof(output_type_cuda_t)); + + for (int l = 0; l < num_of_ligands; ++l) { + // copy data to m_data on CPU, then to p_gpu[l] + int pnum = p_gpu[l].m_data.m_data.size(); + checkCUDA(cudaMemcpy(p_data, m_data_list_gpu[l].p_data, sizeof(p_m_data_cuda_t) * pnum, + cudaMemcpyDeviceToHost)); + checkCUDA(cudaFree(m_data_list_gpu[l].p_data)); // free m_cuda pointers in p_cuda + for (int i = 0; i < pnum; ++i) { + memcpy(&p_gpu[l].m_data.m_data[i].fast[0], p_data[i].fast, sizeof(p_data[i].fast)); + memcpy(&p_gpu[l].m_data.m_data[i].smooth[0], p_data[i].smooth, + sizeof(p_data[i].smooth)); + } + } + // DEBUG_PRINTF("energies about the first ligand on GPU:\n"); + // for (int i = 0;i < 20; ++i){ + // DEBUG_PRINTF("precalculated_byatom.m_data.m_data[%d]: (smooth.first, + // smooth.second, fast) ", i); for (int j = 0;j < FAST_SIZE; ++j){ + // DEBUG_PRINTF("(%f, %f, %f) ", + // p_gpu[0].m_data.m_data[i].smooth[j].first, + // p_gpu[0].m_data.m_data[i].smooth[j].second, + // p_gpu[0].m_data.m_data[i].fast[j]); + // } + // DEBUG_PRINTF("\n"); + // } + + /* Timing output */ + + checkCUDA(cudaEventRecord(stop, curr_stream)); + cudaEventSynchronize(stop); + float msecTotal = 0.0f; + cudaEventElapsedTime(&msecTotal, start, stop); + DEBUG_PRINTF("Time spend on GPU is %f ms\n", msecTotal); + + /* Convert result data. 
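+       (the 'results' buffer is pinned via cudaMallocHost, so the blocking
+       cudaMemcpy below is DMA-friendly, but still an extra copy)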
Can be improved by mapping memory + */ + DEBUG_PRINTF("cuda to vina\n"); + + checkCUDA(cudaMemcpy(results, results_gpu, thread * sizeof(output_type_cuda_t), + cudaMemcpyDeviceToHost)); + + std::vector result_vina = cuda_to_vina(results, thread); + + DEBUG_PRINTF("result size=%lu\n", result_vina.size()); + + for (int i = 0; i < thread; ++i) { + add_to_output_container(out_gpu[i / threads_per_ligand], result_vina[i], min_rmsd, + num_saved_mins); + } + for (int i = 0; i < num_of_ligands; ++i) { + DEBUG_PRINTF("output poses size = %lu\n", out_gpu[i].size()); + if (out_gpu[i].size() == 0) continue; + DEBUG_PRINTF("output poses energy from gpu ="); + for (int j = 0; j < out_gpu[i].size(); ++j) DEBUG_PRINTF("%f ", out_gpu[i][j].e); + DEBUG_PRINTF("\n"); + } + + /* Free memory */ + checkCUDA(cudaFree(m_cuda_gpu)); + checkCUDA(cudaFree(ig_cuda_gpu)); + checkCUDA(cudaFree(p_cuda_gpu)); + checkCUDA(cudaFree(rand_molec_struc_gpu)); + checkCUDA(cudaFree(hunt_cap_gpu)); + checkCUDA(cudaFree(authentic_v_gpu)); + checkCUDA(cudaFree(results_gpu)); + checkCUDA(cudaFree(change_aux)); + checkCUDA(cudaFree(results_aux)); + checkCUDA(cudaFree(pot_aux)); + checkCUDA(cudaFree(states)); + checkCUDA(cudaFree(h_cuda_global)); + checkCUDA(cudaFree(m_cuda_global)); + checkCUDA(cudaFreeHost(m_cuda)); + checkCUDA(cudaFreeHost(rand_molec_struc_tmp)); + checkCUDA(cudaFreeHost(ig_cuda_ptr)); + checkCUDA(cudaFreeHost(p_cuda)); + checkCUDA(cudaFreeHost(p_data)); + checkCUDA(cudaFreeHost(results)); + + checkCUDA(cudaEventDestroy(start)); + checkCUDA(cudaEventDestroy(stop)); + checkCUDA(cudaStreamDestroy(curr_stream)); + curr_stream = 0; + + DEBUG_PRINTF("exit monte_carlo\n"); + +} + /* Below based on monte-carlo.cpp */ // #ifdef ENABLE_CUDA @@ -262,15 +860,19 @@ __host__ void monte_carlo::operator()( DEBUG_PRINTF("Allocating CPU memory\n"); // debug m_cuda_t *m_cuda; checkCUDA(cudaMallocHost(&m_cuda, sizeof(m_cuda_t))); + memset(m_cuda, 0, sizeof(m_cuda_t)); output_type_cuda_t *rand_molec_struc_tmp; checkCUDA(cudaMallocHost(&rand_molec_struc_tmp, sizeof(output_type_cuda_t))); + memset(rand_molec_struc_tmp, 0, sizeof(output_type_cuda_t)); ig_cuda_t *ig_cuda_ptr; checkCUDA(cudaMallocHost(&ig_cuda_ptr, sizeof(ig_cuda_t))); + memset(ig_cuda_ptr, 0, sizeof(ig_cuda_t)); p_cuda_t_cpu *p_cuda; checkCUDA(cudaMallocHost(&p_cuda, sizeof(p_cuda_t_cpu))); + memset(p_cuda, 0, sizeof(p_cuda_t_cpu)); /* End CPU allocation */ @@ -288,12 +890,15 @@ __host__ void monte_carlo::operator()( // rand_molec_struc_gpu float *rand_molec_struc_gpu; checkCUDA(cudaMalloc(&rand_molec_struc_gpu, thread * SIZE_OF_MOLEC_STRUC)); + checkCUDA(cudaMemset(rand_molec_struc_gpu, 0, thread * SIZE_OF_MOLEC_STRUC)); + float epsilon_fl_float = static_cast(epsilon_fl); // use cuRand to generate random values on GPU curandStatePhilox4_32_10_t *states; DEBUG_PRINTF("random states size=%lu\n", sizeof(curandStatePhilox4_32_10_t) * thread); checkCUDA(cudaMalloc(&states, sizeof(curandStatePhilox4_32_10_t) * thread)); + checkCUDA(cudaMemset(states, 0, sizeof(curandStatePhilox4_32_10_t) * thread)); // hunt_cap_gpu float *hunt_cap_gpu; @@ -301,14 +906,17 @@ __host__ void monte_carlo::operator()( static_cast(hunt_cap[2])}; checkCUDA(cudaMalloc(&hunt_cap_gpu, 3 * sizeof(float))); + checkCUDA(cudaMemset(hunt_cap_gpu, 0, 3 * sizeof(float))); // Preparing m related data m_cuda_t *m_cuda_gpu; DEBUG_PRINTF("m_cuda_size=%lu", m_cuda_size); checkCUDA(cudaMalloc(&m_cuda_gpu, num_of_ligands * m_cuda_size)); + checkCUDA(cudaMemset(m_cuda_gpu, 0, num_of_ligands * m_cuda_size)); // Preparing 
p related data
 p_cuda_t *p_cuda_gpu;
 checkCUDA(cudaMalloc(&p_cuda_gpu, num_of_ligands * p_cuda_size_gpu));
+    checkCUDA(cudaMemset(p_cuda_gpu, 0, num_of_ligands * p_cuda_size_gpu));
 DEBUG_PRINTF("p_cuda_gpu=%p\n", p_cuda_gpu);
 // Preparing ig related data (cache related data)
 ig_cuda_t *ig_cuda_gpu;
@@ -319,15 +927,19 @@ __host__ void monte_carlo::operator()(
 static_cast<float>(authentic_v[2])};
 
 checkCUDA(cudaMalloc(&authentic_v_gpu, sizeof(authentic_v_float)));
+    checkCUDA(cudaMemset(authentic_v_gpu, 0, sizeof(authentic_v_float)));
 // Preparing result data
 output_type_cuda_t *results_gpu;
 checkCUDA(cudaMalloc(&results_gpu, thread * sizeof(output_type_cuda_t)));
+    checkCUDA(cudaMemset(results_gpu, 0, thread * sizeof(output_type_cuda_t)));
 
 m_cuda_t *m_cuda_global;
 checkCUDA(cudaMalloc(&m_cuda_global, thread * sizeof(m_cuda_t)));
+    checkCUDA(cudaMemset(m_cuda_global, 0, thread * sizeof(m_cuda_t)));
 
 matrix_d *h_cuda_global;
 checkCUDA(cudaMalloc(&h_cuda_global, thread * sizeof(matrix_d)));
+    checkCUDA(cudaMemset(h_cuda_global, 0, thread * sizeof(matrix_d)));
 
 /* End Allocating GPU Memory */
 
@@ -550,6 +1162,7 @@ __host__ void monte_carlo::operator()(
 std::cout << "with multi bias ";
 
 checkCUDA(cudaMalloc(&ig_cuda_gpu, ig_cuda_size * num_of_ligands));
+    checkCUDA(cudaMemset(ig_cuda_gpu, 0, ig_cuda_size * num_of_ligands));
 for (int l = 0; l < num_of_ligands; ++l) {
 if (ig.get_atu() == atom_type::XS) {
 cache ig_tmp(ig.get_gd(), ig.get_slope());
@@ -713,10 +1326,13 @@ __host__ void monte_carlo::operator()(
 
 output_type_cuda_t *results_aux;
 checkCUDA(cudaMalloc(&results_aux, 5 * thread * sizeof(output_type_cuda_t)));
+    checkCUDA(cudaMemset(results_aux, 0, 5 * thread * sizeof(output_type_cuda_t)));
 change_cuda_t *change_aux;
 checkCUDA(cudaMalloc(&change_aux, 6 * thread * sizeof(change_cuda_t)));
+    checkCUDA(cudaMemset(change_aux, 0, 6 * thread * sizeof(change_cuda_t)));
 pot_cuda_t *pot_aux;
 checkCUDA(cudaMalloc(&pot_aux, thread * sizeof(pot_cuda_t)));
+    checkCUDA(cudaMemset(pot_aux, 0, thread * sizeof(pot_cuda_t)));
 
 kernel<32><<>>(m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu,
 quasi_newton_par_max_steps, mutation_amplitude_float, states, seed,
@@ -727,8 +1343,10 @@ __host__ void monte_carlo::operator()(
 // Device to Host memcpy of precalculated_byatom, copy back data to p_gpu
 p_m_data_cuda_t *p_data;
 checkCUDA(cudaMallocHost(&p_data, sizeof(p_m_data_cuda_t) * MAX_P_DATA_M_DATA_SIZE));
+    memset(p_data, 0, sizeof(p_m_data_cuda_t) * MAX_P_DATA_M_DATA_SIZE);
 output_type_cuda_t *results;
 checkCUDA(cudaMallocHost(&results, thread * sizeof(output_type_cuda_t)));
+    memset(results, 0, thread * sizeof(output_type_cuda_t));
 
 for (int l = 0; l < num_of_ligands; ++l) {
 // copy data to m_data on CPU, then to p_gpu[l]
diff --git a/unidock/src/lib/monte_carlo.h b/unidock/src/lib/monte_carlo.h
index feca0b4..8be77a2 100644
--- a/unidock/src/lib/monte_carlo.h
+++ b/unidock/src/lib/monte_carlo.h
@@ -63,6 +63,12 @@ struct monte_carlo {
 const igrid& ig, const vec& corner1, const vec& corner2, rng& generator,
 int verbosity, unsigned long long seed,
 std::vector<std::vector<bias_element> >& bias_batch_list) const;
 
+    void mc_stream(std::vector<model>& m, std::vector<output_container>& out,
+                   std::vector<precalculate_byatom>& p, triangular_matrix_cuda_t* m_data_list_gpu,
+                   const igrid& ig, const vec& corner1, const vec& corner2, rng& generator,
+                   int verbosity, unsigned long long seed,
+                   std::vector<std::vector<bias_element> >& bias_batch_list) const;
+
 std::vector<output_type> cuda_to_vina(output_type_cuda_t* results_p, int thread) const;
 };
 
diff --git a/unidock/src/lib/scoring_function.h b/unidock/src/lib/scoring_function.h
index 
71b9a58..77c119e 100644 --- a/unidock/src/lib/scoring_function.h +++ b/unidock/src/lib/scoring_function.h @@ -37,7 +37,11 @@ enum scoring_function_choice { SF_VINA, SF_AD42, SF_VINARDO }; class ScoringFunction { public: - ScoringFunction() {} + ScoringFunction() { + m_num_potentials = 0; + m_num_conf_independents = 0; + m_sf_choice = 0; + } ScoringFunction(const scoring_function_choice sf_choice, const flv& weights) { switch (sf_choice) { case SF_VINA: { @@ -81,7 +85,6 @@ class ScoringFunction { break; } default: { - std::cout << "INSIDE everything::everything() sfchoice = " << sf_choice << "\n"; VINA_CHECK(false); break; } @@ -90,7 +93,24 @@ class ScoringFunction { m_num_conf_independents = m_conf_independents.size(); m_weights = weights; }; - ~ScoringFunction() {} + void Destroy() + { + for (auto p : m_potentials) + { + delete p; + } + m_potentials.clear(); + m_num_potentials = 0; + for (auto p : m_conf_independents) + { + delete p; + } + m_conf_independents.clear(); + m_num_conf_independents = 0; + } + ~ScoringFunction() { + Destroy(); + } fl eval(atom& a, atom& b, fl r) const { // intentionally not checking for cutoff fl acc = 0; VINA_FOR(i, m_num_potentials) { acc += m_weights[i] * m_potentials[i]->eval(a, b, r); } diff --git a/unidock/src/lib/vina.cpp b/unidock/src/lib/vina.cpp index 878b323..173acb5 100644 --- a/unidock/src/lib/vina.cpp +++ b/unidock/src/lib/vina.cpp @@ -85,10 +85,10 @@ void Vina::set_receptor(const std::string& rigid_name, const std::string& flex_n // CONDITIONS 4, 5, 6, 7 (rigid_name and flex_name are empty strings per default) if (rigid_name.find("pdbqt") || flex_name.find("pdbqt")) { m_receptor - = parse_receptor_pdbqt(rigid_name, flex_name, m_scoring_function.get_atom_typing()); + = parse_receptor_pdbqt(rigid_name, flex_name, m_scoring_function->get_atom_typing()); } else if (rigid_name.find("pdb") && (!rigid_name.find("pdbqt"))) { m_receptor - = parse_receptor_pdb(rigid_name, flex_name, m_scoring_function.get_atom_typing()); + = parse_receptor_pdb(rigid_name, flex_name, m_scoring_function->get_atom_typing()); } m_model = m_receptor; @@ -106,7 +106,7 @@ void Vina::set_ligand_from_string(const std::string& ligand_string) { exit(EXIT_FAILURE); } - atom_type::t atom_typing = m_scoring_function.get_atom_typing(); + atom_type::t atom_typing = m_scoring_function->get_atom_typing(); if (!m_receptor_initialized) { // This situation will happen if we don't need a receptor and we are using affinity maps @@ -122,7 +122,7 @@ void Vina::set_ligand_from_string(const std::string& ligand_string) { m_model.append(parse_ligand_pdbqt_from_string(ligand_string, atom_typing)); // Because we precalculate ligand atoms interactions - precalculate_byatom precalculated_byatom(m_scoring_function, m_model); + precalculate_byatom precalculated_byatom(*m_scoring_function, m_model); // Check that all atom types are in the grid (if initialized) if (m_map_initialized) { @@ -149,7 +149,7 @@ void Vina::set_ligand_from_string(const std::vector& ligand_string) exit(EXIT_FAILURE); } - atom_type::t atom_typing = m_scoring_function.get_atom_typing(); + atom_type::t atom_typing = m_scoring_function->get_atom_typing(); if (!m_receptor_initialized) { // This situation will happen if we don't need a receptor and we are using affinity maps @@ -165,7 +165,7 @@ void Vina::set_ligand_from_string(const std::vector& ligand_string) m_model.append(parse_ligand_pdbqt_from_string(ligand_string[i], atom_typing)); // Because we precalculate ligand atoms interactions - precalculate_byatom 
precalculated_byatom(m_scoring_function, m_model); + precalculate_byatom precalculated_byatom(*m_scoring_function, m_model); // Check that all atom types are in the grid (if initialized) if (m_map_initialized) { @@ -193,7 +193,7 @@ void Vina::set_ligand_from_string_gpu(const std::vector& ligand_str exit(EXIT_FAILURE); } - atom_type::t atom_typing = m_scoring_function.get_atom_typing(); + atom_type::t atom_typing = m_scoring_function->get_atom_typing(); if (!m_receptor_initialized) { // This situation will happen if we don't need a receptor and we are using affinity maps @@ -210,7 +210,7 @@ void Vina::set_ligand_from_string_gpu(const std::vector& ligand_str for (int i = 0; i < ligand_string.size(); ++i) { m_model_gpu[i].append( parse_ligand_pdbqt_from_string_no_failure(ligand_string[i], atom_typing)); - m_precalculated_byatom_gpu[i].init_without_calculation(m_scoring_function, m_model_gpu[i]); + m_precalculated_byatom_gpu[i].init_without_calculation(*m_scoring_function, m_model_gpu[i]); } // calculate common rs data @@ -219,7 +219,7 @@ void Vina::set_ligand_from_string_gpu(const std::vector& ligand_str // Because we precalculate ligand atoms interactions, which should be done in parallel int precalculate_thread_num = ligand_string.size(); - precalculate_parallel(m_data_list_gpu, m_precalculated_byatom_gpu, m_scoring_function, + precalculate_parallel(m_data_list_gpu, m_precalculated_byatom_gpu, *m_scoring_function, m_model_gpu, common_rs, precalculate_thread_num); VINA_RANGE(i, 0, ligand_string.size()) { @@ -250,7 +250,7 @@ void Vina::set_ligand_from_object_gpu(const std::vector& ligands) { exit(EXIT_FAILURE); } - atom_type::t atom_typing = m_scoring_function.get_atom_typing(); + atom_type::t atom_typing = m_scoring_function->get_atom_typing(); if (!m_receptor_initialized) { // This situation will happen if we don't need a receptor and we are using affinity maps @@ -269,7 +269,7 @@ void Vina::set_ligand_from_object_gpu(const std::vector& ligands) { if (multi_bias) { m_model_gpu[i].bias_list = bias_batch_list[i]; } - m_precalculated_byatom_gpu[i].init_without_calculation(m_scoring_function, m_model_gpu[i]); + m_precalculated_byatom_gpu[i].init_without_calculation(*m_scoring_function, m_model_gpu[i]); } // calculate common rs data @@ -278,7 +278,7 @@ void Vina::set_ligand_from_object_gpu(const std::vector& ligands) { // Because we precalculate ligand atoms interactions, which should be done in parallel int precalculate_thread_num = ligands.size(); - precalculate_parallel(m_data_list_gpu, m_precalculated_byatom_gpu, m_scoring_function, + precalculate_parallel(m_data_list_gpu, m_precalculated_byatom_gpu, *m_scoring_function, m_model_gpu, common_rs, precalculate_thread_num); VINA_RANGE(i, 0, ligands.size()) { @@ -309,7 +309,7 @@ void Vina::set_ligand_from_object(const std::vector& ligands) { exit(EXIT_FAILURE); } - atom_type::t atom_typing = m_scoring_function.get_atom_typing(); + atom_type::t atom_typing = m_scoring_function->get_atom_typing(); if (!m_receptor_initialized) { // This situation will happen if we don't need a receptor and we are using affinity maps @@ -325,7 +325,7 @@ void Vina::set_ligand_from_object(const std::vector& ligands) { m_model.append(ligands[i]); // Because we precalculate ligand atoms interactions - precalculate_byatom precalculated_byatom(m_scoring_function, m_model); + precalculate_byatom precalculated_byatom(*m_scoring_function, m_model); // Check that all atom types are in the grid (if initialized) if (m_map_initialized) { @@ -453,9 +453,8 @@ void 
Vina::set_ad4_weights(double weight_ad4_vdw, double weight_ad4_hb, double w } void Vina::set_forcefield() { - ScoringFunction scoring_function(m_sf_choice, m_weights); // Store in Vina object - m_scoring_function = scoring_function; + m_scoring_function = std::make_shared(m_sf_choice, m_weights); } std::vector Vina::grid_dimensions_from_ligand(double buffer_size) { @@ -514,7 +513,7 @@ void Vina::compute_vina_maps(double center_x, double center_y, double center_z, vec center(center_x, center_y, center_z); const fl slope = 1e6; // FIXME: too large? used to be 100 szv atom_types; - atom_type::t atom_typing = m_scoring_function.get_atom_typing(); + atom_type::t atom_typing = m_scoring_function->get_atom_typing(); /* Atom types initialization If a ligand was defined before, we only use those present in the ligand @@ -523,7 +522,7 @@ void Vina::compute_vina_maps(double center_x, double center_y, double center_z, if (m_ligand_initialized) atom_types = m_model.get_movable_atom_types(atom_typing); else - atom_types = m_scoring_function.get_atom_types(); + atom_types = m_scoring_function->get_atom_types(); // Grid dimensions VINA_FOR_IN(i, gd) { @@ -540,7 +539,7 @@ void Vina::compute_vina_maps(double center_x, double center_y, double center_z, } // Initialize the scoring function - precalculate precalculated_sf(m_scoring_function); + precalculate precalculated_sf(*m_scoring_function); // Store it now in Vina object because of non_cache m_precalculated_sf = precalculated_sf; @@ -593,7 +592,7 @@ void Vina::load_maps(std::string maps) { // Check that all the affinity map are present for ligands/flex residues (if initialized // already) if (m_ligand_initialized) { - atom_type::t atom_typing = m_scoring_function.get_atom_typing(); + atom_type::t atom_typing = m_scoring_function->get_atom_typing(); szv atom_types = m_model.get_movable_atom_types(atom_typing); if (m_sf_choice == SF_VINA || m_sf_choice == SF_VINARDO) { @@ -615,12 +614,12 @@ void Vina::write_maps(const std::string& map_prefix, const std::string& gpf_file } szv atom_types; - atom_type::t atom_typing = m_scoring_function.get_atom_typing(); + atom_type::t atom_typing = m_scoring_function->get_atom_typing(); if (m_ligand_initialized) atom_types = m_model.get_movable_atom_types(atom_typing); else - atom_types = m_scoring_function.get_atom_types(); + atom_types = m_scoring_function->get_atom_types(); if (m_sf_choice == SF_VINA || m_sf_choice == SF_VINARDO) { doing("Writing Vina maps", m_verbosity, 0); @@ -1201,7 +1200,7 @@ std::vector Vina::score(double intramolecular_energy) { lig_intra = m_model.evali(m_precalculated_byatom, authentic_v); // [2] ligand_i -- ligand_i intra = flex_grids + intra_pairs + lig_intra; // Total - total = m_scoring_function.conf_independent( + total = m_scoring_function->conf_independent( m_model, inter + intra - intramolecular_energy); // we pass intermolecular energy from the best pose @@ -1220,7 +1219,7 @@ std::vector Vina::score(double intramolecular_energy) { lig_intra = m_model.evali(m_precalculated_byatom, authentic_v); // [2] ligand_i -- ligand_i intra = flex_grids + intra_pairs + lig_intra; // Torsion - conf_independent = m_scoring_function.conf_independent( + conf_independent = m_scoring_function->conf_independent( m_model, 0); // [3] we can pass e=0 because we do not modify the energy like in vina // Total total = inter + conf_independent; // (+ intra - intra) @@ -1282,7 +1281,7 @@ std::vector Vina::score_gpu(int i, double intramolecular_energy) { authentic_v); // [2] ligand_i -- ligand_i intra = flex_grids + 
intra_pairs + lig_intra; // Total - total = m_scoring_function.conf_independent( + total = m_scoring_function->conf_independent( m_model_gpu[i], inter + intra - intramolecular_energy); // we pass intermolecular energy from the best pose @@ -1303,7 +1302,7 @@ std::vector Vina::score_gpu(int i, double intramolecular_energy) { authentic_v); // [2] ligand_i -- ligand_i intra = flex_grids + intra_pairs + lig_intra; // Torsion - conf_independent = m_scoring_function.conf_independent( + conf_independent = m_scoring_function->conf_independent( m_model_gpu[i], 0); // [3] we can pass e=0 because we do not modify the energy like in vina // Total @@ -1656,7 +1655,7 @@ void Vina::global_search(const int exhaustiveness, const int n_poses, const doub void Vina::global_search_gpu(const int exhaustiveness, const int n_poses, const double min_rmsd, const int max_evals, const int max_step, int num_of_ligands, unsigned long long seed, const int refine_step, - const bool local_only) { + const bool local_only, const bool create_new_stream) { // Vina search (Monte-carlo and local optimization) // Check if ff, box and ligand were initialized if (!m_ligand_initialized) { @@ -1717,8 +1716,16 @@ void Vina::global_search_gpu(const int exhaustiveness, const int n_poses, const doing(sstm.str(), m_verbosity, 0); auto start = std::chrono::system_clock::now(); if (m_sf_choice == SF_VINA || m_sf_choice == SF_VINARDO) { - mc(m_model_gpu, poses_gpu, m_precalculated_byatom_gpu, m_data_list_gpu, m_grid, - m_grid.corner1(), m_grid.corner2(), generator, m_verbosity, seed, bias_batch_list); + if (create_new_stream) + { + mc.mc_stream(m_model_gpu, poses_gpu, m_precalculated_byatom_gpu, m_data_list_gpu, m_grid, + m_grid.corner1(), m_grid.corner2(), generator, m_verbosity, seed, bias_batch_list); + } + else + { + mc(m_model_gpu, poses_gpu, m_precalculated_byatom_gpu, m_data_list_gpu, m_grid, + m_grid.corner1(), m_grid.corner2(), generator, m_verbosity, seed, bias_batch_list); + } } else { mc(m_model_gpu, poses_gpu, m_precalculated_byatom_gpu, m_data_list_gpu, m_ad4grid, m_ad4grid.corner1(), m_ad4grid.corner2(), generator, m_verbosity, seed, bias_batch_list); @@ -1864,7 +1871,6 @@ Vina::~Vina() { // scoring function scoring_function_choice m_sf_choice; flv m_weights; - ScoringFunction m_scoring_function; precalculate_byatom m_precalculated_byatom; precalculate m_precalculated_sf; // maps diff --git a/unidock/src/lib/vina.h b/unidock/src/lib/vina.h index cab13d3..2f24f8e 100644 --- a/unidock/src/lib/vina.h +++ b/unidock/src/lib/vina.h @@ -55,6 +55,7 @@ #include "scoring_function.h" #include "precalculate.h" #include "bias.h" +#include #ifdef DEBUG # define DEBUG_PRINTF printf @@ -75,7 +76,6 @@ class Vina { m_no_refine = no_refine; m_progress_callback = progress_callback; gpu = false; - // Look for the number of cpu if (cpu <= 0) { unsigned num_cpus = boost::thread::hardware_concurrency(); @@ -149,7 +149,8 @@ class Vina { const double min_rmsd = 1.0, const int max_evals = 0, const int max_step = 0, int num_of_ligands = 1, unsigned long long seed = 181129, const int refine_step = 5, - const bool local_only = false); + const bool local_only = false, + const bool create_new_stream = false); std::string get_poses(int how_many = 9, double energy_range = 3.0); std::string get_sdf_poses(int how_many = 9, double energy_range = 3.0); std::string get_poses_gpu(int ligand_id, int how_many = 9, double energy_range = 3.0); @@ -189,7 +190,7 @@ class Vina { // scoring function scoring_function_choice m_sf_choice; flv m_weights; - ScoringFunction 
m_scoring_function; + std::shared_ptr m_scoring_function; precalculate_byatom m_precalculated_byatom; precalculate m_precalculated_sf; // gpu scoring function precalculated diff --git a/unidock/src/main/complex_property.h b/unidock/src/main/complex_property.h new file mode 100644 index 0000000..2eab6c2 --- /dev/null +++ b/unidock/src/main/complex_property.h @@ -0,0 +1,111 @@ +/* + + Copyright (c) 2006-2010, The Scripps Research Institute + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Author: Dr. Oleg Trott , + The Olson Lab, + The Scripps Research Institute + +*/ +#pragma once + +#include +#include +#include // ligand paths +#include +#include +#include "vina.h" +#include "utils.h" +#include "scoring_function.h" + +#include +#include +#include +#include + +// Holds properties of each ligand complex + +struct complex_property +{ + double center_x = 0; + double center_y = 0; + double center_z = 0; + double box_x = 0; + double box_y = 0; + double box_z = 0; + std::string protein_name; + std::string ligand_name; + complex_property(double x, double y, double z, + double box_x, double box_y, double box_z, + std::string protein_name, std::string ligand_name): + center_x(x), + center_y(y), + center_z(z), + box_x(box_x), + box_y(box_y), + box_z(box_z), + protein_name(protein_name), + ligand_name(ligand_name){}; + complex_property(){}; +}; + +// Holds properties of all ligand complexs + +struct complex_property_holder +{ + int max_count; + complex_property* m_properties; + complex_property_holder(int N): + max_count(N), + m_properties(nullptr) + { + m_properties = new complex_property[N]; + } + ~complex_property_holder() + { + delete [] m_properties; + m_properties = nullptr; + } + complex_property* get_end() + { + return &m_properties[max_count]; + } + + struct complex_property_iterator + { + using iterator_category = std::forward_iterator_tag; + using difference_type = std::ptrdiff_t; + using value_type = complex_property; + using pointer = complex_property*; + using reference = complex_property&; + + complex_property_iterator(pointer ptr) : m_ptr(ptr) {} + reference operator*() const { return *m_ptr; } + pointer operator->() { return m_ptr; } + + // Prefix increment + complex_property_iterator& operator++() { m_ptr++; return *this; } + + // Postfix increment + complex_property_iterator operator++(int) { complex_property_iterator tmp = *this; ++(*this); return tmp; } + + friend bool operator== (const complex_property_iterator& a, const complex_property_iterator& b) { return a.m_ptr == b.m_ptr; }; + friend bool operator!= (const complex_property_iterator& a, const complex_property_iterator& b) { return a.m_ptr != b.m_ptr; }; + private: + pointer m_ptr; + }; + complex_property_iterator begin() { return complex_property_iterator(&m_properties[0]); } + complex_property_iterator end() { return complex_property_iterator(get_end()); } +}; \ No newline at end of file diff --git a/unidock/src/main/main.cpp b/unidock/src/main/main.cpp index df45d48..f19a2ab 100644 --- a/unidock/src/main/main.cpp +++ 
b/unidock/src/main/main.cpp @@ -30,6 +30,7 @@ #include #include +#include "simulation_container.h" struct usage_error : public std::runtime_error { usage_error(const std::string& message) : std::runtime_error(message) {} @@ -163,6 +164,7 @@ bug reporting, license agreements, and more information. \n"; std::string out_maps; std::vector ligand_names; std::string ligand_index; // path to a text file, containing paths to ligands files + int paired_batch_size = 0; std::vector batch_ligand_names; std::vector gpu_batch_ligand_names; // std::vector gpu_batch_ligand_names_sdf; @@ -242,8 +244,10 @@ bug reporting, license agreements, and more information. \n"; "flex", value(&flex_name), "flexible side chains, if any (PDBQT or PDB)")( "ligand", value >(&ligand_names)->multitoken(), "ligand (PDBQT)")("ligand_index", value(&ligand_index), - "file containing paths to ligands (PDBQT or SDF")( - "batch", value >(&batch_ligand_names)->multitoken(), + "file containing paths to ligands (PDBQT or SDF") + ("paired_batch_size",value(&paired_batch_size), + "If > 0, uses batching for one-ligand-one-protein docking, with json config in ligand_index following paired_batching.schema.json") + ("batch", value >(&batch_ligand_names)->multitoken(), "batch ligand (PDBQT)")( "gpu_batch", value >(&gpu_batch_ligand_names)->multitoken(), "gpu batch ligand (PDBQT or SDF)") @@ -474,6 +478,45 @@ bug reporting, license agreements, and more information. \n"; max_step = 40; } + // Use multiple workers for 1:1 docking, and exit + if (paired_batch_size > 0) + { + if (0 == vm.count("ligand_index")){ + std::cout << "ERROR: Paired batch size set, but no config json specified via --ligand_index\n"; + return -1; + } + if (0 == vm.count("size_x") || 0 == vm.count("size_y") || 0 == vm.count("size_z")){ + std::cout << "WARN: Paired batch size set, but size_x/size_y/size_z not specified, using 25\n"; + size_x = size_y = size_z = 25; + } + if (0 == vm.count("dir")) { + std::cout << "ERROR: Need to specify an output directory for batch mode.\n"; + return -1; + } + + std::cout << "Entering paired batch mode\n"; + + std::vector box_size = {size_x, size_y, size_z}; + simulation_container sc(seed, num_modes, refine_step, out_dir, + ligand_index, paired_batch_size, box_size, local_only, max_step, verbosity, exhaustiveness); + + int res = sc.prime(); + if (res <= 0) + { + std::cout << "Error priming [" << res << "]\n"; + return res; + } + + auto start = std::chrono::steady_clock::now(); + + int err = sc.launch(); + + auto end = std::chrono::steady_clock::now(); + auto milliseconds = std::chrono::duration_cast(end - start).count(); + std::cout << "Completed Batched Operations in " << milliseconds << " mS with err = " << err << "\n"; + return err; + } + if (sf_name.compare("vina") == 0 || sf_name.compare("vinardo") == 0) { if (!vm.count("receptor") && !vm.count("maps")) { std::cerr << desc_simple @@ -631,7 +674,7 @@ bug reporting, license agreements, and more information. \n"; std::vector ligands; VINA_FOR_IN(i, ligand_names) { ligands.emplace_back(parse_ligand_from_file_no_failure( - ligand_names[i], v.m_scoring_function.get_atom_typing(), keep_H)); + ligand_names[i], v.m_scoring_function->get_atom_typing(), keep_H)); } v.set_ligand_from_object(ligands); @@ -700,7 +743,7 @@ bug reporting, license agreements, and more information. 
\n"; VINA_FOR_IN(i, ligand_names) { std::vector ligands; ligands.emplace_back(parse_ligand_from_file_no_failure( - ligand_names[i], v.m_scoring_function.get_atom_typing(), keep_H)); + ligand_names[i], v.m_scoring_function->get_atom_typing(), keep_H)); Vina v1(v); v1.set_ligand_from_object(ligands); std::vector energies; @@ -747,7 +790,7 @@ bug reporting, license agreements, and more information. \n"; ++ligand_count) { auto& ligand = ligand_names[ligand_count]; auto l = parse_ligand_from_file_no_failure( - ligand, v.m_scoring_function.get_atom_typing(), keep_H); + ligand, v.m_scoring_function->get_atom_typing(), keep_H); #pragma omp critical all_ligands.emplace_back(std::make_pair(ligand, l)); } diff --git a/unidock/src/main/paired_batching.schema.json b/unidock/src/main/paired_batching.schema.json new file mode 100644 index 0000000..e66054c --- /dev/null +++ b/unidock/src/main/paired_batching.schema.json @@ -0,0 +1,28 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "properties": { + "batch": { + "type": "object", + "properties": { + "protein": { + "type": "string" + }, + "ligand": { + "type": "string" + }, + "ligand_config": { + "type": "string" + } + }, + "required": [ + "protein", + "ligand", + "ligand_config" + ] + } + }, + "required": [ + "batch" + ] + } \ No newline at end of file diff --git a/unidock/src/main/simulation_container.h b/unidock/src/main/simulation_container.h new file mode 100644 index 0000000..00bad90 --- /dev/null +++ b/unidock/src/main/simulation_container.h @@ -0,0 +1,517 @@ +/* + + Copyright (c) 2006-2010, The Scripps Research Institute + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + Author: Dr. 
diff --git a/unidock/src/main/simulation_container.h b/unidock/src/main/simulation_container.h
new file mode 100644
index 0000000..00bad90
--- /dev/null
+++ b/unidock/src/main/simulation_container.h
@@ -0,0 +1,517 @@
+/*
+
+   Copyright (c) 2006-2010, The Scripps Research Institute
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   Author: Dr. Oleg Trott <ot14@columbia.edu>,
+           The Olson Lab,
+           The Scripps Research Institute
+
+*/
+#pragma once
+
+#include <string>
+#include <vector>
+#include <boost/filesystem/path.hpp>  // ligand paths
+#include <fstream>
+#include <iostream>
+
+#include "vina.h"
+#include "utils.h"
+#include "scoring_function.h"
+
+#include <random>
+#include <thread>
+#include <atomic>
+#include <boost/filesystem.hpp>
+#include <boost/property_tree/ptree.hpp>
+
+#include "vina_cuda_worker.h"
+
+#include <boost/property_tree/json_parser.hpp>
+#include <chrono>
+
+// Information about the current simulation
+struct simulation_container {
+    std::string m_work_dir;
+    std::string m_input_path;
+    std::string m_out_phrase;
+    int m_batch_size;
+    std::vector<double> m_box_size;
+    bool m_local_only;
+    int m_max_limits = 5000;
+    int m_max_global_steps;
+    int m_verbosity;
+    int m_exhaustiveness = 512;
+    bool m_isGPU;
+    int m_seed = 5;
+    int m_num_modes = 9;
+    int m_refine_steps = 3;
+
+    std::vector<std::string> m_complex_names;
+    std::string m_config_json_path;
+    std::vector<boost::filesystem::directory_entry> m_ligand_paths;
+    std::vector<boost::filesystem::directory_entry> m_ligand_config_paths;
+    std::vector<boost::filesystem::directory_entry> m_protein_paths;
+    complex_property_holder* m_ptr_complex_property_holder = nullptr;
+    int m_successful_property_count;
+
+    simulation_container(int seed, int num_modes, int refine_steps, std::string out_dir,
+                         std::string config_json_path, int paired_batch_size,
+                         std::vector<double> box_size_xyz, bool local_only, int max_step,
+                         int verbosity, int exh)
+        : m_work_dir(out_dir),
+          m_config_json_path(config_json_path),
+          m_batch_size(paired_batch_size),
+          m_box_size(box_size_xyz),
+          m_local_only(local_only),
+          m_max_global_steps(max_step),
+          m_verbosity(verbosity),
+          m_exhaustiveness(exh),
+          m_isGPU(true),
+          m_seed(seed),
+          m_num_modes(num_modes),
+          m_refine_steps(refine_steps),
+          m_successful_property_count(0) {
+        // m_out_phrase = util_random_string(5);
+    }
+
+    ~simulation_container() {
+        delete m_ptr_complex_property_holder;
+        m_ptr_complex_property_holder = nullptr;
+    }
+
+    std::string util_random_string(std::size_t length) {
+        const std::string CHARACTERS = "iamafunnydogthatlaughsindeterministically";
+
+        std::random_device random_device;
+        std::mt19937 generator(random_device());
+        std::uniform_int_distribution<std::size_t> distribution(0, CHARACTERS.size() - 1);
+
+        std::string rstring;
+        for (std::size_t i = 0; i < length; ++i) {
+            rstring += CHARACTERS[distribution(generator)];
+        }
+        return rstring;
+    }
+
+    void add_rank_combinations_from_json(std::string filename) {
+        int curr_entry_size = 0;
+        boost::property_tree::ptree tree_root;
+        boost::property_tree::read_json(filename, tree_root);
+
+        for (const auto& complex_entry : tree_root) {
+            m_complex_names.emplace_back(complex_entry.first);
+
+            for (const auto& field : complex_entry.second) {
+                if (field.first == "ligand") {
+                    m_ligand_paths.emplace_back(field.second.get_value<std::string>());
+                }
+                if (field.first == "protein") {
+                    m_protein_paths.emplace_back(field.second.get_value<std::string>());
+                }
+                if (field.first == "ligand_config") {
+                    m_ligand_config_paths.emplace_back(field.second.get_value<std::string>());
+                }
+            }
+            curr_entry_size++;
+            if (curr_entry_size >= m_max_limits) {
+                std::cout << "Limiting number of ranked samples to max limit " << m_max_limits
+                          << "\n";
+                break;
+            }
+        }
+    }
+
+    std::string trim_eol(std::string line) {
+        std::string newString;
+        for (char ch : line) {
+            if (ch == '\n' || ch == '\r') continue;
+            newString += ch;
+        }
+        return newString;
+    }
+
+    int fill_config_from_json(complex_property& cp, std::string path, std::string protein_name,
+                              std::string ligand_name) {
+        boost::property_tree::ptree tree_root;
+        boost::property_tree::read_json(path, tree_root);
+
+        // Default to the provided box; update from the config file if present
+        cp.box_x = m_box_size[0];
+        cp.box_y = m_box_size[1];
+        cp.box_z = m_box_size[2];
+
+        try {
+            cp.center_x = tree_root.get<double>("center_x");
+            cp.center_y = tree_root.get<double>("center_y");
+            cp.center_z = tree_root.get<double>("center_z");
+            cp.box_x = tree_root.get<double>("size_x", cp.box_x);
+            cp.box_y = tree_root.get<double>("size_y", cp.box_y);
+            cp.box_z = tree_root.get<double>("size_z", cp.box_z);
+        } catch (...) {
+            std::cout << "Error parsing config json " << path << "\n";
+            return -1;
+        }
+
+        cp.protein_name = protein_name;
+        cp.ligand_name = ligand_name;
+        return 0;
+    }
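+    // For reference, the grid JSON read by fill_config_from_json above is shaped
+    // like the following (hypothetical example; values are placeholders, and the
+    // size_* keys may be omitted to keep the box passed on the command line):
+    //
+    //   {
+    //       "center_x": 12.5,
+    //       "center_y": -3.1,
+    //       "center_z": 40.0,
+    //       "size_x": 25.0,
+    //       "size_y": 25.0,
+    //       "size_z": 25.0
+    //   }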
+    int fill_config(complex_property& cp, std::string path, std::string protein_name,
+                    std::string ligand_name) {
+        // Default to the provided box; update from the config file if present
+        cp.box_x = m_box_size[0];
+        cp.box_y = m_box_size[1];
+        cp.box_z = m_box_size[2];
+
+        if (path.empty()) {
+            return -1;
+        }
+
+        std::ifstream ifs(path);
+        std::string line;
+        double vals[6] = {0};  // center_x/y/z and, optionally, size_x/y/z
+        int id = 0;
+        while (std::getline(ifs, line) && id < 6) {
+            std::string trimmed(trim_eol(line));
+            std::size_t pos = trimmed.find('=');
+            if (pos == std::string::npos) continue;  // skip lines without key=value
+            vals[id] = std::stod(trimmed.substr(pos + 1));
+            id++;
+        }
+        ifs.close();
+
+        if (id < 3) {
+            std::cout << "Error parsing config file " << path << "\n";
+            return -1;
+        }
+
+        cp.center_x = vals[0];
+        cp.center_y = vals[1];
+        cp.center_z = vals[2];
+
+        if (id >= 6) {
+            cp.box_x = vals[3];
+            cp.box_y = vals[4];
+            cp.box_z = vals[5];
+        }
+
+        cp.protein_name = protein_name;
+        cp.ligand_name = ligand_name;
+        return 0;
+    }
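+    // The plain-text variant read by fill_config above is one key=value per line,
+    // centers first, optionally followed by the box sizes (hypothetical example):
+    //
+    //   center_x=12.5
+    //   center_y=-3.1
+    //   center_z=40.0
+    //   size_x=25.0
+    //   size_y=25.0
+    //   size_z=25.0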
+    void add_rank_combinations(std::string effective_path) {
+        int curr_entry_size = 0;
+        // search for <complex>_rank*.pdbqt for ranked ligands
+        for (boost::filesystem::directory_entry& entry :
+             boost::filesystem::recursive_directory_iterator(effective_path)) {
+            std::size_t pos_rank = entry.path().string().find("_rank");
+            std::size_t pos_config = entry.path().stem().string().find("_config");
+            std::size_t pos_pdbqt = entry.path().extension().string().find(".pdbqt");
+
+            if (pos_rank != std::string::npos && pos_pdbqt != std::string::npos
+                && pos_config == std::string::npos) {
+                std::size_t pos_complex = entry.path().stem().string().find("_rank");
+                std::string complex = entry.path().stem().string().substr(0, pos_complex);
+                m_complex_names.emplace_back(complex);
+                m_ligand_paths.emplace_back(entry.path());
+                m_protein_paths.emplace_back(
+                    entry.path().parent_path()
+                    / boost::filesystem::path(complex + "_protein.pdbqt"));
+                m_ligand_config_paths.emplace_back(
+                    entry.path().parent_path()
+                    / boost::filesystem::path(entry.path().stem().string() + "_config.txt"));
+
+                curr_entry_size++;
+                if (curr_entry_size >= m_max_limits) {
+                    std::cout << "Limiting number of ranked samples to max limit "
+                              << m_max_limits << "\n";
+                    break;
+                }
+            }
+        }
+    }
+
+    void add_combinations(std::string effective_path) {
+        int curr_entry_size = 0;
+        for (boost::filesystem::directory_entry& entry :
+             boost::filesystem::recursive_directory_iterator(effective_path)) {
+            std::size_t pos = entry.path().string().find("_protein.pdbqt");
+
+            if (pos != std::string::npos) {
+                std::size_t pos_complex = entry.path().stem().string().find("_protein");
+                std::string complex = entry.path().stem().string().substr(0, pos_complex);
+
+                m_complex_names.emplace_back(complex);
+                m_protein_paths.emplace_back(entry.path());
+                m_ligand_paths.emplace_back(
+                    entry.path().parent_path()
+                    / boost::filesystem::path(complex + "_ligand.pdbqt"));
+                m_ligand_config_paths.emplace_back(
+                    entry.path().parent_path()
+                    / boost::filesystem::path(complex + "_ligand_config.txt"));
+
+                curr_entry_size++;
+                if (curr_entry_size >= m_max_limits) {
+                    std::cout << "Limiting number of samples to max limit " << m_max_limits
+                              << "\n";
+                    break;
+                }
+            }
+        }
+    }
+
+    void add_combinations(const std::vector<std::string>& ligand_names) {
+        int curr_entry_size = 0;
+        for (const std::string& path : ligand_names) {
+            std::size_t pos = path.find("_ligand.pdbqt");
+
+            if (pos != std::string::npos) {
+                std::size_t pos_complex = path.find("_ligand");
+                std::string complex = path.substr(0, pos_complex);
+
+                m_complex_names.emplace_back(complex);
+                m_protein_paths.emplace_back(complex + "_protein.pdbqt");
+                m_ligand_paths.emplace_back(path);
+                m_ligand_config_paths.emplace_back(complex + "_ligand_config.txt");
+
+                curr_entry_size++;
+                if (curr_entry_size >= m_max_limits) {
+                    std::cout << "Limiting number of samples to max limit " << m_max_limits
+                              << "\n";
+                    break;
+                }
+            }
+        }
+    }
+
+    int prime() {
+        if (m_config_json_path.empty()) {
+            std::cout << "Found nothing to prime.\n";
+            return -1;
+        }
+        try {
+            add_rank_combinations_from_json(m_config_json_path);
+        } catch (const std::exception& e) {
+            std::cerr << "Error reading config json: " << e.what() << '\n';
+            return -1;
+        }
+
+        std::cout << "Found " << m_complex_names.size() << " complexes to be primed.\n";
+
+        m_ptr_complex_property_holder = new complex_property_holder(m_complex_names.size());
+
+        for (std::size_t id = 0; id < m_complex_names.size(); id++) {
+            int success_filled = -1;
+            complex_property& cp
+                = m_ptr_complex_property_holder->m_properties[m_successful_property_count];
+
+            if (m_ligand_config_paths[id].path().extension() == ".json") {
+                try {
+                    success_filled
+                        = fill_config_from_json(cp, m_ligand_config_paths[id].path().string(),
+                                                m_protein_paths[id].path().string(),
+                                                m_ligand_paths[id].path().string());
+                } catch (const std::exception& e) {
+                    std::cout << "Error reading config json " << e.what() << "\n";
+                    success_filled = -1;
+                }
+            } else {
+                success_filled = fill_config(cp, m_ligand_config_paths[id].path().string(),
+                                             m_protein_paths[id].path().string(),
+                                             m_ligand_paths[id].path().string());
+            }
+
+            if (0 == success_filled) {
+                m_successful_property_count++;
+            }
+        }
+        std::cout << "Filled " << m_successful_property_count << " properties successfully.\n";
+        return m_successful_property_count;
+    }
+    // Launch simulations
+    int launch() {
+        if (0 == m_successful_property_count) {
+            std::cout << "No properties primed, nothing to launch.\n";
+            return -1;
+        }
+        int batches = m_successful_property_count / m_batch_size;
+        std::cout << "Parameters: exh = " << m_exhaustiveness
+                  << ", box[0] = " << m_box_size[0]
+                  << ", max_eval_steps global = " << m_max_global_steps
+                  << ", num_modes = " << m_num_modes
+                  << ", refine_steps = " << m_refine_steps << "\n";
+
+        std::cout << "To do [" << batches << "] full batches\n";
+        std::cout << "Batched output to " << m_work_dir << "\n";
+
+        if (!boost::filesystem::exists(m_work_dir)) {
+            std::cout << "Creating work dir " << m_work_dir << "\n";
+            boost::filesystem::create_directory(m_work_dir);
+        }
+
+        std::vector<complex_property> cp;
+        int total_err_count = 0;
+        for (int i = 0; i < batches; i++) {
+            for (int curr = 0; curr < m_batch_size; curr++) {
+                int index = i * m_batch_size + curr;
+                cp.emplace_back(m_ptr_complex_property_holder->m_properties[index]);
+                std::cout << "Processing "
+                          << m_ptr_complex_property_holder->m_properties[index].ligand_name
+                          << "\n";
+            }
+            // run
+            int err_count
+                = batch_dock_with_worker(cp, m_local_only, m_work_dir, m_input_path, m_out_phrase);
+            std::cout << "Batch [" << i + 1 << "/" << batches << "] completed with " << err_count
+                      << " errors.\n";
+            total_err_count += err_count;
+            cp.clear();
+        }
+        // Remaining, if any
+        int remaining = m_successful_property_count - batches * m_batch_size;
+        if (remaining > 0) {
+            for (int i = 0; i < remaining; i++) {
+                int index = i + batches * m_batch_size;
+                cp.emplace_back(m_ptr_complex_property_holder->m_properties[index]);
+            }
+            int err_count
+                = batch_dock_with_worker(cp, m_local_only, m_work_dir, m_input_path, m_out_phrase);
+            total_err_count += err_count;
+            cp.clear();
+        }
+        std::cout << "Remaining [" << remaining << "/" << m_successful_property_count
+                  << "] completed.\n" << total_err_count << " total errors\n";
+
+        return total_err_count;
+    }
+
+    struct err_counter {
+        std::atomic<int> err_count{0};
+        void update() { err_count++; }
+        int get() { return err_count; }
+        void clear() { err_count = 0; }
+    };
+    err_counter counter;
+
+    // Launches a batch of vina_cuda_worker instances in separate threads to
+    // perform 1:1 docking. Each worker drives its own CUDA stream, so the
+    // pairs within a batch run concurrently on the GPU.
+    int batch_dock_with_worker(std::vector<complex_property> props, bool local_only,
+                               std::string workdir, std::string input_dir,
+                               std::string out_phrase) {
+        std::vector<std::thread> worker_threads;
+        counter.clear();
+
+        for (std::size_t i = 0; i < props.size(); i++) {
+            worker_threads.emplace_back([=]() {
+                vina_cuda_worker vcw(
+                    m_seed, m_num_modes, m_refine_steps, props[i].center_x, props[i].center_y,
+                    props[i].center_z, props[i].protein_name, props[i].ligand_name, local_only,
+                    std::vector<double>{props[i].box_x, props[i].box_y, props[i].box_z},
+                    m_max_global_steps, m_verbosity, m_exhaustiveness, workdir, input_dir,
+                    out_phrase);
+                try {
+                    int ret = vcw.launch();
+                    if (ret) {
+                        counter.update();
+                    }
+                } catch (const std::exception& e) {
+                    std::cerr << "Exception processing " << props[i].ligand_name << ", "
+                              << e.what() << "\n";
+                    counter.update();
+                }
+            });
+        }
+        for (auto& t : worker_threads) {
+            t.join();
+        }
+        return counter.get();
+    }
+};  // simulation_container
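Aside: `batch_dock_with_worker` relies on one host thread per protein:ligand pair, each driving its own CUDA stream (see `mc_stream`). A self-contained sketch of that concurrency pattern, independent of the Uni-Dock types; the kernel and sizes are illustrative only:

```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <thread>
#include <vector>

__global__ void scale(float* data, float factor) { data[threadIdx.x] *= factor; }

int main() {
    const int n_workers = 4;
    std::vector<std::thread> workers;
    for (int i = 0; i < n_workers; ++i) {
        workers.emplace_back([i]() {
            cudaStream_t stream;
            cudaStreamCreate(&stream);  // private stream, as each vina_cuda_worker has
            float* d_data = nullptr;
            cudaMalloc(&d_data, 32 * sizeof(float));
            cudaMemsetAsync(d_data, 0, 32 * sizeof(float), stream);
            scale<<<1, 32, 0, stream>>>(d_data, 2.0f);  // ordered within this stream only
            cudaStreamSynchronize(stream);  // waits for this worker, not the whole device
            cudaFree(d_data);
            cudaStreamDestroy(stream);
            std::printf("worker %d done\n", i);
        });
    }
    for (auto& t : workers) t.join();
    return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}
```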
diff --git a/unidock/src/main/vina_cuda_worker.h b/unidock/src/main/vina_cuda_worker.h
new file mode 100644
index 0000000..d2151d7
--- /dev/null
+++ b/unidock/src/main/vina_cuda_worker.h
@@ -0,0 +1,191 @@
+/*
+
+   Copyright (c) 2006-2010, The Scripps Research Institute
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+   Author: Dr. Oleg Trott <ot14@columbia.edu>,
+           The Olson Lab,
+           The Scripps Research Institute
+
+*/
+#pragma once
+
+#include <string>
+#include <vector>
+#include <boost/filesystem/path.hpp>  // ligand paths
+#include <boost/filesystem.hpp>
+#include <iostream>
+
+//#define DEBUG
+
+#include "vina.h"
+#include "utils.h"
+#include "scoring_function.h"
+
+#include "complex_property.h"
+
+// Use the vina scoring function, and accelerate operations with CUDA streams
+class vina_cuda_worker : public Vina {
+    int exhaustiveness = 512;
+    int num_modes = 1;
+    int min_rmsd = 0;
+    int max_evals = 0;
+    int max_step = 60;
+    int seed = 5;
+    int refine_step = 3;
+    bool local_only = false;
+    double energy_range = 3.0;
+    bool keep_H = true;
+    std::string sf_name = "vina";
+    int cpu = 0;
+    bool no_refine = false;
+    double size_x = 25;
+    double size_y = 25;
+    double size_z = 25;
+    double grid_spacing = 0.375;
+    bool force_even_voxels = false;
+    // vina weights
+    double weight_gauss1 = -0.035579;
+    double weight_gauss2 = -0.005156;
+    double weight_repulsion = 0.840245;
+    double weight_hydrophobic = -0.035069;
+    double weight_hydrogen = -0.587439;
+    double weight_rot = 0.05846;
+    // macrocycle closure
+    double weight_glue = 50.000000;  // linear attraction
+
+    std::vector<std::string> gpu_out_name;
+    std::string workdir;
+    std::string input_dir;
+    std::string out_dir;
+    std::vector<model> batch_ligands;
+    double center_x;
+    double center_y;
+    double center_z;
+    std::string protein_name;
+    std::string ligand_name;
+
+    void init(std::string out_phrase) {
+        out_dir = workdir + "/" + out_phrase;
+        if (!boost::filesystem::exists(out_dir)) {
+            boost::filesystem::create_directory(out_dir);
+        }
+        m_seed = seed;
+    }
+
+public:
+    vina_cuda_worker(int seed, int num_modes, int refine_steps, double center_x, double center_y,
+                     double center_z, std::string protein_name, std::string ligand_name,
+                     bool local_only, std::vector<double> box_size_xyz, int max_step,
+                     int verbosity, int exh, std::string workdir, std::string input_dir,
+                     std::string out_phrase)
+        : Vina{"vina", 0, seed, verbosity, false, NULL},
+          exhaustiveness(exh),
+          num_modes(num_modes),
+          max_step(max_step),
+          seed(seed),
+          refine_step(refine_steps),
+          local_only(local_only),
+          size_x(box_size_xyz[0]),
+          size_y(box_size_xyz[1]),
+          size_z(box_size_xyz[2]),
+          workdir(workdir),
+          input_dir(input_dir),
+          center_x(center_x),
+          center_y(center_y),
+          center_z(center_z),
+          protein_name(protein_name),
+          ligand_name(ligand_name) {
+        init(out_phrase);
+    }
+
+    ~vina_cuda_worker() {}
+
+    // Non-copyable: each worker owns its own output directory and GPU state
+    vina_cuda_worker(const vina_cuda_worker&) = delete;
+    vina_cuda_worker& operator=(const vina_cuda_worker&) = delete;
+
+    // Performs CUDA stream based docking of 1 ligand and 1 protein
+    int launch() {
+        multi_bias = false;
+        bias_batch_list.clear();
+
+        set_vina_weights(weight_gauss1, weight_gauss2, weight_repulsion, weight_hydrophobic,
+                         weight_hydrogen, weight_glue, weight_rot);
+        std::string flex;
+        std::string rigid(protein_name);
+
+        if (!boost::filesystem::exists(ligand_name)) {
+            std::cout << "Input ligand file does not exist (" << ligand_name << ")\n";
+            return -1;
+        }
+        if (!boost::filesystem::exists(rigid)) {
+            std::cout << "Input (rigid) protein file does not exist (" << rigid << ")\n";
+            return -1;
+        }
+
+        set_receptor(rigid, flex);
+
+        enable_gpu();
+        compute_vina_maps(center_x, center_y, center_z, size_x, size_y, size_z, grid_spacing,
+                          force_even_voxels);
+
+        auto parsed_ligand = parse_ligand_from_file_no_failure(
+            ligand_name, m_scoring_function->get_atom_typing(), keep_H);
+        batch_ligands.emplace_back(parsed_ligand);
+
+        set_ligand_from_object_gpu(batch_ligands);
+
+        bool create_new_stream = true;
+        global_search_gpu(exhaustiveness, num_modes, min_rmsd, max_evals, max_step, 1,
+                          (unsigned long long)seed, refine_step, local_only, create_new_stream);
+
+        gpu_out_name.push_back(default_output(get_filename(ligand_name), out_dir));
+        write_poses_gpu(gpu_out_name, num_modes, energy_range);
+
+        return 0;
+    }
+};
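Aside: taken on its own, a single worker can be driven as in the sketch below. The file names and numeric values are placeholders, not taken from the patch; in the patch itself this construction happens inside the lambda of `batch_dock_with_worker`:

```cpp
#include "vina_cuda_worker.h"

int dock_one_pair() {
    vina_cuda_worker vcw(/*seed=*/5, /*num_modes=*/9, /*refine_steps=*/3,
                         /*center_x=*/12.5, /*center_y=*/-3.1, /*center_z=*/40.0,
                         "7LCU_receptor.pdbqt", "7LCU_ligand_prep.sdf",
                         /*local_only=*/false, /*box_size_xyz=*/{25.0, 25.0, 25.0},
                         /*max_step=*/80, /*verbosity=*/1, /*exhaustiveness=*/1024,
                         "workdir", "input_dir", "out_phrase");
    return vcw.launch();  // 0 on success; -1 if an input file is missing
}
```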
diff --git a/unidock/src/rocm/main.cu.hip b/unidock/src/rocm/main.cu.hip
index dbf6464..60482fa 100644
--- a/unidock/src/rocm/main.cu.hip
+++ b/unidock/src/rocm/main.cu.hip
@@ -606,7 +606,7 @@ Thank you!\n";
         {
             auto& ligand=ligand_names[ligand_count];
             auto l = parse_ligand_pdbqt_from_file_no_failure(
-                ligand, v.m_scoring_function.get_atom_typing());
+                ligand, v.m_scoring_function->get_atom_typing());
 #pragma omp critical
             all_ligands.emplace_back(std::make_pair(ligand,l));
         }