From a3048845e2f76ff70f51a3eeac1a432dba8af13b Mon Sep 17 00:00:00 2001
From: Zihua Wu
Date: Mon, 9 Oct 2023 09:53:53 +0800
Subject: [PATCH 1/5] Warp-level parallelization

---
 .gitignore                                    |    4 +
 ...constants_gpu.h => atom_constants_gpu.cuh} |    0
 unidock/src/cuda/common.cuh                   |  778 ++++++++++
 unidock/src/cuda/kernel.h                     |   30 +-
 unidock/src/cuda/monte_carlo.cu               | 1266 +++-------------
 unidock/src/cuda/precalculate.cu              |    2 +-
 ...recalculate_gpu.h => precalculate_gpu.cuh} |    2 +-
 unidock/src/cuda/warp_ops.cuh                 |  660 +++++++++
 8 files changed, 1669 insertions(+), 1073 deletions(-)
 rename unidock/src/cuda/{atom_constants_gpu.h => atom_constants_gpu.cuh} (100%)
 create mode 100644 unidock/src/cuda/common.cuh
 rename unidock/src/cuda/{precalculate_gpu.h => precalculate_gpu.cuh} (99%)
 create mode 100644 unidock/src/cuda/warp_ops.cuh

diff --git a/.gitignore b/.gitignore
index 70d0727..ab78e12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,7 @@ unidock_tools/dist
 unidock_tools/dist/*
 unidock_tools/unidock_tools.egg-info
 unidock_tools/unidock_tools.egg-info/*
+
+build-*
+nvidia_inha_benchmark
+compile_commands.json
diff --git a/unidock/src/cuda/atom_constants_gpu.h b/unidock/src/cuda/atom_constants_gpu.cuh
similarity index 100%
rename from unidock/src/cuda/atom_constants_gpu.h
rename to unidock/src/cuda/atom_constants_gpu.cuh
diff --git a/unidock/src/cuda/common.cuh b/unidock/src/cuda/common.cuh
new file mode 100644
index 0000000..1e0000f
--- /dev/null
+++ b/unidock/src/cuda/common.cuh
@@ -0,0 +1,778 @@
+#pragma once
+#include "cuda.h"
+#include "curand_kernel.h"
+#include "kernel.h"
+#include "math.h"
+#include
+#include
+#include
+/* Original Include files */
+#include "ad4cache.h"
+#include "cache.h"
+#include "coords.h"
+#include "model.h"
+#include "monte_carlo.h"
+#include "mutate.h"
+#include "precalculate.h"
+#include "quasi_newton.h"
+#include
+
+#define M_PI_F 3.1415927f
+
+// symmetric matrix_d (only half of it is stored)
+typedef struct {
+    float data[MAX_HESSIAN_MATRIX_D_SIZE];
+    int dim;
+} matrix_d;
+
+/* Below based on mutate_conf.cpp */
+
+__device__ __forceinline__ void quaternion_increment(float *q, const float *rotation,
+                                                     float epsilon_fl);
+
+__device__ __forceinline__ void normalize_angle(float *x);
+
+__device__ __forceinline__ void output_type_cuda_init(output_type_cuda_t *out, const float *ptr) {
+    memcpy(out, ptr, sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION));
+    out->lig_torsion_size = ptr[3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION];
+    // did not assign coords and e
+}
+
+__device__ __forceinline__ void output_type_cuda_init_with_output(
+    output_type_cuda_t *out_new, const output_type_cuda_t *out_old) {
+    memcpy(out_new, out_old,
+           sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION));
+    out_new->lig_torsion_size = out_old->lig_torsion_size;
+    // assign e but not coords
+    out_new->e = out_old->e;
+}
+
+__device__ __forceinline__ void output_type_cuda_increment(output_type_cuda_t *x,
+                                                           const change_cuda_t *c, float factor,
+                                                           float epsilon_fl) {
+    // position increment
+    for (int k = 0; k < 3; k++) x->position[k] += factor * c->position[k];
+    // orientation increment
+    float rotation[3];
+    for (int k = 0; k < 3; k++) rotation[k] = factor * c->orientation[k];
+    quaternion_increment(x->orientation, rotation, epsilon_fl);
+
+    // torsion increment
+    for (int k = 0; k < x->lig_torsion_size; k++) {
+        float tmp = factor * c->lig_torsion[k];
+        normalize_angle(&tmp);
+        x->lig_torsion[k] += tmp;
+        normalize_angle(&(x->lig_torsion[k]));
+    }
+}
+
+__device__ __forceinline__ float norm3(const float *a) {
+    return sqrtf(a[0] * a[0] + a[1] * a[1] + a[2] * a[2]);
+}
+
+__device__ __forceinline__ void random_inside_sphere_gpu(float *random_inside_sphere,
+                                                         curandStatePhilox4_32_10_t *state) {
+    float4 random_inside_sphere_fl;
+    while (true) {  // on average, this will have to be run about twice
+        random_inside_sphere_fl = curand_uniform4(state);  // ~ U[0,1]
+        random_inside_sphere[0] = (random_inside_sphere_fl.x - 0.5) * 2.0;
+        random_inside_sphere[1] = (random_inside_sphere_fl.y - 0.5) * 2.0;
+        random_inside_sphere[2] = (random_inside_sphere_fl.z - 0.5) * 2.0;
+        random_inside_sphere[3] = random_inside_sphere_fl.w;
+        float r = norm3(random_inside_sphere);
+        if (r < 1) {
+            return;
+        }
+    }
+}
+
+__device__ __forceinline__ void normalize_angle(float *x) {
+    while (1) {
+        if (*x >= -(M_PI_F) && *x <= (M_PI_F)) {
+            break;
+        } else if (*x > 3 * (M_PI_F)) {
+            float n = (*x - (M_PI_F)) / (2 * (M_PI_F));
+            *x -= 2 * (M_PI_F)*ceil(n);
+        } else if (*x < 3 * -(M_PI_F)) {
+            float n = (-*x - (M_PI_F)) / (2 * (M_PI_F));
+            *x += 2 * (M_PI_F)*ceil(n);
+        } else if (*x > (M_PI_F)) {
+            *x -= 2 * (M_PI_F);
+        } else if (*x < -(M_PI_F)) {
+            *x += 2 * (M_PI_F);
+        } else {
+            break;
+        }
+    }
+}
+
+__device__ __forceinline__ bool quaternion_is_normalized(float *q) {
+    float q_pow = q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3];
+    float sqrt_q_pow = sqrtf(q_pow);
+    return (q_pow - 1 < 0.001) && (sqrt_q_pow - 1 < 0.001);
+}
+
+__device__ __forceinline__ void angle_to_quaternion(float *q, const float *rotation,
+                                                    float epsilon_fl) {
+    float angle = norm3(rotation);
+    if (angle > epsilon_fl) {
+        float axis[3] = {rotation[0] / angle, rotation[1] / angle, rotation[2] / angle};
+        normalize_angle(&angle);
+        float c = cos(angle / 2);
+        float s = sin(angle / 2);
+        q[0] = c;
+        q[1] = s * axis[0];
+        q[2] = s * axis[1];
+        q[3] = s * axis[2];
+        return;
+    }
+    q[0] = 1;
+    q[1] = 0;
+    q[2] = 0;
+    q[3] = 0;
+    return;
+}
+
+// quaternion multiplication
+__device__ __forceinline__ void angle_to_quaternion_multi(float *qa, const float *qb) {
+    float tmp[4] = {qa[0], qa[1], qa[2], qa[3]};
+    qa[0] = tmp[0] * qb[0] - tmp[1] * qb[1] - tmp[2] * qb[2] - tmp[3] * qb[3];
+    qa[1] = tmp[0] * qb[1] + tmp[1] * qb[0] + tmp[2] * qb[3] - tmp[3] * qb[2];
+    qa[2] = tmp[0] * qb[2] - tmp[1] * qb[3] + tmp[2] * qb[0] + tmp[3] * qb[1];
+    qa[3] = tmp[0] * qb[3] + tmp[1] * qb[2] - tmp[2] * qb[1] + tmp[3] * qb[0];
+}
+
+__device__ __forceinline__ void quaternion_normalize_approx(float *q, float epsilon_fl) {
+    const float s = q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3];
+    // Omit one assert()
+    if (fabs(s - 1) < TOLERANCE)
+        ;
+    else {
+        const float a = sqrtf(s);
+        for (int i = 0; i < 4; i++) q[i] /= a;
+    }
+}
+
+__device__ __forceinline__ void quaternion_increment(float *q, const float *rotation,
+                                                     float epsilon_fl) {
+    float q_old[4] = {q[0], q[1], q[2], q[3]};
+    angle_to_quaternion(q, rotation, epsilon_fl);
+    angle_to_quaternion_multi(q, q_old);
+    quaternion_normalize_approx(q, epsilon_fl);
+    // assert(quaternion_is_normalized(q)); // unnecessary
+}
+
+__device__ __forceinline__ float vec_distance_sqr(float *a, float *b) {
+    return (a[0] - b[0]) * (a[0] - b[0]) + (a[1] - b[1]) * (a[1] - b[1])
+           + (a[2] - b[2]) * (a[2] - b[2]);
+}
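quaternion_increment above converts the random axis-angle vector to a quaternion (angle_to_quaternion) and left-multiplies it onto the current orientation. A minimal host-side sketch of that composition rule, useful as a sanity check; the helper names here are hypothetical and not part of the patch:

```cpp
// Sketch (not from the patch): compose a rotation and its inverse; the
// result should be (approximately) the identity quaternion (1, 0, 0, 0).
#include <cmath>
#include <cstdio>

static void axis_angle_to_q(float *q, const float *rot) {
    float angle = std::sqrt(rot[0] * rot[0] + rot[1] * rot[1] + rot[2] * rot[2]);
    if (angle < 1e-6f) { q[0] = 1; q[1] = q[2] = q[3] = 0; return; }
    float s = std::sin(angle / 2) / angle;
    q[0] = std::cos(angle / 2); q[1] = s * rot[0]; q[2] = s * rot[1]; q[3] = s * rot[2];
}

static void q_mul(float *qa, const float *qb) {  // qa <- qa * qb, as in angle_to_quaternion_multi
    float t[4] = {qa[0], qa[1], qa[2], qa[3]};
    qa[0] = t[0] * qb[0] - t[1] * qb[1] - t[2] * qb[2] - t[3] * qb[3];
    qa[1] = t[0] * qb[1] + t[1] * qb[0] + t[2] * qb[3] - t[3] * qb[2];
    qa[2] = t[0] * qb[2] - t[1] * qb[3] + t[2] * qb[0] + t[3] * qb[1];
    qa[3] = t[0] * qb[3] + t[1] * qb[2] - t[2] * qb[1] + t[3] * qb[0];
}

int main() {
    float q[4] = {1, 0, 0, 0};                       // identity orientation
    float rot[3] = {0.3f, -0.2f, 0.1f}, inv[3] = {-0.3f, 0.2f, -0.1f};
    float r[4], r2[4];
    axis_angle_to_q(r, rot);  q_mul(r, q);           // q <- q(rot) * q
    axis_angle_to_q(r2, inv); q_mul(r2, r);          // undo the rotation
    std::printf("%f %f %f %f\n", r2[0], r2[1], r2[2], r2[3]);  // ~ 1 0 0 0
}
```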
+
+__device__ __forceinline__ float gyration_radius(int m_lig_begin, int m_lig_end,
+                                                 const atom_cuda_t *atoms,
+                                                 const m_coords_cuda_t *m_coords_gpu,
+                                                 const float *m_lig_node_origin) {
+    float acc = 0;
+    int counter = 0;
+    float origin[3] = {m_lig_node_origin[0], m_lig_node_origin[1], m_lig_node_origin[2]};
+    for (int i = m_lig_begin; i < m_lig_end; i++) {
+        float current_coords[3]
+            = {m_coords_gpu->coords[i][0], m_coords_gpu->coords[i][1], m_coords_gpu->coords[i][2]};
+        if (atoms[i].types[0]
+            != EL_TYPE_H) {  // for el, we use the first element (atoms[i].types[0])
+            acc += vec_distance_sqr(current_coords, origin);
+            ++counter;
+        }
+    }
+    return (counter > 0) ? sqrtf(acc / counter) : 0;
+}
+
+__device__ __forceinline__ void mutate_conf_cuda(const int num_steps, output_type_cuda_t *c,
+                                                 curandStatePhilox4_32_10_t *state,
+                                                 const int m_lig_begin, const int m_lig_end,
+                                                 const atom_cuda_t *atoms,
+                                                 const m_coords_cuda_t *m_coords_gpu,
+                                                 const float *m_lig_node_origin_gpu,
+                                                 const float epsilon_fl, const float amplitude) {
+    int flex_torsion_size = 0;  // FIX? 20210727
+    int count_mutable_entities = 2 + c->lig_torsion_size + flex_torsion_size;
+    int which = curand(state) % count_mutable_entities;
+    float random_inside_sphere[4];
+    random_inside_sphere_gpu(random_inside_sphere, state);
+    if (which == 0) {
+        DEBUG_PRINTF("random sphere r=%f\n", norm3(random_inside_sphere));
+    }
+
+    float random_pi = (random_inside_sphere[3] - 0.5) * 2.0 * pi;  // ~ U[-pi, pi]
+    if (which == 0) {
+        DEBUG_PRINTF("random pi=%f\n", random_pi);
+    }
+
+    if (which == 0) {
+        for (int i = 0; i < 3; i++) c->position[i] += amplitude * random_inside_sphere[i];
+        return;
+    }
+    --which;
+    if (which == 0) {
+        float gr
+            = gyration_radius(m_lig_begin, m_lig_end, atoms, m_coords_gpu, m_lig_node_origin_gpu);
+        if (gr > epsilon_fl) {
+            float rotation[3];
+            for (int i = 0; i < 3; i++) rotation[i] = amplitude / gr * random_inside_sphere[i];
+            quaternion_increment(c->orientation, rotation, epsilon_fl);
+        }
+        return;
+    }
+    --which;
+    if (which < c->lig_torsion_size) {
+        c->lig_torsion[which] = random_pi;
+        return;
+    }
+    which -= c->lig_torsion_size;
+
+    if (flex_torsion_size != 0) {
+        if (which < flex_torsion_size) {
+            c->flex_torsion[which] = random_pi;
+            return;
+        }
+        which -= flex_torsion_size;
+    }
+}
+
+/* Above based on mutate_conf.cpp */
+
+/* Below based on matrix.cpp */
+
+__device__ __forceinline__ void matrix_d_init(matrix_d *m, int dim, float fill_data) {
+    m->dim = dim;
+    if ((dim * (dim + 1) / 2) > MAX_HESSIAN_MATRIX_D_SIZE)
+        DEBUG_PRINTF("\nmatrix_d: matrix_d_init() ERROR!");
+    // ((dim * (dim + 1) / 2)*sizeof(float)); // symmetric matrix_d
+    for (int i = 0; i < (dim * (dim + 1) / 2); i++) m->data[i] = fill_data;
+    for (int i = (dim * (dim + 1) / 2); i < MAX_HESSIAN_MATRIX_D_SIZE; i++)
+        m->data[i] = 0;  // Others will be 0
+}
+
+// as regular 3x3 matrix_d
+__device__ __forceinline__ void mat_init(matrix_d *m, float fill_data) {
+    m->dim = 3;  // fixed to 3x3 matrix_d
+    if (9 > MAX_HESSIAN_MATRIX_D_SIZE) DEBUG_PRINTF("\nmatrix_d: mat_init() ERROR!");
+    for (int i = 0; i < 9; i++) m->data[i] = fill_data;
+}
+
+__device__ __forceinline__ void matrix_d_set_diagonal(matrix_d *m, float fill_data) {
+    for (int i = 0; i < m->dim; i++) {
+        m->data[i + i * (i + 1) / 2] = fill_data;
+    }
+}
+
+// as regular matrix_d
+__device__ __forceinline__ void matrix_d_set_element(matrix_d *m, int dim, int x, int y,
+                                                     float fill_data) {
+    m->data[x + y * dim] = fill_data;
+}
+
+__device__ __forceinline__ void matrix_d_set_element_tri(matrix_d *m, int x, int y,
+                                                         float fill_data) {
+    m->data[x + y * (y + 1) / 2] = fill_data;
+}
+__device__ __forceinline__ int tri_index(int n, int i, int j) {
+    if (j >= n || i > j) DEBUG_PRINTF("\nmatrix_d: tri_index ERROR!");
+    return i + j * (j + 1) / 2;
+}
+
+__device__ __forceinline__ int index_permissive(const matrix_d *m, int i, int j) {
+    return (i < j) ? tri_index(m->dim, i, j) : tri_index(m->dim, j, i);
+}
+
+/* Above based on matrix_d.cpp */
+
+/* Below based on quasi_newton.cpp */
+
+__device__ __forceinline__ void change_cuda_init(change_cuda_t *g, const float *ptr) {
+    for (int i = 0; i < 3; i++) g->position[i] = ptr[i];
+    for (int i = 0; i < 3; i++) g->orientation[i] = ptr[i + 3];
+    for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) g->lig_torsion[i] = ptr[i + 3 + 3];
+    for (int i = 0; i < MAX_NUM_OF_FLEX_TORSION; i++)
+        g->flex_torsion[i] = ptr[i + 3 + 3 + MAX_NUM_OF_LIG_TORSION];
+    g->lig_torsion_size = ptr[3 + 3 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION];
+}
+
+__device__ __forceinline__ void change_cuda_init_with_change(change_cuda_t *g_new,
+                                                             const change_cuda_t *g_old) {
+    for (int i = 0; i < 3; i++) g_new->position[i] = g_old->position[i];
+    for (int i = 0; i < 3; i++) g_new->orientation[i] = g_old->orientation[i];
+    for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) g_new->lig_torsion[i] = g_old->lig_torsion[i];
+    for (int i = 0; i < MAX_NUM_OF_FLEX_TORSION; i++)
+        g_new->flex_torsion[i] = g_old->flex_torsion[i];
+    g_new->lig_torsion_size = g_old->lig_torsion_size;
+}
+
+void print_output_type(output_type_cuda_t *x, int torsion_size) {
+    for (int i = 0; i < 3; i++) DEBUG_PRINTF("\nx.position[%d] = %0.16f", i, x->position[i]);
+    for (int i = 0; i < 4; i++) DEBUG_PRINTF("\nx.orientation[%d] = %0.16f", i, x->orientation[i]);
+    for (int i = 0; i < torsion_size; i++)
+        DEBUG_PRINTF("\n x.torsion[%d] = %0.16f", i, x->lig_torsion[i]);
+    DEBUG_PRINTF("\n x.torsion_size = %f", x->lig_torsion_size);
+    DEBUG_PRINTF("\n !!! x.e = %f\n", x->e);
+}
+
+void print_change(change_cuda_t *g, int torsion_size) {
+    for (int i = 0; i < 3; i++) DEBUG_PRINTF("\ng.position[%d] = %0.16f", i, g->position[i]);
+    for (int i = 0; i < 3; i++) DEBUG_PRINTF("\ng.orientation[%d] = %0.16f", i, g->orientation[i]);
+    for (int i = 0; i < torsion_size; i++)
+        DEBUG_PRINTF("\ng.torsion[%d] = %0.16f", i, g->lig_torsion[i]);
+    DEBUG_PRINTF("\ng.torsion_size = %f", g->lig_torsion_size);
+}
+
+__device__ __forceinline__ int num_atom_types(int atu) {
+    switch (atu) {
+        case 0:
+            return EL_TYPE_SIZE;
+        case 1:
+            return AD_TYPE_SIZE;
+        case 2:
+            return XS_TYPE_SIZE;
+        case 3:
+            return SY_TYPE_SIZE;
+        default:
+            DEBUG_PRINTF("Kernel1:num_atom_types() ERROR!");
+            return INFINITY;
+    }
+}
+
+__device__ __forceinline__ void elementwise_product(float *out, const float *a, const float *b) {
+    out[0] = a[0] * b[0];
+    out[1] = a[1] * b[1];
+    out[2] = a[2] * b[2];
+}
+
+__device__ __forceinline__ float elementwise_product_sum(const float *a, const float *b) {
+    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
+}
+
+__device__ __forceinline__ float access_m_data(float *m_data, int m_i, int m_j, int i, int j,
+                                               int k) {
+    return m_data[i + m_i * (j + m_j * k)];
+}
+
+__device__ __forceinline__ bool not_max_gpu(float x) {
+    return (x < 0.1 * INFINITY); /* Problem: replace max_fl with INFINITY? */
+}
+
+__device__ __forceinline__ void curl_with_deriv(float *e, float *deriv, float v,
+                                                const float epsilon_fl) {
+    if (*e > 0 && not_max_gpu(v)) {
+        float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e));
+        *e *= tmp;
+        for (int i = 0; i < 3; i++) deriv[i] *= tmp * tmp;
+    }
+}
+
+__device__ __forceinline__ void curl_without_deriv(float *e, float v, const float epsilon_fl) {
+    if (*e > 0 && not_max_gpu(v)) {
+        float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e));
+        *e *= tmp;
+    }
+}
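The matrix_d helpers above store the symmetric BFGS Hessian in packed upper-triangular form: element (i, j) with i <= j lives at offset i + j*(j+1)/2, and index_permissive() swaps the indices below the diagonal. A minimal host-side sketch of that layout (not part of the patch):

```cpp
// Sketch: packed upper-triangular storage as used by matrix_d / tri_index.
#include <cstdio>

static int tri_index(int i, int j) { return i + j * (j + 1) / 2; }

int main() {
    const int n = 4;
    float packed[n * (n + 1) / 2];
    // fill the upper triangle only, like matrix_d_set_element_tri()
    for (int j = 0; j < n; j++)
        for (int i = 0; i <= j; i++) packed[tri_index(i, j)] = 10.f * i + j;
    // symmetric read-back, mirroring index_permissive()
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            int idx = (i < j) ? tri_index(i, j) : tri_index(j, i);
            std::printf("%5.0f ", packed[idx]);
        }
        std::printf("\n");
    }
}
```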
+
+__device__ __forceinline__ float g_evaluate(grid_cuda_t *g, const float *m_coords, /* double[3] */
+                                            const float slope,                     /* double */
+                                            const float v,                         /* double */
+                                            float *deriv,                          /* double[3] */
+                                            const float epsilon_fl) {
+    int m_i = g->m_i;
+    int m_j = g->m_j;
+    int m_k = g->m_k;
+    if (m_i * m_j * m_k == 0) DEBUG_PRINTF("\nkernel2: g_evaluate ERROR!#1");
+    float tmp_vec[3]
+        = {m_coords[0] - g->m_init[0], m_coords[1] - g->m_init[1], m_coords[2] - g->m_init[2]};
+    float tmp_vec2[3] = {g->m_factor[0], g->m_factor[1], g->m_factor[2]};
+    float s[3];
+    elementwise_product(s, tmp_vec, tmp_vec2);
+
+    float miss[3] = {0, 0, 0};
+    int region[3];
+    int a[3];
+    int m_data_dims[3] = {m_i, m_j, m_k};
+    for (int i = 0; i < 3; i++) {
+        if (s[i] < 0) {
+            miss[i] = -s[i];
+            region[i] = -1;
+            a[i] = 0;
+            s[i] = 0;
+        } else if (s[i] >= g->m_dim_fl_minus_1[i]) {
+            miss[i] = s[i] - g->m_dim_fl_minus_1[i];
+            region[i] = 1;
+            if (m_data_dims[i] < 2) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#2");
+            a[i] = m_data_dims[i] - 2;
+            s[i] = 1;
+        } else {
+            region[i] = 0;
+            a[i] = (int)s[i];
+            s[i] -= a[i];
+        }
+        if (s[i] < 0) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#3");
+        if (s[i] > 1) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#4");
+        if (a[i] < 0) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#5");
+        if (a[i] + 1 >= m_data_dims[i]) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#5");
+    }
+
+    float tmp_m_factor_inv[3] = {g->m_factor_inv[0], g->m_factor_inv[1], g->m_factor_inv[2]};
+    const float penalty = slope * elementwise_product_sum(miss, tmp_m_factor_inv);
+    if (penalty <= -epsilon_fl) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#6");
+
+    const int x0 = a[0];
+    const int y0 = a[1];
+    const int z0 = a[2];
+
+    const int x1 = x0 + 1;
+    const int y1 = y0 + 1;
+    const int z1 = z0 + 1;
+
+    const float f000 = access_m_data(g->m_data, m_i, m_j, x0, y0, z0);
+    const float f100 = access_m_data(g->m_data, m_i, m_j, x1, y0, z0);
+    const float f010 = access_m_data(g->m_data, m_i, m_j, x0, y1, z0);
+    const float f110 = access_m_data(g->m_data, m_i, m_j, x1, y1, z0);
+    const float f001 = access_m_data(g->m_data, m_i, m_j, x0, y0, z1);
+    const float f101 = access_m_data(g->m_data, m_i, m_j, x1, y0, z1);
+    const float f011 = access_m_data(g->m_data, m_i, m_j, x0, y1, z1);
+    const float f111 = access_m_data(g->m_data, m_i, m_j, x1, y1, z1);
+
+    const float x = s[0];
+    const float y = s[1];
+    const float z = s[2];
+
+    const float mx = 1 - x;
+    const float my = 1 - y;
+    const float mz = 1 - z;
+
+    float f = f000 * mx * my * mz + f100 * x * my * mz + f010 * mx * y * mz + f110 * x * y * mz
+              + f001 * mx * my * z + f101 * x * my * z + f011 * mx * y * z + f111 * x * y * z;
+
+    if (deriv) {
+        const float x_g = f000 * (-1) * my * mz + f100 * 1 * my * mz + f010 * (-1) * y * mz
+                          + f110 * 1 * y * mz + f001 * (-1) * my * z + f101 * 1 * my * z
+                          + f011 * (-1) * y * z + f111 * 1 * y * z;
+
+        const float y_g = f000 * mx * (-1) * mz + f100 * x * (-1) * mz + f010 * mx * 1 * mz
+                          + f110 * x * 1 * mz + f001 * mx * (-1) * z + f101 * x * (-1) * z
+                          + f011 * mx * 1 * z + f111 * x * 1 * z;
+
+        const float z_g = f000 * mx * my * (-1) + f100 * x * my * (-1) + f010 * mx * y * (-1)
+                          + f110 * x * y * (-1) + f001 * mx * my * 1 + f101 * x * my * 1
+                          + f011 * mx * y * 1 + f111 * x * y * 1;
+
+        float gradient[3] = {x_g, y_g, z_g};
+
+        curl_with_deriv(&f, gradient, v, epsilon_fl);
+
+        float gradient_everywhere[3];
+
+        for (int i = 0; i < 3; i++) {
+            gradient_everywhere[i] = ((region[i] == 0) ? gradient[i] : 0);
+            deriv[i] = g->m_factor[i] * gradient_everywhere[i] + slope * region[i];
+        }
+        return f + penalty;
+    } else { /* invalid pointer */
+        DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#7");
+        curl_without_deriv(&f, v, epsilon_fl);
+        return f + penalty;
+    }
+}
+
+__device__ __forceinline__ float ig_eval_deriv(output_type_cuda_t *x, change_cuda_t *g,
+                                               const float v, ig_cuda_t *ig_cuda_gpu,
+                                               m_cuda_t *m_cuda_gpu, const float epsilon_fl) {
+    float e = 0;
+    int nat = num_atom_types(ig_cuda_gpu->atu);
+    for (int i = 0; i < m_cuda_gpu->m_num_movable_atoms; i++) {
+        int t = m_cuda_gpu->atoms[i].types[ig_cuda_gpu->atu];
+        if (t >= nat) {
+            for (int j = 0; j < 3; j++) m_cuda_gpu->minus_forces.coords[i][j] = 0;
+            continue;
+        }
+        float deriv[3];
+
+        e = e
+            + g_evaluate(&ig_cuda_gpu->grids[t], m_cuda_gpu->m_coords.coords[i], ig_cuda_gpu->slope,
+                         v, deriv, epsilon_fl);
+
+        for (int j = 0; j < 3; j++) m_cuda_gpu->minus_forces.coords[i][j] = deriv[j];
+    }
+    return e;
+}
+
+__device__ __forceinline__ void quaternion_to_r3(const float *q, float *orientation_m) {
+    /* Omit assert(quaternion_is_normalized(q)); */
+    const float a = q[0];
+    const float b = q[1];
+    const float c = q[2];
+    const float d = q[3];
+
+    const float aa = a * a;
+    const float ab = a * b;
+    const float ac = a * c;
+    const float ad = a * d;
+    const float bb = b * b;
+    const float bc = b * c;
+    const float bd = b * d;
+    const float cc = c * c;
+    const float cd = c * d;
+    const float dd = d * d;
+
+    /* Omit assert(eq(aa + bb + cc + dd, 1)); */
+    matrix_d tmp;
+    mat_init(&tmp, 0); /* matrix_d with fixed dimension 3 (here we treat this as
+                          a regular matrix_d (not a triangular matrix_d!)) */
+
+    matrix_d_set_element(&tmp, 3, 0, 0, (aa + bb - cc - dd));
+    matrix_d_set_element(&tmp, 3, 0, 1, 2 * (-ad + bc));
+    matrix_d_set_element(&tmp, 3, 0, 2, 2 * (ac + bd));
+
+    matrix_d_set_element(&tmp, 3, 1, 0, 2 * (ad + bc));
+    matrix_d_set_element(&tmp, 3, 1, 1, (aa - bb + cc - dd));
+    matrix_d_set_element(&tmp, 3, 1, 2, 2 * (-ab + cd));
+
+    matrix_d_set_element(&tmp, 3, 2, 0, 2 * (-ac + bd));
+    matrix_d_set_element(&tmp, 3, 2, 1, 2 * (ab + cd));
+    matrix_d_set_element(&tmp, 3, 2, 2, (aa - bb - cc + dd));
+
+    for (int i = 0; i < 9; i++) orientation_m[i] = tmp.data[i];
+}
+
+__device__ __forceinline__ void local_to_lab_direction(float *out, const float *local_direction,
+                                                       const float *orientation_m) {
+    out[0] = orientation_m[0] * local_direction[0] + orientation_m[3] * local_direction[1]
+             + orientation_m[6] * local_direction[2];
+    out[1] = orientation_m[1] * local_direction[0] + orientation_m[4] * local_direction[1]
+             + orientation_m[7] * local_direction[2];
+    out[2] = orientation_m[2] * local_direction[0] + orientation_m[5] * local_direction[1]
+             + orientation_m[8] * local_direction[2];
+}
+
+__device__ __forceinline__ void local_to_lab(float *out, const float *origin,
+                                             const float *local_coords,
+                                             const float *orientation_m) {
+    out[0] = origin[0]
+             + (orientation_m[0] * local_coords[0] + orientation_m[3] * local_coords[1]
+                + orientation_m[6] * local_coords[2]);
+    out[1] = origin[1]
+             + (orientation_m[1] * local_coords[0] + orientation_m[4] * local_coords[1]
+                + orientation_m[7] * local_coords[2]);
+    out[2] = origin[2]
+             + (orientation_m[2] * local_coords[0] + orientation_m[5] * local_coords[1]
+                + orientation_m[8] * local_coords[2]);
+}
+
+__device__ __forceinline__ void angle_to_quaternion2(float *out, const float *axis, float angle) {
+    normalize_angle(&angle);
+    float c = cos(angle / 2);
+    float s = sin(angle / 2);
+    out[0] = c;
+    out[1] = s * axis[0];
+    out[2] = s * axis[1];
+    out[3] = s * axis[2];
+}
+
+__device__ __forceinline__ void set(const output_type_cuda_t *x, rigid_cuda_t *lig_rigid_gpu,
+                                    m_coords_cuda_t *m_coords_gpu, const atom_cuda_t *atoms,
+                                    const int m_num_movable_atoms, const float epsilon_fl) {
+    for (int i = 0; i < 3; i++) lig_rigid_gpu->origin[0][i] = x->position[i];
+    for (int i = 0; i < 4; i++) lig_rigid_gpu->orientation_q[0][i] = x->orientation[i];
+    quaternion_to_r3(lig_rigid_gpu->orientation_q[0],
+                     lig_rigid_gpu->orientation_m[0]); /* set orientation_m */
+
+    int begin = lig_rigid_gpu->atom_range[0][0];
+    int end = lig_rigid_gpu->atom_range[0][1];
+
+    for (int i = begin; i < end; i++) {
+        local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[0], atoms[i].coords,
+                     lig_rigid_gpu->orientation_m[0]);
+    }
+    /* ************* end node.set_conf ************* */
+
+    /* ************* branches_set_conf ************* */
+    /* update nodes in depth-first order */
+    for (int current = 1; current < lig_rigid_gpu->num_children + 1;
+         current++) { /* current starts from 1 (namely starts from first child
+                         node) */
+        int parent = lig_rigid_gpu->parent[current];
+        float torsion = x->lig_torsion[current - 1]; /* torsions are all related to child nodes */
+        local_to_lab(lig_rigid_gpu->origin[current], lig_rigid_gpu->origin[parent],
+                     lig_rigid_gpu->relative_origin[current], lig_rigid_gpu->orientation_m[parent]);
+        local_to_lab_direction(lig_rigid_gpu->axis[current], lig_rigid_gpu->relative_axis[current],
+                               lig_rigid_gpu->orientation_m[parent]);
+        float tmp[4];
+        float parent_q[4]
+            = {lig_rigid_gpu->orientation_q[parent][0], lig_rigid_gpu->orientation_q[parent][1],
+               lig_rigid_gpu->orientation_q[parent][2], lig_rigid_gpu->orientation_q[parent][3]};
+        float current_axis[3] = {lig_rigid_gpu->axis[current][0], lig_rigid_gpu->axis[current][1],
+                                 lig_rigid_gpu->axis[current][2]};
+
+        angle_to_quaternion2(tmp, current_axis, torsion);
+        angle_to_quaternion_multi(tmp, parent_q);
+        quaternion_normalize_approx(tmp, epsilon_fl);
+
+        for (int i = 0; i < 4; i++)
+            lig_rigid_gpu->orientation_q[current][i] = tmp[i]; /* set orientation_q */
+        quaternion_to_r3(lig_rigid_gpu->orientation_q[current],
+                         lig_rigid_gpu->orientation_m[current]); /* set orientation_m */
+
+        /* set coords */
+        begin = lig_rigid_gpu->atom_range[current][0];
+        end = lig_rigid_gpu->atom_range[current][1];
+        for (int i = begin; i < end; i++) {
+            local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[current], atoms[i].coords,
+                         lig_rigid_gpu->orientation_m[current]);
+        }
+    }
+    /* ************* end branches_set_conf ************* */
+}
+
+__device__ __forceinline__ void p_eval_deriv(float *out, int type_pair_index, float r2,
+                                             p_cuda_t *p_cuda_gpu, const float epsilon_fl) {
+    const float cutoff_sqr = p_cuda_gpu->m_cutoff_sqr;
+    if (r2 > cutoff_sqr)
+        DEBUG_PRINTF(
+            "\nkernel2: p_eval_deriv() ERROR!, r2 > Cutoff_sqr, r2=%f, "
+            "cutoff_sqr=%f",
+            r2, cutoff_sqr);
+
+    p_m_data_cuda_t *tmp = &p_cuda_gpu->m_data[type_pair_index];
+    float r2_factored = tmp->factor * r2;
+    int i1 = (int)(r2_factored);
+    int i2 = i1 + 1;
+    float rem = r2_factored - i1;
+    if (rem < -epsilon_fl) DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!");
+    if (rem >= 1 + epsilon_fl) DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!");
+    float p1[2] = {tmp->smooth[i1][0], tmp->smooth[i1][1]};
+    if (i1 >= SMOOTH_SIZE) p1[0] = p1[1] = 0;
+    float p2[2] = {tmp->smooth[i2][0], tmp->smooth[i2][1]};
+    if (i2 >= SMOOTH_SIZE) p2[0] = p2[1] = 0;
+    float e = p1[0] + rem * (p2[0] - p1[0]);
+    float dor = p1[1] + rem * (p2[1] - p1[1]);
+    out[0] = e;
+    out[1] = dor;
+}
+
+__device__ __forceinline__ void curl(float *e, float *deriv, float v, const float epsilon_fl) {
+    if (*e > 0 && not_max_gpu(v)) {
+        float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e));
+        (*e) = tmp * (*e);
+        for (int i = 0; i < 3; i++) deriv[i] = deriv[i] * (tmp * tmp);
+    }
+}
+
+__device__ __forceinline__ float eval_interacting_pairs_deriv(p_cuda_t *p_cuda_gpu, const float v,
+                                                              const lig_pairs_cuda_t *pairs,
+                                                              const m_coords_cuda_t *m_coords,
+                                                              m_minus_forces_t *minus_forces,
+                                                              const float epsilon_fl) {
+    float e = 0;
+    for (int i = 0; i < pairs->num_pairs; i++) {
+        const int ip[3] = {pairs->type_pair_index[i], pairs->a[i], pairs->b[i]};
+        int index = pairs->a[i] + pairs->b[i] * (pairs->b[i] + 1) / 2;
+        float coords_b[3]
+            = {m_coords->coords[ip[2]][0], m_coords->coords[ip[2]][1], m_coords->coords[ip[2]][2]};
+        float coords_a[3]
+            = {m_coords->coords[ip[1]][0], m_coords->coords[ip[1]][1], m_coords->coords[ip[1]][2]};
+        float r[3]
+            = {coords_b[0] - coords_a[0], coords_b[1] - coords_a[1], coords_b[2] - coords_a[2]};
+        float r2 = r[0] * r[0] + r[1] * r[1] + r[2] * r[2];
+
+        if (r2 < p_cuda_gpu->m_cutoff_sqr) {
+            float tmp[2];
+            p_eval_deriv(tmp, index, r2, p_cuda_gpu, epsilon_fl);
+            float force[3] = {r[0] * tmp[1], r[1] * tmp[1], r[2] * tmp[1]};
+            curl(&tmp[0], force, v, epsilon_fl);
+            e += tmp[0];
+            for (int j = 0; j < 3; j++) minus_forces->coords[ip[1]][j] -= force[j];
+            for (int j = 0; j < 3; j++) minus_forces->coords[ip[2]][j] += force[j];
+        }
+    }
+    return e;
+}
+
+template <typename T1, typename T2, typename T3>
+__device__ __forceinline__ void product(T1 *res, const T2 *a, const T3 *b) {
+    res[0] = a[1] * b[2] - a[2] * b[1];
+    res[1] = a[2] * b[0] - a[0] * b[2];
+    res[2] = a[0] * b[1] - a[1] * b[0];
+}
+
+__device__ __forceinline__ float find_change_index_read(const change_cuda_t *g, int index) {
+    if (index < 3) return g->position[index];
+    index -= 3;
+    if (index < 3) return g->orientation[index];
+    index -= 3;
+    if (index < g->lig_torsion_size) return g->lig_torsion[index];
+    DEBUG_PRINTF("\nKernel2:find_change_index_read() ERROR!"); /* Shouldn't be here */
+}
+
+__device__ __forceinline__ void find_change_index_write(change_cuda_t *g, int index, float data) {
+    if (index < 3) {
+        g->position[index] = data;
+        return;
+    }
+    index -= 3;
+    if (index < 3) {
+        g->orientation[index] = data;
+        return;
+    }
+    index -= 3;
+    if (index < g->lig_torsion_size) {
+        g->lig_torsion[index] = data;
+        return;
+    }
+    DEBUG_PRINTF("\nKernel2:find_change_index_write() ERROR!"); /* Shouldn't be here */
+}
+
+__device__ __forceinline__ void minus_mat_vec_product(const matrix_d *h, const change_cuda_t *in,
+                                                      change_cuda_t *out) {
+    int n = h->dim;
+    for (int i = 0; i < n; i++) {
+        float sum = 0;
+        for (int j = 0; j < n; j++) {
+            sum += h->data[index_permissive(h, i, j)] * find_change_index_read(in, j);
+        }
+        find_change_index_write(out, i, -sum);
+    }
+}
+
+__device__ __forceinline__ float scalar_product(const change_cuda_t *a, const change_cuda_t *b,
+                                                int n) {
+    float tmp = 0;
+    for (int i = 0; i < n; i++) {
+        tmp += find_change_index_read(a, i) * find_change_index_read(b, i);
+    }
+    return tmp;
+}
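bfgs_update below applies the standard rank-two BFGS update to the packed inverse-Hessian approximation: H += alpha*r*(minus_hy*p^T + p*minus_hy^T) + alpha^2*(r^2*yhy + r)*p*p^T, with r = 1/(alpha*y^T p), minus_hy = -H*y and yhy = y^T H y. A minimal host-side sketch of the same update on a dense 2x2 case (not part of the patch):

```cpp
// Sketch: the rank-two BFGS inverse-Hessian update used by bfgs_update().
#include <cstdio>

int main() {
    const int n = 2;
    float H[2][2] = {{1, 0}, {0, 1}};  // identity start, as in bfgs()
    float p[2] = {1.0f, 0.5f}, y[2] = {0.8f, 0.4f};
    float alpha = 1.0f;
    float yp = y[0] * p[0] + y[1] * p[1];  // y . p (curvature condition checks alpha*yp > eps)
    float hy[2] = {H[0][0] * y[0] + H[0][1] * y[1], H[1][0] * y[0] + H[1][1] * y[1]};
    float yhy = y[0] * hy[0] + y[1] * hy[1];
    float r = 1.0f / (alpha * yp);
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            H[i][j] += alpha * r * (-hy[i] * p[j] - hy[j] * p[i])  // minus_hy = -H*y
                       + alpha * alpha * (r * r * yhy + r) * p[i] * p[j];
    std::printf("H = [[%f, %f], [%f, %f]]\n", H[0][0], H[0][1], H[1][0], H[1][1]);
}
```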
+
+__device__ __forceinline__ bool bfgs_update(matrix_d *h, const change_cuda_t *p,
+                                            const change_cuda_t *y, const float alpha,
+                                            const float epsilon_fl) {
+    const float yp = scalar_product(y, p, h->dim);
+
+    if (alpha * yp < epsilon_fl) return false;
+    change_cuda_t minus_hy;
+    change_cuda_init_with_change(&minus_hy, y);
+    minus_mat_vec_product(h, y, &minus_hy);
+    const float yhy = -scalar_product(y, &minus_hy, h->dim);
+    const float r = 1 / (alpha * yp);
+    const int n = 6 + p->lig_torsion_size;
+
+    for (int i = 0; i < n; i++) {
+        for (int j = i; j < n; j++) {
+            float tmp
+                = alpha * r
+                      * (find_change_index_read(&minus_hy, i) * find_change_index_read(p, j)
+                         + find_change_index_read(&minus_hy, j) * find_change_index_read(p, i))
+                  + alpha * alpha * (r * r * yhy + r) * find_change_index_read(p, i)
+                        * find_change_index_read(p, j);
+
+            h->data[i + j * (j + 1) / 2] += tmp;
+        }
+    }
+
+    return true;
+}
diff --git a/unidock/src/cuda/kernel.h b/unidock/src/cuda/kernel.h
index 9abbbbe..bbee8ad 100644
--- a/unidock/src/cuda/kernel.h
+++ b/unidock/src/cuda/kernel.h
@@ -1,5 +1,6 @@
 #pragma once
+#include
 #include

 template <typename T>
 void check(T result, char const *const func, const char *const file, int const line) {
@@ -43,20 +44,22 @@
 #define MAX_NUM_OF_GRID_MK 128  // 81
 #define MAX_NUM_OF_GRID_POINT 512000
-//#define GRID_MI 65//55
-//#define GRID_MJ 71//55
-//#define GRID_MK 61//81
+// #define GRID_MI 65//55
+// #define GRID_MJ 71//55
+// #define GRID_MK 61//81
 #define MAX_PRECAL_NUM_ATOM 30
 #define MAX_P_DATA_M_DATA_SIZE \
-    45150  // modified for vina1.2, should be larger, n*(n+1)/2, n=num_of_atom, select n=140
-//#define MAX_NUM_OF_GRID_ATOMS 150
+    45150  // modified for vina1.2, should be larger, n*(n+1)/2, n=num_of_atom,
+           // select n=140
+// #define MAX_NUM_OF_GRID_ATOMS 150
 #define FAST_SIZE 2051  // modified for vina1.2 m_max_cutoff^2 * factor + 3, ad4=13424
 #define SMOOTH_SIZE 2051
 #define MAX_CONTAINER_SIZE_EVERY_WI 5
 #define MAX_THREAD 41700000  // modified for vina1.2, to calculate random map memory upper bound
 #define MAX_LIGAND_NUM \
-    10250  // modified for vina1.2, to calculate precalculate_byatom memory upper bound
+    10250  // modified for vina1.2, to calculate precalculate_byatom memory upper
+           // bound
@@ -107,11 +110,12 @@ typedef struct {
                                                  // depth-first order
     float relative_axis[MAX_NUM_OF_RIGID][3];    // 1st column is root node, all 0s
     float relative_origin[MAX_NUM_OF_RIGID][3];  // 1st column is root node, all 0s
-    int parent[MAX_NUM_OF_RIGID];  // every node has only 1 parent node
-    bool children_map[MAX_NUM_OF_RIGID]
-                     [MAX_NUM_OF_RIGID];  // chidren_map[i][j] = true if node i's child is node j
+    int parent[MAX_NUM_OF_RIGID];  // every node has only 1 parent node
+    bool children_map[MAX_NUM_OF_RIGID][MAX_NUM_OF_RIGID];    // children_map[i][j] = true if node
+                                                              // i's child is node j
+    bool descendant_map[MAX_NUM_OF_RIGID][MAX_NUM_OF_RIGID];  // descendant_map[i][j] = true if
+                                                              // node i is an ancestor of node j
     int num_children;
-
 } rigid_cuda_t;
@@ -160,6 +164,12 @@ typedef struct {
     grid_cuda_t grids[GRIDS_SIZE];
 } ig_cuda_t;

+typedef struct {
+    float ptmp[MAX_NUM_OF_RIGID][3];
+    float p[MAX_NUM_OF_RIGID][3];
+    float o[MAX_NUM_OF_RIGID][3];
+} pot_cuda_t;
+
 typedef struct {
     float fast[FAST_SIZE];
     float smooth[SMOOTH_SIZE][2];
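The new descendant_map field presumably supports the warp-level tree traversal (pot_cuda_t looks like per-node scratch for position/orientation/torque accumulation). This hunk does not show how descendant_map is initialized; a hypothetical host-side sketch of one way it could be derived from parent[], not taken from this patch:

```cpp
// Sketch (hypothetical): fill descendant_map[i][j] == true iff node i is an
// ancestor of node j, by walking each node's parent chain up to the root.
#include <cstring>

void fill_descendant_map(const int *parent, bool (*descendant_map)[MAX_NUM_OF_RIGID],
                         int num_rigid) {
    std::memset(descendant_map, 0, sizeof(bool) * MAX_NUM_OF_RIGID * MAX_NUM_OF_RIGID);
    for (int j = 1; j < num_rigid; j++) {            // node 0 is the root
        for (int i = parent[j];; i = parent[i]) {    // climb towards the root
            descendant_map[i][j] = true;
            if (i == 0) break;
        }
    }
}
```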
#include "coords.h" -#include "mutate.h" -#include "quasi_newton.h" #include "model.h" +#include "monte_carlo.h" +#include "mutate.h" #include "precalculate.h" -#include "cache.h" -#include "ad4cache.h" - -/* Below based on mutate_conf.cpp */ - -__device__ __forceinline__ void quaternion_increment(float* q, const float* rotation, - float epsilon_fl); - -__device__ __forceinline__ void normalize_angle(float* x); - -__device__ __forceinline__ void output_type_cuda_init(output_type_cuda_t* out, const float* ptr) { - memcpy(out, ptr, sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION)); - out->lig_torsion_size = ptr[3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION]; - // did not assign coords and e -} - -__device__ __forceinline__ void output_type_cuda_init_with_output( - output_type_cuda_t* out_new, const output_type_cuda_t* out_old) { - memcpy(out_new, out_old, - sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION)); - out_new->lig_torsion_size = out_old->lig_torsion_size; - // assign e but not coords - out_new->e = out_old->e; -} - -__device__ __forceinline__ void output_type_cuda_increment(output_type_cuda_t* x, - const change_cuda_t* c, float factor, - float epsilon_fl) { - // position increment - for (int k = 0; k < 3; k++) x->position[k] += factor * c->position[k]; - // orientation increment - float rotation[3]; - for (int k = 0; k < 3; k++) rotation[k] = factor * c->orientation[k]; - quaternion_increment(x->orientation, rotation, epsilon_fl); - - // torsion increment - for (int k = 0; k < x->lig_torsion_size; k++) { - float tmp = factor * c->lig_torsion[k]; - normalize_angle(&tmp); - x->lig_torsion[k] += tmp; - normalize_angle(&(x->lig_torsion[k])); - } -} - -__device__ __forceinline__ float norm3(const float* a) { - return sqrt(pow(a[0], 2) + pow(a[1], 2) + pow(a[2], 2)); -} - -__device__ __forceinline__ void random_inside_sphere_gpu(float* random_inside_sphere, - curandStatePhilox4_32_10_t* state) { - float4 random_inside_sphere_fl; - while (true) { // on average, this will have to be run about twice - random_inside_sphere_fl = curand_uniform4(state); // ~ U[0,1] - random_inside_sphere[0] = (random_inside_sphere_fl.x - 0.5) * 2.0; - random_inside_sphere[1] = (random_inside_sphere_fl.y - 0.5) * 2.0; - random_inside_sphere[2] = (random_inside_sphere_fl.z - 0.5) * 2.0; - random_inside_sphere[3] = random_inside_sphere_fl.w; - float r = norm3(random_inside_sphere); - if (r < 1) { - return; - } - } -} - -__device__ __forceinline__ void normalize_angle(float* x) { - while (1) { - if (*x >= -(M_PI) && *x <= (M_PI)) { - break; - } else if (*x > 3 * (M_PI)) { - float n = (*x - (M_PI)) / (2 * (M_PI)); - *x -= 2 * (M_PI)*ceil(n); - } else if (*x < 3 * -(M_PI)) { - float n = (-*x - (M_PI)) / (2 * (M_PI)); - *x += 2 * (M_PI)*ceil(n); - } else if (*x > (M_PI)) { - *x -= 2 * (M_PI); - } else if (*x < -(M_PI)) { - *x += 2 * (M_PI); - } else { - break; - } - } -} - -__device__ __forceinline__ bool quaternion_is_normalized(float* q) { - float q_pow = pow(q[0], 2) + pow(q[1], 2) + pow(q[2], 2) + pow(q[3], 2); - float sqrt_q_pow = sqrt(q_pow); - return (q_pow - 1 < 0.001) && (sqrt_q_pow - 1 < 0.001); -} - -__device__ __forceinline__ void angle_to_quaternion(float* q, const float* rotation, - float epsilon_fl) { - float angle = norm3(rotation); - if (angle > epsilon_fl) { - float axis[3] = {rotation[0] / angle, rotation[1] / angle, rotation[2] / angle}; - normalize_angle(&angle); - float c = cos(angle / 2); - float s = sin(angle / 2); - q[0] = c; - q[1] 
= s * axis[0]; - q[2] = s * axis[1]; - q[3] = s * axis[2]; - return; - } - q[0] = 1; - q[1] = 0; - q[2] = 0; - q[3] = 0; - return; -} - -// quaternion multiplication -__device__ __forceinline__ void angle_to_quaternion_multi(float* qa, const float* qb) { - float tmp[4] = {qa[0], qa[1], qa[2], qa[3]}; - qa[0] = tmp[0] * qb[0] - tmp[1] * qb[1] - tmp[2] * qb[2] - tmp[3] * qb[3]; - qa[1] = tmp[0] * qb[1] + tmp[1] * qb[0] + tmp[2] * qb[3] - tmp[3] * qb[2]; - qa[2] = tmp[0] * qb[2] - tmp[1] * qb[3] + tmp[2] * qb[0] + tmp[3] * qb[1]; - qa[3] = tmp[0] * qb[3] + tmp[1] * qb[2] - tmp[2] * qb[1] + tmp[3] * qb[0]; -} - -__device__ __forceinline__ void quaternion_normalize_approx(float* q, float epsilon_fl) { - const float s = pow(q[0], 2) + pow(q[1], 2) + pow(q[2], 2) + pow(q[3], 2); - // Omit one assert() - if (fabs(s - 1) < TOLERANCE) - ; - else { - const float a = sqrt(s); - for (int i = 0; i < 4; i++) q[i] /= a; - } -} - -__device__ __forceinline__ void quaternion_increment(float* q, const float* rotation, - float epsilon_fl) { - float q_old[4] = {q[0], q[1], q[2], q[3]}; - angle_to_quaternion(q, rotation, epsilon_fl); - angle_to_quaternion_multi(q, q_old); - quaternion_normalize_approx(q, epsilon_fl); - // assert(quaternion_is_normalized(q)); // unnecessary -} - -__device__ __forceinline__ float vec_distance_sqr(float* a, float* b) { - return pow(a[0] - b[0], 2) + pow(a[1] - b[1], 2) + pow(a[2] - b[2], 2); -} - -__device__ __forceinline__ float gyration_radius(int m_lig_begin, int m_lig_end, - const atom_cuda_t* atoms, - const m_coords_cuda_t* m_coords_gpu, - const float* m_lig_node_origin) { - float acc = 0; - int counter = 0; - float origin[3] = {m_lig_node_origin[0], m_lig_node_origin[1], m_lig_node_origin[2]}; - for (int i = m_lig_begin; i < m_lig_end; i++) { - float current_coords[3] - = {m_coords_gpu->coords[i][0], m_coords_gpu->coords[i][1], m_coords_gpu->coords[i][2]}; - if (atoms[i].types[0] - != EL_TYPE_H) { // for el, we use the first element (atoms[i].types[0]) - acc += vec_distance_sqr(current_coords, origin); - ++counter; - } - } - return (counter > 0) ? sqrt(acc / counter) : 0; -} - -__device__ __forceinline__ void mutate_conf_cuda(const int num_steps, output_type_cuda_t* c, - curandStatePhilox4_32_10_t* state, - const int m_lig_begin, const int m_lig_end, - const atom_cuda_t* atoms, - const m_coords_cuda_t* m_coords_gpu, - const float* m_lig_node_origin_gpu, - const float epsilon_fl, const float amplitude) { - int flex_torsion_size = 0; // FIX? 
20210727 - int count_mutable_entities = 2 + c->lig_torsion_size + flex_torsion_size; - int which = curand(state) % count_mutable_entities; - - float random_inside_sphere[4]; - random_inside_sphere_gpu(random_inside_sphere, state); - if (which == 0) { - DEBUG_PRINTF("random sphere r=%f\n", norm3(random_inside_sphere)); - } - - float random_pi = (random_inside_sphere[3] - 0.5) * 2.0 * pi; // ~ U[-pi, pi] - if (which == 0) { - DEBUG_PRINTF("random pi=%f\n", random_pi); - } - - if (which == 0) { - for (int i = 0; i < 3; i++) c->position[i] += amplitude * random_inside_sphere[i]; - return; - } - --which; - if (which == 0) { - float gr - = gyration_radius(m_lig_begin, m_lig_end, atoms, m_coords_gpu, m_lig_node_origin_gpu); - if (gr > epsilon_fl) { - float rotation[3]; - for (int i = 0; i < 3; i++) rotation[i] = amplitude / gr * random_inside_sphere[i]; - quaternion_increment(c->orientation, rotation, epsilon_fl); - } - return; - } - --which; - if (which < c->lig_torsion_size) { - c->lig_torsion[which] = random_pi; - return; - } - which -= c->lig_torsion_size; - - if (flex_torsion_size != 0) { - if (which < flex_torsion_size) { - c->flex_torsion[which] = random_pi; - return; - } - which -= flex_torsion_size; - } -} - -/* Above based on mutate_conf.cpp */ - -/* Below based on matrix.cpp */ - -// symmetric matrix_d (only half of it are stored) -typedef struct { - float data[MAX_HESSIAN_MATRIX_D_SIZE]; - int dim; -} matrix_d; - -__device__ __forceinline__ void matrix_d_init(matrix_d* m, int dim, float fill_data) { - m->dim = dim; - if ((dim * (dim + 1) / 2) > MAX_HESSIAN_MATRIX_D_SIZE) - DEBUG_PRINTF("\nnmatrix_d: matrix_d_init() ERROR!"); - // ((dim * (dim + 1) / 2)*sizeof(float)); // symmetric matrix_d - for (int i = 0; i < (dim * (dim + 1) / 2); i++) m->data[i] = fill_data; - for (int i = (dim * (dim + 1) / 2); i < MAX_HESSIAN_MATRIX_D_SIZE; i++) - m->data[i] = 0; // Others will be 0 -} - -// as rugular 3x3 matrix_d -__device__ __forceinline__ void mat_init(matrix_d* m, float fill_data) { - m->dim = 3; // fixed to 3x3 matrix_d - if (9 > MAX_HESSIAN_MATRIX_D_SIZE) DEBUG_PRINTF("\nnmatrix_d: mat_init() ERROR!"); - for (int i = 0; i < 9; i++) m->data[i] = fill_data; -} - -__device__ __forceinline__ void matrix_d_set_diagonal(matrix_d* m, float fill_data) { - for (int i = 0; i < m->dim; i++) { - m->data[i + i * (i + 1) / 2] = fill_data; - } -} - -// as regular matrix_d -__device__ __forceinline__ void matrix_d_set_element(matrix_d* m, int dim, int x, int y, - float fill_data) { - m->data[x + y * dim] = fill_data; -} - -__device__ __forceinline__ void matrix_d_set_element_tri(matrix_d* m, int x, int y, - float fill_data) { - m->data[x + y * (y + 1) / 2] = fill_data; -} -__device__ __forceinline__ int tri_index(int n, int i, int j) { - if (j >= n || i > j) DEBUG_PRINTF("\nmatrix_d: tri_index ERROR!"); - return i + j * (j + 1) / 2; -} - -__device__ __forceinline__ int index_permissive(const matrix_d* m, int i, int j) { - return (i < j) ? 
tri_index(m->dim, i, j) : tri_index(m->dim, j, i); -} - -/* Above based on matrix_d.cpp */ - -/* Below based on quasi_newton.cpp */ - -__device__ __forceinline__ void change_cuda_init(change_cuda_t* g, const float* ptr) { - for (int i = 0; i < 3; i++) g->position[i] = ptr[i]; - for (int i = 0; i < 3; i++) g->orientation[i] = ptr[i + 3]; - for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) g->lig_torsion[i] = ptr[i + 3 + 3]; - for (int i = 0; i < MAX_NUM_OF_FLEX_TORSION; i++) - g->flex_torsion[i] = ptr[i + 3 + 3 + MAX_NUM_OF_LIG_TORSION]; - g->lig_torsion_size = ptr[3 + 3 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION]; -} - -__device__ __forceinline__ void change_cuda_init_with_change(change_cuda_t* g_new, - const change_cuda_t* g_old) { - for (int i = 0; i < 3; i++) g_new->position[i] = g_old->position[i]; - for (int i = 0; i < 3; i++) g_new->orientation[i] = g_old->orientation[i]; - for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) g_new->lig_torsion[i] = g_old->lig_torsion[i]; - for (int i = 0; i < MAX_NUM_OF_FLEX_TORSION; i++) - g_new->flex_torsion[i] = g_old->flex_torsion[i]; - g_new->lig_torsion_size = g_old->lig_torsion_size; -} - -void print_output_type(output_type_cuda_t* x, int torsion_size) { - for (int i = 0; i < 3; i++) DEBUG_PRINTF("\nx.position[%d] = %0.16f", i, x->position[i]); - for (int i = 0; i < 4; i++) DEBUG_PRINTF("\nx.orientation[%d] = %0.16f", i, x->orientation[i]); - for (int i = 0; i < torsion_size; i++) - DEBUG_PRINTF("\n x.torsion[%d] = %0.16f", i, x->lig_torsion[i]); - DEBUG_PRINTF("\n x.torsion_size = %f", x->lig_torsion_size); - DEBUG_PRINTF("\n !!! x.e = %f\n", x->e); -} - -void print_change(change_cuda_t* g, int torsion_size) { - for (int i = 0; i < 3; i++) DEBUG_PRINTF("\ng.position[%d] = %0.16f", i, g->position[i]); - for (int i = 0; i < 3; i++) DEBUG_PRINTF("\ng.orientation[%d] = %0.16f", i, g->orientation[i]); - for (int i = 0; i < torsion_size; i++) - DEBUG_PRINTF("\ng.torsion[%d] = %0.16f", i, g->lig_torsion[i]); - DEBUG_PRINTF("\ng.torsion_size = %f", g->lig_torsion_size); -} - -__device__ __forceinline__ int num_atom_types(int atu) { - switch (atu) { - case 0: - return EL_TYPE_SIZE; - case 1: - return AD_TYPE_SIZE; - case 2: - return XS_TYPE_SIZE; - case 3: - return SY_TYPE_SIZE; - default: - DEBUG_PRINTF("Kernel1:num_atom_types() ERROR!"); - return INFINITY; - } -} - -__device__ __forceinline__ void elementwise_product(float* out, const float* a, const float* b) { - out[0] = a[0] * b[0]; - out[1] = a[1] * b[1]; - out[2] = a[2] * b[2]; -} - -__device__ __forceinline__ float elementwise_product_sum(const float* a, const float* b) { - return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; -} - -__device__ __forceinline__ float access_m_data(float* m_data, int m_i, int m_j, int i, int j, - int k) { - return m_data[i + m_i * (j + m_j * k)]; -} - -__device__ __forceinline__ bool not_max_gpu(float x) { - return (x < 0.1 * INFINITY); /* Problem: replace max_fl with INFINITY? */ -} - -__device__ __forceinline__ void curl_with_deriv(float* e, float* deriv, float v, - const float epsilon_fl) { - if (*e > 0 && not_max_gpu(v)) { - float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e)); - *e *= tmp; - for (int i = 0; i < 3; i++) deriv[i] *= pow(tmp, 2); - } -} - -__device__ __forceinline__ void curl_without_deriv(float* e, float v, const float epsilon_fl) { - if (*e > 0 && not_max_gpu(v)) { - float tmp = (v < epsilon_fl) ? 
0 : (v / (v + *e)); - *e *= tmp; - } -} - -__device__ __forceinline__ float g_evaluate(grid_cuda_t* g, const float* m_coords, /* double[3] */ - const float slope, /* double */ - const float v, /* double */ - float* deriv, /* double[3] */ - const float epsilon_fl) { - int m_i = g->m_i; - int m_j = g->m_j; - int m_k = g->m_k; - if (m_i * m_j * m_k == 0) DEBUG_PRINTF("\nkernel2: g_evaluate ERROR!#1"); - float tmp_vec[3] - = {m_coords[0] - g->m_init[0], m_coords[1] - g->m_init[1], m_coords[2] - g->m_init[2]}; - float tmp_vec2[3] = {g->m_factor[0], g->m_factor[1], g->m_factor[2]}; - float s[3]; - elementwise_product(s, tmp_vec, tmp_vec2); - - float miss[3] = {0, 0, 0}; - int region[3]; - int a[3]; - int m_data_dims[3] = {m_i, m_j, m_k}; - for (int i = 0; i < 3; i++) { - if (s[i] < 0) { - miss[i] = -s[i]; - region[i] = -1; - a[i] = 0; - s[i] = 0; - } else if (s[i] >= g->m_dim_fl_minus_1[i]) { - miss[i] = s[i] - g->m_dim_fl_minus_1[i]; - region[i] = 1; - if (m_data_dims[i] < 2) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#2"); - a[i] = m_data_dims[i] - 2; - s[i] = 1; - } else { - region[i] = 0; - a[i] = (int)s[i]; - s[i] -= a[i]; - } - if (s[i] < 0) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#3"); - if (s[i] > 1) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#4"); - if (a[i] < 0) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#5"); - if (a[i] + 1 >= m_data_dims[i]) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#5"); - } - - float tmp_m_factor_inv[3] = {g->m_factor_inv[0], g->m_factor_inv[1], g->m_factor_inv[2]}; - const float penalty = slope * elementwise_product_sum(miss, tmp_m_factor_inv); - if (penalty <= -epsilon_fl) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#6"); - - const int x0 = a[0]; - const int y0 = a[1]; - const int z0 = a[2]; - - const int x1 = x0 + 1; - const int y1 = y0 + 1; - const int z1 = z0 + 1; - - const float f000 = access_m_data(g->m_data, m_i, m_j, x0, y0, z0); - const float f100 = access_m_data(g->m_data, m_i, m_j, x1, y0, z0); - const float f010 = access_m_data(g->m_data, m_i, m_j, x0, y1, z0); - const float f110 = access_m_data(g->m_data, m_i, m_j, x1, y1, z0); - const float f001 = access_m_data(g->m_data, m_i, m_j, x0, y0, z1); - const float f101 = access_m_data(g->m_data, m_i, m_j, x1, y0, z1); - const float f011 = access_m_data(g->m_data, m_i, m_j, x0, y1, z1); - const float f111 = access_m_data(g->m_data, m_i, m_j, x1, y1, z1); - - const float x = s[0]; - const float y = s[1]; - const float z = s[2]; - - const float mx = 1 - x; - const float my = 1 - y; - const float mz = 1 - z; - - float f = f000 * mx * my * mz + f100 * x * my * mz + f010 * mx * y * mz + f110 * x * y * mz - + f001 * mx * my * z + f101 * x * my * z + f011 * mx * y * z + f111 * x * y * z; - - if (deriv) { - const float x_g = f000 * (-1) * my * mz + f100 * 1 * my * mz + f010 * (-1) * y * mz - + f110 * 1 * y * mz + f001 * (-1) * my * z + f101 * 1 * my * z - + f011 * (-1) * y * z + f111 * 1 * y * z; - - const float y_g = f000 * mx * (-1) * mz + f100 * x * (-1) * mz + f010 * mx * 1 * mz - + f110 * x * 1 * mz + f001 * mx * (-1) * z + f101 * x * (-1) * z - + f011 * mx * 1 * z + f111 * x * 1 * z; - - const float z_g = f000 * mx * my * (-1) + f100 * x * my * (-1) + f010 * mx * y * (-1) - + f110 * x * y * (-1) + f001 * mx * my * 1 + f101 * x * my * 1 - + f011 * mx * y * 1 + f111 * x * y * 1; - - float gradient[3] = {x_g, y_g, z_g}; - - curl_with_deriv(&f, gradient, v, epsilon_fl); - - float gradient_everywhere[3]; - - for (int i = 0; i < 3; i++) { - gradient_everywhere[i] = ((region[i] == 0) ? 
gradient[i] : 0); - deriv[i] = g->m_factor[i] * gradient_everywhere[i] + slope * region[i]; - } - return f + penalty; - } else { /* none valid pointer */ - DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#7"); - curl_without_deriv(&f, v, epsilon_fl); - return f + penalty; - } -} - -__device__ __forceinline__ float ig_eval_deriv(output_type_cuda_t* x, change_cuda_t* g, - const float v, ig_cuda_t* ig_cuda_gpu, - m_cuda_t* m_cuda_gpu, const float epsilon_fl) { - float e = 0; - int nat = num_atom_types(ig_cuda_gpu->atu); - for (int i = 0; i < m_cuda_gpu->m_num_movable_atoms; i++) { - int t = m_cuda_gpu->atoms[i].types[ig_cuda_gpu->atu]; - if (t >= nat) { - for (int j = 0; j < 3; j++) m_cuda_gpu->minus_forces.coords[i][j] = 0; - continue; - } - float deriv[3]; - - e = e - + g_evaluate(&ig_cuda_gpu->grids[t], m_cuda_gpu->m_coords.coords[i], ig_cuda_gpu->slope, - v, deriv, epsilon_fl); - - for (int j = 0; j < 3; j++) m_cuda_gpu->minus_forces.coords[i][j] = deriv[j]; - } - return e; -} - -__device__ __forceinline__ void quaternion_to_r3(const float* q, float* orientation_m) { - /* Omit assert(quaternion_is_normalized(q)); */ - const float a = q[0]; - const float b = q[1]; - const float c = q[2]; - const float d = q[3]; - - const float aa = a * a; - const float ab = a * b; - const float ac = a * c; - const float ad = a * d; - const float bb = b * b; - const float bc = b * c; - const float bd = b * d; - const float cc = c * c; - const float cd = c * d; - const float dd = d * d; - - /* Omit assert(eq(aa + bb + cc + dd, 1)); */ - matrix_d tmp; - mat_init(&tmp, 0); /* matrix_d with fixed dimension 3(here we treate this as a regular - matrix_d(not triangular matrix_d!)) */ - - matrix_d_set_element(&tmp, 3, 0, 0, (aa + bb - cc - dd)); - matrix_d_set_element(&tmp, 3, 0, 1, 2 * (-ad + bc)); - matrix_d_set_element(&tmp, 3, 0, 2, 2 * (ac + bd)); - - matrix_d_set_element(&tmp, 3, 1, 0, 2 * (ad + bc)); - matrix_d_set_element(&tmp, 3, 1, 1, (aa - bb + cc - dd)); - matrix_d_set_element(&tmp, 3, 1, 2, 2 * (-ab + cd)); - - matrix_d_set_element(&tmp, 3, 2, 0, 2 * (-ac + bd)); - matrix_d_set_element(&tmp, 3, 2, 1, 2 * (ab + cd)); - matrix_d_set_element(&tmp, 3, 2, 2, (aa - bb - cc + dd)); - - for (int i = 0; i < 9; i++) orientation_m[i] = tmp.data[i]; -} - -__device__ __forceinline__ void local_to_lab_direction(float* out, const float* local_direction, - const float* orientation_m) { - out[0] = orientation_m[0] * local_direction[0] + orientation_m[3] * local_direction[1] - + orientation_m[6] * local_direction[2]; - out[1] = orientation_m[1] * local_direction[0] + orientation_m[4] * local_direction[1] - + orientation_m[7] * local_direction[2]; - out[2] = orientation_m[2] * local_direction[0] + orientation_m[5] * local_direction[1] - + orientation_m[8] * local_direction[2]; -} - -__device__ __forceinline__ void local_to_lab(float* out, const float* origin, - const float* local_coords, - const float* orientation_m) { - out[0] = origin[0] - + (orientation_m[0] * local_coords[0] + orientation_m[3] * local_coords[1] - + orientation_m[6] * local_coords[2]); - out[1] = origin[1] - + (orientation_m[1] * local_coords[0] + orientation_m[4] * local_coords[1] - + orientation_m[7] * local_coords[2]); - out[2] = origin[2] - + (orientation_m[2] * local_coords[0] + orientation_m[5] * local_coords[1] - + orientation_m[8] * local_coords[2]); -} - -__device__ __forceinline__ void angle_to_quaternion2(float* out, const float* axis, float angle) { - normalize_angle(&angle); - float c = cos(angle / 2); - float s = sin(angle / 2); - out[0] = c; 
- out[1] = s * axis[0]; - out[2] = s * axis[1]; - out[3] = s * axis[2]; -} - -__device__ __forceinline__ void set(const output_type_cuda_t* x, rigid_cuda_t* lig_rigid_gpu, - m_coords_cuda_t* m_coords_gpu, const atom_cuda_t* atoms, - const int m_num_movable_atoms, const float epsilon_fl) { - for (int i = 0; i < 3; i++) lig_rigid_gpu->origin[0][i] = x->position[i]; - for (int i = 0; i < 4; i++) lig_rigid_gpu->orientation_q[0][i] = x->orientation[i]; - quaternion_to_r3(lig_rigid_gpu->orientation_q[0], - lig_rigid_gpu->orientation_m[0]); /* set orientation_m */ - - int begin = lig_rigid_gpu->atom_range[0][0]; - int end = lig_rigid_gpu->atom_range[0][1]; - for (int i = begin; i < end; i++) { - local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[0], atoms[i].coords, - lig_rigid_gpu->orientation_m[0]); - } - /* ************* end node.set_conf ************* */ - - /* ************* branches_set_conf ************* */ - /* update nodes in depth-first order */ - for (int current = 1; current < lig_rigid_gpu->num_children + 1; - current++) { /* current starts from 1 (namely starts from first child node) */ - int parent = lig_rigid_gpu->parent[current]; - float torsion = x->lig_torsion[current - 1]; /* torsions are all related to child nodes */ - local_to_lab(lig_rigid_gpu->origin[current], lig_rigid_gpu->origin[parent], - lig_rigid_gpu->relative_origin[current], lig_rigid_gpu->orientation_m[parent]); - local_to_lab_direction(lig_rigid_gpu->axis[current], lig_rigid_gpu->relative_axis[current], - lig_rigid_gpu->orientation_m[parent]); - float tmp[4]; - float parent_q[4] - = {lig_rigid_gpu->orientation_q[parent][0], lig_rigid_gpu->orientation_q[parent][1], - lig_rigid_gpu->orientation_q[parent][2], lig_rigid_gpu->orientation_q[parent][3]}; - float current_axis[3] = {lig_rigid_gpu->axis[current][0], lig_rigid_gpu->axis[current][1], - lig_rigid_gpu->axis[current][2]}; - - angle_to_quaternion2(tmp, current_axis, torsion); - angle_to_quaternion_multi(tmp, parent_q); - quaternion_normalize_approx(tmp, epsilon_fl); - - for (int i = 0; i < 4; i++) - lig_rigid_gpu->orientation_q[current][i] = tmp[i]; /* set orientation_q */ - quaternion_to_r3(lig_rigid_gpu->orientation_q[current], - lig_rigid_gpu->orientation_m[current]); /* set orientation_m */ - - /* set coords */ - begin = lig_rigid_gpu->atom_range[current][0]; - end = lig_rigid_gpu->atom_range[current][1]; - for (int i = begin; i < end; i++) { - local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[current], atoms[i].coords, - lig_rigid_gpu->orientation_m[current]); - } - } - /* ************* end branches_set_conf ************* */ -} - -__device__ __forceinline__ void p_eval_deriv(float* out, int type_pair_index, float r2, - p_cuda_t* p_cuda_gpu, const float epsilon_fl) { - const float cutoff_sqr = p_cuda_gpu->m_cutoff_sqr; - if (r2 > cutoff_sqr) - DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!, r2 > Cutoff_sqr, r2=%f, cutoff_sqr=%f", r2, - cutoff_sqr); - - p_m_data_cuda_t* tmp = &p_cuda_gpu->m_data[type_pair_index]; - float r2_factored = tmp->factor * r2; - int i1 = (int)(r2_factored); - int i2 = i1 + 1; - float rem = r2_factored - i1; - if (rem < -epsilon_fl) DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!"); - if (rem >= 1 + epsilon_fl) DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!"); - float p1[2] = {tmp->smooth[i1][0], tmp->smooth[i1][1]}; - if (i1 >= SMOOTH_SIZE) p1[0] = p1[1] = 0; - float p2[2] = {tmp->smooth[i2][0], tmp->smooth[i2][1]}; - if (i2 >= SMOOTH_SIZE) p2[0] = p2[1] = 0; - float e = p1[0] + rem * (p2[0] - p1[0]); - float dor = 
p1[1] + rem * (p2[1] - p1[1]); - out[0] = e; - out[1] = dor; -} - -__device__ __forceinline__ void curl(float* e, float* deriv, float v, const float epsilon_fl) { - if (*e > 0 && not_max_gpu(v)) { - float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e)); - (*e) = tmp * (*e); - for (int i = 0; i < 3; i++) deriv[i] = deriv[i] * (tmp * tmp); - } -} - -__device__ __forceinline__ float eval_interacting_pairs_deriv(p_cuda_t* p_cuda_gpu, const float v, - const lig_pairs_cuda_t* pairs, - const m_coords_cuda_t* m_coords, - m_minus_forces_t* minus_forces, - const float epsilon_fl) { - float e = 0; - for (int i = 0; i < pairs->num_pairs; i++) { - const int ip[3] = {pairs->type_pair_index[i], pairs->a[i], pairs->b[i]}; - int index = pairs->a[i] + pairs->b[i] * (pairs->b[i] + 1) / 2; - float coords_b[3] - = {m_coords->coords[ip[2]][0], m_coords->coords[ip[2]][1], m_coords->coords[ip[2]][2]}; - float coords_a[3] - = {m_coords->coords[ip[1]][0], m_coords->coords[ip[1]][1], m_coords->coords[ip[1]][2]}; - float r[3] - = {coords_b[0] - coords_a[0], coords_b[1] - coords_a[1], coords_b[2] - coords_a[2]}; - float r2 = r[0] * r[0] + r[1] * r[1] + r[2] * r[2]; - - if (r2 < p_cuda_gpu->m_cutoff_sqr) { - float tmp[2]; - p_eval_deriv(tmp, index, r2, p_cuda_gpu, epsilon_fl); - float force[3] = {r[0] * tmp[1], r[1] * tmp[1], r[2] * tmp[1]}; - curl(&tmp[0], force, v, epsilon_fl); - e += tmp[0]; - for (int j = 0; j < 3; j++) minus_forces->coords[ip[1]][j] -= force[j]; - for (int j = 0; j < 3; j++) minus_forces->coords[ip[2]][j] += force[j]; - } - } - return e; -} - -__device__ __forceinline__ void product(float* res, const float* a, const float* b) { - res[0] = a[1] * b[2] - a[2] * b[1]; - res[1] = a[2] * b[0] - a[0] * b[2]; - res[2] = a[0] * b[1] - a[1] * b[0]; -} - -__device__ __forceinline__ void POT_deriv(const m_minus_forces_t* minus_forces, - const rigid_cuda_t* lig_rigid_gpu, - const m_coords_cuda_t* m_coords, change_cuda_t* g) { - int num_torsion = lig_rigid_gpu->num_children; - int num_rigid = num_torsion + 1; - float position_derivative_tmp[MAX_NUM_OF_RIGID][3]; - float position_derivative[MAX_NUM_OF_RIGID][3]; - float orientation_derivative_tmp[MAX_NUM_OF_RIGID][3]; - float orientation_derivative[MAX_NUM_OF_RIGID][3]; - float torsion_derivative[MAX_NUM_OF_RIGID]; /* torsion_derivative[0] has no meaning(root node - has no torsion) */ - - for (int i = 0; i < num_rigid; i++) { - int begin = lig_rigid_gpu->atom_range[i][0]; - int end = lig_rigid_gpu->atom_range[i][1]; - for (int k = 0; k < 3; k++) position_derivative_tmp[i][k] = 0; - for (int k = 0; k < 3; k++) orientation_derivative_tmp[i][k] = 0; - for (int j = begin; j < end; j++) { - for (int k = 0; k < 3; k++) position_derivative_tmp[i][k] += minus_forces->coords[j][k]; - - float tmp1[3] = {m_coords->coords[j][0] - lig_rigid_gpu->origin[i][0], - m_coords->coords[j][1] - lig_rigid_gpu->origin[i][1], - m_coords->coords[j][2] - lig_rigid_gpu->origin[i][2]}; - float tmp2[3] = {minus_forces->coords[j][0], minus_forces->coords[j][1], - minus_forces->coords[j][2]}; - float tmp3[3]; - product(tmp3, tmp1, tmp2); - for (int k = 0; k < 3; k++) orientation_derivative_tmp[i][k] += tmp3[k]; - } - } - - /* position_derivative */ - for (int i = num_rigid - 1; i >= 0; i--) { /* from bottom to top */ - for (int k = 0; k < 3; k++) position_derivative[i][k] = position_derivative_tmp[i][k]; - /* looking for chidren node */ - for (int j = 0; j < num_rigid; j++) { - if (lig_rigid_gpu->children_map[i][j] == true) { - for (int k = 0; k < 3; k++) - position_derivative[i][k] += 
position_derivative[j][k]; /* self+children node */ - } - } - } - - /* orientation_derivetive */ - for (int i = num_rigid - 1; i >= 0; i--) { /* from bottom to top */ - for (int k = 0; k < 3; k++) orientation_derivative[i][k] = orientation_derivative_tmp[i][k]; - /* looking for chidren node */ - for (int j = 0; j < num_rigid; j++) { - if (lig_rigid_gpu->children_map[i][j] == true) { /* self + children node + product */ - for (int k = 0; k < 3; k++) - orientation_derivative[i][k] += orientation_derivative[j][k]; - float product_out[3]; - float origin_temp[3] = {lig_rigid_gpu->origin[j][0] - lig_rigid_gpu->origin[i][0], - lig_rigid_gpu->origin[j][1] - lig_rigid_gpu->origin[i][1], - lig_rigid_gpu->origin[j][2] - lig_rigid_gpu->origin[i][2]}; - product(product_out, origin_temp, position_derivative[j]); - for (int k = 0; k < 3; k++) orientation_derivative[i][k] += product_out[k]; - } - } - } - - /* torsion_derivative */ - for (int i = num_rigid - 1; i >= 0; i--) { - float sum = 0; - for (int j = 0; j < 3; j++) sum += orientation_derivative[i][j] * lig_rigid_gpu->axis[i][j]; - torsion_derivative[i] = sum; - } - - for (int k = 0; k < 3; k++) g->position[k] = position_derivative[0][k]; - for (int k = 0; k < 3; k++) g->orientation[k] = orientation_derivative[0][k]; - for (int k = 0; k < num_torsion; k++) g->lig_torsion[k] = torsion_derivative[k + 1]; -} - -__device__ __forceinline__ float m_eval_deriv(output_type_cuda_t* c, change_cuda_t* g, - m_cuda_t* m_cuda_gpu, p_cuda_t* p_cuda_gpu, - ig_cuda_t* ig_cuda_gpu, const float* v, - const float epsilon_fl) { - // check set args - set(c, &m_cuda_gpu->ligand.rigid, &m_cuda_gpu->m_coords, m_cuda_gpu->atoms, - m_cuda_gpu->m_num_movable_atoms, epsilon_fl); - - float e = 0; - e = ig_eval_deriv(c, g, v[1], ig_cuda_gpu, m_cuda_gpu, epsilon_fl); - e += eval_interacting_pairs_deriv(p_cuda_gpu, v[0], &m_cuda_gpu->ligand.pairs, - &m_cuda_gpu->m_coords, &m_cuda_gpu->minus_forces, epsilon_fl); - // should add derivs for glue, other and inter pairs - POT_deriv(&m_cuda_gpu->minus_forces, &m_cuda_gpu->ligand.rigid, &m_cuda_gpu->m_coords, g); - - return e; -} - -__device__ __forceinline__ float find_change_index_read(const change_cuda_t* g, int index) { - if (index < 3) return g->position[index]; - index -= 3; - if (index < 3) return g->orientation[index]; - index -= 3; - if (index < g->lig_torsion_size) return g->lig_torsion[index]; - DEBUG_PRINTF("\nKernel2:find_change_index_read() ERROR!"); /* Shouldn't be here */ -} - -__device__ __forceinline__ void find_change_index_write(change_cuda_t* g, int index, float data) { - if (index < 3) { - g->position[index] = data; - return; - } - index -= 3; - if (index < 3) { - g->orientation[index] = data; - return; - } - index -= 3; - if (index < g->lig_torsion_size) { - g->lig_torsion[index] = data; - return; - } - DEBUG_PRINTF("\nKernel2:find_change_index_write() ERROR!"); /* Shouldn't be here */ -} - -__device__ __forceinline__ void minus_mat_vec_product(const matrix_d* h, const change_cuda_t* in, - change_cuda_t* out) { - int n = h->dim; - for (int i = 0; i < n; i++) { - float sum = 0; - for (int j = 0; j < n; j++) { - sum += h->data[index_permissive(h, i, j)] * find_change_index_read(in, j); - } - find_change_index_write(out, i, -sum); - } -} - -__device__ __forceinline__ float scalar_product(const change_cuda_t* a, const change_cuda_t* b, - int n) { - float tmp = 0; - for (int i = 0; i < n; i++) { - tmp += find_change_index_read(a, i) * find_change_index_read(b, i); - } - return tmp; -} - -__device__ __forceinline__ float 
line_search(m_cuda_t* m_cuda_gpu, p_cuda_t* p_cuda_gpu, - ig_cuda_t* ig_cuda_gpu, int n, - const output_type_cuda_t* x, const change_cuda_t* g, - const float f0, const change_cuda_t* p, - output_type_cuda_t* x_new, change_cuda_t* g_new, - float* f1, const float epsilon_fl, - const float* hunt_cap) { - const float c0 = 0.0001; - const int max_trials = 10; - const float multiplier = 0.5; - float alpha = 1; - - const float pg = scalar_product(p, g, n); - - for (int trial = 0; trial < max_trials; trial++) { - output_type_cuda_init_with_output(x_new, x); - output_type_cuda_increment(x_new, p, alpha, epsilon_fl); - *f1 = m_eval_deriv(x_new, g_new, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, hunt_cap, epsilon_fl); - if (*f1 - f0 < c0 * alpha * pg) break; - alpha *= multiplier; - } - return alpha; -} - -__device__ __forceinline__ bool bfgs_update(matrix_d* h, const change_cuda_t* p, - const change_cuda_t* y, const float alpha, - const float epsilon_fl) { - const float yp = scalar_product(y, p, h->dim); - - if (alpha * yp < epsilon_fl) return false; - change_cuda_t minus_hy; - change_cuda_init_with_change(&minus_hy, y); - minus_mat_vec_product(h, y, &minus_hy); - const float yhy = -scalar_product(y, &minus_hy, h->dim); - const float r = 1 / (alpha * yp); - const int n = 6 + p->lig_torsion_size; - - for (int i = 0; i < n; i++) { - for (int j = i; j < n; j++) { - float tmp - = alpha * r - * (find_change_index_read(&minus_hy, i) * find_change_index_read(p, j) - + find_change_index_read(&minus_hy, j) * find_change_index_read(p, i)) - + +alpha * alpha * (r * r * yhy + r) * find_change_index_read(p, i) - * find_change_index_read(p, j); - - h->data[i + j * (j + 1) / 2] += tmp; - } - } - - return true; -} - -__device__ __forceinline__ void bfgs(output_type_cuda_t* x, change_cuda_t* g, m_cuda_t* m_cuda_gpu, - p_cuda_t* p_cuda_gpu, ig_cuda_t* ig_cuda_gpu, - const float* hunt_cap, const float epsilon_fl, - const int max_steps) { - int n = 3 + 3 + x->lig_torsion_size; /* the dimensions of matirx */ - - matrix_d h; - matrix_d_init(&h, n, 0); - matrix_d_set_diagonal(&h, 1); - - change_cuda_t g_new; - change_cuda_init_with_change(&g_new, g); - - output_type_cuda_t x_new; - output_type_cuda_init_with_output(&x_new, x); - - float f0 = m_eval_deriv(x, g, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, hunt_cap, epsilon_fl); - - float f_orig = f0; - /* Init g_orig, x_orig */ - change_cuda_t g_orig; - change_cuda_init_with_change(&g_orig, g); - output_type_cuda_t x_orig; - output_type_cuda_init_with_output(&x_orig, x); - /* Init p */ - change_cuda_t p; - change_cuda_init_with_change(&p, g); - - for (int step = 0; step < max_steps; step++) { - minus_mat_vec_product(&h, g, &p); - float f1 = 0; - - const float alpha = line_search(m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, n, x, g, f0, &p, - &x_new, &g_new, &f1, epsilon_fl, hunt_cap); - - change_cuda_t y; - change_cuda_init_with_change(&y, &g_new); - /* subtract_change */ - for (int i = 0; i < n; i++) { - float tmp = find_change_index_read(&y, i) - find_change_index_read(g, i); - find_change_index_write(&y, i, tmp); - } - // f_values[step + 1] = f1; - f0 = f1; - output_type_cuda_init_with_output(x, &x_new); - if (!(sqrt(scalar_product(g, g, n)) >= 1e-5)) break; - change_cuda_init_with_change(g, &g_new); - - if (step == 0) { - float yy = scalar_product(&y, &y, n); - if (fabs(yy) > epsilon_fl) { - matrix_d_set_diagonal(&h, alpha * scalar_product(&y, &p, n) / yy); - } - } - - bool h_updated = bfgs_update(&h, &p, &y, alpha, epsilon_fl); - } - - if (!(f0 <= f_orig)) { - f0 = f_orig; - 
output_type_cuda_init_with_output(x, &x_orig); - change_cuda_init_with_change(g, &g_orig); - } - - // write output_type_cuda energy - x->e = f0; -} +#include "quasi_newton.h" +#include <cooperative_groups.h> +#include +#include -/* Above based on quasi_newton.cpp */ +namespace cg = cooperative_groups; /* Below is monte-carlo kernel, based on kernel.cl*/ -__device__ __forceinline__ void m_cuda_init_with_m_cuda(const m_cuda_t* m_cuda_old, - m_cuda_t* m_cuda_new) { - memcpy(m_cuda_new, m_cuda_old, MAX_NUM_OF_ATOMS * sizeof(atom_cuda_t)); - m_cuda_new->m_coords = m_cuda_old->m_coords; - m_cuda_new->minus_forces = m_cuda_old->minus_forces; - m_cuda_new->ligand = m_cuda_old->ligand; - m_cuda_new->m_num_movable_atoms = m_cuda_old->m_num_movable_atoms; -} - -__device__ __forceinline__ void get_heavy_atom_movable_coords(output_type_cuda_t* tmp, - const m_cuda_t* m_cuda_gpu) { +__device__ __forceinline__ void get_heavy_atom_movable_coords(output_type_cuda_t *tmp, + const m_cuda_t *m_cuda_gpu) { int counter = 0; for (int i = 0; i < m_cuda_gpu->m_num_movable_atoms; i++) { if (m_cuda_gpu->atoms[i].types[0] != EL_TYPE_H) { for (int j = 0; j < 3; j++) tmp->coords[counter][j] = m_cuda_gpu->m_coords.coords[i][j]; counter++; } else { - // DEBUG_PRINTF("\n P2: removed H atom coords in get_heavy_atom_movable_coords()!"); + // DEBUG_PRINTF("\n P2: removed H atom coords in + // get_heavy_atom_movable_coords()!"); } } /* assign 0 for others */ @@ -1001,8 +64,8 @@ __device__ __forceinline__ void get_heavy_atom_movable_coords(output_type_cuda_t } } -__device__ __forceinline__ float generate_n(const float* pi_map, const int step) { - return fabs(pi_map[step]) / M_PI; +__device__ __forceinline__ float generate_n(const float *pi_map, const int step) { + return fabs(pi_map[step]) / M_PI_F; } __device__ __forceinline__ bool metropolis_accept(float old_f, float new_f, float temperature, @@ -1012,8 +75,8 @@ __device__ __forceinline__ bool metropolis_accept(float old_f, float new_f, floa return n < acceptance_probability; } -__device__ __forceinline__ void write_back(output_type_cuda_t* results, - const output_type_cuda_t* best_out) { +__device__ __forceinline__ void write_back(output_type_cuda_t *results, + const output_type_cuda_t *best_out) { for (int i = 0; i < 3; i++) results->position[i] = best_out->position[i]; for (int i = 0; i < 4; i++) results->orientation[i] = best_out->orientation[i]; for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) @@ -1028,76 +91,125 @@ __device__ __forceinline__ void write_back(output_type_cuda_t *results, } } } -// MAX_THREADS_PER_BLOCK and MIN_BLOCKS_PER_MP should be adjusted according to the profiling results +// MAX_THREADS_PER_BLOCK and MIN_BLOCKS_PER_MP should be adjusted according to +// the profiling results #define MAX_THREADS_PER_BLOCK 32 #define MIN_BLOCKS_PER_MP 32 +template <unsigned int TileSize> __global__ __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) void kernel( - m_cuda_t* m_cuda_global, ig_cuda_t* ig_cuda_gpu, p_cuda_t* p_cuda_gpu, - float* rand_molec_struc_gpu, float* best_e_gpu, int bfgs_max_steps, float mutation_amplitude, - curandStatePhilox4_32_10_t* states, unsigned long long seed, float epsilon_fl, - float* hunt_cap_gpu, float* authentic_v_gpu, output_type_cuda_t* results, int search_depth, - int num_of_ligands, int threads_per_ligand, bool multi_bias) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; + m_cuda_t *m_cuda_global, ig_cuda_t *ig_cuda_gpu, p_cuda_t *p_cuda_gpu, + float *rand_molec_struc_gpu, float *best_e_gpu, int bfgs_max_steps, float mutation_amplitude,
+ curandStatePhilox4_32_10_t *states, unsigned long long seed, float epsilon_fl, + float *hunt_cap_gpu, float *authentic_v_gpu, output_type_cuda_t *results, + output_type_cuda_t *output_aux, change_cuda_t *change_aux, pot_cuda_t *pot_aux, + matrix_d *h_cuda_gpu, m_cuda_t *m_cuda_gpu, int search_depth, int num_of_ligands, + int threads_per_ligand, bool multi_bias) { + int bid = blockIdx.x, tid = threadIdx.x; + int pose_id = (bid * warpSize + tid) / TileSize; + auto tb = cg::this_thread_block(); + cg::thread_block_tile<TileSize> tile = cg::tiled_partition<TileSize>(tb); + if (m_cuda_global[pose_id / threads_per_ligand].m_num_movable_atoms == -1) { + return; + } + float best_e = INFINITY; + // __shared__ output_type_cuda_t tmp, best_out, candidate, x_new, x_orig; + output_type_cuda_t &tmp = output_aux[pose_id * 5]; + output_type_cuda_t &best_out = output_aux[pose_id * 5 + 1]; + output_type_cuda_t &candidate = output_aux[pose_id * 5 + 2]; + output_type_cuda_t &x_new = output_aux[pose_id * 5 + 3]; + output_type_cuda_t &x_orig = output_aux[pose_id * 5 + 4]; + + // __shared__ change_cuda_t g, tmp1, tmp2, tmp3, tmp4; + change_cuda_t &g = change_aux[pose_id * 5]; + change_cuda_t &tmp1 = change_aux[pose_id * 5 + 1]; + change_cuda_t &tmp2 = change_aux[pose_id * 5 + 2]; + change_cuda_t &tmp3 = change_aux[pose_id * 5 + 3]; + change_cuda_t &tmp4 = change_aux[pose_id * 5 + 4]; + + if (pose_id < num_of_ligands * threads_per_ligand) { + output_type_cuda_init_warp( + tile, &tmp, rand_molec_struc_gpu + pose_id * (SIZE_OF_MOLEC_STRUC / sizeof(float))); + + m_cuda_init_with_m_cuda_warp(tile, &m_cuda_global[pose_id / threads_per_ligand], + &m_cuda_gpu[pose_id]); + + if (tile.thread_rank() == 0) { + curand_init(seed, pose_id, 0, &states[pose_id]); + g.lig_torsion_size = tmp.lig_torsion_size; + } + tile.sync(); - if (idx < num_of_ligands * threads_per_ligand) { - // if (idx % 100 == 0)DEBUG_PRINTF("\nThread %d START", idx); - output_type_cuda_t tmp; // private memory, shared only in work item - change_cuda_t g; - m_cuda_t m_cuda_gpu; - // update pointer to get correct ligand data - output_type_cuda_init(&tmp, - rand_molec_struc_gpu + idx * (SIZE_OF_MOLEC_STRUC / sizeof(float))); - curand_init(seed, idx, 0, &states[idx]); - m_cuda_init_with_m_cuda(m_cuda_global + idx / threads_per_ligand, &m_cuda_gpu); if (multi_bias) { - ig_cuda_gpu = ig_cuda_gpu + idx / threads_per_ligand; + ig_cuda_gpu += pose_id / threads_per_ligand; } - if (m_cuda_gpu.m_num_movable_atoms == -1) { - return; - } - p_cuda_gpu = p_cuda_gpu + idx / threads_per_ligand; - g.lig_torsion_size = tmp.lig_torsion_size; - // BFGS - output_type_cuda_t best_out; - output_type_cuda_t candidate; + pot_aux += pose_id; + p_cuda_gpu += pose_id / threads_per_ligand; + // BFGS for (int step = 0; step < search_depth; step++) { - output_type_cuda_init_with_output(&candidate, &tmp); - mutate_conf_cuda(bfgs_max_steps, &candidate, &states[idx], m_cuda_gpu.ligand.begin, - m_cuda_gpu.ligand.end, m_cuda_gpu.atoms, &m_cuda_gpu.m_coords, - m_cuda_gpu.ligand.rigid.origin[0], epsilon_fl, mutation_amplitude); - bfgs(&candidate, &g, &m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, hunt_cap_gpu, epsilon_fl, - bfgs_max_steps); - // n ~ U[0,1] - float n = curand_uniform(&states[idx]); - - // if (idx == 0) - // DEBUG_PRINTF("metropolis_accept tmp.e=%f, candidate.e=%f, n=%f\n", tmp.e, - // candidate.e, n); - - if (step == 0 || metropolis_accept(tmp.e, candidate.e, 1.2, n)) { - output_type_cuda_init_with_output(&tmp, &candidate); - set(&tmp, &m_cuda_gpu.ligand.rigid, &m_cuda_gpu.m_coords, m_cuda_gpu.atoms, - m_cuda_gpu.m_num_movable_atoms, epsilon_fl);
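
/*
 * The block above replaces the old one-thread-per-pose loop: a 32-thread
 * block is split into TileSize-wide tiles, each tile owns one pose, lane 0
 * does the serial work (cuRAND draws, scalar bookkeeping) and broadcasts the
 * result with shfl. A stripped-down sketch of the idiom; all names here are
 * illustrative, not part of this patch.
 */
#include <cooperative_groups.h>
namespace cg_demo = cooperative_groups;

template <unsigned int TS>
__global__ void one_pose_per_tile(float *per_pose_out, int num_poses) {
    auto tile = cg_demo::tiled_partition<TS>(cg_demo::this_thread_block());
    int pose = (blockIdx.x * blockDim.x + threadIdx.x) / TS;
    if (pose >= num_poses) return;
    float n = 0.f;
    if (tile.thread_rank() == 0) n = 0.42f;  // e.g. a per-pose random draw
    n = tile.shfl(n, 0);                     // broadcast lane 0's value
    tile.sync();                             // publish any tile-shared writes
    if (tile.thread_rank() == 0) per_pose_out[pose] = n;
}
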
+ output_type_cuda_init_with_output_warp(tile, &candidate, &tmp); + + if (tile.thread_rank() == 0) + mutate_conf_cuda(bfgs_max_steps, &candidate, &states[pose_id], + m_cuda_gpu[pose_id].ligand.begin, m_cuda_gpu[pose_id].ligand.end, + m_cuda_gpu[pose_id].atoms, &m_cuda_gpu[pose_id].m_coords, + m_cuda_gpu[pose_id].ligand.rigid.origin[0], epsilon_fl, + mutation_amplitude); + tile.sync(); + + bfgs_warp(tile, &candidate, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, + &h_cuda_gpu[pose_id], &m_cuda_gpu[pose_id], p_cuda_gpu, ig_cuda_gpu, pot_aux, + hunt_cap_gpu, epsilon_fl, bfgs_max_steps); + + bool accepted; + if (tile.thread_rank() == 0) { + // n ~ U[0,1] + float n = curand_uniform(&states[pose_id]); + accepted = metropolis_accept(tmp.e, candidate.e, 1.2, n); + } + accepted = tile.shfl(accepted, 0); + + if (step == 0 || accepted) { + output_type_cuda_init_with_output_warp(tile, &tmp, &candidate); + + if (tile.thread_rank() == 0) { + set(&tmp, &m_cuda_gpu[pose_id].ligand.rigid, &m_cuda_gpu[pose_id].m_coords, + m_cuda_gpu[pose_id].atoms, m_cuda_gpu[pose_id].m_num_movable_atoms, + epsilon_fl); + } + tile.sync(); + if (tmp.e < best_e) { - bfgs(&tmp, &g, &m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, authentic_v_gpu, - epsilon_fl, bfgs_max_steps); + bfgs_warp(tile, &tmp, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, + &h_cuda_gpu[pose_id], &m_cuda_gpu[pose_id], p_cuda_gpu, ig_cuda_gpu, + pot_aux, authentic_v_gpu, epsilon_fl, bfgs_max_steps); + // set if (tmp.e < best_e) { - set(&tmp, &m_cuda_gpu.ligand.rigid, &m_cuda_gpu.m_coords, m_cuda_gpu.atoms, - m_cuda_gpu.m_num_movable_atoms, epsilon_fl); - output_type_cuda_init_with_output(&best_out, &tmp); - get_heavy_atom_movable_coords(&best_out, &m_cuda_gpu); // get coords + if (tile.thread_rank() == 0) + set(&tmp, &m_cuda_gpu[pose_id].ligand.rigid, + &m_cuda_gpu[pose_id].m_coords, m_cuda_gpu[pose_id].atoms, + m_cuda_gpu[pose_id].m_num_movable_atoms, epsilon_fl); + tile.sync(); + + output_type_cuda_init_with_output_warp(tile, &best_out, &tmp); + + if (tile.thread_rank() == 0) { + get_heavy_atom_movable_coords(&best_out, + &m_cuda_gpu[pose_id]); // get coords + } + tile.sync(); + best_e = tmp.e; } } } } + // write the best conformation back to CPU // FIX?? should add more - write_back(results + idx, &best_out); - // if (idx % 100 == 0) DEBUG_PRINTF("\nThread %d FINISH", idx); + write_back_warp(tile, results + pose_id, &best_out); } } @@ -1136,30 +248,31 @@ std::vector<output_type> monte_carlo::cuda_to_vina(output_type_cuda_t results_pt } __host__ void monte_carlo::operator()( - std::vector<model>& m_gpu, std::vector<output_container>& out_gpu, - std::vector<precalculate_byatom>& p_gpu, triangular_matrix_cuda_t* m_data_list_gpu, - const igrid& ig, const vec& corner1, const vec& corner2, rng& generator, int verbosity, - unsigned long long seed, std::vector<std::vector<bias_element>>& bias_batch_list) const { + std::vector<model> &m_gpu, std::vector<output_container> &out_gpu, + std::vector<precalculate_byatom> &p_gpu, triangular_matrix_cuda_t *m_data_list_gpu, + const igrid &ig, const vec &corner1, const vec &corner2, rng &generator, int verbosity, + unsigned long long seed, std::vector<std::vector<bias_element>> &bias_batch_list) const { /* Definitions from vina1.2 */ DEBUG_PRINTF("entering CUDA monte_carlo search\n"); // debug - vec authentic_v(1000, 1000, 1000); // FIXME? this is here to avoid max_fl/max_fl + vec authentic_v(1000, 1000, + 1000); // FIXME? this is here to avoid max_fl/max_fl quasi_newton quasi_newton_par; const int quasi_newton_par_max_steps = local_steps; // no need to decrease step /* Allocate CPU memory and define new data structure */ DEBUG_PRINTF("Allocating CPU memory\n"); // debug - m_cuda_t* m_cuda; + m_cuda_t *m_cuda; checkCUDA(cudaMallocHost(&m_cuda, sizeof(m_cuda_t))); - output_type_cuda_t* rand_molec_struc_tmp; + output_type_cuda_t *rand_molec_struc_tmp; checkCUDA(cudaMallocHost(&rand_molec_struc_tmp, sizeof(output_type_cuda_t))); - ig_cuda_t* ig_cuda_ptr; + ig_cuda_t *ig_cuda_ptr; checkCUDA(cudaMallocHost(&ig_cuda_ptr, sizeof(ig_cuda_t))); - p_cuda_t_cpu* p_cuda; + p_cuda_t_cpu *p_cuda; checkCUDA(cudaMallocHost(&p_cuda, sizeof(p_cuda_t_cpu))); /* End CPU allocation */ @@ -1176,47 +289,53 @@ __host__ void monte_carlo::operator()( DEBUG_PRINTF("p_cuda_size_gpu=%lu\n", p_cuda_size_gpu); // rand_molec_struc_gpu - float* rand_molec_struc_gpu; + float *rand_molec_struc_gpu; checkCUDA(cudaMalloc(&rand_molec_struc_gpu, thread * SIZE_OF_MOLEC_STRUC)); // best_e_gpu - float* best_e_gpu; + float *best_e_gpu; float epsilon_fl_float = static_cast<float>(epsilon_fl); checkCUDA(cudaMalloc(&best_e_gpu, sizeof(float))); checkCUDA(cudaMemcpy(best_e_gpu, &max_fl, sizeof(float), cudaMemcpyHostToDevice)); // use cuRand to generate random values on GPU - curandStatePhilox4_32_10_t* states; + curandStatePhilox4_32_10_t *states; DEBUG_PRINTF("random states size=%lu\n", sizeof(curandStatePhilox4_32_10_t) * thread); checkCUDA(cudaMalloc(&states, sizeof(curandStatePhilox4_32_10_t) * thread)); // hunt_cap_gpu - float* hunt_cap_gpu; + float *hunt_cap_gpu; float hunt_cap_float[3] = {static_cast<float>(hunt_cap[0]), static_cast<float>(hunt_cap[1]), static_cast<float>(hunt_cap[2])}; checkCUDA(cudaMalloc(&hunt_cap_gpu, 3 * sizeof(float))); // Preparing m related data - m_cuda_t* m_cuda_gpu; + m_cuda_t *m_cuda_gpu; DEBUG_PRINTF("m_cuda_size=%lu", m_cuda_size); checkCUDA(cudaMalloc(&m_cuda_gpu, num_of_ligands * m_cuda_size)); // Preparing p related data - p_cuda_t* p_cuda_gpu; + p_cuda_t *p_cuda_gpu; checkCUDA(cudaMalloc(&p_cuda_gpu, num_of_ligands * p_cuda_size_gpu)); DEBUG_PRINTF("p_cuda_gpu=%p\n", p_cuda_gpu); // Preparing ig related data (cache related data) - ig_cuda_t* ig_cuda_gpu; + ig_cuda_t *ig_cuda_gpu; - float* authentic_v_gpu; + float *authentic_v_gpu; float authentic_v_float[3] = {static_cast<float>(authentic_v[0]), static_cast<float>(authentic_v[1]), static_cast<float>(authentic_v[2])}; checkCUDA(cudaMalloc(&authentic_v_gpu, sizeof(authentic_v_float))); // Preparing result data - output_type_cuda_t* results_gpu; + output_type_cuda_t *results_gpu; checkCUDA(cudaMalloc(&results_gpu, thread * sizeof(output_type_cuda_t))); + m_cuda_t *m_cuda_global; + checkCUDA(cudaMalloc(&m_cuda_global, thread * sizeof(m_cuda_t))); + + matrix_d *h_cuda_global; + checkCUDA(cudaMalloc(&h_cuda_global, thread * sizeof(matrix_d))); + /* End Allocating GPU Memory */ assert(num_of_ligands <= MAX_LIGAND_NUM); @@ -1225,7 +344,7 @@ __host__ void monte_carlo::operator()( struct tmp_struct { int start_index = 0; int parent_index = 0; - void store_node(tree<segment>& child_ptr, rigid_cuda_t& rigid) { + void store_node(tree<segment> &child_ptr, rigid_cuda_t &rigid) { start_index++; // start with index 1, index 0 is root node rigid.parent[start_index] = parent_index; rigid.atom_range[start_index][0] = child_ptr.node.begin; @@ -1256,8 +375,8 @@ __host__ void monte_carlo::operator()( }; for (int l = 0; l < num_of_ligands; ++l) { - model& m = m_gpu[l]; - const precalculate_byatom& p = p_gpu[l]; + model &m = m_gpu[l]; + const precalculate_byatom &p = p_gpu[l];
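
/*
 * Why curand_init(seed, pose_id, 0, ...) in the kernel above is safe: Philox
 * treats the second argument as a subsequence index, so every pose draws from
 * a statistically independent stream derived from one seed. Minimal sketch;
 * the kernel name and launch shape are illustrative, not from this patch.
 */
#include <curand_kernel.h>

__global__ void init_rng(curandStatePhilox4_32_10_t *states,
                         unsigned long long seed, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n)
        curand_init(seed, /*subsequence=*/id, /*offset=*/0, &states[id]);
}
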
/* Prepare m related data */ conf_size s = m.get_size(); @@ -1310,7 +429,7 @@ __host__ void monte_carlo::operator()( } m_cuda->ligand.begin = m.ligands[0].begin; // 0 m_cuda->ligand.end = m.ligands[0].end; // 29 - ligand& m_ligand = m.ligands[0]; // Only support one ligand + ligand &m_ligand = m.ligands[0]; // Only support one ligand DEBUG_PRINTF("m_ligand.end=%lu, MAX_NUM_OF_ATOMS=%d\n", m_ligand.end, MAX_NUM_OF_ATOMS); assert(m_ligand.end < MAX_NUM_OF_ATOMS); @@ -1342,14 +461,24 @@ __host__ void monte_carlo::operator()( } m_cuda->ligand.rigid.num_children = ts.start_index; - // set children_map + // set children map DEBUG_PRINTF("set children map\n"); for (int i = 0; i < MAX_NUM_OF_RIGID; i++) - for (int j = 0; j < MAX_NUM_OF_RIGID; j++) + for (int j = 0; j < MAX_NUM_OF_RIGID; j++) { m_cuda->ligand.rigid.children_map[i][j] = false; - for (int i = 1; i < m_cuda->ligand.rigid.num_children + 1; i++) { - int parent_index = m_cuda->ligand.rigid.parent[i]; - m_cuda->ligand.rigid.children_map[parent_index][i] = true; + m_cuda->ligand.rigid.descendant_map[i][j] = false; + } + + for (int i = MAX_NUM_OF_RIGID - 1; i >= 0; i--) { + if (i > 0) { + m_cuda->ligand.rigid.children_map[m_cuda->ligand.rigid.parent[i]][i] = true; + m_cuda->ligand.rigid.descendant_map[m_cuda->ligand.rigid.parent[i]][i] = true; + } + for (int j = i + 1; j < MAX_NUM_OF_RIGID; j++) { + if (m_cuda->ligand.rigid.descendant_map[i][j]) + m_cuda->ligand.rigid.descendant_map[m_cuda->ligand.rigid.parent[i]][j] + = true; + } } m_cuda->m_num_movable_atoms = m.num_movable_atoms(); @@ -1369,9 +498,9 @@ __host__ void monte_carlo::operator()( for (int i = 0; i < threads_per_ligand; ++i) { if (!local_only) { - tmp.c.randomize( - corner1, corner2, - generator); // generate a random structure, can move to GPU if necessary + tmp.c.randomize(corner1, corner2, + generator); // generate a random structure, + // can move to GPU if necessary } for (int j = 0; j < 3; j++) rand_molec_struc_tmp->position[j] = tmp.c.ligands[0].rigid.position[j]; @@ -1395,7 +524,7 @@ __host__ void monte_carlo::operator()( rand_molec_struc_tmp->lig_torsion_size = lig_torsion_size; - float* rand_molec_struc_gpu_tmp + float *rand_molec_struc_gpu_tmp = rand_molec_struc_gpu + (l * threads_per_ligand + i) * SIZE_OF_MOLEC_STRUC / sizeof(float); checkCUDA(cudaMemcpy(rand_molec_struc_gpu_tmp, rand_molec_struc_tmp, @@ -1412,7 +541,7 @@ __host__ void monte_carlo::operator()( p_cuda->m_data_size = p.m_data.m_data.size(); checkCUDA(cudaMemcpy(p_cuda_gpu + l, p_cuda, sizeof(p_cuda_t), cudaMemcpyHostToDevice)); checkCUDA(cudaMemcpy(&(p_cuda_gpu[l].m_data), &(m_data_list_gpu[l].p_data), - sizeof(p_m_data_cuda_t*), + sizeof(p_m_data_cuda_t *), cudaMemcpyHostToDevice)); // check if fl == float } } @@ -1588,16 +717,24 @@ __host__ void monte_carlo::operator()( /* Launch kernel */ DEBUG_PRINTF("launch kernel, global_steps=%d, thread=%d, num_of_ligands=%d\n", global_steps, thread, num_of_ligands); - kernel<<>>(m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu, - best_e_gpu, quasi_newton_par_max_steps, - mutation_amplitude_float, states, seed, epsilon_fl_float, - hunt_cap_gpu, authentic_v_gpu, results_gpu, global_steps, - num_of_ligands, threads_per_ligand, multi_bias); + + output_type_cuda_t *results_aux; + checkCUDA(cudaMalloc(&results_aux, 5 * thread * sizeof(output_type_cuda_t))); + change_cuda_t *change_aux; + checkCUDA(cudaMalloc(&change_aux, 5 * thread * sizeof(change_cuda_t))); + pot_cuda_t *pot_aux; + checkCUDA(cudaMalloc(&pot_aux, thread * sizeof(pot_cuda_t)));
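
/*
 * The bottom-up pass above derives two relations from the parent[] array
 * filled in by store_node(): children_map (direct children) and
 * descendant_map (its transitive closure). It relies on the DFS numbering --
 * every child index is larger than its parent's. A host-side sketch of the
 * same closure, with illustrative names only:
 */
#include <vector>

void build_maps(const std::vector<int> &parent,  // parent[0] unused (root)
                std::vector<std::vector<bool>> &children_map,
                std::vector<std::vector<bool>> &descendant_map) {
    const int n = (int)parent.size();
    children_map.assign(n, std::vector<bool>(n, false));
    descendant_map.assign(n, std::vector<bool>(n, false));
    for (int i = n - 1; i > 0; --i) {    // bottom-up over DFS indices
        children_map[parent[i]][i] = true;
        descendant_map[parent[i]][i] = true;
        for (int j = i + 1; j < n; ++j)  // lift i's descendants to its parent
            if (descendant_map[i][j]) descendant_map[parent[i]][j] = true;
    }
}
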
+ + kernel<32><<>>( + m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu, best_e_gpu, + quasi_newton_par_max_steps, mutation_amplitude_float, states, seed, epsilon_fl_float, + hunt_cap_gpu, authentic_v_gpu, results_gpu, results_aux, change_aux, pot_aux, h_cuda_global, + m_cuda_global, global_steps, num_of_ligands, threads_per_ligand, multi_bias); // Device to Host memcpy of precalculated_byatom, copy back data to p_gpu - p_m_data_cuda_t* p_data; + p_m_data_cuda_t *p_data; checkCUDA(cudaMallocHost(&p_data, sizeof(p_m_data_cuda_t) * MAX_P_DATA_M_DATA_SIZE)); - output_type_cuda_t* results; + output_type_cuda_t *results; checkCUDA(cudaMallocHost(&results, thread * sizeof(output_type_cuda_t))); for (int l = 0; l < num_of_ligands; ++l) { @@ -1614,10 +751,12 @@ __host__ void monte_carlo::operator()( } // DEBUG_PRINTF("energies about the first ligand on GPU:\n"); // for (int i = 0;i < 20; ++i){ - // DEBUG_PRINTF("precalculated_byatom.m_data.m_data[%d]: (smooth.first, smooth.second, fast) - // ", i); for (int j = 0;j < FAST_SIZE; ++j){ - // DEBUG_PRINTF("(%f, %f, %f) ", p_gpu[0].m_data.m_data[i].smooth[j].first, - // p_gpu[0].m_data.m_data[i].smooth[j].second, p_gpu[0].m_data.m_data[i].fast[j]); + // DEBUG_PRINTF("precalculated_byatom.m_data.m_data[%d]: (smooth.first, + // smooth.second, fast) ", i); for (int j = 0;j < FAST_SIZE; ++j){ + // DEBUG_PRINTF("(%f, %f, %f) ", + // p_gpu[0].m_data.m_data[i].smooth[j].first, + // p_gpu[0].m_data.m_data[i].smooth[j].second, + // p_gpu[0].m_data.m_data[i].fast[j]); // } // DEBUG_PRINTF("\n"); // } @@ -1629,7 +768,8 @@ __host__ void monte_carlo::operator()( cudaEventSynchronize(stop); float msecTotal = 0.0f; cudaEventElapsedTime(&msecTotal, start, stop); - DEBUG_PRINTF("Time spend on GPU is %f ms\n", msecTotal); + printf("Time spent on GPU is %f ms\n", msecTotal); + // DEBUG_PRINTF("Time spent on GPU is %f ms\n", msecTotal); /* Convert result data. Can be improved by mapping memory */ @@ -1663,6 +803,8 @@ __host__ void monte_carlo::operator()( checkCUDA(cudaFree(hunt_cap_gpu)); checkCUDA(cudaFree(authentic_v_gpu)); checkCUDA(cudaFree(results_gpu)); + checkCUDA(cudaFree(change_aux)); + checkCUDA(cudaFree(results_aux)); checkCUDA(cudaFree(states)); checkCUDA(cudaFreeHost(m_cuda)); checkCUDA(cudaFreeHost(rand_molec_struc_tmp)); checkCUDA(cudaFreeHost(ig_cuda_ptr)); @@ -1674,17 +816,18 @@ __host__ void monte_carlo::operator()( DEBUG_PRINTF("exit monte_carlo\n"); } -bool metropolis_accept(fl old_f, fl new_f, fl temperature, rng& generator) { +bool metropolis_accept(fl old_f, fl new_f, fl temperature, rng &generator) { if (new_f < old_f) return true; const fl acceptance_probability = std::exp((old_f - new_f) / temperature); return random_fl(0, 1, generator) < acceptance_probability; } -__host__ void monte_carlo::operator()(model& m, output_container& out, const precalculate_byatom& p, - const igrid& ig, const vec& corner1, const vec& corner2, - rng& generator) const { +__host__ void monte_carlo::operator()(model &m, output_container &out, const precalculate_byatom &p, + const igrid &ig, const vec &corner1, const vec &corner2, + rng &generator) const { int evalcount = 0; - vec authentic_v(1000, 1000, 1000); // FIXME? this is here to avoid max_fl/max_fl + vec authentic_v(1000, 1000, + 1000); // FIXME?
this is here to avoid max_fl/max_fl conf_size s = m.get_size(); change g(s); output_type tmp(s, 0); @@ -1709,7 +852,8 @@ __host__ void monte_carlo::operator()(model &m, output_container &out, const pre quasi_newton_par(m, p, ig, tmp, g, authentic_v, evalcount); m.set(tmp.c); // FIXME? useless? tmp.coords = m.get_heavy_atom_movable_coords(); - add_to_output_container(out, tmp, min_rmsd, num_saved_mins); // 20 - max size + add_to_output_container(out, tmp, min_rmsd, + num_saved_mins); // 20 - max size if (tmp.e < best_e) best_e = tmp.e; } } diff --git a/unidock/src/cuda/precalculate.cu b/unidock/src/cuda/precalculate.cu index 675ac70..03313a1 100644 --- a/unidock/src/cuda/precalculate.cu +++ b/unidock/src/cuda/precalculate.cu @@ -5,7 +5,7 @@ #include #include "precalculate.h" -#include "precalculate_gpu.h" +#include "precalculate_gpu.cuh" // TODO: define kernel here __global__ void precalculate_gpu(triangular_matrix_cuda_t *m_data_gpu_list, diff --git a/unidock/src/cuda/precalculate_gpu.h b/unidock/src/cuda/precalculate_gpu.cuh similarity index 99% rename from unidock/src/cuda/precalculate_gpu.h rename to unidock/src/cuda/precalculate_gpu.cuh index c22b620..673f8b4 100644 --- a/unidock/src/cuda/precalculate_gpu.h +++ b/unidock/src/cuda/precalculate_gpu.cuh @@ -5,7 +5,7 @@ // Define GPU precalculate structures /* atom related start */ -#include "atom_constants_gpu.h" +#include "atom_constants_gpu.cuh" /* atom related end */ diff --git a/unidock/src/cuda/warp_ops.cuh b/unidock/src/cuda/warp_ops.cuh new file mode 100644 index 0000000..1ea368f --- /dev/null +++ b/unidock/src/cuda/warp_ops.cuh @@ -0,0 +1,660 @@ +#pragma once +#include "bfgs.h" +#include "common.cuh" +#include "kernel.h" +#include <cooperative_groups.h> +#include <cooperative_groups/reduce.h> + +namespace cg = cooperative_groups; + +template <unsigned int TileSize> +__device__ __forceinline__ void matrix_d_init_warp(cg::thread_block_tile<TileSize> &tile, + matrix_d *m, int dim, float fill_data) { + if (tile.thread_rank() == 0) m->dim = dim; + if ((dim * (dim + 1) / 2) > MAX_HESSIAN_MATRIX_D_SIZE) + DEBUG_PRINTF("\nmatrix_d: matrix_d_init() ERROR!"); + // ((dim * (dim + 1) / 2)*sizeof(float)); // symmetric matrix_d + for (int i = tile.thread_rank(); i < (dim * (dim + 1) / 2); i += tile.num_threads()) + m->data[i] = fill_data; + for (int i = (dim * (dim + 1) / 2) + tile.thread_rank(); i < MAX_HESSIAN_MATRIX_D_SIZE; + i += tile.num_threads()) + m->data[i] = 0; // Others will be 0 + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void matrix_d_set_diagonal_warp(cg::thread_block_tile<TileSize> &tile, + matrix_d *m, float fill_data) { + for (int i = tile.thread_rank(); i < m->dim; i += tile.num_threads()) { + m->data[i + i * (i + 1) / 2] = fill_data; + } + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ float scalar_product_warp(cg::thread_block_tile<TileSize> &tile, + const change_cuda_t *a, const change_cuda_t *b, + int n) { + float tmp = 0; + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + tmp += find_change_index_read(a, i) * find_change_index_read(b, i); + } + tile.sync(); + + return cg::reduce(tile, tmp, cg::plus<float>()); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void minus_mat_vec_product_warp(cg::thread_block_tile<TileSize> &tile, + const matrix_d *h, + const change_cuda_t *in, + change_cuda_t *out) { + int n = h->dim; + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + float sum = 0; + for (int j = 0; j < n; j++) { + sum += h->data[index_permissive(h, i, j)] * find_change_index_read(in, j); + } + find_change_index_write(out, i, -sum); + } + tile.sync(); +}
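
/*
 * matrix_d above packs a symmetric matrix into dim*(dim+1)/2 floats; the
 * expression data[i + i*(i+1)/2] used by matrix_d_set_diagonal_warp is the
 * standard packed-triangular index. Illustrative helper, not part of the
 * patch:
 */
// For i <= j, element (i, j) lives at i + j*(j+1)/2; (j, i) maps to the same slot.
__host__ __device__ inline int tri_index(int i, int j) {
    return (i <= j) ? (i + j * (j + 1) / 2) : (j + i * (i + 1) / 2);
}
// Example: dim = 3 stores 6 floats; the diagonal (0,0),(1,1),(2,2) sits at 0, 2, 5.
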
+ +template <unsigned int TileSize> +__device__ __forceinline__ void output_type_cuda_init_warp(cg::thread_block_tile<TileSize> &tile, + output_type_cuda_t *out, + const float *ptr) { + for (int i = tile.thread_rank(); i < 3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION; + i += tile.num_threads()) { + if (i < 3) + out->position[i] = ptr[i]; + else if (i < 7) + out->orientation[i - 3] = ptr[i]; + else if (i < 7 + MAX_NUM_OF_LIG_TORSION) + out->lig_torsion[i - 7] = ptr[i]; + else + out->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION] = ptr[i]; + } + + if (tile.thread_rank() == 0) + out->lig_torsion_size = ptr[3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION]; + // did not assign coords and e + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void output_type_cuda_init_with_output_warp( + cg::thread_block_tile<TileSize> &tile, output_type_cuda_t *out_new, + const output_type_cuda_t *out_old) { + for (int i = tile.thread_rank(); i < 3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION; + i += tile.num_threads()) { + if (i < 3) + out_new->position[i] = out_old->position[i]; + else if (i < 7) + out_new->orientation[i - 3] = out_old->orientation[i - 3]; + else if (i < 7 + MAX_NUM_OF_LIG_TORSION) + out_new->lig_torsion[i - 7] = out_old->lig_torsion[i - 7]; + else + out_new->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION] + = out_old->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION]; + } + + if (tile.thread_rank() == 0) { + out_new->lig_torsion_size = out_old->lig_torsion_size; + // assign e but not coords + out_new->e = out_old->e; + } + + tile.sync(); +} + +template <unsigned int TileSize> __device__ __forceinline__ void output_type_cuda_increment_warp( + cg::thread_block_tile<TileSize> &tile, output_type_cuda_t *x, const change_cuda_t *c, + float factor, float epsilon_fl) { + // position increment + if (tile.thread_rank() == 0) { + for (int k = 0; k < 3; k++) x->position[k] += factor * c->position[k]; + // orientation increment + float rotation[3]; + for (int k = 0; k < 3; k++) rotation[k] = factor * c->orientation[k]; + quaternion_increment(x->orientation, rotation, epsilon_fl); + } + + // torsion increment + for (int k = tile.thread_rank(); k < x->lig_torsion_size; k += tile.num_threads()) { + float tmp = factor * c->lig_torsion[k]; + normalize_angle(&tmp); + x->lig_torsion[k] += tmp; + normalize_angle(&(x->lig_torsion[k])); + } + + tile.sync(); +} + +template <unsigned int TileSize> __device__ __forceinline__ void change_cuda_init_with_change_warp( + cg::thread_block_tile<TileSize> &tile, change_cuda_t *g_new, const change_cuda_t *g_old) { + for (int i = tile.thread_rank(); i < 3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION; + i += tile.num_threads()) { + if (i < 3) + g_new->position[i] = g_old->position[i]; + else if (i < 7) + g_new->orientation[i - 3] = g_old->orientation[i - 3]; + else if (i < 7 + MAX_NUM_OF_LIG_TORSION) + g_new->lig_torsion[i - 7] = g_old->lig_torsion[i - 7]; + else + g_new->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION] + = g_old->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION]; + } + + if (tile.thread_rank() == 0) g_new->lig_torsion_size = g_old->lig_torsion_size; + // did not assign coords and e + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void ligand_init_with_ligand_warp(cg::thread_block_tile<TileSize> &tile, + const ligand_cuda_t *ligand_cuda_old, + ligand_cuda_t *ligand_cuda_new) { + for (int i = tile.thread_rank(); i < MAX_NUM_OF_LIG_PAIRS; i += tile.num_threads()) { + ligand_cuda_new->pairs.type_pair_index[i] = ligand_cuda_old->pairs.type_pair_index[i]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_LIG_PAIRS; i += tile.num_threads()) { + ligand_cuda_new->pairs.a[i] = ligand_cuda_old->pairs.a[i]; + }
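
/*
 * Every *_warp initializer above uses the same round-robin idiom: lane k
 * handles elements k, k+TileSize, k+2*TileSize, ... so no two lanes write the
 * same slot, and the trailing tile.sync() publishes the writes before any
 * lane reads them. Generic sketch (illustrative; assumes the
 * cooperative_groups includes at the top of this file):
 */
template <unsigned int TS, typename T>
__device__ void tile_strided_copy(cooperative_groups::thread_block_tile<TS> &tile,
                                  T *dst, const T *src, int n) {
    for (int i = tile.thread_rank(); i < n; i += tile.num_threads())
        dst[i] = src[i];   // disjoint indices per lane: race-free
    tile.sync();           // copy is visible tile-wide after this
}
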
+ + for (int i = tile.thread_rank(); i < MAX_NUM_OF_LIG_PAIRS; i += tile.num_threads()) { + ligand_cuda_new->pairs.b[i] = ligand_cuda_old->pairs.b[i]; + } + + if (tile.thread_rank() == 0) + ligand_cuda_new->pairs.num_pairs = ligand_cuda_old->pairs.num_pairs; + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 2; ++j) + ligand_cuda_new->rigid.atom_range[i][j] = ligand_cuda_old->rigid.atom_range[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + ligand_cuda_new->rigid.origin[i][j] = ligand_cuda_old->rigid.origin[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 9; ++j) + ligand_cuda_new->rigid.orientation_m[i][j] = ligand_cuda_old->rigid.orientation_m[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 4; ++j) + ligand_cuda_new->rigid.orientation_q[i][j] = ligand_cuda_old->rigid.orientation_q[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + ligand_cuda_new->rigid.axis[i][j] = ligand_cuda_old->rigid.axis[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + ligand_cuda_new->rigid.relative_axis[i][j] = ligand_cuda_old->rigid.relative_axis[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + ligand_cuda_new->rigid.relative_origin[i][j] + = ligand_cuda_old->rigid.relative_origin[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + ligand_cuda_new->rigid.parent[i] = ligand_cuda_old->rigid.parent[i]; + } + + for (int i = 0; i < MAX_NUM_OF_RIGID; i++) { + for (int j = tile.thread_rank(); j < MAX_NUM_OF_RIGID; j += tile.num_threads()) + ligand_cuda_new->rigid.children_map[i][j] = ligand_cuda_old->rigid.children_map[i][j]; + + for (int j = tile.thread_rank(); j < MAX_NUM_OF_RIGID; j += tile.num_threads()) + ligand_cuda_new->rigid.descendant_map[i][j] + = ligand_cuda_old->rigid.descendant_map[i][j]; + } + + if (tile.thread_rank() == 0) { + ligand_cuda_new->rigid.num_children = ligand_cuda_old->rigid.num_children; + ligand_cuda_new->begin = ligand_cuda_old->begin; + ligand_cuda_new->end = ligand_cuda_old->end; + } + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void m_cuda_init_with_m_cuda_warp(cg::thread_block_tile<TileSize> &tile, + const m_cuda_t *m_cuda_old, + m_cuda_t *m_cuda_new) { + for (int i = tile.thread_rank(); i < MAX_NUM_OF_ATOMS; i += tile.num_threads()) { + m_cuda_new->atoms[i] = m_cuda_old->atoms[i]; + } + for (int i = tile.thread_rank(); i < MAX_NUM_OF_ATOMS; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + m_cuda_new->m_coords.coords[i][j] = m_cuda_old->m_coords.coords[i][j]; + } + for (int i = tile.thread_rank(); i < MAX_NUM_OF_ATOMS; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + m_cuda_new->minus_forces.coords[i][j] = m_cuda_old->minus_forces.coords[i][j]; + } + + ligand_init_with_ligand_warp(tile, &m_cuda_old->ligand, &m_cuda_new->ligand); + + if (tile.thread_rank() == 0) m_cuda_new->m_num_movable_atoms = m_cuda_old->m_num_movable_atoms; + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ float ig_eval_deriv_warp(cg::thread_block_tile<TileSize> &tile, +
output_type_cuda_t *x, const float v, + ig_cuda_t *ig_cuda_gpu, m_cuda_t *m_cuda_gpu, + const float epsilon_fl) { + float e = 0; + float deriv[3]; + int nat = num_atom_types(ig_cuda_gpu->atu); + for (int i = tile.thread_rank(); i < m_cuda_gpu->m_num_movable_atoms; i += tile.num_threads()) { + int t = m_cuda_gpu->atoms[i].types[ig_cuda_gpu->atu]; + if (t >= nat) { + m_cuda_gpu->minus_forces.coords[i][0] = 0.0f; + m_cuda_gpu->minus_forces.coords[i][1] = 0.0f; + m_cuda_gpu->minus_forces.coords[i][2] = 0.0f; + continue; + } + + e += g_evaluate(&ig_cuda_gpu->grids[t], m_cuda_gpu->m_coords.coords[i], ig_cuda_gpu->slope, + v, deriv, epsilon_fl); + + m_cuda_gpu->minus_forces.coords[i][0] = deriv[0]; + m_cuda_gpu->minus_forces.coords[i][1] = deriv[1]; + m_cuda_gpu->minus_forces.coords[i][2] = deriv[2]; + } + tile.sync(); + return e; +} + +template <unsigned int TileSize> __device__ __forceinline__ float eval_interacting_pairs_deriv_warp( + cg::thread_block_tile<TileSize> &tile, p_cuda_t *p_cuda_gpu, const float v, + const lig_pairs_cuda_t *pairs, const m_coords_cuda_t *m_coords, m_minus_forces_t *minus_forces, + const float epsilon_fl) { + float e = 0.0f; + + for (int i = tile.thread_rank(); i < pairs->num_pairs; i += tile.num_threads()) { + int ai = pairs->a[i], bi = pairs->b[i]; + int index = pairs->a[i] + pairs->b[i] * (pairs->b[i] + 1) / 2; + float r[3] = {m_coords->coords[bi][0] - m_coords->coords[ai][0], + m_coords->coords[bi][1] - m_coords->coords[ai][1], + m_coords->coords[bi][2] - m_coords->coords[ai][2]}; + float r2 = r[0] * r[0] + r[1] * r[1] + r[2] * r[2]; + + if (r2 < p_cuda_gpu->m_cutoff_sqr) { + float tmp[2]; + p_eval_deriv(tmp, index, r2, p_cuda_gpu, epsilon_fl); + float force[3] = {r[0] * tmp[1], r[1] * tmp[1], r[2] * tmp[1]}; + curl(&tmp[0], force, v, epsilon_fl); + e += tmp[0]; + atomicAdd(&minus_forces->coords[ai][0], -force[0]); + atomicAdd(&minus_forces->coords[ai][1], -force[1]); + atomicAdd(&minus_forces->coords[ai][2], -force[2]); + atomicAdd(&minus_forces->coords[bi][0], force[0]); + atomicAdd(&minus_forces->coords[bi][1], force[1]); + atomicAdd(&minus_forces->coords[bi][2], force[2]); + } + } + tile.sync(); + return e; +} + +template <unsigned int TileSize> +__device__ __forceinline__ void POT_deriv_warp(cg::thread_block_tile<TileSize> &tile, + const m_minus_forces_t *minus_forces, + const rigid_cuda_t *lig_rigid_gpu, + const m_coords_cuda_t *m_coords, change_cuda_t *g, + pot_cuda_t *p) { + int num_torsion = lig_rigid_gpu->num_children; + int num_rigid = num_torsion + 1; + + float pos_tmp[3], ori_tmp[3], tmp1[3], tmp2[3], tmp3[3]; + for (int i = tile.thread_rank(); i < num_rigid; i += tile.num_threads()) { + int begin = lig_rigid_gpu->atom_range[i][0]; + int end = lig_rigid_gpu->atom_range[i][1]; + pos_tmp[0] = pos_tmp[1] = pos_tmp[2] = 0.0f; + ori_tmp[0] = ori_tmp[1] = ori_tmp[2] = 0.0f; + for (int j = begin; j < end; j++) { + pos_tmp[0] += minus_forces->coords[j][0]; + pos_tmp[1] += minus_forces->coords[j][1]; + pos_tmp[2] += minus_forces->coords[j][2]; + tmp1[0] = m_coords->coords[j][0] - lig_rigid_gpu->origin[i][0]; + tmp1[1] = m_coords->coords[j][1] - lig_rigid_gpu->origin[i][1]; + tmp1[2] = m_coords->coords[j][2] - lig_rigid_gpu->origin[i][2]; + tmp2[0] = minus_forces->coords[j][0]; + tmp2[1] = minus_forces->coords[j][1]; + tmp2[2] = minus_forces->coords[j][2]; + product(tmp3, tmp1, tmp2); + ori_tmp[0] += tmp3[0]; + ori_tmp[1] += tmp3[1]; + ori_tmp[2] += tmp3[2]; + } + p->ptmp[i][0] = pos_tmp[0]; + p->ptmp[i][1] = pos_tmp[1]; + p->ptmp[i][2] = pos_tmp[2]; + p->o[i][0] = ori_tmp[0]; + p->o[i][1] = ori_tmp[1]; + p->o[i][2] = ori_tmp[2]; + }
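
/*
 * Note on the atomicAdd calls above: with pairs spread across lanes, two
 * lanes can hit the same atom (an atom appears in many pairs), so the plain
 * -=/+= of the serial version would race. Sketch of the scatter step
 * (illustrative helper, not part of the patch):
 */
__device__ void scatter_pair_force(float (*minus_forces)[3], int a, int b,
                                   const float force[3]) {
    for (int k = 0; k < 3; ++k) {
        atomicAdd(&minus_forces[a][k], -force[k]);  // action on atom a
        atomicAdd(&minus_forces[b][k], force[k]);   // reaction on atom b
    }
}
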
+ tile.sync(); + + /* position_derivative */ + for (int i = tile.thread_rank(); i < num_rigid; i += tile.num_threads()) { + p->p[i][0] = p->ptmp[i][0]; + p->p[i][1] = p->ptmp[i][1]; + p->p[i][2] = p->ptmp[i][2]; + for (int j = i + 1; j < num_rigid; j++) { + if (lig_rigid_gpu->descendant_map[i][j]) { + p->p[i][0] += p->ptmp[j][0]; + p->p[i][1] += p->ptmp[j][1]; + p->p[i][2] += p->ptmp[j][2]; + } + } + } + tile.sync(); + + /* orientation derivative */ + if (tile.thread_rank() == 0) { // NOTE: Single thread is better here + float origin_temp[3], product_out[3]; + for (int i = num_rigid - 1; i >= 0; i--) { /* from bottom to top */ + ori_tmp[0] = p->o[i][0]; + ori_tmp[1] = p->o[i][1]; + ori_tmp[2] = p->o[i][2]; + for (int j = i + 1; j < num_rigid; j++) { + if (lig_rigid_gpu->children_map[i][j]) { + ori_tmp[0] += p->o[j][0]; + ori_tmp[1] += p->o[j][1]; + ori_tmp[2] += p->o[j][2]; /* self + children node */ + + origin_temp[0] = lig_rigid_gpu->origin[j][0] - lig_rigid_gpu->origin[i][0]; + origin_temp[1] = lig_rigid_gpu->origin[j][1] - lig_rigid_gpu->origin[i][1]; + origin_temp[2] = lig_rigid_gpu->origin[j][2] - lig_rigid_gpu->origin[i][2]; + + product(product_out, origin_temp, p->p[j]); + ori_tmp[0] += product_out[0]; + ori_tmp[1] += product_out[1]; + ori_tmp[2] += product_out[2]; + } + } + p->o[i][0] = ori_tmp[0]; + p->o[i][1] = ori_tmp[1]; + p->o[i][2] = ori_tmp[2]; + } + } + tile.sync(); + + /* torsion_derivative: torsion k belongs to rigid body k + 1, so the loop + starts at 1 and writes lig_torsion[i - 1] (the original loop started at + 0, which indexed lig_torsion[-1] and dropped the last torsion) */ + for (int i = 1 + tile.thread_rank(); i < num_rigid; i += tile.num_threads()) { + g->lig_torsion[i - 1] = p->o[i][0] * lig_rigid_gpu->axis[i][0] + + p->o[i][1] * lig_rigid_gpu->axis[i][1] + + p->o[i][2] * lig_rigid_gpu->axis[i][2]; + } + tile.sync(); + + for (int i = tile.thread_rank(); i < 3; i += tile.num_threads()) { + g->position[i] = p->p[0][i]; + g->orientation[i] = p->o[0][i]; + } + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ float m_eval_deriv_warp(cg::thread_block_tile<TileSize> &tile, output_type_cuda_t *c, + change_cuda_t *g, m_cuda_t *m_cuda_gpu, p_cuda_t *p_cuda_gpu, + ig_cuda_t *ig_cuda_gpu, pot_cuda_t *pot_aux, const float *v, + const float epsilon_fl) { + // check set args + if (tile.thread_rank() == 0) { + set(c, &m_cuda_gpu->ligand.rigid, &m_cuda_gpu->m_coords, m_cuda_gpu->atoms, + m_cuda_gpu->m_num_movable_atoms, epsilon_fl); + } + tile.sync(); + + float e = ig_eval_deriv_warp(tile, c, v[1], ig_cuda_gpu, m_cuda_gpu, epsilon_fl); + e += eval_interacting_pairs_deriv_warp(tile, p_cuda_gpu, v[0], &m_cuda_gpu->ligand.pairs, + &m_cuda_gpu->m_coords, &m_cuda_gpu->minus_forces, + epsilon_fl); + tile.sync(); + e = cg::reduce(tile, e, cg::plus<float>()); + + // should add derivs for glue, other and inter pairs + POT_deriv_warp(tile, &m_cuda_gpu->minus_forces, &m_cuda_gpu->ligand.rigid, + &m_cuda_gpu->m_coords, g, pot_aux); + + return e; +} + +template <unsigned int TileSize> __device__ __forceinline__ void line_search_warp( + cg::thread_block_tile<TileSize> &tile, m_cuda_t *m_cuda_gpu, p_cuda_t *p_cuda_gpu, + ig_cuda_t *ig_cuda_gpu, int n, const output_type_cuda_t *x, const change_cuda_t *g, + const float f0, const change_cuda_t *p, output_type_cuda_t *x_new, change_cuda_t *g_new, + pot_cuda_t *pot_aux, float *f, float *alpha, const float epsilon_fl, const float *hunt_cap) { + const float c0 = 0.0001; + const int max_trials = 10; + const float multiplier = 0.5; + float alpha_ = 1.0, f_; + + const float pg = scalar_product_warp(tile, p, g, n); + for (int trial = 0; trial < max_trials; trial++) { + output_type_cuda_init_with_output_warp(tile, x_new, x); + output_type_cuda_increment_warp(tile, x_new, p, alpha_, epsilon_fl);
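
/*
 * m_eval_deriv_warp above leaves each lane with a partial energy and folds
 * the partials with cg::reduce (from <cooperative_groups/reduce.h>), which
 * returns the tile-wide sum to every lane. An equivalent hand-rolled version
 * with shuffles, for reference (illustrative; assumes TS is a power of two):
 */
template <unsigned int TS>
__device__ float tile_sum(cooperative_groups::thread_block_tile<TS> &tile, float v) {
    for (int off = TS / 2; off > 0; off >>= 1) v += tile.shfl_down(v, off);
    return tile.shfl(v, 0);  // lane 0 holds the total; broadcast it
}
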
+ f_ = m_eval_deriv_warp(tile, x_new, g_new, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, pot_aux, + hunt_cap, epsilon_fl); + if (f_ - f0 < c0 * alpha_ * pg) break; + alpha_ *= multiplier; + } + + *f = f_; + *alpha = alpha_; +} + +template <unsigned int TileSize> +__device__ __forceinline__ bool bfgs_update_warp(cg::thread_block_tile<TileSize> &tile, matrix_d *h, + const change_cuda_t *p, const change_cuda_t *y, + const float alpha, const float epsilon_fl) { + float yp, yhy; + yp = scalar_product_warp(tile, y, p, h->dim); + if (alpha * yp < epsilon_fl) return false; + + __shared__ change_cuda_t minus_hy[32 / TileSize]; + auto minus_hy_ptr = &minus_hy[tile.meta_group_rank()]; + change_cuda_init_with_change_warp(tile, minus_hy_ptr, y); + minus_mat_vec_product_warp(tile, h, y, minus_hy_ptr); + yhy = -scalar_product_warp(tile, y, minus_hy_ptr, h->dim); + float r = 1 / (alpha * yp); + int n = 6 + p->lig_torsion_size; + + __shared__ float minus_hy_[(6 + MAX_NUM_OF_LIG_TORSION) * 32 / TileSize], + p_[(6 + MAX_NUM_OF_LIG_TORSION) * 32 / TileSize]; + + // Calculate offset + auto minus_hy_ptr_ = &minus_hy_[tile.meta_group_rank() * (6 + MAX_NUM_OF_LIG_TORSION)]; + auto p_ptr_ = &p_[tile.meta_group_rank() * (6 + MAX_NUM_OF_LIG_TORSION)]; + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + minus_hy_ptr_[i] = find_change_index_read(minus_hy_ptr, i); + p_ptr_[i] = find_change_index_read(p, i); + } + tile.sync(); + + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + for (int j = i; j < n; j++) { + float tmp = alpha * r * (minus_hy_ptr_[i] * p_ptr_[j] + minus_hy_ptr_[j] * p_ptr_[i]) + + alpha * alpha * (r * r * yhy + r) * p_ptr_[i] * p_ptr_[j]; + h->data[i + j * (j + 1) / 2] += tmp; + } + } + tile.sync(); + return true; +} + +template <unsigned int TileSize> +__device__ void bfgs_warp(cg::thread_block_tile<TileSize> &tile, output_type_cuda_t *x, + output_type_cuda_t *x_new, output_type_cuda_t *x_orig, change_cuda_t *g, + change_cuda_t *g_new, change_cuda_t *g_orig, change_cuda_t *p, + change_cuda_t *y, matrix_d *h, m_cuda_t *m_cuda_gpu, p_cuda_t *p_cuda_gpu, + ig_cuda_t *ig_cuda_gpu, pot_cuda_t *pot_aux, const float *hunt_cap, + const float epsilon_fl, const int max_steps) { + // Profiling: perform timing within kernel + int n = 3 + 3 + x->lig_torsion_size; /* the dimensions of matrix */ + + float f0, f1, f_orig, alpha; + + matrix_d_init_warp(tile, h, n, 0); + matrix_d_set_diagonal_warp(tile, h, 1); + change_cuda_init_with_change_warp(tile, g_new, g); + output_type_cuda_init_with_output_warp(tile, x_new, x); + f0 = m_eval_deriv_warp(tile, x, g, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, pot_aux, hunt_cap, + epsilon_fl); + f_orig = f0; + + /* Init g_orig, x_orig */ + change_cuda_init_with_change_warp(tile, g_orig, g); + output_type_cuda_init_with_output_warp(tile, x_orig, x); + + /* Init p */ + change_cuda_init_with_change_warp(tile, p, g); + + for (int step = 0; step < max_steps; step++) { + minus_mat_vec_product_warp(tile, h, g, p); + line_search_warp(tile, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, n, x, g, f0, p, x_new, g_new, + pot_aux, &f1, &alpha, epsilon_fl, hunt_cap); + change_cuda_init_with_change_warp(tile, y, g_new); + + /* subtract_change */ + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + float tmp = find_change_index_read(y, i) - find_change_index_read(g, i); + find_change_index_write(y, i, tmp); + } + tile.sync(); + f0 = f1; + + output_type_cuda_init_with_output_warp(tile, x, x_new); + + float gg = scalar_product_warp(tile, g, g, n); + if (!(sqrtf(gg) >= 1e-5f)) break; + + change_cuda_init_with_change_warp(tile, g, g_new);
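
/*
 * The __shared__ arrays above are sized 32 / TileSize -- one slot per tile in
 * a 32-thread block -- and each tile finds its slot via meta_group_rank().
 * Minimal sketch of that per-tile scratch pattern (illustrative; assumes
 * 32-thread blocks, as this kernel uses):
 */
template <unsigned int TS>
__device__ float *my_tile_scratch(cooperative_groups::thread_block_tile<TS> &tile,
                                  float (&scratch)[32 / TS]) {
    // meta_group_rank() == index of this tile within its thread block
    return &scratch[tile.meta_group_rank()];
}
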
+ + if (step == 0) { + float yy = scalar_product_warp(tile, y, y, n); + if (fabs(yy) > epsilon_fl) { + float yp = scalar_product_warp(tile, y, p, n); + matrix_d_set_diagonal_warp(tile, h, alpha * yp / yy); + } + } + tile.sync(); + + bfgs_update_warp(tile, h, p, y, alpha, epsilon_fl); + } + + if (!(f0 <= f_orig)) { + f0 = f_orig; + output_type_cuda_init_with_output_warp(tile, x, x_orig); + change_cuda_init_with_change_warp(tile, g, g_orig); + } + + // write output_type_cuda energy + x->e = f0; +} + +__device__ __forceinline__ void set_warp(const output_type_cuda_t *x, rigid_cuda_t *lig_rigid_gpu, + m_coords_cuda_t *m_coords_gpu, const atom_cuda_t *atoms, + const int m_num_movable_atoms, const float epsilon_fl) { + for (int i = 0; i < 3; i++) lig_rigid_gpu->origin[0][i] = x->position[i]; + for (int i = 0; i < 4; i++) lig_rigid_gpu->orientation_q[0][i] = x->orientation[i]; + quaternion_to_r3(lig_rigid_gpu->orientation_q[0], + lig_rigid_gpu->orientation_m[0]); /* set orientation_m */ + + int begin = lig_rigid_gpu->atom_range[0][0]; + int end = lig_rigid_gpu->atom_range[0][1]; + for (int i = begin; i < end; i++) { + local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[0], atoms[i].coords, + lig_rigid_gpu->orientation_m[0]); + } + /* ************* end node.set_conf ************* */ + + /* ************* branches_set_conf ************* */ + /* update nodes in depth-first order */ + for (int current = 1; current < lig_rigid_gpu->num_children + 1; + current++) { /* current starts from 1 (namely starts from first + child node) */ + int parent = lig_rigid_gpu->parent[current]; + float torsion = x->lig_torsion[current - 1]; /* torsions are all related to child nodes */ + local_to_lab(lig_rigid_gpu->origin[current], lig_rigid_gpu->origin[parent], + lig_rigid_gpu->relative_origin[current], lig_rigid_gpu->orientation_m[parent]); + local_to_lab_direction(lig_rigid_gpu->axis[current], lig_rigid_gpu->relative_axis[current], + lig_rigid_gpu->orientation_m[parent]); + float tmp[4]; + float parent_q[4] + = {lig_rigid_gpu->orientation_q[parent][0], lig_rigid_gpu->orientation_q[parent][1], + lig_rigid_gpu->orientation_q[parent][2], lig_rigid_gpu->orientation_q[parent][3]}; + float current_axis[3] = {lig_rigid_gpu->axis[current][0], lig_rigid_gpu->axis[current][1], + lig_rigid_gpu->axis[current][2]}; + + angle_to_quaternion2(tmp, current_axis, torsion); + angle_to_quaternion_multi(tmp, parent_q); + quaternion_normalize_approx(tmp, epsilon_fl); + + for (int i = 0; i < 4; i++) + lig_rigid_gpu->orientation_q[current][i] = tmp[i]; /* set orientation_q */ + quaternion_to_r3(lig_rigid_gpu->orientation_q[current], + lig_rigid_gpu->orientation_m[current]); /* set orientation_m */ + + /* set coords */ + begin = lig_rigid_gpu->atom_range[current][0]; + end = lig_rigid_gpu->atom_range[current][1]; + for (int i = begin; i < end; i++) { + local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[current], atoms[i].coords, + lig_rigid_gpu->orientation_m[current]); + } + } + /* ************* end branches_set_conf ************* */ +} + +template <unsigned int TileSize> +__device__ __forceinline__ void write_back_warp(cg::thread_block_tile<TileSize> &tile, + output_type_cuda_t *results, + const output_type_cuda_t *best_out) { + for (int i = tile.thread_rank(); + i < 3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION + MAX_NUM_OF_ATOMS; + i += tile.num_threads()) { + if (i < 3) { + results->position[i] = best_out->position[i]; + } else if (i < 7) { + results->orientation[i - 3] = best_out->orientation[i - 3]; + } else if (i < 7 + MAX_NUM_OF_LIG_TORSION) {
+ results->lig_torsion[i - 7] = best_out->lig_torsion[i - 7]; + } else if (i < 7 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION) { + results->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION] + = best_out->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION]; + } else { +#pragma unroll + for (int j = 0; j < 3; j++) { + results->coords[i - 7 - MAX_NUM_OF_LIG_TORSION - MAX_NUM_OF_FLEX_TORSION][j] + = best_out->coords[i - 7 - MAX_NUM_OF_LIG_TORSION - MAX_NUM_OF_FLEX_TORSION][j]; + } + } + } + + if (tile.thread_rank() == 0) { + results->lig_torsion_size = best_out->lig_torsion_size; + results->e = best_out->e; + } + + tile.sync(); +} From 64bbef2f69998c6565ca39f1d8fd15ba8a2d36d2 Mon Sep 17 00:00:00 2001 From: Zihua Wu Date: Sat, 18 Nov 2023 17:33:02 +0800 Subject: [PATCH 2/5] robust memory size estimation --- unidock/src/cuda/common.cuh | 6 --- unidock/src/cuda/kernel.h | 6 +++ unidock/src/cuda/monte_carlo.cu | 59 ++++++++++++++++------------ unidock/src/cuda/warp_ops.cuh | 20 +++++----- unidock/src/main/main.cpp | 69 ++++++++++++++++++++------------- 5 files changed, 94 insertions(+), 66 deletions(-) diff --git a/unidock/src/cuda/common.cuh b/unidock/src/cuda/common.cuh index 1e0000f..9e08c82 100644 --- a/unidock/src/cuda/common.cuh +++ b/unidock/src/cuda/common.cuh @@ -19,12 +19,6 @@ #define M_PI_F 3.1415927f -// symmetric matrix_d (only half of it are stored) -typedef struct { - float data[MAX_HESSIAN_MATRIX_D_SIZE]; - int dim; -} matrix_d; - /* Below based on mutate_conf.cpp */ __device__ __forceinline__ void quaternion_increment(float *q, const float *rotation, float epsilon_fl); diff --git a/unidock/src/cuda/kernel.h b/unidock/src/cuda/kernel.h index bbee8ad..fc5e0f5 100644 --- a/unidock/src/cuda/kernel.h +++ b/unidock/src/cuda/kernel.h @@ -212,3 +212,9 @@ typedef struct { output_type_cuda_t container[MAX_CONTAINER_SIZE_EVERY_WI]; int current_size; } output_container_cuda_t; + +// symmetric matrix_d (only half of it are stored) +typedef struct { + float data[MAX_HESSIAN_MATRIX_D_SIZE]; + int dim; +} matrix_d; diff --git a/unidock/src/cuda/monte_carlo.cu b/unidock/src/cuda/monte_carlo.cu index c8971ed..04ffdc0 100644 --- a/unidock/src/cuda/monte_carlo.cu +++ b/unidock/src/cuda/monte_carlo.cu @@ -98,34 +98,34 @@ __device__ __forceinline__ void write_back(output_type_cuda_t *results, template <unsigned int TileSize> __global__ __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) void kernel( m_cuda_t *m_cuda_global, ig_cuda_t *ig_cuda_gpu, p_cuda_t *p_cuda_gpu, - float *rand_molec_struc_gpu, float *best_e_gpu, int bfgs_max_steps, float mutation_amplitude, + float *rand_molec_struc_gpu, int bfgs_max_steps, float mutation_amplitude, curandStatePhilox4_32_10_t *states, unsigned long long seed, float epsilon_fl, float *hunt_cap_gpu, float *authentic_v_gpu, output_type_cuda_t *results, output_type_cuda_t *output_aux, change_cuda_t *change_aux, pot_cuda_t *pot_aux, matrix_d *h_cuda_gpu, m_cuda_t *m_cuda_gpu, int search_depth, int num_of_ligands, int threads_per_ligand, bool multi_bias) { int bid = blockIdx.x, tid = threadIdx.x; - int pose_id = (bid * warpSize + tid) / TileSize; - auto tb = cg::this_thread_block(); - cg::thread_block_tile<TileSize> tile = cg::tiled_partition<TileSize>(tb); + int pose_id = (bid * blockDim.x + tid) / TileSize; if (m_cuda_global[pose_id / threads_per_ligand].m_num_movable_atoms == -1) { return; } + auto tb = cg::this_thread_block(); + cg::thread_block_tile<TileSize> tile = cg::tiled_partition<TileSize>(tb); + float best_e = INFINITY; - // __shared__ output_type_cuda_t tmp, best_out, candidate, x_new, x_orig;
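
/*
 * Index arithmetic used by the kernel above, spelled out: one pose per
 * TileSize-wide tile, threads_per_ligand poses per ligand, and a fixed
 * number of scratch structs per pose in the global aux buffers (5
 * output_type_cuda_t; 6 change_cuda_t after this patch). Illustrative
 * helpers only:
 */
__device__ __forceinline__ int pose_of_thread(int tile_size) {
    return (blockIdx.x * blockDim.x + threadIdx.x) / tile_size;
}
__device__ __forceinline__ int ligand_of_pose(int pose_id, int threads_per_ligand) {
    return pose_id / threads_per_ligand;  // poses of a ligand are contiguous
}
__device__ __forceinline__ int first_scratch_slot(int pose_id, int per_pose) {
    return pose_id * per_pose;            // e.g. per_pose = 5 or 6 above
}
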
output_type_cuda_t &tmp = output_aux[pose_id * 5]; output_type_cuda_t &best_out = output_aux[pose_id * 5 + 1]; output_type_cuda_t &candidate = output_aux[pose_id * 5 + 2]; output_type_cuda_t &x_new = output_aux[pose_id * 5 + 3]; output_type_cuda_t &x_orig = output_aux[pose_id * 5 + 4]; - // __shared__ change_cuda_t g, tmp1, tmp2, tmp3, tmp4; - change_cuda_t &g = change_aux[pose_id * 5]; - change_cuda_t &tmp1 = change_aux[pose_id * 5 + 1]; - change_cuda_t &tmp2 = change_aux[pose_id * 5 + 2]; - change_cuda_t &tmp3 = change_aux[pose_id * 5 + 3]; - change_cuda_t &tmp4 = change_aux[pose_id * 5 + 4]; + change_cuda_t &g = change_aux[pose_id * 6]; + change_cuda_t &tmp1 = change_aux[pose_id * 6 + 1]; + change_cuda_t &tmp2 = change_aux[pose_id * 6 + 2]; + change_cuda_t &tmp3 = change_aux[pose_id * 6 + 3]; + change_cuda_t &tmp4 = change_aux[pose_id * 6 + 4]; + change_cuda_t &tmp5 = change_aux[pose_id * 6 + 5]; if (pose_id < num_of_ligands * threads_per_ligand) { output_type_cuda_init_warp( @@ -159,7 +159,7 @@ __global__ __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) void kern mutation_amplitude); tile.sync(); - bfgs_warp(tile, &candidate, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, + bfgs_warp(tile, &candidate, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &h_cuda_gpu[pose_id], &m_cuda_gpu[pose_id], p_cuda_gpu, ig_cuda_gpu, pot_aux, hunt_cap_gpu, epsilon_fl, bfgs_max_steps); @@ -182,7 +182,7 @@ __global__ __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) void kern tile.sync(); if (tmp.e < best_e) { - bfgs_warp(tile, &tmp, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, + bfgs_warp(tile, &tmp, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &h_cuda_gpu[pose_id], &m_cuda_gpu[pose_id], p_cuda_gpu, ig_cuda_gpu, pot_aux, authentic_v_gpu, epsilon_fl, bfgs_max_steps); @@ -274,6 +274,7 @@ __host__ void monte_carlo::operator()( p_cuda_t_cpu *p_cuda; checkCUDA(cudaMallocHost(&p_cuda, sizeof(p_cuda_t_cpu))); + size_t avail, total; /* End CPU allocation */ @@ -291,11 +292,7 @@ __host__ void monte_carlo::operator()( // rand_molec_struc_gpu float *rand_molec_struc_gpu; checkCUDA(cudaMalloc(&rand_molec_struc_gpu, thread * SIZE_OF_MOLEC_STRUC)); - // best_e_gpu - float *best_e_gpu; float epsilon_fl_float = static_cast(epsilon_fl); - checkCUDA(cudaMalloc(&best_e_gpu, sizeof(float))); - checkCUDA(cudaMemcpy(best_e_gpu, &max_fl, sizeof(float), cudaMemcpyHostToDevice)); // use cuRand to generate random values on GPU curandStatePhilox4_32_10_t *states; @@ -324,17 +321,29 @@ __host__ void monte_carlo::operator()( float authentic_v_float[3] = {static_cast(authentic_v[0]), static_cast(authentic_v[1]), static_cast(authentic_v[2])}; + cudaMemGetInfo(&avail, &total); + printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), + int(total / 1024 / 1024)); checkCUDA(cudaMalloc(&authentic_v_gpu, sizeof(authentic_v_float))); // Preparing result data output_type_cuda_t *results_gpu; checkCUDA(cudaMalloc(&results_gpu, thread * sizeof(output_type_cuda_t))); + cudaMemGetInfo(&avail, &total); + printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), + int(total / 1024 / 1024)); m_cuda_t *m_cuda_global; checkCUDA(cudaMalloc(&m_cuda_global, thread * sizeof(m_cuda_t))); + cudaMemGetInfo(&avail, &total); + printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), + int(total / 1024 / 1024)); matrix_d *h_cuda_global; checkCUDA(cudaMalloc(&h_cuda_global, thread * sizeof(matrix_d))); + cudaMemGetInfo(&avail, &total); 
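
/*
 * The repeated cudaMemGetInfo/printf pairs added in this hunk can be folded
 * into one small helper; a sketch (the name and tag argument are
 * illustrative only, not part of this patch):
 */
#include <cstdio>
#include <cuda_runtime.h>

static void report_gpu_memory(const char *tag) {
    size_t avail = 0, total = 0;
    if (cudaMemGetInfo(&avail, &total) == cudaSuccess)
        printf("[%s] Available Memory = %dMiB Total Memory = %dMiB\n", tag,
               int(avail / 1024 / 1024), int(total / 1024 / 1024));
}
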
+ printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), + int(total / 1024 / 1024)); /* End Allocating GPU Memory */ @@ -721,15 +730,15 @@ __host__ void monte_carlo::operator()( output_type_cuda_t *results_aux; checkCUDA(cudaMalloc(&results_aux, 5 * thread * sizeof(output_type_cuda_t))); change_cuda_t *change_aux; - checkCUDA(cudaMalloc(&change_aux, 5 * thread * sizeof(change_cuda_t))); + checkCUDA(cudaMalloc(&change_aux, 6 * thread * sizeof(change_cuda_t))); pot_cuda_t *pot_aux; checkCUDA(cudaMalloc(&pot_aux, thread * sizeof(pot_cuda_t))); - kernel<32><<>>( - m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu, best_e_gpu, - quasi_newton_par_max_steps, mutation_amplitude_float, states, seed, epsilon_fl_float, - hunt_cap_gpu, authentic_v_gpu, results_gpu, results_aux, change_aux, pot_aux, h_cuda_global, - m_cuda_global, global_steps, num_of_ligands, threads_per_ligand, multi_bias); + kernel<32><<>>(m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu, + quasi_newton_par_max_steps, mutation_amplitude_float, states, seed, + epsilon_fl_float, hunt_cap_gpu, authentic_v_gpu, results_gpu, + results_aux, change_aux, pot_aux, h_cuda_global, m_cuda_global, + global_steps, num_of_ligands, threads_per_ligand, multi_bias); // Device to Host memcpy of precalculated_byatom, copy back data to p_gpu p_m_data_cuda_t *p_data; @@ -799,13 +808,15 @@ __host__ void monte_carlo::operator()( checkCUDA(cudaFree(ig_cuda_gpu)); checkCUDA(cudaFree(p_cuda_gpu)); checkCUDA(cudaFree(rand_molec_struc_gpu)); - checkCUDA(cudaFree(best_e_gpu)); checkCUDA(cudaFree(hunt_cap_gpu)); checkCUDA(cudaFree(authentic_v_gpu)); checkCUDA(cudaFree(results_gpu)); checkCUDA(cudaFree(change_aux)); checkCUDA(cudaFree(results_aux)); + checkCUDA(cudaFree(pot_aux)); checkCUDA(cudaFree(states)); + checkCUDA(cudaFree(h_cuda_global)); + checkCUDA(cudaFree(m_cuda_global)); checkCUDA(cudaFreeHost(m_cuda)); checkCUDA(cudaFreeHost(rand_molec_struc_tmp)); checkCUDA(cudaFreeHost(ig_cuda_ptr)); diff --git a/unidock/src/cuda/warp_ops.cuh b/unidock/src/cuda/warp_ops.cuh index 1ea368f..71d4abf 100644 --- a/unidock/src/cuda/warp_ops.cuh +++ b/unidock/src/cuda/warp_ops.cuh @@ -467,16 +467,15 @@ template __device__ __forceinline__ void line_search_war template __device__ __forceinline__ bool bfgs_update_warp(cg::thread_block_tile &tile, matrix_d *h, const change_cuda_t *p, const change_cuda_t *y, - const float alpha, const float epsilon_fl) { + change_cuda_t *minus_hy, const float alpha, + const float epsilon_fl) { float yp, yhy; yp = scalar_product_warp(tile, y, p, h->dim); if (alpha * yp < epsilon_fl) return false; - __shared__ change_cuda_t minus_hy[32 / TileSize]; - auto minus_hy_ptr = &minus_hy[tile.meta_group_rank()]; - change_cuda_init_with_change_warp(tile, minus_hy_ptr, y); - minus_mat_vec_product_warp(tile, h, y, minus_hy_ptr); - yhy = -scalar_product_warp(tile, y, minus_hy_ptr, h->dim); + change_cuda_init_with_change_warp(tile, minus_hy, y); + minus_mat_vec_product_warp(tile, h, y, minus_hy); + yhy = -scalar_product_warp(tile, y, minus_hy, h->dim); float r = 1 / (alpha * yp); int n = 6 + p->lig_torsion_size; @@ -507,9 +506,10 @@ template __device__ void bfgs_warp(cg::thread_block_tile &tile, output_type_cuda_t *x, output_type_cuda_t *x_new, output_type_cuda_t *x_orig, change_cuda_t *g, change_cuda_t *g_new, change_cuda_t *g_orig, change_cuda_t *p, - change_cuda_t *y, matrix_d *h, m_cuda_t *m_cuda_gpu, p_cuda_t *p_cuda_gpu, - ig_cuda_t *ig_cuda_gpu, pot_cuda_t *pot_aux, const float *hunt_cap, - const float 
diff --git a/unidock/src/main/main.cpp b/unidock/src/main/main.cpp
index dded9dd..b1db789 100644
--- a/unidock/src/main/main.cpp
+++ b/unidock/src/main/main.cpp
@@ -20,11 +20,14 @@
  */
 
+#include
 #include
 #include
 #include  // ligand paths
 #include
 #include
+#include "conf.h"
+#include "kernel.h"
 #include "vina.h"
 #include "utils.h"
 #include "scoring_function.h"
@@ -32,6 +35,7 @@
 #include
 #include
 #include
+#include
 
 struct usage_error : public std::runtime_error {
     usage_error(const std::string& message) : std::runtime_error(message) {}
@@ -68,25 +72,39 @@ void check_occurrence(boost::program_options::variables_map& vm,
 }
 
 int predict_peak_memory(int batch_size, int exhaustiveness, int all_atom2_numbers,
-                        bool use_v100 = true, bool multi_bias = false) {
-    if (!multi_bias) {
-        if (use_v100) {
-            return 1.214869 * batch_size + .0038522 * exhaustiveness * batch_size
-                   + .011978 * all_atom2_numbers + 20017.72;  // this is based on V100, 32G
-        } else {
-            return 1.166067 * batch_size + .0038676 * exhaustiveness * batch_size
-                   + .0119598 * all_atom2_numbers + 5313.848;  // this is based on T4, 16G
-        }
-    } else {
-        if (use_v100) {
-            return 65.214869 * batch_size + .0038522 * exhaustiveness * batch_size
-                   + .011978 * all_atom2_numbers + 20017.72;  // this is based on V100, 32G
-        } else {
-            return 65.166067 * batch_size + .0038676 * exhaustiveness * batch_size
-                   + .0119598 * all_atom2_numbers + 5313.848;  // this is based on T4, 16G
-        }
-    }
-    return 0;
+                        bool multi_bias) {
+    int64_t gpu_memory = 0;
+    // precalculate
+    gpu_memory += (int64_t)(1) * all_atom2_numbers * sizeof(precalculate_element_cuda_t);
+
+    // m_cuda_gpu
+    gpu_memory += (int64_t)(1) * batch_size * sizeof(m_cuda_t);
+    // ig_cuda_gpu
+    if (multi_bias)
+        gpu_memory += (int64_t)(1) * batch_size * sizeof(ig_cuda_t);
+    else
+        gpu_memory += sizeof(ig_cuda_t);
+    // p_cuda_gpu
+    gpu_memory += (int64_t)(1) * batch_size * sizeof(p_cuda_t);
+    // rand_molec_struc_gpu
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * SIZE_OF_MOLEC_STRUC;
+    // results_gpu
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * sizeof(output_type_cuda_t);
+    // m_cuda_global
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * sizeof(m_cuda_t);
+    // h_cuda_global
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * sizeof(matrix_d);
+    // change_aux
+    gpu_memory += (int64_t)(6) * batch_size * exhaustiveness * sizeof(change_cuda_t);
+    // results_aux
+    gpu_memory += (int64_t)(5) * batch_size * exhaustiveness * sizeof(output_type_cuda_t);
+    // pot_aux
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * sizeof(pot_cuda_t);
+    // states
+    gpu_memory
+        += (int64_t)(1) * batch_size * exhaustiveness * 64;  // sizeof(curandStatePhilox4_32_10_t)
+
+    return gpu_memory / (1024 * 1024);
 }
 
 int main(int argc, char* argv[]) {
@@ -703,7 +721,9 @@ bug reporting, license agreements, and more information. \n";
     int deviceCount = 0;
     size_t avail;
     size_t total;
-    float max_memory = 32000;
+    // get total memory in MB and leave 5%
+    float max_memory
+        = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE) / 1024 / 1024 * 0.95;
     bool use_v100 = true;
     bool ad4 = false;
     if (sf_name.compare("ad4") == 0) ad4 = true;
@@ -713,12 +733,9 @@ bug reporting, license agreements, and more information. \n";
         cudaMemGetInfo(&avail, &total);
         printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024),
                int(total / 1024 / 1024));
-        max_memory = avail / 1024 / 1024 * 0.95;  // leave 5% to prevent error
-    }
-    if (max_memory < 17000) {
-        // using T4 or other 16G global memory GPU
-        use_v100 = false;
+        max_gpu_memory = avail / 1024 / 1024 * 0.95;  // leave 5%
     }
+    if (max_gpu_memory > 0 && max_gpu_memory < max_memory) {
+        max_memory = (float)max_gpu_memory;
+    }
@@ -758,7 +775,7 @@ bug reporting, license agreements, and more information. \n";
         std::vector<model> batch_ligands;  // ligands in current batch
         v1.bias_batch_list.clear();
         while (predict_peak_memory(batch_size, exhaustiveness, all_atom2_numbers,
-                                   use_v100, v.multi_bias)
+                                   multi_bias)
                        < max_memory
                && processed_ligands + batch_size < all_ligands.size()) {
             batch_ligands.emplace_back(
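With this change predict_peak_memory stops being a per-device linear fit (the old V100/T4 magic numbers) and instead totals, term by term, the very allocations monte_carlo::operator() performs, then converts bytes to MiB. A worked sketch of the shape of that computation, with made-up struct sizes (the real ones depend on Uni-Dock's MAX_* compile-time constants):

    // Worked sketch with placeholder sizes; only the structure of the estimate is real.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t batch_size = 512, exhaustiveness = 128;
        const int64_t sz_change = 300, sz_output = 400, sz_matrix_d = 40000;  // invented sizes
        int64_t bytes = 0;
        bytes += 6 * batch_size * exhaustiveness * sz_change;    // change_aux: six slots per pose
        bytes += 5 * batch_size * exhaustiveness * sz_output;    // results_aux: five slots per pose
        bytes += 1 * batch_size * exhaustiveness * sz_matrix_d;  // h_cuda_global: one Hessian per pose
        bytes += 1 * batch_size * exhaustiveness * 64;           // one curand Philox state per pose
        printf("estimate = %lld MiB\n", (long long)(bytes / (1024 * 1024)));
        return 0;
    }

The batching loop in main() simply grows batch_size until this estimate crosses max_memory.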
\n"; int deviceCount = 0; size_t avail; size_t total; - float max_memory = 32000; + // get total memory in MB and leave 5% + float max_memory + = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE) / 1024 / 1024 * 0.95; bool use_v100 = true; bool ad4 = false; if (sf_name.compare("ad4") == 0) ad4 = true; @@ -713,12 +733,9 @@ bug reporting, license agreements, and more information. \n"; cudaMemGetInfo(&avail, &total); printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), int(total / 1024 / 1024)); - max_memory = avail / 1024 / 1024 * 0.95; // leave 5% to prevent error - } - if (max_memory < 17000) { - // using T4 or other 16G global memory GPU - use_v100 = false; + max_gpu_memory = avail / 1024 / 1024 * 0.95; // leave 5% } + if (max_gpu_memory > 0 && max_gpu_memory < max_memory) { max_memory = (float)max_gpu_memory; } @@ -758,7 +775,7 @@ bug reporting, license agreements, and more information. \n"; std::vector batch_ligands; // ligands in current batch v1.bias_batch_list.clear(); while (predict_peak_memory(batch_size, exhaustiveness, all_atom2_numbers, - use_v100, v.multi_bias) + multi_bias) < max_memory && processed_ligands + batch_size < all_ligands.size()) { batch_ligands.emplace_back( From a29b3cdc7cfbc172cc510f451ccedd43291d1ec0 Mon Sep 17 00:00:00 2001 From: Zihua Wu Date: Sat, 18 Nov 2023 17:42:51 +0800 Subject: [PATCH 3/5] remove debug info --- unidock/src/cuda/monte_carlo.cu | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/unidock/src/cuda/monte_carlo.cu b/unidock/src/cuda/monte_carlo.cu index 04ffdc0..9587e93 100644 --- a/unidock/src/cuda/monte_carlo.cu +++ b/unidock/src/cuda/monte_carlo.cu @@ -274,7 +274,6 @@ __host__ void monte_carlo::operator()( p_cuda_t_cpu *p_cuda; checkCUDA(cudaMallocHost(&p_cuda, sizeof(p_cuda_t_cpu))); - size_t avail, total; /* End CPU allocation */ @@ -321,29 +320,17 @@ __host__ void monte_carlo::operator()( float authentic_v_float[3] = {static_cast(authentic_v[0]), static_cast(authentic_v[1]), static_cast(authentic_v[2])}; - cudaMemGetInfo(&avail, &total); - printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), - int(total / 1024 / 1024)); checkCUDA(cudaMalloc(&authentic_v_gpu, sizeof(authentic_v_float))); // Preparing result data output_type_cuda_t *results_gpu; checkCUDA(cudaMalloc(&results_gpu, thread * sizeof(output_type_cuda_t))); - cudaMemGetInfo(&avail, &total); - printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), - int(total / 1024 / 1024)); m_cuda_t *m_cuda_global; checkCUDA(cudaMalloc(&m_cuda_global, thread * sizeof(m_cuda_t))); - cudaMemGetInfo(&avail, &total); - printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), - int(total / 1024 / 1024)); matrix_d *h_cuda_global; checkCUDA(cudaMalloc(&h_cuda_global, thread * sizeof(matrix_d))); - cudaMemGetInfo(&avail, &total); - printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), - int(total / 1024 / 1024)); /* End Allocating GPU Memory */ From b4a1766db7e85177ea61dd0c5c5c39a3eb34f43e Mon Sep 17 00:00:00 2001 From: Zihua Wu Date: Wed, 13 Dec 2023 13:22:57 +0800 Subject: [PATCH 4/5] clean unused code --- .gitignore | 4 - unidock/src/cuda/common.cuh | 112 -------------------------- unidock/src/cuda/kernel.h | 1 - unidock/src/cuda/monte_carlo.cu | 6 +- unidock/src/cuda/precalculate_gpu.cuh | 2 - unidock/src/cuda/warp_ops.cuh | 1 - unidock/src/lib/precalculate.h | 4 +- 7 files changed, 2 insertions(+), 128 deletions(-) diff --git a/.gitignore 
From b4a1766db7e85177ea61dd0c5c5c39a3eb34f43e Mon Sep 17 00:00:00 2001
From: Zihua Wu
Date: Wed, 13 Dec 2023 13:22:57 +0800
Subject: [PATCH 4/5] clean unused code

---
 .gitignore                            |   4 -
 unidock/src/cuda/common.cuh           | 112 --------------------
 unidock/src/cuda/kernel.h             |   1 -
 unidock/src/cuda/monte_carlo.cu       |   6 +-
 unidock/src/cuda/precalculate_gpu.cuh |   2 -
 unidock/src/cuda/warp_ops.cuh         |   1 -
 unidock/src/lib/precalculate.h        |   4 +-
 7 files changed, 2 insertions(+), 128 deletions(-)

diff --git a/.gitignore b/.gitignore
index ab78e12..70d0727 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,3 @@ unidock_tools/dist
 unidock_tools/dist/*
 unidock_tools/unidock_tools.egg-info
 unidock_tools/unidock_tools.egg-info/*
-
-build-*
-nvidia_inha_benchmark
-compile_commands.json
diff --git a/unidock/src/cuda/common.cuh b/unidock/src/cuda/common.cuh
index 9e08c82..feded27 100644
--- a/unidock/src/cuda/common.cuh
+++ b/unidock/src/cuda/common.cuh
@@ -1,21 +1,9 @@
 #pragma once
-#include "cuda.h"
 #include "curand_kernel.h"
 #include "kernel.h"
 #include "math.h"
 #include
-#include
-#include
-/* Original Include files */
-#include "ad4cache.h"
-#include "cache.h"
-#include "coords.h"
-#include "model.h"
-#include "monte_carlo.h"
-#include "mutate.h"
 #include "precalculate.h"
-#include "quasi_newton.h"
-#include
 
 #define M_PI_F 3.1415927f
 
@@ -26,40 +14,6 @@ __device__ __forceinline__ void quaternion_increment(float *q, const float *rota
 
 __device__ __forceinline__ void normalize_angle(float *x);
 
-__device__ __forceinline__ void output_type_cuda_init(output_type_cuda_t *out, const float *ptr) {
-    memcpy(out, ptr, sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION));
-    out->lig_torsion_size = ptr[3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION];
-    // did not assign coords and e
-}
-
-__device__ __forceinline__ void output_type_cuda_init_with_output(
-    output_type_cuda_t *out_new, const output_type_cuda_t *out_old) {
-    memcpy(out_new, out_old,
-           sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION));
-    out_new->lig_torsion_size = out_old->lig_torsion_size;
-    // assign e but not coords
-    out_new->e = out_old->e;
-}
-
-__device__ __forceinline__ void output_type_cuda_increment(output_type_cuda_t *x,
-                                                           const change_cuda_t *c, float factor,
-                                                           float epsilon_fl) {
-    // position increment
-    for (int k = 0; k < 3; k++) x->position[k] += factor * c->position[k];
-    // orientation increment
-    float rotation[3];
-    for (int k = 0; k < 3; k++) rotation[k] = factor * c->orientation[k];
-    quaternion_increment(x->orientation, rotation, epsilon_fl);
-
-    // torsion increment
-    for (int k = 0; k < x->lig_torsion_size; k++) {
-        float tmp = factor * c->lig_torsion[k];
-        normalize_angle(&tmp);
-        x->lig_torsion[k] += tmp;
-        normalize_angle(&(x->lig_torsion[k]));
-    }
-}
-
 __device__ __forceinline__ float norm3(const float *a) {
     return sqrtf(a[0] * a[0] + a[1] * a[1] + a[2] * a[2]);
 }
@@ -236,16 +190,6 @@ __device__ __forceinline__ void mutate_conf_cuda(const int num_steps, output_typ
 
 /* Below based on matrix.cpp */
 
-__device__ __forceinline__ void matrix_d_init(matrix_d *m, int dim, float fill_data) {
-    m->dim = dim;
-    if ((dim * (dim + 1) / 2) > MAX_HESSIAN_MATRIX_D_SIZE)
-        DEBUG_PRINTF("\nnmatrix_d: matrix_d_init() ERROR!");
-    // ((dim * (dim + 1) / 2)*sizeof(float)); // symmetric matrix_d
-    for (int i = 0; i < (dim * (dim + 1) / 2); i++) m->data[i] = fill_data;
-    for (int i = (dim * (dim + 1) / 2); i < MAX_HESSIAN_MATRIX_D_SIZE; i++)
-        m->data[i] = 0;  // Others will be 0
-}
-
 // as rugular 3x3 matrix_d
 __device__ __forceinline__ void mat_init(matrix_d *m, float fill_data) {
     m->dim = 3;  // fixed to 3x3 matrix_d
 
     for (int i = 0; i < 9; i++) m->data[i] = fill_data;
 }
 
-__device__ __forceinline__ void matrix_d_set_diagonal(matrix_d *m, float fill_data) {
-    for (int i = 0; i < m->dim; i++) {
-        m->data[i + i * (i + 1) / 2] = fill_data;
-    }
-}
-
 // as regular matrix_d
 __device__ __forceinline__ void matrix_d_set_element(matrix_d *m, int dim, int x, int y, float fill_data) {
@@ -720,53 +658,3 @@ __device__ __forceinline__ void find_change_index_write(change_cuda_t *g, int in
     }
     DEBUG_PRINTF("\nKernel2:find_change_index_write() ERROR!"); /* Shouldn't be here */
 }
-
-__device__ __forceinline__ void minus_mat_vec_product(const matrix_d *h, const change_cuda_t *in,
-                                                      change_cuda_t *out) {
-    int n = h->dim;
-    for (int i = 0; i < n; i++) {
-        float sum = 0;
-        for (int j = 0; j < n; j++) {
-            sum += h->data[index_permissive(h, i, j)] * find_change_index_read(in, j);
-        }
-        find_change_index_write(out, i, -sum);
-    }
-}
-
-__device__ __forceinline__ float scalar_product(const change_cuda_t *a, const change_cuda_t *b,
-                                                int n) {
-    float tmp = 0;
-    for (int i = 0; i < n; i++) {
-        tmp += find_change_index_read(a, i) * find_change_index_read(b, i);
-    }
-    return tmp;
-}
-
-__device__ __forceinline__ bool bfgs_update(matrix_d *h, const change_cuda_t *p,
-                                            const change_cuda_t *y, const float alpha,
-                                            const float epsilon_fl) {
-    const float yp = scalar_product(y, p, h->dim);
-
-    if (alpha * yp < epsilon_fl) return false;
-    change_cuda_t minus_hy;
-    change_cuda_init_with_change(&minus_hy, y);
-    minus_mat_vec_product(h, y, &minus_hy);
-    const float yhy = -scalar_product(y, &minus_hy, h->dim);
-    const float r = 1 / (alpha * yp);
-    const int n = 6 + p->lig_torsion_size;
-
-    for (int i = 0; i < n; i++) {
-        for (int j = i; j < n; j++) {
-            float tmp
-                = alpha * r
-                      * (find_change_index_read(&minus_hy, i) * find_change_index_read(p, j)
-                         + find_change_index_read(&minus_hy, j) * find_change_index_read(p, i))
-                  + +alpha * alpha * (r * r * yhy + r) * find_change_index_read(p, i)
-                        * find_change_index_read(p, j);
-
-            h->data[i + j * (j + 1) / 2] += tmp;
-        }
-    }
-
-    return true;
-}
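The serial helpers removed here (minus_mat_vec_product, scalar_product, bfgs_update) are superseded by the warp-cooperative versions in warp_ops.cuh that the earlier patches wired up. Both implementations compute the same thing: the standard rank-two BFGS update of the inverse-Hessian approximation H. Reading the deleted loop with step s = alpha * p, gradient change y, and the code's r = 1/(y^T s), the accumulation is, in LaTeX:

    \[
    H \;\leftarrow\; H
      \;-\; \frac{H\,y\,s^{\top} + s\,y^{\top}H}{y^{\top}s}
      \;+\; \left(\frac{y^{\top}H\,y}{(y^{\top}s)^{2}} + \frac{1}{y^{\top}s}\right) s\,s^{\top},
    \qquad s = \alpha\,p,
    \]

with only the upper triangle stored (h->data[i + j * (j + 1) / 2]). The early return when alpha * yp < epsilon_fl is the curvature safeguard y^T s > 0 that keeps H positive definite.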
diff --git a/unidock/src/cuda/kernel.h b/unidock/src/cuda/kernel.h
index fc5e0f5..706108b 100644
--- a/unidock/src/cuda/kernel.h
+++ b/unidock/src/cuda/kernel.h
@@ -1,6 +1,5 @@
 #pragma once
-#include
 #include
 
 template <typename T>
 void check(T result, char const *const func, const char *const file, int const line) {
diff --git a/unidock/src/cuda/monte_carlo.cu b/unidock/src/cuda/monte_carlo.cu
index 9587e93..bb2c408 100644
--- a/unidock/src/cuda/monte_carlo.cu
+++ b/unidock/src/cuda/monte_carlo.cu
@@ -21,13 +21,11 @@
  */
 
 #include "common.cuh"
-#include "cuda.h"
 #include "curand_kernel.h"
 #include "kernel.h"
 #include "math.h"
 #include "warp_ops.cuh"
 #include
-#include
 #include
 /* Original Include files */
 #include "ad4cache.h"
@@ -38,7 +36,6 @@
 #include "mutate.h"
 #include "precalculate.h"
 #include "quasi_newton.h"
-#include
 #include
 #include
 
@@ -764,8 +761,7 @@ __host__ void monte_carlo::operator()(
     cudaEventSynchronize(stop);
     float msecTotal = 0.0f;
     cudaEventElapsedTime(&msecTotal, start, stop);
-    printf("Time spend on GPU is %f ms\n", msecTotal);
-    // DEBUG_PRINTF("Time spend on GPU is %f ms\n", msecTotal);
+    DEBUG_PRINTF("Time spend on GPU is %f ms\n", msecTotal);
 
     /* Convert result data. Can be improved by mapping memory */
diff --git a/unidock/src/cuda/precalculate_gpu.cuh b/unidock/src/cuda/precalculate_gpu.cuh
index 673f8b4..ff6c1b3 100644
--- a/unidock/src/cuda/precalculate_gpu.cuh
+++ b/unidock/src/cuda/precalculate_gpu.cuh
@@ -1,6 +1,4 @@
 #include "math.h"
-#include "kernel.h"
-#include "macros.h"
 
 // Define GPU precalculate structures
diff --git a/unidock/src/cuda/warp_ops.cuh b/unidock/src/cuda/warp_ops.cuh
index 71d4abf..9647cc0 100644
--- a/unidock/src/cuda/warp_ops.cuh
+++ b/unidock/src/cuda/warp_ops.cuh
@@ -1,5 +1,4 @@
 #pragma once
-#include "bfgs.h"
 #include "common.cuh"
 #include "kernel.h"
 #include
diff --git a/unidock/src/lib/precalculate.h b/unidock/src/lib/precalculate.h
index d738ac5..4715146 100644
--- a/unidock/src/lib/precalculate.h
+++ b/unidock/src/lib/precalculate.h
@@ -25,6 +25,7 @@
 
 #include "scoring_function.h"
 #include "matrix.h"
+#include "model.h"
 #include "kernel.h"
 
 #ifdef DEBUG
@@ -33,9 +34,6 @@
 #    define DEBUG_PRINTF(...)
 #endif
 
-// Forward declaration
-struct model;
-
 struct precalculate_element {
 public:
     precalculate_element(sz n, fl factor_) : fast(n, 0), smooth(n, pr(0, 0)), factor(factor_) {}

From 6b579d1f39c86c59315d2ccb6041fe19ffef7400 Mon Sep 17 00:00:00 2001
From: Zihua Wu
Date: Wed, 13 Dec 2023 13:26:39 +0800
Subject: [PATCH 5/5] clean unused code

---
 unidock/src/main/main.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/unidock/src/main/main.cpp b/unidock/src/main/main.cpp
index b1db789..df45d48 100644
--- a/unidock/src/main/main.cpp
+++ b/unidock/src/main/main.cpp
@@ -20,22 +20,16 @@
  */
 
-#include
 #include
 #include
 #include  // ligand paths
-#include
 #include
-#include "conf.h"
-#include "kernel.h"
 #include "vina.h"
 #include "utils.h"
 #include "scoring_function.h"
-#include
 #include
 #include
-#include
 
 struct usage_error : public std::runtime_error {
     usage_error(const std::string& message) : std::runtime_error(message) {}
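One note on the precalculate.h hunk in PATCH 4 above: it goes the opposite direction from the usual advice, replacing the struct model; forward declaration with a full #include "model.h". That is the right call once a header's own code needs the complete type; a forward declaration only supports uses that never ask for the type's size or members. A generic illustration (hypothetical widget/holder types, nothing from this tree):

    // Hypothetical types; illustrates forward declaration vs. full definition.
    struct widget;  // forward declaration: the name exists, the layout does not

    struct holds_pointer {
        widget *w;  // fine: pointers and references need only the name
    };
    // struct holds_value_early { widget w; };  // error: incomplete type
    // int bad() { return sizeof(widget); }     // error: size unknown

    struct widget { int id; };  // full definition (normally pulled in via #include)

    struct holds_value { widget w; };                   // fine now
    int widget_bytes() { return (int)sizeof(widget); }  // fine now

    int main() { return widget_bytes() > 0 ? 0 : 1; }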