From a3048845e2f76ff70f51a3eeac1a432dba8af13b Mon Sep 17 00:00:00 2001
From: Zihua Wu
Date: Mon, 9 Oct 2023 09:53:53 +0800
Subject: [PATCH 1/5] Warp-level parallelization

---
 .gitignore                                    |    4 +
 ...constants_gpu.h => atom_constants_gpu.cuh} |    0
 unidock/src/cuda/common.cuh                   |  778 ++++++++++
 unidock/src/cuda/kernel.h                     |   30 +-
 unidock/src/cuda/monte_carlo.cu               | 1266 +++-------------
 unidock/src/cuda/precalculate.cu              |    2 +-
 ...recalculate_gpu.h => precalculate_gpu.cuh} |    2 +-
 unidock/src/cuda/warp_ops.cuh                 |  660 +++++++++
 8 files changed, 1669 insertions(+), 1073 deletions(-)
 rename unidock/src/cuda/{atom_constants_gpu.h => atom_constants_gpu.cuh} (100%)
 create mode 100644 unidock/src/cuda/common.cuh
 rename unidock/src/cuda/{precalculate_gpu.h => precalculate_gpu.cuh} (99%)
 create mode 100644 unidock/src/cuda/warp_ops.cuh

diff --git a/.gitignore b/.gitignore
index 70d0727..ab78e12 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,7 @@ unidock_tools/dist
 unidock_tools/dist/*
 unidock_tools/unidock_tools.egg-info
 unidock_tools/unidock_tools.egg-info/*
+
+build-*
+nvidia_inha_benchmark
+compile_commands.json
diff --git a/unidock/src/cuda/atom_constants_gpu.h b/unidock/src/cuda/atom_constants_gpu.cuh
similarity index 100%
rename from unidock/src/cuda/atom_constants_gpu.h
rename to unidock/src/cuda/atom_constants_gpu.cuh
diff --git a/unidock/src/cuda/common.cuh b/unidock/src/cuda/common.cuh
new file mode 100644
index 0000000..1e0000f
--- /dev/null
+++ b/unidock/src/cuda/common.cuh
@@ -0,0 +1,778 @@
+#pragma once
+#include "cuda.h"
+#include "curand_kernel.h"
+#include "kernel.h"
+#include "math.h"
+#include
+#include
+#include
+/* Original Include files */
+#include "ad4cache.h"
+#include "cache.h"
+#include "coords.h"
+#include "model.h"
+#include "monte_carlo.h"
+#include "mutate.h"
+#include "precalculate.h"
+#include "quasi_newton.h"
+#include
+
+#define M_PI_F 3.1415927f
+
+// symmetric matrix_d (only half of it is stored)
+typedef struct {
+    float data[MAX_HESSIAN_MATRIX_D_SIZE];
+    int dim;
+} matrix_d;
+
+/* Below based on mutate_conf.cpp */
+
+__device__ __forceinline__ void quaternion_increment(float *q, const float *rotation,
+                                                     float epsilon_fl);
+
+__device__ __forceinline__ void normalize_angle(float *x);
+
+__device__ __forceinline__ void output_type_cuda_init(output_type_cuda_t *out, const float *ptr) {
+    memcpy(out, ptr, sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION));
+    out->lig_torsion_size = ptr[3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION];
+    // did not assign coords and e
+}
+
+__device__ __forceinline__ void output_type_cuda_init_with_output(
+    output_type_cuda_t *out_new, const output_type_cuda_t *out_old) {
+    memcpy(out_new, out_old,
+           sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION));
+    out_new->lig_torsion_size = out_old->lig_torsion_size;
+    // assign e but not coords
+    out_new->e = out_old->e;
+}
+
+__device__ __forceinline__ void output_type_cuda_increment(output_type_cuda_t *x,
+                                                           const change_cuda_t *c, float factor,
+                                                           float epsilon_fl) {
+    // position increment
+    for (int k = 0; k < 3; k++) x->position[k] += factor * c->position[k];
+    // orientation increment
+    float rotation[3];
+    for (int k = 0; k < 3; k++) rotation[k] = factor * c->orientation[k];
+    quaternion_increment(x->orientation, rotation, epsilon_fl);
+
+    // torsion increment
+    for (int k = 0; k < x->lig_torsion_size; k++) {
+        float tmp = factor * c->lig_torsion[k];
+        normalize_angle(&tmp);
+        x->lig_torsion[k] += tmp;
+        normalize_angle(&(x->lig_torsion[k]));
+    }
+}
+
+__device__ __forceinline__ float norm3(const float *a) {
+    return sqrtf(a[0] * a[0] + a[1] * a[1] + a[2] * a[2]);
+}
+
+__device__ __forceinline__ void random_inside_sphere_gpu(float *random_inside_sphere,
+                                                         curandStatePhilox4_32_10_t *state) {
+    float4 random_inside_sphere_fl;
+    while (true) {  // on average, this will have to be run about twice
+        random_inside_sphere_fl = curand_uniform4(state);  // ~ U[0,1]
+        random_inside_sphere[0] = (random_inside_sphere_fl.x - 0.5) * 2.0;
+        random_inside_sphere[1] = (random_inside_sphere_fl.y - 0.5) * 2.0;
+        random_inside_sphere[2] = (random_inside_sphere_fl.z - 0.5) * 2.0;
+        random_inside_sphere[3] = random_inside_sphere_fl.w;
+        float r = norm3(random_inside_sphere);
+        if (r < 1) {
+            return;
+        }
+    }
+}
+
+__device__ __forceinline__ void normalize_angle(float *x) {
+    while (1) {
+        if (*x >= -(M_PI_F) && *x <= (M_PI_F)) {
+            break;
+        } else if (*x > 3 * (M_PI_F)) {
+            float n = (*x - (M_PI_F)) / (2 * (M_PI_F));
+            *x -= 2 * (M_PI_F)*ceil(n);
+        } else if (*x < 3 * -(M_PI_F)) {
+            float n = (-*x - (M_PI_F)) / (2 * (M_PI_F));
+            *x += 2 * (M_PI_F)*ceil(n);
+        } else if (*x > (M_PI_F)) {
+            *x -= 2 * (M_PI_F);
+        } else if (*x < -(M_PI_F)) {
+            *x += 2 * (M_PI_F);
+        } else {
+            break;
+        }
+    }
+}
+
+__device__ __forceinline__ bool quaternion_is_normalized(float *q) {
+    float q_pow = q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3];
+    float sqrt_q_pow = sqrtf(q_pow);
+    return (q_pow - 1 < 0.001) && (sqrt_q_pow - 1 < 0.001);
+}
+
+__device__ __forceinline__ void angle_to_quaternion(float *q, const float *rotation,
+                                                    float epsilon_fl) {
+    float angle = norm3(rotation);
+    if (angle > epsilon_fl) {
+        float axis[3] = {rotation[0] / angle, rotation[1] / angle, rotation[2] / angle};
+        normalize_angle(&angle);
+        float c = cos(angle / 2);
+        float s = sin(angle / 2);
+        q[0] = c;
+        q[1] = s * axis[0];
+        q[2] = s * axis[1];
+        q[3] = s * axis[2];
+        return;
+    }
+    q[0] = 1;
+    q[1] = 0;
+    q[2] = 0;
+    q[3] = 0;
+    return;
+}
+
+// quaternion multiplication
+__device__ __forceinline__ void angle_to_quaternion_multi(float *qa, const float *qb) {
+    float tmp[4] = {qa[0], qa[1], qa[2], qa[3]};
+    qa[0] = tmp[0] * qb[0] - tmp[1] * qb[1] - tmp[2] * qb[2] - tmp[3] * qb[3];
+    qa[1] = tmp[0] * qb[1] + tmp[1] * qb[0] + tmp[2] * qb[3] - tmp[3] * qb[2];
+    qa[2] = tmp[0] * qb[2] - tmp[1] * qb[3] + tmp[2] * qb[0] + tmp[3] * qb[1];
+    qa[3] = tmp[0] * qb[3] + tmp[1] * qb[2] - tmp[2] * qb[1] + tmp[3] * qb[0];
+}
+
+__device__ __forceinline__ void quaternion_normalize_approx(float *q, float epsilon_fl) {
+    const float s = q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3];
+    // Omit one assert()
+    if (fabs(s - 1) < TOLERANCE)
+        ;
+    else {
+        const float a = sqrtf(s);
+        for (int i = 0; i < 4; i++) q[i] /= a;
+    }
+}
+
+__device__ __forceinline__ void quaternion_increment(float *q, const float *rotation,
+                                                     float epsilon_fl) {
+    float q_old[4] = {q[0], q[1], q[2], q[3]};
+    angle_to_quaternion(q, rotation, epsilon_fl);
+    angle_to_quaternion_multi(q, q_old);
+    quaternion_normalize_approx(q, epsilon_fl);
+    // assert(quaternion_is_normalized(q)); // unnecessary
+}
+
+__device__ __forceinline__ float vec_distance_sqr(float *a, float *b) {
+    return (a[0] - b[0]) * (a[0] - b[0]) + (a[1] - b[1]) * (a[1] - b[1])
+           + (a[2] - b[2]) * (a[2] - b[2]);
+}
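quaternion_increment above converts the random axis-angle vector to a quaternion (angle_to_quaternion) and left-multiplies it onto the current orientation. A minimal host-side sketch of that composition rule, useful as a sanity check; the helper names here are hypothetical and not part of the patch:

```cpp
// Sketch (not from the patch): compose a rotation and its inverse; the
// result should be (approximately) the identity quaternion (1, 0, 0, 0).
#include <cmath>
#include <cstdio>

static void axis_angle_to_q(float *q, const float *rot) {
    float angle = std::sqrt(rot[0] * rot[0] + rot[1] * rot[1] + rot[2] * rot[2]);
    if (angle < 1e-6f) { q[0] = 1; q[1] = q[2] = q[3] = 0; return; }
    float s = std::sin(angle / 2) / angle;
    q[0] = std::cos(angle / 2); q[1] = s * rot[0]; q[2] = s * rot[1]; q[3] = s * rot[2];
}

static void q_mul(float *qa, const float *qb) {  // qa <- qa * qb, as in angle_to_quaternion_multi
    float t[4] = {qa[0], qa[1], qa[2], qa[3]};
    qa[0] = t[0] * qb[0] - t[1] * qb[1] - t[2] * qb[2] - t[3] * qb[3];
    qa[1] = t[0] * qb[1] + t[1] * qb[0] + t[2] * qb[3] - t[3] * qb[2];
    qa[2] = t[0] * qb[2] - t[1] * qb[3] + t[2] * qb[0] + t[3] * qb[1];
    qa[3] = t[0] * qb[3] + t[1] * qb[2] - t[2] * qb[1] + t[3] * qb[0];
}

int main() {
    float q[4] = {1, 0, 0, 0};                       // identity orientation
    float rot[3] = {0.3f, -0.2f, 0.1f}, inv[3] = {-0.3f, 0.2f, -0.1f};
    float r[4], r2[4];
    axis_angle_to_q(r, rot);  q_mul(r, q);           // q <- q(rot) * q
    axis_angle_to_q(r2, inv); q_mul(r2, r);          // undo the rotation
    std::printf("%f %f %f %f\n", r2[0], r2[1], r2[2], r2[3]);  // ~ 1 0 0 0
}
```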
+
+__device__ __forceinline__ float gyration_radius(int m_lig_begin, int m_lig_end,
+                                                 const atom_cuda_t *atoms,
+                                                 const m_coords_cuda_t *m_coords_gpu,
+                                                 const float *m_lig_node_origin) {
+    float acc = 0;
+    int counter = 0;
+    float origin[3] = {m_lig_node_origin[0], m_lig_node_origin[1], m_lig_node_origin[2]};
+    for (int i = m_lig_begin; i < m_lig_end; i++) {
+        float current_coords[3]
+            = {m_coords_gpu->coords[i][0], m_coords_gpu->coords[i][1], m_coords_gpu->coords[i][2]};
+        if (atoms[i].types[0]
+            != EL_TYPE_H) {  // for el, we use the first element (atoms[i].types[0])
+            acc += vec_distance_sqr(current_coords, origin);
+            ++counter;
+        }
+    }
+    return (counter > 0) ? sqrtf(acc / counter) : 0;
+}
+
+__device__ __forceinline__ void mutate_conf_cuda(const int num_steps, output_type_cuda_t *c,
+                                                 curandStatePhilox4_32_10_t *state,
+                                                 const int m_lig_begin, const int m_lig_end,
+                                                 const atom_cuda_t *atoms,
+                                                 const m_coords_cuda_t *m_coords_gpu,
+                                                 const float *m_lig_node_origin_gpu,
+                                                 const float epsilon_fl, const float amplitude) {
+    int flex_torsion_size = 0;  // FIX? 20210727
+    int count_mutable_entities = 2 + c->lig_torsion_size + flex_torsion_size;
+    int which = curand(state) % count_mutable_entities;
+    float random_inside_sphere[4];
+    random_inside_sphere_gpu(random_inside_sphere, state);
+    if (which == 0) {
+        DEBUG_PRINTF("random sphere r=%f\n", norm3(random_inside_sphere));
+    }
+
+    float random_pi = (random_inside_sphere[3] - 0.5) * 2.0 * pi;  // ~ U[-pi, pi]
+    if (which == 0) {
+        DEBUG_PRINTF("random pi=%f\n", random_pi);
+    }
+
+    if (which == 0) {
+        for (int i = 0; i < 3; i++) c->position[i] += amplitude * random_inside_sphere[i];
+        return;
+    }
+    --which;
+    if (which == 0) {
+        float gr
+            = gyration_radius(m_lig_begin, m_lig_end, atoms, m_coords_gpu, m_lig_node_origin_gpu);
+        if (gr > epsilon_fl) {
+            float rotation[3];
+            for (int i = 0; i < 3; i++) rotation[i] = amplitude / gr * random_inside_sphere[i];
+            quaternion_increment(c->orientation, rotation, epsilon_fl);
+        }
+        return;
+    }
+    --which;
+    if (which < c->lig_torsion_size) {
+        c->lig_torsion[which] = random_pi;
+        return;
+    }
+    which -= c->lig_torsion_size;
+
+    if (flex_torsion_size != 0) {
+        if (which < flex_torsion_size) {
+            c->flex_torsion[which] = random_pi;
+            return;
+        }
+        which -= flex_torsion_size;
+    }
+}
+
+/* Above based on mutate_conf.cpp */
+
+/* Below based on matrix.cpp */
+
+__device__ __forceinline__ void matrix_d_init(matrix_d *m, int dim, float fill_data) {
+    m->dim = dim;
+    if ((dim * (dim + 1) / 2) > MAX_HESSIAN_MATRIX_D_SIZE)
+        DEBUG_PRINTF("\nmatrix_d: matrix_d_init() ERROR!");
+    // ((dim * (dim + 1) / 2)*sizeof(float)); // symmetric matrix_d
+    for (int i = 0; i < (dim * (dim + 1) / 2); i++) m->data[i] = fill_data;
+    for (int i = (dim * (dim + 1) / 2); i < MAX_HESSIAN_MATRIX_D_SIZE; i++)
+        m->data[i] = 0;  // Others will be 0
+}
+
+// as regular 3x3 matrix_d
+__device__ __forceinline__ void mat_init(matrix_d *m, float fill_data) {
+    m->dim = 3;  // fixed to 3x3 matrix_d
+    if (9 > MAX_HESSIAN_MATRIX_D_SIZE) DEBUG_PRINTF("\nmatrix_d: mat_init() ERROR!");
+    for (int i = 0; i < 9; i++) m->data[i] = fill_data;
+}
+
+__device__ __forceinline__ void matrix_d_set_diagonal(matrix_d *m, float fill_data) {
+    for (int i = 0; i < m->dim; i++) {
+        m->data[i + i * (i + 1) / 2] = fill_data;
+    }
+}
+
+// as regular matrix_d
+__device__ __forceinline__ void matrix_d_set_element(matrix_d *m, int dim, int x, int y,
+                                                     float fill_data) {
+    m->data[x + y * dim] = fill_data;
+}
+
+__device__ __forceinline__ void matrix_d_set_element_tri(matrix_d *m, int x, int y,
+                                                         float fill_data) {
+    m->data[x + y * (y + 1) / 2] = fill_data;
+}
+__device__ __forceinline__ int tri_index(int n, int i, int j) {
+    if (j >= n || i > j) DEBUG_PRINTF("\nmatrix_d: tri_index ERROR!");
+    return i + j * (j + 1) / 2;
+}
+
+__device__ __forceinline__ int index_permissive(const matrix_d *m, int i, int j) {
+    return (i < j) ? tri_index(m->dim, i, j) : tri_index(m->dim, j, i);
+}
+
+/* Above based on matrix_d.cpp */
+
+/* Below based on quasi_newton.cpp */
+
+__device__ __forceinline__ void change_cuda_init(change_cuda_t *g, const float *ptr) {
+    for (int i = 0; i < 3; i++) g->position[i] = ptr[i];
+    for (int i = 0; i < 3; i++) g->orientation[i] = ptr[i + 3];
+    for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) g->lig_torsion[i] = ptr[i + 3 + 3];
+    for (int i = 0; i < MAX_NUM_OF_FLEX_TORSION; i++)
+        g->flex_torsion[i] = ptr[i + 3 + 3 + MAX_NUM_OF_LIG_TORSION];
+    g->lig_torsion_size = ptr[3 + 3 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION];
+}
+
+__device__ __forceinline__ void change_cuda_init_with_change(change_cuda_t *g_new,
+                                                             const change_cuda_t *g_old) {
+    for (int i = 0; i < 3; i++) g_new->position[i] = g_old->position[i];
+    for (int i = 0; i < 3; i++) g_new->orientation[i] = g_old->orientation[i];
+    for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) g_new->lig_torsion[i] = g_old->lig_torsion[i];
+    for (int i = 0; i < MAX_NUM_OF_FLEX_TORSION; i++)
+        g_new->flex_torsion[i] = g_old->flex_torsion[i];
+    g_new->lig_torsion_size = g_old->lig_torsion_size;
+}
+
+void print_output_type(output_type_cuda_t *x, int torsion_size) {
+    for (int i = 0; i < 3; i++) DEBUG_PRINTF("\nx.position[%d] = %0.16f", i, x->position[i]);
+    for (int i = 0; i < 4; i++) DEBUG_PRINTF("\nx.orientation[%d] = %0.16f", i, x->orientation[i]);
+    for (int i = 0; i < torsion_size; i++)
+        DEBUG_PRINTF("\n x.torsion[%d] = %0.16f", i, x->lig_torsion[i]);
+    DEBUG_PRINTF("\n x.torsion_size = %f", x->lig_torsion_size);
+    DEBUG_PRINTF("\n !!! x.e = %f\n", x->e);
+}
+
+void print_change(change_cuda_t *g, int torsion_size) {
+    for (int i = 0; i < 3; i++) DEBUG_PRINTF("\ng.position[%d] = %0.16f", i, g->position[i]);
+    for (int i = 0; i < 3; i++) DEBUG_PRINTF("\ng.orientation[%d] = %0.16f", i, g->orientation[i]);
+    for (int i = 0; i < torsion_size; i++)
+        DEBUG_PRINTF("\ng.torsion[%d] = %0.16f", i, g->lig_torsion[i]);
+    DEBUG_PRINTF("\ng.torsion_size = %f", g->lig_torsion_size);
+}
+
+__device__ __forceinline__ int num_atom_types(int atu) {
+    switch (atu) {
+        case 0:
+            return EL_TYPE_SIZE;
+        case 1:
+            return AD_TYPE_SIZE;
+        case 2:
+            return XS_TYPE_SIZE;
+        case 3:
+            return SY_TYPE_SIZE;
+        default:
+            DEBUG_PRINTF("Kernel1:num_atom_types() ERROR!");
+            return INFINITY;
+    }
+}
+
+__device__ __forceinline__ void elementwise_product(float *out, const float *a, const float *b) {
+    out[0] = a[0] * b[0];
+    out[1] = a[1] * b[1];
+    out[2] = a[2] * b[2];
+}
+
+__device__ __forceinline__ float elementwise_product_sum(const float *a, const float *b) {
+    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
+}
+
+__device__ __forceinline__ float access_m_data(float *m_data, int m_i, int m_j, int i, int j,
+                                               int k) {
+    return m_data[i + m_i * (j + m_j * k)];
+}
+
+__device__ __forceinline__ bool not_max_gpu(float x) {
+    return (x < 0.1 * INFINITY); /* Problem: replace max_fl with INFINITY? */
+}
+
+__device__ __forceinline__ void curl_with_deriv(float *e, float *deriv, float v,
+                                                const float epsilon_fl) {
+    if (*e > 0 && not_max_gpu(v)) {
+        float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e));
+        *e *= tmp;
+        for (int i = 0; i < 3; i++) deriv[i] *= tmp * tmp;
+    }
+}
+
+__device__ __forceinline__ void curl_without_deriv(float *e, float v, const float epsilon_fl) {
+    if (*e > 0 && not_max_gpu(v)) {
+        float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e));
+        *e *= tmp;
+    }
+}
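The matrix_d helpers above store the symmetric BFGS Hessian in packed upper-triangular form: element (i, j) with i <= j lives at offset i + j*(j+1)/2, and index_permissive() swaps the indices below the diagonal. A minimal host-side sketch of that layout (not part of the patch):

```cpp
// Sketch: packed upper-triangular storage as used by matrix_d / tri_index.
#include <cstdio>

static int tri_index(int i, int j) { return i + j * (j + 1) / 2; }

int main() {
    const int n = 4;
    float packed[n * (n + 1) / 2];
    // fill the upper triangle only, like matrix_d_set_element_tri()
    for (int j = 0; j < n; j++)
        for (int i = 0; i <= j; i++) packed[tri_index(i, j)] = 10.f * i + j;
    // symmetric read-back, mirroring index_permissive()
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            int idx = (i < j) ? tri_index(i, j) : tri_index(j, i);
            std::printf("%5.0f ", packed[idx]);
        }
        std::printf("\n");
    }
}
```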
+
+__device__ __forceinline__ float g_evaluate(grid_cuda_t *g, const float *m_coords, /* double[3] */
+                                            const float slope,                     /* double */
+                                            const float v,                         /* double */
+                                            float *deriv,                          /* double[3] */
+                                            const float epsilon_fl) {
+    int m_i = g->m_i;
+    int m_j = g->m_j;
+    int m_k = g->m_k;
+    if (m_i * m_j * m_k == 0) DEBUG_PRINTF("\nkernel2: g_evaluate ERROR!#1");
+    float tmp_vec[3]
+        = {m_coords[0] - g->m_init[0], m_coords[1] - g->m_init[1], m_coords[2] - g->m_init[2]};
+    float tmp_vec2[3] = {g->m_factor[0], g->m_factor[1], g->m_factor[2]};
+    float s[3];
+    elementwise_product(s, tmp_vec, tmp_vec2);
+
+    float miss[3] = {0, 0, 0};
+    int region[3];
+    int a[3];
+    int m_data_dims[3] = {m_i, m_j, m_k};
+    for (int i = 0; i < 3; i++) {
+        if (s[i] < 0) {
+            miss[i] = -s[i];
+            region[i] = -1;
+            a[i] = 0;
+            s[i] = 0;
+        } else if (s[i] >= g->m_dim_fl_minus_1[i]) {
+            miss[i] = s[i] - g->m_dim_fl_minus_1[i];
+            region[i] = 1;
+            if (m_data_dims[i] < 2) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#2");
+            a[i] = m_data_dims[i] - 2;
+            s[i] = 1;
+        } else {
+            region[i] = 0;
+            a[i] = (int)s[i];
+            s[i] -= a[i];
+        }
+        if (s[i] < 0) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#3");
+        if (s[i] > 1) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#4");
+        if (a[i] < 0) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#5");
+        if (a[i] + 1 >= m_data_dims[i]) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#5");
+    }
+
+    float tmp_m_factor_inv[3] = {g->m_factor_inv[0], g->m_factor_inv[1], g->m_factor_inv[2]};
+    const float penalty = slope * elementwise_product_sum(miss, tmp_m_factor_inv);
+    if (penalty <= -epsilon_fl) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#6");
+
+    const int x0 = a[0];
+    const int y0 = a[1];
+    const int z0 = a[2];
+
+    const int x1 = x0 + 1;
+    const int y1 = y0 + 1;
+    const int z1 = z0 + 1;
+
+    const float f000 = access_m_data(g->m_data, m_i, m_j, x0, y0, z0);
+    const float f100 = access_m_data(g->m_data, m_i, m_j, x1, y0, z0);
+    const float f010 = access_m_data(g->m_data, m_i, m_j, x0, y1, z0);
+    const float f110 = access_m_data(g->m_data, m_i, m_j, x1, y1, z0);
+    const float f001 = access_m_data(g->m_data, m_i, m_j, x0, y0, z1);
+    const float f101 = access_m_data(g->m_data, m_i, m_j, x1, y0, z1);
+    const float f011 = access_m_data(g->m_data, m_i, m_j, x0, y1, z1);
+    const float f111 = access_m_data(g->m_data, m_i, m_j, x1, y1, z1);
+
+    const float x = s[0];
+    const float y = s[1];
+    const float z = s[2];
+
+    const float mx = 1 - x;
+    const float my = 1 - y;
+    const float mz = 1 - z;
+
+    float f = f000 * mx * my * mz + f100 * x * my * mz + f010 * mx * y * mz + f110 * x * y * mz
+              + f001 * mx * my * z + f101 * x * my * z + f011 * mx * y * z + f111 * x * y * z;
+
+    if (deriv) {
+        const float x_g = f000 * (-1) * my * mz + f100 * 1 * my * mz + f010 * (-1) * y * mz
+                          + f110 * 1 * y * mz + f001 * (-1) * my * z + f101 * 1 * my * z
+                          + f011 * (-1) * y * z + f111 * 1 * y * z;
+
+        const float y_g = f000 * mx * (-1) * mz + f100 * x * (-1) * mz + f010 * mx * 1 * mz
+                          + f110 * x * 1 * mz + f001 * mx * (-1) * z + f101 * x * (-1) * z
+                          + f011 * mx * 1 * z + f111 * x * 1 * z;
+
+        const float z_g = f000 * mx * my * (-1) + f100 * x * my * (-1) + f010 * mx * y * (-1)
+                          + f110 * x * y * (-1) + f001 * mx * my * 1 + f101 * x * my * 1
+                          + f011 * mx * y * 1 + f111 * x * y * 1;
+
+        float gradient[3] = {x_g, y_g, z_g};
+
+        curl_with_deriv(&f, gradient, v, epsilon_fl);
+
+        float gradient_everywhere[3];
+
+        for (int i = 0; i < 3; i++) {
+            gradient_everywhere[i] = ((region[i] == 0) ? gradient[i] : 0);
+            deriv[i] = g->m_factor[i] * gradient_everywhere[i] + slope * region[i];
+        }
+        return f + penalty;
+    } else { /* invalid pointer */
+        DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#7");
+        curl_without_deriv(&f, v, epsilon_fl);
+        return f + penalty;
+    }
+}
+
+__device__ __forceinline__ float ig_eval_deriv(output_type_cuda_t *x, change_cuda_t *g,
+                                               const float v, ig_cuda_t *ig_cuda_gpu,
+                                               m_cuda_t *m_cuda_gpu, const float epsilon_fl) {
+    float e = 0;
+    int nat = num_atom_types(ig_cuda_gpu->atu);
+    for (int i = 0; i < m_cuda_gpu->m_num_movable_atoms; i++) {
+        int t = m_cuda_gpu->atoms[i].types[ig_cuda_gpu->atu];
+        if (t >= nat) {
+            for (int j = 0; j < 3; j++) m_cuda_gpu->minus_forces.coords[i][j] = 0;
+            continue;
+        }
+        float deriv[3];
+
+        e = e
+            + g_evaluate(&ig_cuda_gpu->grids[t], m_cuda_gpu->m_coords.coords[i], ig_cuda_gpu->slope,
+                         v, deriv, epsilon_fl);
+
+        for (int j = 0; j < 3; j++) m_cuda_gpu->minus_forces.coords[i][j] = deriv[j];
+    }
+    return e;
+}
+
+__device__ __forceinline__ void quaternion_to_r3(const float *q, float *orientation_m) {
+    /* Omit assert(quaternion_is_normalized(q)); */
+    const float a = q[0];
+    const float b = q[1];
+    const float c = q[2];
+    const float d = q[3];
+
+    const float aa = a * a;
+    const float ab = a * b;
+    const float ac = a * c;
+    const float ad = a * d;
+    const float bb = b * b;
+    const float bc = b * c;
+    const float bd = b * d;
+    const float cc = c * c;
+    const float cd = c * d;
+    const float dd = d * d;
+
+    /* Omit assert(eq(aa + bb + cc + dd, 1)); */
+    matrix_d tmp;
+    mat_init(&tmp, 0); /* matrix_d with fixed dimension 3 (here we treat this as
+                          a regular matrix_d (not a triangular matrix_d!)) */
+
+    matrix_d_set_element(&tmp, 3, 0, 0, (aa + bb - cc - dd));
+    matrix_d_set_element(&tmp, 3, 0, 1, 2 * (-ad + bc));
+    matrix_d_set_element(&tmp, 3, 0, 2, 2 * (ac + bd));
+
+    matrix_d_set_element(&tmp, 3, 1, 0, 2 * (ad + bc));
+    matrix_d_set_element(&tmp, 3, 1, 1, (aa - bb + cc - dd));
+    matrix_d_set_element(&tmp, 3, 1, 2, 2 * (-ab + cd));
+
+    matrix_d_set_element(&tmp, 3, 2, 0, 2 * (-ac + bd));
+    matrix_d_set_element(&tmp, 3, 2, 1, 2 * (ab + cd));
+    matrix_d_set_element(&tmp, 3, 2, 2, (aa - bb - cc + dd));
+
+    for (int i = 0; i < 9; i++) orientation_m[i] = tmp.data[i];
+}
+
+__device__ __forceinline__ void local_to_lab_direction(float *out, const float *local_direction,
+                                                       const float *orientation_m) {
+    out[0] = orientation_m[0] * local_direction[0] + orientation_m[3] * local_direction[1]
+             + orientation_m[6] * local_direction[2];
+    out[1] = orientation_m[1] * local_direction[0] + orientation_m[4] * local_direction[1]
+             + orientation_m[7] * local_direction[2];
+    out[2] = orientation_m[2] * local_direction[0] + orientation_m[5] * local_direction[1]
+             + orientation_m[8] * local_direction[2];
+}
+
+__device__ __forceinline__ void local_to_lab(float *out, const float *origin,
+                                             const float *local_coords,
+                                             const float *orientation_m) {
+    out[0] = origin[0]
+             + (orientation_m[0] * local_coords[0] + orientation_m[3] * local_coords[1]
+                + orientation_m[6] * local_coords[2]);
+    out[1] = origin[1]
+             + (orientation_m[1] * local_coords[0] + orientation_m[4] * local_coords[1]
+                + orientation_m[7] * local_coords[2]);
+    out[2] = origin[2]
+             + (orientation_m[2] * local_coords[0] + orientation_m[5] * local_coords[1]
+                + orientation_m[8] * local_coords[2]);
+}
+
+__device__ __forceinline__ void angle_to_quaternion2(float *out, const float *axis, float angle) {
+    normalize_angle(&angle);
+    float c = cos(angle / 2);
+    float s = sin(angle / 2);
+    out[0] = c;
+    out[1] = s * axis[0];
+    out[2] = s * axis[1];
+    out[3] = s * axis[2];
+}
+
+__device__ __forceinline__ void set(const output_type_cuda_t *x, rigid_cuda_t *lig_rigid_gpu,
+                                    m_coords_cuda_t *m_coords_gpu, const atom_cuda_t *atoms,
+                                    const int m_num_movable_atoms, const float epsilon_fl) {
+    for (int i = 0; i < 3; i++) lig_rigid_gpu->origin[0][i] = x->position[i];
+    for (int i = 0; i < 4; i++) lig_rigid_gpu->orientation_q[0][i] = x->orientation[i];
+    quaternion_to_r3(lig_rigid_gpu->orientation_q[0],
+                     lig_rigid_gpu->orientation_m[0]); /* set orientation_m */
+
+    int begin = lig_rigid_gpu->atom_range[0][0];
+    int end = lig_rigid_gpu->atom_range[0][1];
+
+    for (int i = begin; i < end; i++) {
+        local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[0], atoms[i].coords,
+                     lig_rigid_gpu->orientation_m[0]);
+    }
+    /* ************* end node.set_conf ************* */
+
+    /* ************* branches_set_conf ************* */
+    /* update nodes in depth-first order */
+    for (int current = 1; current < lig_rigid_gpu->num_children + 1;
+         current++) { /* current starts from 1 (namely starts from first child
+                         node) */
+        int parent = lig_rigid_gpu->parent[current];
+        float torsion = x->lig_torsion[current - 1]; /* torsions are all related to child nodes */
+        local_to_lab(lig_rigid_gpu->origin[current], lig_rigid_gpu->origin[parent],
+                     lig_rigid_gpu->relative_origin[current], lig_rigid_gpu->orientation_m[parent]);
+        local_to_lab_direction(lig_rigid_gpu->axis[current], lig_rigid_gpu->relative_axis[current],
+                               lig_rigid_gpu->orientation_m[parent]);
+        float tmp[4];
+        float parent_q[4]
+            = {lig_rigid_gpu->orientation_q[parent][0], lig_rigid_gpu->orientation_q[parent][1],
+               lig_rigid_gpu->orientation_q[parent][2], lig_rigid_gpu->orientation_q[parent][3]};
+        float current_axis[3] = {lig_rigid_gpu->axis[current][0], lig_rigid_gpu->axis[current][1],
+                                 lig_rigid_gpu->axis[current][2]};
+
+        angle_to_quaternion2(tmp, current_axis, torsion);
+        angle_to_quaternion_multi(tmp, parent_q);
+        quaternion_normalize_approx(tmp, epsilon_fl);
+
+        for (int i = 0; i < 4; i++)
+            lig_rigid_gpu->orientation_q[current][i] = tmp[i]; /* set orientation_q */
+        quaternion_to_r3(lig_rigid_gpu->orientation_q[current],
+                         lig_rigid_gpu->orientation_m[current]); /* set orientation_m */
+
+        /* set coords */
+        begin = lig_rigid_gpu->atom_range[current][0];
+        end = lig_rigid_gpu->atom_range[current][1];
+        for (int i = begin; i < end; i++) {
+            local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[current], atoms[i].coords,
+                         lig_rigid_gpu->orientation_m[current]);
+        }
+    }
+    /* ************* end branches_set_conf ************* */
+}
+
+__device__ __forceinline__ void p_eval_deriv(float *out, int type_pair_index, float r2,
+                                             p_cuda_t *p_cuda_gpu, const float epsilon_fl) {
+    const float cutoff_sqr = p_cuda_gpu->m_cutoff_sqr;
+    if (r2 > cutoff_sqr)
+        DEBUG_PRINTF(
+            "\nkernel2: p_eval_deriv() ERROR!, r2 > Cutoff_sqr, r2=%f, "
+            "cutoff_sqr=%f",
+            r2, cutoff_sqr);
+
+    p_m_data_cuda_t *tmp = &p_cuda_gpu->m_data[type_pair_index];
+    float r2_factored = tmp->factor * r2;
+    int i1 = (int)(r2_factored);
+    int i2 = i1 + 1;
+    float rem = r2_factored - i1;
+    if (rem < -epsilon_fl) DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!");
+    if (rem >= 1 + epsilon_fl) DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!");
+    float p1[2] = {tmp->smooth[i1][0], tmp->smooth[i1][1]};
+    if (i1 >= SMOOTH_SIZE) p1[0] = p1[1] = 0;
+    float p2[2] = {tmp->smooth[i2][0], tmp->smooth[i2][1]};
+    if (i2 >= SMOOTH_SIZE) p2[0] = p2[1] = 0;
+    float e = p1[0] + rem * (p2[0] - p1[0]);
+    float dor = p1[1] + rem * (p2[1] - p1[1]);
+    out[0] = e;
+    out[1] = dor;
+}
+
+__device__ __forceinline__ void curl(float *e, float *deriv, float v, const float epsilon_fl) {
+    if (*e > 0 && not_max_gpu(v)) {
+        float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e));
+        (*e) = tmp * (*e);
+        for (int i = 0; i < 3; i++) deriv[i] = deriv[i] * (tmp * tmp);
+    }
+}
+
+__device__ __forceinline__ float eval_interacting_pairs_deriv(p_cuda_t *p_cuda_gpu, const float v,
+                                                              const lig_pairs_cuda_t *pairs,
+                                                              const m_coords_cuda_t *m_coords,
+                                                              m_minus_forces_t *minus_forces,
+                                                              const float epsilon_fl) {
+    float e = 0;
+    for (int i = 0; i < pairs->num_pairs; i++) {
+        const int ip[3] = {pairs->type_pair_index[i], pairs->a[i], pairs->b[i]};
+        int index = pairs->a[i] + pairs->b[i] * (pairs->b[i] + 1) / 2;
+        float coords_b[3]
+            = {m_coords->coords[ip[2]][0], m_coords->coords[ip[2]][1], m_coords->coords[ip[2]][2]};
+        float coords_a[3]
+            = {m_coords->coords[ip[1]][0], m_coords->coords[ip[1]][1], m_coords->coords[ip[1]][2]};
+        float r[3]
+            = {coords_b[0] - coords_a[0], coords_b[1] - coords_a[1], coords_b[2] - coords_a[2]};
+        float r2 = r[0] * r[0] + r[1] * r[1] + r[2] * r[2];
+
+        if (r2 < p_cuda_gpu->m_cutoff_sqr) {
+            float tmp[2];
+            p_eval_deriv(tmp, index, r2, p_cuda_gpu, epsilon_fl);
+            float force[3] = {r[0] * tmp[1], r[1] * tmp[1], r[2] * tmp[1]};
+            curl(&tmp[0], force, v, epsilon_fl);
+            e += tmp[0];
+            for (int j = 0; j < 3; j++) minus_forces->coords[ip[1]][j] -= force[j];
+            for (int j = 0; j < 3; j++) minus_forces->coords[ip[2]][j] += force[j];
+        }
+    }
+    return e;
+}
+
+template <typename T1, typename T2, typename T3>
+__device__ __forceinline__ void product(T1 *res, const T2 *a, const T3 *b) {
+    res[0] = a[1] * b[2] - a[2] * b[1];
+    res[1] = a[2] * b[0] - a[0] * b[2];
+    res[2] = a[0] * b[1] - a[1] * b[0];
+}
+
+__device__ __forceinline__ float find_change_index_read(const change_cuda_t *g, int index) {
+    if (index < 3) return g->position[index];
+    index -= 3;
+    if (index < 3) return g->orientation[index];
+    index -= 3;
+    if (index < g->lig_torsion_size) return g->lig_torsion[index];
+    DEBUG_PRINTF("\nKernel2:find_change_index_read() ERROR!"); /* Shouldn't be here */
+}
+
+__device__ __forceinline__ void find_change_index_write(change_cuda_t *g, int index, float data) {
+    if (index < 3) {
+        g->position[index] = data;
+        return;
+    }
+    index -= 3;
+    if (index < 3) {
+        g->orientation[index] = data;
+        return;
+    }
+    index -= 3;
+    if (index < g->lig_torsion_size) {
+        g->lig_torsion[index] = data;
+        return;
+    }
+    DEBUG_PRINTF("\nKernel2:find_change_index_write() ERROR!"); /* Shouldn't be here */
+}
+
+__device__ __forceinline__ void minus_mat_vec_product(const matrix_d *h, const change_cuda_t *in,
+                                                      change_cuda_t *out) {
+    int n = h->dim;
+    for (int i = 0; i < n; i++) {
+        float sum = 0;
+        for (int j = 0; j < n; j++) {
+            sum += h->data[index_permissive(h, i, j)] * find_change_index_read(in, j);
+        }
+        find_change_index_write(out, i, -sum);
+    }
+}
+
+__device__ __forceinline__ float scalar_product(const change_cuda_t *a, const change_cuda_t *b,
+                                                int n) {
+    float tmp = 0;
+    for (int i = 0; i < n; i++) {
+        tmp += find_change_index_read(a, i) * find_change_index_read(b, i);
+    }
+    return tmp;
+}
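bfgs_update below applies the standard rank-two BFGS update to the packed inverse-Hessian approximation: H += alpha*r*(minus_hy*p^T + p*minus_hy^T) + alpha^2*(r^2*yhy + r)*p*p^T, with r = 1/(alpha*y^T p), minus_hy = -H*y and yhy = y^T H y. A minimal host-side sketch of the same update on a dense 2x2 case (not part of the patch):

```cpp
// Sketch: the rank-two BFGS inverse-Hessian update used by bfgs_update().
#include <cstdio>

int main() {
    const int n = 2;
    float H[2][2] = {{1, 0}, {0, 1}};  // identity start, as in bfgs()
    float p[2] = {1.0f, 0.5f}, y[2] = {0.8f, 0.4f};
    float alpha = 1.0f;
    float yp = y[0] * p[0] + y[1] * p[1];  // y . p (curvature condition checks alpha*yp > eps)
    float hy[2] = {H[0][0] * y[0] + H[0][1] * y[1], H[1][0] * y[0] + H[1][1] * y[1]};
    float yhy = y[0] * hy[0] + y[1] * hy[1];
    float r = 1.0f / (alpha * yp);
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            H[i][j] += alpha * r * (-hy[i] * p[j] - hy[j] * p[i])  // minus_hy = -H*y
                       + alpha * alpha * (r * r * yhy + r) * p[i] * p[j];
    std::printf("H = [[%f, %f], [%f, %f]]\n", H[0][0], H[0][1], H[1][0], H[1][1]);
}
```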
+
+__device__ __forceinline__ bool bfgs_update(matrix_d *h, const change_cuda_t *p,
+                                            const change_cuda_t *y, const float alpha,
+                                            const float epsilon_fl) {
+    const float yp = scalar_product(y, p, h->dim);
+
+    if (alpha * yp < epsilon_fl) return false;
+    change_cuda_t minus_hy;
+    change_cuda_init_with_change(&minus_hy, y);
+    minus_mat_vec_product(h, y, &minus_hy);
+    const float yhy = -scalar_product(y, &minus_hy, h->dim);
+    const float r = 1 / (alpha * yp);
+    const int n = 6 + p->lig_torsion_size;
+
+    for (int i = 0; i < n; i++) {
+        for (int j = i; j < n; j++) {
+            float tmp
+                = alpha * r
+                      * (find_change_index_read(&minus_hy, i) * find_change_index_read(p, j)
+                         + find_change_index_read(&minus_hy, j) * find_change_index_read(p, i))
+                  + alpha * alpha * (r * r * yhy + r) * find_change_index_read(p, i)
+                        * find_change_index_read(p, j);
+
+            h->data[i + j * (j + 1) / 2] += tmp;
+        }
+    }
+
+    return true;
+}
diff --git a/unidock/src/cuda/kernel.h b/unidock/src/cuda/kernel.h
index 9abbbbe..bbee8ad 100644
--- a/unidock/src/cuda/kernel.h
+++ b/unidock/src/cuda/kernel.h
@@ -1,5 +1,6 @@
 #pragma once
+#include
 #include

 template <typename T>
 void check(T result, char const *const func, const char *const file, int const line) {
@@ -43,20 +44,22 @@
 #define MAX_NUM_OF_GRID_MK 128  // 81
 #define MAX_NUM_OF_GRID_POINT 512000
-//#define GRID_MI 65//55
-//#define GRID_MJ 71//55
-//#define GRID_MK 61//81
+// #define GRID_MI 65//55
+// #define GRID_MJ 71//55
+// #define GRID_MK 61//81
 #define MAX_PRECAL_NUM_ATOM 30
 #define MAX_P_DATA_M_DATA_SIZE \
-    45150  // modified for vina1.2, should be larger, n*(n+1)/2, n=num_of_atom, select n=140
-//#define MAX_NUM_OF_GRID_ATOMS 150
+    45150  // modified for vina1.2, should be larger, n*(n+1)/2, n=num_of_atom,
+           // select n=140
+// #define MAX_NUM_OF_GRID_ATOMS 150
 #define FAST_SIZE 2051  // modified for vina1.2 m_max_cutoff^2 * factor + 3, ad4=13424
 #define SMOOTH_SIZE 2051
 #define MAX_CONTAINER_SIZE_EVERY_WI 5
 #define MAX_THREAD 41700000  // modified for vina1.2, to calculate random map memory upper bound
 #define MAX_LIGAND_NUM \
-    10250  // modified for vina1.2, to calculate precalculate_byatom memory upper bound
+    10250  // modified for vina1.2, to calculate precalculate_byatom memory upper
+           // bound
@@ -107,11 +110,12 @@ typedef struct {
                                                  // depth-first order
     float relative_axis[MAX_NUM_OF_RIGID][3];    // 1st column is root node, all 0s
     float relative_origin[MAX_NUM_OF_RIGID][3];  // 1st column is root node, all 0s
-    int parent[MAX_NUM_OF_RIGID];  // every node has only 1 parent node
-    bool children_map[MAX_NUM_OF_RIGID]
-                     [MAX_NUM_OF_RIGID];  // chidren_map[i][j] = true if node i's child is node j
+    int parent[MAX_NUM_OF_RIGID];  // every node has only 1 parent node
+    bool children_map[MAX_NUM_OF_RIGID][MAX_NUM_OF_RIGID];    // children_map[i][j] = true if node
+                                                              // i's child is node j
+    bool descendant_map[MAX_NUM_OF_RIGID][MAX_NUM_OF_RIGID];  // descendant_map[i][j] = true if
+                                                              // node i is an ancestor of node j
     int num_children;
-
 } rigid_cuda_t;
@@ -160,6 +164,12 @@ typedef struct {
     grid_cuda_t grids[GRIDS_SIZE];
 } ig_cuda_t;

+typedef struct {
+    float ptmp[MAX_NUM_OF_RIGID][3];
+    float p[MAX_NUM_OF_RIGID][3];
+    float o[MAX_NUM_OF_RIGID][3];
+} pot_cuda_t;
+
 typedef struct {
     float fast[FAST_SIZE];
     float smooth[SMOOTH_SIZE][2];
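The new descendant_map field presumably supports the warp-level tree traversal (pot_cuda_t looks like per-node scratch for position/orientation/torque accumulation). This hunk does not show how descendant_map is initialized; a hypothetical host-side sketch of one way it could be derived from parent[], not taken from this patch:

```cpp
// Sketch (hypothetical): fill descendant_map[i][j] == true iff node i is an
// ancestor of node j, by walking each node's parent chain up to the root.
#include <cstring>

void fill_descendant_map(const int *parent, bool (*descendant_map)[MAX_NUM_OF_RIGID],
                         int num_rigid) {
    std::memset(descendant_map, 0, sizeof(bool) * MAX_NUM_OF_RIGID * MAX_NUM_OF_RIGID);
    for (int j = 1; j < num_rigid; j++) {            // node 0 is the root
        for (int i = parent[j];; i = parent[i]) {    // climb towards the root
            descendant_map[i][j] = true;
            if (i == 0) break;
        }
    }
}
```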
#include "coords.h" -#include "mutate.h" -#include "quasi_newton.h" #include "model.h" +#include "monte_carlo.h" +#include "mutate.h" #include "precalculate.h" -#include "cache.h" -#include "ad4cache.h" - -/* Below based on mutate_conf.cpp */ - -__device__ __forceinline__ void quaternion_increment(float* q, const float* rotation, - float epsilon_fl); - -__device__ __forceinline__ void normalize_angle(float* x); - -__device__ __forceinline__ void output_type_cuda_init(output_type_cuda_t* out, const float* ptr) { - memcpy(out, ptr, sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION)); - out->lig_torsion_size = ptr[3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION]; - // did not assign coords and e -} - -__device__ __forceinline__ void output_type_cuda_init_with_output( - output_type_cuda_t* out_new, const output_type_cuda_t* out_old) { - memcpy(out_new, out_old, - sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION)); - out_new->lig_torsion_size = out_old->lig_torsion_size; - // assign e but not coords - out_new->e = out_old->e; -} - -__device__ __forceinline__ void output_type_cuda_increment(output_type_cuda_t* x, - const change_cuda_t* c, float factor, - float epsilon_fl) { - // position increment - for (int k = 0; k < 3; k++) x->position[k] += factor * c->position[k]; - // orientation increment - float rotation[3]; - for (int k = 0; k < 3; k++) rotation[k] = factor * c->orientation[k]; - quaternion_increment(x->orientation, rotation, epsilon_fl); - - // torsion increment - for (int k = 0; k < x->lig_torsion_size; k++) { - float tmp = factor * c->lig_torsion[k]; - normalize_angle(&tmp); - x->lig_torsion[k] += tmp; - normalize_angle(&(x->lig_torsion[k])); - } -} - -__device__ __forceinline__ float norm3(const float* a) { - return sqrt(pow(a[0], 2) + pow(a[1], 2) + pow(a[2], 2)); -} - -__device__ __forceinline__ void random_inside_sphere_gpu(float* random_inside_sphere, - curandStatePhilox4_32_10_t* state) { - float4 random_inside_sphere_fl; - while (true) { // on average, this will have to be run about twice - random_inside_sphere_fl = curand_uniform4(state); // ~ U[0,1] - random_inside_sphere[0] = (random_inside_sphere_fl.x - 0.5) * 2.0; - random_inside_sphere[1] = (random_inside_sphere_fl.y - 0.5) * 2.0; - random_inside_sphere[2] = (random_inside_sphere_fl.z - 0.5) * 2.0; - random_inside_sphere[3] = random_inside_sphere_fl.w; - float r = norm3(random_inside_sphere); - if (r < 1) { - return; - } - } -} - -__device__ __forceinline__ void normalize_angle(float* x) { - while (1) { - if (*x >= -(M_PI) && *x <= (M_PI)) { - break; - } else if (*x > 3 * (M_PI)) { - float n = (*x - (M_PI)) / (2 * (M_PI)); - *x -= 2 * (M_PI)*ceil(n); - } else if (*x < 3 * -(M_PI)) { - float n = (-*x - (M_PI)) / (2 * (M_PI)); - *x += 2 * (M_PI)*ceil(n); - } else if (*x > (M_PI)) { - *x -= 2 * (M_PI); - } else if (*x < -(M_PI)) { - *x += 2 * (M_PI); - } else { - break; - } - } -} - -__device__ __forceinline__ bool quaternion_is_normalized(float* q) { - float q_pow = pow(q[0], 2) + pow(q[1], 2) + pow(q[2], 2) + pow(q[3], 2); - float sqrt_q_pow = sqrt(q_pow); - return (q_pow - 1 < 0.001) && (sqrt_q_pow - 1 < 0.001); -} - -__device__ __forceinline__ void angle_to_quaternion(float* q, const float* rotation, - float epsilon_fl) { - float angle = norm3(rotation); - if (angle > epsilon_fl) { - float axis[3] = {rotation[0] / angle, rotation[1] / angle, rotation[2] / angle}; - normalize_angle(&angle); - float c = cos(angle / 2); - float s = sin(angle / 2); - q[0] = c; - q[1] 
= s * axis[0]; - q[2] = s * axis[1]; - q[3] = s * axis[2]; - return; - } - q[0] = 1; - q[1] = 0; - q[2] = 0; - q[3] = 0; - return; -} - -// quaternion multiplication -__device__ __forceinline__ void angle_to_quaternion_multi(float* qa, const float* qb) { - float tmp[4] = {qa[0], qa[1], qa[2], qa[3]}; - qa[0] = tmp[0] * qb[0] - tmp[1] * qb[1] - tmp[2] * qb[2] - tmp[3] * qb[3]; - qa[1] = tmp[0] * qb[1] + tmp[1] * qb[0] + tmp[2] * qb[3] - tmp[3] * qb[2]; - qa[2] = tmp[0] * qb[2] - tmp[1] * qb[3] + tmp[2] * qb[0] + tmp[3] * qb[1]; - qa[3] = tmp[0] * qb[3] + tmp[1] * qb[2] - tmp[2] * qb[1] + tmp[3] * qb[0]; -} - -__device__ __forceinline__ void quaternion_normalize_approx(float* q, float epsilon_fl) { - const float s = pow(q[0], 2) + pow(q[1], 2) + pow(q[2], 2) + pow(q[3], 2); - // Omit one assert() - if (fabs(s - 1) < TOLERANCE) - ; - else { - const float a = sqrt(s); - for (int i = 0; i < 4; i++) q[i] /= a; - } -} - -__device__ __forceinline__ void quaternion_increment(float* q, const float* rotation, - float epsilon_fl) { - float q_old[4] = {q[0], q[1], q[2], q[3]}; - angle_to_quaternion(q, rotation, epsilon_fl); - angle_to_quaternion_multi(q, q_old); - quaternion_normalize_approx(q, epsilon_fl); - // assert(quaternion_is_normalized(q)); // unnecessary -} - -__device__ __forceinline__ float vec_distance_sqr(float* a, float* b) { - return pow(a[0] - b[0], 2) + pow(a[1] - b[1], 2) + pow(a[2] - b[2], 2); -} - -__device__ __forceinline__ float gyration_radius(int m_lig_begin, int m_lig_end, - const atom_cuda_t* atoms, - const m_coords_cuda_t* m_coords_gpu, - const float* m_lig_node_origin) { - float acc = 0; - int counter = 0; - float origin[3] = {m_lig_node_origin[0], m_lig_node_origin[1], m_lig_node_origin[2]}; - for (int i = m_lig_begin; i < m_lig_end; i++) { - float current_coords[3] - = {m_coords_gpu->coords[i][0], m_coords_gpu->coords[i][1], m_coords_gpu->coords[i][2]}; - if (atoms[i].types[0] - != EL_TYPE_H) { // for el, we use the first element (atoms[i].types[0]) - acc += vec_distance_sqr(current_coords, origin); - ++counter; - } - } - return (counter > 0) ? sqrt(acc / counter) : 0; -} - -__device__ __forceinline__ void mutate_conf_cuda(const int num_steps, output_type_cuda_t* c, - curandStatePhilox4_32_10_t* state, - const int m_lig_begin, const int m_lig_end, - const atom_cuda_t* atoms, - const m_coords_cuda_t* m_coords_gpu, - const float* m_lig_node_origin_gpu, - const float epsilon_fl, const float amplitude) { - int flex_torsion_size = 0; // FIX? 
20210727 - int count_mutable_entities = 2 + c->lig_torsion_size + flex_torsion_size; - int which = curand(state) % count_mutable_entities; - - float random_inside_sphere[4]; - random_inside_sphere_gpu(random_inside_sphere, state); - if (which == 0) { - DEBUG_PRINTF("random sphere r=%f\n", norm3(random_inside_sphere)); - } - - float random_pi = (random_inside_sphere[3] - 0.5) * 2.0 * pi; // ~ U[-pi, pi] - if (which == 0) { - DEBUG_PRINTF("random pi=%f\n", random_pi); - } - - if (which == 0) { - for (int i = 0; i < 3; i++) c->position[i] += amplitude * random_inside_sphere[i]; - return; - } - --which; - if (which == 0) { - float gr - = gyration_radius(m_lig_begin, m_lig_end, atoms, m_coords_gpu, m_lig_node_origin_gpu); - if (gr > epsilon_fl) { - float rotation[3]; - for (int i = 0; i < 3; i++) rotation[i] = amplitude / gr * random_inside_sphere[i]; - quaternion_increment(c->orientation, rotation, epsilon_fl); - } - return; - } - --which; - if (which < c->lig_torsion_size) { - c->lig_torsion[which] = random_pi; - return; - } - which -= c->lig_torsion_size; - - if (flex_torsion_size != 0) { - if (which < flex_torsion_size) { - c->flex_torsion[which] = random_pi; - return; - } - which -= flex_torsion_size; - } -} - -/* Above based on mutate_conf.cpp */ - -/* Below based on matrix.cpp */ - -// symmetric matrix_d (only half of it are stored) -typedef struct { - float data[MAX_HESSIAN_MATRIX_D_SIZE]; - int dim; -} matrix_d; - -__device__ __forceinline__ void matrix_d_init(matrix_d* m, int dim, float fill_data) { - m->dim = dim; - if ((dim * (dim + 1) / 2) > MAX_HESSIAN_MATRIX_D_SIZE) - DEBUG_PRINTF("\nnmatrix_d: matrix_d_init() ERROR!"); - // ((dim * (dim + 1) / 2)*sizeof(float)); // symmetric matrix_d - for (int i = 0; i < (dim * (dim + 1) / 2); i++) m->data[i] = fill_data; - for (int i = (dim * (dim + 1) / 2); i < MAX_HESSIAN_MATRIX_D_SIZE; i++) - m->data[i] = 0; // Others will be 0 -} - -// as rugular 3x3 matrix_d -__device__ __forceinline__ void mat_init(matrix_d* m, float fill_data) { - m->dim = 3; // fixed to 3x3 matrix_d - if (9 > MAX_HESSIAN_MATRIX_D_SIZE) DEBUG_PRINTF("\nnmatrix_d: mat_init() ERROR!"); - for (int i = 0; i < 9; i++) m->data[i] = fill_data; -} - -__device__ __forceinline__ void matrix_d_set_diagonal(matrix_d* m, float fill_data) { - for (int i = 0; i < m->dim; i++) { - m->data[i + i * (i + 1) / 2] = fill_data; - } -} - -// as regular matrix_d -__device__ __forceinline__ void matrix_d_set_element(matrix_d* m, int dim, int x, int y, - float fill_data) { - m->data[x + y * dim] = fill_data; -} - -__device__ __forceinline__ void matrix_d_set_element_tri(matrix_d* m, int x, int y, - float fill_data) { - m->data[x + y * (y + 1) / 2] = fill_data; -} -__device__ __forceinline__ int tri_index(int n, int i, int j) { - if (j >= n || i > j) DEBUG_PRINTF("\nmatrix_d: tri_index ERROR!"); - return i + j * (j + 1) / 2; -} - -__device__ __forceinline__ int index_permissive(const matrix_d* m, int i, int j) { - return (i < j) ? 
tri_index(m->dim, i, j) : tri_index(m->dim, j, i); -} - -/* Above based on matrix_d.cpp */ - -/* Below based on quasi_newton.cpp */ - -__device__ __forceinline__ void change_cuda_init(change_cuda_t* g, const float* ptr) { - for (int i = 0; i < 3; i++) g->position[i] = ptr[i]; - for (int i = 0; i < 3; i++) g->orientation[i] = ptr[i + 3]; - for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) g->lig_torsion[i] = ptr[i + 3 + 3]; - for (int i = 0; i < MAX_NUM_OF_FLEX_TORSION; i++) - g->flex_torsion[i] = ptr[i + 3 + 3 + MAX_NUM_OF_LIG_TORSION]; - g->lig_torsion_size = ptr[3 + 3 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION]; -} - -__device__ __forceinline__ void change_cuda_init_with_change(change_cuda_t* g_new, - const change_cuda_t* g_old) { - for (int i = 0; i < 3; i++) g_new->position[i] = g_old->position[i]; - for (int i = 0; i < 3; i++) g_new->orientation[i] = g_old->orientation[i]; - for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) g_new->lig_torsion[i] = g_old->lig_torsion[i]; - for (int i = 0; i < MAX_NUM_OF_FLEX_TORSION; i++) - g_new->flex_torsion[i] = g_old->flex_torsion[i]; - g_new->lig_torsion_size = g_old->lig_torsion_size; -} - -void print_output_type(output_type_cuda_t* x, int torsion_size) { - for (int i = 0; i < 3; i++) DEBUG_PRINTF("\nx.position[%d] = %0.16f", i, x->position[i]); - for (int i = 0; i < 4; i++) DEBUG_PRINTF("\nx.orientation[%d] = %0.16f", i, x->orientation[i]); - for (int i = 0; i < torsion_size; i++) - DEBUG_PRINTF("\n x.torsion[%d] = %0.16f", i, x->lig_torsion[i]); - DEBUG_PRINTF("\n x.torsion_size = %f", x->lig_torsion_size); - DEBUG_PRINTF("\n !!! x.e = %f\n", x->e); -} - -void print_change(change_cuda_t* g, int torsion_size) { - for (int i = 0; i < 3; i++) DEBUG_PRINTF("\ng.position[%d] = %0.16f", i, g->position[i]); - for (int i = 0; i < 3; i++) DEBUG_PRINTF("\ng.orientation[%d] = %0.16f", i, g->orientation[i]); - for (int i = 0; i < torsion_size; i++) - DEBUG_PRINTF("\ng.torsion[%d] = %0.16f", i, g->lig_torsion[i]); - DEBUG_PRINTF("\ng.torsion_size = %f", g->lig_torsion_size); -} - -__device__ __forceinline__ int num_atom_types(int atu) { - switch (atu) { - case 0: - return EL_TYPE_SIZE; - case 1: - return AD_TYPE_SIZE; - case 2: - return XS_TYPE_SIZE; - case 3: - return SY_TYPE_SIZE; - default: - DEBUG_PRINTF("Kernel1:num_atom_types() ERROR!"); - return INFINITY; - } -} - -__device__ __forceinline__ void elementwise_product(float* out, const float* a, const float* b) { - out[0] = a[0] * b[0]; - out[1] = a[1] * b[1]; - out[2] = a[2] * b[2]; -} - -__device__ __forceinline__ float elementwise_product_sum(const float* a, const float* b) { - return a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; -} - -__device__ __forceinline__ float access_m_data(float* m_data, int m_i, int m_j, int i, int j, - int k) { - return m_data[i + m_i * (j + m_j * k)]; -} - -__device__ __forceinline__ bool not_max_gpu(float x) { - return (x < 0.1 * INFINITY); /* Problem: replace max_fl with INFINITY? */ -} - -__device__ __forceinline__ void curl_with_deriv(float* e, float* deriv, float v, - const float epsilon_fl) { - if (*e > 0 && not_max_gpu(v)) { - float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e)); - *e *= tmp; - for (int i = 0; i < 3; i++) deriv[i] *= pow(tmp, 2); - } -} - -__device__ __forceinline__ void curl_without_deriv(float* e, float v, const float epsilon_fl) { - if (*e > 0 && not_max_gpu(v)) { - float tmp = (v < epsilon_fl) ? 
0 : (v / (v + *e)); - *e *= tmp; - } -} - -__device__ __forceinline__ float g_evaluate(grid_cuda_t* g, const float* m_coords, /* double[3] */ - const float slope, /* double */ - const float v, /* double */ - float* deriv, /* double[3] */ - const float epsilon_fl) { - int m_i = g->m_i; - int m_j = g->m_j; - int m_k = g->m_k; - if (m_i * m_j * m_k == 0) DEBUG_PRINTF("\nkernel2: g_evaluate ERROR!#1"); - float tmp_vec[3] - = {m_coords[0] - g->m_init[0], m_coords[1] - g->m_init[1], m_coords[2] - g->m_init[2]}; - float tmp_vec2[3] = {g->m_factor[0], g->m_factor[1], g->m_factor[2]}; - float s[3]; - elementwise_product(s, tmp_vec, tmp_vec2); - - float miss[3] = {0, 0, 0}; - int region[3]; - int a[3]; - int m_data_dims[3] = {m_i, m_j, m_k}; - for (int i = 0; i < 3; i++) { - if (s[i] < 0) { - miss[i] = -s[i]; - region[i] = -1; - a[i] = 0; - s[i] = 0; - } else if (s[i] >= g->m_dim_fl_minus_1[i]) { - miss[i] = s[i] - g->m_dim_fl_minus_1[i]; - region[i] = 1; - if (m_data_dims[i] < 2) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#2"); - a[i] = m_data_dims[i] - 2; - s[i] = 1; - } else { - region[i] = 0; - a[i] = (int)s[i]; - s[i] -= a[i]; - } - if (s[i] < 0) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#3"); - if (s[i] > 1) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#4"); - if (a[i] < 0) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#5"); - if (a[i] + 1 >= m_data_dims[i]) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#5"); - } - - float tmp_m_factor_inv[3] = {g->m_factor_inv[0], g->m_factor_inv[1], g->m_factor_inv[2]}; - const float penalty = slope * elementwise_product_sum(miss, tmp_m_factor_inv); - if (penalty <= -epsilon_fl) DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#6"); - - const int x0 = a[0]; - const int y0 = a[1]; - const int z0 = a[2]; - - const int x1 = x0 + 1; - const int y1 = y0 + 1; - const int z1 = z0 + 1; - - const float f000 = access_m_data(g->m_data, m_i, m_j, x0, y0, z0); - const float f100 = access_m_data(g->m_data, m_i, m_j, x1, y0, z0); - const float f010 = access_m_data(g->m_data, m_i, m_j, x0, y1, z0); - const float f110 = access_m_data(g->m_data, m_i, m_j, x1, y1, z0); - const float f001 = access_m_data(g->m_data, m_i, m_j, x0, y0, z1); - const float f101 = access_m_data(g->m_data, m_i, m_j, x1, y0, z1); - const float f011 = access_m_data(g->m_data, m_i, m_j, x0, y1, z1); - const float f111 = access_m_data(g->m_data, m_i, m_j, x1, y1, z1); - - const float x = s[0]; - const float y = s[1]; - const float z = s[2]; - - const float mx = 1 - x; - const float my = 1 - y; - const float mz = 1 - z; - - float f = f000 * mx * my * mz + f100 * x * my * mz + f010 * mx * y * mz + f110 * x * y * mz - + f001 * mx * my * z + f101 * x * my * z + f011 * mx * y * z + f111 * x * y * z; - - if (deriv) { - const float x_g = f000 * (-1) * my * mz + f100 * 1 * my * mz + f010 * (-1) * y * mz - + f110 * 1 * y * mz + f001 * (-1) * my * z + f101 * 1 * my * z - + f011 * (-1) * y * z + f111 * 1 * y * z; - - const float y_g = f000 * mx * (-1) * mz + f100 * x * (-1) * mz + f010 * mx * 1 * mz - + f110 * x * 1 * mz + f001 * mx * (-1) * z + f101 * x * (-1) * z - + f011 * mx * 1 * z + f111 * x * 1 * z; - - const float z_g = f000 * mx * my * (-1) + f100 * x * my * (-1) + f010 * mx * y * (-1) - + f110 * x * y * (-1) + f001 * mx * my * 1 + f101 * x * my * 1 - + f011 * mx * y * 1 + f111 * x * y * 1; - - float gradient[3] = {x_g, y_g, z_g}; - - curl_with_deriv(&f, gradient, v, epsilon_fl); - - float gradient_everywhere[3]; - - for (int i = 0; i < 3; i++) { - gradient_everywhere[i] = ((region[i] == 0) ? 
gradient[i] : 0); - deriv[i] = g->m_factor[i] * gradient_everywhere[i] + slope * region[i]; - } - return f + penalty; - } else { /* none valid pointer */ - DEBUG_PRINTF("\nKernel2: g_evaluate ERROR!#7"); - curl_without_deriv(&f, v, epsilon_fl); - return f + penalty; - } -} - -__device__ __forceinline__ float ig_eval_deriv(output_type_cuda_t* x, change_cuda_t* g, - const float v, ig_cuda_t* ig_cuda_gpu, - m_cuda_t* m_cuda_gpu, const float epsilon_fl) { - float e = 0; - int nat = num_atom_types(ig_cuda_gpu->atu); - for (int i = 0; i < m_cuda_gpu->m_num_movable_atoms; i++) { - int t = m_cuda_gpu->atoms[i].types[ig_cuda_gpu->atu]; - if (t >= nat) { - for (int j = 0; j < 3; j++) m_cuda_gpu->minus_forces.coords[i][j] = 0; - continue; - } - float deriv[3]; - - e = e - + g_evaluate(&ig_cuda_gpu->grids[t], m_cuda_gpu->m_coords.coords[i], ig_cuda_gpu->slope, - v, deriv, epsilon_fl); - - for (int j = 0; j < 3; j++) m_cuda_gpu->minus_forces.coords[i][j] = deriv[j]; - } - return e; -} - -__device__ __forceinline__ void quaternion_to_r3(const float* q, float* orientation_m) { - /* Omit assert(quaternion_is_normalized(q)); */ - const float a = q[0]; - const float b = q[1]; - const float c = q[2]; - const float d = q[3]; - - const float aa = a * a; - const float ab = a * b; - const float ac = a * c; - const float ad = a * d; - const float bb = b * b; - const float bc = b * c; - const float bd = b * d; - const float cc = c * c; - const float cd = c * d; - const float dd = d * d; - - /* Omit assert(eq(aa + bb + cc + dd, 1)); */ - matrix_d tmp; - mat_init(&tmp, 0); /* matrix_d with fixed dimension 3(here we treate this as a regular - matrix_d(not triangular matrix_d!)) */ - - matrix_d_set_element(&tmp, 3, 0, 0, (aa + bb - cc - dd)); - matrix_d_set_element(&tmp, 3, 0, 1, 2 * (-ad + bc)); - matrix_d_set_element(&tmp, 3, 0, 2, 2 * (ac + bd)); - - matrix_d_set_element(&tmp, 3, 1, 0, 2 * (ad + bc)); - matrix_d_set_element(&tmp, 3, 1, 1, (aa - bb + cc - dd)); - matrix_d_set_element(&tmp, 3, 1, 2, 2 * (-ab + cd)); - - matrix_d_set_element(&tmp, 3, 2, 0, 2 * (-ac + bd)); - matrix_d_set_element(&tmp, 3, 2, 1, 2 * (ab + cd)); - matrix_d_set_element(&tmp, 3, 2, 2, (aa - bb - cc + dd)); - - for (int i = 0; i < 9; i++) orientation_m[i] = tmp.data[i]; -} - -__device__ __forceinline__ void local_to_lab_direction(float* out, const float* local_direction, - const float* orientation_m) { - out[0] = orientation_m[0] * local_direction[0] + orientation_m[3] * local_direction[1] - + orientation_m[6] * local_direction[2]; - out[1] = orientation_m[1] * local_direction[0] + orientation_m[4] * local_direction[1] - + orientation_m[7] * local_direction[2]; - out[2] = orientation_m[2] * local_direction[0] + orientation_m[5] * local_direction[1] - + orientation_m[8] * local_direction[2]; -} - -__device__ __forceinline__ void local_to_lab(float* out, const float* origin, - const float* local_coords, - const float* orientation_m) { - out[0] = origin[0] - + (orientation_m[0] * local_coords[0] + orientation_m[3] * local_coords[1] - + orientation_m[6] * local_coords[2]); - out[1] = origin[1] - + (orientation_m[1] * local_coords[0] + orientation_m[4] * local_coords[1] - + orientation_m[7] * local_coords[2]); - out[2] = origin[2] - + (orientation_m[2] * local_coords[0] + orientation_m[5] * local_coords[1] - + orientation_m[8] * local_coords[2]); -} - -__device__ __forceinline__ void angle_to_quaternion2(float* out, const float* axis, float angle) { - normalize_angle(&angle); - float c = cos(angle / 2); - float s = sin(angle / 2); - out[0] = c; 
- out[1] = s * axis[0]; - out[2] = s * axis[1]; - out[3] = s * axis[2]; -} - -__device__ __forceinline__ void set(const output_type_cuda_t* x, rigid_cuda_t* lig_rigid_gpu, - m_coords_cuda_t* m_coords_gpu, const atom_cuda_t* atoms, - const int m_num_movable_atoms, const float epsilon_fl) { - for (int i = 0; i < 3; i++) lig_rigid_gpu->origin[0][i] = x->position[i]; - for (int i = 0; i < 4; i++) lig_rigid_gpu->orientation_q[0][i] = x->orientation[i]; - quaternion_to_r3(lig_rigid_gpu->orientation_q[0], - lig_rigid_gpu->orientation_m[0]); /* set orientation_m */ - - int begin = lig_rigid_gpu->atom_range[0][0]; - int end = lig_rigid_gpu->atom_range[0][1]; - for (int i = begin; i < end; i++) { - local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[0], atoms[i].coords, - lig_rigid_gpu->orientation_m[0]); - } - /* ************* end node.set_conf ************* */ - - /* ************* branches_set_conf ************* */ - /* update nodes in depth-first order */ - for (int current = 1; current < lig_rigid_gpu->num_children + 1; - current++) { /* current starts from 1 (namely starts from first child node) */ - int parent = lig_rigid_gpu->parent[current]; - float torsion = x->lig_torsion[current - 1]; /* torsions are all related to child nodes */ - local_to_lab(lig_rigid_gpu->origin[current], lig_rigid_gpu->origin[parent], - lig_rigid_gpu->relative_origin[current], lig_rigid_gpu->orientation_m[parent]); - local_to_lab_direction(lig_rigid_gpu->axis[current], lig_rigid_gpu->relative_axis[current], - lig_rigid_gpu->orientation_m[parent]); - float tmp[4]; - float parent_q[4] - = {lig_rigid_gpu->orientation_q[parent][0], lig_rigid_gpu->orientation_q[parent][1], - lig_rigid_gpu->orientation_q[parent][2], lig_rigid_gpu->orientation_q[parent][3]}; - float current_axis[3] = {lig_rigid_gpu->axis[current][0], lig_rigid_gpu->axis[current][1], - lig_rigid_gpu->axis[current][2]}; - - angle_to_quaternion2(tmp, current_axis, torsion); - angle_to_quaternion_multi(tmp, parent_q); - quaternion_normalize_approx(tmp, epsilon_fl); - - for (int i = 0; i < 4; i++) - lig_rigid_gpu->orientation_q[current][i] = tmp[i]; /* set orientation_q */ - quaternion_to_r3(lig_rigid_gpu->orientation_q[current], - lig_rigid_gpu->orientation_m[current]); /* set orientation_m */ - - /* set coords */ - begin = lig_rigid_gpu->atom_range[current][0]; - end = lig_rigid_gpu->atom_range[current][1]; - for (int i = begin; i < end; i++) { - local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[current], atoms[i].coords, - lig_rigid_gpu->orientation_m[current]); - } - } - /* ************* end branches_set_conf ************* */ -} - -__device__ __forceinline__ void p_eval_deriv(float* out, int type_pair_index, float r2, - p_cuda_t* p_cuda_gpu, const float epsilon_fl) { - const float cutoff_sqr = p_cuda_gpu->m_cutoff_sqr; - if (r2 > cutoff_sqr) - DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!, r2 > Cutoff_sqr, r2=%f, cutoff_sqr=%f", r2, - cutoff_sqr); - - p_m_data_cuda_t* tmp = &p_cuda_gpu->m_data[type_pair_index]; - float r2_factored = tmp->factor * r2; - int i1 = (int)(r2_factored); - int i2 = i1 + 1; - float rem = r2_factored - i1; - if (rem < -epsilon_fl) DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!"); - if (rem >= 1 + epsilon_fl) DEBUG_PRINTF("\nkernel2: p_eval_deriv() ERROR!"); - float p1[2] = {tmp->smooth[i1][0], tmp->smooth[i1][1]}; - if (i1 >= SMOOTH_SIZE) p1[0] = p1[1] = 0; - float p2[2] = {tmp->smooth[i2][0], tmp->smooth[i2][1]}; - if (i2 >= SMOOTH_SIZE) p2[0] = p2[1] = 0; - float e = p1[0] + rem * (p2[0] - p1[0]); - float dor = 
p1[1] + rem * (p2[1] - p1[1]); - out[0] = e; - out[1] = dor; -} - -__device__ __forceinline__ void curl(float* e, float* deriv, float v, const float epsilon_fl) { - if (*e > 0 && not_max_gpu(v)) { - float tmp = (v < epsilon_fl) ? 0 : (v / (v + *e)); - (*e) = tmp * (*e); - for (int i = 0; i < 3; i++) deriv[i] = deriv[i] * (tmp * tmp); - } -} - -__device__ __forceinline__ float eval_interacting_pairs_deriv(p_cuda_t* p_cuda_gpu, const float v, - const lig_pairs_cuda_t* pairs, - const m_coords_cuda_t* m_coords, - m_minus_forces_t* minus_forces, - const float epsilon_fl) { - float e = 0; - for (int i = 0; i < pairs->num_pairs; i++) { - const int ip[3] = {pairs->type_pair_index[i], pairs->a[i], pairs->b[i]}; - int index = pairs->a[i] + pairs->b[i] * (pairs->b[i] + 1) / 2; - float coords_b[3] - = {m_coords->coords[ip[2]][0], m_coords->coords[ip[2]][1], m_coords->coords[ip[2]][2]}; - float coords_a[3] - = {m_coords->coords[ip[1]][0], m_coords->coords[ip[1]][1], m_coords->coords[ip[1]][2]}; - float r[3] - = {coords_b[0] - coords_a[0], coords_b[1] - coords_a[1], coords_b[2] - coords_a[2]}; - float r2 = r[0] * r[0] + r[1] * r[1] + r[2] * r[2]; - - if (r2 < p_cuda_gpu->m_cutoff_sqr) { - float tmp[2]; - p_eval_deriv(tmp, index, r2, p_cuda_gpu, epsilon_fl); - float force[3] = {r[0] * tmp[1], r[1] * tmp[1], r[2] * tmp[1]}; - curl(&tmp[0], force, v, epsilon_fl); - e += tmp[0]; - for (int j = 0; j < 3; j++) minus_forces->coords[ip[1]][j] -= force[j]; - for (int j = 0; j < 3; j++) minus_forces->coords[ip[2]][j] += force[j]; - } - } - return e; -} - -__device__ __forceinline__ void product(float* res, const float* a, const float* b) { - res[0] = a[1] * b[2] - a[2] * b[1]; - res[1] = a[2] * b[0] - a[0] * b[2]; - res[2] = a[0] * b[1] - a[1] * b[0]; -} - -__device__ __forceinline__ void POT_deriv(const m_minus_forces_t* minus_forces, - const rigid_cuda_t* lig_rigid_gpu, - const m_coords_cuda_t* m_coords, change_cuda_t* g) { - int num_torsion = lig_rigid_gpu->num_children; - int num_rigid = num_torsion + 1; - float position_derivative_tmp[MAX_NUM_OF_RIGID][3]; - float position_derivative[MAX_NUM_OF_RIGID][3]; - float orientation_derivative_tmp[MAX_NUM_OF_RIGID][3]; - float orientation_derivative[MAX_NUM_OF_RIGID][3]; - float torsion_derivative[MAX_NUM_OF_RIGID]; /* torsion_derivative[0] has no meaning(root node - has no torsion) */ - - for (int i = 0; i < num_rigid; i++) { - int begin = lig_rigid_gpu->atom_range[i][0]; - int end = lig_rigid_gpu->atom_range[i][1]; - for (int k = 0; k < 3; k++) position_derivative_tmp[i][k] = 0; - for (int k = 0; k < 3; k++) orientation_derivative_tmp[i][k] = 0; - for (int j = begin; j < end; j++) { - for (int k = 0; k < 3; k++) position_derivative_tmp[i][k] += minus_forces->coords[j][k]; - - float tmp1[3] = {m_coords->coords[j][0] - lig_rigid_gpu->origin[i][0], - m_coords->coords[j][1] - lig_rigid_gpu->origin[i][1], - m_coords->coords[j][2] - lig_rigid_gpu->origin[i][2]}; - float tmp2[3] = {minus_forces->coords[j][0], minus_forces->coords[j][1], - minus_forces->coords[j][2]}; - float tmp3[3]; - product(tmp3, tmp1, tmp2); - for (int k = 0; k < 3; k++) orientation_derivative_tmp[i][k] += tmp3[k]; - } - } - - /* position_derivative */ - for (int i = num_rigid - 1; i >= 0; i--) { /* from bottom to top */ - for (int k = 0; k < 3; k++) position_derivative[i][k] = position_derivative_tmp[i][k]; - /* looking for chidren node */ - for (int j = 0; j < num_rigid; j++) { - if (lig_rigid_gpu->children_map[i][j] == true) { - for (int k = 0; k < 3; k++) - position_derivative[i][k] += 
position_derivative[j][k]; /* self+children node */ - } - } - } - - /* orientation_derivetive */ - for (int i = num_rigid - 1; i >= 0; i--) { /* from bottom to top */ - for (int k = 0; k < 3; k++) orientation_derivative[i][k] = orientation_derivative_tmp[i][k]; - /* looking for chidren node */ - for (int j = 0; j < num_rigid; j++) { - if (lig_rigid_gpu->children_map[i][j] == true) { /* self + children node + product */ - for (int k = 0; k < 3; k++) - orientation_derivative[i][k] += orientation_derivative[j][k]; - float product_out[3]; - float origin_temp[3] = {lig_rigid_gpu->origin[j][0] - lig_rigid_gpu->origin[i][0], - lig_rigid_gpu->origin[j][1] - lig_rigid_gpu->origin[i][1], - lig_rigid_gpu->origin[j][2] - lig_rigid_gpu->origin[i][2]}; - product(product_out, origin_temp, position_derivative[j]); - for (int k = 0; k < 3; k++) orientation_derivative[i][k] += product_out[k]; - } - } - } - - /* torsion_derivative */ - for (int i = num_rigid - 1; i >= 0; i--) { - float sum = 0; - for (int j = 0; j < 3; j++) sum += orientation_derivative[i][j] * lig_rigid_gpu->axis[i][j]; - torsion_derivative[i] = sum; - } - - for (int k = 0; k < 3; k++) g->position[k] = position_derivative[0][k]; - for (int k = 0; k < 3; k++) g->orientation[k] = orientation_derivative[0][k]; - for (int k = 0; k < num_torsion; k++) g->lig_torsion[k] = torsion_derivative[k + 1]; -} - -__device__ __forceinline__ float m_eval_deriv(output_type_cuda_t* c, change_cuda_t* g, - m_cuda_t* m_cuda_gpu, p_cuda_t* p_cuda_gpu, - ig_cuda_t* ig_cuda_gpu, const float* v, - const float epsilon_fl) { - // check set args - set(c, &m_cuda_gpu->ligand.rigid, &m_cuda_gpu->m_coords, m_cuda_gpu->atoms, - m_cuda_gpu->m_num_movable_atoms, epsilon_fl); - - float e = 0; - e = ig_eval_deriv(c, g, v[1], ig_cuda_gpu, m_cuda_gpu, epsilon_fl); - e += eval_interacting_pairs_deriv(p_cuda_gpu, v[0], &m_cuda_gpu->ligand.pairs, - &m_cuda_gpu->m_coords, &m_cuda_gpu->minus_forces, epsilon_fl); - // should add derivs for glue, other and inter pairs - POT_deriv(&m_cuda_gpu->minus_forces, &m_cuda_gpu->ligand.rigid, &m_cuda_gpu->m_coords, g); - - return e; -} - -__device__ __forceinline__ float find_change_index_read(const change_cuda_t* g, int index) { - if (index < 3) return g->position[index]; - index -= 3; - if (index < 3) return g->orientation[index]; - index -= 3; - if (index < g->lig_torsion_size) return g->lig_torsion[index]; - DEBUG_PRINTF("\nKernel2:find_change_index_read() ERROR!"); /* Shouldn't be here */ -} - -__device__ __forceinline__ void find_change_index_write(change_cuda_t* g, int index, float data) { - if (index < 3) { - g->position[index] = data; - return; - } - index -= 3; - if (index < 3) { - g->orientation[index] = data; - return; - } - index -= 3; - if (index < g->lig_torsion_size) { - g->lig_torsion[index] = data; - return; - } - DEBUG_PRINTF("\nKernel2:find_change_index_write() ERROR!"); /* Shouldn't be here */ -} - -__device__ __forceinline__ void minus_mat_vec_product(const matrix_d* h, const change_cuda_t* in, - change_cuda_t* out) { - int n = h->dim; - for (int i = 0; i < n; i++) { - float sum = 0; - for (int j = 0; j < n; j++) { - sum += h->data[index_permissive(h, i, j)] * find_change_index_read(in, j); - } - find_change_index_write(out, i, -sum); - } -} - -__device__ __forceinline__ float scalar_product(const change_cuda_t* a, const change_cuda_t* b, - int n) { - float tmp = 0; - for (int i = 0; i < n; i++) { - tmp += find_change_index_read(a, i) * find_change_index_read(b, i); - } - return tmp; -} - -__device__ __forceinline__ float 
line_search(m_cuda_t* m_cuda_gpu, p_cuda_t* p_cuda_gpu, - ig_cuda_t* ig_cuda_gpu, int n, - const output_type_cuda_t* x, const change_cuda_t* g, - const float f0, const change_cuda_t* p, - output_type_cuda_t* x_new, change_cuda_t* g_new, - float* f1, const float epsilon_fl, - const float* hunt_cap) { - const float c0 = 0.0001; - const int max_trials = 10; - const float multiplier = 0.5; - float alpha = 1; - - const float pg = scalar_product(p, g, n); - - for (int trial = 0; trial < max_trials; trial++) { - output_type_cuda_init_with_output(x_new, x); - output_type_cuda_increment(x_new, p, alpha, epsilon_fl); - *f1 = m_eval_deriv(x_new, g_new, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, hunt_cap, epsilon_fl); - if (*f1 - f0 < c0 * alpha * pg) break; - alpha *= multiplier; - } - return alpha; -} - -__device__ __forceinline__ bool bfgs_update(matrix_d* h, const change_cuda_t* p, - const change_cuda_t* y, const float alpha, - const float epsilon_fl) { - const float yp = scalar_product(y, p, h->dim); - - if (alpha * yp < epsilon_fl) return false; - change_cuda_t minus_hy; - change_cuda_init_with_change(&minus_hy, y); - minus_mat_vec_product(h, y, &minus_hy); - const float yhy = -scalar_product(y, &minus_hy, h->dim); - const float r = 1 / (alpha * yp); - const int n = 6 + p->lig_torsion_size; - - for (int i = 0; i < n; i++) { - for (int j = i; j < n; j++) { - float tmp - = alpha * r - * (find_change_index_read(&minus_hy, i) * find_change_index_read(p, j) - + find_change_index_read(&minus_hy, j) * find_change_index_read(p, i)) - + +alpha * alpha * (r * r * yhy + r) * find_change_index_read(p, i) - * find_change_index_read(p, j); - - h->data[i + j * (j + 1) / 2] += tmp; - } - } - - return true; -} - -__device__ __forceinline__ void bfgs(output_type_cuda_t* x, change_cuda_t* g, m_cuda_t* m_cuda_gpu, - p_cuda_t* p_cuda_gpu, ig_cuda_t* ig_cuda_gpu, - const float* hunt_cap, const float epsilon_fl, - const int max_steps) { - int n = 3 + 3 + x->lig_torsion_size; /* the dimensions of matirx */ - - matrix_d h; - matrix_d_init(&h, n, 0); - matrix_d_set_diagonal(&h, 1); - - change_cuda_t g_new; - change_cuda_init_with_change(&g_new, g); - - output_type_cuda_t x_new; - output_type_cuda_init_with_output(&x_new, x); - - float f0 = m_eval_deriv(x, g, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, hunt_cap, epsilon_fl); - - float f_orig = f0; - /* Init g_orig, x_orig */ - change_cuda_t g_orig; - change_cuda_init_with_change(&g_orig, g); - output_type_cuda_t x_orig; - output_type_cuda_init_with_output(&x_orig, x); - /* Init p */ - change_cuda_t p; - change_cuda_init_with_change(&p, g); - - for (int step = 0; step < max_steps; step++) { - minus_mat_vec_product(&h, g, &p); - float f1 = 0; - - const float alpha = line_search(m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, n, x, g, f0, &p, - &x_new, &g_new, &f1, epsilon_fl, hunt_cap); - - change_cuda_t y; - change_cuda_init_with_change(&y, &g_new); - /* subtract_change */ - for (int i = 0; i < n; i++) { - float tmp = find_change_index_read(&y, i) - find_change_index_read(g, i); - find_change_index_write(&y, i, tmp); - } - // f_values[step + 1] = f1; - f0 = f1; - output_type_cuda_init_with_output(x, &x_new); - if (!(sqrt(scalar_product(g, g, n)) >= 1e-5)) break; - change_cuda_init_with_change(g, &g_new); - - if (step == 0) { - float yy = scalar_product(&y, &y, n); - if (fabs(yy) > epsilon_fl) { - matrix_d_set_diagonal(&h, alpha * scalar_product(&y, &p, n) / yy); - } - } - - bool h_updated = bfgs_update(&h, &p, &y, alpha, epsilon_fl); - } - - if (!(f0 <= f_orig)) { - f0 = f_orig; - 
output_type_cuda_init_with_output(x, &x_orig); - change_cuda_init_with_change(g, &g_orig); - } - - // write output_type_cuda energy - x->e = f0; -} +#include "quasi_newton.h" +#include <cooperative_groups.h> +#include +#include -/* Above based on quasi_newton.cpp */ +namespace cg = cooperative_groups; /* Below is monte-carlo kernel, based on kernel.cl*/ -__device__ __forceinline__ void m_cuda_init_with_m_cuda(const m_cuda_t* m_cuda_old, - m_cuda_t* m_cuda_new) { - memcpy(m_cuda_new, m_cuda_old, MAX_NUM_OF_ATOMS * sizeof(atom_cuda_t)); - m_cuda_new->m_coords = m_cuda_old->m_coords; - m_cuda_new->minus_forces = m_cuda_old->minus_forces; - m_cuda_new->ligand = m_cuda_old->ligand; - m_cuda_new->m_num_movable_atoms = m_cuda_old->m_num_movable_atoms; -} - -__device__ __forceinline__ void get_heavy_atom_movable_coords(output_type_cuda_t* tmp, - const m_cuda_t* m_cuda_gpu) { +__device__ __forceinline__ void get_heavy_atom_movable_coords(output_type_cuda_t *tmp, + const m_cuda_t *m_cuda_gpu) { int counter = 0; for (int i = 0; i < m_cuda_gpu->m_num_movable_atoms; i++) { if (m_cuda_gpu->atoms[i].types[0] != EL_TYPE_H) { for (int j = 0; j < 3; j++) tmp->coords[counter][j] = m_cuda_gpu->m_coords.coords[i][j]; counter++; } else { - // DEBUG_PRINTF("\n P2: removed H atom coords in get_heavy_atom_movable_coords()!"); + // DEBUG_PRINTF("\n P2: removed H atom coords in + // get_heavy_atom_movable_coords()!"); } } /* assign 0 for others */ @@ -1001,8 +64,8 @@ __device__ __forceinline__ void get_heavy_atom_movable_coords(output_type_cuda_t } } -__device__ __forceinline__ float generate_n(const float* pi_map, const int step) { - return fabs(pi_map[step]) / M_PI; +__device__ __forceinline__ float generate_n(const float *pi_map, const int step) { + return fabs(pi_map[step]) / M_PI_F; } __device__ __forceinline__ bool metropolis_accept(float old_f, float new_f, float temperature, @@ -1012,8 +75,8 @@ __device__ __forceinline__ bool metropolis_accept(float old_f, float new_f, floa return n < acceptance_probability; } -__device__ __forceinline__ void write_back(output_type_cuda_t* results, - const output_type_cuda_t* best_out) { +__device__ __forceinline__ void write_back(output_type_cuda_t *results, + const output_type_cuda_t *best_out) { for (int i = 0; i < 3; i++) results->position[i] = best_out->position[i]; for (int i = 0; i < 4; i++) results->orientation[i] = best_out->orientation[i]; for (int i = 0; i < MAX_NUM_OF_LIG_TORSION; i++) @@ -1028,76 +91,125 @@ __device__ __forceinline__ void write_back(output_type_cuda_t *results, } } } -// MAX_THREADS_PER_BLOCK and MIN_BLOCKS_PER_MP should be adjusted according to the profiling results +// MAX_THREADS_PER_BLOCK and MIN_BLOCKS_PER_MP should be adjusted according to +// the profiling results #define MAX_THREADS_PER_BLOCK 32 #define MIN_BLOCKS_PER_MP 32 +template <unsigned int TileSize> __global__ __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) void kernel( - m_cuda_t* m_cuda_global, ig_cuda_t* ig_cuda_gpu, p_cuda_t* p_cuda_gpu, - float* rand_molec_struc_gpu, float* best_e_gpu, int bfgs_max_steps, float mutation_amplitude, - curandStatePhilox4_32_10_t* states, unsigned long long seed, float epsilon_fl, - float* hunt_cap_gpu, float* authentic_v_gpu, output_type_cuda_t* results, int search_depth, - int num_of_ligands, int threads_per_ligand, bool multi_bias) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; + m_cuda_t *m_cuda_global, ig_cuda_t *ig_cuda_gpu, p_cuda_t *p_cuda_gpu, + float *rand_molec_struc_gpu, float *best_e_gpu, int bfgs_max_steps, float mutation_amplitude,
+ curandStatePhilox4_32_10_t *states, unsigned long long seed, float epsilon_fl, + float *hunt_cap_gpu, float *authentic_v_gpu, output_type_cuda_t *results, + output_type_cuda_t *output_aux, change_cuda_t *change_aux, pot_cuda_t *pot_aux, + matrix_d *h_cuda_gpu, m_cuda_t *m_cuda_gpu, int search_depth, int num_of_ligands, + int threads_per_ligand, bool multi_bias) { + int bid = blockIdx.x, tid = threadIdx.x; + int pose_id = (bid * warpSize + tid) / TileSize; + auto tb = cg::this_thread_block(); + cg::thread_block_tile<TileSize> tile = cg::tiled_partition<TileSize>(tb); + if (m_cuda_global[pose_id / threads_per_ligand].m_num_movable_atoms == -1) { + return; + } + float best_e = INFINITY; + // __shared__ output_type_cuda_t tmp, best_out, candidate, x_new, x_orig; + output_type_cuda_t &tmp = output_aux[pose_id * 5]; + output_type_cuda_t &best_out = output_aux[pose_id * 5 + 1]; + output_type_cuda_t &candidate = output_aux[pose_id * 5 + 2]; + output_type_cuda_t &x_new = output_aux[pose_id * 5 + 3]; + output_type_cuda_t &x_orig = output_aux[pose_id * 5 + 4]; + + // __shared__ change_cuda_t g, tmp1, tmp2, tmp3, tmp4; + change_cuda_t &g = change_aux[pose_id * 5]; + change_cuda_t &tmp1 = change_aux[pose_id * 5 + 1]; + change_cuda_t &tmp2 = change_aux[pose_id * 5 + 2]; + change_cuda_t &tmp3 = change_aux[pose_id * 5 + 3]; + change_cuda_t &tmp4 = change_aux[pose_id * 5 + 4]; + + if (pose_id < num_of_ligands * threads_per_ligand) { + output_type_cuda_init_warp( + tile, &tmp, rand_molec_struc_gpu + pose_id * (SIZE_OF_MOLEC_STRUC / sizeof(float))); + + m_cuda_init_with_m_cuda_warp(tile, &m_cuda_global[pose_id / threads_per_ligand], + &m_cuda_gpu[pose_id]); + + if (tile.thread_rank() == 0) { + curand_init(seed, pose_id, 0, &states[pose_id]); + g.lig_torsion_size = tmp.lig_torsion_size; + } + tile.sync(); - if (idx < num_of_ligands * threads_per_ligand) { - // if (idx % 100 == 0)DEBUG_PRINTF("\nThread %d START", idx); - output_type_cuda_t tmp; // private memory, shared only in work item - change_cuda_t g; - m_cuda_t m_cuda_gpu; - // update pointer to get correct ligand data - output_type_cuda_init(&tmp, - rand_molec_struc_gpu + idx * (SIZE_OF_MOLEC_STRUC / sizeof(float))); - curand_init(seed, idx, 0, &states[idx]); - m_cuda_init_with_m_cuda(m_cuda_global + idx / threads_per_ligand, &m_cuda_gpu); if (multi_bias) { - ig_cuda_gpu = ig_cuda_gpu + idx / threads_per_ligand; + ig_cuda_gpu += pose_id / threads_per_ligand; } - if (m_cuda_gpu.m_num_movable_atoms == -1) { - return; - } - p_cuda_gpu = p_cuda_gpu + idx / threads_per_ligand; - g.lig_torsion_size = tmp.lig_torsion_size; - // BFGS - output_type_cuda_t best_out; - output_type_cuda_t candidate; + pot_aux += pose_id; + p_cuda_gpu += pose_id / threads_per_ligand; + // BFGS for (int step = 0; step < search_depth; step++) { - output_type_cuda_init_with_output(&candidate, &tmp); - mutate_conf_cuda(bfgs_max_steps, &candidate, &states[idx], m_cuda_gpu.ligand.begin, - m_cuda_gpu.ligand.end, m_cuda_gpu.atoms, &m_cuda_gpu.m_coords, - m_cuda_gpu.ligand.rigid.origin[0], epsilon_fl, mutation_amplitude); - bfgs(&candidate, &g, &m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, hunt_cap_gpu, epsilon_fl, - bfgs_max_steps); - // n ~ U[0,1] - float n = curand_uniform(&states[idx]); - - // if (idx == 0) - // DEBUG_PRINTF("metropolis_accept tmp.e=%f, candidate.e=%f, n=%f\n", tmp.e, - // candidate.e, n); - - if (step == 0 || metropolis_accept(tmp.e, candidate.e, 1.2, n)) { - output_type_cuda_init_with_output(&tmp, &candidate); - set(&tmp, &m_cuda_gpu.ligand.rigid, &m_cuda_gpu.m_coords, m_cuda_gpu.atoms, - m_cuda_gpu.m_num_movable_atoms, epsilon_fl);
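
/*
 * The block above replaces the old one-thread-per-pose loop: a 32-thread
 * block is split into TileSize-wide tiles, each tile owns one pose, lane 0
 * does the serial work (cuRAND draws, scalar bookkeeping) and broadcasts the
 * result with shfl. A stripped-down sketch of the idiom; all names here are
 * illustrative, not part of this patch.
 */
#include <cooperative_groups.h>
namespace cg_demo = cooperative_groups;

template <unsigned int TS>
__global__ void one_pose_per_tile(float *per_pose_out, int num_poses) {
    auto tile = cg_demo::tiled_partition<TS>(cg_demo::this_thread_block());
    int pose = (blockIdx.x * blockDim.x + threadIdx.x) / TS;
    if (pose >= num_poses) return;
    float n = 0.f;
    if (tile.thread_rank() == 0) n = 0.42f;  // e.g. a per-pose random draw
    n = tile.shfl(n, 0);                     // broadcast lane 0's value
    tile.sync();                             // publish any tile-shared writes
    if (tile.thread_rank() == 0) per_pose_out[pose] = n;
}
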
+ output_type_cuda_init_with_output_warp(tile, &candidate, &tmp); + + if (tile.thread_rank() == 0) + mutate_conf_cuda(bfgs_max_steps, &candidate, &states[pose_id], + m_cuda_gpu[pose_id].ligand.begin, m_cuda_gpu[pose_id].ligand.end, + m_cuda_gpu[pose_id].atoms, &m_cuda_gpu[pose_id].m_coords, + m_cuda_gpu[pose_id].ligand.rigid.origin[0], epsilon_fl, + mutation_amplitude); + tile.sync(); + + bfgs_warp(tile, &candidate, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, + &h_cuda_gpu[pose_id], &m_cuda_gpu[pose_id], p_cuda_gpu, ig_cuda_gpu, pot_aux, + hunt_cap_gpu, epsilon_fl, bfgs_max_steps); + + bool accepted; + if (tile.thread_rank() == 0) { + // n ~ U[0,1] + float n = curand_uniform(&states[pose_id]); + accepted = metropolis_accept(tmp.e, candidate.e, 1.2, n); + } + accepted = tile.shfl(accepted, 0); + + if (step == 0 || accepted) { + output_type_cuda_init_with_output_warp(tile, &tmp, &candidate); + + if (tile.thread_rank() == 0) { + set(&tmp, &m_cuda_gpu[pose_id].ligand.rigid, &m_cuda_gpu[pose_id].m_coords, + m_cuda_gpu[pose_id].atoms, m_cuda_gpu[pose_id].m_num_movable_atoms, + epsilon_fl); + } + tile.sync(); + if (tmp.e < best_e) { - bfgs(&tmp, &g, &m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, authentic_v_gpu, - epsilon_fl, bfgs_max_steps); + bfgs_warp(tile, &tmp, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, + &h_cuda_gpu[pose_id], &m_cuda_gpu[pose_id], p_cuda_gpu, ig_cuda_gpu, + pot_aux, authentic_v_gpu, epsilon_fl, bfgs_max_steps); + // set if (tmp.e < best_e) { - set(&tmp, &m_cuda_gpu.ligand.rigid, &m_cuda_gpu.m_coords, m_cuda_gpu.atoms, - m_cuda_gpu.m_num_movable_atoms, epsilon_fl); - output_type_cuda_init_with_output(&best_out, &tmp); - get_heavy_atom_movable_coords(&best_out, &m_cuda_gpu); // get coords + if (tile.thread_rank() == 0) + set(&tmp, &m_cuda_gpu[pose_id].ligand.rigid, + &m_cuda_gpu[pose_id].m_coords, m_cuda_gpu[pose_id].atoms, + m_cuda_gpu[pose_id].m_num_movable_atoms, epsilon_fl); + tile.sync(); + + output_type_cuda_init_with_output_warp(tile, &best_out, &tmp); + + if (tile.thread_rank() == 0) { + get_heavy_atom_movable_coords(&best_out, + &m_cuda_gpu[pose_id]); // get coords + } + tile.sync(); + best_e = tmp.e; } } } } + // write the best conformation back to CPU // FIX?? should add more - write_back(results + idx, &best_out); - // if (idx % 100 == 0) DEBUG_PRINTF("\nThread %d FINISH", idx); + write_back_warp(tile, results + pose_id, &best_out); } } @@ -1136,30 +248,31 @@ std::vector<output_type> monte_carlo::cuda_to_vina(output_type_cuda_t results_pt } __host__ void monte_carlo::operator()( - std::vector<model>& m_gpu, std::vector<output_container>& out_gpu, - std::vector<precalculate_byatom>& p_gpu, triangular_matrix_cuda_t* m_data_list_gpu, - const igrid& ig, const vec& corner1, const vec& corner2, rng& generator, int verbosity, - unsigned long long seed, std::vector<std::vector<bias_element>>& bias_batch_list) const { + std::vector<model> &m_gpu, std::vector<output_container> &out_gpu, + std::vector<precalculate_byatom> &p_gpu, triangular_matrix_cuda_t *m_data_list_gpu, + const igrid &ig, const vec &corner1, const vec &corner2, rng &generator, int verbosity, + unsigned long long seed, std::vector<std::vector<bias_element>> &bias_batch_list) const { /* Definitions from vina1.2 */ DEBUG_PRINTF("entering CUDA monte_carlo search\n"); // debug - vec authentic_v(1000, 1000, 1000); // FIXME? this is here to avoid max_fl/max_fl + vec authentic_v(1000, 1000, + 1000); // FIXME? this is here to avoid max_fl/max_fl quasi_newton quasi_newton_par; const int quasi_newton_par_max_steps = local_steps; // no need to decrease step /* Allocate CPU memory and define new data structure */ DEBUG_PRINTF("Allocating CPU memory\n"); // debug - m_cuda_t* m_cuda; + m_cuda_t *m_cuda; checkCUDA(cudaMallocHost(&m_cuda, sizeof(m_cuda_t))); - output_type_cuda_t* rand_molec_struc_tmp; + output_type_cuda_t *rand_molec_struc_tmp; checkCUDA(cudaMallocHost(&rand_molec_struc_tmp, sizeof(output_type_cuda_t))); - ig_cuda_t* ig_cuda_ptr; + ig_cuda_t *ig_cuda_ptr; checkCUDA(cudaMallocHost(&ig_cuda_ptr, sizeof(ig_cuda_t))); - p_cuda_t_cpu* p_cuda; + p_cuda_t_cpu *p_cuda; checkCUDA(cudaMallocHost(&p_cuda, sizeof(p_cuda_t_cpu))); /* End CPU allocation */ @@ -1176,47 +289,53 @@ __host__ void monte_carlo::operator()( DEBUG_PRINTF("p_cuda_size_gpu=%lu\n", p_cuda_size_gpu); // rand_molec_struc_gpu - float* rand_molec_struc_gpu; + float *rand_molec_struc_gpu; checkCUDA(cudaMalloc(&rand_molec_struc_gpu, thread * SIZE_OF_MOLEC_STRUC)); // best_e_gpu - float* best_e_gpu; + float *best_e_gpu; float epsilon_fl_float = static_cast<float>(epsilon_fl); checkCUDA(cudaMalloc(&best_e_gpu, sizeof(float))); checkCUDA(cudaMemcpy(best_e_gpu, &max_fl, sizeof(float), cudaMemcpyHostToDevice)); // use cuRand to generate random values on GPU - curandStatePhilox4_32_10_t* states; + curandStatePhilox4_32_10_t *states; DEBUG_PRINTF("random states size=%lu\n", sizeof(curandStatePhilox4_32_10_t) * thread); checkCUDA(cudaMalloc(&states, sizeof(curandStatePhilox4_32_10_t) * thread)); // hunt_cap_gpu - float* hunt_cap_gpu; + float *hunt_cap_gpu; float hunt_cap_float[3] = {static_cast<float>(hunt_cap[0]), static_cast<float>(hunt_cap[1]), static_cast<float>(hunt_cap[2])}; checkCUDA(cudaMalloc(&hunt_cap_gpu, 3 * sizeof(float))); // Preparing m related data - m_cuda_t* m_cuda_gpu; + m_cuda_t *m_cuda_gpu; DEBUG_PRINTF("m_cuda_size=%lu", m_cuda_size); checkCUDA(cudaMalloc(&m_cuda_gpu, num_of_ligands * m_cuda_size)); // Preparing p related data - p_cuda_t* p_cuda_gpu; + p_cuda_t *p_cuda_gpu; checkCUDA(cudaMalloc(&p_cuda_gpu, num_of_ligands * p_cuda_size_gpu)); DEBUG_PRINTF("p_cuda_gpu=%p\n", p_cuda_gpu); // Preparing ig related data (cache related data) - ig_cuda_t* ig_cuda_gpu; + ig_cuda_t *ig_cuda_gpu; - float* authentic_v_gpu; + float *authentic_v_gpu; float authentic_v_float[3] = {static_cast<float>(authentic_v[0]), static_cast<float>(authentic_v[1]), static_cast<float>(authentic_v[2])}; checkCUDA(cudaMalloc(&authentic_v_gpu, sizeof(authentic_v_float))); // Preparing result data - output_type_cuda_t* results_gpu; + output_type_cuda_t *results_gpu; checkCUDA(cudaMalloc(&results_gpu, thread * sizeof(output_type_cuda_t))); + m_cuda_t *m_cuda_global; + checkCUDA(cudaMalloc(&m_cuda_global, thread * sizeof(m_cuda_t))); + + matrix_d *h_cuda_global; + checkCUDA(cudaMalloc(&h_cuda_global, thread * sizeof(matrix_d))); + /* End Allocating GPU Memory */ assert(num_of_ligands <= MAX_LIGAND_NUM); @@ -1225,7 +344,7 @@ __host__ void monte_carlo::operator()( struct tmp_struct { int start_index = 0; int parent_index = 0; - void store_node(tree<segment>& child_ptr, rigid_cuda_t& rigid) { + void store_node(tree<segment> &child_ptr, rigid_cuda_t &rigid) { start_index++; // start with index 1, index 0 is root node rigid.parent[start_index] = parent_index; rigid.atom_range[start_index][0] = child_ptr.node.begin; @@ -1256,8 +375,8 @@ __host__ void monte_carlo::operator()( }; for (int l = 0; l < num_of_ligands; ++l) { - model& m = m_gpu[l]; - const precalculate_byatom& p = p_gpu[l]; + model &m = m_gpu[l]; + const precalculate_byatom &p = p_gpu[l];
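
/*
 * Why curand_init(seed, pose_id, 0, ...) in the kernel above is safe: Philox
 * treats the second argument as a subsequence index, so every pose draws from
 * a statistically independent stream derived from one seed. Minimal sketch;
 * the kernel name and launch shape are illustrative, not from this patch.
 */
#include <curand_kernel.h>

__global__ void init_rng(curandStatePhilox4_32_10_t *states,
                         unsigned long long seed, int n) {
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id < n)
        curand_init(seed, /*subsequence=*/id, /*offset=*/0, &states[id]);
}
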
/* Prepare m related data */ conf_size s = m.get_size(); @@ -1310,7 +429,7 @@ __host__ void monte_carlo::operator()( } m_cuda->ligand.begin = m.ligands[0].begin; // 0 m_cuda->ligand.end = m.ligands[0].end; // 29 - ligand& m_ligand = m.ligands[0]; // Only support one ligand + ligand &m_ligand = m.ligands[0]; // Only support one ligand DEBUG_PRINTF("m_ligand.end=%lu, MAX_NUM_OF_ATOMS=%d\n", m_ligand.end, MAX_NUM_OF_ATOMS); assert(m_ligand.end < MAX_NUM_OF_ATOMS); @@ -1342,14 +461,24 @@ __host__ void monte_carlo::operator()( } m_cuda->ligand.rigid.num_children = ts.start_index; - // set children_map + // set children map DEBUG_PRINTF("set children map\n"); for (int i = 0; i < MAX_NUM_OF_RIGID; i++) - for (int j = 0; j < MAX_NUM_OF_RIGID; j++) + for (int j = 0; j < MAX_NUM_OF_RIGID; j++) { m_cuda->ligand.rigid.children_map[i][j] = false; - for (int i = 1; i < m_cuda->ligand.rigid.num_children + 1; i++) { - int parent_index = m_cuda->ligand.rigid.parent[i]; - m_cuda->ligand.rigid.children_map[parent_index][i] = true; + m_cuda->ligand.rigid.descendant_map[i][j] = false; + } + + for (int i = MAX_NUM_OF_RIGID - 1; i >= 0; i--) { + if (i > 0) { + m_cuda->ligand.rigid.children_map[m_cuda->ligand.rigid.parent[i]][i] = true; + m_cuda->ligand.rigid.descendant_map[m_cuda->ligand.rigid.parent[i]][i] = true; + } + for (int j = i + 1; j < MAX_NUM_OF_RIGID; j++) { + if (m_cuda->ligand.rigid.descendant_map[i][j]) + m_cuda->ligand.rigid.descendant_map[m_cuda->ligand.rigid.parent[i]][j] + = true; + } } m_cuda->m_num_movable_atoms = m.num_movable_atoms(); @@ -1369,9 +498,9 @@ __host__ void monte_carlo::operator()( for (int i = 0; i < threads_per_ligand; ++i) { if (!local_only) { - tmp.c.randomize( - corner1, corner2, - generator); // generate a random structure, can move to GPU if necessary + tmp.c.randomize(corner1, corner2, + generator); // generate a random structure, + // can move to GPU if necessary } for (int j = 0; j < 3; j++) rand_molec_struc_tmp->position[j] = tmp.c.ligands[0].rigid.position[j]; @@ -1395,7 +524,7 @@ __host__ void monte_carlo::operator()( rand_molec_struc_tmp->lig_torsion_size = lig_torsion_size; - float* rand_molec_struc_gpu_tmp + float *rand_molec_struc_gpu_tmp = rand_molec_struc_gpu + (l * threads_per_ligand + i) * SIZE_OF_MOLEC_STRUC / sizeof(float); checkCUDA(cudaMemcpy(rand_molec_struc_gpu_tmp, rand_molec_struc_tmp, @@ -1412,7 +541,7 @@ __host__ void monte_carlo::operator()( p_cuda->m_data_size = p.m_data.m_data.size(); checkCUDA(cudaMemcpy(p_cuda_gpu + l, p_cuda, sizeof(p_cuda_t), cudaMemcpyHostToDevice)); checkCUDA(cudaMemcpy(&(p_cuda_gpu[l].m_data), &(m_data_list_gpu[l].p_data), - sizeof(p_m_data_cuda_t*), + sizeof(p_m_data_cuda_t *), cudaMemcpyHostToDevice)); // check if fl == float } } @@ -1588,16 +717,24 @@ __host__ void monte_carlo::operator()( /* Launch kernel */ DEBUG_PRINTF("launch kernel, global_steps=%d, thread=%d, num_of_ligands=%d\n", global_steps, thread, num_of_ligands); - kernel<<>>(m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu, - best_e_gpu, quasi_newton_par_max_steps, - mutation_amplitude_float, states, seed, epsilon_fl_float, - hunt_cap_gpu, authentic_v_gpu, results_gpu, global_steps, - num_of_ligands, threads_per_ligand, multi_bias); + + output_type_cuda_t *results_aux; + checkCUDA(cudaMalloc(&results_aux, 5 * thread * sizeof(output_type_cuda_t))); + change_cuda_t *change_aux; + checkCUDA(cudaMalloc(&change_aux, 5 * thread * sizeof(change_cuda_t))); + pot_cuda_t *pot_aux; + checkCUDA(cudaMalloc(&pot_aux, thread * sizeof(pot_cuda_t)));
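
/*
 * The bottom-up pass above derives two relations from the parent[] array
 * filled in by store_node(): children_map (direct children) and
 * descendant_map (its transitive closure). It relies on the DFS numbering --
 * every child index is larger than its parent's. A host-side sketch of the
 * same closure, with illustrative names only:
 */
#include <vector>

void build_maps(const std::vector<int> &parent,  // parent[0] unused (root)
                std::vector<std::vector<bool>> &children_map,
                std::vector<std::vector<bool>> &descendant_map) {
    const int n = (int)parent.size();
    children_map.assign(n, std::vector<bool>(n, false));
    descendant_map.assign(n, std::vector<bool>(n, false));
    for (int i = n - 1; i > 0; --i) {    // bottom-up over DFS indices
        children_map[parent[i]][i] = true;
        descendant_map[parent[i]][i] = true;
        for (int j = i + 1; j < n; ++j)  // lift i's descendants to its parent
            if (descendant_map[i][j]) descendant_map[parent[i]][j] = true;
    }
}
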
+ + kernel<32><<>>( + m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu, best_e_gpu, + quasi_newton_par_max_steps, mutation_amplitude_float, states, seed, epsilon_fl_float, + hunt_cap_gpu, authentic_v_gpu, results_gpu, results_aux, change_aux, pot_aux, h_cuda_global, + m_cuda_global, global_steps, num_of_ligands, threads_per_ligand, multi_bias); // Device to Host memcpy of precalculated_byatom, copy back data to p_gpu - p_m_data_cuda_t* p_data; + p_m_data_cuda_t *p_data; checkCUDA(cudaMallocHost(&p_data, sizeof(p_m_data_cuda_t) * MAX_P_DATA_M_DATA_SIZE)); - output_type_cuda_t* results; + output_type_cuda_t *results; checkCUDA(cudaMallocHost(&results, thread * sizeof(output_type_cuda_t))); for (int l = 0; l < num_of_ligands; ++l) { @@ -1614,10 +751,12 @@ __host__ void monte_carlo::operator()( } // DEBUG_PRINTF("energies about the first ligand on GPU:\n"); // for (int i = 0;i < 20; ++i){ - // DEBUG_PRINTF("precalculated_byatom.m_data.m_data[%d]: (smooth.first, smooth.second, fast) - // ", i); for (int j = 0;j < FAST_SIZE; ++j){ - // DEBUG_PRINTF("(%f, %f, %f) ", p_gpu[0].m_data.m_data[i].smooth[j].first, - // p_gpu[0].m_data.m_data[i].smooth[j].second, p_gpu[0].m_data.m_data[i].fast[j]); + // DEBUG_PRINTF("precalculated_byatom.m_data.m_data[%d]: (smooth.first, + // smooth.second, fast) ", i); for (int j = 0;j < FAST_SIZE; ++j){ + // DEBUG_PRINTF("(%f, %f, %f) ", + // p_gpu[0].m_data.m_data[i].smooth[j].first, + // p_gpu[0].m_data.m_data[i].smooth[j].second, + // p_gpu[0].m_data.m_data[i].fast[j]); // } // DEBUG_PRINTF("\n"); // } @@ -1629,7 +768,8 @@ __host__ void monte_carlo::operator()( cudaEventSynchronize(stop); float msecTotal = 0.0f; cudaEventElapsedTime(&msecTotal, start, stop); - DEBUG_PRINTF("Time spend on GPU is %f ms\n", msecTotal); + printf("Time spent on GPU is %f ms\n", msecTotal); + // DEBUG_PRINTF("Time spent on GPU is %f ms\n", msecTotal); /* Convert result data. Can be improved by mapping memory */ @@ -1663,6 +803,8 @@ __host__ void monte_carlo::operator()( checkCUDA(cudaFree(hunt_cap_gpu)); checkCUDA(cudaFree(authentic_v_gpu)); checkCUDA(cudaFree(results_gpu)); + checkCUDA(cudaFree(change_aux)); + checkCUDA(cudaFree(results_aux)); checkCUDA(cudaFree(states)); checkCUDA(cudaFreeHost(m_cuda)); checkCUDA(cudaFreeHost(rand_molec_struc_tmp)); checkCUDA(cudaFreeHost(ig_cuda_ptr)); @@ -1674,17 +816,18 @@ __host__ void monte_carlo::operator()( DEBUG_PRINTF("exit monte_carlo\n"); } -bool metropolis_accept(fl old_f, fl new_f, fl temperature, rng& generator) { +bool metropolis_accept(fl old_f, fl new_f, fl temperature, rng &generator) { if (new_f < old_f) return true; const fl acceptance_probability = std::exp((old_f - new_f) / temperature); return random_fl(0, 1, generator) < acceptance_probability; } -__host__ void monte_carlo::operator()(model& m, output_container& out, const precalculate_byatom& p, - const igrid& ig, const vec& corner1, const vec& corner2, - rng& generator) const { +__host__ void monte_carlo::operator()(model &m, output_container &out, const precalculate_byatom &p, + const igrid &ig, const vec &corner1, const vec &corner2, + rng &generator) const { int evalcount = 0; - vec authentic_v(1000, 1000, 1000); // FIXME? this is here to avoid max_fl/max_fl + vec authentic_v(1000, 1000, + 1000); // FIXME?
this is here to avoid max_fl/max_fl conf_size s = m.get_size(); change g(s); output_type tmp(s, 0); @@ -1709,7 +852,8 @@ __host__ void monte_carlo::operator()(model &m, output_container &out, const pre quasi_newton_par(m, p, ig, tmp, g, authentic_v, evalcount); m.set(tmp.c); // FIXME? useless? tmp.coords = m.get_heavy_atom_movable_coords(); - add_to_output_container(out, tmp, min_rmsd, num_saved_mins); // 20 - max size + add_to_output_container(out, tmp, min_rmsd, + num_saved_mins); // 20 - max size if (tmp.e < best_e) best_e = tmp.e; } } diff --git a/unidock/src/cuda/precalculate.cu b/unidock/src/cuda/precalculate.cu index 675ac70..03313a1 100644 --- a/unidock/src/cuda/precalculate.cu +++ b/unidock/src/cuda/precalculate.cu @@ -5,7 +5,7 @@ #include #include "precalculate.h" -#include "precalculate_gpu.h" +#include "precalculate_gpu.cuh" // TODO: define kernel here __global__ void precalculate_gpu(triangular_matrix_cuda_t *m_data_gpu_list, diff --git a/unidock/src/cuda/precalculate_gpu.h b/unidock/src/cuda/precalculate_gpu.cuh similarity index 99% rename from unidock/src/cuda/precalculate_gpu.h rename to unidock/src/cuda/precalculate_gpu.cuh index c22b620..673f8b4 100644 --- a/unidock/src/cuda/precalculate_gpu.h +++ b/unidock/src/cuda/precalculate_gpu.cuh @@ -5,7 +5,7 @@ // Define GPU precalculate structures /* atom related start */ -#include "atom_constants_gpu.h" +#include "atom_constants_gpu.cuh" /* atom related end */ diff --git a/unidock/src/cuda/warp_ops.cuh b/unidock/src/cuda/warp_ops.cuh new file mode 100644 index 0000000..1ea368f --- /dev/null +++ b/unidock/src/cuda/warp_ops.cuh @@ -0,0 +1,660 @@ +#pragma once +#include "bfgs.h" +#include "common.cuh" +#include "kernel.h" +#include <cooperative_groups.h> +#include <cooperative_groups/reduce.h> + +namespace cg = cooperative_groups; + +template <unsigned int TileSize> +__device__ __forceinline__ void matrix_d_init_warp(cg::thread_block_tile<TileSize> &tile, + matrix_d *m, int dim, float fill_data) { + if (tile.thread_rank() == 0) m->dim = dim; + if ((dim * (dim + 1) / 2) > MAX_HESSIAN_MATRIX_D_SIZE) + DEBUG_PRINTF("\nmatrix_d: matrix_d_init() ERROR!"); + // ((dim * (dim + 1) / 2)*sizeof(float)); // symmetric matrix_d + for (int i = tile.thread_rank(); i < (dim * (dim + 1) / 2); i += tile.num_threads()) + m->data[i] = fill_data; + for (int i = (dim * (dim + 1) / 2) + tile.thread_rank(); i < MAX_HESSIAN_MATRIX_D_SIZE; + i += tile.num_threads()) + m->data[i] = 0; // Others will be 0 + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void matrix_d_set_diagonal_warp(cg::thread_block_tile<TileSize> &tile, + matrix_d *m, float fill_data) { + for (int i = tile.thread_rank(); i < m->dim; i += tile.num_threads()) { + m->data[i + i * (i + 1) / 2] = fill_data; + } + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ float scalar_product_warp(cg::thread_block_tile<TileSize> &tile, + const change_cuda_t *a, const change_cuda_t *b, + int n) { + float tmp = 0; + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + tmp += find_change_index_read(a, i) * find_change_index_read(b, i); + } + tile.sync(); + + return cg::reduce(tile, tmp, cg::plus<float>()); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void minus_mat_vec_product_warp(cg::thread_block_tile<TileSize> &tile, + const matrix_d *h, + const change_cuda_t *in, + change_cuda_t *out) { + int n = h->dim; + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + float sum = 0; + for (int j = 0; j < n; j++) { + sum += h->data[index_permissive(h, i, j)] * find_change_index_read(in, j); + } + find_change_index_write(out, i, -sum); + } + tile.sync(); +}
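
/*
 * matrix_d above packs a symmetric matrix into dim*(dim+1)/2 floats; the
 * expression data[i + i*(i+1)/2] used by matrix_d_set_diagonal_warp is the
 * standard packed-triangular index. Illustrative helper, not part of the
 * patch:
 */
// For i <= j, element (i, j) lives at i + j*(j+1)/2; (j, i) maps to the same slot.
__host__ __device__ inline int tri_index(int i, int j) {
    return (i <= j) ? (i + j * (j + 1) / 2) : (j + i * (i + 1) / 2);
}
// Example: dim = 3 stores 6 floats; the diagonal (0,0),(1,1),(2,2) sits at 0, 2, 5.
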
+ +template <unsigned int TileSize> +__device__ __forceinline__ void output_type_cuda_init_warp(cg::thread_block_tile<TileSize> &tile, + output_type_cuda_t *out, + const float *ptr) { + for (int i = tile.thread_rank(); i < 3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION; + i += tile.num_threads()) { + if (i < 3) + out->position[i] = ptr[i]; + else if (i < 7) + out->orientation[i - 3] = ptr[i]; + else if (i < 7 + MAX_NUM_OF_LIG_TORSION) + out->lig_torsion[i - 7] = ptr[i]; + else + out->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION] = ptr[i]; + } + + if (tile.thread_rank() == 0) + out->lig_torsion_size = ptr[3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION]; + // did not assign coords and e + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void output_type_cuda_init_with_output_warp( + cg::thread_block_tile<TileSize> &tile, output_type_cuda_t *out_new, + const output_type_cuda_t *out_old) { + for (int i = tile.thread_rank(); i < 3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION; + i += tile.num_threads()) { + if (i < 3) + out_new->position[i] = out_old->position[i]; + else if (i < 7) + out_new->orientation[i - 3] = out_old->orientation[i - 3]; + else if (i < 7 + MAX_NUM_OF_LIG_TORSION) + out_new->lig_torsion[i - 7] = out_old->lig_torsion[i - 7]; + else + out_new->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION] + = out_old->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION]; + } + + if (tile.thread_rank() == 0) { + out_new->lig_torsion_size = out_old->lig_torsion_size; + // assign e but not coords + out_new->e = out_old->e; + } + + tile.sync(); +} + +template <unsigned int TileSize> __device__ __forceinline__ void output_type_cuda_increment_warp( + cg::thread_block_tile<TileSize> &tile, output_type_cuda_t *x, const change_cuda_t *c, + float factor, float epsilon_fl) { + // position increment + if (tile.thread_rank() == 0) { + for (int k = 0; k < 3; k++) x->position[k] += factor * c->position[k]; + // orientation increment + float rotation[3]; + for (int k = 0; k < 3; k++) rotation[k] = factor * c->orientation[k]; + quaternion_increment(x->orientation, rotation, epsilon_fl); + } + + // torsion increment + for (int k = tile.thread_rank(); k < x->lig_torsion_size; k += tile.num_threads()) { + float tmp = factor * c->lig_torsion[k]; + normalize_angle(&tmp); + x->lig_torsion[k] += tmp; + normalize_angle(&(x->lig_torsion[k])); + } + + tile.sync(); +} + +template <unsigned int TileSize> __device__ __forceinline__ void change_cuda_init_with_change_warp( + cg::thread_block_tile<TileSize> &tile, change_cuda_t *g_new, const change_cuda_t *g_old) { + for (int i = tile.thread_rank(); i < 3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION; + i += tile.num_threads()) { + if (i < 3) + g_new->position[i] = g_old->position[i]; + else if (i < 7) + g_new->orientation[i - 3] = g_old->orientation[i - 3]; + else if (i < 7 + MAX_NUM_OF_LIG_TORSION) + g_new->lig_torsion[i - 7] = g_old->lig_torsion[i - 7]; + else + g_new->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION] + = g_old->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION]; + } + + if (tile.thread_rank() == 0) g_new->lig_torsion_size = g_old->lig_torsion_size; + // did not assign coords and e + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void ligand_init_with_ligand_warp(cg::thread_block_tile<TileSize> &tile, + const ligand_cuda_t *ligand_cuda_old, + ligand_cuda_t *ligand_cuda_new) { + for (int i = tile.thread_rank(); i < MAX_NUM_OF_LIG_PAIRS; i += tile.num_threads()) { + ligand_cuda_new->pairs.type_pair_index[i] = ligand_cuda_old->pairs.type_pair_index[i]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_LIG_PAIRS; i += tile.num_threads()) { + ligand_cuda_new->pairs.a[i] = ligand_cuda_old->pairs.a[i]; + }
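
/*
 * Every *_warp initializer above uses the same round-robin idiom: lane k
 * handles elements k, k+TileSize, k+2*TileSize, ... so no two lanes write the
 * same slot, and the trailing tile.sync() publishes the writes before any
 * lane reads them. Generic sketch (illustrative; assumes the
 * cooperative_groups includes at the top of this file):
 */
template <unsigned int TS, typename T>
__device__ void tile_strided_copy(cooperative_groups::thread_block_tile<TS> &tile,
                                  T *dst, const T *src, int n) {
    for (int i = tile.thread_rank(); i < n; i += tile.num_threads())
        dst[i] = src[i];   // disjoint indices per lane: race-free
    tile.sync();           // copy is visible tile-wide after this
}
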
+ + for (int i = tile.thread_rank(); i < MAX_NUM_OF_LIG_PAIRS; i += tile.num_threads()) { + ligand_cuda_new->pairs.b[i] = ligand_cuda_old->pairs.b[i]; + } + + if (tile.thread_rank() == 0) + ligand_cuda_new->pairs.num_pairs = ligand_cuda_old->pairs.num_pairs; + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 2; ++j) + ligand_cuda_new->rigid.atom_range[i][j] = ligand_cuda_old->rigid.atom_range[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + ligand_cuda_new->rigid.origin[i][j] = ligand_cuda_old->rigid.origin[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 9; ++j) + ligand_cuda_new->rigid.orientation_m[i][j] = ligand_cuda_old->rigid.orientation_m[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 4; ++j) + ligand_cuda_new->rigid.orientation_q[i][j] = ligand_cuda_old->rigid.orientation_q[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + ligand_cuda_new->rigid.axis[i][j] = ligand_cuda_old->rigid.axis[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + ligand_cuda_new->rigid.relative_axis[i][j] = ligand_cuda_old->rigid.relative_axis[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + ligand_cuda_new->rigid.relative_origin[i][j] + = ligand_cuda_old->rigid.relative_origin[i][j]; + } + + for (int i = tile.thread_rank(); i < MAX_NUM_OF_RIGID; i += tile.num_threads()) { + ligand_cuda_new->rigid.parent[i] = ligand_cuda_old->rigid.parent[i]; + } + + for (int i = 0; i < MAX_NUM_OF_RIGID; i++) { + for (int j = tile.thread_rank(); j < MAX_NUM_OF_RIGID; j += tile.num_threads()) + ligand_cuda_new->rigid.children_map[i][j] = ligand_cuda_old->rigid.children_map[i][j]; + + for (int j = tile.thread_rank(); j < MAX_NUM_OF_RIGID; j += tile.num_threads()) + ligand_cuda_new->rigid.descendant_map[i][j] + = ligand_cuda_old->rigid.descendant_map[i][j]; + } + + if (tile.thread_rank() == 0) { + ligand_cuda_new->rigid.num_children = ligand_cuda_old->rigid.num_children; + ligand_cuda_new->begin = ligand_cuda_old->begin; + ligand_cuda_new->end = ligand_cuda_old->end; + } + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ void m_cuda_init_with_m_cuda_warp(cg::thread_block_tile<TileSize> &tile, + const m_cuda_t *m_cuda_old, + m_cuda_t *m_cuda_new) { + for (int i = tile.thread_rank(); i < MAX_NUM_OF_ATOMS; i += tile.num_threads()) { + m_cuda_new->atoms[i] = m_cuda_old->atoms[i]; + } + for (int i = tile.thread_rank(); i < MAX_NUM_OF_ATOMS; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + m_cuda_new->m_coords.coords[i][j] = m_cuda_old->m_coords.coords[i][j]; + } + for (int i = tile.thread_rank(); i < MAX_NUM_OF_ATOMS; i += tile.num_threads()) { + for (int j = 0; j < 3; ++j) + m_cuda_new->minus_forces.coords[i][j] = m_cuda_old->minus_forces.coords[i][j]; + } + + ligand_init_with_ligand_warp(tile, &m_cuda_old->ligand, &m_cuda_new->ligand); + + if (tile.thread_rank() == 0) m_cuda_new->m_num_movable_atoms = m_cuda_old->m_num_movable_atoms; + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ __forceinline__ float ig_eval_deriv_warp(cg::thread_block_tile<TileSize> &tile, +
output_type_cuda_t *x, const float v, + ig_cuda_t *ig_cuda_gpu, m_cuda_t *m_cuda_gpu, + const float epsilon_fl) { + float e = 0; + float deriv[3]; + int nat = num_atom_types(ig_cuda_gpu->atu); + for (int i = tile.thread_rank(); i < m_cuda_gpu->m_num_movable_atoms; i += tile.num_threads()) { + int t = m_cuda_gpu->atoms[i].types[ig_cuda_gpu->atu]; + if (t >= nat) { + m_cuda_gpu->minus_forces.coords[i][0] = 0.0f; + m_cuda_gpu->minus_forces.coords[i][1] = 0.0f; + m_cuda_gpu->minus_forces.coords[i][2] = 0.0f; + continue; + } + + e += g_evaluate(&ig_cuda_gpu->grids[t], m_cuda_gpu->m_coords.coords[i], ig_cuda_gpu->slope, + v, deriv, epsilon_fl); + + m_cuda_gpu->minus_forces.coords[i][0] = deriv[0]; + m_cuda_gpu->minus_forces.coords[i][1] = deriv[1]; + m_cuda_gpu->minus_forces.coords[i][2] = deriv[2]; + } + tile.sync(); + return e; +} + +template <unsigned int TileSize> __device__ __forceinline__ float eval_interacting_pairs_deriv_warp( + cg::thread_block_tile<TileSize> &tile, p_cuda_t *p_cuda_gpu, const float v, + const lig_pairs_cuda_t *pairs, const m_coords_cuda_t *m_coords, m_minus_forces_t *minus_forces, + const float epsilon_fl) { + float e = 0.0f; + + for (int i = tile.thread_rank(); i < pairs->num_pairs; i += tile.num_threads()) { + int ai = pairs->a[i], bi = pairs->b[i]; + int index = pairs->a[i] + pairs->b[i] * (pairs->b[i] + 1) / 2; + float r[3] = {m_coords->coords[bi][0] - m_coords->coords[ai][0], + m_coords->coords[bi][1] - m_coords->coords[ai][1], + m_coords->coords[bi][2] - m_coords->coords[ai][2]}; + float r2 = r[0] * r[0] + r[1] * r[1] + r[2] * r[2]; + + if (r2 < p_cuda_gpu->m_cutoff_sqr) { + float tmp[2]; + p_eval_deriv(tmp, index, r2, p_cuda_gpu, epsilon_fl); + float force[3] = {r[0] * tmp[1], r[1] * tmp[1], r[2] * tmp[1]}; + curl(&tmp[0], force, v, epsilon_fl); + e += tmp[0]; + atomicAdd(&minus_forces->coords[ai][0], -force[0]); + atomicAdd(&minus_forces->coords[ai][1], -force[1]); + atomicAdd(&minus_forces->coords[ai][2], -force[2]); + atomicAdd(&minus_forces->coords[bi][0], force[0]); + atomicAdd(&minus_forces->coords[bi][1], force[1]); + atomicAdd(&minus_forces->coords[bi][2], force[2]); + } + } + tile.sync(); + return e; +} + +template <unsigned int TileSize> +__device__ __forceinline__ void POT_deriv_warp(cg::thread_block_tile<TileSize> &tile, + const m_minus_forces_t *minus_forces, + const rigid_cuda_t *lig_rigid_gpu, + const m_coords_cuda_t *m_coords, change_cuda_t *g, + pot_cuda_t *p) { + int num_torsion = lig_rigid_gpu->num_children; + int num_rigid = num_torsion + 1; + + float pos_tmp[3], ori_tmp[3], tmp1[3], tmp2[3], tmp3[3]; + for (int i = tile.thread_rank(); i < num_rigid; i += tile.num_threads()) { + int begin = lig_rigid_gpu->atom_range[i][0]; + int end = lig_rigid_gpu->atom_range[i][1]; + pos_tmp[0] = pos_tmp[1] = pos_tmp[2] = 0.0f; + ori_tmp[0] = ori_tmp[1] = ori_tmp[2] = 0.0f; + for (int j = begin; j < end; j++) { + pos_tmp[0] += minus_forces->coords[j][0]; + pos_tmp[1] += minus_forces->coords[j][1]; + pos_tmp[2] += minus_forces->coords[j][2]; + tmp1[0] = m_coords->coords[j][0] - lig_rigid_gpu->origin[i][0]; + tmp1[1] = m_coords->coords[j][1] - lig_rigid_gpu->origin[i][1]; + tmp1[2] = m_coords->coords[j][2] - lig_rigid_gpu->origin[i][2]; + tmp2[0] = minus_forces->coords[j][0]; + tmp2[1] = minus_forces->coords[j][1]; + tmp2[2] = minus_forces->coords[j][2]; + product(tmp3, tmp1, tmp2); + ori_tmp[0] += tmp3[0]; + ori_tmp[1] += tmp3[1]; + ori_tmp[2] += tmp3[2]; + } + p->ptmp[i][0] = pos_tmp[0]; + p->ptmp[i][1] = pos_tmp[1]; + p->ptmp[i][2] = pos_tmp[2]; + p->o[i][0] = ori_tmp[0]; + p->o[i][1] = ori_tmp[1]; + p->o[i][2] = ori_tmp[2]; + }
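
/*
 * Note on the atomicAdd calls above: with pairs spread across lanes, two
 * lanes can hit the same atom (an atom appears in many pairs), so the plain
 * -=/+= of the serial version would race. Sketch of the scatter step
 * (illustrative helper, not part of the patch):
 */
__device__ void scatter_pair_force(float (*minus_forces)[3], int a, int b,
                                   const float force[3]) {
    for (int k = 0; k < 3; ++k) {
        atomicAdd(&minus_forces[a][k], -force[k]);  // action on atom a
        atomicAdd(&minus_forces[b][k], force[k]);   // reaction on atom b
    }
}
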
+ tile.sync(); + + /* position_derivative */ + for (int i = tile.thread_rank(); i < num_rigid; i += tile.num_threads()) { + p->p[i][0] = p->ptmp[i][0]; + p->p[i][1] = p->ptmp[i][1]; + p->p[i][2] = p->ptmp[i][2]; + for (int j = i + 1; j < num_rigid; j++) { + if (lig_rigid_gpu->descendant_map[i][j]) { + p->p[i][0] += p->ptmp[j][0]; + p->p[i][1] += p->ptmp[j][1]; + p->p[i][2] += p->ptmp[j][2]; + } + } + } + tile.sync(); + + /* orientation derivative */ + if (tile.thread_rank() == 0) { // NOTE: Single thread is better here + float origin_temp[3], product_out[3]; + for (int i = num_rigid - 1; i >= 0; i--) { /* from bottom to top */ + ori_tmp[0] = p->o[i][0]; + ori_tmp[1] = p->o[i][1]; + ori_tmp[2] = p->o[i][2]; + for (int j = i + 1; j < num_rigid; j++) { + if (lig_rigid_gpu->children_map[i][j]) { + ori_tmp[0] += p->o[j][0]; + ori_tmp[1] += p->o[j][1]; + ori_tmp[2] += p->o[j][2]; /* self + children node */ + + origin_temp[0] = lig_rigid_gpu->origin[j][0] - lig_rigid_gpu->origin[i][0]; + origin_temp[1] = lig_rigid_gpu->origin[j][1] - lig_rigid_gpu->origin[i][1]; + origin_temp[2] = lig_rigid_gpu->origin[j][2] - lig_rigid_gpu->origin[i][2]; + + product(product_out, origin_temp, p->p[j]); + ori_tmp[0] += product_out[0]; + ori_tmp[1] += product_out[1]; + ori_tmp[2] += product_out[2]; + } + } + p->o[i][0] = ori_tmp[0]; + p->o[i][1] = ori_tmp[1]; + p->o[i][2] = ori_tmp[2]; + } + } + tile.sync(); + + /* torsion_derivative: torsion k belongs to rigid body k + 1, so the loop + starts at 1 and writes lig_torsion[i - 1] (the original loop started at + 0, which indexed lig_torsion[-1] and dropped the last torsion) */ + for (int i = 1 + tile.thread_rank(); i < num_rigid; i += tile.num_threads()) { + g->lig_torsion[i - 1] = p->o[i][0] * lig_rigid_gpu->axis[i][0] + + p->o[i][1] * lig_rigid_gpu->axis[i][1] + + p->o[i][2] * lig_rigid_gpu->axis[i][2]; + } + tile.sync(); + + for (int i = tile.thread_rank(); i < 3; i += tile.num_threads()) { + g->position[i] = p->p[0][i]; + g->orientation[i] = p->o[0][i]; + } + + tile.sync(); +} + +template <unsigned int TileSize> +__device__ float m_eval_deriv_warp(cg::thread_block_tile<TileSize> &tile, output_type_cuda_t *c, + change_cuda_t *g, m_cuda_t *m_cuda_gpu, p_cuda_t *p_cuda_gpu, + ig_cuda_t *ig_cuda_gpu, pot_cuda_t *pot_aux, const float *v, + const float epsilon_fl) { + // check set args + if (tile.thread_rank() == 0) { + set(c, &m_cuda_gpu->ligand.rigid, &m_cuda_gpu->m_coords, m_cuda_gpu->atoms, + m_cuda_gpu->m_num_movable_atoms, epsilon_fl); + } + tile.sync(); + + float e = ig_eval_deriv_warp(tile, c, v[1], ig_cuda_gpu, m_cuda_gpu, epsilon_fl); + e += eval_interacting_pairs_deriv_warp(tile, p_cuda_gpu, v[0], &m_cuda_gpu->ligand.pairs, + &m_cuda_gpu->m_coords, &m_cuda_gpu->minus_forces, + epsilon_fl); + tile.sync(); + e = cg::reduce(tile, e, cg::plus<float>()); + + // should add derivs for glue, other and inter pairs + POT_deriv_warp(tile, &m_cuda_gpu->minus_forces, &m_cuda_gpu->ligand.rigid, + &m_cuda_gpu->m_coords, g, pot_aux); + + return e; +} + +template <unsigned int TileSize> __device__ __forceinline__ void line_search_warp( + cg::thread_block_tile<TileSize> &tile, m_cuda_t *m_cuda_gpu, p_cuda_t *p_cuda_gpu, + ig_cuda_t *ig_cuda_gpu, int n, const output_type_cuda_t *x, const change_cuda_t *g, + const float f0, const change_cuda_t *p, output_type_cuda_t *x_new, change_cuda_t *g_new, + pot_cuda_t *pot_aux, float *f, float *alpha, const float epsilon_fl, const float *hunt_cap) { + const float c0 = 0.0001; + const int max_trials = 10; + const float multiplier = 0.5; + float alpha_ = 1.0, f_; + + const float pg = scalar_product_warp(tile, p, g, n); + for (int trial = 0; trial < max_trials; trial++) { + output_type_cuda_init_with_output_warp(tile, x_new, x); + output_type_cuda_increment_warp(tile, x_new, p, alpha_, epsilon_fl);
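
/*
 * m_eval_deriv_warp above leaves each lane with a partial energy and folds
 * the partials with cg::reduce (from <cooperative_groups/reduce.h>), which
 * returns the tile-wide sum to every lane. An equivalent hand-rolled version
 * with shuffles, for reference (illustrative; assumes TS is a power of two):
 */
template <unsigned int TS>
__device__ float tile_sum(cooperative_groups::thread_block_tile<TS> &tile, float v) {
    for (int off = TS / 2; off > 0; off >>= 1) v += tile.shfl_down(v, off);
    return tile.shfl(v, 0);  // lane 0 holds the total; broadcast it
}
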
+ f_ = m_eval_deriv_warp(tile, x_new, g_new, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, pot_aux, + hunt_cap, epsilon_fl); + if (f_ - f0 < c0 * alpha_ * pg) break; + alpha_ *= multiplier; + } + + *f = f_; + *alpha = alpha_; +} + +template <unsigned int TileSize> +__device__ __forceinline__ bool bfgs_update_warp(cg::thread_block_tile<TileSize> &tile, matrix_d *h, + const change_cuda_t *p, const change_cuda_t *y, + const float alpha, const float epsilon_fl) { + float yp, yhy; + yp = scalar_product_warp(tile, y, p, h->dim); + if (alpha * yp < epsilon_fl) return false; + + __shared__ change_cuda_t minus_hy[32 / TileSize]; + auto minus_hy_ptr = &minus_hy[tile.meta_group_rank()]; + change_cuda_init_with_change_warp(tile, minus_hy_ptr, y); + minus_mat_vec_product_warp(tile, h, y, minus_hy_ptr); + yhy = -scalar_product_warp(tile, y, minus_hy_ptr, h->dim); + float r = 1 / (alpha * yp); + int n = 6 + p->lig_torsion_size; + + __shared__ float minus_hy_[(6 + MAX_NUM_OF_LIG_TORSION) * 32 / TileSize], + p_[(6 + MAX_NUM_OF_LIG_TORSION) * 32 / TileSize]; + + // Calculate offset + auto minus_hy_ptr_ = &minus_hy_[tile.meta_group_rank() * (6 + MAX_NUM_OF_LIG_TORSION)]; + auto p_ptr_ = &p_[tile.meta_group_rank() * (6 + MAX_NUM_OF_LIG_TORSION)]; + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + minus_hy_ptr_[i] = find_change_index_read(minus_hy_ptr, i); + p_ptr_[i] = find_change_index_read(p, i); + } + tile.sync(); + + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + for (int j = i; j < n; j++) { + float tmp = alpha * r * (minus_hy_ptr_[i] * p_ptr_[j] + minus_hy_ptr_[j] * p_ptr_[i]) + + alpha * alpha * (r * r * yhy + r) * p_ptr_[i] * p_ptr_[j]; + h->data[i + j * (j + 1) / 2] += tmp; + } + } + tile.sync(); + return true; +} + +template <unsigned int TileSize> +__device__ void bfgs_warp(cg::thread_block_tile<TileSize> &tile, output_type_cuda_t *x, + output_type_cuda_t *x_new, output_type_cuda_t *x_orig, change_cuda_t *g, + change_cuda_t *g_new, change_cuda_t *g_orig, change_cuda_t *p, + change_cuda_t *y, matrix_d *h, m_cuda_t *m_cuda_gpu, p_cuda_t *p_cuda_gpu, + ig_cuda_t *ig_cuda_gpu, pot_cuda_t *pot_aux, const float *hunt_cap, + const float epsilon_fl, const int max_steps) { + // Profiling: perform timing within kernel + int n = 3 + 3 + x->lig_torsion_size; /* the dimensions of matrix */ + + float f0, f1, f_orig, alpha; + + matrix_d_init_warp(tile, h, n, 0); + matrix_d_set_diagonal_warp(tile, h, 1); + change_cuda_init_with_change_warp(tile, g_new, g); + output_type_cuda_init_with_output_warp(tile, x_new, x); + f0 = m_eval_deriv_warp(tile, x, g, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, pot_aux, hunt_cap, + epsilon_fl); + f_orig = f0; + + /* Init g_orig, x_orig */ + change_cuda_init_with_change_warp(tile, g_orig, g); + output_type_cuda_init_with_output_warp(tile, x_orig, x); + + /* Init p */ + change_cuda_init_with_change_warp(tile, p, g); + + for (int step = 0; step < max_steps; step++) { + minus_mat_vec_product_warp(tile, h, g, p); + line_search_warp(tile, m_cuda_gpu, p_cuda_gpu, ig_cuda_gpu, n, x, g, f0, p, x_new, g_new, + pot_aux, &f1, &alpha, epsilon_fl, hunt_cap); + change_cuda_init_with_change_warp(tile, y, g_new); + + /* subtract_change */ + for (int i = tile.thread_rank(); i < n; i += tile.num_threads()) { + float tmp = find_change_index_read(y, i) - find_change_index_read(g, i); + find_change_index_write(y, i, tmp); + } + tile.sync(); + f0 = f1; + + output_type_cuda_init_with_output_warp(tile, x, x_new); + + float gg = scalar_product_warp(tile, g, g, n); + if (!(sqrtf(gg) >= 1e-5f)) break; + + change_cuda_init_with_change_warp(tile, g, g_new);
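
/*
 * The __shared__ arrays above are sized 32 / TileSize -- one slot per tile in
 * a 32-thread block -- and each tile finds its slot via meta_group_rank().
 * Minimal sketch of that per-tile scratch pattern (illustrative; assumes
 * 32-thread blocks, as this kernel uses):
 */
template <unsigned int TS>
__device__ float *my_tile_scratch(cooperative_groups::thread_block_tile<TS> &tile,
                                  float (&scratch)[32 / TS]) {
    // meta_group_rank() == index of this tile within its thread block
    return &scratch[tile.meta_group_rank()];
}
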
+ + if (step == 0) { + float yy = scalar_product_warp(tile, y, y, n); + if (fabs(yy) > epsilon_fl) { + float yp = scalar_product_warp(tile, y, p, n); + matrix_d_set_diagonal_warp(tile, h, alpha * yp / yy); + } + } + tile.sync(); + + bfgs_update_warp(tile, h, p, y, alpha, epsilon_fl); + } + + if (!(f0 <= f_orig)) { + f0 = f_orig; + output_type_cuda_init_with_output_warp(tile, x, x_orig); + change_cuda_init_with_change_warp(tile, g, g_orig); + } + + // write output_type_cuda energy + x->e = f0; +} + +__device__ __forceinline__ void set_warp(const output_type_cuda_t *x, rigid_cuda_t *lig_rigid_gpu, + m_coords_cuda_t *m_coords_gpu, const atom_cuda_t *atoms, + const int m_num_movable_atoms, const float epsilon_fl) { + for (int i = 0; i < 3; i++) lig_rigid_gpu->origin[0][i] = x->position[i]; + for (int i = 0; i < 4; i++) lig_rigid_gpu->orientation_q[0][i] = x->orientation[i]; + quaternion_to_r3(lig_rigid_gpu->orientation_q[0], + lig_rigid_gpu->orientation_m[0]); /* set orientation_m */ + + int begin = lig_rigid_gpu->atom_range[0][0]; + int end = lig_rigid_gpu->atom_range[0][1]; + for (int i = begin; i < end; i++) { + local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[0], atoms[i].coords, + lig_rigid_gpu->orientation_m[0]); + } + /* ************* end node.set_conf ************* */ + + /* ************* branches_set_conf ************* */ + /* update nodes in depth-first order */ + for (int current = 1; current < lig_rigid_gpu->num_children + 1; + current++) { /* current starts from 1 (namely starts from first + child node) */ + int parent = lig_rigid_gpu->parent[current]; + float torsion = x->lig_torsion[current - 1]; /* torsions are all related to child nodes */ + local_to_lab(lig_rigid_gpu->origin[current], lig_rigid_gpu->origin[parent], + lig_rigid_gpu->relative_origin[current], lig_rigid_gpu->orientation_m[parent]); + local_to_lab_direction(lig_rigid_gpu->axis[current], lig_rigid_gpu->relative_axis[current], + lig_rigid_gpu->orientation_m[parent]); + float tmp[4]; + float parent_q[4] + = {lig_rigid_gpu->orientation_q[parent][0], lig_rigid_gpu->orientation_q[parent][1], + lig_rigid_gpu->orientation_q[parent][2], lig_rigid_gpu->orientation_q[parent][3]}; + float current_axis[3] = {lig_rigid_gpu->axis[current][0], lig_rigid_gpu->axis[current][1], + lig_rigid_gpu->axis[current][2]}; + + angle_to_quaternion2(tmp, current_axis, torsion); + angle_to_quaternion_multi(tmp, parent_q); + quaternion_normalize_approx(tmp, epsilon_fl); + + for (int i = 0; i < 4; i++) + lig_rigid_gpu->orientation_q[current][i] = tmp[i]; /* set orientation_q */ + quaternion_to_r3(lig_rigid_gpu->orientation_q[current], + lig_rigid_gpu->orientation_m[current]); /* set orientation_m */ + + /* set coords */ + begin = lig_rigid_gpu->atom_range[current][0]; + end = lig_rigid_gpu->atom_range[current][1]; + for (int i = begin; i < end; i++) { + local_to_lab(m_coords_gpu->coords[i], lig_rigid_gpu->origin[current], atoms[i].coords, + lig_rigid_gpu->orientation_m[current]); + } + } + /* ************* end branches_set_conf ************* */ +} + +template <unsigned int TileSize> +__device__ __forceinline__ void write_back_warp(cg::thread_block_tile<TileSize> &tile, + output_type_cuda_t *results, + const output_type_cuda_t *best_out) { + for (int i = tile.thread_rank(); + i < 3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION + MAX_NUM_OF_ATOMS; + i += tile.num_threads()) { + if (i < 3) { + results->position[i] = best_out->position[i]; + } else if (i < 7) { + results->orientation[i - 3] = best_out->orientation[i - 3]; + } else if (i < 7 + MAX_NUM_OF_LIG_TORSION) {
+ results->lig_torsion[i - 7] = best_out->lig_torsion[i - 7]; + } else if (i < 7 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION) { + results->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION] + = best_out->flex_torsion[i - 7 - MAX_NUM_OF_LIG_TORSION]; + } else { +#pragma unroll + for (int j = 0; j < 3; j++) { + results->coords[i - 7 - MAX_NUM_OF_LIG_TORSION - MAX_NUM_OF_FLEX_TORSION][j] + = best_out->coords[i - 7 - MAX_NUM_OF_LIG_TORSION - MAX_NUM_OF_FLEX_TORSION][j]; + } + } + } + + if (tile.thread_rank() == 0) { + results->lig_torsion_size = best_out->lig_torsion_size; + results->e = best_out->e; + } + + tile.sync(); +} From 64bbef2f69998c6565ca39f1d8fd15ba8a2d36d2 Mon Sep 17 00:00:00 2001 From: Zihua Wu Date: Sat, 18 Nov 2023 17:33:02 +0800 Subject: [PATCH 2/5] robust memory size estimation --- unidock/src/cuda/common.cuh | 6 --- unidock/src/cuda/kernel.h | 6 +++ unidock/src/cuda/monte_carlo.cu | 59 ++++++++++++++++------------ unidock/src/cuda/warp_ops.cuh | 20 +++++----- unidock/src/main/main.cpp | 69 ++++++++++++++++++++------------- 5 files changed, 94 insertions(+), 66 deletions(-) diff --git a/unidock/src/cuda/common.cuh b/unidock/src/cuda/common.cuh index 1e0000f..9e08c82 100644 --- a/unidock/src/cuda/common.cuh +++ b/unidock/src/cuda/common.cuh @@ -19,12 +19,6 @@ #define M_PI_F 3.1415927f -// symmetric matrix_d (only half of it are stored) -typedef struct { - float data[MAX_HESSIAN_MATRIX_D_SIZE]; - int dim; -} matrix_d; - /* Below based on mutate_conf.cpp */ __device__ __forceinline__ void quaternion_increment(float *q, const float *rotation, float epsilon_fl); diff --git a/unidock/src/cuda/kernel.h b/unidock/src/cuda/kernel.h index bbee8ad..fc5e0f5 100644 --- a/unidock/src/cuda/kernel.h +++ b/unidock/src/cuda/kernel.h @@ -212,3 +212,9 @@ typedef struct { output_type_cuda_t container[MAX_CONTAINER_SIZE_EVERY_WI]; int current_size; } output_container_cuda_t; + +// symmetric matrix_d (only half of it are stored) +typedef struct { + float data[MAX_HESSIAN_MATRIX_D_SIZE]; + int dim; +} matrix_d; diff --git a/unidock/src/cuda/monte_carlo.cu b/unidock/src/cuda/monte_carlo.cu index c8971ed..04ffdc0 100644 --- a/unidock/src/cuda/monte_carlo.cu +++ b/unidock/src/cuda/monte_carlo.cu @@ -98,34 +98,34 @@ __device__ __forceinline__ void write_back(output_type_cuda_t *results, template <unsigned int TileSize> __global__ __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) void kernel( m_cuda_t *m_cuda_global, ig_cuda_t *ig_cuda_gpu, p_cuda_t *p_cuda_gpu, - float *rand_molec_struc_gpu, float *best_e_gpu, int bfgs_max_steps, float mutation_amplitude, + float *rand_molec_struc_gpu, int bfgs_max_steps, float mutation_amplitude, curandStatePhilox4_32_10_t *states, unsigned long long seed, float epsilon_fl, float *hunt_cap_gpu, float *authentic_v_gpu, output_type_cuda_t *results, output_type_cuda_t *output_aux, change_cuda_t *change_aux, pot_cuda_t *pot_aux, matrix_d *h_cuda_gpu, m_cuda_t *m_cuda_gpu, int search_depth, int num_of_ligands, int threads_per_ligand, bool multi_bias) { int bid = blockIdx.x, tid = threadIdx.x; - int pose_id = (bid * warpSize + tid) / TileSize; - auto tb = cg::this_thread_block(); - cg::thread_block_tile<TileSize> tile = cg::tiled_partition<TileSize>(tb); + int pose_id = (bid * blockDim.x + tid) / TileSize; if (m_cuda_global[pose_id / threads_per_ligand].m_num_movable_atoms == -1) { return; } + auto tb = cg::this_thread_block(); + cg::thread_block_tile<TileSize> tile = cg::tiled_partition<TileSize>(tb); + float best_e = INFINITY; - // __shared__ output_type_cuda_t tmp, best_out, candidate, x_new, x_orig;
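
/*
 * Index arithmetic used by the kernel above, spelled out: one pose per
 * TileSize-wide tile, threads_per_ligand poses per ligand, and a fixed
 * number of scratch structs per pose in the global aux buffers (5
 * output_type_cuda_t; 6 change_cuda_t after this patch). Illustrative
 * helpers only:
 */
__device__ __forceinline__ int pose_of_thread(int tile_size) {
    return (blockIdx.x * blockDim.x + threadIdx.x) / tile_size;
}
__device__ __forceinline__ int ligand_of_pose(int pose_id, int threads_per_ligand) {
    return pose_id / threads_per_ligand;  // poses of a ligand are contiguous
}
__device__ __forceinline__ int first_scratch_slot(int pose_id, int per_pose) {
    return pose_id * per_pose;            // e.g. per_pose = 5 or 6 above
}
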
output_type_cuda_t &tmp = output_aux[pose_id * 5]; output_type_cuda_t &best_out = output_aux[pose_id * 5 + 1]; output_type_cuda_t &candidate = output_aux[pose_id * 5 + 2]; output_type_cuda_t &x_new = output_aux[pose_id * 5 + 3]; output_type_cuda_t &x_orig = output_aux[pose_id * 5 + 4]; - // __shared__ change_cuda_t g, tmp1, tmp2, tmp3, tmp4; - change_cuda_t &g = change_aux[pose_id * 5]; - change_cuda_t &tmp1 = change_aux[pose_id * 5 + 1]; - change_cuda_t &tmp2 = change_aux[pose_id * 5 + 2]; - change_cuda_t &tmp3 = change_aux[pose_id * 5 + 3]; - change_cuda_t &tmp4 = change_aux[pose_id * 5 + 4]; + change_cuda_t &g = change_aux[pose_id * 6]; + change_cuda_t &tmp1 = change_aux[pose_id * 6 + 1]; + change_cuda_t &tmp2 = change_aux[pose_id * 6 + 2]; + change_cuda_t &tmp3 = change_aux[pose_id * 6 + 3]; + change_cuda_t &tmp4 = change_aux[pose_id * 6 + 4]; + change_cuda_t &tmp5 = change_aux[pose_id * 6 + 5]; if (pose_id < num_of_ligands * threads_per_ligand) { output_type_cuda_init_warp( @@ -159,7 +159,7 @@ __global__ __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) void kern mutation_amplitude); tile.sync(); - bfgs_warp(tile, &candidate, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, + bfgs_warp(tile, &candidate, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &h_cuda_gpu[pose_id], &m_cuda_gpu[pose_id], p_cuda_gpu, ig_cuda_gpu, pot_aux, hunt_cap_gpu, epsilon_fl, bfgs_max_steps); @@ -182,7 +182,7 @@ __global__ __launch_bounds__(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) void kern tile.sync(); if (tmp.e < best_e) { - bfgs_warp(tile, &tmp, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, + bfgs_warp(tile, &tmp, &x_new, &x_orig, &g, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &h_cuda_gpu[pose_id], &m_cuda_gpu[pose_id], p_cuda_gpu, ig_cuda_gpu, pot_aux, authentic_v_gpu, epsilon_fl, bfgs_max_steps); @@ -274,6 +274,7 @@ __host__ void monte_carlo::operator()( p_cuda_t_cpu *p_cuda; checkCUDA(cudaMallocHost(&p_cuda, sizeof(p_cuda_t_cpu))); + size_t avail, total; /* End CPU allocation */ @@ -291,11 +292,7 @@ __host__ void monte_carlo::operator()( // rand_molec_struc_gpu float *rand_molec_struc_gpu; checkCUDA(cudaMalloc(&rand_molec_struc_gpu, thread * SIZE_OF_MOLEC_STRUC)); - // best_e_gpu - float *best_e_gpu; float epsilon_fl_float = static_cast(epsilon_fl); - checkCUDA(cudaMalloc(&best_e_gpu, sizeof(float))); - checkCUDA(cudaMemcpy(best_e_gpu, &max_fl, sizeof(float), cudaMemcpyHostToDevice)); // use cuRand to generate random values on GPU curandStatePhilox4_32_10_t *states; @@ -324,17 +321,29 @@ __host__ void monte_carlo::operator()( float authentic_v_float[3] = {static_cast(authentic_v[0]), static_cast(authentic_v[1]), static_cast(authentic_v[2])}; + cudaMemGetInfo(&avail, &total); + printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), + int(total / 1024 / 1024)); checkCUDA(cudaMalloc(&authentic_v_gpu, sizeof(authentic_v_float))); // Preparing result data output_type_cuda_t *results_gpu; checkCUDA(cudaMalloc(&results_gpu, thread * sizeof(output_type_cuda_t))); + cudaMemGetInfo(&avail, &total); + printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), + int(total / 1024 / 1024)); m_cuda_t *m_cuda_global; checkCUDA(cudaMalloc(&m_cuda_global, thread * sizeof(m_cuda_t))); + cudaMemGetInfo(&avail, &total); + printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), + int(total / 1024 / 1024)); matrix_d *h_cuda_global; checkCUDA(cudaMalloc(&h_cuda_global, thread * sizeof(matrix_d))); + cudaMemGetInfo(&avail, &total); 
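
/*
 * The repeated cudaMemGetInfo/printf pairs added in this hunk can be folded
 * into one small helper; a sketch (the name and tag argument are
 * illustrative only, not part of this patch):
 */
#include <cstdio>
#include <cuda_runtime.h>

static void report_gpu_memory(const char *tag) {
    size_t avail = 0, total = 0;
    if (cudaMemGetInfo(&avail, &total) == cudaSuccess)
        printf("[%s] Available Memory = %dMiB Total Memory = %dMiB\n", tag,
               int(avail / 1024 / 1024), int(total / 1024 / 1024));
}
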
+ printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), + int(total / 1024 / 1024)); /* End Allocating GPU Memory */ @@ -721,15 +730,15 @@ __host__ void monte_carlo::operator()( output_type_cuda_t *results_aux; checkCUDA(cudaMalloc(&results_aux, 5 * thread * sizeof(output_type_cuda_t))); change_cuda_t *change_aux; - checkCUDA(cudaMalloc(&change_aux, 5 * thread * sizeof(change_cuda_t))); + checkCUDA(cudaMalloc(&change_aux, 6 * thread * sizeof(change_cuda_t))); pot_cuda_t *pot_aux; checkCUDA(cudaMalloc(&pot_aux, thread * sizeof(pot_cuda_t))); - kernel<32><<>>( - m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu, best_e_gpu, - quasi_newton_par_max_steps, mutation_amplitude_float, states, seed, epsilon_fl_float, - hunt_cap_gpu, authentic_v_gpu, results_gpu, results_aux, change_aux, pot_aux, h_cuda_global, - m_cuda_global, global_steps, num_of_ligands, threads_per_ligand, multi_bias); + kernel<32><<>>(m_cuda_gpu, ig_cuda_gpu, p_cuda_gpu, rand_molec_struc_gpu, + quasi_newton_par_max_steps, mutation_amplitude_float, states, seed, + epsilon_fl_float, hunt_cap_gpu, authentic_v_gpu, results_gpu, + results_aux, change_aux, pot_aux, h_cuda_global, m_cuda_global, + global_steps, num_of_ligands, threads_per_ligand, multi_bias); // Device to Host memcpy of precalculated_byatom, copy back data to p_gpu p_m_data_cuda_t *p_data; @@ -799,13 +808,15 @@ __host__ void monte_carlo::operator()( checkCUDA(cudaFree(ig_cuda_gpu)); checkCUDA(cudaFree(p_cuda_gpu)); checkCUDA(cudaFree(rand_molec_struc_gpu)); - checkCUDA(cudaFree(best_e_gpu)); checkCUDA(cudaFree(hunt_cap_gpu)); checkCUDA(cudaFree(authentic_v_gpu)); checkCUDA(cudaFree(results_gpu)); checkCUDA(cudaFree(change_aux)); checkCUDA(cudaFree(results_aux)); + checkCUDA(cudaFree(pot_aux)); checkCUDA(cudaFree(states)); + checkCUDA(cudaFree(h_cuda_global)); + checkCUDA(cudaFree(m_cuda_global)); checkCUDA(cudaFreeHost(m_cuda)); checkCUDA(cudaFreeHost(rand_molec_struc_tmp)); checkCUDA(cudaFreeHost(ig_cuda_ptr)); diff --git a/unidock/src/cuda/warp_ops.cuh b/unidock/src/cuda/warp_ops.cuh index 1ea368f..71d4abf 100644 --- a/unidock/src/cuda/warp_ops.cuh +++ b/unidock/src/cuda/warp_ops.cuh @@ -467,16 +467,15 @@ template __device__ __forceinline__ void line_search_war template __device__ __forceinline__ bool bfgs_update_warp(cg::thread_block_tile &tile, matrix_d *h, const change_cuda_t *p, const change_cuda_t *y, - const float alpha, const float epsilon_fl) { + change_cuda_t *minus_hy, const float alpha, + const float epsilon_fl) { float yp, yhy; yp = scalar_product_warp(tile, y, p, h->dim); if (alpha * yp < epsilon_fl) return false; - __shared__ change_cuda_t minus_hy[32 / TileSize]; - auto minus_hy_ptr = &minus_hy[tile.meta_group_rank()]; - change_cuda_init_with_change_warp(tile, minus_hy_ptr, y); - minus_mat_vec_product_warp(tile, h, y, minus_hy_ptr); - yhy = -scalar_product_warp(tile, y, minus_hy_ptr, h->dim); + change_cuda_init_with_change_warp(tile, minus_hy, y); + minus_mat_vec_product_warp(tile, h, y, minus_hy); + yhy = -scalar_product_warp(tile, y, minus_hy, h->dim); float r = 1 / (alpha * yp); int n = 6 + p->lig_torsion_size; @@ -507,9 +506,10 @@ template __device__ void bfgs_warp(cg::thread_block_tile &tile, output_type_cuda_t *x, output_type_cuda_t *x_new, output_type_cuda_t *x_orig, change_cuda_t *g, change_cuda_t *g_new, change_cuda_t *g_orig, change_cuda_t *p, - change_cuda_t *y, matrix_d *h, m_cuda_t *m_cuda_gpu, p_cuda_t *p_cuda_gpu, - ig_cuda_t *ig_cuda_gpu, pot_cuda_t *pot_aux, const float *hunt_cap, - const float 
diff --git a/unidock/src/main/main.cpp b/unidock/src/main/main.cpp
index dded9dd..b1db789 100644
--- a/unidock/src/main/main.cpp
+++ b/unidock/src/main/main.cpp
@@ -20,11 +20,14 @@
  */
 
+#include
 #include
 #include
 #include  // ligand paths
 #include
 #include
+#include "conf.h"
+#include "kernel.h"
 #include "vina.h"
 #include "utils.h"
 #include "scoring_function.h"
@@ -32,6 +35,7 @@
 #include
 #include
 #include
+#include
 
 struct usage_error : public std::runtime_error {
     usage_error(const std::string& message) : std::runtime_error(message) {}
@@ -68,25 +72,39 @@ void check_occurrence(boost::program_options::variables_map& vm,
 }
 
 int predict_peak_memory(int batch_size, int exhaustiveness, int all_atom2_numbers,
-                        bool use_v100 = true, bool multi_bias = false) {
-    if (!multi_bias) {
-        if (use_v100) {
-            return 1.214869 * batch_size + .0038522 * exhaustiveness * batch_size
-                   + .011978 * all_atom2_numbers + 20017.72;  // this is based on V100, 32G
-        } else {
-            return 1.166067 * batch_size + .0038676 * exhaustiveness * batch_size
-                   + .0119598 * all_atom2_numbers + 5313.848;  // this is based on T4, 16G
-        }
-    } else {
-        if (use_v100) {
-            return 65.214869 * batch_size + .0038522 * exhaustiveness * batch_size
-                   + .011978 * all_atom2_numbers + 20017.72;  // this is based on V100, 32G
-        } else {
-            return 65.166067 * batch_size + .0038676 * exhaustiveness * batch_size
-                   + .0119598 * all_atom2_numbers + 5313.848;  // this is based on T4, 16G
-        }
-    }
-    return 0;
+                        bool multi_bias) {
+    int64_t gpu_memory = 0;
+    // precalculate
+    gpu_memory += (int64_t)(1) * all_atom2_numbers * sizeof(precalculate_element_cuda_t);
+
+    // m_cuda_gpu
+    gpu_memory += (int64_t)(1) * batch_size * sizeof(m_cuda_t);
+    // ig_cuda_gpu
+    if (multi_bias)
+        gpu_memory += (int64_t)(1) * batch_size * sizeof(ig_cuda_t);
+    else
+        gpu_memory += sizeof(ig_cuda_t);
+    // p_cuda_gpu
+    gpu_memory += (int64_t)(1) * batch_size * sizeof(p_cuda_t);
+    // rand_molec_struc_gpu
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * SIZE_OF_MOLEC_STRUC;
+    // results_gpu
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * sizeof(output_type_cuda_t);
+    // m_cuda_global
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * sizeof(m_cuda_t);
+    // h_cuda_global
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * sizeof(matrix_d);
+    // change_aux
+    gpu_memory += (int64_t)(6) * batch_size * exhaustiveness * sizeof(change_cuda_t);
+    // results_aux
+    gpu_memory += (int64_t)(5) * batch_size * exhaustiveness * sizeof(output_type_cuda_t);
+    // pot_aux
+    gpu_memory += (int64_t)(1) * batch_size * exhaustiveness * sizeof(pot_cuda_t);
+    // states
+    gpu_memory
+        += (int64_t)(1) * batch_size * exhaustiveness * 64;  // sizeof(curandStatePhilox4_32_10_t)
+
+    return gpu_memory / (1024 * 1024);
 }
 
 int main(int argc, char* argv[]) {
@@ -703,7 +721,9 @@ bug reporting, license agreements, and more information. \n";
     int deviceCount = 0;
     size_t avail;
     size_t total;
-    float max_memory = 32000;
+    // get total memory in MB and leave 5%
+    float max_memory
+        = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE) / 1024 / 1024 * 0.95;
     bool use_v100 = true;
     bool ad4 = false;
     if (sf_name.compare("ad4") == 0) ad4 = true;
@@ -713,12 +733,9 @@ bug reporting, license agreements, and more information. \n";
         cudaMemGetInfo(&avail, &total);
         printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024),
                int(total / 1024 / 1024));
-        max_memory = avail / 1024 / 1024 * 0.95;  // leave 5% to prevent error
-    }
-    if (max_memory < 17000) {
-        // using T4 or other 16G global memory GPU
-        use_v100 = false;
+        max_gpu_memory = avail / 1024 / 1024 * 0.95;  // leave 5%
     }
+    if (max_gpu_memory > 0 && max_gpu_memory < max_memory) {
+        max_memory = (float)max_gpu_memory;
+    }
@@ -758,7 +775,7 @@ bug reporting, license agreements, and more information. \n";
         std::vector<model> batch_ligands;  // ligands in current batch
         v1.bias_batch_list.clear();
         while (predict_peak_memory(batch_size, exhaustiveness, all_atom2_numbers,
-                                   use_v100, v.multi_bias)
+                                   multi_bias)
                        < max_memory
                && processed_ligands + batch_size < all_ligands.size()) {
             batch_ligands.emplace_back(
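With this change predict_peak_memory stops being a per-device linear fit (the old V100/T4 magic numbers) and instead totals, term by term, the very allocations monte_carlo::operator() performs, then converts bytes to MiB. A worked sketch of the shape of that computation, with made-up struct sizes (the real ones depend on Uni-Dock's MAX_* compile-time constants):

    // Worked sketch with placeholder sizes; only the structure of the estimate is real.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t batch_size = 512, exhaustiveness = 128;
        const int64_t sz_change = 300, sz_output = 400, sz_matrix_d = 40000;  // invented sizes
        int64_t bytes = 0;
        bytes += 6 * batch_size * exhaustiveness * sz_change;    // change_aux: six slots per pose
        bytes += 5 * batch_size * exhaustiveness * sz_output;    // results_aux: five slots per pose
        bytes += 1 * batch_size * exhaustiveness * sz_matrix_d;  // h_cuda_global: one Hessian per pose
        bytes += 1 * batch_size * exhaustiveness * 64;           // one curand Philox state per pose
        printf("estimate = %lld MiB\n", (long long)(bytes / (1024 * 1024)));
        return 0;
    }

The batching loop in main() simply grows batch_size until this estimate crosses max_memory.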
\n"; int deviceCount = 0; size_t avail; size_t total; - float max_memory = 32000; + // get total memory in MB and leave 5% + float max_memory + = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGE_SIZE) / 1024 / 1024 * 0.95; bool use_v100 = true; bool ad4 = false; if (sf_name.compare("ad4") == 0) ad4 = true; @@ -713,12 +733,9 @@ bug reporting, license agreements, and more information. \n"; cudaMemGetInfo(&avail, &total); printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), int(total / 1024 / 1024)); - max_memory = avail / 1024 / 1024 * 0.95; // leave 5% to prevent error - } - if (max_memory < 17000) { - // using T4 or other 16G global memory GPU - use_v100 = false; + max_gpu_memory = avail / 1024 / 1024 * 0.95; // leave 5% } + if (max_gpu_memory > 0 && max_gpu_memory < max_memory) { max_memory = (float)max_gpu_memory; } @@ -758,7 +775,7 @@ bug reporting, license agreements, and more information. \n"; std::vector batch_ligands; // ligands in current batch v1.bias_batch_list.clear(); while (predict_peak_memory(batch_size, exhaustiveness, all_atom2_numbers, - use_v100, v.multi_bias) + multi_bias) < max_memory && processed_ligands + batch_size < all_ligands.size()) { batch_ligands.emplace_back( From a29b3cdc7cfbc172cc510f451ccedd43291d1ec0 Mon Sep 17 00:00:00 2001 From: Zihua Wu Date: Sat, 18 Nov 2023 17:42:51 +0800 Subject: [PATCH 3/5] remove debug info --- unidock/src/cuda/monte_carlo.cu | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/unidock/src/cuda/monte_carlo.cu b/unidock/src/cuda/monte_carlo.cu index 04ffdc0..9587e93 100644 --- a/unidock/src/cuda/monte_carlo.cu +++ b/unidock/src/cuda/monte_carlo.cu @@ -274,7 +274,6 @@ __host__ void monte_carlo::operator()( p_cuda_t_cpu *p_cuda; checkCUDA(cudaMallocHost(&p_cuda, sizeof(p_cuda_t_cpu))); - size_t avail, total; /* End CPU allocation */ @@ -321,29 +320,17 @@ __host__ void monte_carlo::operator()( float authentic_v_float[3] = {static_cast(authentic_v[0]), static_cast(authentic_v[1]), static_cast(authentic_v[2])}; - cudaMemGetInfo(&avail, &total); - printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), - int(total / 1024 / 1024)); checkCUDA(cudaMalloc(&authentic_v_gpu, sizeof(authentic_v_float))); // Preparing result data output_type_cuda_t *results_gpu; checkCUDA(cudaMalloc(&results_gpu, thread * sizeof(output_type_cuda_t))); - cudaMemGetInfo(&avail, &total); - printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), - int(total / 1024 / 1024)); m_cuda_t *m_cuda_global; checkCUDA(cudaMalloc(&m_cuda_global, thread * sizeof(m_cuda_t))); - cudaMemGetInfo(&avail, &total); - printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), - int(total / 1024 / 1024)); matrix_d *h_cuda_global; checkCUDA(cudaMalloc(&h_cuda_global, thread * sizeof(matrix_d))); - cudaMemGetInfo(&avail, &total); - printf("Available Memory = %dMiB Total Memory = %dMiB\n", int(avail / 1024 / 1024), - int(total / 1024 / 1024)); /* End Allocating GPU Memory */ From b4a1766db7e85177ea61dd0c5c5c39a3eb34f43e Mon Sep 17 00:00:00 2001 From: Zihua Wu Date: Wed, 13 Dec 2023 13:22:57 +0800 Subject: [PATCH 4/5] clean unused code --- .gitignore | 4 - unidock/src/cuda/common.cuh | 112 -------------------------- unidock/src/cuda/kernel.h | 1 - unidock/src/cuda/monte_carlo.cu | 6 +- unidock/src/cuda/precalculate_gpu.cuh | 2 - unidock/src/cuda/warp_ops.cuh | 1 - unidock/src/lib/precalculate.h | 4 +- 7 files changed, 2 insertions(+), 128 deletions(-) diff --git a/.gitignore 
From b4a1766db7e85177ea61dd0c5c5c39a3eb34f43e Mon Sep 17 00:00:00 2001
From: Zihua Wu
Date: Wed, 13 Dec 2023 13:22:57 +0800
Subject: [PATCH 4/5] clean unused code

---
 .gitignore                            |   4 -
 unidock/src/cuda/common.cuh           | 112 --------------------
 unidock/src/cuda/kernel.h             |   1 -
 unidock/src/cuda/monte_carlo.cu       |   6 +-
 unidock/src/cuda/precalculate_gpu.cuh |   2 -
 unidock/src/cuda/warp_ops.cuh         |   1 -
 unidock/src/lib/precalculate.h        |   4 +-
 7 files changed, 2 insertions(+), 128 deletions(-)

diff --git a/.gitignore b/.gitignore
index ab78e12..70d0727 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,7 +15,3 @@ unidock_tools/dist
 unidock_tools/dist/*
 unidock_tools/unidock_tools.egg-info
 unidock_tools/unidock_tools.egg-info/*
-
-build-*
-nvidia_inha_benchmark
-compile_commands.json
diff --git a/unidock/src/cuda/common.cuh b/unidock/src/cuda/common.cuh
index 9e08c82..feded27 100644
--- a/unidock/src/cuda/common.cuh
+++ b/unidock/src/cuda/common.cuh
@@ -1,21 +1,9 @@
 #pragma once
-#include "cuda.h"
 #include "curand_kernel.h"
 #include "kernel.h"
 #include "math.h"
 #include
-#include
-#include
-/* Original Include files */
-#include "ad4cache.h"
-#include "cache.h"
-#include "coords.h"
-#include "model.h"
-#include "monte_carlo.h"
-#include "mutate.h"
 #include "precalculate.h"
-#include "quasi_newton.h"
-#include
 
 #define M_PI_F 3.1415927f
 
@@ -26,40 +14,6 @@ __device__ __forceinline__ void quaternion_increment(float *q, const float *rota
 
 __device__ __forceinline__ void normalize_angle(float *x);
 
-__device__ __forceinline__ void output_type_cuda_init(output_type_cuda_t *out, const float *ptr) {
-    memcpy(out, ptr, sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION));
-    out->lig_torsion_size = ptr[3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION];
-    // did not assign coords and e
-}
-
-__device__ __forceinline__ void output_type_cuda_init_with_output(
-    output_type_cuda_t *out_new, const output_type_cuda_t *out_old) {
-    memcpy(out_new, out_old,
-           sizeof(float) * (3 + 4 + MAX_NUM_OF_LIG_TORSION + MAX_NUM_OF_FLEX_TORSION));
-    out_new->lig_torsion_size = out_old->lig_torsion_size;
-    // assign e but not coords
-    out_new->e = out_old->e;
-}
-
-__device__ __forceinline__ void output_type_cuda_increment(output_type_cuda_t *x,
-                                                           const change_cuda_t *c, float factor,
-                                                           float epsilon_fl) {
-    // position increment
-    for (int k = 0; k < 3; k++) x->position[k] += factor * c->position[k];
-    // orientation increment
-    float rotation[3];
-    for (int k = 0; k < 3; k++) rotation[k] = factor * c->orientation[k];
-    quaternion_increment(x->orientation, rotation, epsilon_fl);
-
-    // torsion increment
-    for (int k = 0; k < x->lig_torsion_size; k++) {
-        float tmp = factor * c->lig_torsion[k];
-        normalize_angle(&tmp);
-        x->lig_torsion[k] += tmp;
-        normalize_angle(&(x->lig_torsion[k]));
-    }
-}
-
 __device__ __forceinline__ float norm3(const float *a) {
     return sqrtf(a[0] * a[0] + a[1] * a[1] + a[2] * a[2]);
 }
@@ -236,16 +190,6 @@ __device__ __forceinline__ void mutate_conf_cuda(const int num_steps, output_typ
 
 /* Below based on matrix.cpp */
 
-__device__ __forceinline__ void matrix_d_init(matrix_d *m, int dim, float fill_data) {
-    m->dim = dim;
-    if ((dim * (dim + 1) / 2) > MAX_HESSIAN_MATRIX_D_SIZE)
-        DEBUG_PRINTF("\nnmatrix_d: matrix_d_init() ERROR!");
-    // ((dim * (dim + 1) / 2)*sizeof(float)); // symmetric matrix_d
-    for (int i = 0; i < (dim * (dim + 1) / 2); i++) m->data[i] = fill_data;
-    for (int i = (dim * (dim + 1) / 2); i < MAX_HESSIAN_MATRIX_D_SIZE; i++)
-        m->data[i] = 0;  // Others will be 0
-}
-
 // as rugular 3x3 matrix_d
 __device__ __forceinline__ void mat_init(matrix_d *m, float fill_data) {
     m->dim = 3;  // fixed to 3x3 matrix_d
 
     for (int i = 0; i < 9; i++) m->data[i] = fill_data;
 }
 
-__device__ __forceinline__ void matrix_d_set_diagonal(matrix_d *m, float fill_data) {
-    for (int i = 0; i < m->dim; i++) {
-        m->data[i + i * (i + 1) / 2] = fill_data;
-    }
-}
-
 // as regular matrix_d
 __device__ __forceinline__ void matrix_d_set_element(matrix_d *m, int dim, int x, int y, float fill_data) {
@@ -720,53 +658,3 @@ __device__ __forceinline__ void find_change_index_write(change_cuda_t *g, int in
     }
     DEBUG_PRINTF("\nKernel2:find_change_index_write() ERROR!"); /* Shouldn't be here */
 }
-
-__device__ __forceinline__ void minus_mat_vec_product(const matrix_d *h, const change_cuda_t *in,
-                                                      change_cuda_t *out) {
-    int n = h->dim;
-    for (int i = 0; i < n; i++) {
-        float sum = 0;
-        for (int j = 0; j < n; j++) {
-            sum += h->data[index_permissive(h, i, j)] * find_change_index_read(in, j);
-        }
-        find_change_index_write(out, i, -sum);
-    }
-}
-
-__device__ __forceinline__ float scalar_product(const change_cuda_t *a, const change_cuda_t *b,
-                                                int n) {
-    float tmp = 0;
-    for (int i = 0; i < n; i++) {
-        tmp += find_change_index_read(a, i) * find_change_index_read(b, i);
-    }
-    return tmp;
-}
-
-__device__ __forceinline__ bool bfgs_update(matrix_d *h, const change_cuda_t *p,
-                                            const change_cuda_t *y, const float alpha,
-                                            const float epsilon_fl) {
-    const float yp = scalar_product(y, p, h->dim);
-
-    if (alpha * yp < epsilon_fl) return false;
-    change_cuda_t minus_hy;
-    change_cuda_init_with_change(&minus_hy, y);
-    minus_mat_vec_product(h, y, &minus_hy);
-    const float yhy = -scalar_product(y, &minus_hy, h->dim);
-    const float r = 1 / (alpha * yp);
-    const int n = 6 + p->lig_torsion_size;
-
-    for (int i = 0; i < n; i++) {
-        for (int j = i; j < n; j++) {
-            float tmp
-                = alpha * r
-                      * (find_change_index_read(&minus_hy, i) * find_change_index_read(p, j)
-                         + find_change_index_read(&minus_hy, j) * find_change_index_read(p, i))
-                  + +alpha * alpha * (r * r * yhy + r) * find_change_index_read(p, i)
-                        * find_change_index_read(p, j);
-
-            h->data[i + j * (j + 1) / 2] += tmp;
-        }
-    }
-
-    return true;
-}
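The serial helpers removed here (minus_mat_vec_product, scalar_product, bfgs_update) are superseded by the warp-cooperative versions in warp_ops.cuh that the earlier patches wired up. Both implementations compute the same thing: the standard rank-two BFGS update of the inverse-Hessian approximation H. Reading the deleted loop with step s = alpha * p, gradient change y, and the code's r = 1/(y^T s), the accumulation is, in LaTeX:

    \[
    H \;\leftarrow\; H
      \;-\; \frac{H\,y\,s^{\top} + s\,y^{\top}H}{y^{\top}s}
      \;+\; \left(\frac{y^{\top}H\,y}{(y^{\top}s)^{2}} + \frac{1}{y^{\top}s}\right) s\,s^{\top},
    \qquad s = \alpha\,p,
    \]

with only the upper triangle stored (h->data[i + j * (j + 1) / 2]). The early return when alpha * yp < epsilon_fl is the curvature safeguard y^T s > 0 that keeps H positive definite.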
diff --git a/unidock/src/cuda/kernel.h b/unidock/src/cuda/kernel.h
index fc5e0f5..706108b 100644
--- a/unidock/src/cuda/kernel.h
+++ b/unidock/src/cuda/kernel.h
@@ -1,6 +1,5 @@
 #pragma once
-#include
 #include
 
 template <typename T>
 void check(T result, char const *const func, const char *const file, int const line) {
diff --git a/unidock/src/cuda/monte_carlo.cu b/unidock/src/cuda/monte_carlo.cu
index 9587e93..bb2c408 100644
--- a/unidock/src/cuda/monte_carlo.cu
+++ b/unidock/src/cuda/monte_carlo.cu
@@ -21,13 +21,11 @@
  */
 
 #include "common.cuh"
-#include "cuda.h"
 #include "curand_kernel.h"
 #include "kernel.h"
 #include "math.h"
 #include "warp_ops.cuh"
 #include
-#include
 #include
 /* Original Include files */
 #include "ad4cache.h"
@@ -38,7 +36,6 @@
 #include "mutate.h"
 #include "precalculate.h"
 #include "quasi_newton.h"
-#include
 #include
 #include
 
@@ -764,8 +761,7 @@ __host__ void monte_carlo::operator()(
     cudaEventSynchronize(stop);
     float msecTotal = 0.0f;
     cudaEventElapsedTime(&msecTotal, start, stop);
-    printf("Time spend on GPU is %f ms\n", msecTotal);
-    // DEBUG_PRINTF("Time spend on GPU is %f ms\n", msecTotal);
+    DEBUG_PRINTF("Time spend on GPU is %f ms\n", msecTotal);
 
     /* Convert result data. Can be improved by mapping memory */
diff --git a/unidock/src/cuda/precalculate_gpu.cuh b/unidock/src/cuda/precalculate_gpu.cuh
index 673f8b4..ff6c1b3 100644
--- a/unidock/src/cuda/precalculate_gpu.cuh
+++ b/unidock/src/cuda/precalculate_gpu.cuh
@@ -1,6 +1,4 @@
 #include "math.h"
-#include "kernel.h"
-#include "macros.h"
 
 // Define GPU precalculate structures
diff --git a/unidock/src/cuda/warp_ops.cuh b/unidock/src/cuda/warp_ops.cuh
index 71d4abf..9647cc0 100644
--- a/unidock/src/cuda/warp_ops.cuh
+++ b/unidock/src/cuda/warp_ops.cuh
@@ -1,5 +1,4 @@
 #pragma once
-#include "bfgs.h"
 #include "common.cuh"
 #include "kernel.h"
 #include
diff --git a/unidock/src/lib/precalculate.h b/unidock/src/lib/precalculate.h
index d738ac5..4715146 100644
--- a/unidock/src/lib/precalculate.h
+++ b/unidock/src/lib/precalculate.h
@@ -25,6 +25,7 @@
 
 #include "scoring_function.h"
 #include "matrix.h"
+#include "model.h"
 #include "kernel.h"
 
 #ifdef DEBUG
@@ -33,9 +34,6 @@
 #    define DEBUG_PRINTF(...)
 #endif
 
-// Forward declaration
-struct model;
-
 struct precalculate_element {
 public:
     precalculate_element(sz n, fl factor_) : fast(n, 0), smooth(n, pr(0, 0)), factor(factor_) {}

From 6b579d1f39c86c59315d2ccb6041fe19ffef7400 Mon Sep 17 00:00:00 2001
From: Zihua Wu
Date: Wed, 13 Dec 2023 13:26:39 +0800
Subject: [PATCH 5/5] clean unused code

---
 unidock/src/main/main.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/unidock/src/main/main.cpp b/unidock/src/main/main.cpp
index b1db789..df45d48 100644
--- a/unidock/src/main/main.cpp
+++ b/unidock/src/main/main.cpp
@@ -20,22 +20,16 @@
  */
 
-#include
 #include
 #include
 #include  // ligand paths
-#include
 #include
-#include "conf.h"
-#include "kernel.h"
 #include "vina.h"
 #include "utils.h"
 #include "scoring_function.h"
-#include
 #include
 #include
-#include
 
 struct usage_error : public std::runtime_error {
     usage_error(const std::string& message) : std::runtime_error(message) {}
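One note on the precalculate.h hunk in PATCH 4 above: it goes the opposite direction from the usual advice, replacing the struct model; forward declaration with a full #include "model.h". That is the right call once a header's own code needs the complete type; a forward declaration only supports uses that never ask for the type's size or members. A generic illustration (hypothetical widget/holder types, nothing from this tree):

    // Hypothetical types; illustrates forward declaration vs. full definition.
    struct widget;  // forward declaration: the name exists, the layout does not

    struct holds_pointer {
        widget *w;  // fine: pointers and references need only the name
    };
    // struct holds_value_early { widget w; };  // error: incomplete type
    // int bad() { return sizeof(widget); }     // error: size unknown

    struct widget { int id; };  // full definition (normally pulled in via #include)

    struct holds_value { widget w; };                   // fine now
    int widget_bytes() { return (int)sizeof(widget); }  // fine now

    int main() { return widget_bytes() > 0 ? 0 : 1; }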