Skip to content

Commit

Permalink
Merge pull request #1661 from CEED/jeremy/gen-memory
Browse files Browse the repository at this point in the history
gpu - reduce memory usage in gen backends
  • Loading branch information
jeremylt authored Sep 12, 2024
2 parents bce9d45 + e93651e commit 8386266
Show file tree
Hide file tree
Showing 2 changed files with 180 additions and 29 deletions.
105 changes: 90 additions & 15 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,8 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
// Restriction
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
bool use_3d_slices) {
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
CeedInt Q_1d, bool is_input, bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = "P_1d" + var_suffix;
CeedEvalMode eval_mode = CEED_EVAL_NONE;
Expand All @@ -229,10 +229,21 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
// Restriction
if (is_input) {
// Input
if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
if (field_input_buffer[i] != i) {
std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

// Restriction was already done for previous input
code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
} else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
bool is_strided;

code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
if (eval_mode == CEED_EVAL_NONE) {
// No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
} else {
// Otherwise we're using the scratch space
code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
}
CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
if (!is_strided) {
CeedInt comp_stride;
Expand Down Expand Up @@ -356,7 +367,6 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
}
// LCOV_EXCL_START
case CEED_EVAL_DIV:
break; // TODO: Not implemented
case CEED_EVAL_CURL:
break; // TODO: Not implemented
// LCOV_EXCL_STOP
Expand All @@ -367,12 +377,12 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
code << " CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
break; // No action
case CEED_EVAL_INTERP:
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
<< ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
break;
case CEED_EVAL_GRAD:
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
if (use_3d_slices) {
code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
<< ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
Expand All @@ -386,7 +396,6 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO
case CEED_EVAL_WEIGHT:
break; // Should not occur
case CEED_EVAL_DIV:
break; // TODO: Not implemented
case CEED_EVAL_CURL:
break; // TODO: Not implemented
// LCOV_EXCL_STOP
Expand Down Expand Up @@ -433,7 +442,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
// We treat quadrature points per slice in 3d to save registers
if (use_3d_slices) {
code << "\n // Note: Using planes of 3D elements\n";
code << "#pragma unroll\n";
code << " #pragma unroll\n";
code << " for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
code << " // -- Input fields\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
Expand Down Expand Up @@ -789,17 +798,83 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
code << " __syncthreads();\n";
code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";

// -- Compute minimum buffer space needed
CeedInt max_rstr_buffer_size = 0;

for (CeedInt i = 0; i < num_input_fields; i++) {
CeedInt num_comp, elem_size;
CeedElemRestriction elem_rstr;

CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
}
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedInt num_comp, elem_size;
CeedElemRestriction elem_rstr;

CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
}
code << " // Scratch restriction buffer space\n";
code << " CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";

// -- Determine best input field processing order
CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];

for (CeedInt i = 0; i < num_input_fields; i++) {
field_rstr_in_buffer[i] = -1;
input_field_order[i] = -1;
}
{
bool is_ordered[CEED_FIELD_MAX];
CeedInt curr_index = 0;

for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedVector vec_i;
CeedElemRestriction rstr_i;

if (is_ordered[i]) continue;
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
field_rstr_in_buffer[i] = i;
is_ordered[i] = true;
input_field_order[curr_index] = i;
curr_index++;
if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
for (CeedInt j = i + 1; j < num_input_fields; j++) {
CeedVector vec_j;
CeedElemRestriction rstr_j;

CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
if (rstr_i == rstr_j && vec_i == vec_j) {
field_rstr_in_buffer[j] = i;
is_ordered[j] = true;
input_field_order[curr_index] = j;
curr_index++;
}
}
}
}

// -- Input restriction and basis
code << " // -- Input field restrictions and basis actions\n";
code << "\n // -- Input field restrictions and basis actions\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
code << " // ---- Input field " << i << "\n";
CeedInt f = input_field_order[i];

code << " // ---- Input field " << f << "\n";

// ---- Restriction
CeedCallBackend(
CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
Q_1d, true, use_3d_slices));

// ---- Basis action
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
}

// -- Q function
Expand All @@ -816,7 +891,7 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {

// ---- Restriction
CeedCallBackend(
CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
}

// Close loop and function
Expand Down
104 changes: 90 additions & 14 deletions backends/hip-gen/ceed-hip-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,8 @@ static int CeedOperatorBuildKernelFieldData_Hip_gen(std::ostringstream &code, Ce
// Restriction
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code, CeedOperator_Hip_gen *data, CeedInt i, CeedInt dim,
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
bool use_3d_slices) {
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
CeedInt Q_1d, bool is_input, bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = "P_1d" + var_suffix;
CeedEvalMode eval_mode = CEED_EVAL_NONE;
Expand All @@ -256,10 +256,22 @@ static int CeedOperatorBuildKernelRestriction_Hip_gen(std::ostringstream &code,
// Restriction
if (is_input) {
// Input
if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
// Input
if (field_input_buffer[i] != i) {
std::string buffer_name = "r_e_in_" + std::to_string(field_input_buffer[i]);

// Restriction was already done for previous input
code << " CeedScalar *r_e" << var_suffix << " = " << buffer_name << ";\n";
} else if (eval_mode != CEED_EVAL_WEIGHT && !((eval_mode == CEED_EVAL_NONE) && use_3d_slices)) {
bool is_strided;

code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
if (eval_mode == CEED_EVAL_NONE) {
// No basis action, so r_e_in_* in also r_q_in_* and needs to be allocated
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
} else {
// Otherwise we're using the scratch space
code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
}
CeedCallBackend(CeedElemRestrictionIsStrided(elem_rstr, &is_strided));
if (!is_strided) {
CeedInt comp_stride;
Expand Down Expand Up @@ -383,7 +395,6 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
}
// LCOV_EXCL_START
case CEED_EVAL_DIV:
break; // TODO: Not implemented
case CEED_EVAL_CURL:
break; // TODO: Not implemented
// LCOV_EXCL_STOP
Expand All @@ -394,12 +405,12 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
code << " CeedScalar *r_e" << var_suffix << " = r_q" << var_suffix << ";\n";
break; // No action
case CEED_EVAL_INTERP:
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
<< ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
break;
case CEED_EVAL_GRAD:
code << " CeedScalar r_e" << var_suffix << "[num_comp" << var_suffix << "*" << P_name << "];\n";
code << " CeedScalar *r_e" << var_suffix << " = r_e_scratch;\n";
if (use_3d_slices) {
code << " InterpTranspose" << (dim > 1 ? "Tensor" : "") << dim << "d<num_comp" << var_suffix << ", " << P_name << ", " << Q_name
<< ">(data, r_q" << var_suffix << ", s_B" << var_suffix << ", r_e" << var_suffix << ");\n";
Expand All @@ -413,7 +424,6 @@ static int CeedOperatorBuildKernelBasis_Hip_gen(std::ostringstream &code, CeedOp
case CEED_EVAL_WEIGHT:
break; // Should not occur
case CEED_EVAL_DIV:
break; // TODO: Not implemented
case CEED_EVAL_CURL:
break; // TODO: Not implemented
// LCOV_EXCL_STOP
Expand Down Expand Up @@ -460,7 +470,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce
// We treat quadrature points per slice in 3d to save registers
if (use_3d_slices) {
code << "\n // Note: Using planes of 3D elements\n";
code << "#pragma unroll\n";
code << " #pragma unroll\n";
code << " for (CeedInt q = 0; q < " << Q_name << "; q++) {\n";
code << " // -- Input fields\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
Expand Down Expand Up @@ -797,17 +807,83 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {
code << " __syncthreads();\n";
code << " for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {\n";

// -- Compute minimum buffer space needed
CeedInt max_rstr_buffer_size = 0;

for (CeedInt i = 0; i < num_input_fields; i++) {
CeedInt num_comp, elem_size;
CeedElemRestriction elem_rstr;

CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &elem_rstr));
CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
}
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedInt num_comp, elem_size;
CeedElemRestriction elem_rstr;

CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_output_fields[i], &elem_rstr));
CeedCallBackend(CeedElemRestrictionGetNumComponents(elem_rstr, &num_comp));
CeedCallBackend(CeedElemRestrictionGetElementSize(elem_rstr, &elem_size));
max_rstr_buffer_size = CeedIntMax(max_rstr_buffer_size, num_comp * elem_size);
}
code << " // Scratch restriction buffer space\n";
code << " CeedScalar r_e_scratch[" << max_rstr_buffer_size << "];\n";

// -- Determine best input field processing order
CeedInt field_rstr_in_buffer[CEED_FIELD_MAX], input_field_order[CEED_FIELD_MAX];

for (CeedInt i = 0; i < num_input_fields; i++) {
field_rstr_in_buffer[i] = -1;
input_field_order[i] = -1;
}
{
bool is_ordered[CEED_FIELD_MAX];
CeedInt curr_index = 0;

for (CeedInt i = 0; i < num_input_fields; i++) is_ordered[i] = false;
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedVector vec_i;
CeedElemRestriction rstr_i;

if (is_ordered[i]) continue;
CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[i], &vec_i));
field_rstr_in_buffer[i] = i;
is_ordered[i] = true;
input_field_order[curr_index] = i;
curr_index++;
if (vec_i == CEED_VECTOR_NONE) continue; // CEED_EVAL_WEIGHT
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[i], &rstr_i));
for (CeedInt j = i + 1; j < num_input_fields; j++) {
CeedVector vec_j;
CeedElemRestriction rstr_j;

CeedCallBackend(CeedOperatorFieldGetVector(op_input_fields[j], &vec_j));
CeedCallBackend(CeedOperatorFieldGetElemRestriction(op_input_fields[j], &rstr_j));
if (rstr_i == rstr_j && vec_i == vec_j) {
field_rstr_in_buffer[j] = i;
is_ordered[j] = true;
input_field_order[curr_index] = j;
curr_index++;
}
}
}
}

// -- Input restriction and basis
code << " // -- Input field restrictions and basis actions\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
code << " // ---- Input field " << i << "\n";
CeedInt f = input_field_order[i];

code << " // ---- Input field " << f << "\n";

// ---- Restriction
CeedCallBackend(
CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelRestriction_Hip_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f], Q_1d,
true, use_3d_slices));

// ---- Basis action
CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, i, dim, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelBasis_Hip_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
}

// -- Q function
Expand All @@ -824,7 +900,7 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) {

// ---- Restriction
CeedCallBackend(
CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
CeedOperatorBuildKernelRestriction_Hip_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
}

// Close loop and function
Expand Down

0 comments on commit 8386266

Please sign in to comment.