Skip to content

Commit

Permalink
#24 Added better bitpacked CPU morphology implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
carljohnsen committed May 21, 2024
1 parent 90200e4 commit fc69616
Showing 1 changed file with 78 additions and 1 deletion.
79 changes: 78 additions & 1 deletion src/lib/cpp/cpu_seq/morphology.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ void morphology_3d_sphere(
}

template <typename Op, uint32_t neutral>
void morphology_3d_sphere_bitpacked(
void morphology_3d_sphere_bitpacked_naive(
const uint32_t *voxels,
const int64_t radius,
const int64_t N[3],
Expand Down Expand Up @@ -146,4 +146,81 @@ void morphology_3d_sphere_bitpacked(
}
}

template <typename Op, uint32_t neutral>
void morphology_3d_sphere_bitpacked(
const uint32_t *voxels,
const int64_t radius,
const int64_t N[3],
const int64_t strides[3],
uint32_t *result) {
// TODO assumes that Nx is a multiple of 32, which is true for scale <= 4
Op op;
int64_t
k = radius*2 + 1,
sqradius = radius * radius;

// TODO handle k < 32
// TODO templated construction? Has to 'hardcode' a radius, but that is beneficial anyways.
// Create the kernel
uint32_t *kernel = (uint32_t*) malloc(k*k*sizeof(uint32_t));

#pragma omp parallel for collapse(2)
for (int64_t z = -radius; z <= radius; z++) {
for (int64_t y = -radius; y <= radius; y++) {
uint32_t row = 0;
for (int64_t x = 0; x < 32; x++) {
uint32_t element = (x-radius)*(x-radius) + y*y + z*z <= sqradius;
row |= element << (31 - x);
}
kernel[(z+radius)*k + y+radius] = row;
}
}

#pragma omp parallel for collapse(3)
for (int64_t z = 0; z < N[0]; z++) {
for (int64_t y = 0; y < N[1]; y++) {
for (int64_t x = 0; x < N[2]/32; x++) {
// Compute boundaries
int64_t flat_index = z*strides[0] + y*strides[1] + x*strides[2];
int64_t X[3] = {z, y, x};
int64_t limits[6];
for (int axis = 0; axis < 3; axis++) {
limits[(axis*2)] = -min(radius, X[axis]);
limits[(axis*2)+1] = min(radius, N[axis] - X[axis] - 1);
}

// Apply the spherical kernel
uint32_t value = neutral;
for (int64_t pz = limits[0]; pz <= limits[1]; pz++) {
for (int64_t py = limits[2]; py <= limits[3]; py++) {
int64_t this_flat_index = flat_index + pz*strides[0] + py*strides[1];
uint32_t
left = x == 0 ? neutral : voxels[this_flat_index - 1],
middle = voxels[this_flat_index],
right = x == (N[2]/32)-1 ? neutral : voxels[this_flat_index + 1],
kernel_row = kernel[(pz+radius)*k + (py+radius)];
//voxels_row = voxels[this_flat_index],

uint32_t this_row = 0;
for (int64_t px = 0; px < 32; px++) {
uint32_t this_x = 0 |
(left << (32 - radius + px)) |
(middle >> (radius - px)) |
(middle << (-radius + px)) |
(right >> (32 + radius - px));
this_x &= kernel_row;
this_x = this_x != 0; // This is for dilate - make generic / work for erode
this_row |= this_x << (31 - px);
}
value = op(value, this_row);
}
}

// Store the results
result[flat_index] = value;
}
}
}
}

} // namespace cpu_seq

0 comments on commit fc69616

Please sign in to comment.