
Commit 9002e52

sagarghuge20 and kaiwenjon authored and Marge Bot committed
anv: Implement cmd_dispatch_unaligned callback

Rework: (Kevin)
- Calculate correct number of threads in GPGPU thread group based on SIMD size.
- Instead of round up, just use the simple division and let the remainder part handle groupCount < local_size_x.
- Drop indirect_unroll_off and fix the bug that we're not using is_unaligned_size_x.

Co-authored-by: Kevin Chuang <[email protected]>
Co-authored-by: Sagar Ghuge <[email protected]>
Reviewed-by: Lionel Landwerlin <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31588>
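As a rough illustration of the split described in the rework notes (an aligned walker plus a single remainder group), here is a minimal standalone C sketch; it is not the driver code, and local_size_x / simd_size stand in for the values the driver reads from the compiled shader (brw_cs_prog_data and intel_cs_dispatch_info). The example values of 64 and SIMD16 are assumptions for the demo.

#include <stdint.h>
#include <stdio.h>

/* Sketch only: mirror the arithmetic used to split an unaligned X
 * invocation count into one aligned walker and one remainder walker. */
static void
dispatch_unaligned_x(uint32_t invocations_x, uint32_t local_size_x,
                     uint32_t simd_size)
{
   /* Simple division: full workgroups covered by the first walker. */
   uint32_t group_count_x = invocations_x / local_size_x;
   if (group_count_x)
      printf("walker 1: %u aligned group(s) of %u invocations\n",
             group_count_x, local_size_x);

   /* Remainder covered by a second, single-group walker. */
   uint32_t remainder_x = invocations_x % local_size_x;
   if (remainder_x) {
      /* Number of GPGPU threads in the group, based on the SIMD size. */
      uint32_t threads = (remainder_x + simd_size - 1) / simd_size;
      /* Execution mask for the last, partially filled SIMD thread. */
      uint32_t tail = remainder_x & (simd_size - 1);
      uint32_t right_mask = tail ? ~0u >> (32 - tail)
                                 : ~0u >> (32 - simd_size);
      printf("walker 2: 1 group, %u thread(s), right_mask 0x%08x\n",
             threads, right_mask);
   }
}

int main(void)
{
   /* Example: 150 invocations with local_size_x = 64 and SIMD16:
    * walker 1 runs 2 groups (128 invocations), walker 2 covers the
    * remaining 22 with 2 threads and a 6-lane right_mask (0x0000003f). */
   dispatch_unaligned_x(150, 64, 16);
   return 0;
}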
1 parent 0cab02c commit 9002e52

4 files changed: +208 -19 lines changed


src/intel/vulkan/anv_cmd_buffer.c

+12
@@ -1569,3 +1569,15 @@ anv_cmd_flush_buffer_write_cp(VkCommandBuffer commandBuffer)
     * ANV_PIPE_DATA_CACHE_FLUSH_BIT.
     */
 }
+
+void
+anv_cmd_dispatch_unaligned(VkCommandBuffer commandBuffer,
+                           uint32_t invocations_x,
+                           uint32_t invocations_y,
+                           uint32_t invocations_z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+
+   anv_genX(cmd_buffer->device->info, cmd_dispatch_unaligned)
+      (commandBuffer, invocations_x, invocations_y, invocations_z);
+}
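A hypothetical caller sketch of the new helper (not part of this commit): the function name build_bvh_level and the prim_count parameter are made up for illustration, and it assumes a BVH-building compute pipeline is already bound on the command buffer, with the Y and Z counts kept at 1 as the implementation asserts for these dispatches.

/* Hypothetical usage: dispatch one invocation per primitive without
 * rounding prim_count up to the shader's workgroup size. */
static void
build_bvh_level(VkCommandBuffer cmd_buffer, uint32_t prim_count)
{
   anv_cmd_dispatch_unaligned(cmd_buffer, prim_count, 1, 1);
}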

src/intel/vulkan/anv_genX.h

+12
@@ -432,3 +432,15 @@ genX(cmd_buffer_flush_push_descriptors)(struct anv_cmd_buffer *cmd_buffer,
 void genX(emit_embedded_sampler)(struct anv_device *device,
                                  struct anv_embedded_sampler *sampler,
                                  struct anv_pipeline_embedded_sampler_binding *binding);
+
+void
+genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_address indirect_addr,
+                                   bool is_unaligned_size_x);
+
+void
+genX(cmd_dispatch_unaligned)(
+   VkCommandBuffer commandBuffer,
+   uint32_t invocations_x,
+   uint32_t invocations_y,
+   uint32_t invocations_z);

src/intel/vulkan/anv_private.h

+6
@@ -5911,6 +5911,12 @@ anv_cmd_write_buffer_cp(VkCommandBuffer cmd_buffer,
                         void *data,
                         uint32_t size);
 void
+anv_cmd_dispatch_unaligned(VkCommandBuffer cmd_buffer,
+                           uint32_t invocations_x,
+                           uint32_t invocations_y,
+                           uint32_t invocations_z);
+
+void
 anv_cmd_flush_buffer_write_cp(VkCommandBuffer cmd_buffer);
 
 VkResult

src/intel/vulkan/genX_cmd_compute.c

+178 -19
@@ -292,12 +292,30 @@ anv_cmd_buffer_push_workgroups(struct anv_cmd_buffer *cmd_buffer,
 
 static void
 compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
-                             const struct anv_address indirect_addr)
+                             const struct anv_address indirect_addr,
+                             bool is_unaligned_size_x)
 {
    struct mi_builder b;
    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
 
    struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
+
+   /* Convert unaligned thread invocations to aligned thread group in X
+    * dimension for unaligned shader dispatches during ray tracing phase.
+    */
+   if (is_unaligned_size_x) {
+      const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
+      mi_builder_set_mocs(&b, mocs);
+
+      struct anv_compute_pipeline *pipeline =
+         anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+      assert(util_is_power_of_two_or_zero(prog_data->local_size[0]));
+      size_x = mi_udiv32_imm(&b, size_x, prog_data->local_size[0]);
+      size_x = mi_iadd(&b, size_x, mi_imm(1));
+   }
+
    struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
    struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
 
@@ -415,16 +433,13 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                     const struct anv_compute_pipeline *pipeline,
                     struct anv_address indirect_addr,
                     const struct brw_cs_prog_data *prog_data,
+                    struct intel_cs_dispatch_info dispatch,
                     uint32_t groupCountX, uint32_t groupCountY,
                     uint32_t groupCountZ)
 {
    const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
    const bool predicate = cmd_buffer->state.conditional_render_enabled;
 
-   const struct intel_device_info *devinfo = pipeline->base.device->info;
-   const struct intel_cs_dispatch_info dispatch =
-      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
-
    uint32_t num_workgroup_data[3];
    if (!anv_address_is_null(indirect_addr)) {
       uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
@@ -520,25 +535,32 @@ static inline void
 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
                const struct anv_compute_pipeline *pipeline,
                const struct brw_cs_prog_data *prog_data,
+               struct intel_cs_dispatch_info dispatch,
                struct anv_address indirect_addr,
-               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
+               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
+               bool is_unaligned_size_x)
 {
    bool is_indirect = !anv_address_is_null(indirect_addr);
 
 #if GFX_VERx10 >= 125
-   if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
+   /* For unaligned dispatch, we need to tweak the dispatch value with
+    * MI_MATH, so we can't use indirect HW instructions.
+    */
+   if (is_indirect && !is_unaligned_size_x &&
+       cmd_buffer->device->info->has_indirect_unroll) {
       emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
                                    indirect_addr);
       return;
    }
 #endif
 
    if (is_indirect)
-      compute_load_indirect_params(cmd_buffer, indirect_addr);
+      compute_load_indirect_params(cmd_buffer, indirect_addr,
+                                   is_unaligned_size_x);
 
 #if GFX_VERx10 >= 125
    emit_compute_walker(cmd_buffer, pipeline, indirect_addr, prog_data,
-                       groupCountX, groupCountY, groupCountZ);
+                       dispatch, groupCountX, groupCountY, groupCountZ);
 #else
    emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                      groupCountX, groupCountY, groupCountZ);
@@ -558,6 +580,8 @@ void genX(CmdDispatchBase)(
    struct anv_compute_pipeline *pipeline =
       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
 
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
@@ -581,32 +605,154 @@ void genX(CmdDispatchBase)(
    if (cmd_buffer->state.conditional_render_enabled)
       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 
-   emit_cs_walker(cmd_buffer, pipeline, prog_data,
+   emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch,
                   ANV_NULL_ADDRESS /* no indirect data */,
-                  groupCountX, groupCountY, groupCountZ);
+                  groupCountX, groupCountY, groupCountZ,
+                  false);
 
    trace_intel_end_compute(&cmd_buffer->trace,
                            groupCountX, groupCountY, groupCountZ);
 }
 
-void genX(CmdDispatchIndirect)(
+static void
+emit_unaligned_cs_walker(
    VkCommandBuffer commandBuffer,
-   VkBuffer _buffer,
-   VkDeviceSize offset)
+   uint32_t baseGroupX,
+   uint32_t baseGroupY,
+   uint32_t baseGroupZ,
+   uint32_t groupCountX,
+   uint32_t groupCountY,
+   uint32_t groupCountZ,
+   struct intel_cs_dispatch_info dispatch)
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_compute_pipeline *pipeline =
       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
-   struct anv_address addr = anv_address_add(buffer->address, offset);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
+                                  baseGroupX, baseGroupY, baseGroupZ,
+                                  groupCountX, groupCountY, groupCountZ,
+                                  ANV_NULL_ADDRESS);
+
+   /* RT shaders have Y and Z local size set to 1 always. */
+   assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
+
+   /* RT shaders dispatched with group Y and Z set to 1 always. */
+   assert(groupCountY == 1 && groupCountZ == 1);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   anv_measure_snapshot(cmd_buffer,
+                        INTEL_SNAPSHOT_COMPUTE,
+                        "compute-unaligned-cs-walker",
+                        groupCountX * groupCountY * groupCountZ *
+                        prog_data->local_size[0] * prog_data->local_size[1] *
+                        prog_data->local_size[2]);
+
+   trace_intel_begin_compute(&cmd_buffer->trace);
+
+   assert(!prog_data->uses_num_work_groups);
+   genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VERx10 >= 125
+   emit_compute_walker(cmd_buffer, pipeline, ANV_NULL_ADDRESS, prog_data,
+                       dispatch, groupCountX, groupCountY, groupCountZ);
+#endif
+
+   trace_intel_end_compute(&cmd_buffer->trace,
+                           groupCountX, groupCountY, groupCountZ);
+}
+
+/*
+ * Dispatch compute work item with unaligned thread invocations.
+ *
+ * This helper takes unaligned thread invocations, convert it into aligned
+ * thread group count and dispatch compute work items.
+ *
+ * We launch two CS walker, one with aligned part and another CS walker
+ * with single group for remaining thread invocations.
+ *
+ * This function is now specifically for BVH building.
+ */
+void
+genX(cmd_dispatch_unaligned)(
+   VkCommandBuffer commandBuffer,
+   uint32_t invocations_x,
+   uint32_t invocations_y,
+   uint32_t invocations_z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_compute_pipeline *pipeline =
+      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+   /* Group X can be unaligned for RT dispatches. */
+   uint32_t groupCountX = invocations_x / prog_data->local_size[0];
+   uint32_t groupCountY = invocations_y;
+   uint32_t groupCountZ = invocations_z;
+
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
+
+   /* Launch first CS walker with aligned group count X. */
+   if (groupCountX) {
+      emit_unaligned_cs_walker(commandBuffer, 0, 0, 0, groupCountX,
+                               groupCountY, groupCountZ, dispatch);
+   }
+
+   uint32_t unaligned_invocations_x = invocations_x % prog_data->local_size[0];
+   if (unaligned_invocations_x) {
+      dispatch.threads = DIV_ROUND_UP(unaligned_invocations_x,
+                                      dispatch.simd_size);
+
+      /* Make sure the 2nd walker has the same amount of invocations per
+       * workgroup as the 1st walker, so that gl_GlobalInvocationsID can be
+       * calculated correctly with baseGroup.
+       */
+      assert(dispatch.threads * dispatch.simd_size == prog_data->local_size[0]);
+
+      const uint32_t remainder = unaligned_invocations_x & (dispatch.simd_size - 1);
+      if (remainder > 0) {
+         dispatch.right_mask = ~0u >> (32 - remainder);
+      } else {
+         dispatch.right_mask = ~0u >> (32 - dispatch.simd_size);
+      }
+
+      /* Launch second CS walker for unaligned part. */
+      emit_unaligned_cs_walker(commandBuffer, groupCountX, 0, 0, 1, 1, 1,
+                               dispatch);
+   }
+}
+
+/*
+ * This dispatches compute work item with indirect parameters.
+ * Helper also makes the unaligned thread invocations aligned.
+ */
+void
+genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_address indirect_addr,
+                                   bool is_unaligned_size_x)
+{
+   struct anv_compute_pipeline *pipeline =
+      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
 
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
 
    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
-                                  0, 0, 0, 0, 0, 0, addr);
+                                  0, 0, 0, 0, 0, 0, indirect_addr);
 
    anv_measure_snapshot(cmd_buffer,
                         INTEL_SNAPSHOT_COMPUTE,
@@ -619,10 +765,23 @@ void genX(CmdDispatchIndirect)(
    if (cmd_buffer->state.conditional_render_enabled)
       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 
-   emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0);
+   emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch, indirect_addr, 0,
+                  0, 0, is_unaligned_size_x);
 
    trace_intel_end_compute_indirect(&cmd_buffer->trace,
-                                    anv_address_utrace(addr));
+                                    anv_address_utrace(indirect_addr));
+}
+
+void genX(CmdDispatchIndirect)(
+   VkCommandBuffer commandBuffer,
+   VkBuffer _buffer,
+   VkDeviceSize offset)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_address addr = anv_address_add(buffer->address, offset);
+
+   genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
 }
 
 struct anv_address