@@ -292,12 +292,30 @@ anv_cmd_buffer_push_workgroups(struct anv_cmd_buffer *cmd_buffer,
 
 static void
 compute_load_indirect_params(struct anv_cmd_buffer *cmd_buffer,
-                             const struct anv_address indirect_addr)
+                             const struct anv_address indirect_addr,
+                             bool is_unaligned_size_x)
 {
    struct mi_builder b;
    mi_builder_init(&b, cmd_buffer->device->info, &cmd_buffer->batch);
 
    struct mi_value size_x = mi_mem32(anv_address_add(indirect_addr, 0));
+
+   /* Convert unaligned thread invocations to an aligned thread group count
+    * in X for unaligned shader dispatches during the ray tracing phase.
+    */
+   if (is_unaligned_size_x) {
+      const uint32_t mocs = isl_mocs(&cmd_buffer->device->isl_dev, 0, false);
+      mi_builder_set_mocs(&b, mocs);
+
+      struct anv_compute_pipeline *pipeline =
+         anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+      const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+      assert(util_is_power_of_two_or_zero(prog_data->local_size[0]));
+      size_x = mi_udiv32_imm(&b, size_x, prog_data->local_size[0]);
+      size_x = mi_iadd(&b, size_x, mi_imm(1));
+   }
+
    struct mi_value size_y = mi_mem32(anv_address_add(indirect_addr, 4));
    struct mi_value size_z = mi_mem32(anv_address_add(indirect_addr, 8));
@@ -415,16 +433,13 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
                     const struct anv_compute_pipeline *pipeline,
                     struct anv_address indirect_addr,
                     const struct brw_cs_prog_data *prog_data,
+                    struct intel_cs_dispatch_info dispatch,
                     uint32_t groupCountX, uint32_t groupCountY,
                     uint32_t groupCountZ)
 {
    const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute;
    const bool predicate = cmd_buffer->state.conditional_render_enabled;
 
-   const struct intel_device_info *devinfo = pipeline->base.device->info;
-   const struct intel_cs_dispatch_info dispatch =
-      brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
-
    uint32_t num_workgroup_data[3];
    if (!anv_address_is_null(indirect_addr)) {
       uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
@@ -520,25 +535,32 @@ static inline void
 emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
                const struct anv_compute_pipeline *pipeline,
                const struct brw_cs_prog_data *prog_data,
+               struct intel_cs_dispatch_info dispatch,
                struct anv_address indirect_addr,
-               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ)
+               uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ,
+               bool is_unaligned_size_x)
 {
    bool is_indirect = !anv_address_is_null(indirect_addr);
 
 #if GFX_VERx10 >= 125
-   if (is_indirect && cmd_buffer->device->info->has_indirect_unroll) {
+   /* For unaligned dispatch, we need to tweak the dispatch value with
+    * MI_MATH, so we can't use indirect HW instructions.
+    */
+   if (is_indirect && !is_unaligned_size_x &&
+       cmd_buffer->device->info->has_indirect_unroll) {
       emit_indirect_compute_walker(cmd_buffer, pipeline->cs, prog_data,
                                    indirect_addr);
       return;
    }
 #endif
 
    if (is_indirect)
-      compute_load_indirect_params(cmd_buffer, indirect_addr);
+      compute_load_indirect_params(cmd_buffer, indirect_addr,
+                                   is_unaligned_size_x);
 
 #if GFX_VERx10 >= 125
    emit_compute_walker(cmd_buffer, pipeline, indirect_addr, prog_data,
-                       groupCountX, groupCountY, groupCountZ);
+                       dispatch, groupCountX, groupCountY, groupCountZ);
 #else
    emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
                      groupCountX, groupCountY, groupCountZ);
@@ -558,6 +580,8 @@ void genX(CmdDispatchBase)(
    struct anv_compute_pipeline *pipeline =
       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
 
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
@@ -581,32 +605,154 @@ void genX(CmdDispatchBase)(
    if (cmd_buffer->state.conditional_render_enabled)
       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 
-   emit_cs_walker(cmd_buffer, pipeline, prog_data,
+   emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch,
                   ANV_NULL_ADDRESS /* no indirect data */,
-                  groupCountX, groupCountY, groupCountZ);
+                  groupCountX, groupCountY, groupCountZ,
+                  false);
 
    trace_intel_end_compute(&cmd_buffer->trace,
                            groupCountX, groupCountY, groupCountZ);
 }
 
-void genX(CmdDispatchIndirect)(
+static void
+emit_unaligned_cs_walker(
    VkCommandBuffer commandBuffer,
-   VkBuffer _buffer,
-   VkDeviceSize offset)
+   uint32_t baseGroupX,
+   uint32_t baseGroupY,
+   uint32_t baseGroupZ,
+   uint32_t groupCountX,
+   uint32_t groupCountY,
+   uint32_t groupCountZ,
+   struct intel_cs_dispatch_info dispatch)
 {
    ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
-   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
    struct anv_compute_pipeline *pipeline =
       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
-   struct anv_address addr = anv_address_add(buffer->address, offset);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
+                                  baseGroupX, baseGroupY, baseGroupZ,
+                                  groupCountX, groupCountY, groupCountZ,
+                                  ANV_NULL_ADDRESS);
+
+   /* RT shaders always have the Y and Z local sizes set to 1. */
+   assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1);
+
+   /* RT shaders are always dispatched with the Y and Z group counts set to 1. */
+   assert(groupCountY == 1 && groupCountZ == 1);
+
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   anv_measure_snapshot(cmd_buffer,
+                        INTEL_SNAPSHOT_COMPUTE,
+                        "compute-unaligned-cs-walker",
+                        groupCountX * groupCountY * groupCountZ *
+                        prog_data->local_size[0] * prog_data->local_size[1] *
+                        prog_data->local_size[2]);
+
+   trace_intel_begin_compute(&cmd_buffer->trace);
+
+   assert(!prog_data->uses_num_work_groups);
+   genX(cmd_buffer_flush_compute_state)(cmd_buffer);
+
+   if (cmd_buffer->state.conditional_render_enabled)
+      genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
+
+#if GFX_VERx10 >= 125
+   emit_compute_walker(cmd_buffer, pipeline, ANV_NULL_ADDRESS, prog_data,
+                       dispatch, groupCountX, groupCountY, groupCountZ);
+#endif
+
+   trace_intel_end_compute(&cmd_buffer->trace,
+                           groupCountX, groupCountY, groupCountZ);
+}
+
+/*
+ * Dispatch a compute work item with an unaligned thread invocation count.
+ *
+ * This helper takes an unaligned number of thread invocations, converts it
+ * into an aligned thread group count, and dispatches the compute work items.
+ *
+ * We launch two CS walkers: one for the aligned part and another with a
+ * single group for the remaining thread invocations.
+ *
+ * This function is currently specific to BVH building.
+ */
+void
+genX(cmd_dispatch_unaligned)(
+   VkCommandBuffer commandBuffer,
+   uint32_t invocations_x,
+   uint32_t invocations_y,
+   uint32_t invocations_z)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   struct anv_compute_pipeline *pipeline =
+      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
+
+   /* The X group count can be unaligned for RT dispatches. */
+   uint32_t groupCountX = invocations_x / prog_data->local_size[0];
+   uint32_t groupCountY = invocations_y;
+   uint32_t groupCountZ = invocations_z;
+
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
+
+   /* Launch the first CS walker with the aligned X group count. */
+   if (groupCountX) {
+      emit_unaligned_cs_walker(commandBuffer, 0, 0, 0, groupCountX,
+                               groupCountY, groupCountZ, dispatch);
+   }
+
+   uint32_t unaligned_invocations_x = invocations_x % prog_data->local_size[0];
+   if (unaligned_invocations_x) {
+      dispatch.threads = DIV_ROUND_UP(unaligned_invocations_x,
+                                      dispatch.simd_size);
+
+      /* Make sure the 2nd walker has the same number of invocations per
+       * workgroup as the 1st walker, so that gl_GlobalInvocationID can be
+       * calculated correctly with baseGroup.
+       */
+      assert(dispatch.threads * dispatch.simd_size == prog_data->local_size[0]);
+
+      const uint32_t remainder = unaligned_invocations_x & (dispatch.simd_size - 1);
+      if (remainder > 0) {
+         dispatch.right_mask = ~0u >> (32 - remainder);
+      } else {
+         dispatch.right_mask = ~0u >> (32 - dispatch.simd_size);
+      }
+
+      /* Launch the second CS walker for the unaligned part. */
+      emit_unaligned_cs_walker(commandBuffer, groupCountX, 0, 0, 1, 1, 1,
+                               dispatch);
+   }
+}
+
+/*
+ * This dispatches compute work items with indirect parameters.
+ * The helper also aligns unaligned thread invocations when requested.
+ */
+void
+genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer,
+                                   struct anv_address indirect_addr,
+                                   bool is_unaligned_size_x)
+{
+   struct anv_compute_pipeline *pipeline =
+      anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
+   const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
+   struct intel_cs_dispatch_info dispatch =
+      brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL);
 
    if (anv_batch_has_error(&cmd_buffer->batch))
      return;
 
    anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
-                                  0, 0, 0, 0, 0, 0, addr);
+                                  0, 0, 0, 0, 0, 0, indirect_addr);
 
    anv_measure_snapshot(cmd_buffer,
                         INTEL_SNAPSHOT_COMPUTE,
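
Note: the following self-contained host-side sketch walks through the two-walker split that genX(cmd_dispatch_unaligned) performs above. The numbers (a SIMD32 shader with a 32-wide workgroup covering 100 invocations) are purely illustrative and chosen so the threads-times-SIMD-size assert from the patch also holds; none of this is driver code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
   /* Illustrative values only. */
   const uint32_t invocations_x = 100;
   const uint32_t local_size_x  = 32;
   const uint32_t simd_size     = 32;

   /* First walker: as many full workgroups as fit. */
   uint32_t group_count_x = invocations_x / local_size_x;   /* 3 groups, 96 invocations */

   /* Second walker: one group covering the leftover invocations. */
   uint32_t remainder_x = invocations_x % local_size_x;     /* 4 invocations */
   uint32_t threads = DIV_ROUND_UP(remainder_x, simd_size); /* 1 SIMD32 thread */

   /* Mirrors the assert in the patch: the partial group still occupies a
    * full workgroup's worth of SIMD lanes. */
   assert(threads * simd_size == local_size_x);

   /* Execution mask for the trailing thread: only 4 lanes enabled. */
   uint32_t partial = remainder_x & (simd_size - 1);
   uint32_t right_mask = partial ? ~0u >> (32 - partial)
                                 : ~0u >> (32 - simd_size);

   printf("aligned walker: %u groups; remainder walker: %u thread(s), mask 0x%x\n",
          group_count_x, threads, right_mask);
   return 0;
}

With these values the aligned walker covers 96 invocations in 3 groups and the remainder walker runs a single SIMD32 thread with 4 active lanes (mask 0xf), dispatched at baseGroupX = 3 so global invocation IDs continue where the first walker stopped.
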
@@ -619,10 +765,23 @@ void genX(CmdDispatchIndirect)(
    if (cmd_buffer->state.conditional_render_enabled)
       genX(cmd_emit_conditional_render_predicate)(cmd_buffer);
 
-   emit_cs_walker(cmd_buffer, pipeline, prog_data, addr, 0, 0, 0);
+   emit_cs_walker(cmd_buffer, pipeline, prog_data, dispatch, indirect_addr, 0,
+                  0, 0, is_unaligned_size_x);
 
    trace_intel_end_compute_indirect(&cmd_buffer->trace,
-                                    anv_address_utrace(addr));
+                                    anv_address_utrace(indirect_addr));
+}
+
+void genX(CmdDispatchIndirect)(
+   VkCommandBuffer commandBuffer,
+   VkBuffer _buffer,
+   VkDeviceSize offset)
+{
+   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, _buffer);
+   struct anv_address addr = anv_address_add(buffer->address, offset);
+
+   genX(cmd_buffer_dispatch_indirect)(cmd_buffer, addr, false);
 }
 
 struct anv_address