diff --git a/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp index 14cb858b9f..d9daa03493 100644 --- a/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp @@ -668,14 +668,14 @@ class GemmUniversal< Tensor accumulators = cutlass::detail::make_sm100_accumulator( tiled_mma, acc_shape, EpilogueTile{}); + // Ensure memory ops in this kernel are not done prior to completion of dependent grids. + cutlass::arch::wait_on_dependent_grids(); + #if 1 pipeline_init_wait(cluster_size); if (is_participant.main_ab_load) { set_warpgroup_reg_dealloc(); - // Ensure that the prefetched kernel does not touch - // unflushed global memory prior to this instruction - cutlass::arch::wait_on_dependent_grids(); bool do_load_order_arrive = is_epi_load_needed; auto load_inputs = collective_mainloop.load_ab_init( @@ -752,8 +752,6 @@ class GemmUniversal< // why this variable is needed. bool requires_clc_query = true; - cutlass::arch::wait_on_dependent_grids(); - do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. @@ -957,9 +955,6 @@ class GemmUniversal< } else if (not IsNoSmemEpilogue and is_participant.epi_load) { set_warpgroup_reg_dealloc(); - // Ensure that the prefetched kernel does not touch - // unflushed global memory prior to this instruction - cutlass::arch::wait_on_dependent_grids(); bool do_load_order_wait = true; bool do_tail_load = false;