NVIDIA · tianyuxbear · May 27, 2026
diff --git a/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp
@@ -668,14 +668,14 @@ class GemmUniversal<
     Tensor accumulators = cutlass::detail::make_sm100_accumulator<AccumulatorPipelineStageCount, IsOverlappingAccum>(
         tiled_mma, acc_shape, EpilogueTile{});
 
+    // Ensure memory ops in this kernel are not done prior to completion of dependent grids.
+    cutlass::arch::wait_on_dependent_grids();
+
 #if 1
     pipeline_init_wait(cluster_size);
 
     if (is_participant.main_ab_load) {
       set_warpgroup_reg_dealloc();
-      // Ensure that the prefetched kernel does not touch
-      // unflushed global memory prior to this instruction
-      cutlass::arch::wait_on_dependent_grids();
 
       bool do_load_order_arrive = is_epi_load_needed;
       auto load_inputs = collective_mainloop.load_ab_init(
@@ -752,8 +752,6 @@ class GemmUniversal<
         // why this variable is needed.
         bool requires_clc_query = true;
 
-        cutlass::arch::wait_on_dependent_grids();
-
         do {
           if (requires_clc_query) {
             // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers.
@@ -957,9 +955,6 @@ class GemmUniversal<
     }
     else if (not IsNoSmemEpilogue and is_participant.epi_load) {
       set_warpgroup_reg_dealloc();
-      // Ensure that the prefetched kernel does not touch
-      // unflushed global memory prior to this instruction
-      cutlass::arch::wait_on_dependent_grids();
 
       bool do_load_order_wait = true;
       bool do_tail_load = false;