From 25230b5a3f82c20098e3fa60fb786b8b9be0422c Mon Sep 17 00:00:00 2001
From: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com>
Date: Wed, 27 May 2026 02:56:56 -0700
Subject: [PATCH] fix: missing PDL wait on main_sf_load in sm103 blockscaled
 GEMM

---
 .../sm103_blockscaled_gemm_tma_warpspecialized.hpp    | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp
index 14cb858b9f..d9daa03493 100644
--- a/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp
+++ b/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp
@@ -668,14 +668,14 @@ class GemmUniversal<
     Tensor accumulators = cutlass::detail::make_sm100_accumulator<AccumulatorPipelineStageCount, IsOverlappingAccum>(
         tiled_mma, acc_shape, EpilogueTile{});
 
+    // Ensure memory ops in this kernel are not done prior to completion of dependent grids.
+    cutlass::arch::wait_on_dependent_grids();
+
 #if 1
     pipeline_init_wait(cluster_size);
 
     if (is_participant.main_ab_load) {
       set_warpgroup_reg_dealloc();
-      // Ensure that the prefetched kernel does not touch
-      // unflushed global memory prior to this instruction
-      cutlass::arch::wait_on_dependent_grids();
 
       bool do_load_order_arrive = is_epi_load_needed;
       auto load_inputs = collective_mainloop.load_ab_init(
@@ -752,8 +752,6 @@ class GemmUniversal<
         // why this variable is needed.
         bool requires_clc_query = true;
 
-        cutlass::arch::wait_on_dependent_grids();
-
         do {
           if (requires_clc_query) {
             // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers.
@@ -957,9 +955,6 @@ class GemmUniversal<
     }
     else if (not IsNoSmemEpilogue and is_participant.epi_load) {
       set_warpgroup_reg_dealloc();
-      // Ensure that the prefetched kernel does not touch
-      // unflushed global memory prior to this instruction
-      cutlass::arch::wait_on_dependent_grids();
 
       bool do_load_order_wait = true;
       bool do_tail_load = false;