From 25230b5a3f82c20098e3fa60fb786b8b9be0422c Mon Sep 17 00:00:00 2001 From: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com> Date: Wed, 27 May 2026 02:56:56 -0700 Subject: [PATCH] fix: missing PDL wait on main_sf_load in sm103 blockscaled GEMM --- .../sm103_blockscaled_gemm_tma_warpspecialized.hpp | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp index 14cb858b9f..d9daa03493 100644 --- a/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/gemm/kernel/sm103_blockscaled_gemm_tma_warpspecialized.hpp @@ -668,14 +668,14 @@ class GemmUniversal< Tensor accumulators = cutlass::detail::make_sm100_accumulator( tiled_mma, acc_shape, EpilogueTile{}); + // Ensure memory ops in this kernel are not done prior to completion of dependent grids. + cutlass::arch::wait_on_dependent_grids(); + #if 1 pipeline_init_wait(cluster_size); if (is_participant.main_ab_load) { set_warpgroup_reg_dealloc(); - // Ensure that the prefetched kernel does not touch - // unflushed global memory prior to this instruction - cutlass::arch::wait_on_dependent_grids(); bool do_load_order_arrive = is_epi_load_needed; auto load_inputs = collective_mainloop.load_ab_init( @@ -752,8 +752,6 @@ class GemmUniversal< // why this variable is needed. bool requires_clc_query = true; - cutlass::arch::wait_on_dependent_grids(); - do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. @@ -957,9 +955,6 @@ class GemmUniversal< } else if (not IsNoSmemEpilogue and is_participant.epi_load) { set_warpgroup_reg_dealloc(); - // Ensure that the prefetched kernel does not touch - // unflushed global memory prior to this instruction - cutlass::arch::wait_on_dependent_grids(); bool do_load_order_wait = true; bool do_tail_load = false;