NVIDIA · tianyuxbear · May 26, 2026 · May 26, 2026
diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh
@@ -897,6 +897,15 @@ quantize_with_block_size(
             }
         }
     }
+    // Fix for nvbugs/5970614 (https://nvbugspro.nvidia.com/bug/5970614).
+    // PDL completion is reported when every CTA has either exited or called
+    // cudaTriggerProgrammaticLaunchCompletion() at least once (per the CUDA Programming Guide).
+    // Without a CTA-wide barrier, an early-finishing warp can trigger completion
+    // while other warps in the same CTA are still writing sf_out / out, allowing
+    // the downstream NVF4 GEMM consumer to read partial data once
+    // wait_on_dependent_grids returns. Drain the CTA's stores before the trigger.
+    __syncthreads();
+    __threadfence();
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }

@@ -2902,6 +2902,13 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          8,
                          "CUTLASS",
                          marks=pytest.mark.skip_less_mpi_world_size(8)),
+            # throughput_pp4_mtp (NVBug 6018046): on 4-GPU PP=4 + MTP, the
+            # per-bs CUDA-graph snapshot pool plus per-step activation and
+            # NCCL collective buffers consume the lazy cuBLASLt workspace
+            # headroom at bs=32, surfacing as mid-run
+            # CUBLAS_STATUS_EXECUTION_FAILED. Match the already-stable
+            # throughput_bs8_mtp configuration with bs=8; the default 0.70
+            # KV fraction is sufficient once bs is reduced.
             pytest.param(1,
                          4,
                          1,
@@ -2911,7 +2918,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          False,
                          True,
                          True,
-                         32,
+                         8,
                          "CUTLASS",
                          marks=pytest.mark.skip_less_mpi_world_size(4)),
         ],

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -22,7 +22,6 @@ accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_bf16_small[4] SK
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/6215736)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/6084775)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/6029882)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/6018046)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] SKIP (https://nvbugs/6215793)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus SKIP (https://nvbugs/6193778)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_dsa_host_cache_offload[host_cache_offload] SKIP (https://nvbugs/6185196)