From 61e4addfcca23fa880fb9fb47983ae83c539d3b3 Mon Sep 17 00:00:00 2001
From: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com>
Date: Tue, 26 May 2026 08:52:43 -0700
Subject: [PATCH 1/2] [https://nvbugs/6018046][fix] Lower throughput_pp4_mtp
 max_batch_size to 8

The throughput_pp4_mtp variant uses pp_size=4 + ep_size=1 + mtp_nextn=1
on a 4-GPU topology. With max_batch_size=32 (every other 4-GPU variant
in this test uses bs<=16), the per-batch-size CUDA-graph snapshot pool
plus per-step activation and NCCL collective buffers leave no headroom
for the lazy cuBLAS Lt workspace allocation. Under sustained load this
surfaces as a mid-run CUBLAS_STATUS_EXECUTION_FAILED in the DeepseekV3
router / o_proj GEMMs, followed by an asynchronous illegal memory
access.

Lower max_batch_size from 32 to 8 to match the already-stable
throughput_bs8_mtp configuration. With bs=8 the default 0.70 KV
cache fraction provides sufficient cuBLAS workspace headroom; the
test passes in repeated runs without any KV fraction reduction.

Also remove the matching waiver entry from waives.txt.

Signed-off-by: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 9 ++++++++-
 tests/integration/test_lists/waives.txt                 | 1 -
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 9e24f1a8328e..b504cac47578 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2902,6 +2902,13 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          8,
                          "CUTLASS",
                          marks=pytest.mark.skip_less_mpi_world_size(8)),
+            # throughput_pp4_mtp (NVBug 6018046): on 4-GPU PP=4 + MTP with
+            # bs=32, the per-batch-size CUDA-graph snapshot pool plus per-step
+            # activation and NCCL collective buffers leave no headroom for the
+            # lazily allocated cuBLAS Lt workspace, surfacing as mid-run
+            # CUBLAS_STATUS_EXECUTION_FAILED. Use bs=8 to match the
+            # already-stable throughput_bs8_mtp configuration; the default
+            # 0.70 KV cache fraction is sufficient once bs is reduced.
             pytest.param(1,
                          4,
                          1,
@@ -2911,7 +2918,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          False,
                          True,
                          True,
-                         32,
+                         8,
                          "CUTLASS",
                          marks=pytest.mark.skip_less_mpi_world_size(4)),
         ],
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index a53606807422..f45722f85c21 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -22,7 +22,6 @@ accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_bf16_small[4] SK
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/6215736)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/6084775)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/6029882)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/6018046)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] SKIP (https://nvbugs/6215793)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus SKIP (https://nvbugs/6193778)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_dsa_host_cache_offload[host_cache_offload] SKIP (https://nvbugs/6185196)

From f8d34817e32e4007981bc64a55d87b437ac95743 Mon Sep 17 00:00:00 2001
From: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com>
Date: Tue, 26 May 2026 04:37:53 -0700
Subject: [PATCH 2/2] [https://nvbugs/5970614][fix] Sync CTA before PDL trigger
 in quantize_with_block_size

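quantize_with_block_size called cudaTriggerProgrammaticLaunchCompletion()
without first synchronizing the CTA, so an early-finishing warp could
signal PDL completion while other warps in the same CTA were still
writing sf_out / out. Add __syncthreads() and __threadfence() before
the trigger so the CTA's stores are drained first.

Signed-off-by: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com>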
---
 cpp/tensorrt_llm/kernels/quantization.cuh | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh
index 89b96b288b46..860e430e587b 100644
--- a/cpp/tensorrt_llm/kernels/quantization.cuh
+++ b/cpp/tensorrt_llm/kernels/quantization.cuh
@@ -897,6 +897,15 @@ quantize_with_block_size(
             }
         }
     }
+    // Fix for nvbugs/5970614 (https://nvbugspro.nvidia.com/bug/5970614).
+    // PDL completion is reported once every CTA has exited or has called
+    // cudaTriggerProgrammaticLaunchCompletion() (per CUDA Programming Guide).
+    // Without a CTA-wide barrier, an early-finishing warp can trigger it while
+    // other warps in the same CTA are still writing sf_out / out, allowing the
+    // downstream NVFP4 GEMM consumer to read partial data once
+    // wait_on_dependent_grids returns. Drain the CTA's stores before trigger.
+    __syncthreads();
+    __threadfence();
     cudaTriggerProgrammaticLaunchCompletion();
 #endif
 }