From 61e4addfcca23fa880fb9fb47983ae83c539d3b3 Mon Sep 17 00:00:00 2001 From: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com> Date: Tue, 26 May 2026 08:52:43 -0700 Subject: [PATCH 1/2] [https://nvbugs/6018046][fix] Lower throughput_pp4_mtp max_batch_size to 8 The throughput_pp4_mtp variant uses pp_size=4 + ep_size=1 + mtp_nextn=1 on a 4-GPU topology. With max_batch_size=32 (every other 4-GPU variant in this test uses bs<=16), the per-bs CUDA-graph snapshot pool plus per-step activation and NCCL collective buffers leave no headroom for the lazy cuBLAS Lt workspace allocation. Under sustained load this surfaces as mid-run CUBLAS_STATUS_EXECUTION_FAILED in the DeepseekV3 router / o_proj GEMMs, followed by an asynchronous illegal memory access. Lower max_batch_size from 32 to 8 to match the already-stable throughput_bs8_mtp configuration. With bs=8 the default 0.70 KV cache fraction provides sufficient cuBLAS workspace headroom; the test passes in repeated runs without any KV fraction reduction. Removes the matching waiver entry. 
Signed-off-by: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com> --- tests/integration/defs/accuracy/test_llm_api_pytorch.py | 9 ++++++++- tests/integration/test_lists/waives.txt | 1 - 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 9e24f1a8328e..b504cac47578 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2902,6 +2902,13 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): 8, "CUTLASS", marks=pytest.mark.skip_less_mpi_world_size(8)), + # throughput_pp4_mtp (NVBug 6018046): on 4-GPU PP=4 + MTP, the + # per-bs CUDA-graph snapshot pool plus per-step activation and + # NCCL collective buffers consume the lazy cuBLAS Lt workspace + # headroom at bs=32, surfacing as mid-run + # CUBLAS_STATUS_EXECUTION_FAILED. Match the already-stable + # throughput_bs8_mtp configuration with bs=8; the default 0.70 + # KV fraction is sufficient once bs is reduced. 
pytest.param(1, 4, 1, @@ -2911,7 +2918,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): False, True, True, - 32, + 8, "CUTLASS", marks=pytest.mark.skip_less_mpi_world_size(4)), ], diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index a53606807422..f45722f85c21 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -22,7 +22,6 @@ accuracy/test_llm_api_autodeploy.py::TestQwen3_5_397B_MoE::test_bf16_small[4] SK accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] SKIP (https://nvbugs/6215736) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] SKIP (https://nvbugs/6084775) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] SKIP (https://nvbugs/6029882) -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/6018046) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4] SKIP (https://nvbugs/6215793) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1LongBenchV2::test_fp8_8gpus SKIP (https://nvbugs/6193778) accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_dsa_host_cache_offload[host_cache_offload] SKIP (https://nvbugs/6185196) From f8d34817e32e4007981bc64a55d87b437ac95743 Mon Sep 17 00:00:00 2001 From: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com> Date: Tue, 26 May 2026 04:37:53 -0700 Subject: [PATCH 2/2] [https://nvbugs/5970614][fix] Sync CTA before PDL trigger in quantize_with_block_size Signed-off-by: Tianyu Xiong <117647511+tianyuxbear@users.noreply.github.com> --- cpp/tensorrt_llm/kernels/quantization.cuh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh index 89b96b288b46..860e430e587b 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cuh +++ 
b/cpp/tensorrt_llm/kernels/quantization.cuh @@ -897,6 +897,15 @@ quantize_with_block_size( } } } + // Fix for nvbugs/5970614 (https://nvbugspro.nvidia.com/bug/5970614). + // PDL completion is reported when every CTA has either exited or called + // cudaTriggerProgrammaticLaunchCompletion() at least once (per the CUDA + // Programming Guide). Without a CTA-wide barrier, an early-finishing warp + // can trigger completion while other warps in the same CTA are still writing + // sf_out / out, allowing the downstream NVF4 GEMM consumer to read partial + // data once wait_on_dependent_grids returns. Drain the CTA's stores before the trigger. + __syncthreads(); + __threadfence(); cudaTriggerProgrammaticLaunchCompletion(); #endif }