NVIDIA-NeMo · louisfaury · Jun 3, 2026 · Jun 4, 2026 · Jun 7, 2026 · Jun 8, 2026
@@ -320,6 +320,7 @@ policy:
       # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile
       # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
       enforce_eager: False
+      use_tqdm: true # Set to false to suppress vLLM generation progress bars. Only applies when async_engine is false
       use_deep_gemm: False
       num_last_layers_in_bf16: 0
       num_first_layers_in_bf16: 0

@@ -30,6 +30,8 @@ class VllmSpecificArgs(TypedDict):
     precision: NotRequired[str]
     kv_cache_dtype: Literal["auto", "fp8", "fp8_e4m3"]
     enforce_eager: NotRequired[bool]
+    # Whether to show a tqdm progress bar during generation. When absent, the worker falls back to True and passes it explicitly to vLLM's generate call. Only applies when async_engine is False.
+    use_tqdm: NotRequired[bool]
     # By default, NeMo RL only has a Python handle to the vllm.LLM generation engine. The expose_http_server flag here will expose that generation engine as an HTTP server.
     # Exposing vLLM as a server is useful in instances where the multi-turn rollout is performed with utilities outside of NeMo RL, but the user still wants to take advantage of the refit logic in NeMo RL that keeps the policy and generation up to date.
     # Currently it will expose the /tokenize and /v1/chat/completions endpoints. Later on we may expose /v1/completions or /v1/responses.

@@ -769,7 +769,8 @@ def generate(
         assert self.llm is not None, (
             "Attempting to generate with either an uninitialized vLLM or non-model-owner"
         )
-        outputs = self.llm.generate(prompts, sampling_params)
+        use_tqdm = self.cfg["vllm_cfg"].get("use_tqdm", True)
+        outputs = self.llm.generate(prompts, sampling_params, use_tqdm=use_tqdm)
 
         # Process the outputs - but preserve the original input padding structure
         output_ids_list = []
@@ -905,7 +906,8 @@ def generate_text(
         assert self.llm is not None, (
             "Attempting to generate with either an uninitialized vLLM or non-model-owner"
         )
-        outputs = self.llm.generate(data["prompts"], sampling_params)
+        use_tqdm = self.cfg["vllm_cfg"].get("use_tqdm", True)
+        outputs = self.llm.generate(data["prompts"], sampling_params, use_tqdm=use_tqdm)
         texts = [output.outputs[0].text for output in outputs]
 
         # Convert to BatchedDataDict

@@ -314,6 +314,7 @@ policy:
       # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile
       # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
       enforce_eager: False
+      use_tqdm: true # Set to false to suppress vLLM generation progress bars. Only applies when async_engine is false
       use_deep_gemm: False
       num_last_layers_in_bf16: 0
       num_first_layers_in_bf16: 0