NVIDIA · romerojosh · Jun 4, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 28, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -4,10 +4,10 @@ if (NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE RelWithDebInfo)
 endif()
 
-# https://github.com/NVIDIA/TorchFort/issues/3
+# https://github.com/NVIDIA/TorchFort/issues/3
 cmake_policy(SET CMP0057 NEW)
 
-# User-defined build options
+# User-defined build options
 set(TORCHFORT_CUDA_CC_LIST "70;80;90" CACHE STRING "List of CUDA compute capabilities to build torchfort for.")
 set(TORCHFORT_NCCL_ROOT CACHE STRING "Path to search for NCCL installation. Default NVIDA HPC SDK provided NCCL version if available.")
 set(TORCHFORT_YAML_CPP_ROOT CACHE STRING "Path to search for yaml-cpp installation.")
@@ -16,7 +16,7 @@ option(TORCHFORT_BUILD_EXAMPLES "Build examples" OFF)
 option(TORCHFORT_BUILD_TESTS "Build tests" OFF)
 option(TORCHFORT_ENABLE_GPU "Enable GPU/CUDA support" ON)
 
-# For backward-compatibility with existing variable
+# For backward-compatibility with existing variable
 if (YAML_CPP_ROOT)
   set(TORCHFORT_YAML_CPP_ROOT ${YAML_CPP_ROOT})
 endif()
@@ -34,13 +34,12 @@ endif()
 project(torchfort LANGUAGES ${LANGS})
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC")
-  # __rdtsc() in torch not supported by nvc++. Use g++ for CXX files.
+  # __rdtsc() in torch not supported by nvc++. Use g++ for CXX files.
   message(FATAL_ERROR "TorchFort does not support compilation of C++ files with nvc++. "
                       "Set CMAKE_CXX_COMPILER to g++ to proceed.")
 endif()
 
-
-# unit testing with gtest
+# unit testing with gtest
 if (TORCHFORT_BUILD_TESTS)
   enable_testing()
   include(CTest)
@@ -49,29 +48,29 @@ if (TORCHFORT_BUILD_TESTS)
     googletest
     URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
     )
-  # For Windows: Prevent overriding the parent project's compiler/linker settings
+  # For Windows: Prevent overriding the parent project's compiler/linker settings
   set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
   FetchContent_MakeAvailable(googletest)
   include(GoogleTest)
 endif()
 
-# MPI
+# MPI
 find_package(MPI REQUIRED)
 
-# CUDA
+# CUDA
 if (TORCHFORT_ENABLE_GPU)
   find_package(CUDAToolkit REQUIRED)
 
-  # HPC SDK
-  # Locate and append NVHPC CMake configuration if available
+  # HPC SDK
+  # Locate and append NVHPC CMake configuration if available
   find_program(NVHPC_CXX_BIN "nvc++")
   if (NVHPC_CXX_BIN)
     string(REPLACE "compilers/bin/nvc++" "cmake" NVHPC_CMAKE_DIR ${NVHPC_CXX_BIN})
     set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH};${NVHPC_CMAKE_DIR}")
     find_package(NVHPC COMPONENTS "")
   endif()
-  
-  # Get NCCL library (with optional override)
+
+  # Get NCCL library (with optional override)
   if (TORCHFORT_NCCL_ROOT)
     find_path(NCCL_INCLUDE_DIR REQUIRED
       NAMES nccl.h
@@ -103,8 +102,8 @@ if (TORCHFORT_ENABLE_GPU)
 
   message(STATUS "Using NCCL library: ${NCCL_LIBRARY}")
 
-  # PyTorch
-  # Set TORCH_CUDA_ARCH_LIST string to match TORCHFORT_CUDA_CC_LIST
+  # PyTorch
+  # Set TORCH_CUDA_ARCH_LIST string to match TORCHFORT_CUDA_CC_LIST
   foreach(CUDA_CC ${TORCHFORT_CUDA_CC_LIST})
       string(REGEX REPLACE "([0-9])$" ".\\1" CUDA_CC_W_DOT ${CUDA_CC})
     list(APPEND TORCH_CUDA_ARCH_LIST ${CUDA_CC_W_DOT})
@@ -114,15 +113,15 @@ endif()
 
 find_package(Torch REQUIRED)
 
-# Generate configuration header
+# Generate configuration header
 configure_file(
   ${CMAKE_CURRENT_SOURCE_DIR}/src/csrc/include/torchfort_config.h.in
   ${CMAKE_BINARY_DIR}/include/torchfort_config.h
   @ONLY
 )
 
-# yaml-cpp
-#find_package(yaml-cpp REQUIRED)
+# yaml-cpp
+#find_package(yaml-cpp REQUIRED)
 find_path(YAML_CPP_INCLUDE_DIR REQUIRED
   NAMES yaml-cpp/yaml.h
   HINTS ${TORCHFORT_YAML_CPP_ROOT}/include
@@ -133,7 +132,7 @@ find_library(YAML_CPP_LIBRARY REQUIRED
 )
 message(STATUS "Using yaml-cpp library: ${YAML_CPP_LIBRARY}")
 
-# C/C++ shared library
+# C/C++ shared library
 add_library(${PROJECT_NAME} SHARED)
 set_target_properties(${PROJECT_NAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 
@@ -165,6 +164,7 @@ target_sources(${PROJECT_NAME}
   ${CMAKE_CURRENT_SOURCE_DIR}/src/csrc/models/rl/sac_model.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/csrc/rl/policy.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/csrc/rl/running_normalizer.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/csrc/rl/setup.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/csrc/rl/utils.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/csrc/rl/off_policy/interface.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/src/csrc/rl/off_policy/ddpg.cpp
@@ -219,17 +219,17 @@ install(
   INCLUDES DESTINATION ${CMAKE_INSTALL_PREFIX}/include
 )
 
-# Install generated configuration header
+# Install generated configuration header
 install(
   FILES ${CMAKE_BINARY_DIR}/include/torchfort_config.h
   DESTINATION ${CMAKE_INSTALL_PREFIX}/include
 )
 
-# Fortran library and module
+# Fortran library and module
 if (TORCHFORT_BUILD_FORTRAN)
 
   if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC")
-    # Creating -gpu argument string for nvfortran GPU compilation
+    # Creating -gpu argument string for nvfortran GPU compilation
     foreach(CUDA_CC ${TORCHFORT_CUDA_CC_LIST})
       list(APPEND CUF_GPU_ARG "cc${CUDA_CC}")
     endforeach()
@@ -256,17 +256,17 @@ if (TORCHFORT_BUILD_FORTRAN)
   install(
     TARGETS "${PROJECT_NAME}_fort"
   )
-  # install Fortran module
+  # install Fortran module
   install(FILES ${CMAKE_BINARY_DIR}/include/torchfort.mod DESTINATION ${CMAKE_INSTALL_PREFIX}/include)
 endif()
 
-# install Python files
+# install Python files
 install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/python/wandb_helper.py DESTINATION ${CMAKE_INSTALL_PREFIX}/bin/python)
 
-# install docs
+# install docs
 install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/docs DESTINATION ${CMAKE_INSTALL_PREFIX})
 
-# build examples
+# build examples
 if (TORCHFORT_BUILD_EXAMPLES)
   add_subdirectory(examples/cpp/cart_pole)
   if (TORCHFORT_BUILD_FORTRAN)
@@ -275,7 +275,7 @@ if (TORCHFORT_BUILD_EXAMPLES)
   endif()
 endif()
 
-# build tests
+# build tests
 if (TORCHFORT_BUILD_TESTS)
   add_subdirectory(tests/general)
   add_subdirectory(tests/supervised)

diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# TorchFort
+# TorchFort
 
 An Online Deep Learning Interface for HPC programs on NVIDIA GPUs
 

diff --git a/docs/api/config.rst b/docs/api/config.rst
@@ -403,7 +403,7 @@ The block in the configuration file defining algorithm properties takes the foll
     parameters:
       <option> = <value>
 
-Currently, only type ``uniform`` is supported. The following table lists the available options:
+Currently, types ``uniform`` and ``prioritized`` are supported. The following table lists the available options:
 
 +---------------------------+-----------------+-----------------+------------------------------------------------------------------+
 | Replay Buffer Type        | Option          | Data Type       | Description                                                      |
@@ -414,11 +414,32 @@ Currently, only type ``uniform`` is supported. The following table lists the ava
 +                           +-----------------+-----------------+------------------------------------------------------------------+
 |                           | ``n_envs``      | integer         | Number of environments                                           |
 +---------------------------+-----------------+-----------------+------------------------------------------------------------------+
+| ``prioritized``           | ``min_size``    | integer         | Minimum number of samples before buffer is ready for training    |
++                           +-----------------+-----------------+------------------------------------------------------------------+
+|                           | ``max_size``    | integer         | Maximum capacity                                                 |
++                           +-----------------+-----------------+------------------------------------------------------------------+
+|                           | ``n_envs``      | integer         | Number of environments                                           |
++                           +-----------------+-----------------+------------------------------------------------------------------+
+|                           | ``alpha``       | float           | Prioritization exponent; 0=uniform, 1=full (default 0.6)         |
++                           +-----------------+-----------------+------------------------------------------------------------------+
+|                           | ``beta0``       | float           | Initial importance-sampling weight exponent (default 0.4)        |
++                           +-----------------+-----------------+------------------------------------------------------------------+
+|                           | ``beta_max``    | float           | Final importance-sampling weight exponent (default 1.0)          |
++                           +-----------------+-----------------+------------------------------------------------------------------+
+|                           | ``beta_steps``  | integer         | Steps to anneal beta from beta0 to beta_max (default 100000)     |
++---------------------------+-----------------+-----------------+------------------------------------------------------------------+
 
 Note that the effective sizes for each environment is :math:`\mathrm{min\_size} / \mathrm{n\_envs}` and :math:`\mathrm{max\_size} / \mathrm{n\_envs}`.
 You need to ensure that you can store at least one sample for each environment. However, for better algorithm performance, it is highly advised to provide buffers
 which can store longer trajectories.
 
+The ``prioritized`` buffer implements Prioritized Experience Replay (`Schaul et al., 2016 <https://arxiv.org/abs/1511.05952>`_), sampling
+transitions in proportion to their last observed temporal-difference (TD) error rather than uniformly. The degree of prioritization is controlled
+by ``alpha`` (with ``alpha = 0`` recovering uniform sampling), and the resulting sampling bias is corrected by importance-sampling weights whose
+exponent ``beta`` is annealed linearly from ``beta0`` to ``beta_max`` over ``beta_steps`` sampling steps. All off-policy algorithms (``DDPG``,
+``TD3``, ``SAC``) transparently apply these importance-sampling weights to their losses and feed the per-sample TD errors back to update the
+priorities; no changes to the algorithm configuration are required to switch between ``uniform`` and ``prioritized`` buffers.
+
 For on-policy algorithms, the block looks as follows:
 
 .. code-block:: yaml

diff --git a/requirements.txt b/requirements.txt
@@ -1,15 +1,15 @@
-# basic packages
-ruamel-yaml
+# basic packages
+ruamel-yaml
 
-# pytorch and some dependencies
-torch==2.8.0
+# pytorch and some dependencies
+torch==2.8.0
 
-# training monitoring
-wandb
+# training monitoring
+wandb
 
-# RL example visualization related
-pygame
-moviepy
+# RL example visualization related
+pygame
+moviepy
 
-# Supervised learning example visualization related
-matplotlib
+# Supervised learning example visualization related
+matplotlib
diff --git a/src/csrc/include/internal/rl/off_policy/ddpg.h b/src/csrc/include/internal/rl/off_policy/ddpg.h
@@ -52,7 +52,8 @@ template <typename T>
 void train_ddpg(const ModelPack& p_model, const ModelPack& p_model_target, const ModelPack& q_model,
                 const ModelPack& q_model_target, torch::Tensor state_old_tensor, torch::Tensor state_new_tensor,
                 torch::Tensor action_old_tensor, torch::Tensor action_new_tensor, torch::Tensor reward_tensor,
-                torch::Tensor d_tensor, const T& gamma, const T& rho, T& p_loss_val, T& q_loss_val) {
+                torch::Tensor d_tensor, torch::Tensor is_weights, const T& gamma, const T& rho,
+                torch::Tensor& td_errors, T& p_loss_val, T& q_loss_val) {
 
   // nvtx marker
   torchfort::nvtx::rangePush("torchfort_train_ddpg");
@@ -72,10 +73,6 @@ void train_ddpg(const ModelPack& p_model, const ModelPack& p_model_target, const
   // value functions
   q_model.model->train();
 
-  // opt
-  // loss is fixed by algorithm
-  auto q_loss_func = torch::nn::MSELoss(torch::nn::MSELossOptions().reduction(torch::kMean));
-
   // policy function
   // compute y: use the target models for q_new, no grads
   torch::Tensor y_tensor;
@@ -87,10 +84,11 @@ void train_ddpg(const ModelPack& p_model, const ModelPack& p_model_target, const
   }
 
   // backward and update step
-  // compute loss
+  // IS-weighted MSE loss: mean(w * (q - y)^2)
   torch::Tensor q_old_tensor =
       torch::squeeze(q_model.model->forward(std::vector<torch::Tensor>{state_old_tensor, action_old_tensor})[0], 1);
-  torch::Tensor q_loss_tensor = q_loss_func->forward(q_old_tensor, y_tensor);
+  td_errors = torch::abs(q_old_tensor - y_tensor).detach();
+  torch::Tensor q_loss_tensor = torch::mean(is_weights * torch::square(q_old_tensor - y_tensor));
 
   auto state = q_model.state;
   if (state->step_train_current % q_model.grad_accumulation_steps == 0) {

diff --git a/src/csrc/include/internal/rl/off_policy/sac.h b/src/csrc/include/internal/rl/off_policy/sac.h
@@ -56,10 +56,10 @@ template <typename T>
 void train_sac(const PolicyPack& p_model, const std::vector<ModelPack>& q_models,
                const std::vector<ModelPack>& q_models_target, torch::Tensor state_old_tensor,
                torch::Tensor state_new_tensor, torch::Tensor action_old_tensor, torch::Tensor reward_tensor,
-               torch::Tensor d_tensor, const std::shared_ptr<AlphaModel>& alpha_model,
+               torch::Tensor d_tensor, torch::Tensor is_weights, const std::shared_ptr<AlphaModel>& alpha_model,
                const std::shared_ptr<torch::optim::Optimizer>& alpha_optimizer,
                const std::shared_ptr<BaseLRScheduler>& alpha_lr_scheduler, const T& target_entropy, const T& gamma,
-               const T& rho, T& p_loss_val, T& q_loss_val) {
+               const T& rho, torch::Tensor& td_errors, T& p_loss_val, T& q_loss_val) {
 
   // nvtx marker
   torchfort::nvtx::rangePush("torchfort_train_sac");
@@ -84,10 +84,6 @@ void train_sac(const PolicyPack& p_model, const std::vector<ModelPack>& q_models
     q_model_target.model->train();
   }
 
-  // opt
-  // loss is fixed by algorithm
-  auto q_loss_func = torch::nn::MSELoss(torch::nn::MSELossOptions().reduction(torch::kMean));
-
   // if we are updating the entropy coefficient, do that first
   torch::Tensor alpha_loss;
   auto state = p_model.state;
@@ -168,9 +164,12 @@ void train_sac(const PolicyPack& p_model, const std::vector<ModelPack>& q_models
   }
 
   // backward and update step
+  // IS-weighted MSE loss: mean(w * (q - y)^2), summed across critics
+  // td_errors taken from first critic only
   torch::Tensor q_old_tensor =
       torch::squeeze(q_models[0].model->forward(std::vector<torch::Tensor>{state_old_tensor, action_old_tensor})[0], 1);
-  torch::Tensor q_loss_tensor = q_loss_func->forward(q_old_tensor, y_tensor);
+  td_errors = torch::abs(q_old_tensor - y_tensor).detach();
+  torch::Tensor q_loss_tensor = torch::mean(is_weights * torch::square(q_old_tensor - y_tensor));
   state = q_models[0].state;
   if (state->step_train_current % q_models[0].grad_accumulation_steps == 0) {
     q_models[0].optimizer->zero_grad();
@@ -179,7 +178,7 @@ void train_sac(const PolicyPack& p_model, const std::vector<ModelPack>& q_models
     // compute loss
     q_old_tensor = torch::squeeze(
         q_models[i].model->forward(std::vector<torch::Tensor>{state_old_tensor, action_old_tensor})[0], 1);
-    q_loss_tensor = q_loss_tensor + q_loss_func->forward(q_old_tensor, y_tensor);
+    q_loss_tensor = q_loss_tensor + torch::mean(is_weights * torch::square(q_old_tensor - y_tensor));
     state = q_models[i].state;
     if (state->step_train_current % q_models[i].grad_accumulation_steps == 0) {
       q_models[i].optimizer->zero_grad();

diff --git a/src/csrc/include/internal/rl/off_policy/td3.h b/src/csrc/include/internal/rl/off_policy/td3.h
@@ -52,8 +52,8 @@ template <typename T>
 void train_td3(const ModelPack& p_model, const ModelPack& p_model_target, const std::vector<ModelPack>& q_models,
                const std::vector<ModelPack>& q_models_target, torch::Tensor state_old_tensor,
                torch::Tensor state_new_tensor, torch::Tensor action_old_tensor, torch::Tensor action_new_tensor,
-               torch::Tensor reward_tensor, torch::Tensor d_tensor, const T& gamma, const T& rho, T& p_loss_val,
-               T& q_loss_val, bool update_policy) {
+               torch::Tensor reward_tensor, torch::Tensor d_tensor, torch::Tensor is_weights, const T& gamma,
+               const T& rho, torch::Tensor& td_errors, T& p_loss_val, T& q_loss_val, bool update_policy) {
 
   // nvtx marker
   torchfort::nvtx::rangePush("torchfort_train_td3");
@@ -76,10 +76,6 @@ void train_td3(const ModelPack& p_model, const ModelPack& p_model_target, const
     q_model.model->train();
   }
 
-  // opt
-  // loss is fixed by algorithm
-  auto q_loss_func = torch::nn::MSELoss(torch::nn::MSELossOptions().reduction(torch::kMean));
-
   // policy function
   // compute y: use the target models for q_new, no grads
   torch::Tensor y_tensor;
@@ -96,18 +92,20 @@ void train_td3(const ModelPack& p_model, const ModelPack& p_model_target, const
   }
 
   // backward and update step
-  // compute loss for critics and zero grads while we are at it
+  // IS-weighted MSE loss: mean(w * (q - y)^2), summed across critics
+  // td_errors taken from first critic only (consistent with policy update using q_models[0])
   torch::Tensor q_old_tensor =
       torch::squeeze(q_models[0].model->forward(std::vector<torch::Tensor>{state_old_tensor, action_old_tensor})[0], 1);
-  torch::Tensor q_loss_tensor = q_loss_func->forward(q_old_tensor, y_tensor);
+  td_errors = torch::abs(q_old_tensor - y_tensor).detach();
+  torch::Tensor q_loss_tensor = torch::mean(is_weights * torch::square(q_old_tensor - y_tensor));
   auto state = q_models[0].state;
   if (state->step_train_current % q_models[0].grad_accumulation_steps == 0) {
     q_models[0].optimizer->zero_grad();
   }
   for (int i = 1; i < q_models.size(); ++i) {
     q_old_tensor = torch::squeeze(
         q_models[i].model->forward(std::vector<torch::Tensor>{state_old_tensor, action_old_tensor})[0], 1);
-    q_loss_tensor = q_loss_tensor + q_loss_func->forward(q_old_tensor, y_tensor);
+    q_loss_tensor = q_loss_tensor + torch::mean(is_weights * torch::square(q_old_tensor - y_tensor));
     state = q_models[i].state;
     if (state->step_train_current % q_models[i].grad_accumulation_steps == 0) {
       q_models[i].optimizer->zero_grad();