ROCm · rraminen · Jun 15, 2026 · Apr 30, 2026 · May 7, 2026 · May 26, 2026
@@ -163,7 +163,7 @@ case "$tag" in
     ROCM_VERSION=7.2
     TRITON=yes
     KATEX=yes
-    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100"
+    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1250"
     if [[ $tag =~ "benchmarks" ]]; then
       INDUCTOR_BENCHMARKS=yes
     fi

diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh
@@ -110,6 +110,7 @@ if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
   source /var/lib/jenkins/vulkansdk/setup-env.sh
 fi
 
+# Example BUILD_ENVIRONMENT: linux-noble-rocm-py3.12-gfx1250
 if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
   # hcc used to run out of memory, silently exiting without stopping
   # the build process, leaving undefined symbols in the shared lib,
@@ -119,10 +120,23 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
     export MAX_JOBS=$(($(nproc) - 1))
   fi
 
+  # Logic for multiple architectures based on the discriminator BUILD_ENVIRONMENT
+  # that is set by the workflow YAML and follows a consistent naming pattern.
   if [[ -n "$CI" && -z "$PYTORCH_ROCM_ARCH" ]]; then
-      # Set ROCM_ARCH to gfx906 for CI builds, if user doesn't override.
-      echo "Limiting PYTORCH_ROCM_ARCH to gfx906 for CI builds"
-      export PYTORCH_ROCM_ARCH="gfx906"
+      if [[ "$BUILD_ENVIRONMENT" == *gfx1250* ]]; then
+          echo "Setting PYTORCH_ROCM_ARCH to gfx1250 for CI builds"
+          export PYTORCH_ROCM_ARCH="gfx1250"
+      elif [[ "$BUILD_ENVIRONMENT" == *mi355* ]] || [[ "$BUILD_ENVIRONMENT" == *gfx950* ]]; then
+          echo "Setting PYTORCH_ROCM_ARCH to gfx950 for CI builds"
+          export PYTORCH_ROCM_ARCH="gfx950"
+      elif [[ "$BUILD_ENVIRONMENT" == *mi300* ]] || [[ "$BUILD_ENVIRONMENT" == *gfx942* ]]; then
+          echo "Setting PYTORCH_ROCM_ARCH to gfx942 for CI builds"
+          export PYTORCH_ROCM_ARCH="gfx942"
+      else
+          # Set ROCM_ARCH to gfx906 for CI builds, if user doesn't override.
+          echo "Limiting PYTORCH_ROCM_ARCH to gfx906 for CI builds"
+          export PYTORCH_ROCM_ARCH="gfx906"
+      fi
   fi
 
   # hipify sources

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -955,8 +955,12 @@ cmake_dependent_option(
   OFF)
 
 
+# TODO:
+# MSLK related parts are missing that already exists upstream.
+# gfx1250 for MSLK needs to be involved as well.
+
 IF(USE_ROCM AND ("gfx942" IN_LIST PYTORCH_ROCM_ARCH OR "gfx950" IN_LIST PYTORCH_ROCM_ARCH))
-  message(WARNING "Setting USE_MSLK for gfx942/gfx950 to ON by default, doing ROCM build")
+  message(STATUS "Setting USE_MSLK for gfx942/gfx950 to ON by default, doing ROCM build")
   set(USE_MSLK_DEFAULT ON)
 elseif(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
   message(STATUS "Setting USE_MSLK to ON by default , doing CUDA build for SM100a")

diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
@@ -268,8 +268,15 @@ if(USE_FLASH_ATTENTION)
         CK_ENABLE_FP64
         CK_ENABLE_FP8
         CK_ENABLE_INT8
-        CK_USE_FNUZ_FP8
-        CK_USE_GFX94
+        #CK_USE_FNUZ_FP8
+        #CK_USE_GFX94
+        CK_USE_GFX1250
+        CK_USE_NATIVE_MX_SUPPORT
+        CK_GFX1250_SUPPORT
+        CK_GFX12_SUPPORT
+        CK_USE_OCP_FP8
+        CK_USE_WMMA
+        CK_USE_WMMA_FP8
         CK_USE_XDL
         __HIP_PLATFORM_AMD__=1
         __HIP_PLATFORM_HCC__=1
@@ -430,7 +437,7 @@ IF(USE_MSLK)
         list(PREPEND MSLK_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
       endif()
 
-    # Only compile for gfx942 and gfx950.
+    # Only compile for gfx942 and gfx950 (composable_kernel lacks gfx1250 support).
     set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
     string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
     foreach(ARCH gfx942 gfx950)

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
@@ -505,7 +505,7 @@ at::BlasBackend Context::blasPreferredBackend() {
 bool Context::ckSupported() {
 #ifdef USE_ROCM
   static const std::vector<std::string> supported_archs = {
-    "gfx90a", "gfx942", "gfx950"
+    "gfx942", "gfx950", "gfx1250",
   };
   for (auto index : c10::irange(detail::getCUDAHooks().deviceCount())) {
     if(!detail::getCUDAHooks().isGPUArch(supported_archs, index)) {

diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -2019,6 +2019,14 @@ void scaled_gemm(
                            "Got m=", m, ", n=", n, ", k=", k);
             }
   #endif
+  #if ROCM_VERSION >= 70200
+            if (at::detail::getCUDAHooks().isGPUArch({"gfx1250"})) {
+                // TODO: add constraints based on hipblaslt internals
+                TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
+                           "M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
+                           "Got m=", m, ", n=", n, ", k=", k);
+            }
+  #endif
   }
 #elif (CUDA_VERSION < 12090) && !defined(USE_ROCM)
   // hipblaslt supported row-wise before cublas, and did so their own way (via

diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp
@@ -166,8 +166,8 @@ size_t parseChosenWorkspaceSize() {
     val = c10::utils::get_env("ROCBLAS_WORKSPACE_CONFIG");
   }
   /* 32MiB default, 128MiB for gfx94x/gfx95x */
-  const bool gfx94_95 = at::detail::getCUDAHooks().isGPUArch({"gfx94", "gfx95"});
-  const size_t default_size = gfx94_95 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
+  const bool gfx94_95_125 = at::detail::getCUDAHooks().isGPUArch({"gfx94", "gfx95", "gfx125"});
+  const size_t default_size = gfx94_95_125 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
 #else
   /* :4096:2:16:8 default, 32MiB for Hopper and Blackwell */
   cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();

diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@@ -554,7 +554,10 @@ const std::vector<std::string>& CUDAHooks::getHipblasltPreferredArchs() const {
     "gfx1200", "gfx1201",
 #endif
 #if ROCM_VERSION >= 70000
-    "gfx950"
+    "gfx950",
+#endif
+#if ROCM_VERSION >= 70200
+    "gfx1250"
 #endif
   };
   return archs;

diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh
@@ -231,7 +231,8 @@ C10_LAUNCH_BOUNDS_1(num_threads())
 __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
   using traits = function_traits<func_t>;
   constexpr auto io_size = calc_io_size<func_t>();
-#if defined(USE_ROCM) && defined(__gfx942__)
+  // Extend the TWS (16) to GFX1250.
+#if defined(USE_ROCM) && (defined(__gfx942__) || defined(__gfx1250__))
   // Similar check in launch_vectorized_kernel() as well. Both should be in sync.
   constexpr int tws = 16;
 #else

diff --git a/aten/src/ATen/native/cuda/GroupedBlas.cpp b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@@ -714,14 +714,16 @@ std::optional<c10::ScalarType> out_dtype) {
   bool use_fast_path = false;
   // ifdef USE_ROCM_CK_GEMM is required since ROCm systems w/o CK should not call ck path.
 #if defined(USE_ROCM_CK_GEMM)
-  if (at::globalContext().rocmAllowGroupGemmCk() && at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950", "gfx90a"})) {
+  if (at::globalContext().rocmAllowGroupGemmCk() && at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950", "gfx90a", "gfx1250"})) {
     use_fast_path = true;
   }
 #endif //USE_ROCM_CK_GEMM
   const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
   Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
   if (use_fast_path) {
+#if defined(USE_ROCM_CK_GEMM)
     at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out);
+#endif //USE_ROCM_CK_GEMM
   } else {
     _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
   }

diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -13,7 +13,8 @@
 
 #if ROCM_VERSION < 60400
 __device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
-#if (defined(__gfx942__)) && \
+// `__gfx1250__`-specific `s_wait_loadcnt(0)` path for committed store already there
+#if (defined(__gfx942__) || defined(__gfx1250__)) && \
   __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
   typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
   static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));

diff --git a/aten/src/ATen/native/cuda/MemoryAccess.cuh b/aten/src/ATen/native/cuda/MemoryAccess.cuh
@@ -187,7 +187,8 @@ template <int vec_size, typename scalar_t>
 __device__ aligned_vector<scalar_t, vec_size> load_vector(const scalar_t *base_ptr, uint32_t offset) {
   using vec_t = aligned_vector<scalar_t, vec_size>;
   auto *from = reinterpret_cast<const vec_t *>(base_ptr);
-#if defined(USE_ROCM) && defined(__gfx942__)
+  // Extend the non-temporal load optimization to GFX1250.
+#if defined(USE_ROCM) && (defined(__gfx942__) || defined(__gfx1250__))
   using longx2 = __attribute__((__vector_size__(4*sizeof(int)))) int;
   if constexpr (sizeof(vec_t) == sizeof(int)) {
    union {

diff --git a/aten/src/ATen/native/cuda/RangeFactories.cu b/aten/src/ATen/native/cuda/RangeFactories.cu
@@ -8,6 +8,9 @@
 #include <ATen/native/RangeUtils.h>
 #include <cmath>
 #include <limits>
+#if defined(USE_ROCM)
+#include <algorithm>
+#endif
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -48,12 +51,49 @@ __global__ void elementwise_kernel_with_index(index_t N, func_t f, typename func
   }
 }
 
+#if defined(USE_ROCM)
+// HIP does not support launches with gridDim.x * blockDim.x >= 2^32:
+// depending on the ROCm version the launch returns
+// hipErrorInvalidConfiguration or is accepted silently with the kernel
+// never executing, leaving zero-initialized output. A grid-stride kernel
+// with a fixed grid sized to device occupancy avoids the limit.
+template<typename index_t, typename func_t>
+C10_LAUNCH_BOUNDS_1(num_threads())
+__global__ void elementwise_kernel_with_index_grid_stride(
+    index_t N, func_t f,
+    typename function_traits<func_t>::result_type *data) {
+  index_t idx = static_cast<index_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const index_t stride = static_cast<index_t>(gridDim.x) * blockDim.x;
+  for (; idx < N; idx += stride) {
+    data[idx] = f(idx);
+  }
+}
+#endif
+
 template<typename func_t>
 void gpu_kernel_with_index(at::Tensor &output, func_t f) {
   int64_t N = output.numel();
   if (N == 0) {
     return;
   }
+#if defined(USE_ROCM)
+  constexpr int blocks_per_sm = 4;
+  const int sm_count =
+      at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  const int64_t orig_grid = (N + block_work_size - 1) / block_work_size;
+  int64_t grid = std::min<int64_t>(
+      orig_grid, static_cast<int64_t>(sm_count) * blocks_per_sm);
+  grid = std::max<int64_t>(grid, 1);
+  auto stream = at::cuda::getCurrentCUDAStream();
+  using scalar_t = typename function_traits<func_t>::result_type;
+  if (N <= std::numeric_limits<int>::max()) {
+    elementwise_kernel_with_index_grid_stride<int><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  } else {
+    elementwise_kernel_with_index_grid_stride<int64_t><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+#else
   int64_t grid = (N + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();
   using scalar_t = typename function_traits<func_t>::result_type;
@@ -64,6 +104,7 @@ void gpu_kernel_with_index(at::Tensor &output, func_t f) {
     elementwise_kernel_with_index<int64_t><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
     C10_CUDA_KERNEL_LAUNCH_CHECK();
   }
+#endif
 }
 
 }  // namespace

diff --git a/aten/src/ATen/native/cuda/ScaledBlas.cpp b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@@ -79,6 +79,9 @@ bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=false) {
 #endif
 #if ROCM_VERSION >= 60500
         "gfx950"
+#endif
+#if ROCM_VERSION >= 70200
+        , "gfx1250"
 #endif
     };
     return at::detail::getCUDAHooks().isGPUArch(archs);
@@ -622,8 +625,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) {
 #ifdef USE_ROCM
     #if ROCM_VERSION >= 70000
-    TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
-                "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
+    TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx1250"}),
+                "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950/gfx1250");
 
     int packed_factor = 1;
     if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
@@ -1064,8 +1067,8 @@ _scaled_mxfp8_mxfp8(
 
 #ifdef USE_ROCM
 #if ROCM_VERSION >= 70000
-  TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
-              "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
+  TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx1250"}),
+              "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950/gfx1250");
 
   TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 &&
               mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0,
@@ -1150,8 +1153,8 @@ _scaled_mxfp4_mxfp4(
   auto scaling_choice_b = ScalingType::BlockWise1x32;
 
 #if ROCM_VERSION >= 70000
-  TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
-              "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
+  TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx1250"}),
+              "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950/gfx1250");
 
   TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 &&
               mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0,

diff --git a/aten/src/ATen/native/cuda/int4mm.cu b/aten/src/ATen/native/cuda/int4mm.cu
@@ -127,7 +127,8 @@ inline __host__ __device__ uint32_t getAlignmentRoundUp(const void* p) {
   return diff == 0 ? 0 : uint32_t(Align) - diff;
 }
 
-#if defined (__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
+// CDNA arch with MFMA and Warp-32 support
+#if defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
 #define CDNA2_OR_LATER 1
 #else
 #define CDNA2_OR_LATER 0
@@ -146,6 +147,12 @@ static bool isCDNA2orLater(int index) {
     return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942", "gfx950"}, index);
 }
 
+// Conceptual for now and subject to change
+// gfx1250 (CDNA5 / CDNA-next / UDNA)
+static bool isCDNA5orLater(int index) {
+  return at::detail::getCUDAHooks().isGPUArch({"gfx1250"}, index);
+}
+
 #else
 constexpr int32_t kWarpSize = 32;
 #endif
@@ -1098,6 +1105,11 @@ at::Tensor _weight_int4pack_mm_cuda(
       A.device() == B.device() && A.device() == qScaleAndZeros.device());
 
 #if defined(USE_ROCM)
+  if (isCDNA5orLater(A.device().index())) {
+    TORCH_CHECK(false,
+                "_weight_int4pack_mm_cuda is not yet supported on gfx1250. "
+                "A WMMA-based implementation is required for gfx1250.")
+  }
   if (!isCDNA2orLater(A.device().index())) {
     TORCH_CHECK(false, "_weight_int4pack_mm_cuda is only supported on AMD gpu arch greater than or equal to CDNA2");
   }
@@ -1293,6 +1305,11 @@ at::Tensor _convert_weight_to_int4pack_cuda(
   TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8);
 
 #if defined(USE_ROCM)
+  if (isCDNA5orLater(in.device().index())) {
+    TORCH_CHECK(false,
+                "_convert_weight_to_int4pack_cuda is not yet supported on gfx1250. "
+                "A WMMA-based implementation is required for gfx1250.")
+  }
   if (!isCDNA2orLater(in.device().index())) {
     TORCH_CHECK(false, "_convert_weight_to_int4pack_cuda is only supported on AMD gpu arch greater than or equal to CDNA2");
   }

diff --git a/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp b/aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
@@ -30,7 +30,7 @@ static void initHipSparseLtSupport() {
     // Check only the first available device
     try {
         if (at::cuda::device_count() > 0) {
-            g_hipSparseLtSupported = at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx942"}, 0);
+            g_hipSparseLtSupported = at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx942", "gfx1250"}, 0);
         }
     } catch (const std::exception&) {
         // If an exception occurs during device property check, we assume hipSparseLt is not supported
@@ -49,7 +49,7 @@ static bool isHipSparseLtSupported() {
         TORCH_CHECK(
             false,
             "hipSparseLt not supported on this device, supported architectures: "
-            "gfx950, gfx942. "
+            "gfx1250, gfx950, gfx942. "
             "required ROCM version: 6.4.0 or later.");
     }
     return g_hipSparseLtSupported;