ROCm · rraminen · Jun 15, 2026 · Apr 30, 2026 · May 7, 2026 · May 26, 2026
@@ -163,7 +163,9 @@ case "$tag" in
     ROCM_VERSION=7.2
     TRITON=yes
     KATEX=yes
-    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100"
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1250"
     if [[ $tag =~ "benchmarks" ]]; then
       INDUCTOR_BENCHMARKS=yes
     fi

diff --git a/.ci/pytorch/binary_populate_env.sh b/.ci/pytorch/binary_populate_env.sh
@@ -147,7 +147,7 @@ export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:
 # TODO: We don't need this anymore IIUC
 export TORCH_PACKAGE_NAME='torch'
 
-export USE_FBGEMM=1
+export USE_FBGEMM=0
 export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER"
 export DOCKER_IMAGE="$DOCKER_IMAGE"
 

diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh
@@ -110,6 +110,7 @@ if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
   source /var/lib/jenkins/vulkansdk/setup-env.sh
 fi
 
+# Example BUILD_ENVIRONMENT: linux-noble-rocm-py3.12-gfx1250
 if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
   # hcc used to run out of memory, silently exiting without stopping
   # the build process, leaving undefined symbols in the shared lib,
@@ -119,10 +120,23 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
     export MAX_JOBS=$(($(nproc) - 1))
   fi
 
+  # Logic for multiple architectures based on the discriminator BUILD_ENVIRONMENT
+  # that is set by the workflow YAML and follows a consistent naming pattern.
   if [[ -n "$CI" && -z "$PYTORCH_ROCM_ARCH" ]]; then
-      # Set ROCM_ARCH to gfx906 for CI builds, if user doesn't override.
-      echo "Limiting PYTORCH_ROCM_ARCH to gfx906 for CI builds"
-      export PYTORCH_ROCM_ARCH="gfx906"
+      if [[ "$BUILD_ENVIRONMENT" == *gfx1250* ]]; then
+          echo "Setting PYTORCH_ROCM_ARCH to gfx1250 for CI builds"
+          export PYTORCH_ROCM_ARCH="gfx1250"
+      elif [[ "$BUILD_ENVIRONMENT" == *mi355* ]] || [[ "$BUILD_ENVIRONMENT" == *gfx950* ]]; then
+          echo "Setting PYTORCH_ROCM_ARCH to gfx950 for CI builds"
+          export PYTORCH_ROCM_ARCH="gfx950"
+      elif [[ "$BUILD_ENVIRONMENT" == *mi300* ]] || [[ "$BUILD_ENVIRONMENT" == *gfx942* ]]; then
+          echo "Setting PYTORCH_ROCM_ARCH to gfx942 for CI builds"
+          export PYTORCH_ROCM_ARCH="gfx942"
+      else
+          # Set ROCM_ARCH to gfx906 for CI builds, if user doesn't override.
+          echo "Limiting PYTORCH_ROCM_ARCH to gfx906 for CI builds"
+          export PYTORCH_ROCM_ARCH="gfx906"
+      fi
   fi
 
   # hipify sources

diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml
@@ -62,9 +62,13 @@ self-hosted-runner:
     # gfx942 runners
     - linux.rocm.gpu.gfx942.1
     - linux.rocm.gpu.gfx942.4
+    - linux.rocm.gfx942.docker-cache
     # gfx950 runners
     - linux.rocm.gpu.gfx950.1
     - linux.rocm.gpu.gfx950.4
+    # gfx1250 runners
+    - linux.rocm.gpu.gfx1250.1
+    - linux.rocm.gpu.gfx1250.4
     # Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
     - macos-m1-stable
     - macos-m1-14

diff --git a/.github/workflows/inductor-rocm-gfx1250.yml b/.github/workflows/inductor-rocm-gfx1250.yml
@@ -0,0 +1,77 @@
+# The name of this file is subject to change to stay consistent with other .yml files.
+#
+# The MI355 workflow (.github/workflows/inductor-rocm-mi355.yml) uses:
+# - _linux-build.yml and _rocm-test.yml reusable workflows
+# - Build environment linux-noble-rocm-py3.12-mi355
+# - Runner label linux.rocm.gpu.gfx950.1
+# - Docker image ci-image:pytorch-linux-noble-rocm-n-py3
+# - 2-shard test matrix for the inductor config
+#
+# The GFX1250 equivalent is following this exact pattern.
+
+name: inductor-rocm-gfx1250
+
+on:
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      - ciflow/inductor-rocm-gfx1250/*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+  actions: read
+
+jobs:
+  target-determination:
+    if: github.repository_owner == 'pytorch'
+    name: before-test
+    uses: ./.github/workflows/target_determination.yml
+
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.11
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+      opt_out_experiments: lf
+
+  linux-noble-rocm-py3_12-inductor-build:
+    name: linux-noble-rocm-py3.12-gfx1250
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-noble-rocm-py3.12-gfx1250
+      # Docker image stays the same as MI355 because ROCm image supports multiple arches.
+      docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
+      # Set PYTORCH_ROCM_ARCH directly in the workflow YAML as an env variable,
+      # so build.sh never needs to parse BUILD_ENVIRONMENT.
+      #env-var-script: |
+      #  export PYTORCH_ROCM_ARCH=gfx1250
+      test-matrix: |
+        { include: [
+          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1250.1" },  # It requires provisioning hardware.
+          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1250.1" },
+        ]}
+    secrets: inherit
+
+  linux-noble-rocm-py3_12-inductor-test:
+    name: linux-noble-rocm-py3.12-gfx1250
+    uses: ./.github/workflows/_rocm-test.yml
+    needs: linux-noble-rocm-py3_12-inductor-build
+    with:
+      build-environment: ${{ needs.linux-noble-rocm-py3_12-inductor-build.outputs.build-environment }}
+      docker-image: ${{ needs.linux-noble-rocm-py3_12-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-noble-rocm-py3_12-inductor-build.outputs.test-matrix }}
+    secrets: inherit
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -275,7 +275,7 @@ cmake_dependent_option(USE_CUSPARSELT "Use cuSPARSELt" ON "USE_CUDA" OFF)
 cmake_dependent_option(USE_CUDSS "Use cuDSS" ON "USE_CUDA" OFF)
 # USE_ROCM is guarded against in Dependencies.cmake because USE_ROCM is not properly defined here
 cmake_dependent_option(USE_CUFILE "Use cuFile" ON "USE_CUDA AND NOT WIN32" OFF)
-option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON)
+option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" OFF)
 option(USE_KINETO "Use Kineto profiling library" ON)
 option(USE_CUPTI_SO "Use CUPTI as a shared library" ON)
 option(USE_GFLAGS "Use GFLAGS" OFF)
@@ -955,9 +955,13 @@ cmake_dependent_option(
   OFF)
 
 
+# TODO:
+# MSLK related parts are missing that already exists upstream.
+# gfx1250 for MSLK needs to be involved as well.
+
 IF(USE_ROCM AND ("gfx942" IN_LIST PYTORCH_ROCM_ARCH OR "gfx950" IN_LIST PYTORCH_ROCM_ARCH))
-  message(WARNING "Setting USE_MSLK for gfx942/gfx950 to ON by default, doing ROCM build")
-  set(USE_MSLK_DEFAULT ON)
+  message(WARNING "Setting USE_FBGEMM_GENAI for gfx942/gfx950 to ON by default, doing ROCM build")
+  set(USE_FBGEMM_GENAI_DEFAULT ON)
 elseif(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
   message(STATUS "Setting USE_MSLK to ON by default , doing CUDA build for SM100a")
   set(USE_MSLK_DEFAULT ON)

diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
@@ -268,8 +268,15 @@ if(USE_FLASH_ATTENTION)
         CK_ENABLE_FP64
         CK_ENABLE_FP8
         CK_ENABLE_INT8
-        CK_USE_FNUZ_FP8
-        CK_USE_GFX94
+        #CK_USE_FNUZ_FP8
+        #CK_USE_GFX94
+        CK_USE_GFX1250
+        CK_USE_NATIVE_MX_SUPPORT
+        CK_GFX1250_SUPPORT
+        CK_GFX12_SUPPORT
+        CK_USE_OCP_FP8
+        CK_USE_WMMA
+        CK_USE_WMMA_FP8
         CK_USE_XDL
         __HIP_PLATFORM_AMD__=1
         __HIP_PLATFORM_HCC__=1
@@ -430,10 +437,10 @@ IF(USE_MSLK)
         list(PREPEND MSLK_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
       endif()
 
-    # Only compile for gfx942 and gfx950.
+    # Only compile for gfx942, gfx950, and gfx1250.
     set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
     string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
-    foreach(ARCH gfx942 gfx950)
+    foreach(ARCH gfx942 gfx950 gfx1250)
       if(${ARCH} IN_LIST PYTORCH_ROCM_ARCH)
         list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=${ARCH})
       endif()

diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
@@ -505,7 +505,7 @@ at::BlasBackend Context::blasPreferredBackend() {
 bool Context::ckSupported() {
 #ifdef USE_ROCM
   static const std::vector<std::string> supported_archs = {
-    "gfx90a", "gfx942", "gfx950"
+    "gfx90a", "gfx942", "gfx950", "gfx1250",
   };
   for (auto index : c10::irange(detail::getCUDAHooks().deviceCount())) {
     if(!detail::getCUDAHooks().isGPUArch(supported_archs, index)) {

diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -2019,6 +2019,14 @@ void scaled_gemm(
                            "Got m=", m, ", n=", n, ", k=", k);
             }
   #endif
+  #if ROCM_VERSION >= 70200
+            if (at::detail::getCUDAHooks().isGPUArch({"gfx1250"})) {
+                // TODO: add constraints based on hipblaslt internals
+                TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
+                           "M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
+                           "Got m=", m, ", n=", n, ", k=", k);
+            }
+  #endif
   }
 #elif (CUDA_VERSION < 12090) && !defined(USE_ROCM)
   // hipblaslt supported row-wise before cublas, and did so their own way (via

diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp
@@ -166,8 +166,8 @@ size_t parseChosenWorkspaceSize() {
     val = c10::utils::get_env("ROCBLAS_WORKSPACE_CONFIG");
   }
   /* 32MiB default, 128MiB for gfx94x/gfx95x */
-  const bool gfx94_95 = at::detail::getCUDAHooks().isGPUArch({"gfx94", "gfx95"});
-  const size_t default_size = gfx94_95 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
+  const bool gfx94_95_125 = at::detail::getCUDAHooks().isGPUArch({"gfx94", "gfx95", "gfx125"});
+  const size_t default_size = gfx94_95_125 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
 #else
   /* :4096:2:16:8 default, 32MiB for Hopper and Blackwell */
   cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();

diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@@ -554,7 +554,10 @@ const std::vector<std::string>& CUDAHooks::getHipblasltPreferredArchs() const {
     "gfx1200", "gfx1201",
 #endif
 #if ROCM_VERSION >= 70000
-    "gfx950"
+    "gfx950",
+#endif
+#if ROCM_VERSION >= 70200
+    "gfx1250"
 #endif
   };
   return archs;

diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh
@@ -231,7 +231,8 @@ C10_LAUNCH_BOUNDS_1(num_threads())
 __global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
   using traits = function_traits<func_t>;
   constexpr auto io_size = calc_io_size<func_t>();
-#if defined(USE_ROCM) && defined(__gfx942__)
+  // Extend the TWS (16) to GFX1250.
+#if defined(USE_ROCM) && (defined(__gfx942__) || defined(__gfx1250__))
   // Similar check in launch_vectorized_kernel() as well. Both should be in sync.
   constexpr int tws = 16;
 #else

diff --git a/aten/src/ATen/native/cuda/GroupedBlas.cpp b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@@ -714,7 +714,7 @@ std::optional<c10::ScalarType> out_dtype) {
   bool use_fast_path = false;
   // ifdef USE_ROCM_CK_GEMM is required since ROCm systems w/o CK should not call ck path.
 #if defined(USE_ROCM_CK_GEMM)
-  if (at::globalContext().rocmAllowGroupGemmCk() && at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950", "gfx90a"})) {
+  if (at::globalContext().rocmAllowGroupGemmCk() && at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950", "gfx90a", "gfx1250"})) {
     use_fast_path = true;
   }
 #endif //USE_ROCM_CK_GEMM

diff --git a/aten/src/ATen/native/cuda/KernelUtils.cuh b/aten/src/ATen/native/cuda/KernelUtils.cuh
@@ -13,7 +13,8 @@
 
 #if ROCM_VERSION < 60400
 __device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
-#if (defined(__gfx942__)) && \
+// `__gfx1250__`-specific `s_wait_loadcnt(0)` path for committed store already there
+#if (defined(__gfx942__) || defined(__gfx1250__)) && \
   __has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
   typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
   static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));

diff --git a/aten/src/ATen/native/cuda/MemoryAccess.cuh b/aten/src/ATen/native/cuda/MemoryAccess.cuh
@@ -187,7 +187,8 @@ template <int vec_size, typename scalar_t>
 __device__ aligned_vector<scalar_t, vec_size> load_vector(const scalar_t *base_ptr, uint32_t offset) {
   using vec_t = aligned_vector<scalar_t, vec_size>;
   auto *from = reinterpret_cast<const vec_t *>(base_ptr);
-#if defined(USE_ROCM) && defined(__gfx942__)
+  // Extend the non-temporal load optimization to GFX1250.
+#if defined(USE_ROCM) && (defined(__gfx942__) || defined(__gfx1250__))
   using longx2 = __attribute__((__vector_size__(4*sizeof(int)))) int;
   if constexpr (sizeof(vec_t) == sizeof(int)) {
    union {

diff --git a/aten/src/ATen/native/cuda/RangeFactories.cu b/aten/src/ATen/native/cuda/RangeFactories.cu
@@ -8,6 +8,9 @@
 #include <ATen/native/RangeUtils.h>
 #include <cmath>
 #include <limits>
+#if defined(USE_ROCM)
+#include <algorithm>
+#endif
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -48,12 +51,49 @@ __global__ void elementwise_kernel_with_index(index_t N, func_t f, typename func
   }
 }
 
+#if defined(USE_ROCM)
+// HIP does not support launches with gridDim.x * blockDim.x >= 2^32:
+// depending on the ROCm version the launch returns
+// hipErrorInvalidConfiguration or is accepted silently with the kernel
+// never executing, leaving zero-initialized output. A grid-stride kernel
+// with a fixed grid sized to device occupancy avoids the limit.
+template<typename index_t, typename func_t>
+C10_LAUNCH_BOUNDS_1(num_threads())
+__global__ void elementwise_kernel_with_index_grid_stride(
+    index_t N, func_t f,
+    typename function_traits<func_t>::result_type *data) {
+  index_t idx = static_cast<index_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  const index_t stride = static_cast<index_t>(gridDim.x) * blockDim.x;
+  for (; idx < N; idx += stride) {
+    data[idx] = f(idx);
+  }
+}
+#endif
+
 template<typename func_t>
 void gpu_kernel_with_index(at::Tensor &output, func_t f) {
   int64_t N = output.numel();
   if (N == 0) {
     return;
   }
+#if defined(USE_ROCM)
+  constexpr int blocks_per_sm = 4;
+  const int sm_count =
+      at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  const int64_t orig_grid = (N + block_work_size - 1) / block_work_size;
+  int64_t grid = std::min<int64_t>(
+      orig_grid, static_cast<int64_t>(sm_count) * blocks_per_sm);
+  grid = std::max<int64_t>(grid, 1);
+  auto stream = at::cuda::getCurrentCUDAStream();
+  using scalar_t = typename function_traits<func_t>::result_type;
+  if (N <= std::numeric_limits<int>::max()) {
+    elementwise_kernel_with_index_grid_stride<int><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  } else {
+    elementwise_kernel_with_index_grid_stride<int64_t><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+  }
+#else
   int64_t grid = (N + block_work_size - 1) / block_work_size;
   auto stream = at::cuda::getCurrentCUDAStream();
   using scalar_t = typename function_traits<func_t>::result_type;
@@ -64,6 +104,7 @@ void gpu_kernel_with_index(at::Tensor &output, func_t f) {
     elementwise_kernel_with_index<int64_t><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
     C10_CUDA_KERNEL_LAUNCH_CHECK();
   }
+#endif
 }
 
 }  // namespace

diff --git a/aten/src/ATen/native/cuda/ScaledBlas.cpp b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@@ -79,6 +79,9 @@ bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=false) {
 #endif
 #if ROCM_VERSION >= 60500
         "gfx950"
+#endif
+#if ROCM_VERSION >= 70200
+        , "gfx1250"
 #endif
     };
     return at::detail::getCUDAHooks().isGPUArch(archs);
@@ -622,8 +625,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
   else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) {
 #ifdef USE_ROCM
     #if ROCM_VERSION >= 70000
-    TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
-                "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
+    TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx1250"}),
+                "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950/gfx1250");
 
     int packed_factor = 1;
     if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
@@ -1064,8 +1067,8 @@ _scaled_mxfp8_mxfp8(
 
 #ifdef USE_ROCM
 #if ROCM_VERSION >= 70000
-  TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
-              "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
+  TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx1250"}),
+              "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950/gfx1250");
 
   TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 &&
               mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0,
@@ -1150,8 +1153,8 @@ _scaled_mxfp4_mxfp4(
   auto scaling_choice_b = ScalingType::BlockWise1x32;
 
 #if ROCM_VERSION >= 70000
-  TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
-              "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
+  TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx1250"}),
+              "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950/gfx1250");
 
   TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 &&
               mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0,