Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/docker/build.sh

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes are unnecessary unless we know for certain any build workflows that would use it. TheRock build workflows don't.

Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ case "$tag" in
ROCM_VERSION=7.2
TRITON=yes
KATEX=yes
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100"
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100;gfx1250"
if [[ $tag =~ "benchmarks" ]]; then
INDUCTOR_BENCHMARKS=yes
fi
Expand Down
20 changes: 17 additions & 3 deletions .ci/pytorch/build.sh

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes are unnecessary unless we know for certain any build workflows that would use it. TheRock build workflows don't.

Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ if [[ "$BUILD_ENVIRONMENT" == *vulkan* ]]; then
source /var/lib/jenkins/vulkansdk/setup-env.sh
fi

# Example BUILD_ENVIRONMENT: linux-noble-rocm-py3.12-gfx1250
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
# hcc used to run out of memory, silently exiting without stopping
# the build process, leaving undefined symbols in the shared lib,
Expand All @@ -119,10 +120,23 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
export MAX_JOBS=$(($(nproc) - 1))
fi

# Logic for multiple architectures based on the discriminator BUILD_ENVIRONMENT
# that is set by the workflow YAML and follows a consistent naming pattern.
if [[ -n "$CI" && -z "$PYTORCH_ROCM_ARCH" ]]; then
# Set ROCM_ARCH to gfx906 for CI builds, if user doesn't override.
echo "Limiting PYTORCH_ROCM_ARCH to gfx906 for CI builds"
export PYTORCH_ROCM_ARCH="gfx906"
if [[ "$BUILD_ENVIRONMENT" == *gfx1250* ]]; then
echo "Setting PYTORCH_ROCM_ARCH to gfx1250 for CI builds"
export PYTORCH_ROCM_ARCH="gfx1250"
elif [[ "$BUILD_ENVIRONMENT" == *mi355* ]] || [[ "$BUILD_ENVIRONMENT" == *gfx950* ]]; then
echo "Setting PYTORCH_ROCM_ARCH to gfx950 for CI builds"
export PYTORCH_ROCM_ARCH="gfx950"
elif [[ "$BUILD_ENVIRONMENT" == *mi300* ]] || [[ "$BUILD_ENVIRONMENT" == *gfx942* ]]; then
echo "Setting PYTORCH_ROCM_ARCH to gfx942 for CI builds"
export PYTORCH_ROCM_ARCH="gfx942"
else
# Set ROCM_ARCH to gfx906 for CI builds, if user doesn't override.
echo "Limiting PYTORCH_ROCM_ARCH to gfx906 for CI builds"
export PYTORCH_ROCM_ARCH="gfx906"
fi
fi

# hipify sources
Expand Down
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -955,8 +955,12 @@ cmake_dependent_option(
OFF)


# TODO:
# MSLK related parts are missing that already exists upstream.
# gfx1250 for MSLK needs to be involved as well.

IF(USE_ROCM AND ("gfx942" IN_LIST PYTORCH_ROCM_ARCH OR "gfx950" IN_LIST PYTORCH_ROCM_ARCH))
message(WARNING "Setting USE_MSLK for gfx942/gfx950 to ON by default, doing ROCM build")
message(STATUS "Setting USE_MSLK for gfx942/gfx950 to ON by default, doing ROCM build")
set(USE_MSLK_DEFAULT ON)
elseif(USE_CUDA AND "$ENV{TORCH_CUDA_ARCH_LIST}" MATCHES "10.0" AND CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8 AND NOT WIN32)
message(STATUS "Setting USE_MSLK to ON by default , doing CUDA build for SM100a")
Expand Down
13 changes: 10 additions & 3 deletions aten/src/ATen/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,15 @@ if(USE_FLASH_ATTENTION)
CK_ENABLE_FP64
CK_ENABLE_FP8
CK_ENABLE_INT8
CK_USE_FNUZ_FP8
CK_USE_GFX94
#CK_USE_FNUZ_FP8
#CK_USE_GFX94
CK_USE_GFX1250

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here and below change the CK SDPA compile definitions globally.
Are CK/AITER artifacts for GFX1250 actually ready and validated?

CK_USE_NATIVE_MX_SUPPORT
CK_GFX1250_SUPPORT
CK_GFX12_SUPPORT
CK_USE_OCP_FP8
CK_USE_WMMA
CK_USE_WMMA_FP8
CK_USE_XDL
__HIP_PLATFORM_AMD__=1
__HIP_PLATFORM_HCC__=1
Expand Down Expand Up @@ -430,7 +437,7 @@ IF(USE_MSLK)
list(PREPEND MSLK_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
endif()

# Only compile for gfx942 and gfx950.
# Only compile for gfx942 and gfx950 (composable_kernel lacks gfx1250 support).
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
foreach(ARCH gfx942 gfx950)
Expand Down
2 changes: 1 addition & 1 deletion aten/src/ATen/Context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ at::BlasBackend Context::blasPreferredBackend() {
bool Context::ckSupported() {
#ifdef USE_ROCM
static const std::vector<std::string> supported_archs = {
"gfx90a", "gfx942", "gfx950"
"gfx942", "gfx950", "gfx1250",
};
for (auto index : c10::irange(detail::getCUDAHooks().deviceCount())) {
if(!detail::getCUDAHooks().isGPUArch(supported_archs, index)) {
Expand Down
8 changes: 8 additions & 0 deletions aten/src/ATen/cuda/CUDABlas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2019,6 +2019,14 @@ void scaled_gemm(
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
#if ROCM_VERSION >= 70200
if (at::detail::getCUDAHooks().isGPUArch({"gfx1250"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
"M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
}
#elif (CUDA_VERSION < 12090) && !defined(USE_ROCM)
// hipblaslt supported row-wise before cublas, and did so their own way (via
Expand Down
4 changes: 2 additions & 2 deletions aten/src/ATen/cuda/CublasHandlePool.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -166,8 +166,8 @@ size_t parseChosenWorkspaceSize() {
val = c10::utils::get_env("ROCBLAS_WORKSPACE_CONFIG");
}
/* 32MiB default, 128MiB for gfx94x/gfx95x */
const bool gfx94_95 = at::detail::getCUDAHooks().isGPUArch({"gfx94", "gfx95"});
const size_t default_size = gfx94_95 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
const bool gfx94_95_125 = at::detail::getCUDAHooks().isGPUArch({"gfx94", "gfx95", "gfx125"});
const size_t default_size = gfx94_95_125 ? 1024 * 128 * 1024 : 1024 * 32 * 1024;
#else
/* :4096:2:16:8 default, 32MiB for Hopper and Blackwell */
cudaDeviceProp* properties = at::cuda::getCurrentDeviceProperties();
Expand Down
5 changes: 4 additions & 1 deletion aten/src/ATen/cuda/detail/CUDAHooks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,10 @@ const std::vector<std::string>& CUDAHooks::getHipblasltPreferredArchs() const {
"gfx1200", "gfx1201",
#endif
#if ROCM_VERSION >= 70000
"gfx950"
"gfx950",
#endif
#if ROCM_VERSION >= 70200
"gfx1250"
#endif
};
return archs;
Expand Down
3 changes: 2 additions & 1 deletion aten/src/ATen/native/cuda/CUDALoops.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,8 @@ C10_LAUNCH_BOUNDS_1(num_threads())
__global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
using traits = function_traits<func_t>;
constexpr auto io_size = calc_io_size<func_t>();
#if defined(USE_ROCM) && defined(__gfx942__)
// Extend the TWS (16) to GFX1250.
#if defined(USE_ROCM) && (defined(__gfx942__) || defined(__gfx1250__))
// Similar check in launch_vectorized_kernel() as well. Both should be in sync.
constexpr int tws = 16;
#else
Expand Down
4 changes: 3 additions & 1 deletion aten/src/ATen/native/cuda/GroupedBlas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -714,14 +714,16 @@ std::optional<c10::ScalarType> out_dtype) {
bool use_fast_path = false;
// ifdef USE_ROCM_CK_GEMM is required since ROCm systems w/o CK should not call ck path.
#if defined(USE_ROCM_CK_GEMM)
if (at::globalContext().rocmAllowGroupGemmCk() && at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950", "gfx90a"})) {
if (at::globalContext().rocmAllowGroupGemmCk() && at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950", "gfx90a", "gfx1250"})) {

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it true that the existing CK grouped GEMM path is the Wave64/MFMA/XDL path used gfx90a/gfx942/gfx950? If so, because gfx1250 is Wave32 and WMMA/SWMMAC-oriented, it may not be routed into this path by arch-name allowlisting.

use_fast_path = true;
}
#endif //USE_ROCM_CK_GEMM
const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
if (use_fast_path) {
#if defined(USE_ROCM_CK_GEMM)
at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out);
#endif //USE_ROCM_CK_GEMM
} else {
_grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
}
Expand Down
3 changes: 2 additions & 1 deletion aten/src/ATen/native/cuda/KernelUtils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

#if ROCM_VERSION < 60400
__device__ inline __hip_bfloat162 preview_unsafeAtomicAdd(__hip_bfloat162* address, __hip_bfloat162 value) {
#if (defined(__gfx942__)) && \
// `__gfx1250__`-specific `s_wait_loadcnt(0)` path for committed store already there
#if (defined(__gfx942__) || defined(__gfx1250__)) && \

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this change matter now, if the outer condition is #if ROCM_VERSION < 60400?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in #3347

__has_builtin(__builtin_amdgcn_flat_atomic_fadd_v2bf16)
typedef unsigned short __attribute__((ext_vector_type(2))) vec_short2;
static_assert(sizeof(vec_short2) == sizeof(__hip_bfloat162_raw));
Expand Down
3 changes: 2 additions & 1 deletion aten/src/ATen/native/cuda/MemoryAccess.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,8 @@ template <int vec_size, typename scalar_t>
__device__ aligned_vector<scalar_t, vec_size> load_vector(const scalar_t *base_ptr, uint32_t offset) {
using vec_t = aligned_vector<scalar_t, vec_size>;
auto *from = reinterpret_cast<const vec_t *>(base_ptr);
#if defined(USE_ROCM) && defined(__gfx942__)
// Extend the non-temporal load optimization to GFX1250.
#if defined(USE_ROCM) && (defined(__gfx942__) || defined(__gfx1250__))

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simply extending to gfx1250, would this be another Wave64-tuned path being applied to Wave32 hardware?

using longx2 = __attribute__((__vector_size__(4*sizeof(int)))) int;
if constexpr (sizeof(vec_t) == sizeof(int)) {
union {
Expand Down
41 changes: 41 additions & 0 deletions aten/src/ATen/native/cuda/RangeFactories.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
#include <ATen/native/RangeUtils.h>
#include <cmath>
#include <limits>
#if defined(USE_ROCM)
#include <algorithm>
#endif

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
Expand Down Expand Up @@ -48,12 +51,49 @@ __global__ void elementwise_kernel_with_index(index_t N, func_t f, typename func
}
}

#if defined(USE_ROCM)
// HIP does not support launches with gridDim.x * blockDim.x >= 2^32:
// depending on the ROCm version the launch returns
// hipErrorInvalidConfiguration or is accepted silently with the kernel
// never executing, leaving zero-initialized output. A grid-stride kernel
// with a fixed grid sized to device occupancy avoids the limit.
template<typename index_t, typename func_t>
C10_LAUNCH_BOUNDS_1(num_threads())
__global__ void elementwise_kernel_with_index_grid_stride(
index_t N, func_t f,
typename function_traits<func_t>::result_type *data) {
index_t idx = static_cast<index_t>(blockIdx.x) * blockDim.x + threadIdx.x;
const index_t stride = static_cast<index_t>(gridDim.x) * blockDim.x;
for (; idx < N; idx += stride) {
data[idx] = f(idx);
}
}
#endif

template<typename func_t>
void gpu_kernel_with_index(at::Tensor &output, func_t f) {
int64_t N = output.numel();
if (N == 0) {
return;
}
#if defined(USE_ROCM)
constexpr int blocks_per_sm = 4;
const int sm_count =
at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
const int64_t orig_grid = (N + block_work_size - 1) / block_work_size;
int64_t grid = std::min<int64_t>(
orig_grid, static_cast<int64_t>(sm_count) * blocks_per_sm);
grid = std::max<int64_t>(grid, 1);
auto stream = at::cuda::getCurrentCUDAStream();
using scalar_t = typename function_traits<func_t>::result_type;
if (N <= std::numeric_limits<int>::max()) {
elementwise_kernel_with_index_grid_stride<int><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
elementwise_kernel_with_index_grid_stride<int64_t><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
#else
int64_t grid = (N + block_work_size - 1) / block_work_size;
auto stream = at::cuda::getCurrentCUDAStream();
using scalar_t = typename function_traits<func_t>::result_type;
Expand All @@ -64,6 +104,7 @@ void gpu_kernel_with_index(at::Tensor &output, func_t f) {
elementwise_kernel_with_index<int64_t><<<grid, num_threads(), 0, stream>>>(N, f, output.mutable_data_ptr<scalar_t>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
#endif
}

} // namespace
Expand Down
15 changes: 9 additions & 6 deletions aten/src/ATen/native/cuda/ScaledBlas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ bool _scaled_mm_allowed_device(bool sm90_only=false, bool sm100_only=false) {
#endif
#if ROCM_VERSION >= 60500
"gfx950"
#endif
#if ROCM_VERSION >= 70200
, "gfx1250"
#endif
};
return at::detail::getCUDAHooks().isGPUArch(archs);
Expand Down Expand Up @@ -622,8 +625,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) {
#ifdef USE_ROCM
#if ROCM_VERSION >= 70000
TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx1250"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950/gfx1250");

int packed_factor = 1;
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
Expand Down Expand Up @@ -1064,8 +1067,8 @@ _scaled_mxfp8_mxfp8(

#ifdef USE_ROCM
#if ROCM_VERSION >= 70000
TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx1250"}),

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Above _scaled_mm_allowed_device() (line ~82) gates gfx1250 at >= 70200. So So on ROCm 7.0/7.1 the device is rejected by _scaled_mm_allowed_device yet these inner checks would have admitted it.

How about nesting #if ROCM_VERSION >= 70200 inside each isGPUArch({...})?

"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950/gfx1250");

TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 &&
mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0,
Expand Down Expand Up @@ -1150,8 +1153,8 @@ _scaled_mxfp4_mxfp4(
auto scaling_choice_b = ScalingType::BlockWise1x32;

#if ROCM_VERSION >= 70000
TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
TORCH_CHECK_NOT_IMPLEMENTED(at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx1250"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950/gfx1250");

TORCH_CHECK_VALUE(mat_a.size(0) % 32 == 0 && mat_a.size(1) % 32 == 0 &&
mat_b.size(0) % 32 == 0 && mat_b.size(1) % 32 == 0,
Expand Down
19 changes: 18 additions & 1 deletion aten/src/ATen/native/cuda/int4mm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ inline __host__ __device__ uint32_t getAlignmentRoundUp(const void* p) {
return diff == 0 ? 0 : uint32_t(Align) - diff;
}

#if defined (__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
// CDNA arch with MFMA and Warp-32 support
#if defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
#define CDNA2_OR_LATER 1
#else
#define CDNA2_OR_LATER 0
Expand All @@ -146,6 +147,12 @@ static bool isCDNA2orLater(int index) {
return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942", "gfx950"}, index);
}

// Conceptual for now and subject to change
// gfx1250 (CDNA5 / CDNA-next / UDNA)
static bool isCDNA5orLater(int index) {
return at::detail::getCUDAHooks().isGPUArch({"gfx1250"}, index);
}

#else
constexpr int32_t kWarpSize = 32;
#endif
Expand Down Expand Up @@ -1098,6 +1105,11 @@ at::Tensor _weight_int4pack_mm_cuda(
A.device() == B.device() && A.device() == qScaleAndZeros.device());

#if defined(USE_ROCM)
if (isCDNA5orLater(A.device().index())) {
TORCH_CHECK(false,
"_weight_int4pack_mm_cuda is not yet supported on gfx1250. "
"A WMMA-based implementation is required for gfx1250.")
}
if (!isCDNA2orLater(A.device().index())) {
TORCH_CHECK(false, "_weight_int4pack_mm_cuda is only supported on AMD gpu arch greater than or equal to CDNA2");
}
Expand Down Expand Up @@ -1293,6 +1305,11 @@ at::Tensor _convert_weight_to_int4pack_cuda(
TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8);

#if defined(USE_ROCM)
if (isCDNA5orLater(in.device().index())) {
TORCH_CHECK(false,
"_convert_weight_to_int4pack_cuda is not yet supported on gfx1250. "
"A WMMA-based implementation is required for gfx1250.")
}
if (!isCDNA2orLater(in.device().index())) {
TORCH_CHECK(false, "_convert_weight_to_int4pack_cuda is only supported on AMD gpu arch greater than or equal to CDNA2");
}
Expand Down
4 changes: 2 additions & 2 deletions aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ static void initHipSparseLtSupport() {
// Check only the first available device
try {
if (at::cuda::device_count() > 0) {
g_hipSparseLtSupported = at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx942"}, 0);
g_hipSparseLtSupported = at::detail::getCUDAHooks().isGPUArch({"gfx950", "gfx942", "gfx1250"}, 0);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we confirm whether hipSparseLT requires ROCm 7.2+?
gfx1250 is advertised unconditionally here, which might fail deeper.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, hipsparselt actually requires ROCm >=7.12. PR is in progress pytorch#178737

}
} catch (const std::exception&) {
// If an exception occurs during device property check, we assume hipSparseLt is not supported
Expand All @@ -49,7 +49,7 @@ static bool isHipSparseLtSupported() {
TORCH_CHECK(
false,
"hipSparseLt not supported on this device, supported architectures: "
"gfx950, gfx942. "
"gfx1250, gfx950, gfx942. "
"required ROCM version: 6.4.0 or later.");
}
return g_hipSparseLtSupported;
Expand Down
Loading