From 9e783f88a9ebe1f8e2f861b64b5f0b9b6ea4ce7e Mon Sep 17 00:00:00 2001 From: tianyuzhou668 <2431054748@qq.com> Date: Fri, 21 Nov 2025 17:03:20 +0800 Subject: [PATCH 1/7] [ILUVATAR_GPU] Support eig_kernel --- Paddle | 2 +- backends/iluvatar_gpu/CMakeLists.txt | 5 +- .../iluvatar_gpu/cmake/external/magma.cmake | 89 +++ .../kernels/cuda_kernels/eig_grad_kernel.cu | 519 ++++++++++++++++++ .../kernels/cuda_kernels/eig_kernel.cu | 123 +++++ .../iluvatar_gpu/runtime/iluvatar_context.h | 28 + .../tests/unittests/test_eig_op_iluvatar.py | 391 +++++++++++++ .../unittests/test_linalg_eig_op_iluvatar.py | 42 ++ 8 files changed, 1197 insertions(+), 2 deletions(-) create mode 100644 backends/iluvatar_gpu/cmake/external/magma.cmake create mode 100644 backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu create mode 100644 backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu create mode 100644 backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py create mode 100644 backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py diff --git a/Paddle b/Paddle index 93a5410253b..86238399d4c 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 93a5410253bf2ca0945f4551e1a58ad7a5aec996 +Subproject commit 86238399d4c720a0dccc07d416ece8168225d757 diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt index 1b65161112d..827d5553c01 100644 --- a/backends/iluvatar_gpu/CMakeLists.txt +++ b/backends/iluvatar_gpu/CMakeLists.txt @@ -34,6 +34,7 @@ include(version) include(generic) include(cblas) include(external/eigen) +include(external/magma) include(external/xxhash) include(external/zlib) include(external/protobuf) @@ -119,11 +120,13 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/*.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/*.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/eigen/*.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/magma/magma_function.cc # cudnn/cublas ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cudnn.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublas.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublasLt.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/magma.cc # kernels/gpu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_kernel.cu @@ -1008,7 +1011,7 @@ target_link_libraries( ixattnbkd nccl # change nccl to ${FLAGCX_LIB} if compiling with FlagCX ${FLAGCX_LIB} -) + magma) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) diff --git a/backends/iluvatar_gpu/cmake/external/magma.cmake b/backends/iluvatar_gpu/cmake/external/magma.cmake new file mode 100644 index 00000000000..612ea1b600e --- /dev/null +++ b/backends/iluvatar_gpu/cmake/external/magma.cmake @@ -0,0 +1,89 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +set(MAGMA_PREFIX_DIR ${THIRD_PARTY_PATH}/magma) +set(MAGMA_DOWNLOAD_DIR + ${PADDLE_SOURCE_DIR}/third_party/magma/${CMAKE_SYSTEM_NAME}) +set(MAGMA_INSTALL_DIR ${THIRD_PARTY_PATH}/install/magma) +set(MAGMA_LIB_DIR ${MAGMA_INSTALL_DIR}/lib) + +# Note(zhouwei): magma need fortran compiler which many machines don't have, so +# use precompiled library. use magma tag v2.9.0 on 07/28/2025 +# https://github.com/icl-utk-edu/magma/tree/v2.9.0 +if(LINUX) + set(MAGMA_FILE + "magma_local.tar.gz" + CACHE STRING "" FORCE) + set(MAGMA_URL + "file:///home/tianyu.zhou/tyzhou/magma_local.tar.gz" + CACHE STRING "" FORCE) + set(MAGMA_URL_MD5 9715dfad9eb073e099f46feb6587232d) + set(MAGMA_LIB "${MAGMA_LIB_DIR}/libmagma.so") +elseif(WIN32) + message("magma do not support windows yet, skip ...") +else() # MacOS + message("magma do not support macos or other platform yet, skip ...") +endif() + +function(download_magma) + message( + STATUS "Downloading ${MAGMA_URL} to ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}") + # NOTE: If the version is updated, consider emptying the folder; maybe add + # timeout + file( + DOWNLOAD ${MAGMA_URL} ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE} + EXPECTED_MD5 ${MAGMA_URL_MD5} + STATUS ERR) + if(ERR EQUAL 0) + message(STATUS "Download ${MAGMA_FILE} success") + else() + message( + FATAL_ERROR + "Download failed, error: ${ERR}\n You can try downloading ${MAGMA_FILE} again" + ) + endif() +endfunction() + +# Download and check magma. +if(EXISTS ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}) + file(MD5 ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE} MAGMA_MD5) + if(NOT MAGMA_MD5 STREQUAL MAGMA_URL_MD5) + # clean build file + file(REMOVE_RECURSE ${MAGMA_PREFIX_DIR}) + file(REMOVE_RECURSE ${MAGMA_INSTALL_DIR}) + download_magma() + endif() +else() + download_magma() +endif() + +ExternalProject_Add( + extern_magma + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE} + URL_MD5 ${MAGMA_URL_MD5} + DOWNLOAD_DIR ${MAGMA_DOWNLOAD_DIR} + SOURCE_DIR ${MAGMA_LIB_DIR} + PREFIX ${MAGMA_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + PATCH_COMMAND "" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + BUILD_BYPRODUCTS ${MAGMA_LIB}) + +add_definitions(-DPADDLE_WITH_MAGMA) diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu new file mode 100644 index 00000000000..7748e32cd60 --- /dev/null +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu @@ -0,0 +1,519 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/dynload/cublas.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/cpu/eig.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/transpose_kernel.h" +#include "runtime/iluvatar_context.h" + +namespace phi { + +template +void SolveLinearSystemGPU(const GPUContext& dev_ctx, + const T* matrix_data, + const T* rhs_data, + T* out_data, + int order, + int rhs_cols, + int batch_count); + +template <> +void SolveLinearSystemGPU>( + const phi::GPUContext& dev_ctx, + const phi::dtype::complex* + matrix_data, // device ptr, row-major, size batch*order*order + const phi::dtype::complex* + rhs_data, // device ptr, row-major, size batch*order*rhs_cols + phi::dtype::complex* + out_data, // device ptr, row-major, size batch*order*rhs_cols + int order, + int rhs_cols, + int batch_count) { + // handles + cublasHandle_t cublas_handle = dev_ctx.cublas_handle(); + // cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle(); + cusolverDnHandle_t cusolver_handle = GetSolverHandle(dev_ctx.stream()); + + auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + + // cuComplex constants + const cuComplex kAlpha = make_cuFloatComplex(1.0f, 0.0f); + const cuComplex kZero = make_cuFloatComplex(0.0f, 0.0f); + + // Sizes + const size_t A_one_bytes = + static_cast(order) * order * sizeof(cuComplex); + const size_t B_one_bytes = + static_cast(order) * rhs_cols * sizeof(cuComplex); + const size_t A_batch_bytes = A_one_bytes * batch_count; + const size_t B_batch_bytes = B_one_bytes * batch_count; + + const cuComplex* A_row_all = reinterpret_cast(matrix_data); + const cuComplex* B_row_all = reinterpret_cast(rhs_data); + cuComplex* X_row_all = reinterpret_cast(out_data); + + auto dA_col_alloc = + phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + auto dB_col_alloc = + phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + cuComplex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); + cuComplex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); + + auto d_pivots_alloc = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + static_cast(batch_count) * order * sizeof(int), + stream); + int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); + + auto d_info_alloc = + phi::memory_utils::Alloc(dev_ctx.GetPlace(), + static_cast(batch_count) * sizeof(int), + stream); + int* d_info = reinterpret_cast(d_info_alloc->ptr()); + + // A_row layout: row-major (order x order), B_row layout: row-major (order + // x rhs_cols) + for (int i = 0; i < batch_count; ++i) { + const cuComplex* A_row = A_row_all + static_cast(i) * order * order; + cuComplex* A_col = dA_col + static_cast(i) * order * order; + const cuComplex* B_row = + B_row_all + static_cast(i) * order * rhs_cols; + cuComplex* B_col = dB_col + static_cast(i) * order * rhs_cols; + + // transpose A_row (row-major) -> A_col (column-major) via C = A^T + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam( + cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + order, + order, + &kAlpha, + A_row, + order, // lda: when interpreting A_row as (order x order) row-major, + // using order + &kZero, + nullptr, + order, + A_col, + order)); // ldc = order (column-major leading dim) + + // transpose B_row (row-major order x rhs_cols) -> B_col (column-major order + // x rhs_cols) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam( + cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + order, + rhs_cols, + &kAlpha, + B_row, + rhs_cols, // lda when A_row is viewed row-major: leading = rhs_cols + &kZero, + nullptr, + rhs_cols, + B_col, + order)); // ldc = order + } + + int lwork = 0; + cuComplex* dA_col0 = dA_col; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrf_bufferSize( + cusolver_handle, order, order, dA_col0, order, &lwork)); + + size_t work_bytes = static_cast(lwork) * sizeof(cuComplex); + auto d_work_alloc = + phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); + cuComplex* d_work = reinterpret_cast(d_work_alloc->ptr()); + + for (int i = 0; i < batch_count; ++i) { + cuComplex* A_col = dA_col + static_cast(i) * order * order; + cuComplex* B_col = dB_col + static_cast(i) * order * rhs_cols; + int* pivots_i = d_pivots + static_cast(i) * order; + int* info_i = d_info + i; + + // getrf (LU factorization) on A_col (column-major) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrf( + cusolver_handle, order, order, A_col, order, d_work, pivots_i, info_i)); + + // getrs: solve A_col * X_col = B_col + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrs( + cusolver_handle, + CUBLAS_OP_N, // no transpose on column-major matrix + order, + rhs_cols, + A_col, + order, + pivots_i, + B_col, + order, + info_i)); + } + + for (int i = 0; i < batch_count; ++i) { + cuComplex* B_col = dB_col + static_cast(i) * order * + rhs_cols; // X in column-major + cuComplex* X_row = X_row_all + static_cast(i) * order * + rhs_cols; // target row-major + + // transpose X_col -> X_row + // We use C = A^T : A has shape (order x rhs_cols) in column-major, so C + // will be (rhs_cols x order), but we want X_row with shape (order x + // rhs_cols) in row-major; calling cublasCgeam with op=T and adjusted dims + // works: + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam( + cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + rhs_cols, + order, // rowsC = rhs_cols, colsC = order + &kAlpha, + B_col, + order, // B_col lda = order (col-major) + &kZero, + nullptr, + order, + X_row, + rhs_cols)); // X_row ldc = rhs_cols (row-major leading dimension) + } + + std::vector h_info(batch_count, 0); + phi::memory_utils::Copy(phi::CPUPlace(), + h_info.data(), + dev_ctx.GetPlace(), + d_info, + static_cast(batch_count) * sizeof(int), + reinterpret_cast(dev_ctx.stream())); + dev_ctx.Wait(); + + for (int i = 0; i < batch_count; ++i) { + PADDLE_ENFORCE_EQ( + h_info[i], + 0, + errors::External( + "cuSOLVER getrf/getrs failed at batch %d, info: %d", i, h_info[i])); + } +} + +template <> +void SolveLinearSystemGPU>( + const phi::GPUContext& dev_ctx, + const phi::dtype::complex* + matrix_data, // device ptr, row-major, size batch*order*order + const phi::dtype::complex* + rhs_data, // device ptr, row-major, size batch*order*rhs_cols + phi::dtype::complex* + out_data, // device ptr, row-major, size batch*order*rhs_cols + int order, + int rhs_cols, + int batch_count) { + // handles + cublasHandle_t cublas_handle = dev_ctx.cublas_handle(); + // cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle(); + cusolverDnHandle_t cusolver_handle = GetSolverHandle(dev_ctx.stream()); + + auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); + + // cuDoubleComplex constants + const cuDoubleComplex kAlpha = make_cuDoubleComplex(1.0f, 0.0f); + const cuDoubleComplex kZero = make_cuDoubleComplex(0.0f, 0.0f); + + // Sizes + const size_t A_one_bytes = + static_cast(order) * order * sizeof(cuDoubleComplex); + const size_t B_one_bytes = + static_cast(order) * rhs_cols * sizeof(cuDoubleComplex); + const size_t A_batch_bytes = A_one_bytes * batch_count; + const size_t B_batch_bytes = B_one_bytes * batch_count; + + const cuDoubleComplex* A_row_all = + reinterpret_cast(matrix_data); + const cuDoubleComplex* B_row_all = + reinterpret_cast(rhs_data); + cuDoubleComplex* X_row_all = reinterpret_cast(out_data); + + auto dA_col_alloc = + phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); + auto dB_col_alloc = + phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); + cuDoubleComplex* dA_col = + reinterpret_cast(dA_col_alloc->ptr()); + cuDoubleComplex* dB_col = + reinterpret_cast(dB_col_alloc->ptr()); + + auto d_pivots_alloc = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + static_cast(batch_count) * order * sizeof(int), + stream); + int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); + + auto d_info_alloc = + phi::memory_utils::Alloc(dev_ctx.GetPlace(), + static_cast(batch_count) * sizeof(int), + stream); + int* d_info = reinterpret_cast(d_info_alloc->ptr()); + + // A_row layout: row-major (order x order), B_row layout: row-major (order + // x rhs_cols) + for (int i = 0; i < batch_count; ++i) { + const cuDoubleComplex* A_row = + A_row_all + static_cast(i) * order * order; + cuDoubleComplex* A_col = dA_col + static_cast(i) * order * order; + const cuDoubleComplex* B_row = + B_row_all + static_cast(i) * order * rhs_cols; + cuDoubleComplex* B_col = dB_col + static_cast(i) * order * rhs_cols; + + // transpose A_row (row-major) -> A_col (column-major) via C = A^T + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam( + cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + order, + order, + &kAlpha, + A_row, + order, // lda: when interpreting A_row as (order x order) row-major, + // using order + &kZero, + nullptr, + order, + A_col, + order)); // ldc = order (column-major leading dim) + + // transpose B_row (row-major order x rhs_cols) -> B_col (column-major order + // x rhs_cols) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam( + cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + order, + rhs_cols, + &kAlpha, + B_row, + rhs_cols, // lda when A_row is viewed row-major: leading = rhs_cols + &kZero, + nullptr, + rhs_cols, + B_col, + order)); // ldc = order + } + + int lwork = 0; + cuDoubleComplex* dA_col0 = dA_col; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrf_bufferSize( + cusolver_handle, order, order, dA_col0, order, &lwork)); + + size_t work_bytes = static_cast(lwork) * sizeof(cuDoubleComplex); + auto d_work_alloc = + phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); + cuDoubleComplex* d_work = + reinterpret_cast(d_work_alloc->ptr()); + + for (int i = 0; i < batch_count; ++i) { + cuDoubleComplex* A_col = dA_col + static_cast(i) * order * order; + cuDoubleComplex* B_col = dB_col + static_cast(i) * order * rhs_cols; + int* pivots_i = d_pivots + static_cast(i) * order; + int* info_i = d_info + i; + + // getrf (LU factorization) on A_col (column-major) + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrf( + cusolver_handle, order, order, A_col, order, d_work, pivots_i, info_i)); + + // getrs: solve A_col * X_col = B_col + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrs( + cusolver_handle, + CUBLAS_OP_N, // no transpose on column-major matrix + order, + rhs_cols, + A_col, + order, + pivots_i, + B_col, + order, + info_i)); + } + + for (int i = 0; i < batch_count; ++i) { + cuDoubleComplex* B_col = dB_col + static_cast(i) * order * + rhs_cols; // X in column-major + cuDoubleComplex* X_row = X_row_all + static_cast(i) * order * + rhs_cols; // target row-major + + // transpose X_col -> X_row + // We use C = A^T : A has shape (order x rhs_cols) in column-major, so C + // will be (rhs_cols x order), but we want X_row with shape (order x + // rhs_cols) in row-major; calling cublasZgeam with op=T and adjusted dims + // works: + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam( + cublas_handle, + CUBLAS_OP_T, + CUBLAS_OP_N, + rhs_cols, + order, // rowsC = rhs_cols, colsC = order + &kAlpha, + B_col, + order, // B_col lda = order (col-major) + &kZero, + nullptr, + order, + X_row, + rhs_cols)); // X_row ldc = rhs_cols (row-major leading dimension) + } + + std::vector h_info(batch_count, 0); + phi::memory_utils::Copy(phi::CPUPlace(), + h_info.data(), + dev_ctx.GetPlace(), + d_info, + static_cast(batch_count) * sizeof(int), + reinterpret_cast(dev_ctx.stream())); + dev_ctx.Wait(); + + for (int i = 0; i < batch_count; ++i) { + PADDLE_ENFORCE_EQ( + h_info[i], + 0, + errors::External( + "cuSOLVER getrf/getrs failed at batch %d, info: %d", i, h_info[i])); + } +} + +template +void ComputeBackwardForComplexInputGPU(const DenseTensor& L, + const DenseTensor& V, + const paddle::optional& gL, + const paddle::optional& gV, + T* x_grad_data, + int batch_count, + int order, + const Context& dev_ctx) { + DenseTensor gL_safe; + if (gL.get_ptr()) { + gL_safe = gL.get(); + } else { + gL_safe = + Fill(dev_ctx, common::vectorize(L.dims()), T(0)); + } + + DenseTensor gV_safe; + if (gV.get_ptr()) { + gV_safe = gV.get(); + } else { + gV_safe = + Fill(dev_ctx, common::vectorize(V.dims()), T(0)); + } + DenseTensor trans_v = phi::TransposeLast2Dim(dev_ctx, V); + DenseTensor Vh = phi::Conj(dev_ctx, trans_v); + DenseTensor Lconj = phi::Conj(dev_ctx, L); + DenseTensor Econj = phi::Subtract(dev_ctx, + phi::funcs::Unsqueeze(Lconj, -2), + phi::funcs::Unsqueeze(Lconj, -1)); + DenseTensor VhgV = phi::Matmul(dev_ctx, Vh, gV_safe); + DenseTensor diag_real = phi::Real(dev_ctx, VhgV); + + auto cpu_place = phi::CPUPlace(); + phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); + auto* cpu_ctx = static_cast(pool.Get(cpu_place)); + + DenseTensor diag_real_cpu; + diag_real_cpu.Resize(diag_real.dims()); + phi::Copy(dev_ctx, diag_real, cpu_place, false, &diag_real_cpu); + + DenseTensor diag_res_cpu = + phi::funcs::BatchDiag((*cpu_ctx), diag_real_cpu, batch_count); + + DenseTensor diag_res; + dev_ctx.template Alloc(&diag_res); + phi::Copy(dev_ctx, diag_res_cpu, dev_ctx.GetPlace(), false, &diag_res); + + DenseTensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); + + auto numel = diag_unsqueezed.numel(); + DenseTensor diag_unsqueezed_complex; + auto* data_diag_un = diag_unsqueezed.data>(); + diag_unsqueezed_complex.Resize(diag_unsqueezed.dims()); + auto* data_diag_un_com = dev_ctx.template Alloc( + &diag_unsqueezed_complex, static_cast(numel * sizeof(T))); + + phi::funcs::ForRange for_range(dev_ctx, numel); + phi::funcs::RealToComplexFunctor functor( + data_diag_un, data_diag_un_com, numel); + for_range(functor); + // real tensor multiply complex tensor in broadcast manner + DenseTensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); + DenseTensor res2 = phi::Matmul(dev_ctx, Vh, res1); + DenseTensor result = phi::Subtract(dev_ctx, VhgV, res2); + + result.Resize(V.dims()); + dev_ctx.template Alloc(&result); + result = phi::Divide(dev_ctx, result, Econj); + result = phi::funcs::DiagFill( + dev_ctx, order, order, order, 0, gL_safe, result); + DenseTensor rhs = phi::Matmul(dev_ctx, result, Vh); + + // solve linear system + // solve(Vh, rhs, out, m, k) + // Vh: matrix with shape [m,m] + // rhs: rhs with shape [m,k] + // x_grad: out + int m = static_cast(Vh.dims(-1)); + int k = static_cast(rhs.dims(-1)); + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + + SolveLinearSystemGPU( + dev_ctx, matrix_data, rhs_data, x_grad_data, m, k, batch_count); +} + +template +void EigGradKernel(const Context& dev_ctx, + const DenseTensor& out_w, + const DenseTensor& out_v, + const paddle::optional& dout_w, + const paddle::optional& dout_v, + DenseTensor* dx) { + auto* dx_data = dev_ctx.template Alloc>(dx); + if (dx->numel() == 0) { + return; + } + auto& dims = out_v.dims(); + phi::DDim dim_origin = dims; + int num_dims = dim_origin.size(); + int batch_count = BatchCount(out_v); + const int order = static_cast(dim_origin[num_dims - 1]); + + ComputeBackwardForComplexInputGPU, Context>( + out_w, out_v, dout_w, dout_v, dx_data, batch_count, order, dev_ctx); +} + +} // namespace phi + +// Register the kernel +PD_REGISTER_PLUGIN_KERNEL(eig_grad, + iluvatar_gpu, + ALL_LAYOUT, + phi::EigGradKernel, + float, + phi::complex64) { + kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); + kernel->InputAt(2).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu new file mode 100644 index 00000000000..8461b82ece7 --- /dev/null +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu @@ -0,0 +1,123 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/eig.h" +#include "paddle/phi/kernels/eig_kernel.h" + +namespace phi { + +template +void EigKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out_w, + DenseTensor* out_v) { + dev_ctx.template Alloc>(out_w); + dev_ctx.template Alloc>(out_v); + + if (x.numel() == 0) { + return; + } + + auto cpu_place = phi::CPUPlace(); + phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); + auto* cpu_ctx = static_cast(pool.Get(cpu_place)); + + // prepare cpu Tensor here, since magma requires output on cpu + DenseTensor out_w_cpu, out_v_cpu; + out_w_cpu.Resize(out_w->dims()); + (*cpu_ctx).template Alloc>(&out_w_cpu); + out_v_cpu.Resize(x.dims()); + (*cpu_ctx).template Alloc>(&out_v_cpu); + + if (!IsComplexType(x.dtype())) { + // output still be complex though input is real + int batch_count = BatchCount(x); + int order = static_cast(x.dims()[x.dims().size() - 1]); + + DenseTensor real_w_cpu, real_v_cpu; + + std::vector real_w_dim = common::vectorize(out_w->dims()); + real_w_dim.back() *= 2; + real_w_cpu.Resize(common::make_ddim(real_w_dim)); + (*cpu_ctx).template Alloc>(&real_w_cpu); + real_v_cpu.Resize(x.dims()); + (*cpu_ctx).template Alloc>(&real_v_cpu); + + phi::ApplyEigKernelMagma, Context>( + dev_ctx, x, &real_w_cpu, &real_v_cpu); + + // 1. extract real part & imag part from real_w_cpu + DenseTensor real_part_cpu = phi::funcs::Slice>( + (*cpu_ctx), real_w_cpu, {-1}, {0}, {order}); + DenseTensor imag_part_cpu = phi::funcs::Slice>( + (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2}); + + // 2. construct complex values + auto* real_part_data = real_part_cpu.data>(); + auto* imag_part_data = imag_part_cpu.data>(); + int64_t out_w_numel = static_cast(out_w->numel()); + + phi::funcs::ForRange for_range((*cpu_ctx), out_w_numel); + phi::funcs::RealImagToComplexFunctor> functor( + real_part_data, + imag_part_data, + out_w_cpu.data>(), + out_w_numel); + for_range(functor); + + // 3. construct complex vectors + DenseTensor real_v_trans_cpu = + phi::TransposeLast2Dim, phi::CPUContext>( + (*cpu_ctx), real_v_cpu); + DenseTensor out_v_trans_cpu; + out_v_trans_cpu.Resize(x.dims()); + (*cpu_ctx).template Alloc>(&out_v_trans_cpu); + + phi::ConstructComplexVectors, + phi::dtype::Complex, + phi::CPUContext>(&out_v_trans_cpu, + out_w_cpu, + real_v_trans_cpu, + (*cpu_ctx), + batch_count, + order); + + TransposeTwoAxis, phi::CPUContext>( + out_v_trans_cpu, + &out_v_cpu, + x.dims().size() - 1, + x.dims().size() - 2, + (*cpu_ctx)); + + } else { + phi::ApplyEigKernelMagma(dev_ctx, x, &out_w_cpu, &out_v_cpu); + } + + // copy result from cpu to gpu tensor + phi::Copy(dev_ctx, out_w_cpu, dev_ctx.GetPlace(), false, out_w); + phi::Copy(dev_ctx, out_v_cpu, dev_ctx.GetPlace(), false, out_v); +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL( + eig, iluvatar_gpu, ALL_LAYOUT, phi::EigKernel, float, phi::complex64) { + if (kernel_key.dtype() == phi::DataType::FLOAT32) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); + kernel->OutputAt(1).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); + } +} diff --git a/backends/iluvatar_gpu/runtime/iluvatar_context.h b/backends/iluvatar_gpu/runtime/iluvatar_context.h index a007bf4154c..a6ea37a0caa 100644 --- a/backends/iluvatar_gpu/runtime/iluvatar_context.h +++ b/backends/iluvatar_gpu/runtime/iluvatar_context.h @@ -77,6 +77,11 @@ class DnnWorkspaceHandle { namespace { // NOLINT inline cudnnHandle_t dnn_handle_ = nullptr; inline std::once_flag flag_dnn_; + +inline cusolverDnHandle_t solver_handle_ = nullptr; +inline std::function solver_handle_creator_{nullptr}; +inline std::once_flag flag_solver_; + inline void InitDnnHandle(cudnnHandle_t* handle, gpuStream_t stream, Place place) { @@ -114,6 +119,29 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, const gpuStream_t& stream) { return DnnWorkspaceHandle(alloactor, stream); } + +inline void InitSolverHandle(cusolverDnHandle_t* handle, gpuStream_t stream) { + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); +} + +inline cusolverDnHandle_t GetSolverHandle(gpuStream_t stream) { + std::call_once(flag_solver_, [&]() { + if (!solver_handle_) { + if (!solver_handle_creator_) { + InitSolverHandle(&solver_handle_, stream); + } else { + solver_handle_ = solver_handle_creator_(); + } + } + }); + PADDLE_ENFORCE_NOT_NULL( + solver_handle_, + common::errors::InvalidArgument( + "The GPU solver handle is nullptr. It must not be null.")); + return solver_handle_; +} + } // namespace phi namespace iluvatar { diff --git a/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py new file mode 100644 index 00000000000..3f5fbb9f7f3 --- /dev/null +++ b/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py @@ -0,0 +1,391 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import OpTest, skip_check_grad_ci +from utils import dygraph_guard + +import paddle +from paddle import base + + +# cast output to complex for numpy.linalg.eig +def cast_to_complex(input, output): + if input.dtype == np.float32: + output = output.astype(np.complex64) + elif input.dtype == np.float64: + output = output.astype(np.complex128) + return output + + +# define eig backward function for a single square matrix +def eig_backward(w, v, grad_w, grad_v): + v_tran = np.transpose(v) + v_tran = np.conjugate(v_tran) + w_conj = np.conjugate(w) + w_conj_l = w_conj.reshape(1, w.size) + w_conj_r = w_conj.reshape(w.size, 1) + w_conj_2d = w_conj_l - w_conj_r + + vhgv = np.matmul(v_tran, grad_v) + real_vhgv = np.real(vhgv) + diag_real = real_vhgv.diagonal() + + diag_2d = diag_real.reshape(1, w.size) + rhs = v * diag_2d + mid = np.matmul(v_tran, rhs) + result = vhgv - mid + + res = np.divide(result, w_conj_2d) + row, col = np.diag_indices_from(res) + res[row, col] = 1.0 + + tmp = np.matmul(res, v_tran) + dx = np.linalg.solve(v_tran, tmp) + return dx + + +class TestEigOp(OpTest): + def setUp(self): + paddle.enable_static() + paddle.device.set_device("iluvatar_gpu") + self.op_type = "eig" + self.python_api = paddle.linalg.eig + self.__class__.op_type = self.op_type + self.init_input() + self.inputs = {"X": OpTest.np_dtype_to_base_dtype(self.x)} + self.outputs = {"Eigenvalues": self.out[0], "Eigenvectors": self.out[1]} + + def init_input(self): + self.set_dtype() + self.set_dims() + self.x = np.random.random(self.shape).astype(self.dtype) + self.out = np.linalg.eig(self.x) + self.out = ( + cast_to_complex(self.x, self.out[0]), + cast_to_complex(self.x, self.out[1]), + ) + + # for the real input, a customized checker is needed + def checker(self, outs): + actual_out_w = outs[0].flatten() + expect_out_w = self.out[0].flatten() + actual_out_v = outs[1].flatten() + expect_out_v = self.out[1].flatten() + + length_w = len(expect_out_w) + act_w_real = np.sort( + np.array([np.abs(actual_out_w[i].real) for i in range(length_w)]) + ) + act_w_imag = np.sort( + np.array([np.abs(actual_out_w[i].imag) for i in range(length_w)]) + ) + exp_w_real = np.sort( + np.array([np.abs(expect_out_w[i].real) for i in range(length_w)]) + ) + exp_w_imag = np.sort( + np.array([np.abs(expect_out_w[i].imag) for i in range(length_w)]) + ) + + for i in range(length_w): + np.testing.assert_allclose( + act_w_real[i], + exp_w_real[i], + rtol=1e-06, + atol=1e-05, + err_msg="The eigenvalues real part have diff: \nExpected " + + str(act_w_real[i]) + + "\n" + + "But got: " + + str(exp_w_real[i]), + ) + np.testing.assert_allclose( + act_w_imag[i], + exp_w_imag[i], + rtol=1e-06, + atol=1e-05, + err_msg="The eigenvalues image part have diff: \nExpected " + + str(act_w_imag[i]) + + "\n" + + "But got: " + + str(exp_w_imag[i]), + ) + + length_v = len(expect_out_v) + act_v_real = np.sort( + np.array([np.abs(actual_out_v[i].real) for i in range(length_v)]) + ) + act_v_imag = np.sort( + np.array([np.abs(actual_out_v[i].imag) for i in range(length_v)]) + ) + exp_v_real = np.sort( + np.array([np.abs(expect_out_v[i].real) for i in range(length_v)]) + ) + exp_v_imag = np.sort( + np.array([np.abs(expect_out_v[i].imag) for i in range(length_v)]) + ) + + for i in range(length_v): + np.testing.assert_allclose( + act_v_real[i], + exp_v_real[i], + rtol=1e-06, + atol=1e-05, + err_msg="The eigenvectors real part have diff: \nExpected " + + str(act_v_real[i]) + + "\n" + + "But got: " + + str(exp_v_real[i]), + ) + np.testing.assert_allclose( + act_v_imag[i], + exp_v_imag[i], + rtol=1e-06, + atol=1e-05, + err_msg="The eigenvectors image part have diff: \nExpected " + + str(act_v_imag[i]) + + "\n" + + "But got: " + + str(exp_v_imag[i]), + ) + + def set_dtype(self): + self.dtype = np.complex64 + + def set_dims(self): + self.shape = (10, 10) + + def init_grad(self): + # grad_w, grad_v complex dtype + gtype = self.dtype + if self.dtype == np.float32: + gtype = np.complex64 + elif self.dtype == np.float64: + gtype = np.complex128 + self.grad_w = np.ones(self.out[0].shape, gtype) + self.grad_v = np.ones(self.out[1].shape, gtype) + self.grad_x = eig_backward(self.out[0], self.out[1], self.grad_w, self.grad_v) + + def test_check_output(self): + self.check_output_with_place_customized( + checker=self.checker, + place=paddle.CustomPlace("iluvatar_gpu", 0), + check_pir=True, + ) + + def test_check_grad(self): + self.init_grad() + self.check_grad( + ["X"], + ["Eigenvalues", "Eigenvectors"], + user_defined_grads=[self.grad_x], + user_defined_grad_outputs=[self.grad_w, self.grad_v], + check_pir=True, + ) + + +@skip_check_grad_ci( + reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig" +) +class TestFloat(TestEigOp): + def set_dtype(self): + self.dtype = np.float32 + + def test_check_grad(self): + pass + + +class TestEigStatic(TestEigOp): + def test_check_output_with_place(self): + paddle.enable_static() + place = paddle.CustomPlace("iluvatar_gpu", 0) + input_np = np.random.random([3, 3]).astype("complex") + expect_val, expect_vec = np.linalg.eig(input_np) + with base.program_guard(base.Program(), base.Program()): + input = paddle.static.data(name="input", shape=[3, 3], dtype="complex") + act_val, act_vec = paddle.linalg.eig(input) + + exe = base.Executor(place) + fetch_val, fetch_vec = exe.run( + base.default_main_program(), + feed={"input": input_np}, + fetch_list=[act_val, act_vec], + ) + np.testing.assert_allclose( + expect_val, + fetch_val, + rtol=1e-06, + atol=1e-06, + err_msg="The eigen values have diff: \nExpected " + + str(expect_val) + + "\n" + + "But got: " + + str(fetch_val), + ) + np.testing.assert_allclose( + np.abs(expect_vec), + np.abs(fetch_vec), + rtol=1e-06, + atol=1e-06, + err_msg="The eigen vectors have diff: \nExpected " + + str(np.abs(expect_vec)) + + "\n" + + "But got: " + + str(np.abs(fetch_vec)), + ) + + +class TestEigDyGraph(unittest.TestCase): + def test_check_output_with_place(self): + np.random.seed(1024) + input_np = np.random.random([3, 3]).astype("complex64") + expect_val, expect_vec = np.linalg.eig(input_np) + + paddle.set_device("iluvatar_gpu") + paddle.disable_static() + + input_tensor = paddle.to_tensor(input_np) + fetch_val, fetch_vec = paddle.linalg.eig(input_tensor) + + np.testing.assert_allclose( + expect_val, + fetch_val.numpy(), + rtol=1e-06, + atol=1e-06, + err_msg="The eigen values have diff: \nExpected " + + str(expect_val) + + "\n" + + "But got: " + + str(fetch_val), + ) + np.testing.assert_allclose( + np.abs(expect_vec), + np.abs(fetch_vec.numpy()), + rtol=1e-06, + atol=1e-06, + err_msg="The eigen vectors have diff: \nExpected " + + str(np.abs(expect_vec)) + + "\n" + + "But got: " + + str(np.abs(fetch_vec.numpy())), + ) + + # def test_check_grad(self): + # test_shape = [3, 3] + # test_type = 'float32' + # paddle.set_device("iluvatar_gpu") + + # np.random.seed(1024) + # input_np = np.random.random(test_shape).astype(test_type) + # real_w, real_v = np.linalg.eig(input_np) + + # grad_w = np.ones(real_w.shape, test_type) + # grad_v = np.ones(real_v.shape, test_type) + # grad_x = eig_backward(real_w, real_v, grad_w, grad_v) + + # with base.dygraph.guard(): + # x = paddle.to_tensor(input_np) + # x.stop_gradient = False + # w, v = paddle.linalg.eig(x) + # (w.sum() + v.sum()).backward() + + # np.testing.assert_allclose( + # np.abs(x.grad.numpy()), + # np.abs(grad_x), + # rtol=1e-05, + # atol=1e-05, + # err_msg='The grad x have diff: \nExpected ' + # + str(np.abs(grad_x)) + # + '\n' + # + 'But got: ' + # + str(np.abs(x.grad.numpy())), + # ) + + +class TestEigWrongDimsError(unittest.TestCase): + def test_error(self): + paddle.device.set_device("iluvatar_gpu") + paddle.disable_static() + a = np.random.random(3).astype("float32") + x = paddle.to_tensor(a) + self.assertRaises(ValueError, paddle.linalg.eig, x) + + +class TestEigNotSquareError(unittest.TestCase): + def test_error(self): + paddle.device.set_device("iluvatar_gpu") + paddle.disable_static() + a = np.random.random((1, 2, 3)).astype("float32") + x = paddle.to_tensor(a) + self.assertRaises(ValueError, paddle.linalg.eig, x) + + +class TestEigUnsupportedDtypeError(unittest.TestCase): + def test_error(self): + paddle.device.set_device("iluvatar_gpu") + paddle.disable_static() + a = (np.random.random((3, 3)) * 10).astype("int64") + x = paddle.to_tensor(a) + self.assertRaises(RuntimeError, paddle.linalg.eig, x) + + +class TestOptionalGradInput(unittest.TestCase): + def test_eager(self): + with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"): + x = paddle.randn(3, 3, requires_grad=True) + w, v = paddle.linalg.eig(x) + + np.testing.assert_allclose( + (x @ v).numpy(), + (w.unsqueeze(0) * v).numpy(), + atol=1e-5, + rtol=1e-5, + ) # Aμ = λμ + + # (dw_dx,) = paddle.grad(w, x, retain_graph=True) + # (dv_dx,) = paddle.grad(v, x, retain_graph=True) + # (dwdv_dx,) = paddle.grad([w, v], x) + # np.testing.assert_allclose( + # (dw_dx + dv_dx).numpy(), + # dwdv_dx.numpy(), + # atol=1e-5, + # rtol=1e-5, + # ) + + def test_dy2st(self): + with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"): + x = paddle.randn(3, 3, requires_grad=True) + + def f(x): + w, v = paddle.linalg.eig(x) + return ( + w, + v, + ) + + st_f = paddle.jit.to_static(f, full_graph=True, backend=None) + + w, v = st_f(x) + np.testing.assert_allclose( + (x @ v).numpy(), + (w.unsqueeze(0) * v).numpy(), + atol=1e-5, + rtol=1e-5, + ) # Aμ = λμ + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py new file mode 100644 index 00000000000..18581f25af0 --- /dev/null +++ b/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py @@ -0,0 +1,42 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from utils import dygraph_guard + +import paddle + + +class TestEigAPI0Size(unittest.TestCase): + def test_errors(self): + with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"): + for shape in [[0, 0], [0, 4, 4], [1, 0, 2, 3, 3]]: + x = paddle.randn(shape=shape, dtype="float32", requires_grad=True) + w, v = paddle.linalg.eig(x) + self.assertEqual(w.shape, shape[:-1]) + self.assertEqual(v.shape, shape) + + # (dw_dx,) = paddle.grad(w.abs().sum(), x, retain_graph=True) + # self.assertEqual(dw_dx.shape, x.shape) + # (dv_dx,) = paddle.grad(v.abs().sum(), x, retain_graph=True) + # self.assertEqual(dv_dx.shape, x.shape) + # (dwv_dx,) = paddle.grad( + # w.abs().sum() + v.abs().sum(), x, retain_graph=True + # ) + # self.assertEqual(dwv_dx.shape, x.shape) + + +if __name__ == "__main__": + unittest.main() From 80cbe00b479dfbd0de871e1126cad1393e7d6d1a Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 23 Dec 2025 11:00:19 +0800 Subject: [PATCH 2/7] update cmake --- backends/iluvatar_gpu/cmake/external/magma.cmake | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/backends/iluvatar_gpu/cmake/external/magma.cmake b/backends/iluvatar_gpu/cmake/external/magma.cmake index 612ea1b600e..217562ddef0 100644 --- a/backends/iluvatar_gpu/cmake/external/magma.cmake +++ b/backends/iluvatar_gpu/cmake/external/magma.cmake @@ -20,17 +20,16 @@ set(MAGMA_DOWNLOAD_DIR set(MAGMA_INSTALL_DIR ${THIRD_PARTY_PATH}/install/magma) set(MAGMA_LIB_DIR ${MAGMA_INSTALL_DIR}/lib) -# Note(zhouwei): magma need fortran compiler which many machines don't have, so -# use precompiled library. use magma tag v2.9.0 on 07/28/2025 +# use precompiled library magma tag v2.9.0 on 07/28/2025 # https://github.com/icl-utk-edu/magma/tree/v2.9.0 if(LINUX) set(MAGMA_FILE - "magma_local.tar.gz" + "magma_lnx_iluvatar_v2.9.0.20250728.tar.gz" CACHE STRING "" FORCE) set(MAGMA_URL - "file:///home/tianyu.zhou/tyzhou/magma_local.tar.gz" + "https://paddlepaddledeps.bj.bcebos.com/${MAGMA_FILE}" CACHE STRING "" FORCE) - set(MAGMA_URL_MD5 9715dfad9eb073e099f46feb6587232d) + set(MAGMA_URL_MD5 7fcca7e0140ed8df1fef511d82ae30c3) set(MAGMA_LIB "${MAGMA_LIB_DIR}/libmagma.so") elseif(WIN32) message("magma do not support windows yet, skip ...") From 8cab30c86ac27abed5f22ff3b0c8ca65fa94916e Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Tue, 23 Dec 2025 16:14:16 +0800 Subject: [PATCH 3/7] update --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 86238399d4c..24c29dc7eb6 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 86238399d4c720a0dccc07d416ece8168225d757 +Subproject commit 24c29dc7eb6308a4014e39f8679f428e82008137 From 2a9496d9fc49d7e0f3eb0f5e35acc401f9b5c94e Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Wed, 24 Dec 2025 10:47:32 +0800 Subject: [PATCH 4/7] support eig op in metax --- .../iluvatar_gpu/cmake/external/magma.cmake | 12 +- backends/metax_gpu/CMakeLists.txt | 6 + .../kernels/metax_kernel/eig_kernel.cu | 123 ++++++++++++++++++ 3 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu diff --git a/backends/iluvatar_gpu/cmake/external/magma.cmake b/backends/iluvatar_gpu/cmake/external/magma.cmake index 217562ddef0..66c9560f6ca 100644 --- a/backends/iluvatar_gpu/cmake/external/magma.cmake +++ b/backends/iluvatar_gpu/cmake/external/magma.cmake @@ -20,16 +20,16 @@ set(MAGMA_DOWNLOAD_DIR set(MAGMA_INSTALL_DIR ${THIRD_PARTY_PATH}/install/magma) set(MAGMA_LIB_DIR ${MAGMA_INSTALL_DIR}/lib) -# use precompiled library magma tag v2.9.0 on 07/28/2025 +# use precompiled library. use magma tag v2.9.0 on 07/28/2025 # https://github.com/icl-utk-edu/magma/tree/v2.9.0 if(LINUX) set(MAGMA_FILE - "magma_lnx_iluvatar_v2.9.0.20250728.tar.gz" + "magma_lnx_metax_v2.9.0.20250728.tar.gz" CACHE STRING "" FORCE) set(MAGMA_URL "https://paddlepaddledeps.bj.bcebos.com/${MAGMA_FILE}" CACHE STRING "" FORCE) - set(MAGMA_URL_MD5 7fcca7e0140ed8df1fef511d82ae30c3) + set(MAGMA_URL_MD5 3aa4106aa11ba0aeb0036b450a53e972) set(MAGMA_LIB "${MAGMA_LIB_DIR}/libmagma.so") elseif(WIN32) message("magma do not support windows yet, skip ...") @@ -86,3 +86,9 @@ ExternalProject_Add( BUILD_BYPRODUCTS ${MAGMA_LIB}) add_definitions(-DPADDLE_WITH_MAGMA) + +add_library(magma SHARED IMPORTED GLOBAL) + +set_target_properties(magma PROPERTIES IMPORTED_LOCATION ${MAGMA_LIB}) + +add_dependencies(magma extern_magma) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 79941c2866a..b58c9aa6120 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -39,6 +39,9 @@ include(cutlass) include(dgc) include(warpctc) include(warprnnt) +if(WITH_MAGMA) + include(external/magma) +endif() set(PLUGIN_VERSION ${PADDLE_VERSION}) @@ -123,6 +126,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/*.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/*.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/eigen/*.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/magma/magma_function.cc # kernels/gpu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_send_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_recv_kernel.cu @@ -667,6 +671,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/binomial_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/magma.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -779,6 +784,7 @@ target_link_libraries( protobuf external_error_proto dgc + magma ${WARPCTC_LIBRARIES} ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) diff --git a/backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu new file mode 100644 index 00000000000..942bcb20ecf --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu @@ -0,0 +1,123 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/eig.h" +#include "paddle/phi/kernels/eig_kernel.h" + +namespace phi { + +template +void EigKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out_w, + DenseTensor* out_v) { + dev_ctx.template Alloc>(out_w); + dev_ctx.template Alloc>(out_v); + + if (x.numel() == 0) { + return; + } + + auto cpu_place = phi::CPUPlace(); + phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); + auto* cpu_ctx = static_cast(pool.Get(cpu_place)); + + // prepare cpu Tensor here, since magma requires output on cpu + DenseTensor out_w_cpu, out_v_cpu; + out_w_cpu.Resize(out_w->dims()); + (*cpu_ctx).template Alloc>(&out_w_cpu); + out_v_cpu.Resize(x.dims()); + (*cpu_ctx).template Alloc>(&out_v_cpu); + + if (!IsComplexType(x.dtype())) { + // output still be complex though input is real + int batch_count = BatchCount(x); + int order = static_cast(x.dims(-1)); + + DenseTensor real_w_cpu, real_v_cpu; + + std::vector real_w_dim = common::vectorize(out_w->dims()); + real_w_dim.back() *= 2; + real_w_cpu.Resize(common::make_ddim(real_w_dim)); + (*cpu_ctx).template Alloc>(&real_w_cpu); + real_v_cpu.Resize(x.dims()); + (*cpu_ctx).template Alloc>(&real_v_cpu); + + phi::ApplyEigKernelMagma, Context>( + dev_ctx, x, &real_w_cpu, &real_v_cpu); + + // 1. extract real part & imag part from real_w_cpu + DenseTensor real_part_cpu = phi::funcs::Slice>( + (*cpu_ctx), real_w_cpu, {-1}, {0}, {order}); + DenseTensor imag_part_cpu = phi::funcs::Slice>( + (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2}); + + // 2. construct complex values + auto* real_part_data = real_part_cpu.data>(); + auto* imag_part_data = imag_part_cpu.data>(); + int64_t out_w_numel = static_cast(out_w->numel()); + + phi::funcs::ForRange for_range((*cpu_ctx), out_w_numel); + phi::funcs::RealImagToComplexFunctor> functor( + real_part_data, + imag_part_data, + out_w_cpu.data>(), + out_w_numel); + for_range(functor); + + // 3. construct complex vectors + DenseTensor real_v_trans_cpu = + phi::TransposeLast2Dim, phi::CPUContext>( + (*cpu_ctx), real_v_cpu); + DenseTensor out_v_trans_cpu; + out_v_trans_cpu.Resize(x.dims()); + (*cpu_ctx).template Alloc>(&out_v_trans_cpu); + + phi::ConstructComplexVectors, + phi::dtype::Complex, + phi::CPUContext>(&out_v_trans_cpu, + out_w_cpu, + real_v_trans_cpu, + (*cpu_ctx), + batch_count, + order); + + TransposeTwoAxis, phi::CPUContext>( + out_v_trans_cpu, + &out_v_cpu, + x.dims().size() - 1, + x.dims().size() - 2, + (*cpu_ctx)); + + } else { + phi::ApplyEigKernelMagma(dev_ctx, x, &out_w_cpu, &out_v_cpu); + } + + // copy result from cpu to gpu tensor + phi::Copy(dev_ctx, out_w_cpu, dev_ctx.GetPlace(), false, out_w); + phi::Copy(dev_ctx, out_v_cpu, dev_ctx.GetPlace(), false, out_v); +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL( + eig, metax_gpu, ALL_LAYOUT, phi::EigKernel, float, phi::complex64) { + if (kernel_key.dtype() == phi::DataType::FLOAT32) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); + kernel->OutputAt(1).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); + } +} From 112d7e87227013f56853f022069944577c947e54 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Wed, 24 Dec 2025 11:20:14 +0800 Subject: [PATCH 5/7] remove iluvator changes --- backends/iluvatar_gpu/CMakeLists.txt | 9 +- .../rms_norm_grad_kernel_register.cc | 4 +- .../cuda_kernels/rms_norm_kernel_register.cc | 4 +- .../iluvatar_gpu/patches/paddle-corex.patch | 141 +++++++++++++----- .../iluvatar_gpu/runtime/iluvatar_context.h | 28 ---- 5 files changed, 109 insertions(+), 77 deletions(-) diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt index 1aa4e057dd0..e2684d446e6 100644 --- a/backends/iluvatar_gpu/CMakeLists.txt +++ b/backends/iluvatar_gpu/CMakeLists.txt @@ -34,7 +34,6 @@ include(version) include(generic) include(cblas) include(external/eigen) -include(external/magma) include(external/xxhash) include(external/zlib) include(external/protobuf) @@ -121,13 +120,11 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/*.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/*.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/eigen/*.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/magma/magma_function.cc # cudnn/cublas ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cudnn.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublas.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublasLt.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc - ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/magma.cc # kernels/gpu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu @@ -198,8 +195,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/put_along_axis_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/randint_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/reduce_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/fused_rms_norm_quant_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/roi_align_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/scale_kernel.cu @@ -1016,7 +1013,7 @@ target_link_libraries( ixattnbkd nccl # change nccl to ${FLAGCX_LIB} if compiling with FlagCX ${FLAGCX_LIB} - magma) +) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) include_directories(BEFORE ${CMAKE_SOURCE_DIR}/headers) diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc index e669cc26e85..46028b9d54b 100644 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc @@ -20,10 +20,10 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/rms_norm_grad_kernel.h" -PD_CUSTOM_KERNEL_REGISTER(fused_rms_norm_quant_grad, +PD_CUSTOM_KERNEL_REGISTER(rms_norm_grad, iluvatar_gpu, ALL_LAYOUT, - phi::RmsNormQuantGradKernel, + phi::RmsNormGradKernel, float, phi::dtype::float16, phi::dtype::bfloat16) {} diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc index 4dcc3449fb3..52f124dd919 100644 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc @@ -37,10 +37,10 @@ limitations under the License. #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/rms_norm_kernel.h" -PD_CUSTOM_KERNEL_REGISTER(fused_rms_norm_quant, +PD_CUSTOM_KERNEL_REGISTER(rms_norm, iluvatar_gpu, ALL_LAYOUT, - phi::RmsNormQuantKernel, + phi::RmsNormKernel, float, phi::dtype::float16, phi::dtype::bfloat16) {} diff --git a/backends/iluvatar_gpu/patches/paddle-corex.patch b/backends/iluvatar_gpu/patches/paddle-corex.patch index 929bbb4c621..6526c8423df 100644 --- a/backends/iluvatar_gpu/patches/paddle-corex.patch +++ b/backends/iluvatar_gpu/patches/paddle-corex.patch @@ -1,8 +1,55 @@ +From 6484778861092b0d56309f5be9aae4d6c23726ef Mon Sep 17 00:00:00 2001 +From: tianyuzhou668 <2431054748@qq.com> +Date: Wed, 12 Nov 2025 15:37:49 +0800 +Subject: [PATCH] Fix + +--- + CMakeLists.txt | 2 +- + .../operators/collective/recv_v2_op.cu.cc | 2 +- + .../operators/collective/send_v2_op.cu.cc | 2 +- + .../fluid/platform/device/gpu/nccl_helper.h | 2 +- + paddle/phi/backends/dynload/cudnn.cc | 8 ++ + paddle/phi/backends/dynload/cudnn.h | 28 ++++++- + paddle/phi/backends/dynload/cusolver.h | 2 - + paddle/phi/backends/dynload/cusparse.h | 2 + + .../backends/gpu/cuda/cuda_device_function.h | 4 +- + paddle/phi/backends/gpu/cuda/cuda_graph.cc | 2 +- + paddle/phi/backends/gpu/cuda/cuda_helper.h | 2 +- + paddle/phi/backends/gpu/cuda/cudnn_desc.h | 16 +++- + paddle/phi/backends/gpu/cuda/cudnn_helper.h | 2 +- + paddle/phi/backends/gpu/gpu_launch_config.h | 16 +++- + paddle/phi/backends/gpu/gpu_primitives.h | 25 ++++++ + paddle/phi/backends/gpu/gpu_types.h | 5 ++ + paddle/phi/core/distributed/nccl_tools.cc | 2 +- + paddle/phi/core/enforce.h | 6 +- + paddle/phi/core/utils/data_type.h | 2 +- + paddle/phi/kernels/funcs/activation_functor.h | 20 +++++ + paddle/phi/kernels/funcs/affine_grid_utils.h | 2 + + paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 18 ++++- + paddle/phi/kernels/funcs/cufft_util.h | 80 +++++++++++++++++++ + paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 4 - + paddle/phi/kernels/funcs/reduce_function.h | 2 +- + paddle/phi/kernels/funcs/segmented_array.h | 8 ++ + paddle/phi/kernels/funcs/softmax_impl.h | 1 + + .../fusion/gpu/fused_layernorm_kernel.cu | 4 - + .../fused_layernorm_residual_dropout_bias.h | 17 ---- + paddle/phi/kernels/gpu/elementwise_grad.h | 4 + + .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 2 +- + paddle/phi/kernels/gpu/layer_norm_kernel.cu | 2 +- + .../phi/kernels/gpu/rms_norm_grad_kernel.cu | 2 +- + .../kernels/primitive/compute_primitives.h | 24 +++--- + paddle/phi/kernels/reduce_sum_kernel.cc | 2 + + paddle/phi/kernels/shape_kernel.cc | 2 + + paddle/phi/kernels/squeeze_kernel.cc | 2 + + paddle/phi/kernels/strided_slice_kernel.cc | 2 + + paddle/phi/kernels/unsqueeze_kernel.cc | 2 + + 39 files changed, 266 insertions(+), 64 deletions(-) + diff --git a/CMakeLists.txt b/CMakeLists.txt -index 6ea73365e8..766cc92f49 100755 +index 1a4460a3be..b6c6b4a797 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -65,7 +65,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) +@@ -64,7 +64,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF) option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF) @@ -25,7 +72,7 @@ index ab866f015c..10a8111637 100644 #endif int, diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc -index 28c126d2f2..c382e68188 100644 +index e58e29465d..1755595405 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -203,7 +203,7 @@ PD_REGISTER_STRUCT_KERNEL(send_v2, @@ -77,10 +124,10 @@ index 5a18808d47..749073ce38 100644 std::call_once(cudnn_dso_flag, []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -index 092365a961..6b05da600b 100644 +index 4ff2e528a9..956bac0c64 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -@@ -134,7 +134,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( +@@ -141,7 +141,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( template __forceinline__ __device__ T @@ -89,7 +136,7 @@ index 092365a961..6b05da600b 100644 return __shfl_sync(mask, val, src_line, width); } -@@ -151,7 +151,7 @@ __device__ T reduceSum(T val, int tid, int len) { +@@ -158,7 +158,7 @@ __device__ T reduceSum(T val, int tid, int len) { // I use Warp-Level Parallelism and assume the Warp size // is 32 which may be different for different GPU, // but most card's warp size is 32. @@ -112,10 +159,10 @@ index 1c4f13e6b4..a90c0f6d21 100644 } diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h -index dfd3945e9a..08eda4978c 100644 +index 02753c0333..bcf435dfae 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h -@@ -82,7 +82,7 @@ cudaDataType_t ToCudaDataType() { +@@ -85,7 +85,7 @@ cudaDataType_t ToCudaDataType() { return CUDA_R_64F; } else if (std::is_same::value) { return CUDA_R_16F; @@ -165,10 +212,10 @@ index 189e97534e..8f805afe8c 100644 void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format) { diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h -index 8b293d3007..35245dddb0 100644 +index 28c3d14d37..5dc5f79178 100644 --- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h +++ b/paddle/phi/backends/gpu/cuda/cudnn_helper.h -@@ -124,7 +124,7 @@ class CudnnDataType { +@@ -125,7 +125,7 @@ class CudnnDataType { #endif // CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1 @@ -220,7 +267,7 @@ index af1c7ba8b9..132e488061 100644 const int capability = dev_ctx.GetComputeCapability(); GpuLaunchConfig config; diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h -index a7df8a4023..d4ff45d8d5 100644 +index ab505091ab..8b7dd5ff86 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -134,13 +134,38 @@ CUDA_ATOMIC_WRAPPER(Add, int16_t) { @@ -392,10 +439,11 @@ index 9c9ab5dff9..ecf4e8f5e8 100644 template struct CudaLogFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/funcs/affine_grid_utils.h b/paddle/phi/kernels/funcs/affine_grid_utils.h -index 70abf63a3d..af6f2136c5 100644 +index 1df6184141..fc6015b209 100644 --- a/paddle/phi/kernels/funcs/affine_grid_utils.h +++ b/paddle/phi/kernels/funcs/affine_grid_utils.h -@@ -16,7 +16,9 @@ +@@ -15,7 +15,9 @@ + #pragma once #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" @@ -406,10 +454,10 @@ index 70abf63a3d..af6f2136c5 100644 #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h -index fc86cc09d1..6295ff91ac 100644 +index ae7b67de6d..ff60fd20ba 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h -@@ -1755,7 +1755,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1754,7 +1754,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, const phi::bfloat16 *B, phi::bfloat16 beta, phi::bfloat16 *C) const { @@ -418,7 +466,7 @@ index fc86cc09d1..6295ff91ac 100644 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int64_t lda = (transA == CblasNoTrans) ? K : M; -@@ -1765,6 +1765,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1764,6 +1764,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; @@ -426,7 +474,7 @@ index fc86cc09d1..6295ff91ac 100644 PADDLE_ENFORCE_GE( dev_ctx_.GetComputeCapability(), 80, -@@ -1772,6 +1773,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1771,6 +1772,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, "cublas bf16 gemm requires GPU compute capability >= 80," "but received %d", dev_ctx_.GetComputeCapability())); @@ -434,7 +482,7 @@ index fc86cc09d1..6295ff91ac 100644 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); -@@ -2293,12 +2295,13 @@ inline void Blas::GEMM(bool transA, +@@ -2292,12 +2294,13 @@ inline void Blas::GEMM(bool transA, phi::bfloat16 beta, phi::bfloat16 *C, int ldc) const { @@ -449,7 +497,7 @@ index fc86cc09d1..6295ff91ac 100644 PADDLE_ENFORCE_GE( dev_ctx_.GetComputeCapability(), 80, -@@ -2306,6 +2309,7 @@ inline void Blas::GEMM(bool transA, +@@ -2305,6 +2308,7 @@ inline void Blas::GEMM(bool transA, "cublas bf16 gemm requires GPU compute capability >= 80," "but received %d", dev_ctx_.GetComputeCapability())); @@ -457,7 +505,7 @@ index fc86cc09d1..6295ff91ac 100644 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); -@@ -2802,7 +2806,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -2772,7 +2776,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, int64_t batchCount, int64_t strideA, int64_t strideB) const { @@ -466,7 +514,7 @@ index fc86cc09d1..6295ff91ac 100644 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int64_t lda = (transA == CblasNoTrans) ? K : M; -@@ -2881,7 +2885,11 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -2851,7 +2855,11 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, static_cast(ldc), strideC, static_cast(batchCount), @@ -478,7 +526,7 @@ index fc86cc09d1..6295ff91ac 100644 algo)); }); } -@@ -3175,7 +3183,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -3145,7 +3153,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, phi::bfloat16 beta, phi::bfloat16 **C, int batchCount) const { @@ -487,7 +535,7 @@ index fc86cc09d1..6295ff91ac 100644 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; -@@ -3186,6 +3194,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -3156,6 +3164,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; @@ -495,7 +543,7 @@ index fc86cc09d1..6295ff91ac 100644 PADDLE_ENFORCE_GE( dev_ctx_.GetComputeCapability(), 80, -@@ -3193,6 +3202,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -3163,6 +3172,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublas bf16 gemm requires GPU compute capability >= 80," "but received %d", dev_ctx_.GetComputeCapability())); @@ -644,10 +692,10 @@ index df4f214e66..e31b8eb1f6 100644 } // namespace detail } // namespace funcs diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h -index 92dccf18ce..5cc9937d7a 100644 +index 4eae698648..9247535e0d 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h -@@ -37,11 +37,7 @@ using LayerNormParamType = typename CudnnDataType::BatchNormParamType; +@@ -44,11 +44,7 @@ using LayerNormParamType = typename CudnnDataType::BatchNormParamType; inline static int GetDesiredBlockDim(int64_t block_dim) { const int kMaxBlockDim = 512; @@ -660,10 +708,10 @@ index 92dccf18ce..5cc9937d7a 100644 } diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h -index e89969e9dc..65e744f37d 100644 +index 24c30ae7e2..ce5cf15176 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h -@@ -1131,7 +1131,7 @@ void ReduceKernel(const KPDevice& dev_ctx, +@@ -1139,7 +1139,7 @@ void ReduceKernel(const KPDevice& dev_ctx, config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16 && config.reduce_num <= std::numeric_limits::max(); @@ -699,7 +747,7 @@ index dad852093e..71adfaf3ed 100644 auto ptr = allocation->ptr(); allocations.emplace_back(std::move(allocation)); diff --git a/paddle/phi/kernels/funcs/softmax_impl.h b/paddle/phi/kernels/funcs/softmax_impl.h -index 9f12293c0f..4e897bb433 100644 +index 361936305c..f4c680fe56 100644 --- a/paddle/phi/kernels/funcs/softmax_impl.h +++ b/paddle/phi/kernels/funcs/softmax_impl.h @@ -21,6 +21,7 @@ limitations under the License. */ @@ -711,10 +759,10 @@ index 9f12293c0f..4e897bb433 100644 namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu -index 77e3537124..8f6022bc76 100644 +index 3612a5fc89..634a61ebe1 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu -@@ -58,11 +58,7 @@ namespace fusion { +@@ -60,11 +60,7 @@ namespace fusion { namespace { @@ -781,10 +829,10 @@ index 9d4bb18d55..78bf0ad1b9 100644 } } diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h -index 411ee4510c..36c2f8fba7 100644 +index f3a2874b92..47d2c0b4d7 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h -@@ -352,7 +352,11 @@ void ElementwiseAddGrad(const GPUContext &dev_ctx, +@@ -213,7 +213,11 @@ void ElementwiseAddGrad(const GPUContext &dev_ctx, phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx); } else if (dx_data != dout_data && dy_data != dout_data) { auto size = x.numel(); @@ -797,10 +845,10 @@ index 411ee4510c..36c2f8fba7 100644 dim3 grid_size = dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu -index 701c19a56f..b5d3ba7846 100644 +index d6c105e1b2..6d810ab0e6 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu -@@ -300,16 +300,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in, +@@ -305,16 +305,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in, &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_h); // top_left_index is just input_index. @@ -824,10 +872,10 @@ index 701c19a56f..b5d3ba7846 100644 funcs::BlockReduceMax(top_right_index, FINAL_MASK); int64_t in_bot_max_index = diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu -index 8280e95065..dd50aa08bf 100644 +index 2645060f4c..6a38e20776 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu -@@ -232,7 +232,7 @@ PD_REGISTER_KERNEL(layer_norm_grad, +@@ -150,7 +150,7 @@ PD_REGISTER_KERNEL(layer_norm_grad, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); } } @@ -837,10 +885,10 @@ index 8280e95065..dd50aa08bf 100644 GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu -index 892525493c..854f13a239 100644 +index ed5f6438ab..52f1e9e1c7 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu -@@ -765,7 +765,7 @@ PD_REGISTER_KERNEL( +@@ -708,7 +708,7 @@ PD_REGISTER_KERNEL( kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); } @@ -849,6 +897,19 @@ index 892525493c..854f13a239 100644 PD_REGISTER_KERNEL(layer_norm, GPU, ALL_LAYOUT, +diff --git a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu +index 20015f7b87..8fee4c4b70 100644 +--- a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu ++++ b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu +@@ -214,7 +214,7 @@ PD_REGISTER_KERNEL(rms_norm_grad, + float, + phi::float16) {} + +-#elif CUDNN_VERSION_MIN(8, 1, 0) ++#elif CUDNN_VERSION_MIN(8, 1, 0) || defined(PADDLE_WITH_COREX) + + PD_REGISTER_KERNEL(rms_norm_grad, + GPU, diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 11481a8b02..136593297e 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h @@ -1009,3 +1070,5 @@ index ffdf995ece..4a7e03f4ad 100644 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(unsqueeze, GPU, +-- +2.25.1 diff --git a/backends/iluvatar_gpu/runtime/iluvatar_context.h b/backends/iluvatar_gpu/runtime/iluvatar_context.h index a6ea37a0caa..a007bf4154c 100644 --- a/backends/iluvatar_gpu/runtime/iluvatar_context.h +++ b/backends/iluvatar_gpu/runtime/iluvatar_context.h @@ -77,11 +77,6 @@ class DnnWorkspaceHandle { namespace { // NOLINT inline cudnnHandle_t dnn_handle_ = nullptr; inline std::once_flag flag_dnn_; - -inline cusolverDnHandle_t solver_handle_ = nullptr; -inline std::function solver_handle_creator_{nullptr}; -inline std::once_flag flag_solver_; - inline void InitDnnHandle(cudnnHandle_t* handle, gpuStream_t stream, Place place) { @@ -119,29 +114,6 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, const gpuStream_t& stream) { return DnnWorkspaceHandle(alloactor, stream); } - -inline void InitSolverHandle(cusolverDnHandle_t* handle, gpuStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); -} - -inline cusolverDnHandle_t GetSolverHandle(gpuStream_t stream) { - std::call_once(flag_solver_, [&]() { - if (!solver_handle_) { - if (!solver_handle_creator_) { - InitSolverHandle(&solver_handle_, stream); - } else { - solver_handle_ = solver_handle_creator_(); - } - } - }); - PADDLE_ENFORCE_NOT_NULL( - solver_handle_, - common::errors::InvalidArgument( - "The GPU solver handle is nullptr. It must not be null.")); - return solver_handle_; -} - } // namespace phi namespace iluvatar { From f9149c7e42080307ff1d01dca69301369b2dacf2 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Wed, 24 Dec 2025 11:21:58 +0800 Subject: [PATCH 6/7] update --- backends/iluvatar_gpu/CMakeLists.txt | 4 +- .../rms_norm_grad_kernel_register.cc | 4 +- .../cuda_kernels/rms_norm_kernel_register.cc | 4 +- .../iluvatar_gpu/patches/paddle-corex.patch | 141 +++++------------- 4 files changed, 45 insertions(+), 108 deletions(-) diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt index e2684d446e6..051e6c16eb4 100644 --- a/backends/iluvatar_gpu/CMakeLists.txt +++ b/backends/iluvatar_gpu/CMakeLists.txt @@ -195,8 +195,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/put_along_axis_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/randint_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/reduce_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/fused_rms_norm_quant_grad_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/roi_align_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/scale_kernel.cu diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc index 46028b9d54b..e669cc26e85 100644 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc @@ -20,10 +20,10 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/rms_norm_grad_kernel.h" -PD_CUSTOM_KERNEL_REGISTER(rms_norm_grad, +PD_CUSTOM_KERNEL_REGISTER(fused_rms_norm_quant_grad, iluvatar_gpu, ALL_LAYOUT, - phi::RmsNormGradKernel, + phi::RmsNormQuantGradKernel, float, phi::dtype::float16, phi::dtype::bfloat16) {} diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc index 52f124dd919..4dcc3449fb3 100644 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc +++ b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc @@ -37,10 +37,10 @@ limitations under the License. #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/rms_norm_kernel.h" -PD_CUSTOM_KERNEL_REGISTER(rms_norm, +PD_CUSTOM_KERNEL_REGISTER(fused_rms_norm_quant, iluvatar_gpu, ALL_LAYOUT, - phi::RmsNormKernel, + phi::RmsNormQuantKernel, float, phi::dtype::float16, phi::dtype::bfloat16) {} diff --git a/backends/iluvatar_gpu/patches/paddle-corex.patch b/backends/iluvatar_gpu/patches/paddle-corex.patch index 6526c8423df..929bbb4c621 100644 --- a/backends/iluvatar_gpu/patches/paddle-corex.patch +++ b/backends/iluvatar_gpu/patches/paddle-corex.patch @@ -1,55 +1,8 @@ -From 6484778861092b0d56309f5be9aae4d6c23726ef Mon Sep 17 00:00:00 2001 -From: tianyuzhou668 <2431054748@qq.com> -Date: Wed, 12 Nov 2025 15:37:49 +0800 -Subject: [PATCH] Fix - ---- - CMakeLists.txt | 2 +- - .../operators/collective/recv_v2_op.cu.cc | 2 +- - .../operators/collective/send_v2_op.cu.cc | 2 +- - .../fluid/platform/device/gpu/nccl_helper.h | 2 +- - paddle/phi/backends/dynload/cudnn.cc | 8 ++ - paddle/phi/backends/dynload/cudnn.h | 28 ++++++- - paddle/phi/backends/dynload/cusolver.h | 2 - - paddle/phi/backends/dynload/cusparse.h | 2 + - .../backends/gpu/cuda/cuda_device_function.h | 4 +- - paddle/phi/backends/gpu/cuda/cuda_graph.cc | 2 +- - paddle/phi/backends/gpu/cuda/cuda_helper.h | 2 +- - paddle/phi/backends/gpu/cuda/cudnn_desc.h | 16 +++- - paddle/phi/backends/gpu/cuda/cudnn_helper.h | 2 +- - paddle/phi/backends/gpu/gpu_launch_config.h | 16 +++- - paddle/phi/backends/gpu/gpu_primitives.h | 25 ++++++ - paddle/phi/backends/gpu/gpu_types.h | 5 ++ - paddle/phi/core/distributed/nccl_tools.cc | 2 +- - paddle/phi/core/enforce.h | 6 +- - paddle/phi/core/utils/data_type.h | 2 +- - paddle/phi/kernels/funcs/activation_functor.h | 20 +++++ - paddle/phi/kernels/funcs/affine_grid_utils.h | 2 + - paddle/phi/kernels/funcs/blas/blas_impl.cu.h | 18 ++++- - paddle/phi/kernels/funcs/cufft_util.h | 80 +++++++++++++++++++ - paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 4 - - paddle/phi/kernels/funcs/reduce_function.h | 2 +- - paddle/phi/kernels/funcs/segmented_array.h | 8 ++ - paddle/phi/kernels/funcs/softmax_impl.h | 1 + - .../fusion/gpu/fused_layernorm_kernel.cu | 4 - - .../fused_layernorm_residual_dropout_bias.h | 17 ---- - paddle/phi/kernels/gpu/elementwise_grad.h | 4 + - .../phi/kernels/gpu/layer_norm_grad_kernel.cu | 2 +- - paddle/phi/kernels/gpu/layer_norm_kernel.cu | 2 +- - .../phi/kernels/gpu/rms_norm_grad_kernel.cu | 2 +- - .../kernels/primitive/compute_primitives.h | 24 +++--- - paddle/phi/kernels/reduce_sum_kernel.cc | 2 + - paddle/phi/kernels/shape_kernel.cc | 2 + - paddle/phi/kernels/squeeze_kernel.cc | 2 + - paddle/phi/kernels/strided_slice_kernel.cc | 2 + - paddle/phi/kernels/unsqueeze_kernel.cc | 2 + - 39 files changed, 266 insertions(+), 64 deletions(-) - diff --git a/CMakeLists.txt b/CMakeLists.txt -index 1a4460a3be..b6c6b4a797 100755 +index 6ea73365e8..766cc92f49 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -64,7 +64,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) +@@ -65,7 +65,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF) option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF) @@ -72,7 +25,7 @@ index ab866f015c..10a8111637 100644 #endif int, diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc -index e58e29465d..1755595405 100644 +index 28c126d2f2..c382e68188 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -203,7 +203,7 @@ PD_REGISTER_STRUCT_KERNEL(send_v2, @@ -124,10 +77,10 @@ index 5a18808d47..749073ce38 100644 std::call_once(cudnn_dso_flag, []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -index 4ff2e528a9..956bac0c64 100644 +index 092365a961..6b05da600b 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -@@ -141,7 +141,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( +@@ -134,7 +134,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( template __forceinline__ __device__ T @@ -136,7 +89,7 @@ index 4ff2e528a9..956bac0c64 100644 return __shfl_sync(mask, val, src_line, width); } -@@ -158,7 +158,7 @@ __device__ T reduceSum(T val, int tid, int len) { +@@ -151,7 +151,7 @@ __device__ T reduceSum(T val, int tid, int len) { // I use Warp-Level Parallelism and assume the Warp size // is 32 which may be different for different GPU, // but most card's warp size is 32. @@ -159,10 +112,10 @@ index 1c4f13e6b4..a90c0f6d21 100644 } diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h -index 02753c0333..bcf435dfae 100644 +index dfd3945e9a..08eda4978c 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h -@@ -85,7 +85,7 @@ cudaDataType_t ToCudaDataType() { +@@ -82,7 +82,7 @@ cudaDataType_t ToCudaDataType() { return CUDA_R_64F; } else if (std::is_same::value) { return CUDA_R_16F; @@ -212,10 +165,10 @@ index 189e97534e..8f805afe8c 100644 void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format) { diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h -index 28c3d14d37..5dc5f79178 100644 +index 8b293d3007..35245dddb0 100644 --- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h +++ b/paddle/phi/backends/gpu/cuda/cudnn_helper.h -@@ -125,7 +125,7 @@ class CudnnDataType { +@@ -124,7 +124,7 @@ class CudnnDataType { #endif // CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1 @@ -267,7 +220,7 @@ index af1c7ba8b9..132e488061 100644 const int capability = dev_ctx.GetComputeCapability(); GpuLaunchConfig config; diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h -index ab505091ab..8b7dd5ff86 100644 +index a7df8a4023..d4ff45d8d5 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -134,13 +134,38 @@ CUDA_ATOMIC_WRAPPER(Add, int16_t) { @@ -439,11 +392,10 @@ index 9c9ab5dff9..ecf4e8f5e8 100644 template struct CudaLogFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/funcs/affine_grid_utils.h b/paddle/phi/kernels/funcs/affine_grid_utils.h -index 1df6184141..fc6015b209 100644 +index 70abf63a3d..af6f2136c5 100644 --- a/paddle/phi/kernels/funcs/affine_grid_utils.h +++ b/paddle/phi/kernels/funcs/affine_grid_utils.h -@@ -15,7 +15,9 @@ - #pragma once +@@ -16,7 +16,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" @@ -454,10 +406,10 @@ index 1df6184141..fc6015b209 100644 #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h -index ae7b67de6d..ff60fd20ba 100644 +index fc86cc09d1..6295ff91ac 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h -@@ -1754,7 +1754,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1755,7 +1755,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, const phi::bfloat16 *B, phi::bfloat16 beta, phi::bfloat16 *C) const { @@ -466,7 +418,7 @@ index ae7b67de6d..ff60fd20ba 100644 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int64_t lda = (transA == CblasNoTrans) ? K : M; -@@ -1764,6 +1764,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1765,6 +1765,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; @@ -474,7 +426,7 @@ index ae7b67de6d..ff60fd20ba 100644 PADDLE_ENFORCE_GE( dev_ctx_.GetComputeCapability(), 80, -@@ -1771,6 +1772,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, +@@ -1772,6 +1773,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, "cublas bf16 gemm requires GPU compute capability >= 80," "but received %d", dev_ctx_.GetComputeCapability())); @@ -482,7 +434,7 @@ index ae7b67de6d..ff60fd20ba 100644 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); -@@ -2292,12 +2294,13 @@ inline void Blas::GEMM(bool transA, +@@ -2293,12 +2295,13 @@ inline void Blas::GEMM(bool transA, phi::bfloat16 beta, phi::bfloat16 *C, int ldc) const { @@ -497,7 +449,7 @@ index ae7b67de6d..ff60fd20ba 100644 PADDLE_ENFORCE_GE( dev_ctx_.GetComputeCapability(), 80, -@@ -2305,6 +2308,7 @@ inline void Blas::GEMM(bool transA, +@@ -2306,6 +2309,7 @@ inline void Blas::GEMM(bool transA, "cublas bf16 gemm requires GPU compute capability >= 80," "but received %d", dev_ctx_.GetComputeCapability())); @@ -505,7 +457,7 @@ index ae7b67de6d..ff60fd20ba 100644 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); -@@ -2772,7 +2776,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -2802,7 +2806,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, int64_t batchCount, int64_t strideA, int64_t strideB) const { @@ -514,7 +466,7 @@ index ae7b67de6d..ff60fd20ba 100644 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int64_t lda = (transA == CblasNoTrans) ? K : M; -@@ -2851,7 +2855,11 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -2881,7 +2885,11 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, static_cast(ldc), strideC, static_cast(batchCount), @@ -526,7 +478,7 @@ index ae7b67de6d..ff60fd20ba 100644 algo)); }); } -@@ -3145,7 +3153,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -3175,7 +3183,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, phi::bfloat16 beta, phi::bfloat16 **C, int batchCount) const { @@ -535,7 +487,7 @@ index ae7b67de6d..ff60fd20ba 100644 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; -@@ -3156,6 +3164,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -3186,6 +3194,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; @@ -543,7 +495,7 @@ index ae7b67de6d..ff60fd20ba 100644 PADDLE_ENFORCE_GE( dev_ctx_.GetComputeCapability(), 80, -@@ -3163,6 +3172,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, +@@ -3193,6 +3202,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublas bf16 gemm requires GPU compute capability >= 80," "but received %d", dev_ctx_.GetComputeCapability())); @@ -692,10 +644,10 @@ index df4f214e66..e31b8eb1f6 100644 } // namespace detail } // namespace funcs diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h -index 4eae698648..9247535e0d 100644 +index 92dccf18ce..5cc9937d7a 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h -@@ -44,11 +44,7 @@ using LayerNormParamType = typename CudnnDataType::BatchNormParamType; +@@ -37,11 +37,7 @@ using LayerNormParamType = typename CudnnDataType::BatchNormParamType; inline static int GetDesiredBlockDim(int64_t block_dim) { const int kMaxBlockDim = 512; @@ -708,10 +660,10 @@ index 4eae698648..9247535e0d 100644 } diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h -index 24c30ae7e2..ce5cf15176 100644 +index e89969e9dc..65e744f37d 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h -@@ -1139,7 +1139,7 @@ void ReduceKernel(const KPDevice& dev_ctx, +@@ -1131,7 +1131,7 @@ void ReduceKernel(const KPDevice& dev_ctx, config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16 && config.reduce_num <= std::numeric_limits::max(); @@ -747,7 +699,7 @@ index dad852093e..71adfaf3ed 100644 auto ptr = allocation->ptr(); allocations.emplace_back(std::move(allocation)); diff --git a/paddle/phi/kernels/funcs/softmax_impl.h b/paddle/phi/kernels/funcs/softmax_impl.h -index 361936305c..f4c680fe56 100644 +index 9f12293c0f..4e897bb433 100644 --- a/paddle/phi/kernels/funcs/softmax_impl.h +++ b/paddle/phi/kernels/funcs/softmax_impl.h @@ -21,6 +21,7 @@ limitations under the License. */ @@ -759,10 +711,10 @@ index 361936305c..f4c680fe56 100644 namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu -index 3612a5fc89..634a61ebe1 100644 +index 77e3537124..8f6022bc76 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu -@@ -60,11 +60,7 @@ namespace fusion { +@@ -58,11 +58,7 @@ namespace fusion { namespace { @@ -829,10 +781,10 @@ index 9d4bb18d55..78bf0ad1b9 100644 } } diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h -index f3a2874b92..47d2c0b4d7 100644 +index 411ee4510c..36c2f8fba7 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h -@@ -213,7 +213,11 @@ void ElementwiseAddGrad(const GPUContext &dev_ctx, +@@ -352,7 +352,11 @@ void ElementwiseAddGrad(const GPUContext &dev_ctx, phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx); } else if (dx_data != dout_data && dy_data != dout_data) { auto size = x.numel(); @@ -845,10 +797,10 @@ index f3a2874b92..47d2c0b4d7 100644 dim3 grid_size = dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu -index d6c105e1b2..6d810ab0e6 100644 +index 701c19a56f..b5d3ba7846 100644 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu -@@ -305,16 +305,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in, +@@ -300,16 +300,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in, &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_h); // top_left_index is just input_index. @@ -872,10 +824,10 @@ index d6c105e1b2..6d810ab0e6 100644 funcs::BlockReduceMax(top_right_index, FINAL_MASK); int64_t in_bot_max_index = diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu -index 2645060f4c..6a38e20776 100644 +index 8280e95065..dd50aa08bf 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu -@@ -150,7 +150,7 @@ PD_REGISTER_KERNEL(layer_norm_grad, +@@ -232,7 +232,7 @@ PD_REGISTER_KERNEL(layer_norm_grad, kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); } } @@ -885,10 +837,10 @@ index 2645060f4c..6a38e20776 100644 GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu -index ed5f6438ab..52f1e9e1c7 100644 +index 892525493c..854f13a239 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu -@@ -708,7 +708,7 @@ PD_REGISTER_KERNEL( +@@ -765,7 +765,7 @@ PD_REGISTER_KERNEL( kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); } @@ -897,19 +849,6 @@ index ed5f6438ab..52f1e9e1c7 100644 PD_REGISTER_KERNEL(layer_norm, GPU, ALL_LAYOUT, -diff --git a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu -index 20015f7b87..8fee4c4b70 100644 ---- a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu -+++ b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu -@@ -214,7 +214,7 @@ PD_REGISTER_KERNEL(rms_norm_grad, - float, - phi::float16) {} - --#elif CUDNN_VERSION_MIN(8, 1, 0) -+#elif CUDNN_VERSION_MIN(8, 1, 0) || defined(PADDLE_WITH_COREX) - - PD_REGISTER_KERNEL(rms_norm_grad, - GPU, diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 11481a8b02..136593297e 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h @@ -1070,5 +1009,3 @@ index ffdf995ece..4a7e03f4ad 100644 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(unsqueeze, GPU, --- -2.25.1 From aa3e7d1199fc78024dd2664342082bbdc85b1438 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Wed, 24 Dec 2025 11:24:53 +0800 Subject: [PATCH 7/7] restore iluvatar --- Paddle | 2 +- .../iluvatar_gpu/cmake/external/magma.cmake | 94 ---- .../kernels/cuda_kernels/eig_grad_kernel.cu | 519 ------------------ .../kernels/cuda_kernels/eig_kernel.cu | 123 ----- .../tests/unittests/test_eig_op_iluvatar.py | 391 ------------- .../unittests/test_linalg_eig_op_iluvatar.py | 42 -- 6 files changed, 1 insertion(+), 1170 deletions(-) delete mode 100644 backends/iluvatar_gpu/cmake/external/magma.cmake delete mode 100644 backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu delete mode 100644 backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu delete mode 100644 backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py delete mode 100644 backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py diff --git a/Paddle b/Paddle index 24c29dc7eb6..935f0a6601f 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 24c29dc7eb6308a4014e39f8679f428e82008137 +Subproject commit 935f0a6601f90f4eb1d091682b31b243da50851b diff --git a/backends/iluvatar_gpu/cmake/external/magma.cmake b/backends/iluvatar_gpu/cmake/external/magma.cmake deleted file mode 100644 index 66c9560f6ca..00000000000 --- a/backends/iluvatar_gpu/cmake/external/magma.cmake +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. - -include(ExternalProject) - -set(MAGMA_PREFIX_DIR ${THIRD_PARTY_PATH}/magma) -set(MAGMA_DOWNLOAD_DIR - ${PADDLE_SOURCE_DIR}/third_party/magma/${CMAKE_SYSTEM_NAME}) -set(MAGMA_INSTALL_DIR ${THIRD_PARTY_PATH}/install/magma) -set(MAGMA_LIB_DIR ${MAGMA_INSTALL_DIR}/lib) - -# use precompiled library. use magma tag v2.9.0 on 07/28/2025 -# https://github.com/icl-utk-edu/magma/tree/v2.9.0 -if(LINUX) - set(MAGMA_FILE - "magma_lnx_metax_v2.9.0.20250728.tar.gz" - CACHE STRING "" FORCE) - set(MAGMA_URL - "https://paddlepaddledeps.bj.bcebos.com/${MAGMA_FILE}" - CACHE STRING "" FORCE) - set(MAGMA_URL_MD5 3aa4106aa11ba0aeb0036b450a53e972) - set(MAGMA_LIB "${MAGMA_LIB_DIR}/libmagma.so") -elseif(WIN32) - message("magma do not support windows yet, skip ...") -else() # MacOS - message("magma do not support macos or other platform yet, skip ...") -endif() - -function(download_magma) - message( - STATUS "Downloading ${MAGMA_URL} to ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}") - # NOTE: If the version is updated, consider emptying the folder; maybe add - # timeout - file( - DOWNLOAD ${MAGMA_URL} ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE} - EXPECTED_MD5 ${MAGMA_URL_MD5} - STATUS ERR) - if(ERR EQUAL 0) - message(STATUS "Download ${MAGMA_FILE} success") - else() - message( - FATAL_ERROR - "Download failed, error: ${ERR}\n You can try downloading ${MAGMA_FILE} again" - ) - endif() -endfunction() - -# Download and check magma. -if(EXISTS ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}) - file(MD5 ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE} MAGMA_MD5) - if(NOT MAGMA_MD5 STREQUAL MAGMA_URL_MD5) - # clean build file - file(REMOVE_RECURSE ${MAGMA_PREFIX_DIR}) - file(REMOVE_RECURSE ${MAGMA_INSTALL_DIR}) - download_magma() - endif() -else() - download_magma() -endif() - -ExternalProject_Add( - extern_magma - ${EXTERNAL_PROJECT_LOG_ARGS} - URL ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE} - URL_MD5 ${MAGMA_URL_MD5} - DOWNLOAD_DIR ${MAGMA_DOWNLOAD_DIR} - SOURCE_DIR ${MAGMA_LIB_DIR} - PREFIX ${MAGMA_PREFIX_DIR} - DOWNLOAD_NO_PROGRESS 1 - PATCH_COMMAND "" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - BUILD_BYPRODUCTS ${MAGMA_LIB}) - -add_definitions(-DPADDLE_WITH_MAGMA) - -add_library(magma SHARED IMPORTED GLOBAL) - -set_target_properties(magma PROPERTIES IMPORTED_LOCATION ${MAGMA_LIB}) - -add_dependencies(magma extern_magma) diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu deleted file mode 100644 index 7748e32cd60..00000000000 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu +++ /dev/null @@ -1,519 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cusolver.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/cpu/eig.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/for_range.h" -#include "paddle/phi/kernels/funcs/unsqueeze.h" -#include "paddle/phi/kernels/transpose_kernel.h" -#include "runtime/iluvatar_context.h" - -namespace phi { - -template -void SolveLinearSystemGPU(const GPUContext& dev_ctx, - const T* matrix_data, - const T* rhs_data, - T* out_data, - int order, - int rhs_cols, - int batch_count); - -template <> -void SolveLinearSystemGPU>( - const phi::GPUContext& dev_ctx, - const phi::dtype::complex* - matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* - rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* - out_data, // device ptr, row-major, size batch*order*rhs_cols - int order, - int rhs_cols, - int batch_count) { - // handles - cublasHandle_t cublas_handle = dev_ctx.cublas_handle(); - // cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle(); - cusolverDnHandle_t cusolver_handle = GetSolverHandle(dev_ctx.stream()); - - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); - - // cuComplex constants - const cuComplex kAlpha = make_cuFloatComplex(1.0f, 0.0f); - const cuComplex kZero = make_cuFloatComplex(0.0f, 0.0f); - - // Sizes - const size_t A_one_bytes = - static_cast(order) * order * sizeof(cuComplex); - const size_t B_one_bytes = - static_cast(order) * rhs_cols * sizeof(cuComplex); - const size_t A_batch_bytes = A_one_bytes * batch_count; - const size_t B_batch_bytes = B_one_bytes * batch_count; - - const cuComplex* A_row_all = reinterpret_cast(matrix_data); - const cuComplex* B_row_all = reinterpret_cast(rhs_data); - cuComplex* X_row_all = reinterpret_cast(out_data); - - auto dA_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); - auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); - cuComplex* dA_col = reinterpret_cast(dA_col_alloc->ptr()); - cuComplex* dB_col = reinterpret_cast(dB_col_alloc->ptr()); - - auto d_pivots_alloc = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - static_cast(batch_count) * order * sizeof(int), - stream); - int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); - - auto d_info_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), - static_cast(batch_count) * sizeof(int), - stream); - int* d_info = reinterpret_cast(d_info_alloc->ptr()); - - // A_row layout: row-major (order x order), B_row layout: row-major (order - // x rhs_cols) - for (int i = 0; i < batch_count; ++i) { - const cuComplex* A_row = A_row_all + static_cast(i) * order * order; - cuComplex* A_col = dA_col + static_cast(i) * order * order; - const cuComplex* B_row = - B_row_all + static_cast(i) * order * rhs_cols; - cuComplex* B_col = dB_col + static_cast(i) * order * rhs_cols; - - // transpose A_row (row-major) -> A_col (column-major) via C = A^T - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam( - cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - order, - order, - &kAlpha, - A_row, - order, // lda: when interpreting A_row as (order x order) row-major, - // using order - &kZero, - nullptr, - order, - A_col, - order)); // ldc = order (column-major leading dim) - - // transpose B_row (row-major order x rhs_cols) -> B_col (column-major order - // x rhs_cols) - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam( - cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - order, - rhs_cols, - &kAlpha, - B_row, - rhs_cols, // lda when A_row is viewed row-major: leading = rhs_cols - &kZero, - nullptr, - rhs_cols, - B_col, - order)); // ldc = order - } - - int lwork = 0; - cuComplex* dA_col0 = dA_col; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrf_bufferSize( - cusolver_handle, order, order, dA_col0, order, &lwork)); - - size_t work_bytes = static_cast(lwork) * sizeof(cuComplex); - auto d_work_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); - cuComplex* d_work = reinterpret_cast(d_work_alloc->ptr()); - - for (int i = 0; i < batch_count; ++i) { - cuComplex* A_col = dA_col + static_cast(i) * order * order; - cuComplex* B_col = dB_col + static_cast(i) * order * rhs_cols; - int* pivots_i = d_pivots + static_cast(i) * order; - int* info_i = d_info + i; - - // getrf (LU factorization) on A_col (column-major) - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrf( - cusolver_handle, order, order, A_col, order, d_work, pivots_i, info_i)); - - // getrs: solve A_col * X_col = B_col - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrs( - cusolver_handle, - CUBLAS_OP_N, // no transpose on column-major matrix - order, - rhs_cols, - A_col, - order, - pivots_i, - B_col, - order, - info_i)); - } - - for (int i = 0; i < batch_count; ++i) { - cuComplex* B_col = dB_col + static_cast(i) * order * - rhs_cols; // X in column-major - cuComplex* X_row = X_row_all + static_cast(i) * order * - rhs_cols; // target row-major - - // transpose X_col -> X_row - // We use C = A^T : A has shape (order x rhs_cols) in column-major, so C - // will be (rhs_cols x order), but we want X_row with shape (order x - // rhs_cols) in row-major; calling cublasCgeam with op=T and adjusted dims - // works: - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam( - cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - rhs_cols, - order, // rowsC = rhs_cols, colsC = order - &kAlpha, - B_col, - order, // B_col lda = order (col-major) - &kZero, - nullptr, - order, - X_row, - rhs_cols)); // X_row ldc = rhs_cols (row-major leading dimension) - } - - std::vector h_info(batch_count, 0); - phi::memory_utils::Copy(phi::CPUPlace(), - h_info.data(), - dev_ctx.GetPlace(), - d_info, - static_cast(batch_count) * sizeof(int), - reinterpret_cast(dev_ctx.stream())); - dev_ctx.Wait(); - - for (int i = 0; i < batch_count; ++i) { - PADDLE_ENFORCE_EQ( - h_info[i], - 0, - errors::External( - "cuSOLVER getrf/getrs failed at batch %d, info: %d", i, h_info[i])); - } -} - -template <> -void SolveLinearSystemGPU>( - const phi::GPUContext& dev_ctx, - const phi::dtype::complex* - matrix_data, // device ptr, row-major, size batch*order*order - const phi::dtype::complex* - rhs_data, // device ptr, row-major, size batch*order*rhs_cols - phi::dtype::complex* - out_data, // device ptr, row-major, size batch*order*rhs_cols - int order, - int rhs_cols, - int batch_count) { - // handles - cublasHandle_t cublas_handle = dev_ctx.cublas_handle(); - // cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle(); - cusolverDnHandle_t cusolver_handle = GetSolverHandle(dev_ctx.stream()); - - auto stream = phi::Stream(reinterpret_cast(dev_ctx.stream())); - - // cuDoubleComplex constants - const cuDoubleComplex kAlpha = make_cuDoubleComplex(1.0f, 0.0f); - const cuDoubleComplex kZero = make_cuDoubleComplex(0.0f, 0.0f); - - // Sizes - const size_t A_one_bytes = - static_cast(order) * order * sizeof(cuDoubleComplex); - const size_t B_one_bytes = - static_cast(order) * rhs_cols * sizeof(cuDoubleComplex); - const size_t A_batch_bytes = A_one_bytes * batch_count; - const size_t B_batch_bytes = B_one_bytes * batch_count; - - const cuDoubleComplex* A_row_all = - reinterpret_cast(matrix_data); - const cuDoubleComplex* B_row_all = - reinterpret_cast(rhs_data); - cuDoubleComplex* X_row_all = reinterpret_cast(out_data); - - auto dA_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream); - auto dB_col_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream); - cuDoubleComplex* dA_col = - reinterpret_cast(dA_col_alloc->ptr()); - cuDoubleComplex* dB_col = - reinterpret_cast(dB_col_alloc->ptr()); - - auto d_pivots_alloc = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - static_cast(batch_count) * order * sizeof(int), - stream); - int* d_pivots = reinterpret_cast(d_pivots_alloc->ptr()); - - auto d_info_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), - static_cast(batch_count) * sizeof(int), - stream); - int* d_info = reinterpret_cast(d_info_alloc->ptr()); - - // A_row layout: row-major (order x order), B_row layout: row-major (order - // x rhs_cols) - for (int i = 0; i < batch_count; ++i) { - const cuDoubleComplex* A_row = - A_row_all + static_cast(i) * order * order; - cuDoubleComplex* A_col = dA_col + static_cast(i) * order * order; - const cuDoubleComplex* B_row = - B_row_all + static_cast(i) * order * rhs_cols; - cuDoubleComplex* B_col = dB_col + static_cast(i) * order * rhs_cols; - - // transpose A_row (row-major) -> A_col (column-major) via C = A^T - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam( - cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - order, - order, - &kAlpha, - A_row, - order, // lda: when interpreting A_row as (order x order) row-major, - // using order - &kZero, - nullptr, - order, - A_col, - order)); // ldc = order (column-major leading dim) - - // transpose B_row (row-major order x rhs_cols) -> B_col (column-major order - // x rhs_cols) - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam( - cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - order, - rhs_cols, - &kAlpha, - B_row, - rhs_cols, // lda when A_row is viewed row-major: leading = rhs_cols - &kZero, - nullptr, - rhs_cols, - B_col, - order)); // ldc = order - } - - int lwork = 0; - cuDoubleComplex* dA_col0 = dA_col; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrf_bufferSize( - cusolver_handle, order, order, dA_col0, order, &lwork)); - - size_t work_bytes = static_cast(lwork) * sizeof(cuDoubleComplex); - auto d_work_alloc = - phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream); - cuDoubleComplex* d_work = - reinterpret_cast(d_work_alloc->ptr()); - - for (int i = 0; i < batch_count; ++i) { - cuDoubleComplex* A_col = dA_col + static_cast(i) * order * order; - cuDoubleComplex* B_col = dB_col + static_cast(i) * order * rhs_cols; - int* pivots_i = d_pivots + static_cast(i) * order; - int* info_i = d_info + i; - - // getrf (LU factorization) on A_col (column-major) - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrf( - cusolver_handle, order, order, A_col, order, d_work, pivots_i, info_i)); - - // getrs: solve A_col * X_col = B_col - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrs( - cusolver_handle, - CUBLAS_OP_N, // no transpose on column-major matrix - order, - rhs_cols, - A_col, - order, - pivots_i, - B_col, - order, - info_i)); - } - - for (int i = 0; i < batch_count; ++i) { - cuDoubleComplex* B_col = dB_col + static_cast(i) * order * - rhs_cols; // X in column-major - cuDoubleComplex* X_row = X_row_all + static_cast(i) * order * - rhs_cols; // target row-major - - // transpose X_col -> X_row - // We use C = A^T : A has shape (order x rhs_cols) in column-major, so C - // will be (rhs_cols x order), but we want X_row with shape (order x - // rhs_cols) in row-major; calling cublasZgeam with op=T and adjusted dims - // works: - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam( - cublas_handle, - CUBLAS_OP_T, - CUBLAS_OP_N, - rhs_cols, - order, // rowsC = rhs_cols, colsC = order - &kAlpha, - B_col, - order, // B_col lda = order (col-major) - &kZero, - nullptr, - order, - X_row, - rhs_cols)); // X_row ldc = rhs_cols (row-major leading dimension) - } - - std::vector h_info(batch_count, 0); - phi::memory_utils::Copy(phi::CPUPlace(), - h_info.data(), - dev_ctx.GetPlace(), - d_info, - static_cast(batch_count) * sizeof(int), - reinterpret_cast(dev_ctx.stream())); - dev_ctx.Wait(); - - for (int i = 0; i < batch_count; ++i) { - PADDLE_ENFORCE_EQ( - h_info[i], - 0, - errors::External( - "cuSOLVER getrf/getrs failed at batch %d, info: %d", i, h_info[i])); - } -} - -template -void ComputeBackwardForComplexInputGPU(const DenseTensor& L, - const DenseTensor& V, - const paddle::optional& gL, - const paddle::optional& gV, - T* x_grad_data, - int batch_count, - int order, - const Context& dev_ctx) { - DenseTensor gL_safe; - if (gL.get_ptr()) { - gL_safe = gL.get(); - } else { - gL_safe = - Fill(dev_ctx, common::vectorize(L.dims()), T(0)); - } - - DenseTensor gV_safe; - if (gV.get_ptr()) { - gV_safe = gV.get(); - } else { - gV_safe = - Fill(dev_ctx, common::vectorize(V.dims()), T(0)); - } - DenseTensor trans_v = phi::TransposeLast2Dim(dev_ctx, V); - DenseTensor Vh = phi::Conj(dev_ctx, trans_v); - DenseTensor Lconj = phi::Conj(dev_ctx, L); - DenseTensor Econj = phi::Subtract(dev_ctx, - phi::funcs::Unsqueeze(Lconj, -2), - phi::funcs::Unsqueeze(Lconj, -1)); - DenseTensor VhgV = phi::Matmul(dev_ctx, Vh, gV_safe); - DenseTensor diag_real = phi::Real(dev_ctx, VhgV); - - auto cpu_place = phi::CPUPlace(); - phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); - auto* cpu_ctx = static_cast(pool.Get(cpu_place)); - - DenseTensor diag_real_cpu; - diag_real_cpu.Resize(diag_real.dims()); - phi::Copy(dev_ctx, diag_real, cpu_place, false, &diag_real_cpu); - - DenseTensor diag_res_cpu = - phi::funcs::BatchDiag((*cpu_ctx), diag_real_cpu, batch_count); - - DenseTensor diag_res; - dev_ctx.template Alloc(&diag_res); - phi::Copy(dev_ctx, diag_res_cpu, dev_ctx.GetPlace(), false, &diag_res); - - DenseTensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); - - auto numel = diag_unsqueezed.numel(); - DenseTensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); - diag_unsqueezed_complex.Resize(diag_unsqueezed.dims()); - auto* data_diag_un_com = dev_ctx.template Alloc( - &diag_unsqueezed_complex, static_cast(numel * sizeof(T))); - - phi::funcs::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor( - data_diag_un, data_diag_un_com, numel); - for_range(functor); - // real tensor multiply complex tensor in broadcast manner - DenseTensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); - DenseTensor res2 = phi::Matmul(dev_ctx, Vh, res1); - DenseTensor result = phi::Subtract(dev_ctx, VhgV, res2); - - result.Resize(V.dims()); - dev_ctx.template Alloc(&result); - result = phi::Divide(dev_ctx, result, Econj); - result = phi::funcs::DiagFill( - dev_ctx, order, order, order, 0, gL_safe, result); - DenseTensor rhs = phi::Matmul(dev_ctx, result, Vh); - - // solve linear system - // solve(Vh, rhs, out, m, k) - // Vh: matrix with shape [m,m] - // rhs: rhs with shape [m,k] - // x_grad: out - int m = static_cast(Vh.dims(-1)); - int k = static_cast(rhs.dims(-1)); - auto* matrix_data = Vh.data(); - auto* rhs_data = rhs.data(); - - SolveLinearSystemGPU( - dev_ctx, matrix_data, rhs_data, x_grad_data, m, k, batch_count); -} - -template -void EigGradKernel(const Context& dev_ctx, - const DenseTensor& out_w, - const DenseTensor& out_v, - const paddle::optional& dout_w, - const paddle::optional& dout_v, - DenseTensor* dx) { - auto* dx_data = dev_ctx.template Alloc>(dx); - if (dx->numel() == 0) { - return; - } - auto& dims = out_v.dims(); - phi::DDim dim_origin = dims; - int num_dims = dim_origin.size(); - int batch_count = BatchCount(out_v); - const int order = static_cast(dim_origin[num_dims - 1]); - - ComputeBackwardForComplexInputGPU, Context>( - out_w, out_v, dout_w, dout_v, dx_data, batch_count, order, dev_ctx); -} - -} // namespace phi - -// Register the kernel -PD_REGISTER_PLUGIN_KERNEL(eig_grad, - iluvatar_gpu, - ALL_LAYOUT, - phi::EigGradKernel, - float, - phi::complex64) { - kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); - kernel->InputAt(2).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); - kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); -} diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu deleted file mode 100644 index 8461b82ece7..00000000000 --- a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/backends/context_pool.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/eig.h" -#include "paddle/phi/kernels/eig_kernel.h" - -namespace phi { - -template -void EigKernel(const Context& dev_ctx, - const DenseTensor& x, - DenseTensor* out_w, - DenseTensor* out_v) { - dev_ctx.template Alloc>(out_w); - dev_ctx.template Alloc>(out_v); - - if (x.numel() == 0) { - return; - } - - auto cpu_place = phi::CPUPlace(); - phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance(); - auto* cpu_ctx = static_cast(pool.Get(cpu_place)); - - // prepare cpu Tensor here, since magma requires output on cpu - DenseTensor out_w_cpu, out_v_cpu; - out_w_cpu.Resize(out_w->dims()); - (*cpu_ctx).template Alloc>(&out_w_cpu); - out_v_cpu.Resize(x.dims()); - (*cpu_ctx).template Alloc>(&out_v_cpu); - - if (!IsComplexType(x.dtype())) { - // output still be complex though input is real - int batch_count = BatchCount(x); - int order = static_cast(x.dims()[x.dims().size() - 1]); - - DenseTensor real_w_cpu, real_v_cpu; - - std::vector real_w_dim = common::vectorize(out_w->dims()); - real_w_dim.back() *= 2; - real_w_cpu.Resize(common::make_ddim(real_w_dim)); - (*cpu_ctx).template Alloc>(&real_w_cpu); - real_v_cpu.Resize(x.dims()); - (*cpu_ctx).template Alloc>(&real_v_cpu); - - phi::ApplyEigKernelMagma, Context>( - dev_ctx, x, &real_w_cpu, &real_v_cpu); - - // 1. extract real part & imag part from real_w_cpu - DenseTensor real_part_cpu = phi::funcs::Slice>( - (*cpu_ctx), real_w_cpu, {-1}, {0}, {order}); - DenseTensor imag_part_cpu = phi::funcs::Slice>( - (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2}); - - // 2. construct complex values - auto* real_part_data = real_part_cpu.data>(); - auto* imag_part_data = imag_part_cpu.data>(); - int64_t out_w_numel = static_cast(out_w->numel()); - - phi::funcs::ForRange for_range((*cpu_ctx), out_w_numel); - phi::funcs::RealImagToComplexFunctor> functor( - real_part_data, - imag_part_data, - out_w_cpu.data>(), - out_w_numel); - for_range(functor); - - // 3. construct complex vectors - DenseTensor real_v_trans_cpu = - phi::TransposeLast2Dim, phi::CPUContext>( - (*cpu_ctx), real_v_cpu); - DenseTensor out_v_trans_cpu; - out_v_trans_cpu.Resize(x.dims()); - (*cpu_ctx).template Alloc>(&out_v_trans_cpu); - - phi::ConstructComplexVectors, - phi::dtype::Complex, - phi::CPUContext>(&out_v_trans_cpu, - out_w_cpu, - real_v_trans_cpu, - (*cpu_ctx), - batch_count, - order); - - TransposeTwoAxis, phi::CPUContext>( - out_v_trans_cpu, - &out_v_cpu, - x.dims().size() - 1, - x.dims().size() - 2, - (*cpu_ctx)); - - } else { - phi::ApplyEigKernelMagma(dev_ctx, x, &out_w_cpu, &out_v_cpu); - } - - // copy result from cpu to gpu tensor - phi::Copy(dev_ctx, out_w_cpu, dev_ctx.GetPlace(), false, out_w); - phi::Copy(dev_ctx, out_v_cpu, dev_ctx.GetPlace(), false, out_v); -} - -} // namespace phi - -PD_REGISTER_PLUGIN_KERNEL( - eig, iluvatar_gpu, ALL_LAYOUT, phi::EigKernel, float, phi::complex64) { - if (kernel_key.dtype() == phi::DataType::FLOAT32) { - kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); - kernel->OutputAt(1).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); - } -} diff --git a/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py deleted file mode 100644 index 3f5fbb9f7f3..00000000000 --- a/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py +++ /dev/null @@ -1,391 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest, skip_check_grad_ci -from utils import dygraph_guard - -import paddle -from paddle import base - - -# cast output to complex for numpy.linalg.eig -def cast_to_complex(input, output): - if input.dtype == np.float32: - output = output.astype(np.complex64) - elif input.dtype == np.float64: - output = output.astype(np.complex128) - return output - - -# define eig backward function for a single square matrix -def eig_backward(w, v, grad_w, grad_v): - v_tran = np.transpose(v) - v_tran = np.conjugate(v_tran) - w_conj = np.conjugate(w) - w_conj_l = w_conj.reshape(1, w.size) - w_conj_r = w_conj.reshape(w.size, 1) - w_conj_2d = w_conj_l - w_conj_r - - vhgv = np.matmul(v_tran, grad_v) - real_vhgv = np.real(vhgv) - diag_real = real_vhgv.diagonal() - - diag_2d = diag_real.reshape(1, w.size) - rhs = v * diag_2d - mid = np.matmul(v_tran, rhs) - result = vhgv - mid - - res = np.divide(result, w_conj_2d) - row, col = np.diag_indices_from(res) - res[row, col] = 1.0 - - tmp = np.matmul(res, v_tran) - dx = np.linalg.solve(v_tran, tmp) - return dx - - -class TestEigOp(OpTest): - def setUp(self): - paddle.enable_static() - paddle.device.set_device("iluvatar_gpu") - self.op_type = "eig" - self.python_api = paddle.linalg.eig - self.__class__.op_type = self.op_type - self.init_input() - self.inputs = {"X": OpTest.np_dtype_to_base_dtype(self.x)} - self.outputs = {"Eigenvalues": self.out[0], "Eigenvectors": self.out[1]} - - def init_input(self): - self.set_dtype() - self.set_dims() - self.x = np.random.random(self.shape).astype(self.dtype) - self.out = np.linalg.eig(self.x) - self.out = ( - cast_to_complex(self.x, self.out[0]), - cast_to_complex(self.x, self.out[1]), - ) - - # for the real input, a customized checker is needed - def checker(self, outs): - actual_out_w = outs[0].flatten() - expect_out_w = self.out[0].flatten() - actual_out_v = outs[1].flatten() - expect_out_v = self.out[1].flatten() - - length_w = len(expect_out_w) - act_w_real = np.sort( - np.array([np.abs(actual_out_w[i].real) for i in range(length_w)]) - ) - act_w_imag = np.sort( - np.array([np.abs(actual_out_w[i].imag) for i in range(length_w)]) - ) - exp_w_real = np.sort( - np.array([np.abs(expect_out_w[i].real) for i in range(length_w)]) - ) - exp_w_imag = np.sort( - np.array([np.abs(expect_out_w[i].imag) for i in range(length_w)]) - ) - - for i in range(length_w): - np.testing.assert_allclose( - act_w_real[i], - exp_w_real[i], - rtol=1e-06, - atol=1e-05, - err_msg="The eigenvalues real part have diff: \nExpected " - + str(act_w_real[i]) - + "\n" - + "But got: " - + str(exp_w_real[i]), - ) - np.testing.assert_allclose( - act_w_imag[i], - exp_w_imag[i], - rtol=1e-06, - atol=1e-05, - err_msg="The eigenvalues image part have diff: \nExpected " - + str(act_w_imag[i]) - + "\n" - + "But got: " - + str(exp_w_imag[i]), - ) - - length_v = len(expect_out_v) - act_v_real = np.sort( - np.array([np.abs(actual_out_v[i].real) for i in range(length_v)]) - ) - act_v_imag = np.sort( - np.array([np.abs(actual_out_v[i].imag) for i in range(length_v)]) - ) - exp_v_real = np.sort( - np.array([np.abs(expect_out_v[i].real) for i in range(length_v)]) - ) - exp_v_imag = np.sort( - np.array([np.abs(expect_out_v[i].imag) for i in range(length_v)]) - ) - - for i in range(length_v): - np.testing.assert_allclose( - act_v_real[i], - exp_v_real[i], - rtol=1e-06, - atol=1e-05, - err_msg="The eigenvectors real part have diff: \nExpected " - + str(act_v_real[i]) - + "\n" - + "But got: " - + str(exp_v_real[i]), - ) - np.testing.assert_allclose( - act_v_imag[i], - exp_v_imag[i], - rtol=1e-06, - atol=1e-05, - err_msg="The eigenvectors image part have diff: \nExpected " - + str(act_v_imag[i]) - + "\n" - + "But got: " - + str(exp_v_imag[i]), - ) - - def set_dtype(self): - self.dtype = np.complex64 - - def set_dims(self): - self.shape = (10, 10) - - def init_grad(self): - # grad_w, grad_v complex dtype - gtype = self.dtype - if self.dtype == np.float32: - gtype = np.complex64 - elif self.dtype == np.float64: - gtype = np.complex128 - self.grad_w = np.ones(self.out[0].shape, gtype) - self.grad_v = np.ones(self.out[1].shape, gtype) - self.grad_x = eig_backward(self.out[0], self.out[1], self.grad_w, self.grad_v) - - def test_check_output(self): - self.check_output_with_place_customized( - checker=self.checker, - place=paddle.CustomPlace("iluvatar_gpu", 0), - check_pir=True, - ) - - def test_check_grad(self): - self.init_grad() - self.check_grad( - ["X"], - ["Eigenvalues", "Eigenvectors"], - user_defined_grads=[self.grad_x], - user_defined_grad_outputs=[self.grad_w, self.grad_v], - check_pir=True, - ) - - -@skip_check_grad_ci( - reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig" -) -class TestFloat(TestEigOp): - def set_dtype(self): - self.dtype = np.float32 - - def test_check_grad(self): - pass - - -class TestEigStatic(TestEigOp): - def test_check_output_with_place(self): - paddle.enable_static() - place = paddle.CustomPlace("iluvatar_gpu", 0) - input_np = np.random.random([3, 3]).astype("complex") - expect_val, expect_vec = np.linalg.eig(input_np) - with base.program_guard(base.Program(), base.Program()): - input = paddle.static.data(name="input", shape=[3, 3], dtype="complex") - act_val, act_vec = paddle.linalg.eig(input) - - exe = base.Executor(place) - fetch_val, fetch_vec = exe.run( - base.default_main_program(), - feed={"input": input_np}, - fetch_list=[act_val, act_vec], - ) - np.testing.assert_allclose( - expect_val, - fetch_val, - rtol=1e-06, - atol=1e-06, - err_msg="The eigen values have diff: \nExpected " - + str(expect_val) - + "\n" - + "But got: " - + str(fetch_val), - ) - np.testing.assert_allclose( - np.abs(expect_vec), - np.abs(fetch_vec), - rtol=1e-06, - atol=1e-06, - err_msg="The eigen vectors have diff: \nExpected " - + str(np.abs(expect_vec)) - + "\n" - + "But got: " - + str(np.abs(fetch_vec)), - ) - - -class TestEigDyGraph(unittest.TestCase): - def test_check_output_with_place(self): - np.random.seed(1024) - input_np = np.random.random([3, 3]).astype("complex64") - expect_val, expect_vec = np.linalg.eig(input_np) - - paddle.set_device("iluvatar_gpu") - paddle.disable_static() - - input_tensor = paddle.to_tensor(input_np) - fetch_val, fetch_vec = paddle.linalg.eig(input_tensor) - - np.testing.assert_allclose( - expect_val, - fetch_val.numpy(), - rtol=1e-06, - atol=1e-06, - err_msg="The eigen values have diff: \nExpected " - + str(expect_val) - + "\n" - + "But got: " - + str(fetch_val), - ) - np.testing.assert_allclose( - np.abs(expect_vec), - np.abs(fetch_vec.numpy()), - rtol=1e-06, - atol=1e-06, - err_msg="The eigen vectors have diff: \nExpected " - + str(np.abs(expect_vec)) - + "\n" - + "But got: " - + str(np.abs(fetch_vec.numpy())), - ) - - # def test_check_grad(self): - # test_shape = [3, 3] - # test_type = 'float32' - # paddle.set_device("iluvatar_gpu") - - # np.random.seed(1024) - # input_np = np.random.random(test_shape).astype(test_type) - # real_w, real_v = np.linalg.eig(input_np) - - # grad_w = np.ones(real_w.shape, test_type) - # grad_v = np.ones(real_v.shape, test_type) - # grad_x = eig_backward(real_w, real_v, grad_w, grad_v) - - # with base.dygraph.guard(): - # x = paddle.to_tensor(input_np) - # x.stop_gradient = False - # w, v = paddle.linalg.eig(x) - # (w.sum() + v.sum()).backward() - - # np.testing.assert_allclose( - # np.abs(x.grad.numpy()), - # np.abs(grad_x), - # rtol=1e-05, - # atol=1e-05, - # err_msg='The grad x have diff: \nExpected ' - # + str(np.abs(grad_x)) - # + '\n' - # + 'But got: ' - # + str(np.abs(x.grad.numpy())), - # ) - - -class TestEigWrongDimsError(unittest.TestCase): - def test_error(self): - paddle.device.set_device("iluvatar_gpu") - paddle.disable_static() - a = np.random.random(3).astype("float32") - x = paddle.to_tensor(a) - self.assertRaises(ValueError, paddle.linalg.eig, x) - - -class TestEigNotSquareError(unittest.TestCase): - def test_error(self): - paddle.device.set_device("iluvatar_gpu") - paddle.disable_static() - a = np.random.random((1, 2, 3)).astype("float32") - x = paddle.to_tensor(a) - self.assertRaises(ValueError, paddle.linalg.eig, x) - - -class TestEigUnsupportedDtypeError(unittest.TestCase): - def test_error(self): - paddle.device.set_device("iluvatar_gpu") - paddle.disable_static() - a = (np.random.random((3, 3)) * 10).astype("int64") - x = paddle.to_tensor(a) - self.assertRaises(RuntimeError, paddle.linalg.eig, x) - - -class TestOptionalGradInput(unittest.TestCase): - def test_eager(self): - with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"): - x = paddle.randn(3, 3, requires_grad=True) - w, v = paddle.linalg.eig(x) - - np.testing.assert_allclose( - (x @ v).numpy(), - (w.unsqueeze(0) * v).numpy(), - atol=1e-5, - rtol=1e-5, - ) # Aμ = λμ - - # (dw_dx,) = paddle.grad(w, x, retain_graph=True) - # (dv_dx,) = paddle.grad(v, x, retain_graph=True) - # (dwdv_dx,) = paddle.grad([w, v], x) - # np.testing.assert_allclose( - # (dw_dx + dv_dx).numpy(), - # dwdv_dx.numpy(), - # atol=1e-5, - # rtol=1e-5, - # ) - - def test_dy2st(self): - with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"): - x = paddle.randn(3, 3, requires_grad=True) - - def f(x): - w, v = paddle.linalg.eig(x) - return ( - w, - v, - ) - - st_f = paddle.jit.to_static(f, full_graph=True, backend=None) - - w, v = st_f(x) - np.testing.assert_allclose( - (x @ v).numpy(), - (w.unsqueeze(0) * v).numpy(), - atol=1e-5, - rtol=1e-5, - ) # Aμ = λμ - - -if __name__ == "__main__": - unittest.main() diff --git a/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py deleted file mode 100644 index 18581f25af0..00000000000 --- a/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -from utils import dygraph_guard - -import paddle - - -class TestEigAPI0Size(unittest.TestCase): - def test_errors(self): - with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"): - for shape in [[0, 0], [0, 4, 4], [1, 0, 2, 3, 3]]: - x = paddle.randn(shape=shape, dtype="float32", requires_grad=True) - w, v = paddle.linalg.eig(x) - self.assertEqual(w.shape, shape[:-1]) - self.assertEqual(v.shape, shape) - - # (dw_dx,) = paddle.grad(w.abs().sum(), x, retain_graph=True) - # self.assertEqual(dw_dx.shape, x.shape) - # (dv_dx,) = paddle.grad(v.abs().sum(), x, retain_graph=True) - # self.assertEqual(dv_dx.shape, x.shape) - # (dwv_dx,) = paddle.grad( - # w.abs().sum() + v.abs().sum(), x, retain_graph=True - # ) - # self.assertEqual(dwv_dx.shape, x.shape) - - -if __name__ == "__main__": - unittest.main()