From 9e783f88a9ebe1f8e2f861b64b5f0b9b6ea4ce7e Mon Sep 17 00:00:00 2001
From: tianyuzhou668 <2431054748@qq.com>
Date: Fri, 21 Nov 2025 17:03:20 +0800
Subject: [PATCH 1/7] [ILUVATAR_GPU] Support eig_kernel

---
 Paddle                                        |   2 +-
 backends/iluvatar_gpu/CMakeLists.txt          |   5 +-
 .../iluvatar_gpu/cmake/external/magma.cmake   |  89 +++
 .../kernels/cuda_kernels/eig_grad_kernel.cu   | 519 ++++++++++++++++++
 .../kernels/cuda_kernels/eig_kernel.cu        | 123 +++++
 .../iluvatar_gpu/runtime/iluvatar_context.h   |  28 +
 .../tests/unittests/test_eig_op_iluvatar.py   | 391 +++++++++++++
 .../unittests/test_linalg_eig_op_iluvatar.py  |  42 ++
 8 files changed, 1197 insertions(+), 2 deletions(-)
 create mode 100644 backends/iluvatar_gpu/cmake/external/magma.cmake
 create mode 100644 backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu
 create mode 100644 backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu
 create mode 100644 backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py
 create mode 100644 backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py

diff --git a/Paddle b/Paddle
index 93a5410253b..86238399d4c 160000
--- a/Paddle
+++ b/Paddle
@@ -1 +1 @@
-Subproject commit 93a5410253bf2ca0945f4551e1a58ad7a5aec996
+Subproject commit 86238399d4c720a0dccc07d416ece8168225d757
diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
index 1b65161112d..827d5553c01 100644
--- a/backends/iluvatar_gpu/CMakeLists.txt
+++ b/backends/iluvatar_gpu/CMakeLists.txt
@@ -34,6 +34,7 @@ include(version)
 include(generic)
 include(cblas)
 include(external/eigen)
+include(external/magma)
 include(external/xxhash)
 include(external/zlib)
 include(external/protobuf)
@@ -119,11 +120,13 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/*.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/*.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/eigen/*.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/magma/magma_function.cc
   # cudnn/cublas
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cudnn.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublas.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublasLt.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc
+  ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/magma.cc
   # kernels/gpu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/spectral_norm_kernel.cu
@@ -1008,7 +1011,7 @@ target_link_libraries(
           ixattnbkd
           nccl
           # change nccl to ${FLAGCX_LIB} if compiling with FlagCX ${FLAGCX_LIB}
-)
+          magma)
 
 include_directories(BEFORE ${PADDLE_SOURCE_DIR})
 
diff --git a/backends/iluvatar_gpu/cmake/external/magma.cmake b/backends/iluvatar_gpu/cmake/external/magma.cmake
new file mode 100644
index 00000000000..612ea1b600e
--- /dev/null
+++ b/backends/iluvatar_gpu/cmake/external/magma.cmake
@@ -0,0 +1,89 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not
+# use this file except in compliance with the License. You may obtain a copy of
+# the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+include(ExternalProject)
+
+set(MAGMA_PREFIX_DIR ${THIRD_PARTY_PATH}/magma)
+set(MAGMA_DOWNLOAD_DIR
+    ${PADDLE_SOURCE_DIR}/third_party/magma/${CMAKE_SYSTEM_NAME})
+set(MAGMA_INSTALL_DIR ${THIRD_PARTY_PATH}/install/magma)
+set(MAGMA_LIB_DIR ${MAGMA_INSTALL_DIR}/lib)
+
+# Note(zhouwei): magma need fortran compiler which many machines don't have, so
+# use precompiled library. use magma tag v2.9.0 on 07/28/2025
+# https://github.com/icl-utk-edu/magma/tree/v2.9.0
+if(LINUX)
+  set(MAGMA_FILE
+      "magma_local.tar.gz"
+      CACHE STRING "" FORCE)
+  set(MAGMA_URL
+      "file:///home/tianyu.zhou/tyzhou/magma_local.tar.gz"
+      CACHE STRING "" FORCE)
+  set(MAGMA_URL_MD5 9715dfad9eb073e099f46feb6587232d)
+  set(MAGMA_LIB "${MAGMA_LIB_DIR}/libmagma.so")
+elseif(WIN32)
+  message("magma do not support windows yet, skip ...")
+else() # MacOS
+  message("magma do not support macos or other platform yet, skip ...")
+endif()
+
+function(download_magma)
+  message(
+    STATUS "Downloading ${MAGMA_URL} to ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}")
+  # NOTE: If the version is updated, consider emptying the folder; maybe add
+  # timeout
+  file(
+    DOWNLOAD ${MAGMA_URL} ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}
+    EXPECTED_MD5 ${MAGMA_URL_MD5}
+    STATUS ERR)
+  if(ERR EQUAL 0)
+    message(STATUS "Download ${MAGMA_FILE} success")
+  else()
+    message(
+      FATAL_ERROR
+        "Download failed, error: ${ERR}\n You can try downloading ${MAGMA_FILE} again"
+    )
+  endif()
+endfunction()
+
+# Download and check magma.
+if(EXISTS ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE})
+  file(MD5 ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE} MAGMA_MD5)
+  if(NOT MAGMA_MD5 STREQUAL MAGMA_URL_MD5)
+    # clean build file
+    file(REMOVE_RECURSE ${MAGMA_PREFIX_DIR})
+    file(REMOVE_RECURSE ${MAGMA_INSTALL_DIR})
+    download_magma()
+  endif()
+else()
+  download_magma()
+endif()
+
+ExternalProject_Add(
+  extern_magma
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  URL ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}
+  URL_MD5 ${MAGMA_URL_MD5}
+  DOWNLOAD_DIR ${MAGMA_DOWNLOAD_DIR}
+  SOURCE_DIR ${MAGMA_LIB_DIR}
+  PREFIX ${MAGMA_PREFIX_DIR}
+  DOWNLOAD_NO_PROGRESS 1
+  PATCH_COMMAND ""
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ""
+  BUILD_BYPRODUCTS ${MAGMA_LIB})
+
+add_definitions(-DPADDLE_WITH_MAGMA)
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu
new file mode 100644
index 00000000000..7748e32cd60
--- /dev/null
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu
@@ -0,0 +1,519 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/dynload/cublas.h"
+#include "paddle/phi/backends/dynload/cusolver.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/memory_utils.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/complex_kernel.h"
+#include "paddle/phi/kernels/cpu/eig.h"
+#include "paddle/phi/kernels/funcs/complex_functors.h"
+#include "paddle/phi/kernels/funcs/for_range.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+#include "runtime/iluvatar_context.h"
+
+namespace phi {
+
+template <typename T>
+void SolveLinearSystemGPU(const GPUContext& dev_ctx,
+                          const T* matrix_data,
+                          const T* rhs_data,
+                          T* out_data,
+                          int order,
+                          int rhs_cols,
+                          int batch_count);
+
+template <>
+void SolveLinearSystemGPU<phi::dtype::complex<float>>(
+    const phi::GPUContext& dev_ctx,
+    const phi::dtype::complex<float>*
+        matrix_data,  // device ptr, row-major, size batch*order*order
+    const phi::dtype::complex<float>*
+        rhs_data,  // device ptr, row-major, size batch*order*rhs_cols
+    phi::dtype::complex<float>*
+        out_data,  // device ptr, row-major, size batch*order*rhs_cols
+    int order,
+    int rhs_cols,
+    int batch_count) {
+  // handles
+  cublasHandle_t cublas_handle = dev_ctx.cublas_handle();
+  //   cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle();
+  cusolverDnHandle_t cusolver_handle = GetSolverHandle(dev_ctx.stream());
+
+  auto stream = phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()));
+
+  // cuComplex constants
+  const cuComplex kAlpha = make_cuFloatComplex(1.0f, 0.0f);
+  const cuComplex kZero = make_cuFloatComplex(0.0f, 0.0f);
+
+  // Sizes
+  const size_t A_one_bytes =
+      static_cast<size_t>(order) * order * sizeof(cuComplex);
+  const size_t B_one_bytes =
+      static_cast<size_t>(order) * rhs_cols * sizeof(cuComplex);
+  const size_t A_batch_bytes = A_one_bytes * batch_count;
+  const size_t B_batch_bytes = B_one_bytes * batch_count;
+
+  const cuComplex* A_row_all = reinterpret_cast<const cuComplex*>(matrix_data);
+  const cuComplex* B_row_all = reinterpret_cast<const cuComplex*>(rhs_data);
+  cuComplex* X_row_all = reinterpret_cast<cuComplex*>(out_data);
+
+  auto dA_col_alloc =
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
+  auto dB_col_alloc =
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
+  cuComplex* dA_col = reinterpret_cast<cuComplex*>(dA_col_alloc->ptr());
+  cuComplex* dB_col = reinterpret_cast<cuComplex*>(dB_col_alloc->ptr());
+
+  auto d_pivots_alloc = phi::memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      static_cast<size_t>(batch_count) * order * sizeof(int),
+      stream);
+  int* d_pivots = reinterpret_cast<int*>(d_pivots_alloc->ptr());
+
+  auto d_info_alloc =
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(),
+                               static_cast<size_t>(batch_count) * sizeof(int),
+                               stream);
+  int* d_info = reinterpret_cast<int*>(d_info_alloc->ptr());
+
+  //    A_row layout: row-major (order x order), B_row layout: row-major (order
+  //    x rhs_cols)
+  for (int i = 0; i < batch_count; ++i) {
+    const cuComplex* A_row = A_row_all + static_cast<size_t>(i) * order * order;
+    cuComplex* A_col = dA_col + static_cast<size_t>(i) * order * order;
+    const cuComplex* B_row =
+        B_row_all + static_cast<size_t>(i) * order * rhs_cols;
+    cuComplex* B_col = dB_col + static_cast<size_t>(i) * order * rhs_cols;
+
+    // transpose A_row (row-major) -> A_col (column-major) via C = A^T
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam(
+        cublas_handle,
+        CUBLAS_OP_T,
+        CUBLAS_OP_N,
+        order,
+        order,
+        &kAlpha,
+        A_row,
+        order,  // lda: when interpreting A_row as (order x order) row-major,
+                // using order
+        &kZero,
+        nullptr,
+        order,
+        A_col,
+        order));  // ldc = order (column-major leading dim)
+
+    // transpose B_row (row-major order x rhs_cols) -> B_col (column-major order
+    // x rhs_cols)
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam(
+        cublas_handle,
+        CUBLAS_OP_T,
+        CUBLAS_OP_N,
+        order,
+        rhs_cols,
+        &kAlpha,
+        B_row,
+        rhs_cols,  // lda when A_row is viewed row-major: leading = rhs_cols
+        &kZero,
+        nullptr,
+        rhs_cols,
+        B_col,
+        order));  // ldc = order
+  }
+
+  int lwork = 0;
+  cuComplex* dA_col0 = dA_col;
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrf_bufferSize(
+      cusolver_handle, order, order, dA_col0, order, &lwork));
+
+  size_t work_bytes = static_cast<size_t>(lwork) * sizeof(cuComplex);
+  auto d_work_alloc =
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream);
+  cuComplex* d_work = reinterpret_cast<cuComplex*>(d_work_alloc->ptr());
+
+  for (int i = 0; i < batch_count; ++i) {
+    cuComplex* A_col = dA_col + static_cast<size_t>(i) * order * order;
+    cuComplex* B_col = dB_col + static_cast<size_t>(i) * order * rhs_cols;
+    int* pivots_i = d_pivots + static_cast<size_t>(i) * order;
+    int* info_i = d_info + i;
+
+    // getrf (LU factorization) on A_col (column-major)
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrf(
+        cusolver_handle, order, order, A_col, order, d_work, pivots_i, info_i));
+
+    // getrs: solve A_col * X_col = B_col
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrs(
+        cusolver_handle,
+        CUBLAS_OP_N,  // no transpose on column-major matrix
+        order,
+        rhs_cols,
+        A_col,
+        order,
+        pivots_i,
+        B_col,
+        order,
+        info_i));
+  }
+
+  for (int i = 0; i < batch_count; ++i) {
+    cuComplex* B_col = dB_col + static_cast<size_t>(i) * order *
+                                    rhs_cols;  // X in column-major
+    cuComplex* X_row = X_row_all + static_cast<size_t>(i) * order *
+                                       rhs_cols;  // target row-major
+
+    // transpose X_col -> X_row
+    // We use C = A^T : A has shape (order x rhs_cols) in column-major, so C
+    // will be (rhs_cols x order), but we want X_row with shape (order x
+    // rhs_cols) in row-major; calling cublasCgeam with op=T and adjusted dims
+    // works:
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam(
+        cublas_handle,
+        CUBLAS_OP_T,
+        CUBLAS_OP_N,
+        rhs_cols,
+        order,  // rowsC = rhs_cols, colsC = order
+        &kAlpha,
+        B_col,
+        order,  // B_col lda = order (col-major)
+        &kZero,
+        nullptr,
+        order,
+        X_row,
+        rhs_cols));  // X_row ldc = rhs_cols (row-major leading dimension)
+  }
+
+  std::vector<int> h_info(batch_count, 0);
+  phi::memory_utils::Copy(phi::CPUPlace(),
+                          h_info.data(),
+                          dev_ctx.GetPlace(),
+                          d_info,
+                          static_cast<size_t>(batch_count) * sizeof(int),
+                          reinterpret_cast<void*>(dev_ctx.stream()));
+  dev_ctx.Wait();
+
+  for (int i = 0; i < batch_count; ++i) {
+    PADDLE_ENFORCE_EQ(
+        h_info[i],
+        0,
+        errors::External(
+            "cuSOLVER getrf/getrs failed at batch %d, info: %d", i, h_info[i]));
+  }
+}
+
+template <>
+void SolveLinearSystemGPU<phi::dtype::complex<double>>(
+    const phi::GPUContext& dev_ctx,
+    const phi::dtype::complex<double>*
+        matrix_data,  // device ptr, row-major, size batch*order*order
+    const phi::dtype::complex<double>*
+        rhs_data,  // device ptr, row-major, size batch*order*rhs_cols
+    phi::dtype::complex<double>*
+        out_data,  // device ptr, row-major, size batch*order*rhs_cols
+    int order,
+    int rhs_cols,
+    int batch_count) {
+  // handles
+  cublasHandle_t cublas_handle = dev_ctx.cublas_handle();
+  //   cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle();
+  cusolverDnHandle_t cusolver_handle = GetSolverHandle(dev_ctx.stream());
+
+  auto stream = phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()));
+
+  // cuDoubleComplex constants
+  const cuDoubleComplex kAlpha = make_cuDoubleComplex(1.0f, 0.0f);
+  const cuDoubleComplex kZero = make_cuDoubleComplex(0.0f, 0.0f);
+
+  // Sizes
+  const size_t A_one_bytes =
+      static_cast<size_t>(order) * order * sizeof(cuDoubleComplex);
+  const size_t B_one_bytes =
+      static_cast<size_t>(order) * rhs_cols * sizeof(cuDoubleComplex);
+  const size_t A_batch_bytes = A_one_bytes * batch_count;
+  const size_t B_batch_bytes = B_one_bytes * batch_count;
+
+  const cuDoubleComplex* A_row_all =
+      reinterpret_cast<const cuDoubleComplex*>(matrix_data);
+  const cuDoubleComplex* B_row_all =
+      reinterpret_cast<const cuDoubleComplex*>(rhs_data);
+  cuDoubleComplex* X_row_all = reinterpret_cast<cuDoubleComplex*>(out_data);
+
+  auto dA_col_alloc =
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
+  auto dB_col_alloc =
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
+  cuDoubleComplex* dA_col =
+      reinterpret_cast<cuDoubleComplex*>(dA_col_alloc->ptr());
+  cuDoubleComplex* dB_col =
+      reinterpret_cast<cuDoubleComplex*>(dB_col_alloc->ptr());
+
+  auto d_pivots_alloc = phi::memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      static_cast<size_t>(batch_count) * order * sizeof(int),
+      stream);
+  int* d_pivots = reinterpret_cast<int*>(d_pivots_alloc->ptr());
+
+  auto d_info_alloc =
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(),
+                               static_cast<size_t>(batch_count) * sizeof(int),
+                               stream);
+  int* d_info = reinterpret_cast<int*>(d_info_alloc->ptr());
+
+  //    A_row layout: row-major (order x order), B_row layout: row-major (order
+  //    x rhs_cols)
+  for (int i = 0; i < batch_count; ++i) {
+    const cuDoubleComplex* A_row =
+        A_row_all + static_cast<size_t>(i) * order * order;
+    cuDoubleComplex* A_col = dA_col + static_cast<size_t>(i) * order * order;
+    const cuDoubleComplex* B_row =
+        B_row_all + static_cast<size_t>(i) * order * rhs_cols;
+    cuDoubleComplex* B_col = dB_col + static_cast<size_t>(i) * order * rhs_cols;
+
+    // transpose A_row (row-major) -> A_col (column-major) via C = A^T
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam(
+        cublas_handle,
+        CUBLAS_OP_T,
+        CUBLAS_OP_N,
+        order,
+        order,
+        &kAlpha,
+        A_row,
+        order,  // lda: when interpreting A_row as (order x order) row-major,
+                // using order
+        &kZero,
+        nullptr,
+        order,
+        A_col,
+        order));  // ldc = order (column-major leading dim)
+
+    // transpose B_row (row-major order x rhs_cols) -> B_col (column-major order
+    // x rhs_cols)
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam(
+        cublas_handle,
+        CUBLAS_OP_T,
+        CUBLAS_OP_N,
+        order,
+        rhs_cols,
+        &kAlpha,
+        B_row,
+        rhs_cols,  // lda when A_row is viewed row-major: leading = rhs_cols
+        &kZero,
+        nullptr,
+        rhs_cols,
+        B_col,
+        order));  // ldc = order
+  }
+
+  int lwork = 0;
+  cuDoubleComplex* dA_col0 = dA_col;
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrf_bufferSize(
+      cusolver_handle, order, order, dA_col0, order, &lwork));
+
+  size_t work_bytes = static_cast<size_t>(lwork) * sizeof(cuDoubleComplex);
+  auto d_work_alloc =
+      phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream);
+  cuDoubleComplex* d_work =
+      reinterpret_cast<cuDoubleComplex*>(d_work_alloc->ptr());
+
+  for (int i = 0; i < batch_count; ++i) {
+    cuDoubleComplex* A_col = dA_col + static_cast<size_t>(i) * order * order;
+    cuDoubleComplex* B_col = dB_col + static_cast<size_t>(i) * order * rhs_cols;
+    int* pivots_i = d_pivots + static_cast<size_t>(i) * order;
+    int* info_i = d_info + i;
+
+    // getrf (LU factorization) on A_col (column-major)
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrf(
+        cusolver_handle, order, order, A_col, order, d_work, pivots_i, info_i));
+
+    // getrs: solve A_col * X_col = B_col
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrs(
+        cusolver_handle,
+        CUBLAS_OP_N,  // no transpose on column-major matrix
+        order,
+        rhs_cols,
+        A_col,
+        order,
+        pivots_i,
+        B_col,
+        order,
+        info_i));
+  }
+
+  for (int i = 0; i < batch_count; ++i) {
+    cuDoubleComplex* B_col = dB_col + static_cast<size_t>(i) * order *
+                                          rhs_cols;  // X in column-major
+    cuDoubleComplex* X_row = X_row_all + static_cast<size_t>(i) * order *
+                                             rhs_cols;  // target row-major
+
+    // transpose X_col -> X_row
+    // We use C = A^T : A has shape (order x rhs_cols) in column-major, so C
+    // will be (rhs_cols x order), but we want X_row with shape (order x
+    // rhs_cols) in row-major; calling cublasZgeam with op=T and adjusted dims
+    // works:
+    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam(
+        cublas_handle,
+        CUBLAS_OP_T,
+        CUBLAS_OP_N,
+        rhs_cols,
+        order,  // rowsC = rhs_cols, colsC = order
+        &kAlpha,
+        B_col,
+        order,  // B_col lda = order (col-major)
+        &kZero,
+        nullptr,
+        order,
+        X_row,
+        rhs_cols));  // X_row ldc = rhs_cols (row-major leading dimension)
+  }
+
+  std::vector<int> h_info(batch_count, 0);
+  phi::memory_utils::Copy(phi::CPUPlace(),
+                          h_info.data(),
+                          dev_ctx.GetPlace(),
+                          d_info,
+                          static_cast<size_t>(batch_count) * sizeof(int),
+                          reinterpret_cast<void*>(dev_ctx.stream()));
+  dev_ctx.Wait();
+
+  for (int i = 0; i < batch_count; ++i) {
+    PADDLE_ENFORCE_EQ(
+        h_info[i],
+        0,
+        errors::External(
+            "cuSOLVER getrf/getrs failed at batch %d, info: %d", i, h_info[i]));
+  }
+}
+
+template <typename T, typename Context>
+void ComputeBackwardForComplexInputGPU(const DenseTensor& L,
+                                       const DenseTensor& V,
+                                       const paddle::optional<DenseTensor>& gL,
+                                       const paddle::optional<DenseTensor>& gV,
+                                       T* x_grad_data,
+                                       int batch_count,
+                                       int order,
+                                       const Context& dev_ctx) {
+  DenseTensor gL_safe;
+  if (gL.get_ptr()) {
+    gL_safe = gL.get();
+  } else {
+    gL_safe =
+        Fill<T, Context>(dev_ctx, common::vectorize<int64_t>(L.dims()), T(0));
+  }
+
+  DenseTensor gV_safe;
+  if (gV.get_ptr()) {
+    gV_safe = gV.get();
+  } else {
+    gV_safe =
+        Fill<T, Context>(dev_ctx, common::vectorize<int64_t>(V.dims()), T(0));
+  }
+  DenseTensor trans_v = phi::TransposeLast2Dim<T>(dev_ctx, V);
+  DenseTensor Vh = phi::Conj<T>(dev_ctx, trans_v);
+  DenseTensor Lconj = phi::Conj<T>(dev_ctx, L);
+  DenseTensor Econj = phi::Subtract<T>(dev_ctx,
+                                       phi::funcs::Unsqueeze(Lconj, -2),
+                                       phi::funcs::Unsqueeze(Lconj, -1));
+  DenseTensor VhgV = phi::Matmul<T>(dev_ctx, Vh, gV_safe);
+  DenseTensor diag_real = phi::Real<T>(dev_ctx, VhgV);
+
+  auto cpu_place = phi::CPUPlace();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
+
+  DenseTensor diag_real_cpu;
+  diag_real_cpu.Resize(diag_real.dims());
+  phi::Copy(dev_ctx, diag_real, cpu_place, false, &diag_real_cpu);
+
+  DenseTensor diag_res_cpu =
+      phi::funcs::BatchDiag<T>((*cpu_ctx), diag_real_cpu, batch_count);
+
+  DenseTensor diag_res;
+  dev_ctx.template Alloc<T>(&diag_res);
+  phi::Copy(dev_ctx, diag_res_cpu, dev_ctx.GetPlace(), false, &diag_res);
+
+  DenseTensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2);
+
+  auto numel = diag_unsqueezed.numel();
+  DenseTensor diag_unsqueezed_complex;
+  auto* data_diag_un = diag_unsqueezed.data<dtype::Real<T>>();
+  diag_unsqueezed_complex.Resize(diag_unsqueezed.dims());
+  auto* data_diag_un_com = dev_ctx.template Alloc<T>(
+      &diag_unsqueezed_complex, static_cast<size_t>(numel * sizeof(T)));
+
+  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
+  phi::funcs::RealToComplexFunctor<T> functor(
+      data_diag_un, data_diag_un_com, numel);
+  for_range(functor);
+  // real tensor multiply complex tensor in broadcast manner
+  DenseTensor res1 = phi::Multiply<T>(dev_ctx, V, diag_unsqueezed_complex);
+  DenseTensor res2 = phi::Matmul<T>(dev_ctx, Vh, res1);
+  DenseTensor result = phi::Subtract<T>(dev_ctx, VhgV, res2);
+
+  result.Resize(V.dims());
+  dev_ctx.template Alloc<T>(&result);
+  result = phi::Divide<T>(dev_ctx, result, Econj);
+  result = phi::funcs::DiagFill<T, T>(
+      dev_ctx, order, order, order, 0, gL_safe, result);
+  DenseTensor rhs = phi::Matmul<T>(dev_ctx, result, Vh);
+
+  // solve linear system
+  // solve(Vh, rhs, out, m, k)
+  // Vh: matrix with shape [m,m]
+  // rhs: rhs with shape [m,k]
+  // x_grad: out
+  int m = static_cast<int>(Vh.dims(-1));
+  int k = static_cast<int>(rhs.dims(-1));
+  auto* matrix_data = Vh.data<T>();
+  auto* rhs_data = rhs.data<T>();
+
+  SolveLinearSystemGPU<T>(
+      dev_ctx, matrix_data, rhs_data, x_grad_data, m, k, batch_count);
+}
+
+template <typename T, typename Context>
+void EigGradKernel(const Context& dev_ctx,
+                   const DenseTensor& out_w,
+                   const DenseTensor& out_v,
+                   const paddle::optional<DenseTensor>& dout_w,
+                   const paddle::optional<DenseTensor>& dout_v,
+                   DenseTensor* dx) {
+  auto* dx_data = dev_ctx.template Alloc<phi::dtype::Complex<T>>(dx);
+  if (dx->numel() == 0) {
+    return;
+  }
+  auto& dims = out_v.dims();
+  phi::DDim dim_origin = dims;
+  int num_dims = dim_origin.size();
+  int batch_count = BatchCount(out_v);
+  const int order = static_cast<int>(dim_origin[num_dims - 1]);
+
+  ComputeBackwardForComplexInputGPU<phi::dtype::Complex<T>, Context>(
+      out_w, out_v, dout_w, dout_v, dx_data, batch_count, order, dev_ctx);
+}
+
+}  // namespace phi
+
+// Register the kernel
+PD_REGISTER_PLUGIN_KERNEL(eig_grad,
+                          iluvatar_gpu,
+                          ALL_LAYOUT,
+                          phi::EigGradKernel,
+                          float,
+                          phi::complex64) {
+  kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
+  kernel->InputAt(2).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
+  kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
+}
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu
new file mode 100644
index 00000000000..8461b82ece7
--- /dev/null
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu
@@ -0,0 +1,123 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/eig.h"
+#include "paddle/phi/kernels/eig_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EigKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               DenseTensor* out_w,
+               DenseTensor* out_v) {
+  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_w);
+  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_v);
+
+  if (x.numel() == 0) {
+    return;
+  }
+
+  auto cpu_place = phi::CPUPlace();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
+
+  // prepare cpu Tensor here, since magma requires output on cpu
+  DenseTensor out_w_cpu, out_v_cpu;
+  out_w_cpu.Resize(out_w->dims());
+  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_w_cpu);
+  out_v_cpu.Resize(x.dims());
+  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_cpu);
+
+  if (!IsComplexType(x.dtype())) {
+    // output still be complex though input is real
+    int batch_count = BatchCount(x);
+    int order = static_cast<int>(x.dims()[x.dims().size() - 1]);
+
+    DenseTensor real_w_cpu, real_v_cpu;
+
+    std::vector<int64_t> real_w_dim = common::vectorize<int64_t>(out_w->dims());
+    real_w_dim.back() *= 2;
+    real_w_cpu.Resize(common::make_ddim(real_w_dim));
+    (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_w_cpu);
+    real_v_cpu.Resize(x.dims());
+    (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_v_cpu);
+
+    phi::ApplyEigKernelMagma<phi::dtype::Real<T>, Context>(
+        dev_ctx, x, &real_w_cpu, &real_v_cpu);
+
+    // 1. extract real part & imag part from real_w_cpu
+    DenseTensor real_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
+        (*cpu_ctx), real_w_cpu, {-1}, {0}, {order});
+    DenseTensor imag_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
+        (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2});
+
+    // 2. construct complex values
+    auto* real_part_data = real_part_cpu.data<phi::dtype::Real<T>>();
+    auto* imag_part_data = imag_part_cpu.data<phi::dtype::Real<T>>();
+    int64_t out_w_numel = static_cast<int64_t>(out_w->numel());
+
+    phi::funcs::ForRange<phi::CPUContext> for_range((*cpu_ctx), out_w_numel);
+    phi::funcs::RealImagToComplexFunctor<phi::dtype::Complex<T>> functor(
+        real_part_data,
+        imag_part_data,
+        out_w_cpu.data<phi::dtype::Complex<T>>(),
+        out_w_numel);
+    for_range(functor);
+
+    // 3. construct complex vectors
+    DenseTensor real_v_trans_cpu =
+        phi::TransposeLast2Dim<phi::dtype::Real<T>, phi::CPUContext>(
+            (*cpu_ctx), real_v_cpu);
+    DenseTensor out_v_trans_cpu;
+    out_v_trans_cpu.Resize(x.dims());
+    (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_trans_cpu);
+
+    phi::ConstructComplexVectors<phi::dtype::Real<T>,
+                                 phi::dtype::Complex<T>,
+                                 phi::CPUContext>(&out_v_trans_cpu,
+                                                  out_w_cpu,
+                                                  real_v_trans_cpu,
+                                                  (*cpu_ctx),
+                                                  batch_count,
+                                                  order);
+
+    TransposeTwoAxis<phi::dtype::Complex<T>, phi::CPUContext>(
+        out_v_trans_cpu,
+        &out_v_cpu,
+        x.dims().size() - 1,
+        x.dims().size() - 2,
+        (*cpu_ctx));
+
+  } else {
+    phi::ApplyEigKernelMagma<T, Context>(dev_ctx, x, &out_w_cpu, &out_v_cpu);
+  }
+
+  // copy result from cpu to gpu tensor
+  phi::Copy(dev_ctx, out_w_cpu, dev_ctx.GetPlace(), false, out_w);
+  phi::Copy(dev_ctx, out_v_cpu, dev_ctx.GetPlace(), false, out_v);
+}
+
+}  // namespace phi
+
+PD_REGISTER_PLUGIN_KERNEL(
+    eig, iluvatar_gpu, ALL_LAYOUT, phi::EigKernel, float, phi::complex64) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT32) {
+    kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
+    kernel->OutputAt(1).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
+  }
+}
diff --git a/backends/iluvatar_gpu/runtime/iluvatar_context.h b/backends/iluvatar_gpu/runtime/iluvatar_context.h
index a007bf4154c..a6ea37a0caa 100644
--- a/backends/iluvatar_gpu/runtime/iluvatar_context.h
+++ b/backends/iluvatar_gpu/runtime/iluvatar_context.h
@@ -77,6 +77,11 @@ class DnnWorkspaceHandle {
 namespace {  // NOLINT
 inline cudnnHandle_t dnn_handle_ = nullptr;
 inline std::once_flag flag_dnn_;
+
+inline cusolverDnHandle_t solver_handle_ = nullptr;
+inline std::function<cusolverDnHandle_t()> solver_handle_creator_{nullptr};
+inline std::once_flag flag_solver_;
+
 inline void InitDnnHandle(cudnnHandle_t* handle,
                           gpuStream_t stream,
                           Place place) {
@@ -114,6 +119,29 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor,
                                           const gpuStream_t& stream) {
   return DnnWorkspaceHandle(alloactor, stream);
 }
+
+inline void InitSolverHandle(cusolverDnHandle_t* handle, gpuStream_t stream) {
+  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle));
+  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream));
+}
+
+inline cusolverDnHandle_t GetSolverHandle(gpuStream_t stream) {
+  std::call_once(flag_solver_, [&]() {
+    if (!solver_handle_) {
+      if (!solver_handle_creator_) {
+        InitSolverHandle(&solver_handle_, stream);
+      } else {
+        solver_handle_ = solver_handle_creator_();
+      }
+    }
+  });
+  PADDLE_ENFORCE_NOT_NULL(
+      solver_handle_,
+      common::errors::InvalidArgument(
+          "The GPU solver handle is nullptr. It must not be null."));
+  return solver_handle_;
+}
+
 }  // namespace phi
 
 namespace iluvatar {
diff --git a/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py
new file mode 100644
index 00000000000..3f5fbb9f7f3
--- /dev/null
+++ b/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py
@@ -0,0 +1,391 @@
+#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+from utils import dygraph_guard
+
+import paddle
+from paddle import base
+
+
+# cast output to complex for numpy.linalg.eig
+def cast_to_complex(input, output):
+    if input.dtype == np.float32:
+        output = output.astype(np.complex64)
+    elif input.dtype == np.float64:
+        output = output.astype(np.complex128)
+    return output
+
+
+# define eig backward function for a single square matrix
+def eig_backward(w, v, grad_w, grad_v):
+    v_tran = np.transpose(v)
+    v_tran = np.conjugate(v_tran)
+    w_conj = np.conjugate(w)
+    w_conj_l = w_conj.reshape(1, w.size)
+    w_conj_r = w_conj.reshape(w.size, 1)
+    w_conj_2d = w_conj_l - w_conj_r
+
+    vhgv = np.matmul(v_tran, grad_v)
+    real_vhgv = np.real(vhgv)
+    diag_real = real_vhgv.diagonal()
+
+    diag_2d = diag_real.reshape(1, w.size)
+    rhs = v * diag_2d
+    mid = np.matmul(v_tran, rhs)
+    result = vhgv - mid
+
+    res = np.divide(result, w_conj_2d)
+    row, col = np.diag_indices_from(res)
+    res[row, col] = 1.0
+
+    tmp = np.matmul(res, v_tran)
+    dx = np.linalg.solve(v_tran, tmp)
+    return dx
+
+
+class TestEigOp(OpTest):
+    def setUp(self):
+        paddle.enable_static()
+        paddle.device.set_device("iluvatar_gpu")
+        self.op_type = "eig"
+        self.python_api = paddle.linalg.eig
+        self.__class__.op_type = self.op_type
+        self.init_input()
+        self.inputs = {"X": OpTest.np_dtype_to_base_dtype(self.x)}
+        self.outputs = {"Eigenvalues": self.out[0], "Eigenvectors": self.out[1]}
+
+    def init_input(self):
+        self.set_dtype()
+        self.set_dims()
+        self.x = np.random.random(self.shape).astype(self.dtype)
+        self.out = np.linalg.eig(self.x)
+        self.out = (
+            cast_to_complex(self.x, self.out[0]),
+            cast_to_complex(self.x, self.out[1]),
+        )
+
+    # for the real input, a customized checker is needed
+    def checker(self, outs):
+        actual_out_w = outs[0].flatten()
+        expect_out_w = self.out[0].flatten()
+        actual_out_v = outs[1].flatten()
+        expect_out_v = self.out[1].flatten()
+
+        length_w = len(expect_out_w)
+        act_w_real = np.sort(
+            np.array([np.abs(actual_out_w[i].real) for i in range(length_w)])
+        )
+        act_w_imag = np.sort(
+            np.array([np.abs(actual_out_w[i].imag) for i in range(length_w)])
+        )
+        exp_w_real = np.sort(
+            np.array([np.abs(expect_out_w[i].real) for i in range(length_w)])
+        )
+        exp_w_imag = np.sort(
+            np.array([np.abs(expect_out_w[i].imag) for i in range(length_w)])
+        )
+
+        for i in range(length_w):
+            np.testing.assert_allclose(
+                act_w_real[i],
+                exp_w_real[i],
+                rtol=1e-06,
+                atol=1e-05,
+                err_msg="The eigenvalues real part have diff: \nExpected "
+                + str(act_w_real[i])
+                + "\n"
+                + "But got: "
+                + str(exp_w_real[i]),
+            )
+            np.testing.assert_allclose(
+                act_w_imag[i],
+                exp_w_imag[i],
+                rtol=1e-06,
+                atol=1e-05,
+                err_msg="The eigenvalues image part have diff: \nExpected "
+                + str(act_w_imag[i])
+                + "\n"
+                + "But got: "
+                + str(exp_w_imag[i]),
+            )
+
+        length_v = len(expect_out_v)
+        act_v_real = np.sort(
+            np.array([np.abs(actual_out_v[i].real) for i in range(length_v)])
+        )
+        act_v_imag = np.sort(
+            np.array([np.abs(actual_out_v[i].imag) for i in range(length_v)])
+        )
+        exp_v_real = np.sort(
+            np.array([np.abs(expect_out_v[i].real) for i in range(length_v)])
+        )
+        exp_v_imag = np.sort(
+            np.array([np.abs(expect_out_v[i].imag) for i in range(length_v)])
+        )
+
+        for i in range(length_v):
+            np.testing.assert_allclose(
+                act_v_real[i],
+                exp_v_real[i],
+                rtol=1e-06,
+                atol=1e-05,
+                err_msg="The eigenvectors real part have diff: \nExpected "
+                + str(act_v_real[i])
+                + "\n"
+                + "But got: "
+                + str(exp_v_real[i]),
+            )
+            np.testing.assert_allclose(
+                act_v_imag[i],
+                exp_v_imag[i],
+                rtol=1e-06,
+                atol=1e-05,
+                err_msg="The eigenvectors image part have diff: \nExpected "
+                + str(act_v_imag[i])
+                + "\n"
+                + "But got: "
+                + str(exp_v_imag[i]),
+            )
+
+    def set_dtype(self):
+        self.dtype = np.complex64
+
+    def set_dims(self):
+        self.shape = (10, 10)
+
+    def init_grad(self):
+        # grad_w, grad_v complex dtype
+        gtype = self.dtype
+        if self.dtype == np.float32:
+            gtype = np.complex64
+        elif self.dtype == np.float64:
+            gtype = np.complex128
+        self.grad_w = np.ones(self.out[0].shape, gtype)
+        self.grad_v = np.ones(self.out[1].shape, gtype)
+        self.grad_x = eig_backward(self.out[0], self.out[1], self.grad_w, self.grad_v)
+
+    def test_check_output(self):
+        self.check_output_with_place_customized(
+            checker=self.checker,
+            place=paddle.CustomPlace("iluvatar_gpu", 0),
+            check_pir=True,
+        )
+
+    def test_check_grad(self):
+        self.init_grad()
+        self.check_grad(
+            ["X"],
+            ["Eigenvalues", "Eigenvectors"],
+            user_defined_grads=[self.grad_x],
+            user_defined_grad_outputs=[self.grad_w, self.grad_v],
+            check_pir=True,
+        )
+
+
+@skip_check_grad_ci(
+    reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig"
+)
+class TestFloat(TestEigOp):
+    def set_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_grad(self):
+        pass
+
+
+class TestEigStatic(TestEigOp):
+    def test_check_output_with_place(self):
+        paddle.enable_static()
+        place = paddle.CustomPlace("iluvatar_gpu", 0)
+        input_np = np.random.random([3, 3]).astype("complex")
+        expect_val, expect_vec = np.linalg.eig(input_np)
+        with base.program_guard(base.Program(), base.Program()):
+            input = paddle.static.data(name="input", shape=[3, 3], dtype="complex")
+            act_val, act_vec = paddle.linalg.eig(input)
+
+            exe = base.Executor(place)
+            fetch_val, fetch_vec = exe.run(
+                base.default_main_program(),
+                feed={"input": input_np},
+                fetch_list=[act_val, act_vec],
+            )
+        np.testing.assert_allclose(
+            expect_val,
+            fetch_val,
+            rtol=1e-06,
+            atol=1e-06,
+            err_msg="The eigen values have diff: \nExpected "
+            + str(expect_val)
+            + "\n"
+            + "But got: "
+            + str(fetch_val),
+        )
+        np.testing.assert_allclose(
+            np.abs(expect_vec),
+            np.abs(fetch_vec),
+            rtol=1e-06,
+            atol=1e-06,
+            err_msg="The eigen vectors have diff: \nExpected "
+            + str(np.abs(expect_vec))
+            + "\n"
+            + "But got: "
+            + str(np.abs(fetch_vec)),
+        )
+
+
+class TestEigDyGraph(unittest.TestCase):
+    def test_check_output_with_place(self):
+        np.random.seed(1024)
+        input_np = np.random.random([3, 3]).astype("complex64")
+        expect_val, expect_vec = np.linalg.eig(input_np)
+
+        paddle.set_device("iluvatar_gpu")
+        paddle.disable_static()
+
+        input_tensor = paddle.to_tensor(input_np)
+        fetch_val, fetch_vec = paddle.linalg.eig(input_tensor)
+
+        np.testing.assert_allclose(
+            expect_val,
+            fetch_val.numpy(),
+            rtol=1e-06,
+            atol=1e-06,
+            err_msg="The eigen values have diff: \nExpected "
+            + str(expect_val)
+            + "\n"
+            + "But got: "
+            + str(fetch_val),
+        )
+        np.testing.assert_allclose(
+            np.abs(expect_vec),
+            np.abs(fetch_vec.numpy()),
+            rtol=1e-06,
+            atol=1e-06,
+            err_msg="The eigen vectors have diff: \nExpected "
+            + str(np.abs(expect_vec))
+            + "\n"
+            + "But got: "
+            + str(np.abs(fetch_vec.numpy())),
+        )
+
+    # def test_check_grad(self):
+    #     test_shape = [3, 3]
+    #     test_type = 'float32'
+    #     paddle.set_device("iluvatar_gpu")
+
+    #     np.random.seed(1024)
+    #     input_np = np.random.random(test_shape).astype(test_type)
+    #     real_w, real_v = np.linalg.eig(input_np)
+
+    #     grad_w = np.ones(real_w.shape, test_type)
+    #     grad_v = np.ones(real_v.shape, test_type)
+    #     grad_x = eig_backward(real_w, real_v, grad_w, grad_v)
+
+    #     with base.dygraph.guard():
+    #         x = paddle.to_tensor(input_np)
+    #         x.stop_gradient = False
+    #         w, v = paddle.linalg.eig(x)
+    #         (w.sum() + v.sum()).backward()
+
+    #     np.testing.assert_allclose(
+    #         np.abs(x.grad.numpy()),
+    #         np.abs(grad_x),
+    #         rtol=1e-05,
+    #         atol=1e-05,
+    #         err_msg='The grad x have diff: \nExpected '
+    #         + str(np.abs(grad_x))
+    #         + '\n'
+    #         + 'But got: '
+    #         + str(np.abs(x.grad.numpy())),
+    #     )
+
+
+class TestEigWrongDimsError(unittest.TestCase):
+    def test_error(self):
+        paddle.device.set_device("iluvatar_gpu")
+        paddle.disable_static()
+        a = np.random.random(3).astype("float32")
+        x = paddle.to_tensor(a)
+        self.assertRaises(ValueError, paddle.linalg.eig, x)
+
+
+class TestEigNotSquareError(unittest.TestCase):
+    def test_error(self):
+        paddle.device.set_device("iluvatar_gpu")
+        paddle.disable_static()
+        a = np.random.random((1, 2, 3)).astype("float32")
+        x = paddle.to_tensor(a)
+        self.assertRaises(ValueError, paddle.linalg.eig, x)
+
+
+class TestEigUnsupportedDtypeError(unittest.TestCase):
+    def test_error(self):
+        paddle.device.set_device("iluvatar_gpu")
+        paddle.disable_static()
+        a = (np.random.random((3, 3)) * 10).astype("int64")
+        x = paddle.to_tensor(a)
+        self.assertRaises(RuntimeError, paddle.linalg.eig, x)
+
+
+class TestOptionalGradInput(unittest.TestCase):
+    def test_eager(self):
+        with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"):
+            x = paddle.randn(3, 3, requires_grad=True)
+            w, v = paddle.linalg.eig(x)
+
+            np.testing.assert_allclose(
+                (x @ v).numpy(),
+                (w.unsqueeze(0) * v).numpy(),
+                atol=1e-5,
+                rtol=1e-5,
+            )  # Aμ = λμ
+
+            # (dw_dx,) = paddle.grad(w, x, retain_graph=True)
+            # (dv_dx,) = paddle.grad(v, x, retain_graph=True)
+            # (dwdv_dx,) = paddle.grad([w, v], x)
+            # np.testing.assert_allclose(
+            #     (dw_dx + dv_dx).numpy(),
+            #     dwdv_dx.numpy(),
+            #     atol=1e-5,
+            #     rtol=1e-5,
+            # )
+
+    def test_dy2st(self):
+        with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"):
+            x = paddle.randn(3, 3, requires_grad=True)
+
+            def f(x):
+                w, v = paddle.linalg.eig(x)
+                return (
+                    w,
+                    v,
+                )
+
+            st_f = paddle.jit.to_static(f, full_graph=True, backend=None)
+
+            w, v = st_f(x)
+            np.testing.assert_allclose(
+                (x @ v).numpy(),
+                (w.unsqueeze(0) * v).numpy(),
+                atol=1e-5,
+                rtol=1e-5,
+            )  # Aμ = λμ
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py
new file mode 100644
index 00000000000..18581f25af0
--- /dev/null
+++ b/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py
@@ -0,0 +1,42 @@
+#  Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from utils import dygraph_guard
+
+import paddle
+
+
+class TestEigAPI0Size(unittest.TestCase):
+    def test_errors(self):
+        with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"):
+            for shape in [[0, 0], [0, 4, 4], [1, 0, 2, 3, 3]]:
+                x = paddle.randn(shape=shape, dtype="float32", requires_grad=True)
+                w, v = paddle.linalg.eig(x)
+                self.assertEqual(w.shape, shape[:-1])
+                self.assertEqual(v.shape, shape)
+
+                # (dw_dx,) = paddle.grad(w.abs().sum(), x, retain_graph=True)
+                # self.assertEqual(dw_dx.shape, x.shape)
+                # (dv_dx,) = paddle.grad(v.abs().sum(), x, retain_graph=True)
+                # self.assertEqual(dv_dx.shape, x.shape)
+                # (dwv_dx,) = paddle.grad(
+                #     w.abs().sum() + v.abs().sum(), x, retain_graph=True
+                # )
+                # self.assertEqual(dwv_dx.shape, x.shape)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 80cbe00b479dfbd0de871e1126cad1393e7d6d1a Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Tue, 23 Dec 2025 11:00:19 +0800
Subject: [PATCH 2/7] update cmake

---
 backends/iluvatar_gpu/cmake/external/magma.cmake | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/backends/iluvatar_gpu/cmake/external/magma.cmake b/backends/iluvatar_gpu/cmake/external/magma.cmake
index 612ea1b600e..217562ddef0 100644
--- a/backends/iluvatar_gpu/cmake/external/magma.cmake
+++ b/backends/iluvatar_gpu/cmake/external/magma.cmake
@@ -20,17 +20,16 @@ set(MAGMA_DOWNLOAD_DIR
 set(MAGMA_INSTALL_DIR ${THIRD_PARTY_PATH}/install/magma)
 set(MAGMA_LIB_DIR ${MAGMA_INSTALL_DIR}/lib)
 
-# Note(zhouwei): magma need fortran compiler which many machines don't have, so
-# use precompiled library. use magma tag v2.9.0 on 07/28/2025
+# use precompiled library magma tag v2.9.0 on 07/28/2025
 # https://github.com/icl-utk-edu/magma/tree/v2.9.0
 if(LINUX)
   set(MAGMA_FILE
-      "magma_local.tar.gz"
+      "magma_lnx_iluvatar_v2.9.0.20250728.tar.gz"
       CACHE STRING "" FORCE)
   set(MAGMA_URL
-      "file:///home/tianyu.zhou/tyzhou/magma_local.tar.gz"
+      "https://paddlepaddledeps.bj.bcebos.com/${MAGMA_FILE}"
       CACHE STRING "" FORCE)
-  set(MAGMA_URL_MD5 9715dfad9eb073e099f46feb6587232d)
+  set(MAGMA_URL_MD5 7fcca7e0140ed8df1fef511d82ae30c3)
   set(MAGMA_LIB "${MAGMA_LIB_DIR}/libmagma.so")
 elseif(WIN32)
   message("magma do not support windows yet, skip ...")

From 8cab30c86ac27abed5f22ff3b0c8ca65fa94916e Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Tue, 23 Dec 2025 16:14:16 +0800
Subject: [PATCH 3/7] update

---
 Paddle | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Paddle b/Paddle
index 86238399d4c..24c29dc7eb6 160000
--- a/Paddle
+++ b/Paddle
@@ -1 +1 @@
-Subproject commit 86238399d4c720a0dccc07d416ece8168225d757
+Subproject commit 24c29dc7eb6308a4014e39f8679f428e82008137

From 2a9496d9fc49d7e0f3eb0f5e35acc401f9b5c94e Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Wed, 24 Dec 2025 10:47:32 +0800
Subject: [PATCH 4/7] support eig op in metax

---
 .../iluvatar_gpu/cmake/external/magma.cmake   |  12 +-
 backends/metax_gpu/CMakeLists.txt             |   6 +
 .../kernels/metax_kernel/eig_kernel.cu        | 123 ++++++++++++++++++
 3 files changed, 138 insertions(+), 3 deletions(-)
 create mode 100644 backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu

diff --git a/backends/iluvatar_gpu/cmake/external/magma.cmake b/backends/iluvatar_gpu/cmake/external/magma.cmake
index 217562ddef0..66c9560f6ca 100644
--- a/backends/iluvatar_gpu/cmake/external/magma.cmake
+++ b/backends/iluvatar_gpu/cmake/external/magma.cmake
@@ -20,16 +20,16 @@ set(MAGMA_DOWNLOAD_DIR
 set(MAGMA_INSTALL_DIR ${THIRD_PARTY_PATH}/install/magma)
 set(MAGMA_LIB_DIR ${MAGMA_INSTALL_DIR}/lib)
 
-# use precompiled library magma tag v2.9.0 on 07/28/2025
+# use precompiled library. use magma tag v2.9.0 on 07/28/2025
 # https://github.com/icl-utk-edu/magma/tree/v2.9.0
 if(LINUX)
   set(MAGMA_FILE
-      "magma_lnx_iluvatar_v2.9.0.20250728.tar.gz"
+      "magma_lnx_metax_v2.9.0.20250728.tar.gz"
       CACHE STRING "" FORCE)
   set(MAGMA_URL
       "https://paddlepaddledeps.bj.bcebos.com/${MAGMA_FILE}"
       CACHE STRING "" FORCE)
-  set(MAGMA_URL_MD5 7fcca7e0140ed8df1fef511d82ae30c3)
+  set(MAGMA_URL_MD5 3aa4106aa11ba0aeb0036b450a53e972)
   set(MAGMA_LIB "${MAGMA_LIB_DIR}/libmagma.so")
 elseif(WIN32)
   message("magma do not support windows yet, skip ...")
@@ -86,3 +86,9 @@ ExternalProject_Add(
   BUILD_BYPRODUCTS ${MAGMA_LIB})
 
 add_definitions(-DPADDLE_WITH_MAGMA)
+
+add_library(magma SHARED IMPORTED GLOBAL)
+
+set_target_properties(magma PROPERTIES IMPORTED_LOCATION ${MAGMA_LIB})
+
+add_dependencies(magma extern_magma)
diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index 79941c2866a..b58c9aa6120 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -39,6 +39,9 @@ include(cutlass)
 include(dgc)
 include(warpctc)
 include(warprnnt)
+if(WITH_MAGMA)
+  include(external/magma)
+endif()
 
 set(PLUGIN_VERSION ${PADDLE_VERSION})
 
@@ -123,6 +126,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/*.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/*.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/eigen/*.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/magma/magma_function.cc
   # kernels/gpu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_send_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_recv_kernel.cu
@@ -667,6 +671,7 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/binomial_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc
+  ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/magma.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu
@@ -779,6 +784,7 @@ target_link_libraries(
   protobuf
   external_error_proto
   dgc
+  magma
   ${WARPCTC_LIBRARIES}
   ${WARPRNNT_LIBRARIES}
   ${PADDLE_CORE_LIB})
diff --git a/backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu
new file mode 100644
index 00000000000..942bcb20ecf
--- /dev/null
+++ b/backends/metax_gpu/kernels/metax_kernel/eig_kernel.cu
@@ -0,0 +1,123 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/context_pool.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/eig.h"
+#include "paddle/phi/kernels/eig_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EigKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               DenseTensor* out_w,
+               DenseTensor* out_v) {
+  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_w);
+  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_v);
+
+  if (x.numel() == 0) {
+    return;
+  }
+
+  auto cpu_place = phi::CPUPlace();
+  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
+  auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
+
+  // prepare cpu Tensor here, since magma requires output on cpu
+  DenseTensor out_w_cpu, out_v_cpu;
+  out_w_cpu.Resize(out_w->dims());
+  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_w_cpu);
+  out_v_cpu.Resize(x.dims());
+  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_cpu);
+
+  if (!IsComplexType(x.dtype())) {
+    // output still be complex though input is real
+    int batch_count = BatchCount(x);
+    int order = static_cast<int>(x.dims(-1));
+
+    DenseTensor real_w_cpu, real_v_cpu;
+
+    std::vector<int64_t> real_w_dim = common::vectorize<int64_t>(out_w->dims());
+    real_w_dim.back() *= 2;
+    real_w_cpu.Resize(common::make_ddim(real_w_dim));
+    (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_w_cpu);
+    real_v_cpu.Resize(x.dims());
+    (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_v_cpu);
+
+    phi::ApplyEigKernelMagma<phi::dtype::Real<T>, Context>(
+        dev_ctx, x, &real_w_cpu, &real_v_cpu);
+
+    // 1. extract real part & imag part from real_w_cpu
+    DenseTensor real_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
+        (*cpu_ctx), real_w_cpu, {-1}, {0}, {order});
+    DenseTensor imag_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
+        (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2});
+
+    // 2. construct complex values
+    auto* real_part_data = real_part_cpu.data<phi::dtype::Real<T>>();
+    auto* imag_part_data = imag_part_cpu.data<phi::dtype::Real<T>>();
+    int64_t out_w_numel = static_cast<int64_t>(out_w->numel());
+
+    phi::funcs::ForRange<phi::CPUContext> for_range((*cpu_ctx), out_w_numel);
+    phi::funcs::RealImagToComplexFunctor<phi::dtype::Complex<T>> functor(
+        real_part_data,
+        imag_part_data,
+        out_w_cpu.data<phi::dtype::Complex<T>>(),
+        out_w_numel);
+    for_range(functor);
+
+    // 3. construct complex vectors
+    DenseTensor real_v_trans_cpu =
+        phi::TransposeLast2Dim<phi::dtype::Real<T>, phi::CPUContext>(
+            (*cpu_ctx), real_v_cpu);
+    DenseTensor out_v_trans_cpu;
+    out_v_trans_cpu.Resize(x.dims());
+    (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_trans_cpu);
+
+    phi::ConstructComplexVectors<phi::dtype::Real<T>,
+                                 phi::dtype::Complex<T>,
+                                 phi::CPUContext>(&out_v_trans_cpu,
+                                                  out_w_cpu,
+                                                  real_v_trans_cpu,
+                                                  (*cpu_ctx),
+                                                  batch_count,
+                                                  order);
+
+    TransposeTwoAxis<phi::dtype::Complex<T>, phi::CPUContext>(
+        out_v_trans_cpu,
+        &out_v_cpu,
+        x.dims().size() - 1,
+        x.dims().size() - 2,
+        (*cpu_ctx));
+
+  } else {
+    phi::ApplyEigKernelMagma<T, Context>(dev_ctx, x, &out_w_cpu, &out_v_cpu);
+  }
+
+  // copy result from cpu to gpu tensor
+  phi::Copy(dev_ctx, out_w_cpu, dev_ctx.GetPlace(), false, out_w);
+  phi::Copy(dev_ctx, out_v_cpu, dev_ctx.GetPlace(), false, out_v);
+}
+
+}  // namespace phi
+
+PD_REGISTER_PLUGIN_KERNEL(
+    eig, metax_gpu, ALL_LAYOUT, phi::EigKernel, float, phi::complex64) {
+  if (kernel_key.dtype() == phi::DataType::FLOAT32) {
+    kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
+    kernel->OutputAt(1).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
+  }
+}

From 112d7e87227013f56853f022069944577c947e54 Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Wed, 24 Dec 2025 11:20:14 +0800
Subject: [PATCH 5/7] remove iluvator changes

---
 backends/iluvatar_gpu/CMakeLists.txt          |   9 +-
 .../rms_norm_grad_kernel_register.cc          |   4 +-
 .../cuda_kernels/rms_norm_kernel_register.cc  |   4 +-
 .../iluvatar_gpu/patches/paddle-corex.patch   | 141 +++++++++++++-----
 .../iluvatar_gpu/runtime/iluvatar_context.h   |  28 ----
 5 files changed, 109 insertions(+), 77 deletions(-)

diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
index 1aa4e057dd0..e2684d446e6 100644
--- a/backends/iluvatar_gpu/CMakeLists.txt
+++ b/backends/iluvatar_gpu/CMakeLists.txt
@@ -34,7 +34,6 @@ include(version)
 include(generic)
 include(cblas)
 include(external/eigen)
-include(external/magma)
 include(external/xxhash)
 include(external/zlib)
 include(external/protobuf)
@@ -121,13 +120,11 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/*.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/*.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/eigen/*.cu
-  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/magma/magma_function.cc
   # cudnn/cublas
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cudnn.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublas.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublasLt.cc
   ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc
-  ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/magma.cc
   # kernels/gpu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_kernel.cu
@@ -198,8 +195,8 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/randint_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/reduce_kernel.cu
-  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/fused_rms_norm_quant_grad_kernel.cu
-  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/roi_align_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/scale_kernel.cu
@@ -1016,7 +1013,7 @@ target_link_libraries(
           ixattnbkd
           nccl
           # change nccl to ${FLAGCX_LIB} if compiling with FlagCX ${FLAGCX_LIB}
-          magma)
+)
 
 include_directories(BEFORE ${PADDLE_SOURCE_DIR})
 include_directories(BEFORE ${CMAKE_SOURCE_DIR}/headers)
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc
index e669cc26e85..46028b9d54b 100644
--- a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc
@@ -20,10 +20,10 @@ limitations under the License. */
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/rms_norm_grad_kernel.h"
 
-PD_CUSTOM_KERNEL_REGISTER(fused_rms_norm_quant_grad,
+PD_CUSTOM_KERNEL_REGISTER(rms_norm_grad,
                           iluvatar_gpu,
                           ALL_LAYOUT,
-                          phi::RmsNormQuantGradKernel,
+                          phi::RmsNormGradKernel,
                           float,
                           phi::dtype::float16,
                           phi::dtype::bfloat16) {}
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc
index 4dcc3449fb3..52f124dd919 100644
--- a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc
@@ -37,10 +37,10 @@ limitations under the License.
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/rms_norm_kernel.h"
 
-PD_CUSTOM_KERNEL_REGISTER(fused_rms_norm_quant,
+PD_CUSTOM_KERNEL_REGISTER(rms_norm,
                           iluvatar_gpu,
                           ALL_LAYOUT,
-                          phi::RmsNormQuantKernel,
+                          phi::RmsNormKernel,
                           float,
                           phi::dtype::float16,
                           phi::dtype::bfloat16) {}
diff --git a/backends/iluvatar_gpu/patches/paddle-corex.patch b/backends/iluvatar_gpu/patches/paddle-corex.patch
index 929bbb4c621..6526c8423df 100644
--- a/backends/iluvatar_gpu/patches/paddle-corex.patch
+++ b/backends/iluvatar_gpu/patches/paddle-corex.patch
@@ -1,8 +1,55 @@
+From 6484778861092b0d56309f5be9aae4d6c23726ef Mon Sep 17 00:00:00 2001
+From: tianyuzhou668 <2431054748@qq.com>
+Date: Wed, 12 Nov 2025 15:37:49 +0800
+Subject: [PATCH] Fix
+
+---
+ CMakeLists.txt                                |  2 +-
+ .../operators/collective/recv_v2_op.cu.cc     |  2 +-
+ .../operators/collective/send_v2_op.cu.cc     |  2 +-
+ .../fluid/platform/device/gpu/nccl_helper.h   |  2 +-
+ paddle/phi/backends/dynload/cudnn.cc          |  8 ++
+ paddle/phi/backends/dynload/cudnn.h           | 28 ++++++-
+ paddle/phi/backends/dynload/cusolver.h        |  2 -
+ paddle/phi/backends/dynload/cusparse.h        |  2 +
+ .../backends/gpu/cuda/cuda_device_function.h  |  4 +-
+ paddle/phi/backends/gpu/cuda/cuda_graph.cc    |  2 +-
+ paddle/phi/backends/gpu/cuda/cuda_helper.h    |  2 +-
+ paddle/phi/backends/gpu/cuda/cudnn_desc.h     | 16 +++-
+ paddle/phi/backends/gpu/cuda/cudnn_helper.h   |  2 +-
+ paddle/phi/backends/gpu/gpu_launch_config.h   | 16 +++-
+ paddle/phi/backends/gpu/gpu_primitives.h      | 25 ++++++
+ paddle/phi/backends/gpu/gpu_types.h           |  5 ++
+ paddle/phi/core/distributed/nccl_tools.cc     |  2 +-
+ paddle/phi/core/enforce.h                     |  6 +-
+ paddle/phi/core/utils/data_type.h             |  2 +-
+ paddle/phi/kernels/funcs/activation_functor.h | 20 +++++
+ paddle/phi/kernels/funcs/affine_grid_utils.h  |  2 +
+ paddle/phi/kernels/funcs/blas/blas_impl.cu.h  | 18 ++++-
+ paddle/phi/kernels/funcs/cufft_util.h         | 80 +++++++++++++++++++
+ paddle/phi/kernels/funcs/layer_norm_impl.cu.h |  4 -
+ paddle/phi/kernels/funcs/reduce_function.h    |  2 +-
+ paddle/phi/kernels/funcs/segmented_array.h    |  8 ++
+ paddle/phi/kernels/funcs/softmax_impl.h       |  1 +
+ .../fusion/gpu/fused_layernorm_kernel.cu      |  4 -
+ .../fused_layernorm_residual_dropout_bias.h   | 17 ----
+ paddle/phi/kernels/gpu/elementwise_grad.h     |  4 +
+ .../phi/kernels/gpu/layer_norm_grad_kernel.cu |  2 +-
+ paddle/phi/kernels/gpu/layer_norm_kernel.cu   |  2 +-
+ .../phi/kernels/gpu/rms_norm_grad_kernel.cu   |  2 +-
+ .../kernels/primitive/compute_primitives.h    | 24 +++---
+ paddle/phi/kernels/reduce_sum_kernel.cc       |  2 +
+ paddle/phi/kernels/shape_kernel.cc            |  2 +
+ paddle/phi/kernels/squeeze_kernel.cc          |  2 +
+ paddle/phi/kernels/strided_slice_kernel.cc    |  2 +
+ paddle/phi/kernels/unsqueeze_kernel.cc        |  2 +
+ 39 files changed, 266 insertions(+), 64 deletions(-)
+
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 6ea73365e8..766cc92f49 100755
+index 1a4460a3be..b6c6b4a797 100755
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -65,7 +65,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
+@@ -64,7 +64,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
  option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
  option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF)
  option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
@@ -25,7 +72,7 @@ index ab866f015c..10a8111637 100644
  #endif
                            int,
 diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc
-index 28c126d2f2..c382e68188 100644
+index e58e29465d..1755595405 100644
 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc
 +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc
 @@ -203,7 +203,7 @@ PD_REGISTER_STRUCT_KERNEL(send_v2,
@@ -77,10 +124,10 @@ index 5a18808d47..749073ce38 100644
    std::call_once(cudnn_dso_flag,
                   []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
 diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
-index 092365a961..6b05da600b 100644
+index 4ff2e528a9..956bac0c64 100644
 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h
 +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
-@@ -134,7 +134,7 @@ __forceinline__ __device__ phi::dtype::complex<double> CudaShuffleXorSync(
+@@ -141,7 +141,7 @@ __forceinline__ __device__ phi::dtype::complex<double> CudaShuffleXorSync(
  
  template <typename T>
  __forceinline__ __device__ T
@@ -89,7 +136,7 @@ index 092365a961..6b05da600b 100644
    return __shfl_sync(mask, val, src_line, width);
  }
  
-@@ -151,7 +151,7 @@ __device__ T reduceSum(T val, int tid, int len) {
+@@ -158,7 +158,7 @@ __device__ T reduceSum(T val, int tid, int len) {
    // I use Warp-Level Parallelism and assume the Warp size
    // is 32 which may be different for different GPU,
    // but most card's warp size is 32.
@@ -112,10 +159,10 @@ index 1c4f13e6b4..a90c0f6d21 100644
  }
  
 diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h
-index dfd3945e9a..08eda4978c 100644
+index 02753c0333..bcf435dfae 100644
 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h
 +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h
-@@ -82,7 +82,7 @@ cudaDataType_t ToCudaDataType() {
+@@ -85,7 +85,7 @@ cudaDataType_t ToCudaDataType() {
      return CUDA_R_64F;
    } else if (std::is_same<T, phi::dtype::float16>::value) {
      return CUDA_R_16F;
@@ -165,10 +212,10 @@ index 189e97534e..8f805afe8c 100644
  
    void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format) {
 diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h
-index 8b293d3007..35245dddb0 100644
+index 28c3d14d37..5dc5f79178 100644
 --- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h
 +++ b/paddle/phi/backends/gpu/cuda/cudnn_helper.h
-@@ -124,7 +124,7 @@ class CudnnDataType<phi::dtype::float8_e4m3fn> {
+@@ -125,7 +125,7 @@ class CudnnDataType<phi::dtype::float8_e4m3fn> {
  #endif
  
  // CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1
@@ -220,7 +267,7 @@ index af1c7ba8b9..132e488061 100644
    const int capability = dev_ctx.GetComputeCapability();
    GpuLaunchConfig config;
 diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h
-index a7df8a4023..d4ff45d8d5 100644
+index ab505091ab..8b7dd5ff86 100644
 --- a/paddle/phi/backends/gpu/gpu_primitives.h
 +++ b/paddle/phi/backends/gpu/gpu_primitives.h
 @@ -134,13 +134,38 @@ CUDA_ATOMIC_WRAPPER(Add, int16_t) {
@@ -392,10 +439,11 @@ index 9c9ab5dff9..ecf4e8f5e8 100644
  template <typename T>
  struct CudaLogFunctor : public BaseActivationFunctor<T> {
 diff --git a/paddle/phi/kernels/funcs/affine_grid_utils.h b/paddle/phi/kernels/funcs/affine_grid_utils.h
-index 70abf63a3d..af6f2136c5 100644
+index 1df6184141..fc6015b209 100644
 --- a/paddle/phi/kernels/funcs/affine_grid_utils.h
 +++ b/paddle/phi/kernels/funcs/affine_grid_utils.h
-@@ -16,7 +16,9 @@
+@@ -15,7 +15,9 @@
+ #pragma once
  
  #include "paddle/phi/core/dense_tensor.h"
  #include "paddle/phi/core/device_context.h"
@@ -406,10 +454,10 @@ index 70abf63a3d..af6f2136c5 100644
  #include "paddle/phi/kernels/funcs/math_function.h"
  
 diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
-index fc86cc09d1..6295ff91ac 100644
+index ae7b67de6d..ff60fd20ba 100644
 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
 +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
-@@ -1755,7 +1755,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+@@ -1754,7 +1754,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                          const phi::bfloat16 *B,
                                          phi::bfloat16 beta,
                                          phi::bfloat16 *C) const {
@@ -418,7 +466,7 @@ index fc86cc09d1..6295ff91ac 100644
    // Note that cublas follows fortran order, so the order is different from
    // the cblas convention.
    int64_t lda = (transA == CblasNoTrans) ? K : M;
-@@ -1765,6 +1765,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+@@ -1764,6 +1764,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
    cublasOperation_t cuTransB =
        (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  
@@ -426,7 +474,7 @@ index fc86cc09d1..6295ff91ac 100644
    PADDLE_ENFORCE_GE(
        dev_ctx_.GetComputeCapability(),
        80,
-@@ -1772,6 +1773,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+@@ -1771,6 +1772,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
            "cublas bf16 gemm requires GPU compute capability >= 80,"
            "but received %d",
            dev_ctx_.GetComputeCapability()));
@@ -434,7 +482,7 @@ index fc86cc09d1..6295ff91ac 100644
  
    float h_alpha = static_cast<float>(alpha);
    float h_beta = static_cast<float>(beta);
-@@ -2293,12 +2295,13 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
+@@ -2292,12 +2294,13 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
                                          phi::bfloat16 beta,
                                          phi::bfloat16 *C,
                                          int ldc) const {
@@ -449,7 +497,7 @@ index fc86cc09d1..6295ff91ac 100644
    PADDLE_ENFORCE_GE(
        dev_ctx_.GetComputeCapability(),
        80,
-@@ -2306,6 +2309,7 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
+@@ -2305,6 +2308,7 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
            "cublas bf16 gemm requires GPU compute capability >= 80,"
            "but received %d",
            dev_ctx_.GetComputeCapability()));
@@ -457,7 +505,7 @@ index fc86cc09d1..6295ff91ac 100644
  
    float h_alpha = static_cast<float>(alpha);
    float h_beta = static_cast<float>(beta);
-@@ -2802,7 +2806,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -2772,7 +2776,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
                                                 int64_t batchCount,
                                                 int64_t strideA,
                                                 int64_t strideB) const {
@@ -466,7 +514,7 @@ index fc86cc09d1..6295ff91ac 100644
    // Note that cublas follows fortran order, so the order is different from
    // the cblas convention.
    int64_t lda = (transA == CblasNoTrans) ? K : M;
-@@ -2881,7 +2885,11 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -2851,7 +2855,11 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
                                                     static_cast<int>(ldc),
                                                     strideC,
                                                     static_cast<int>(batchCount),
@@ -478,7 +526,7 @@ index fc86cc09d1..6295ff91ac 100644
                                                     algo));
      });
    }
-@@ -3175,7 +3183,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -3145,7 +3153,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
                                                 phi::bfloat16 beta,
                                                 phi::bfloat16 **C,
                                                 int batchCount) const {
@@ -487,7 +535,7 @@ index fc86cc09d1..6295ff91ac 100644
    // Note that cublas follows fortran order, so the order is different from
    // the cblas convention.
    int lda = (transA == CblasNoTrans) ? K : M;
-@@ -3186,6 +3194,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -3156,6 +3164,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
    cublasOperation_t cuTransB =
        (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  
@@ -495,7 +543,7 @@ index fc86cc09d1..6295ff91ac 100644
    PADDLE_ENFORCE_GE(
        dev_ctx_.GetComputeCapability(),
        80,
-@@ -3193,6 +3202,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -3163,6 +3172,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
            "cublas bf16 gemm requires GPU compute capability >= 80,"
            "but received %d",
            dev_ctx_.GetComputeCapability()));
@@ -644,10 +692,10 @@ index df4f214e66..e31b8eb1f6 100644
  }  // namespace detail
  }  // namespace funcs
 diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
-index 92dccf18ce..5cc9937d7a 100644
+index 4eae698648..9247535e0d 100644
 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
 +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
-@@ -37,11 +37,7 @@ using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
+@@ -44,11 +44,7 @@ using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
  
  inline static int GetDesiredBlockDim(int64_t block_dim) {
    const int kMaxBlockDim = 512;
@@ -660,10 +708,10 @@ index 92dccf18ce..5cc9937d7a 100644
  }
  
 diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h
-index e89969e9dc..65e744f37d 100644
+index 24c30ae7e2..ce5cf15176 100644
 --- a/paddle/phi/kernels/funcs/reduce_function.h
 +++ b/paddle/phi/kernels/funcs/reduce_function.h
-@@ -1131,7 +1131,7 @@ void ReduceKernel(const KPDevice& dev_ctx,
+@@ -1139,7 +1139,7 @@ void ReduceKernel(const KPDevice& dev_ctx,
        config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16 &&
        config.reduce_num <= std::numeric_limits<int32_t>::max();
  
@@ -699,7 +747,7 @@ index dad852093e..71adfaf3ed 100644
      auto ptr = allocation->ptr();
      allocations.emplace_back(std::move(allocation));
 diff --git a/paddle/phi/kernels/funcs/softmax_impl.h b/paddle/phi/kernels/funcs/softmax_impl.h
-index 9f12293c0f..4e897bb433 100644
+index 361936305c..f4c680fe56 100644
 --- a/paddle/phi/kernels/funcs/softmax_impl.h
 +++ b/paddle/phi/kernels/funcs/softmax_impl.h
 @@ -21,6 +21,7 @@ limitations under the License. */
@@ -711,10 +759,10 @@ index 9f12293c0f..4e897bb433 100644
  namespace phi {
  namespace funcs {
 diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
-index 77e3537124..8f6022bc76 100644
+index 3612a5fc89..634a61ebe1 100644
 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
 +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
-@@ -58,11 +58,7 @@ namespace fusion {
+@@ -60,11 +60,7 @@ namespace fusion {
  
  namespace {
  
@@ -781,10 +829,10 @@ index 9d4bb18d55..78bf0ad1b9 100644
          }
        }
 diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h
-index 411ee4510c..36c2f8fba7 100644
+index f3a2874b92..47d2c0b4d7 100644
 --- a/paddle/phi/kernels/gpu/elementwise_grad.h
 +++ b/paddle/phi/kernels/gpu/elementwise_grad.h
-@@ -352,7 +352,11 @@ void ElementwiseAddGrad(const GPUContext &dev_ctx,
+@@ -213,7 +213,11 @@ void ElementwiseAddGrad(const GPUContext &dev_ctx,
      phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
    } else if (dx_data != dout_data && dy_data != dout_data) {
      auto size = x.numel();
@@ -797,10 +845,10 @@ index 411ee4510c..36c2f8fba7 100644
      dim3 grid_size =
          dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
 diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
-index 701c19a56f..b5d3ba7846 100644
+index d6c105e1b2..6d810ab0e6 100644
 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
 +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
-@@ -300,16 +300,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in,
+@@ -305,16 +305,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in,
          &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_h);
  
      // top_left_index is just input_index.
@@ -824,10 +872,10 @@ index 701c19a56f..b5d3ba7846 100644
          funcs::BlockReduceMax(top_right_index, FINAL_MASK);
      int64_t in_bot_max_index =
 diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
-index 8280e95065..dd50aa08bf 100644
+index 2645060f4c..6a38e20776 100644
 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
 +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
-@@ -232,7 +232,7 @@ PD_REGISTER_KERNEL(layer_norm_grad,
+@@ -150,7 +150,7 @@ PD_REGISTER_KERNEL(layer_norm_grad,
      kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
    }
  }
@@ -837,10 +885,10 @@ index 8280e95065..dd50aa08bf 100644
                     GPU,
                     ALL_LAYOUT,
 diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
-index 892525493c..854f13a239 100644
+index ed5f6438ab..52f1e9e1c7 100644
 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu
 +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
-@@ -765,7 +765,7 @@ PD_REGISTER_KERNEL(
+@@ -708,7 +708,7 @@ PD_REGISTER_KERNEL(
    kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED);
    kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED);
  }
@@ -849,6 +897,19 @@ index 892525493c..854f13a239 100644
  PD_REGISTER_KERNEL(layer_norm,
                     GPU,
                     ALL_LAYOUT,
+diff --git a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
+index 20015f7b87..8fee4c4b70 100644
+--- a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
++++ b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
+@@ -214,7 +214,7 @@ PD_REGISTER_KERNEL(rms_norm_grad,
+                    float,
+                    phi::float16) {}
+ 
+-#elif CUDNN_VERSION_MIN(8, 1, 0)
++#elif CUDNN_VERSION_MIN(8, 1, 0) || defined(PADDLE_WITH_COREX)
+ 
+ PD_REGISTER_KERNEL(rms_norm_grad,
+                    GPU,
 diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h
 index 11481a8b02..136593297e 100644
 --- a/paddle/phi/kernels/primitive/compute_primitives.h
@@ -1009,3 +1070,5 @@ index ffdf995ece..4a7e03f4ad 100644
  #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  PD_REGISTER_KERNEL(unsqueeze,
                     GPU,
+-- 
+2.25.1
diff --git a/backends/iluvatar_gpu/runtime/iluvatar_context.h b/backends/iluvatar_gpu/runtime/iluvatar_context.h
index a6ea37a0caa..a007bf4154c 100644
--- a/backends/iluvatar_gpu/runtime/iluvatar_context.h
+++ b/backends/iluvatar_gpu/runtime/iluvatar_context.h
@@ -77,11 +77,6 @@ class DnnWorkspaceHandle {
 namespace {  // NOLINT
 inline cudnnHandle_t dnn_handle_ = nullptr;
 inline std::once_flag flag_dnn_;
-
-inline cusolverDnHandle_t solver_handle_ = nullptr;
-inline std::function<cusolverDnHandle_t()> solver_handle_creator_{nullptr};
-inline std::once_flag flag_solver_;
-
 inline void InitDnnHandle(cudnnHandle_t* handle,
                           gpuStream_t stream,
                           Place place) {
@@ -119,29 +114,6 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor,
                                           const gpuStream_t& stream) {
   return DnnWorkspaceHandle(alloactor, stream);
 }
-
-inline void InitSolverHandle(cusolverDnHandle_t* handle, gpuStream_t stream) {
-  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle));
-  PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream));
-}
-
-inline cusolverDnHandle_t GetSolverHandle(gpuStream_t stream) {
-  std::call_once(flag_solver_, [&]() {
-    if (!solver_handle_) {
-      if (!solver_handle_creator_) {
-        InitSolverHandle(&solver_handle_, stream);
-      } else {
-        solver_handle_ = solver_handle_creator_();
-      }
-    }
-  });
-  PADDLE_ENFORCE_NOT_NULL(
-      solver_handle_,
-      common::errors::InvalidArgument(
-          "The GPU solver handle is nullptr. It must not be null."));
-  return solver_handle_;
-}
-
 }  // namespace phi
 
 namespace iluvatar {

From f9149c7e42080307ff1d01dca69301369b2dacf2 Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Wed, 24 Dec 2025 11:21:58 +0800
Subject: [PATCH 6/7] update

---
 backends/iluvatar_gpu/CMakeLists.txt          |   4 +-
 .../rms_norm_grad_kernel_register.cc          |   4 +-
 .../cuda_kernels/rms_norm_kernel_register.cc  |   4 +-
 .../iluvatar_gpu/patches/paddle-corex.patch   | 141 +++++-------------
 4 files changed, 45 insertions(+), 108 deletions(-)

diff --git a/backends/iluvatar_gpu/CMakeLists.txt b/backends/iluvatar_gpu/CMakeLists.txt
index e2684d446e6..051e6c16eb4 100644
--- a/backends/iluvatar_gpu/CMakeLists.txt
+++ b/backends/iluvatar_gpu/CMakeLists.txt
@@ -195,8 +195,8 @@ file(
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/put_along_axis_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/randint_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/reduce_kernel.cu
-  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
-  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/fused_rms_norm_quant_grad_kernel.cu
+  ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/fused_rms_norm_quant_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/roi_align_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu
   ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/scale_kernel.cu
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc
index 46028b9d54b..e669cc26e85 100644
--- a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_grad_kernel_register.cc
@@ -20,10 +20,10 @@ limitations under the License. */
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/rms_norm_grad_kernel.h"
 
-PD_CUSTOM_KERNEL_REGISTER(rms_norm_grad,
+PD_CUSTOM_KERNEL_REGISTER(fused_rms_norm_quant_grad,
                           iluvatar_gpu,
                           ALL_LAYOUT,
-                          phi::RmsNormGradKernel,
+                          phi::RmsNormQuantGradKernel,
                           float,
                           phi::dtype::float16,
                           phi::dtype::bfloat16) {}
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc
index 52f124dd919..4dcc3449fb3 100644
--- a/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc
+++ b/backends/iluvatar_gpu/kernels/cuda_kernels/rms_norm_kernel_register.cc
@@ -37,10 +37,10 @@ limitations under the License.
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/rms_norm_kernel.h"
 
-PD_CUSTOM_KERNEL_REGISTER(rms_norm,
+PD_CUSTOM_KERNEL_REGISTER(fused_rms_norm_quant,
                           iluvatar_gpu,
                           ALL_LAYOUT,
-                          phi::RmsNormKernel,
+                          phi::RmsNormQuantKernel,
                           float,
                           phi::dtype::float16,
                           phi::dtype::bfloat16) {}
diff --git a/backends/iluvatar_gpu/patches/paddle-corex.patch b/backends/iluvatar_gpu/patches/paddle-corex.patch
index 6526c8423df..929bbb4c621 100644
--- a/backends/iluvatar_gpu/patches/paddle-corex.patch
+++ b/backends/iluvatar_gpu/patches/paddle-corex.patch
@@ -1,55 +1,8 @@
-From 6484778861092b0d56309f5be9aae4d6c23726ef Mon Sep 17 00:00:00 2001
-From: tianyuzhou668 <2431054748@qq.com>
-Date: Wed, 12 Nov 2025 15:37:49 +0800
-Subject: [PATCH] Fix
-
----
- CMakeLists.txt                                |  2 +-
- .../operators/collective/recv_v2_op.cu.cc     |  2 +-
- .../operators/collective/send_v2_op.cu.cc     |  2 +-
- .../fluid/platform/device/gpu/nccl_helper.h   |  2 +-
- paddle/phi/backends/dynload/cudnn.cc          |  8 ++
- paddle/phi/backends/dynload/cudnn.h           | 28 ++++++-
- paddle/phi/backends/dynload/cusolver.h        |  2 -
- paddle/phi/backends/dynload/cusparse.h        |  2 +
- .../backends/gpu/cuda/cuda_device_function.h  |  4 +-
- paddle/phi/backends/gpu/cuda/cuda_graph.cc    |  2 +-
- paddle/phi/backends/gpu/cuda/cuda_helper.h    |  2 +-
- paddle/phi/backends/gpu/cuda/cudnn_desc.h     | 16 +++-
- paddle/phi/backends/gpu/cuda/cudnn_helper.h   |  2 +-
- paddle/phi/backends/gpu/gpu_launch_config.h   | 16 +++-
- paddle/phi/backends/gpu/gpu_primitives.h      | 25 ++++++
- paddle/phi/backends/gpu/gpu_types.h           |  5 ++
- paddle/phi/core/distributed/nccl_tools.cc     |  2 +-
- paddle/phi/core/enforce.h                     |  6 +-
- paddle/phi/core/utils/data_type.h             |  2 +-
- paddle/phi/kernels/funcs/activation_functor.h | 20 +++++
- paddle/phi/kernels/funcs/affine_grid_utils.h  |  2 +
- paddle/phi/kernels/funcs/blas/blas_impl.cu.h  | 18 ++++-
- paddle/phi/kernels/funcs/cufft_util.h         | 80 +++++++++++++++++++
- paddle/phi/kernels/funcs/layer_norm_impl.cu.h |  4 -
- paddle/phi/kernels/funcs/reduce_function.h    |  2 +-
- paddle/phi/kernels/funcs/segmented_array.h    |  8 ++
- paddle/phi/kernels/funcs/softmax_impl.h       |  1 +
- .../fusion/gpu/fused_layernorm_kernel.cu      |  4 -
- .../fused_layernorm_residual_dropout_bias.h   | 17 ----
- paddle/phi/kernels/gpu/elementwise_grad.h     |  4 +
- .../phi/kernels/gpu/layer_norm_grad_kernel.cu |  2 +-
- paddle/phi/kernels/gpu/layer_norm_kernel.cu   |  2 +-
- .../phi/kernels/gpu/rms_norm_grad_kernel.cu   |  2 +-
- .../kernels/primitive/compute_primitives.h    | 24 +++---
- paddle/phi/kernels/reduce_sum_kernel.cc       |  2 +
- paddle/phi/kernels/shape_kernel.cc            |  2 +
- paddle/phi/kernels/squeeze_kernel.cc          |  2 +
- paddle/phi/kernels/strided_slice_kernel.cc    |  2 +
- paddle/phi/kernels/unsqueeze_kernel.cc        |  2 +
- 39 files changed, 266 insertions(+), 64 deletions(-)
-
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 1a4460a3be..b6c6b4a797 100755
+index 6ea73365e8..766cc92f49 100755
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -64,7 +64,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
+@@ -65,7 +65,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
  option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
  option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF)
  option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
@@ -72,7 +25,7 @@ index ab866f015c..10a8111637 100644
  #endif
                            int,
 diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc
-index e58e29465d..1755595405 100644
+index 28c126d2f2..c382e68188 100644
 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc
 +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc
 @@ -203,7 +203,7 @@ PD_REGISTER_STRUCT_KERNEL(send_v2,
@@ -124,10 +77,10 @@ index 5a18808d47..749073ce38 100644
    std::call_once(cudnn_dso_flag,
                   []() { cudnn_dso_handle = GetCUDNNDsoHandle(); });
 diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
-index 4ff2e528a9..956bac0c64 100644
+index 092365a961..6b05da600b 100644
 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h
 +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
-@@ -141,7 +141,7 @@ __forceinline__ __device__ phi::dtype::complex<double> CudaShuffleXorSync(
+@@ -134,7 +134,7 @@ __forceinline__ __device__ phi::dtype::complex<double> CudaShuffleXorSync(
  
  template <typename T>
  __forceinline__ __device__ T
@@ -136,7 +89,7 @@ index 4ff2e528a9..956bac0c64 100644
    return __shfl_sync(mask, val, src_line, width);
  }
  
-@@ -158,7 +158,7 @@ __device__ T reduceSum(T val, int tid, int len) {
+@@ -151,7 +151,7 @@ __device__ T reduceSum(T val, int tid, int len) {
    // I use Warp-Level Parallelism and assume the Warp size
    // is 32 which may be different for different GPU,
    // but most card's warp size is 32.
@@ -159,10 +112,10 @@ index 1c4f13e6b4..a90c0f6d21 100644
  }
  
 diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h
-index 02753c0333..bcf435dfae 100644
+index dfd3945e9a..08eda4978c 100644
 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h
 +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h
-@@ -85,7 +85,7 @@ cudaDataType_t ToCudaDataType() {
+@@ -82,7 +82,7 @@ cudaDataType_t ToCudaDataType() {
      return CUDA_R_64F;
    } else if (std::is_same<T, phi::dtype::float16>::value) {
      return CUDA_R_16F;
@@ -212,10 +165,10 @@ index 189e97534e..8f805afe8c 100644
  
    void set(const phi::DenseTensor& tensor, const cudnnTensorFormat_t format) {
 diff --git a/paddle/phi/backends/gpu/cuda/cudnn_helper.h b/paddle/phi/backends/gpu/cuda/cudnn_helper.h
-index 28c3d14d37..5dc5f79178 100644
+index 8b293d3007..35245dddb0 100644
 --- a/paddle/phi/backends/gpu/cuda/cudnn_helper.h
 +++ b/paddle/phi/backends/gpu/cuda/cudnn_helper.h
-@@ -125,7 +125,7 @@ class CudnnDataType<phi::dtype::float8_e4m3fn> {
+@@ -124,7 +124,7 @@ class CudnnDataType<phi::dtype::float8_e4m3fn> {
  #endif
  
  // CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1
@@ -267,7 +220,7 @@ index af1c7ba8b9..132e488061 100644
    const int capability = dev_ctx.GetComputeCapability();
    GpuLaunchConfig config;
 diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h
-index ab505091ab..8b7dd5ff86 100644
+index a7df8a4023..d4ff45d8d5 100644
 --- a/paddle/phi/backends/gpu/gpu_primitives.h
 +++ b/paddle/phi/backends/gpu/gpu_primitives.h
 @@ -134,13 +134,38 @@ CUDA_ATOMIC_WRAPPER(Add, int16_t) {
@@ -439,11 +392,10 @@ index 9c9ab5dff9..ecf4e8f5e8 100644
  template <typename T>
  struct CudaLogFunctor : public BaseActivationFunctor<T> {
 diff --git a/paddle/phi/kernels/funcs/affine_grid_utils.h b/paddle/phi/kernels/funcs/affine_grid_utils.h
-index 1df6184141..fc6015b209 100644
+index 70abf63a3d..af6f2136c5 100644
 --- a/paddle/phi/kernels/funcs/affine_grid_utils.h
 +++ b/paddle/phi/kernels/funcs/affine_grid_utils.h
-@@ -15,7 +15,9 @@
- #pragma once
+@@ -16,7 +16,9 @@
  
  #include "paddle/phi/core/dense_tensor.h"
  #include "paddle/phi/core/device_context.h"
@@ -454,10 +406,10 @@ index 1df6184141..fc6015b209 100644
  #include "paddle/phi/kernels/funcs/math_function.h"
  
 diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
-index ae7b67de6d..ff60fd20ba 100644
+index fc86cc09d1..6295ff91ac 100644
 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
 +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h
-@@ -1754,7 +1754,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+@@ -1755,7 +1755,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
                                          const phi::bfloat16 *B,
                                          phi::bfloat16 beta,
                                          phi::bfloat16 *C) const {
@@ -466,7 +418,7 @@ index ae7b67de6d..ff60fd20ba 100644
    // Note that cublas follows fortran order, so the order is different from
    // the cblas convention.
    int64_t lda = (transA == CblasNoTrans) ? K : M;
-@@ -1764,6 +1764,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+@@ -1765,6 +1765,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
    cublasOperation_t cuTransB =
        (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  
@@ -474,7 +426,7 @@ index ae7b67de6d..ff60fd20ba 100644
    PADDLE_ENFORCE_GE(
        dev_ctx_.GetComputeCapability(),
        80,
-@@ -1771,6 +1772,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
+@@ -1772,6 +1773,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
            "cublas bf16 gemm requires GPU compute capability >= 80,"
            "but received %d",
            dev_ctx_.GetComputeCapability()));
@@ -482,7 +434,7 @@ index ae7b67de6d..ff60fd20ba 100644
  
    float h_alpha = static_cast<float>(alpha);
    float h_beta = static_cast<float>(beta);
-@@ -2292,12 +2294,13 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
+@@ -2293,12 +2295,13 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
                                          phi::bfloat16 beta,
                                          phi::bfloat16 *C,
                                          int ldc) const {
@@ -497,7 +449,7 @@ index ae7b67de6d..ff60fd20ba 100644
    PADDLE_ENFORCE_GE(
        dev_ctx_.GetComputeCapability(),
        80,
-@@ -2305,6 +2308,7 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
+@@ -2306,6 +2309,7 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
            "cublas bf16 gemm requires GPU compute capability >= 80,"
            "but received %d",
            dev_ctx_.GetComputeCapability()));
@@ -505,7 +457,7 @@ index ae7b67de6d..ff60fd20ba 100644
  
    float h_alpha = static_cast<float>(alpha);
    float h_beta = static_cast<float>(beta);
-@@ -2772,7 +2776,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -2802,7 +2806,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
                                                 int64_t batchCount,
                                                 int64_t strideA,
                                                 int64_t strideB) const {
@@ -514,7 +466,7 @@ index ae7b67de6d..ff60fd20ba 100644
    // Note that cublas follows fortran order, so the order is different from
    // the cblas convention.
    int64_t lda = (transA == CblasNoTrans) ? K : M;
-@@ -2851,7 +2855,11 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -2881,7 +2885,11 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
                                                     static_cast<int>(ldc),
                                                     strideC,
                                                     static_cast<int>(batchCount),
@@ -526,7 +478,7 @@ index ae7b67de6d..ff60fd20ba 100644
                                                     algo));
      });
    }
-@@ -3145,7 +3153,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -3175,7 +3183,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
                                                 phi::bfloat16 beta,
                                                 phi::bfloat16 **C,
                                                 int batchCount) const {
@@ -535,7 +487,7 @@ index ae7b67de6d..ff60fd20ba 100644
    // Note that cublas follows fortran order, so the order is different from
    // the cblas convention.
    int lda = (transA == CblasNoTrans) ? K : M;
-@@ -3156,6 +3164,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -3186,6 +3194,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
    cublasOperation_t cuTransB =
        (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  
@@ -543,7 +495,7 @@ index ae7b67de6d..ff60fd20ba 100644
    PADDLE_ENFORCE_GE(
        dev_ctx_.GetComputeCapability(),
        80,
-@@ -3163,6 +3172,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
+@@ -3193,6 +3202,7 @@ inline void Blas<phi::GPUContext>::BatchedGEMM(CBLAS_TRANSPOSE transA,
            "cublas bf16 gemm requires GPU compute capability >= 80,"
            "but received %d",
            dev_ctx_.GetComputeCapability()));
@@ -692,10 +644,10 @@ index df4f214e66..e31b8eb1f6 100644
  }  // namespace detail
  }  // namespace funcs
 diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
-index 4eae698648..9247535e0d 100644
+index 92dccf18ce..5cc9937d7a 100644
 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
 +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h
-@@ -44,11 +44,7 @@ using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
+@@ -37,11 +37,7 @@ using LayerNormParamType = typename CudnnDataType<T>::BatchNormParamType;
  
  inline static int GetDesiredBlockDim(int64_t block_dim) {
    const int kMaxBlockDim = 512;
@@ -708,10 +660,10 @@ index 4eae698648..9247535e0d 100644
  }
  
 diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h
-index 24c30ae7e2..ce5cf15176 100644
+index e89969e9dc..65e744f37d 100644
 --- a/paddle/phi/kernels/funcs/reduce_function.h
 +++ b/paddle/phi/kernels/funcs/reduce_function.h
-@@ -1139,7 +1139,7 @@ void ReduceKernel(const KPDevice& dev_ctx,
+@@ -1131,7 +1131,7 @@ void ReduceKernel(const KPDevice& dev_ctx,
        config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16 &&
        config.reduce_num <= std::numeric_limits<int32_t>::max();
  
@@ -747,7 +699,7 @@ index dad852093e..71adfaf3ed 100644
      auto ptr = allocation->ptr();
      allocations.emplace_back(std::move(allocation));
 diff --git a/paddle/phi/kernels/funcs/softmax_impl.h b/paddle/phi/kernels/funcs/softmax_impl.h
-index 361936305c..f4c680fe56 100644
+index 9f12293c0f..4e897bb433 100644
 --- a/paddle/phi/kernels/funcs/softmax_impl.h
 +++ b/paddle/phi/kernels/funcs/softmax_impl.h
 @@ -21,6 +21,7 @@ limitations under the License. */
@@ -759,10 +711,10 @@ index 361936305c..f4c680fe56 100644
  namespace phi {
  namespace funcs {
 diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
-index 3612a5fc89..634a61ebe1 100644
+index 77e3537124..8f6022bc76 100644
 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
 +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_kernel.cu
-@@ -60,11 +60,7 @@ namespace fusion {
+@@ -58,11 +58,7 @@ namespace fusion {
  
  namespace {
  
@@ -829,10 +781,10 @@ index 9d4bb18d55..78bf0ad1b9 100644
          }
        }
 diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h
-index f3a2874b92..47d2c0b4d7 100644
+index 411ee4510c..36c2f8fba7 100644
 --- a/paddle/phi/kernels/gpu/elementwise_grad.h
 +++ b/paddle/phi/kernels/gpu/elementwise_grad.h
-@@ -213,7 +213,11 @@ void ElementwiseAddGrad(const GPUContext &dev_ctx,
+@@ -352,7 +352,11 @@ void ElementwiseAddGrad(const GPUContext &dev_ctx,
      phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
    } else if (dx_data != dout_data && dy_data != dout_data) {
      auto size = x.numel();
@@ -845,10 +797,10 @@ index f3a2874b92..47d2c0b4d7 100644
      dim3 grid_size =
          dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
 diff --git a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
-index d6c105e1b2..6d810ab0e6 100644
+index 701c19a56f..b5d3ba7846 100644
 --- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
 +++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
-@@ -305,16 +305,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in,
+@@ -300,16 +300,16 @@ __global__ void KeBilinearInterpBwShareMemory(T* in,
          &in_img_idy, &h_id, &h1lambda, &h2lambda, src_h, in_h);
  
      // top_left_index is just input_index.
@@ -872,10 +824,10 @@ index d6c105e1b2..6d810ab0e6 100644
          funcs::BlockReduceMax(top_right_index, FINAL_MASK);
      int64_t in_bot_max_index =
 diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
-index 2645060f4c..6a38e20776 100644
+index 8280e95065..dd50aa08bf 100644
 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
 +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu
-@@ -150,7 +150,7 @@ PD_REGISTER_KERNEL(layer_norm_grad,
+@@ -232,7 +232,7 @@ PD_REGISTER_KERNEL(layer_norm_grad,
      kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
    }
  }
@@ -885,10 +837,10 @@ index 2645060f4c..6a38e20776 100644
                     GPU,
                     ALL_LAYOUT,
 diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
-index ed5f6438ab..52f1e9e1c7 100644
+index 892525493c..854f13a239 100644
 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu
 +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
-@@ -708,7 +708,7 @@ PD_REGISTER_KERNEL(
+@@ -765,7 +765,7 @@ PD_REGISTER_KERNEL(
    kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED);
    kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED);
  }
@@ -897,19 +849,6 @@ index ed5f6438ab..52f1e9e1c7 100644
  PD_REGISTER_KERNEL(layer_norm,
                     GPU,
                     ALL_LAYOUT,
-diff --git a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
-index 20015f7b87..8fee4c4b70 100644
---- a/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
-+++ b/paddle/phi/kernels/gpu/rms_norm_grad_kernel.cu
-@@ -214,7 +214,7 @@ PD_REGISTER_KERNEL(rms_norm_grad,
-                    float,
-                    phi::float16) {}
- 
--#elif CUDNN_VERSION_MIN(8, 1, 0)
-+#elif CUDNN_VERSION_MIN(8, 1, 0) || defined(PADDLE_WITH_COREX)
- 
- PD_REGISTER_KERNEL(rms_norm_grad,
-                    GPU,
 diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h
 index 11481a8b02..136593297e 100644
 --- a/paddle/phi/kernels/primitive/compute_primitives.h
@@ -1070,5 +1009,3 @@ index ffdf995ece..4a7e03f4ad 100644
  #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  PD_REGISTER_KERNEL(unsqueeze,
                     GPU,
--- 
-2.25.1

From aa3e7d1199fc78024dd2664342082bbdc85b1438 Mon Sep 17 00:00:00 2001
From: HydrogenSulfate <490868991@qq.com>
Date: Wed, 24 Dec 2025 11:24:53 +0800
Subject: [PATCH 7/7] restore iluvatar

---
 Paddle                                        |   2 +-
 .../iluvatar_gpu/cmake/external/magma.cmake   |  94 ----
 .../kernels/cuda_kernels/eig_grad_kernel.cu   | 519 ------------------
 .../kernels/cuda_kernels/eig_kernel.cu        | 123 -----
 .../tests/unittests/test_eig_op_iluvatar.py   | 391 -------------
 .../unittests/test_linalg_eig_op_iluvatar.py  |  42 --
 6 files changed, 1 insertion(+), 1170 deletions(-)
 delete mode 100644 backends/iluvatar_gpu/cmake/external/magma.cmake
 delete mode 100644 backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu
 delete mode 100644 backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu
 delete mode 100644 backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py
 delete mode 100644 backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py

diff --git a/Paddle b/Paddle
index 24c29dc7eb6..935f0a6601f 160000
--- a/Paddle
+++ b/Paddle
@@ -1 +1 @@
-Subproject commit 24c29dc7eb6308a4014e39f8679f428e82008137
+Subproject commit 935f0a6601f90f4eb1d091682b31b243da50851b
diff --git a/backends/iluvatar_gpu/cmake/external/magma.cmake b/backends/iluvatar_gpu/cmake/external/magma.cmake
deleted file mode 100644
index 66c9560f6ca..00000000000
--- a/backends/iluvatar_gpu/cmake/external/magma.cmake
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not
-# use this file except in compliance with the License. You may obtain a copy of
-# the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-# License for the specific language governing permissions and limitations under
-# the License.
-
-include(ExternalProject)
-
-set(MAGMA_PREFIX_DIR ${THIRD_PARTY_PATH}/magma)
-set(MAGMA_DOWNLOAD_DIR
-    ${PADDLE_SOURCE_DIR}/third_party/magma/${CMAKE_SYSTEM_NAME})
-set(MAGMA_INSTALL_DIR ${THIRD_PARTY_PATH}/install/magma)
-set(MAGMA_LIB_DIR ${MAGMA_INSTALL_DIR}/lib)
-
-# use precompiled library. use magma tag v2.9.0 on 07/28/2025
-# https://github.com/icl-utk-edu/magma/tree/v2.9.0
-if(LINUX)
-  set(MAGMA_FILE
-      "magma_lnx_metax_v2.9.0.20250728.tar.gz"
-      CACHE STRING "" FORCE)
-  set(MAGMA_URL
-      "https://paddlepaddledeps.bj.bcebos.com/${MAGMA_FILE}"
-      CACHE STRING "" FORCE)
-  set(MAGMA_URL_MD5 3aa4106aa11ba0aeb0036b450a53e972)
-  set(MAGMA_LIB "${MAGMA_LIB_DIR}/libmagma.so")
-elseif(WIN32)
-  message("magma do not support windows yet, skip ...")
-else() # MacOS
-  message("magma do not support macos or other platform yet, skip ...")
-endif()
-
-function(download_magma)
-  message(
-    STATUS "Downloading ${MAGMA_URL} to ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}")
-  # NOTE: If the version is updated, consider emptying the folder; maybe add
-  # timeout
-  file(
-    DOWNLOAD ${MAGMA_URL} ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}
-    EXPECTED_MD5 ${MAGMA_URL_MD5}
-    STATUS ERR)
-  if(ERR EQUAL 0)
-    message(STATUS "Download ${MAGMA_FILE} success")
-  else()
-    message(
-      FATAL_ERROR
-        "Download failed, error: ${ERR}\n You can try downloading ${MAGMA_FILE} again"
-    )
-  endif()
-endfunction()
-
-# Download and check magma.
-if(EXISTS ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE})
-  file(MD5 ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE} MAGMA_MD5)
-  if(NOT MAGMA_MD5 STREQUAL MAGMA_URL_MD5)
-    # clean build file
-    file(REMOVE_RECURSE ${MAGMA_PREFIX_DIR})
-    file(REMOVE_RECURSE ${MAGMA_INSTALL_DIR})
-    download_magma()
-  endif()
-else()
-  download_magma()
-endif()
-
-ExternalProject_Add(
-  extern_magma
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  URL ${MAGMA_DOWNLOAD_DIR}/${MAGMA_FILE}
-  URL_MD5 ${MAGMA_URL_MD5}
-  DOWNLOAD_DIR ${MAGMA_DOWNLOAD_DIR}
-  SOURCE_DIR ${MAGMA_LIB_DIR}
-  PREFIX ${MAGMA_PREFIX_DIR}
-  DOWNLOAD_NO_PROGRESS 1
-  PATCH_COMMAND ""
-  UPDATE_COMMAND ""
-  CONFIGURE_COMMAND ""
-  BUILD_COMMAND ""
-  INSTALL_COMMAND ""
-  BUILD_BYPRODUCTS ${MAGMA_LIB})
-
-add_definitions(-DPADDLE_WITH_MAGMA)
-
-add_library(magma SHARED IMPORTED GLOBAL)
-
-set_target_properties(magma PROPERTIES IMPORTED_LOCATION ${MAGMA_LIB})
-
-add_dependencies(magma extern_magma)
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu
deleted file mode 100644
index 7748e32cd60..00000000000
--- a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_grad_kernel.cu
+++ /dev/null
@@ -1,519 +0,0 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/backends/dynload/cublas.h"
-#include "paddle/phi/backends/dynload/cusolver.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/memory_utils.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/complex_kernel.h"
-#include "paddle/phi/kernels/cpu/eig.h"
-#include "paddle/phi/kernels/funcs/complex_functors.h"
-#include "paddle/phi/kernels/funcs/for_range.h"
-#include "paddle/phi/kernels/funcs/unsqueeze.h"
-#include "paddle/phi/kernels/transpose_kernel.h"
-#include "runtime/iluvatar_context.h"
-
-namespace phi {
-
-template <typename T>
-void SolveLinearSystemGPU(const GPUContext& dev_ctx,
-                          const T* matrix_data,
-                          const T* rhs_data,
-                          T* out_data,
-                          int order,
-                          int rhs_cols,
-                          int batch_count);
-
-template <>
-void SolveLinearSystemGPU<phi::dtype::complex<float>>(
-    const phi::GPUContext& dev_ctx,
-    const phi::dtype::complex<float>*
-        matrix_data,  // device ptr, row-major, size batch*order*order
-    const phi::dtype::complex<float>*
-        rhs_data,  // device ptr, row-major, size batch*order*rhs_cols
-    phi::dtype::complex<float>*
-        out_data,  // device ptr, row-major, size batch*order*rhs_cols
-    int order,
-    int rhs_cols,
-    int batch_count) {
-  // handles
-  cublasHandle_t cublas_handle = dev_ctx.cublas_handle();
-  //   cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle();
-  cusolverDnHandle_t cusolver_handle = GetSolverHandle(dev_ctx.stream());
-
-  auto stream = phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()));
-
-  // cuComplex constants
-  const cuComplex kAlpha = make_cuFloatComplex(1.0f, 0.0f);
-  const cuComplex kZero = make_cuFloatComplex(0.0f, 0.0f);
-
-  // Sizes
-  const size_t A_one_bytes =
-      static_cast<size_t>(order) * order * sizeof(cuComplex);
-  const size_t B_one_bytes =
-      static_cast<size_t>(order) * rhs_cols * sizeof(cuComplex);
-  const size_t A_batch_bytes = A_one_bytes * batch_count;
-  const size_t B_batch_bytes = B_one_bytes * batch_count;
-
-  const cuComplex* A_row_all = reinterpret_cast<const cuComplex*>(matrix_data);
-  const cuComplex* B_row_all = reinterpret_cast<const cuComplex*>(rhs_data);
-  cuComplex* X_row_all = reinterpret_cast<cuComplex*>(out_data);
-
-  auto dA_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
-  auto dB_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
-  cuComplex* dA_col = reinterpret_cast<cuComplex*>(dA_col_alloc->ptr());
-  cuComplex* dB_col = reinterpret_cast<cuComplex*>(dB_col_alloc->ptr());
-
-  auto d_pivots_alloc = phi::memory_utils::Alloc(
-      dev_ctx.GetPlace(),
-      static_cast<size_t>(batch_count) * order * sizeof(int),
-      stream);
-  int* d_pivots = reinterpret_cast<int*>(d_pivots_alloc->ptr());
-
-  auto d_info_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(),
-                               static_cast<size_t>(batch_count) * sizeof(int),
-                               stream);
-  int* d_info = reinterpret_cast<int*>(d_info_alloc->ptr());
-
-  //    A_row layout: row-major (order x order), B_row layout: row-major (order
-  //    x rhs_cols)
-  for (int i = 0; i < batch_count; ++i) {
-    const cuComplex* A_row = A_row_all + static_cast<size_t>(i) * order * order;
-    cuComplex* A_col = dA_col + static_cast<size_t>(i) * order * order;
-    const cuComplex* B_row =
-        B_row_all + static_cast<size_t>(i) * order * rhs_cols;
-    cuComplex* B_col = dB_col + static_cast<size_t>(i) * order * rhs_cols;
-
-    // transpose A_row (row-major) -> A_col (column-major) via C = A^T
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam(
-        cublas_handle,
-        CUBLAS_OP_T,
-        CUBLAS_OP_N,
-        order,
-        order,
-        &kAlpha,
-        A_row,
-        order,  // lda: when interpreting A_row as (order x order) row-major,
-                // using order
-        &kZero,
-        nullptr,
-        order,
-        A_col,
-        order));  // ldc = order (column-major leading dim)
-
-    // transpose B_row (row-major order x rhs_cols) -> B_col (column-major order
-    // x rhs_cols)
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam(
-        cublas_handle,
-        CUBLAS_OP_T,
-        CUBLAS_OP_N,
-        order,
-        rhs_cols,
-        &kAlpha,
-        B_row,
-        rhs_cols,  // lda when A_row is viewed row-major: leading = rhs_cols
-        &kZero,
-        nullptr,
-        rhs_cols,
-        B_col,
-        order));  // ldc = order
-  }
-
-  int lwork = 0;
-  cuComplex* dA_col0 = dA_col;
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrf_bufferSize(
-      cusolver_handle, order, order, dA_col0, order, &lwork));
-
-  size_t work_bytes = static_cast<size_t>(lwork) * sizeof(cuComplex);
-  auto d_work_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream);
-  cuComplex* d_work = reinterpret_cast<cuComplex*>(d_work_alloc->ptr());
-
-  for (int i = 0; i < batch_count; ++i) {
-    cuComplex* A_col = dA_col + static_cast<size_t>(i) * order * order;
-    cuComplex* B_col = dB_col + static_cast<size_t>(i) * order * rhs_cols;
-    int* pivots_i = d_pivots + static_cast<size_t>(i) * order;
-    int* info_i = d_info + i;
-
-    // getrf (LU factorization) on A_col (column-major)
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrf(
-        cusolver_handle, order, order, A_col, order, d_work, pivots_i, info_i));
-
-    // getrs: solve A_col * X_col = B_col
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgetrs(
-        cusolver_handle,
-        CUBLAS_OP_N,  // no transpose on column-major matrix
-        order,
-        rhs_cols,
-        A_col,
-        order,
-        pivots_i,
-        B_col,
-        order,
-        info_i));
-  }
-
-  for (int i = 0; i < batch_count; ++i) {
-    cuComplex* B_col = dB_col + static_cast<size_t>(i) * order *
-                                    rhs_cols;  // X in column-major
-    cuComplex* X_row = X_row_all + static_cast<size_t>(i) * order *
-                                       rhs_cols;  // target row-major
-
-    // transpose X_col -> X_row
-    // We use C = A^T : A has shape (order x rhs_cols) in column-major, so C
-    // will be (rhs_cols x order), but we want X_row with shape (order x
-    // rhs_cols) in row-major; calling cublasCgeam with op=T and adjusted dims
-    // works:
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgeam(
-        cublas_handle,
-        CUBLAS_OP_T,
-        CUBLAS_OP_N,
-        rhs_cols,
-        order,  // rowsC = rhs_cols, colsC = order
-        &kAlpha,
-        B_col,
-        order,  // B_col lda = order (col-major)
-        &kZero,
-        nullptr,
-        order,
-        X_row,
-        rhs_cols));  // X_row ldc = rhs_cols (row-major leading dimension)
-  }
-
-  std::vector<int> h_info(batch_count, 0);
-  phi::memory_utils::Copy(phi::CPUPlace(),
-                          h_info.data(),
-                          dev_ctx.GetPlace(),
-                          d_info,
-                          static_cast<size_t>(batch_count) * sizeof(int),
-                          reinterpret_cast<void*>(dev_ctx.stream()));
-  dev_ctx.Wait();
-
-  for (int i = 0; i < batch_count; ++i) {
-    PADDLE_ENFORCE_EQ(
-        h_info[i],
-        0,
-        errors::External(
-            "cuSOLVER getrf/getrs failed at batch %d, info: %d", i, h_info[i]));
-  }
-}
-
-template <>
-void SolveLinearSystemGPU<phi::dtype::complex<double>>(
-    const phi::GPUContext& dev_ctx,
-    const phi::dtype::complex<double>*
-        matrix_data,  // device ptr, row-major, size batch*order*order
-    const phi::dtype::complex<double>*
-        rhs_data,  // device ptr, row-major, size batch*order*rhs_cols
-    phi::dtype::complex<double>*
-        out_data,  // device ptr, row-major, size batch*order*rhs_cols
-    int order,
-    int rhs_cols,
-    int batch_count) {
-  // handles
-  cublasHandle_t cublas_handle = dev_ctx.cublas_handle();
-  //   cusolverDnHandle_t cusolver_handle = dev_ctx.cusolver_dn_handle();
-  cusolverDnHandle_t cusolver_handle = GetSolverHandle(dev_ctx.stream());
-
-  auto stream = phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream()));
-
-  // cuDoubleComplex constants
-  const cuDoubleComplex kAlpha = make_cuDoubleComplex(1.0f, 0.0f);
-  const cuDoubleComplex kZero = make_cuDoubleComplex(0.0f, 0.0f);
-
-  // Sizes
-  const size_t A_one_bytes =
-      static_cast<size_t>(order) * order * sizeof(cuDoubleComplex);
-  const size_t B_one_bytes =
-      static_cast<size_t>(order) * rhs_cols * sizeof(cuDoubleComplex);
-  const size_t A_batch_bytes = A_one_bytes * batch_count;
-  const size_t B_batch_bytes = B_one_bytes * batch_count;
-
-  const cuDoubleComplex* A_row_all =
-      reinterpret_cast<const cuDoubleComplex*>(matrix_data);
-  const cuDoubleComplex* B_row_all =
-      reinterpret_cast<const cuDoubleComplex*>(rhs_data);
-  cuDoubleComplex* X_row_all = reinterpret_cast<cuDoubleComplex*>(out_data);
-
-  auto dA_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), A_batch_bytes, stream);
-  auto dB_col_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), B_batch_bytes, stream);
-  cuDoubleComplex* dA_col =
-      reinterpret_cast<cuDoubleComplex*>(dA_col_alloc->ptr());
-  cuDoubleComplex* dB_col =
-      reinterpret_cast<cuDoubleComplex*>(dB_col_alloc->ptr());
-
-  auto d_pivots_alloc = phi::memory_utils::Alloc(
-      dev_ctx.GetPlace(),
-      static_cast<size_t>(batch_count) * order * sizeof(int),
-      stream);
-  int* d_pivots = reinterpret_cast<int*>(d_pivots_alloc->ptr());
-
-  auto d_info_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(),
-                               static_cast<size_t>(batch_count) * sizeof(int),
-                               stream);
-  int* d_info = reinterpret_cast<int*>(d_info_alloc->ptr());
-
-  //    A_row layout: row-major (order x order), B_row layout: row-major (order
-  //    x rhs_cols)
-  for (int i = 0; i < batch_count; ++i) {
-    const cuDoubleComplex* A_row =
-        A_row_all + static_cast<size_t>(i) * order * order;
-    cuDoubleComplex* A_col = dA_col + static_cast<size_t>(i) * order * order;
-    const cuDoubleComplex* B_row =
-        B_row_all + static_cast<size_t>(i) * order * rhs_cols;
-    cuDoubleComplex* B_col = dB_col + static_cast<size_t>(i) * order * rhs_cols;
-
-    // transpose A_row (row-major) -> A_col (column-major) via C = A^T
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam(
-        cublas_handle,
-        CUBLAS_OP_T,
-        CUBLAS_OP_N,
-        order,
-        order,
-        &kAlpha,
-        A_row,
-        order,  // lda: when interpreting A_row as (order x order) row-major,
-                // using order
-        &kZero,
-        nullptr,
-        order,
-        A_col,
-        order));  // ldc = order (column-major leading dim)
-
-    // transpose B_row (row-major order x rhs_cols) -> B_col (column-major order
-    // x rhs_cols)
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam(
-        cublas_handle,
-        CUBLAS_OP_T,
-        CUBLAS_OP_N,
-        order,
-        rhs_cols,
-        &kAlpha,
-        B_row,
-        rhs_cols,  // lda when A_row is viewed row-major: leading = rhs_cols
-        &kZero,
-        nullptr,
-        rhs_cols,
-        B_col,
-        order));  // ldc = order
-  }
-
-  int lwork = 0;
-  cuDoubleComplex* dA_col0 = dA_col;
-  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrf_bufferSize(
-      cusolver_handle, order, order, dA_col0, order, &lwork));
-
-  size_t work_bytes = static_cast<size_t>(lwork) * sizeof(cuDoubleComplex);
-  auto d_work_alloc =
-      phi::memory_utils::Alloc(dev_ctx.GetPlace(), work_bytes, stream);
-  cuDoubleComplex* d_work =
-      reinterpret_cast<cuDoubleComplex*>(d_work_alloc->ptr());
-
-  for (int i = 0; i < batch_count; ++i) {
-    cuDoubleComplex* A_col = dA_col + static_cast<size_t>(i) * order * order;
-    cuDoubleComplex* B_col = dB_col + static_cast<size_t>(i) * order * rhs_cols;
-    int* pivots_i = d_pivots + static_cast<size_t>(i) * order;
-    int* info_i = d_info + i;
-
-    // getrf (LU factorization) on A_col (column-major)
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrf(
-        cusolver_handle, order, order, A_col, order, d_work, pivots_i, info_i));
-
-    // getrs: solve A_col * X_col = B_col
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgetrs(
-        cusolver_handle,
-        CUBLAS_OP_N,  // no transpose on column-major matrix
-        order,
-        rhs_cols,
-        A_col,
-        order,
-        pivots_i,
-        B_col,
-        order,
-        info_i));
-  }
-
-  for (int i = 0; i < batch_count; ++i) {
-    cuDoubleComplex* B_col = dB_col + static_cast<size_t>(i) * order *
-                                          rhs_cols;  // X in column-major
-    cuDoubleComplex* X_row = X_row_all + static_cast<size_t>(i) * order *
-                                             rhs_cols;  // target row-major
-
-    // transpose X_col -> X_row
-    // We use C = A^T : A has shape (order x rhs_cols) in column-major, so C
-    // will be (rhs_cols x order), but we want X_row with shape (order x
-    // rhs_cols) in row-major; calling cublasZgeam with op=T and adjusted dims
-    // works:
-    PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgeam(
-        cublas_handle,
-        CUBLAS_OP_T,
-        CUBLAS_OP_N,
-        rhs_cols,
-        order,  // rowsC = rhs_cols, colsC = order
-        &kAlpha,
-        B_col,
-        order,  // B_col lda = order (col-major)
-        &kZero,
-        nullptr,
-        order,
-        X_row,
-        rhs_cols));  // X_row ldc = rhs_cols (row-major leading dimension)
-  }
-
-  std::vector<int> h_info(batch_count, 0);
-  phi::memory_utils::Copy(phi::CPUPlace(),
-                          h_info.data(),
-                          dev_ctx.GetPlace(),
-                          d_info,
-                          static_cast<size_t>(batch_count) * sizeof(int),
-                          reinterpret_cast<void*>(dev_ctx.stream()));
-  dev_ctx.Wait();
-
-  for (int i = 0; i < batch_count; ++i) {
-    PADDLE_ENFORCE_EQ(
-        h_info[i],
-        0,
-        errors::External(
-            "cuSOLVER getrf/getrs failed at batch %d, info: %d", i, h_info[i]));
-  }
-}
-
-template <typename T, typename Context>
-void ComputeBackwardForComplexInputGPU(const DenseTensor& L,
-                                       const DenseTensor& V,
-                                       const paddle::optional<DenseTensor>& gL,
-                                       const paddle::optional<DenseTensor>& gV,
-                                       T* x_grad_data,
-                                       int batch_count,
-                                       int order,
-                                       const Context& dev_ctx) {
-  DenseTensor gL_safe;
-  if (gL.get_ptr()) {
-    gL_safe = gL.get();
-  } else {
-    gL_safe =
-        Fill<T, Context>(dev_ctx, common::vectorize<int64_t>(L.dims()), T(0));
-  }
-
-  DenseTensor gV_safe;
-  if (gV.get_ptr()) {
-    gV_safe = gV.get();
-  } else {
-    gV_safe =
-        Fill<T, Context>(dev_ctx, common::vectorize<int64_t>(V.dims()), T(0));
-  }
-  DenseTensor trans_v = phi::TransposeLast2Dim<T>(dev_ctx, V);
-  DenseTensor Vh = phi::Conj<T>(dev_ctx, trans_v);
-  DenseTensor Lconj = phi::Conj<T>(dev_ctx, L);
-  DenseTensor Econj = phi::Subtract<T>(dev_ctx,
-                                       phi::funcs::Unsqueeze(Lconj, -2),
-                                       phi::funcs::Unsqueeze(Lconj, -1));
-  DenseTensor VhgV = phi::Matmul<T>(dev_ctx, Vh, gV_safe);
-  DenseTensor diag_real = phi::Real<T>(dev_ctx, VhgV);
-
-  auto cpu_place = phi::CPUPlace();
-  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
-  auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
-
-  DenseTensor diag_real_cpu;
-  diag_real_cpu.Resize(diag_real.dims());
-  phi::Copy(dev_ctx, diag_real, cpu_place, false, &diag_real_cpu);
-
-  DenseTensor diag_res_cpu =
-      phi::funcs::BatchDiag<T>((*cpu_ctx), diag_real_cpu, batch_count);
-
-  DenseTensor diag_res;
-  dev_ctx.template Alloc<T>(&diag_res);
-  phi::Copy(dev_ctx, diag_res_cpu, dev_ctx.GetPlace(), false, &diag_res);
-
-  DenseTensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2);
-
-  auto numel = diag_unsqueezed.numel();
-  DenseTensor diag_unsqueezed_complex;
-  auto* data_diag_un = diag_unsqueezed.data<dtype::Real<T>>();
-  diag_unsqueezed_complex.Resize(diag_unsqueezed.dims());
-  auto* data_diag_un_com = dev_ctx.template Alloc<T>(
-      &diag_unsqueezed_complex, static_cast<size_t>(numel * sizeof(T)));
-
-  phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
-  phi::funcs::RealToComplexFunctor<T> functor(
-      data_diag_un, data_diag_un_com, numel);
-  for_range(functor);
-  // real tensor multiply complex tensor in broadcast manner
-  DenseTensor res1 = phi::Multiply<T>(dev_ctx, V, diag_unsqueezed_complex);
-  DenseTensor res2 = phi::Matmul<T>(dev_ctx, Vh, res1);
-  DenseTensor result = phi::Subtract<T>(dev_ctx, VhgV, res2);
-
-  result.Resize(V.dims());
-  dev_ctx.template Alloc<T>(&result);
-  result = phi::Divide<T>(dev_ctx, result, Econj);
-  result = phi::funcs::DiagFill<T, T>(
-      dev_ctx, order, order, order, 0, gL_safe, result);
-  DenseTensor rhs = phi::Matmul<T>(dev_ctx, result, Vh);
-
-  // solve linear system
-  // solve(Vh, rhs, out, m, k)
-  // Vh: matrix with shape [m,m]
-  // rhs: rhs with shape [m,k]
-  // x_grad: out
-  int m = static_cast<int>(Vh.dims(-1));
-  int k = static_cast<int>(rhs.dims(-1));
-  auto* matrix_data = Vh.data<T>();
-  auto* rhs_data = rhs.data<T>();
-
-  SolveLinearSystemGPU<T>(
-      dev_ctx, matrix_data, rhs_data, x_grad_data, m, k, batch_count);
-}
-
-template <typename T, typename Context>
-void EigGradKernel(const Context& dev_ctx,
-                   const DenseTensor& out_w,
-                   const DenseTensor& out_v,
-                   const paddle::optional<DenseTensor>& dout_w,
-                   const paddle::optional<DenseTensor>& dout_v,
-                   DenseTensor* dx) {
-  auto* dx_data = dev_ctx.template Alloc<phi::dtype::Complex<T>>(dx);
-  if (dx->numel() == 0) {
-    return;
-  }
-  auto& dims = out_v.dims();
-  phi::DDim dim_origin = dims;
-  int num_dims = dim_origin.size();
-  int batch_count = BatchCount(out_v);
-  const int order = static_cast<int>(dim_origin[num_dims - 1]);
-
-  ComputeBackwardForComplexInputGPU<phi::dtype::Complex<T>, Context>(
-      out_w, out_v, dout_w, dout_v, dx_data, batch_count, order, dev_ctx);
-}
-
-}  // namespace phi
-
-// Register the kernel
-PD_REGISTER_PLUGIN_KERNEL(eig_grad,
-                          iluvatar_gpu,
-                          ALL_LAYOUT,
-                          phi::EigGradKernel,
-                          float,
-                          phi::complex64) {
-  kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
-  kernel->InputAt(2).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
-  kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
-}
diff --git a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu b/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu
deleted file mode 100644
index 8461b82ece7..00000000000
--- a/backends/iluvatar_gpu/kernels/cuda_kernels/eig_kernel.cu
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/backends/context_pool.h"
-#include "paddle/phi/common/place.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/cpu/eig.h"
-#include "paddle/phi/kernels/eig_kernel.h"
-
-namespace phi {
-
-template <typename T, typename Context>
-void EigKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               DenseTensor* out_w,
-               DenseTensor* out_v) {
-  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_w);
-  dev_ctx.template Alloc<phi::dtype::Complex<T>>(out_v);
-
-  if (x.numel() == 0) {
-    return;
-  }
-
-  auto cpu_place = phi::CPUPlace();
-  phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
-  auto* cpu_ctx = static_cast<phi::CPUContext*>(pool.Get(cpu_place));
-
-  // prepare cpu Tensor here, since magma requires output on cpu
-  DenseTensor out_w_cpu, out_v_cpu;
-  out_w_cpu.Resize(out_w->dims());
-  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_w_cpu);
-  out_v_cpu.Resize(x.dims());
-  (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_cpu);
-
-  if (!IsComplexType(x.dtype())) {
-    // output still be complex though input is real
-    int batch_count = BatchCount(x);
-    int order = static_cast<int>(x.dims()[x.dims().size() - 1]);
-
-    DenseTensor real_w_cpu, real_v_cpu;
-
-    std::vector<int64_t> real_w_dim = common::vectorize<int64_t>(out_w->dims());
-    real_w_dim.back() *= 2;
-    real_w_cpu.Resize(common::make_ddim(real_w_dim));
-    (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_w_cpu);
-    real_v_cpu.Resize(x.dims());
-    (*cpu_ctx).template Alloc<phi::dtype::Real<T>>(&real_v_cpu);
-
-    phi::ApplyEigKernelMagma<phi::dtype::Real<T>, Context>(
-        dev_ctx, x, &real_w_cpu, &real_v_cpu);
-
-    // 1. extract real part & imag part from real_w_cpu
-    DenseTensor real_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
-        (*cpu_ctx), real_w_cpu, {-1}, {0}, {order});
-    DenseTensor imag_part_cpu = phi::funcs::Slice<phi::dtype::Real<T>>(
-        (*cpu_ctx), real_w_cpu, {-1}, {order}, {order * 2});
-
-    // 2. construct complex values
-    auto* real_part_data = real_part_cpu.data<phi::dtype::Real<T>>();
-    auto* imag_part_data = imag_part_cpu.data<phi::dtype::Real<T>>();
-    int64_t out_w_numel = static_cast<int64_t>(out_w->numel());
-
-    phi::funcs::ForRange<phi::CPUContext> for_range((*cpu_ctx), out_w_numel);
-    phi::funcs::RealImagToComplexFunctor<phi::dtype::Complex<T>> functor(
-        real_part_data,
-        imag_part_data,
-        out_w_cpu.data<phi::dtype::Complex<T>>(),
-        out_w_numel);
-    for_range(functor);
-
-    // 3. construct complex vectors
-    DenseTensor real_v_trans_cpu =
-        phi::TransposeLast2Dim<phi::dtype::Real<T>, phi::CPUContext>(
-            (*cpu_ctx), real_v_cpu);
-    DenseTensor out_v_trans_cpu;
-    out_v_trans_cpu.Resize(x.dims());
-    (*cpu_ctx).template Alloc<phi::dtype::Complex<T>>(&out_v_trans_cpu);
-
-    phi::ConstructComplexVectors<phi::dtype::Real<T>,
-                                 phi::dtype::Complex<T>,
-                                 phi::CPUContext>(&out_v_trans_cpu,
-                                                  out_w_cpu,
-                                                  real_v_trans_cpu,
-                                                  (*cpu_ctx),
-                                                  batch_count,
-                                                  order);
-
-    TransposeTwoAxis<phi::dtype::Complex<T>, phi::CPUContext>(
-        out_v_trans_cpu,
-        &out_v_cpu,
-        x.dims().size() - 1,
-        x.dims().size() - 2,
-        (*cpu_ctx));
-
-  } else {
-    phi::ApplyEigKernelMagma<T, Context>(dev_ctx, x, &out_w_cpu, &out_v_cpu);
-  }
-
-  // copy result from cpu to gpu tensor
-  phi::Copy(dev_ctx, out_w_cpu, dev_ctx.GetPlace(), false, out_w);
-  phi::Copy(dev_ctx, out_v_cpu, dev_ctx.GetPlace(), false, out_v);
-}
-
-}  // namespace phi
-
-PD_REGISTER_PLUGIN_KERNEL(
-    eig, iluvatar_gpu, ALL_LAYOUT, phi::EigKernel, float, phi::complex64) {
-  if (kernel_key.dtype() == phi::DataType::FLOAT32) {
-    kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
-    kernel->OutputAt(1).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
-  }
-}
diff --git a/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py
deleted file mode 100644
index 3f5fbb9f7f3..00000000000
--- a/backends/iluvatar_gpu/tests/unittests/test_eig_op_iluvatar.py
+++ /dev/null
@@ -1,391 +0,0 @@
-#  Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import numpy as np
-from op_test import OpTest, skip_check_grad_ci
-from utils import dygraph_guard
-
-import paddle
-from paddle import base
-
-
-# cast output to complex for numpy.linalg.eig
-def cast_to_complex(input, output):
-    if input.dtype == np.float32:
-        output = output.astype(np.complex64)
-    elif input.dtype == np.float64:
-        output = output.astype(np.complex128)
-    return output
-
-
-# define eig backward function for a single square matrix
-def eig_backward(w, v, grad_w, grad_v):
-    v_tran = np.transpose(v)
-    v_tran = np.conjugate(v_tran)
-    w_conj = np.conjugate(w)
-    w_conj_l = w_conj.reshape(1, w.size)
-    w_conj_r = w_conj.reshape(w.size, 1)
-    w_conj_2d = w_conj_l - w_conj_r
-
-    vhgv = np.matmul(v_tran, grad_v)
-    real_vhgv = np.real(vhgv)
-    diag_real = real_vhgv.diagonal()
-
-    diag_2d = diag_real.reshape(1, w.size)
-    rhs = v * diag_2d
-    mid = np.matmul(v_tran, rhs)
-    result = vhgv - mid
-
-    res = np.divide(result, w_conj_2d)
-    row, col = np.diag_indices_from(res)
-    res[row, col] = 1.0
-
-    tmp = np.matmul(res, v_tran)
-    dx = np.linalg.solve(v_tran, tmp)
-    return dx
-
-
-class TestEigOp(OpTest):
-    def setUp(self):
-        paddle.enable_static()
-        paddle.device.set_device("iluvatar_gpu")
-        self.op_type = "eig"
-        self.python_api = paddle.linalg.eig
-        self.__class__.op_type = self.op_type
-        self.init_input()
-        self.inputs = {"X": OpTest.np_dtype_to_base_dtype(self.x)}
-        self.outputs = {"Eigenvalues": self.out[0], "Eigenvectors": self.out[1]}
-
-    def init_input(self):
-        self.set_dtype()
-        self.set_dims()
-        self.x = np.random.random(self.shape).astype(self.dtype)
-        self.out = np.linalg.eig(self.x)
-        self.out = (
-            cast_to_complex(self.x, self.out[0]),
-            cast_to_complex(self.x, self.out[1]),
-        )
-
-    # for the real input, a customized checker is needed
-    def checker(self, outs):
-        actual_out_w = outs[0].flatten()
-        expect_out_w = self.out[0].flatten()
-        actual_out_v = outs[1].flatten()
-        expect_out_v = self.out[1].flatten()
-
-        length_w = len(expect_out_w)
-        act_w_real = np.sort(
-            np.array([np.abs(actual_out_w[i].real) for i in range(length_w)])
-        )
-        act_w_imag = np.sort(
-            np.array([np.abs(actual_out_w[i].imag) for i in range(length_w)])
-        )
-        exp_w_real = np.sort(
-            np.array([np.abs(expect_out_w[i].real) for i in range(length_w)])
-        )
-        exp_w_imag = np.sort(
-            np.array([np.abs(expect_out_w[i].imag) for i in range(length_w)])
-        )
-
-        for i in range(length_w):
-            np.testing.assert_allclose(
-                act_w_real[i],
-                exp_w_real[i],
-                rtol=1e-06,
-                atol=1e-05,
-                err_msg="The eigenvalues real part have diff: \nExpected "
-                + str(act_w_real[i])
-                + "\n"
-                + "But got: "
-                + str(exp_w_real[i]),
-            )
-            np.testing.assert_allclose(
-                act_w_imag[i],
-                exp_w_imag[i],
-                rtol=1e-06,
-                atol=1e-05,
-                err_msg="The eigenvalues image part have diff: \nExpected "
-                + str(act_w_imag[i])
-                + "\n"
-                + "But got: "
-                + str(exp_w_imag[i]),
-            )
-
-        length_v = len(expect_out_v)
-        act_v_real = np.sort(
-            np.array([np.abs(actual_out_v[i].real) for i in range(length_v)])
-        )
-        act_v_imag = np.sort(
-            np.array([np.abs(actual_out_v[i].imag) for i in range(length_v)])
-        )
-        exp_v_real = np.sort(
-            np.array([np.abs(expect_out_v[i].real) for i in range(length_v)])
-        )
-        exp_v_imag = np.sort(
-            np.array([np.abs(expect_out_v[i].imag) for i in range(length_v)])
-        )
-
-        for i in range(length_v):
-            np.testing.assert_allclose(
-                act_v_real[i],
-                exp_v_real[i],
-                rtol=1e-06,
-                atol=1e-05,
-                err_msg="The eigenvectors real part have diff: \nExpected "
-                + str(act_v_real[i])
-                + "\n"
-                + "But got: "
-                + str(exp_v_real[i]),
-            )
-            np.testing.assert_allclose(
-                act_v_imag[i],
-                exp_v_imag[i],
-                rtol=1e-06,
-                atol=1e-05,
-                err_msg="The eigenvectors image part have diff: \nExpected "
-                + str(act_v_imag[i])
-                + "\n"
-                + "But got: "
-                + str(exp_v_imag[i]),
-            )
-
-    def set_dtype(self):
-        self.dtype = np.complex64
-
-    def set_dims(self):
-        self.shape = (10, 10)
-
-    def init_grad(self):
-        # grad_w, grad_v complex dtype
-        gtype = self.dtype
-        if self.dtype == np.float32:
-            gtype = np.complex64
-        elif self.dtype == np.float64:
-            gtype = np.complex128
-        self.grad_w = np.ones(self.out[0].shape, gtype)
-        self.grad_v = np.ones(self.out[1].shape, gtype)
-        self.grad_x = eig_backward(self.out[0], self.out[1], self.grad_w, self.grad_v)
-
-    def test_check_output(self):
-        self.check_output_with_place_customized(
-            checker=self.checker,
-            place=paddle.CustomPlace("iluvatar_gpu", 0),
-            check_pir=True,
-        )
-
-    def test_check_grad(self):
-        self.init_grad()
-        self.check_grad(
-            ["X"],
-            ["Eigenvalues", "Eigenvectors"],
-            user_defined_grads=[self.grad_x],
-            user_defined_grad_outputs=[self.grad_w, self.grad_v],
-            check_pir=True,
-        )
-
-
-@skip_check_grad_ci(
-    reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig"
-)
-class TestFloat(TestEigOp):
-    def set_dtype(self):
-        self.dtype = np.float32
-
-    def test_check_grad(self):
-        pass
-
-
-class TestEigStatic(TestEigOp):
-    def test_check_output_with_place(self):
-        paddle.enable_static()
-        place = paddle.CustomPlace("iluvatar_gpu", 0)
-        input_np = np.random.random([3, 3]).astype("complex")
-        expect_val, expect_vec = np.linalg.eig(input_np)
-        with base.program_guard(base.Program(), base.Program()):
-            input = paddle.static.data(name="input", shape=[3, 3], dtype="complex")
-            act_val, act_vec = paddle.linalg.eig(input)
-
-            exe = base.Executor(place)
-            fetch_val, fetch_vec = exe.run(
-                base.default_main_program(),
-                feed={"input": input_np},
-                fetch_list=[act_val, act_vec],
-            )
-        np.testing.assert_allclose(
-            expect_val,
-            fetch_val,
-            rtol=1e-06,
-            atol=1e-06,
-            err_msg="The eigen values have diff: \nExpected "
-            + str(expect_val)
-            + "\n"
-            + "But got: "
-            + str(fetch_val),
-        )
-        np.testing.assert_allclose(
-            np.abs(expect_vec),
-            np.abs(fetch_vec),
-            rtol=1e-06,
-            atol=1e-06,
-            err_msg="The eigen vectors have diff: \nExpected "
-            + str(np.abs(expect_vec))
-            + "\n"
-            + "But got: "
-            + str(np.abs(fetch_vec)),
-        )
-
-
-class TestEigDyGraph(unittest.TestCase):
-    def test_check_output_with_place(self):
-        np.random.seed(1024)
-        input_np = np.random.random([3, 3]).astype("complex64")
-        expect_val, expect_vec = np.linalg.eig(input_np)
-
-        paddle.set_device("iluvatar_gpu")
-        paddle.disable_static()
-
-        input_tensor = paddle.to_tensor(input_np)
-        fetch_val, fetch_vec = paddle.linalg.eig(input_tensor)
-
-        np.testing.assert_allclose(
-            expect_val,
-            fetch_val.numpy(),
-            rtol=1e-06,
-            atol=1e-06,
-            err_msg="The eigen values have diff: \nExpected "
-            + str(expect_val)
-            + "\n"
-            + "But got: "
-            + str(fetch_val),
-        )
-        np.testing.assert_allclose(
-            np.abs(expect_vec),
-            np.abs(fetch_vec.numpy()),
-            rtol=1e-06,
-            atol=1e-06,
-            err_msg="The eigen vectors have diff: \nExpected "
-            + str(np.abs(expect_vec))
-            + "\n"
-            + "But got: "
-            + str(np.abs(fetch_vec.numpy())),
-        )
-
-    # def test_check_grad(self):
-    #     test_shape = [3, 3]
-    #     test_type = 'float32'
-    #     paddle.set_device("iluvatar_gpu")
-
-    #     np.random.seed(1024)
-    #     input_np = np.random.random(test_shape).astype(test_type)
-    #     real_w, real_v = np.linalg.eig(input_np)
-
-    #     grad_w = np.ones(real_w.shape, test_type)
-    #     grad_v = np.ones(real_v.shape, test_type)
-    #     grad_x = eig_backward(real_w, real_v, grad_w, grad_v)
-
-    #     with base.dygraph.guard():
-    #         x = paddle.to_tensor(input_np)
-    #         x.stop_gradient = False
-    #         w, v = paddle.linalg.eig(x)
-    #         (w.sum() + v.sum()).backward()
-
-    #     np.testing.assert_allclose(
-    #         np.abs(x.grad.numpy()),
-    #         np.abs(grad_x),
-    #         rtol=1e-05,
-    #         atol=1e-05,
-    #         err_msg='The grad x have diff: \nExpected '
-    #         + str(np.abs(grad_x))
-    #         + '\n'
-    #         + 'But got: '
-    #         + str(np.abs(x.grad.numpy())),
-    #     )
-
-
-class TestEigWrongDimsError(unittest.TestCase):
-    def test_error(self):
-        paddle.device.set_device("iluvatar_gpu")
-        paddle.disable_static()
-        a = np.random.random(3).astype("float32")
-        x = paddle.to_tensor(a)
-        self.assertRaises(ValueError, paddle.linalg.eig, x)
-
-
-class TestEigNotSquareError(unittest.TestCase):
-    def test_error(self):
-        paddle.device.set_device("iluvatar_gpu")
-        paddle.disable_static()
-        a = np.random.random((1, 2, 3)).astype("float32")
-        x = paddle.to_tensor(a)
-        self.assertRaises(ValueError, paddle.linalg.eig, x)
-
-
-class TestEigUnsupportedDtypeError(unittest.TestCase):
-    def test_error(self):
-        paddle.device.set_device("iluvatar_gpu")
-        paddle.disable_static()
-        a = (np.random.random((3, 3)) * 10).astype("int64")
-        x = paddle.to_tensor(a)
-        self.assertRaises(RuntimeError, paddle.linalg.eig, x)
-
-
-class TestOptionalGradInput(unittest.TestCase):
-    def test_eager(self):
-        with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"):
-            x = paddle.randn(3, 3, requires_grad=True)
-            w, v = paddle.linalg.eig(x)
-
-            np.testing.assert_allclose(
-                (x @ v).numpy(),
-                (w.unsqueeze(0) * v).numpy(),
-                atol=1e-5,
-                rtol=1e-5,
-            )  # Aμ = λμ
-
-            # (dw_dx,) = paddle.grad(w, x, retain_graph=True)
-            # (dv_dx,) = paddle.grad(v, x, retain_graph=True)
-            # (dwdv_dx,) = paddle.grad([w, v], x)
-            # np.testing.assert_allclose(
-            #     (dw_dx + dv_dx).numpy(),
-            #     dwdv_dx.numpy(),
-            #     atol=1e-5,
-            #     rtol=1e-5,
-            # )
-
-    def test_dy2st(self):
-        with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"):
-            x = paddle.randn(3, 3, requires_grad=True)
-
-            def f(x):
-                w, v = paddle.linalg.eig(x)
-                return (
-                    w,
-                    v,
-                )
-
-            st_f = paddle.jit.to_static(f, full_graph=True, backend=None)
-
-            w, v = st_f(x)
-            np.testing.assert_allclose(
-                (x @ v).numpy(),
-                (w.unsqueeze(0) * v).numpy(),
-                atol=1e-5,
-                rtol=1e-5,
-            )  # Aμ = λμ
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py b/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py
deleted file mode 100644
index 18581f25af0..00000000000
--- a/backends/iluvatar_gpu/tests/unittests/test_linalg_eig_op_iluvatar.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#  Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from utils import dygraph_guard
-
-import paddle
-
-
-class TestEigAPI0Size(unittest.TestCase):
-    def test_errors(self):
-        with dygraph_guard(), paddle.device.device_guard("iluvatar_gpu"):
-            for shape in [[0, 0], [0, 4, 4], [1, 0, 2, 3, 3]]:
-                x = paddle.randn(shape=shape, dtype="float32", requires_grad=True)
-                w, v = paddle.linalg.eig(x)
-                self.assertEqual(w.shape, shape[:-1])
-                self.assertEqual(v.shape, shape)
-
-                # (dw_dx,) = paddle.grad(w.abs().sum(), x, retain_graph=True)
-                # self.assertEqual(dw_dx.shape, x.shape)
-                # (dv_dx,) = paddle.grad(v.abs().sum(), x, retain_graph=True)
-                # self.assertEqual(dv_dx.shape, x.shape)
-                # (dwv_dx,) = paddle.grad(
-                #     w.abs().sum() + v.abs().sum(), x, retain_graph=True
-                # )
-                # self.assertEqual(dwv_dx.shape, x.shape)
-
-
-if __name__ == "__main__":
-    unittest.main()