diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2b8c6934f9..718028541d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -282,6 +282,8 @@ if(RAFT_COMPILE_LIBRARY) src/raft_runtime/solver/lanczos_solver_int64_float.cu src/raft_runtime/solver/lanczos_solver_int_double.cu src/raft_runtime/solver/lanczos_solver_int_float.cu + src/raft_runtime/solver/randomized_svds_float.cu + src/raft_runtime/solver/randomized_svds_double.cu ) set_target_properties( raft_objs diff --git a/cpp/include/raft/sparse/solver/detail/cholesky_qr.cuh b/cpp/include/raft/sparse/solver/detail/cholesky_qr.cuh new file mode 100644 index 0000000000..2176c38b2b --- /dev/null +++ b/cpp/include/raft/sparse/solver/detail/cholesky_qr.cuh @@ -0,0 +1,159 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace raft::sparse::solver::detail { + +/** + * @brief Single pass of CholeskyQR: orthogonalize Q in-place via Cholesky factorization + * of the Gram matrix W = Q^T @ Q. 
+ * + * @return true on success, false if Cholesky factorization failed (matrix not SPD / rank deficient) + */ +template +bool cholesky_qr_pass(raft::resources const& handle, + ValueTypeT* Q, + int m, + int k, + ValueTypeT* W, + ValueTypeT* workspace, + int workspace_size, + int* dev_info) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas_h = raft::resource::get_cublas_handle(handle); + auto cusolver_h = raft::resource::get_cusolver_dn_handle(handle); + + const ValueTypeT one = 1; + const ValueTypeT zero = 0; + + // W = Q^T @ Q (k x k) + // Q is col-major (m x k), so: W = Q^T * Q via gemm(TRANS, NOTRANS, k, k, m) + raft::linalg::gemm(handle, + true, // trans_a + false, // trans_b + k, + k, + m, + &one, + Q, + m, + Q, + m, + &zero, + W, + k, + stream); + + // L = cholesky(W, LOWER) — W is overwritten with L in lower triangle + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf( + cusolver_h, CUBLAS_FILL_MODE_LOWER, k, W, k, workspace, workspace_size, dev_info, stream)); + + // Check if Cholesky succeeded + int h_dev_info = 0; + raft::update_host(&h_dev_info, dev_info, 1, stream); + raft::resource::sync_stream(handle); + if (h_dev_info != 0) { return false; } + + // Q = Q @ L^{-T} + // This is equivalent to solving X * L^T = Q for X, i.e. trsm with RIGHT, LOWER, TRANS + RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(cublas_h, + CUBLAS_SIDE_RIGHT, + CUBLAS_FILL_MODE_LOWER, + CUBLAS_OP_T, + CUBLAS_DIAG_NON_UNIT, + m, + k, + &one, + W, + k, + Q, + m, + stream)); + + return true; +} + +/** + * @brief CholeskyQR2 orthogonalization: two passes of CholeskyQR for numerical stability. + * + * This is the GPU-optimized orthogonalization from Tomás, Quintana-Ortí, Anzt (2024), + * "Fast Truncated SVD of Sparse and Dense Matrices on Graphics Processors". + * It uses GEMM + Cholesky + TRSM operations which are highly efficient on GPU, + * providing ~3x speedup over standard Householder QR. 
+ * + * If Cholesky factorization fails (input is rank-deficient), falls back to standard QR. + * + * @param handle raft resources handle + * @param Q matrix to orthogonalize of shape (m, k), col-major, modified in-place + * @return true if CholeskyQR2 succeeded, false if fell back to QR + */ +template +bool cholesky_qr2(raft::resources const& handle, + raft::device_matrix_view Q) +{ + int m = Q.extent(0); + int k = Q.extent(1); + + auto stream = raft::resource::get_cuda_stream(handle); + auto cusolver_h = raft::resource::get_cusolver_dn_handle(handle); + + // Allocate workspace for Gram matrix and Cholesky + rmm::device_uvector W(k * k, stream); + rmm::device_scalar dev_info(stream); + + // Query workspace size for potrf + int potrf_workspace_size = 0; + RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf_bufferSize( + cusolver_h, CUBLAS_FILL_MODE_LOWER, k, W.data(), k, &potrf_workspace_size)); + rmm::device_uvector potrf_workspace(potrf_workspace_size, stream); + + // First pass + if (!cholesky_qr_pass(handle, + Q.data_handle(), + m, + k, + W.data(), + potrf_workspace.data(), + potrf_workspace_size, + dev_info.data())) { + // Fallback to standard QR (qrGetQ handles src==dst via internal copy) + raft::linalg::qrGetQ(handle, Q.data_handle(), Q.data_handle(), m, k, stream); + return false; + } + + // Second pass for improved numerical stability + if (!cholesky_qr_pass(handle, + Q.data_handle(), + m, + k, + W.data(), + potrf_workspace.data(), + potrf_workspace_size, + dev_info.data())) { + // Fallback to standard QR (qrGetQ handles src==dst via internal copy) + raft::linalg::qrGetQ(handle, Q.data_handle(), Q.data_handle(), m, k, stream); + return false; + } + + return true; +} + +} // namespace raft::sparse::solver::detail diff --git a/cpp/include/raft/sparse/solver/detail/csr_linear_operator.cuh b/cpp/include/raft/sparse/solver/detail/csr_linear_operator.cuh new file mode 100644 index 0000000000..d29449f5cf --- /dev/null +++ 
b/cpp/include/raft/sparse/solver/detail/csr_linear_operator.cuh @@ -0,0 +1,85 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft::sparse::solver::detail { + +/** + * @brief CSR matrix wrapper providing apply() / apply_transpose() for the sparse SVD solver. + * + * This is NOT a general-purpose LinearOperator (in the sense of scipy's + * `scipy.sparse.linalg.LinearOperator`): it only exposes the two products the + * randomized SVD inner loop needs (Y = A @ X and Z = A^T @ X) via cuSPARSE SpMM. + * It intentionally lives in `detail/` and is not part of the public API. + * + * @note The cuSPARSE spmm wrapper requires `int` for indptr/indices types, so this wrapper + * is currently limited to int-indexed CSR matrices. + * + * @tparam ValueTypeT Data type of matrix values + * @tparam NNZTypeT Type for number of non-zeros + */ +template +struct csr_linear_operator { + /** + * @brief Construct from a const CSR matrix view + * + * @note `m_`/`n_` are cached at construction because + * `raft::device_csr_matrix_view::structure_view()` is not const-qualified + * and cannot be invoked from a const member function. 
+ */ + explicit csr_linear_operator(raft::device_csr_matrix_view A) + : A_(A), m_(A.structure_view().get_n_rows()), n_(A.structure_view().get_n_cols()) + { + } + + int rows() const { return m_; } + int cols() const { return n_; } + + /** @brief Access the underlying const CSR matrix view */ + raft::device_csr_matrix_view csr_view() const { return A_; } + + /** + * @brief Compute Y = A @ X + * @param[in] handle raft resources handle + * @param[in] X input dense matrix of shape (n, k) col-major + * @param[out] Y output dense matrix of shape (m, k) col-major + */ + void apply(raft::resources const& handle, + raft::device_matrix_view X, + raft::device_matrix_view Y) const + { + ValueTypeT alpha = 1; + ValueTypeT beta = 0; + raft::sparse::linalg::spmm(handle, false, false, &alpha, A_, X, &beta, Y); + } + + /** + * @brief Compute Z = A^T @ X + * @param[in] handle raft resources handle + * @param[in] X input dense matrix of shape (m, k) col-major + * @param[out] Z output dense matrix of shape (n, k) col-major + */ + void apply_transpose(raft::resources const& handle, + raft::device_matrix_view X, + raft::device_matrix_view Z) const + { + ValueTypeT alpha = 1; + ValueTypeT beta = 0; + raft::sparse::linalg::spmm(handle, true, false, &alpha, A_, X, &beta, Z); + } + + private: + raft::device_csr_matrix_view A_; + int m_; + int n_; +}; + +} // namespace raft::sparse::solver::detail diff --git a/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh b/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh new file mode 100644 index 0000000000..25b202f1eb --- /dev/null +++ b/cpp/include/raft/sparse/solver/detail/randomized_svds.cuh @@ -0,0 +1,256 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace raft::sparse::solver::detail { + +/** + * @brief Randomized SVD for sparse matrices using block power iteration with CholeskyQR2. + * + * Implements randomized SVD (Halko et al. 2009) with GPU-optimized CholeskyQR2 + * orthogonalization (Tomás et al. 2024). + * + * The operator interface allows implicit operators (e.g. mean-centered sparse matrices) + * without materializing the dense matrix. + * + * @tparam ValueTypeT Data type (float or double) + * @tparam OperatorT Linear operator type providing apply() and apply_transpose() + * + * @param[in] handle raft resources handle + * @param[in] config SVD configuration (n_components, n_oversamples, n_power_iters, seed) + * @param[in] op linear operator representing the matrix to decompose + * @param[out] singular_values output singular values of shape (k,) in descending order + * @param[out] U optional output left singular vectors of shape (m, k), col-major + * @param[out] Vt optional output right singular vectors of shape (k, n), col-major + */ +template +void sparse_randomized_svd( + raft::resources const& handle, + sparse_svd_config const& config, + OperatorT const& op, + raft::device_vector_view singular_values, + std::optional> U, + std::optional> Vt) +{ + common::nvtx::range fun_scope( + "raft::sparse::solver::sparse_randomized_svd(%d, %d, %d)", + op.rows(), + op.cols(), + config.n_components); + + int m = op.rows(); + int n = op.cols(); + int k = config.n_components; + int p = config.n_oversamples; + + RAFT_EXPECTS(k > 0, "n_components must be positive"); + RAFT_EXPECTS(k < std::min(m, n), "n_components must be less than min(m, n)"); + RAFT_EXPECTS(p >= 0, "n_oversamples must be non-negative"); + RAFT_EXPECTS(config.n_power_iters >= 0, 
"n_power_iters must be non-negative"); + RAFT_EXPECTS(singular_values.extent(0) == static_cast(k), + "singular_values must have size n_components"); + if (U) { + RAFT_EXPECTS( + U->extent(0) == static_cast(m) && U->extent(1) == static_cast(k), + "U must have shape (m, n_components)"); + } + if (Vt) { + RAFT_EXPECTS( + Vt->extent(0) == static_cast(k) && Vt->extent(1) == static_cast(n), + "Vt must have shape (n_components, n)"); + } + + auto stream = raft::resource::get_cuda_stream(handle); + + int min_dim = std::min(m, n); + if (k + p > min_dim) { + RAFT_LOG_WARN( + "n_components (%d) + n_oversamples (%d) = %d exceeds min(n_rows, n_cols) = %d. " + "Clamping to %d. This may affect approximation quality.", + k, + p, + k + p, + min_dim, + min_dim); + } + int block_size = std::min(k + p, min_dim); + RAFT_EXPECTS(block_size >= k, + "block_size (n_components + n_oversamples) must be >= n_components"); + + // Initialize RNG + uint64_t seed = config.seed.value_or(std::random_device{}()); + raft::random::RngState rng_state(seed); + + // Step 1-3: Y = A @ Omega, orthogonalize + auto Y = raft::make_device_matrix( + handle, static_cast(m), static_cast(block_size)); + { + auto Omega = raft::make_device_matrix( + handle, static_cast(n), static_cast(block_size)); + raft::random::normal(handle, + rng_state, + Omega.data_handle(), + static_cast(n) * block_size, + ValueTypeT(0), + ValueTypeT(1)); + op.apply(handle, + raft::make_device_matrix_view( + Omega.data_handle(), n, block_size), + Y.view()); + } // Omega freed here + if (!cholesky_qr2(handle, Y.view())) { + RAFT_LOG_WARN("CholeskyQR2 fell back to standard QR during initial orthogonalization"); + } + + // Step 4: Power iterations + auto Z = raft::make_device_matrix( + handle, static_cast(n), static_cast(block_size)); + + for (int iter = 0; iter < config.n_power_iters; ++iter) { + // Z = A^T @ Q -> (n, block_size) + op.apply_transpose(handle, + raft::make_device_matrix_view( + Y.data_handle(), m, block_size), + Z.view()); + if 
(!cholesky_qr2(handle, Z.view())) { + RAFT_LOG_WARN( + "CholeskyQR2 fell back to standard QR during power iteration %d (transpose step)", iter); + } + + // Y = A @ Z -> (m, block_size) + op.apply(handle, + raft::make_device_matrix_view( + Z.data_handle(), n, block_size), + Y.view()); + if (!cholesky_qr2(handle, Y.view())) { + RAFT_LOG_WARN("CholeskyQR2 fell back to standard QR during power iteration %d (forward step)", + iter); + } + } + + // Q = Y after power iterations (already orthogonal) + // Q is (m, block_size) + + // Step 5: Bt = A^T @ Q -> (n, block_size) + op.apply_transpose(handle, + raft::make_device_matrix_view( + Y.data_handle(), m, block_size), + Z.view()); + + // Step 6-7: SVD of B = Bt^T where Bt = Z (n x block_size, tall matrix) + // We compute SVD(Bt) directly to avoid cuSOLVER gesvd issues with wide matrices. + // SVD(Bt) = U_bt * S * Vt_bt → SVD(B) has U_b = V_bt and Vt_b = U_bt^T + auto S_full = raft::make_device_vector(handle, block_size); + + // Bt = B^T is (n x block_size), we already have Bt = Z from step 5 + // Z is (n, block_size) = A^T @ Q = B^T. We can reuse Z directly! + // SVD(Bt) with Bt being (n x block_size): jobu='S' gives U_bt (n x block_size), + // jobvt='A' gives Vt_bt (block_size x block_size). Both are needed when either + // U or Vt is requested by the caller; we only allocate the ones we will read. + std::optional> U_bt; + std::optional> Vt_bt; + if (Vt) { + U_bt.emplace(raft::make_device_matrix( + handle, static_cast(n), static_cast(block_size))); + } + if (U) { + Vt_bt.emplace(raft::make_device_matrix( + handle, static_cast(block_size), static_cast(block_size))); + } + + // Z is consumed by svdQR (modifies input in-place) — this is fine since Z is not used after. + // svdQR requires non-null pointers for left/right vectors when gen_*_vec is true; pass + // a scratch matrix when the caller didn't request that side. + std::optional> U_bt_scratch; + std::optional> Vt_bt_scratch; + ValueTypeT* U_bt_ptr = U_bt ? 
U_bt->data_handle() : nullptr; + ValueTypeT* Vt_bt_ptr = Vt_bt ? Vt_bt->data_handle() : nullptr; + if (U_bt_ptr == nullptr) { + U_bt_scratch.emplace(raft::make_device_matrix( + handle, static_cast(n), static_cast(block_size))); + U_bt_ptr = U_bt_scratch->data_handle(); + } + if (Vt_bt_ptr == nullptr) { + Vt_bt_scratch.emplace(raft::make_device_matrix( + handle, static_cast(block_size), static_cast(block_size))); + Vt_bt_ptr = Vt_bt_scratch->data_handle(); + } + + raft::linalg::svdQR(handle, + Z.data_handle(), + n, + block_size, + S_full.data_handle(), + U_bt_ptr, + Vt_bt_ptr, + true, // transpose right vectors: Vt_bt -> V_bt (block_size x block_size) + true, // generate left vectors + true, // generate right vectors + stream); + // After svdQR with trans_right=true: + // U_bt is (n, block_size) — left singular vectors of Bt + // Vt_bt is now V_bt (block_size x block_size) — right singular vectors of Bt (transposed) + // S_full has block_size singular values + // + // For B = Bt^T: U_b = V_bt = Vt_bt (after transpose), Vt_b = U_bt^T + // So: U_b[:, :k] = Vt_bt[:, :k] and Vt_b[:k, :] = U_bt[:, :k]^T + + // Step 8: U = Q @ U_b[:, :k] = Q @ V_bt[:, :k] + // Q is Y (m, block_size), V_bt is (block_size, block_size) + // U = Y @ V_bt[:, :k] → (m, block_size) * (block_size, k) → (m, k) + if (U) { + const ValueTypeT one = 1; + const ValueTypeT zero = 0; + raft::linalg::gemm(handle, + Y.data_handle(), + m, + block_size, + Vt_bt->data_handle(), // This is V_bt after trans_right=true + U->data_handle(), + m, + k, + CUBLAS_OP_N, + CUBLAS_OP_N, + one, + zero, + stream); + } + + // Step 9: Truncate S, optionally compute Vt + raft::copy(singular_values.data_handle(), S_full.data_handle(), k, stream); + + // Vt[:k, :] = U_bt[:, :k]^T + // U_bt is col-major (n, block_size); transpose its first k columns to (k, n) col-major. 
+ if (Vt) { raft::linalg::transpose(handle, U_bt->data_handle(), Vt->data_handle(), n, k, stream); } + + // Step 10: Sign correction (no-op when neither U nor Vt is requested) + svd_sign_correction(handle, U, Vt); +} + +} // namespace raft::sparse::solver::detail diff --git a/cpp/include/raft/sparse/solver/detail/svds_sign_correction.cuh b/cpp/include/raft/sparse/solver/detail/svds_sign_correction.cuh new file mode 100644 index 0000000000..e35889d91d --- /dev/null +++ b/cpp/include/raft/sparse/solver/detail/svds_sign_correction.cuh @@ -0,0 +1,155 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace raft::sparse::solver::detail { + +/** + * @brief CUDA kernel for SVD sign correction. + * + * For each component i, finds the element with largest absolute value in either + * column i of U (if U is not null) or row i of Vt. If that element is negative, + * flips the sign of U[:, i] (when present) and Vt[i, :] (when present). + * This ensures deterministic output regardless of the SVD algorithm's internal sign choices. + * + * One thread block per component. Uses shared memory reduction to find the argmax. + * + * @param U left singular vectors, col-major (m x k), U[:, i] is at U + i*m. May be nullptr. + * @param Vt right singular vectors, col-major (k x n), Vt[i, :] is row i. May be nullptr. + * @param m number of rows of U (ignored if U is nullptr) + * @param n number of columns of Vt (ignored if Vt is nullptr) + * @param k number of components + * + * @note At least one of U / Vt must be non-null. When U is non-null it is used to + * derive the signs; otherwise Vt is used. 
+ */ +template +RAFT_KERNEL svd_sign_correction_kernel(ValueTypeT* U, ValueTypeT* Vt, int m, int n, int k) +{ + int comp = blockIdx.x; // one block per component + if (comp >= k) return; + + extern __shared__ char smem[]; + auto* s_max_val = reinterpret_cast(smem); + auto* s_max_idx = reinterpret_cast(smem + blockDim.x * sizeof(ValueTypeT)); + + ValueTypeT local_max_val = 0; + int local_max_idx = 0; + + // Source for sign decision: prefer U column; fall back to Vt row + if (U != nullptr) { + ValueTypeT* u_col = U + static_cast(comp) * m; + for (int row = threadIdx.x; row < m; row += blockDim.x) { + ValueTypeT abs_val = raft::abs(u_col[row]); + if (abs_val > local_max_val) { + local_max_val = abs_val; + local_max_idx = row; + } + } + } else { + // Vt is col-major (k x n); row `comp` is at Vt[comp + j*k] for j=0..n-1 + for (int col = threadIdx.x; col < n; col += blockDim.x) { + ValueTypeT abs_val = raft::abs(Vt[comp + static_cast(col) * k]); + if (abs_val > local_max_val) { + local_max_val = abs_val; + local_max_idx = col; + } + } + } + + s_max_val[threadIdx.x] = local_max_val; + s_max_idx[threadIdx.x] = local_max_idx; + __syncthreads(); + + // Tree reduction + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + if (s_max_val[threadIdx.x + stride] > s_max_val[threadIdx.x]) { + s_max_val[threadIdx.x] = s_max_val[threadIdx.x + stride]; + s_max_idx[threadIdx.x] = s_max_idx[threadIdx.x + stride]; + } + } + __syncthreads(); + } + + // Thread 0 has the result + ValueTypeT signed_val; + if (U != nullptr) { + signed_val = U[static_cast(comp) * m + s_max_idx[0]]; + } else { + signed_val = Vt[comp + static_cast(s_max_idx[0]) * k]; + } + bool flip = (s_max_val[0] > 0) && (signed_val < 0); + __syncthreads(); + + if (!flip) return; + + // Flip U column if present + if (U != nullptr) { + ValueTypeT* u_col = U + static_cast(comp) * m; + for (int row = threadIdx.x; row < m; row += blockDim.x) { + u_col[row] = -u_col[row]; + } + } + + // Flip 
Vt row if present: row `comp` is at Vt[comp + j*k] for j=0..n-1 + if (Vt != nullptr) { + for (int col = threadIdx.x; col < n; col += blockDim.x) { + Vt[comp + static_cast(col) * k] = -Vt[comp + static_cast(col) * k]; + } + } +} + +/** + * @brief Apply deterministic sign correction to SVD output. + * + * For each component, ensures the element with largest absolute value in U + * (or Vt, if U is not present) is positive. Whichever of U / Vt is present is + * modified in-place to maintain A ≈ U @ diag(S) @ Vt. + * + * @param handle raft resources handle + * @param U optional left singular vectors of shape (m, k), col-major, modified in-place + * @param Vt optional right singular vectors of shape (k, n), col-major, modified in-place + * + * @note If both U and Vt are absent, this is a no-op. When only Vt is present, the + * sign convention differs from the both-present case (signs are derived from + * `argmax(|Vt[j, :]|)` rather than `argmax(|U[:, j]|)`), but is still + * deterministic for fixed inputs. + */ +template +void svd_sign_correction( + raft::resources const& handle, + std::optional> U, + std::optional> Vt) +{ + if (!U && !Vt) return; + + int k = U ? static_cast(U->extent(1)) : static_cast(Vt->extent(0)); + int m = U ? static_cast(U->extent(0)) : 0; + int n = Vt ? static_cast(Vt->extent(1)) : 0; + + auto stream = raft::resource::get_cuda_stream(handle); + + // threads_per_block must be a power of 2 for the tree reduction in the kernel + constexpr int threads_per_block = 256; + int smem_size = threads_per_block * (sizeof(ValueTypeT) + sizeof(int)); + + ValueTypeT* U_ptr = U ? U->data_handle() : nullptr; + ValueTypeT* Vt_ptr = Vt ? 
Vt->data_handle() : nullptr; + + svd_sign_correction_kernel<<>>(U_ptr, Vt_ptr, m, n, k); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace raft::sparse::solver::detail diff --git a/cpp/include/raft/sparse/solver/randomized_svds.cuh b/cpp/include/raft/sparse/solver/randomized_svds.cuh new file mode 100644 index 0000000000..10acf924c3 --- /dev/null +++ b/cpp/include/raft/sparse/solver/randomized_svds.cuh @@ -0,0 +1,117 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace raft::sparse::solver { + +namespace detail { +// Helper alias used to put the optional U/Vt parameters in a non-deduced context +// so template argument deduction picks up ValueTypeT from earlier parameters and +// implicit conversion from `device_matrix_view` to `std::optional<...>` works. +template +using nondeduced_optional_matrix_view_t = std::optional>; +} // namespace detail + +/** + * @defgroup sparse_randomized_svd Sparse Randomized SVD + * @{ + */ + +/** + * @brief Compute truncated SVD using randomized algorithm with a generic linear operator. + * + * Implements randomized SVD (Halko et al. 2009) with CholeskyQR2 orthogonalization + * (Tomás et al. 2024) for efficient GPU execution on sparse matrices. + * + * The operator interface allows implicit linear operators (e.g. mean-centered sparse + * matrices for PCA) without materializing the dense matrix. `OperatorT` must expose + * the minimal interface used by the randomized SVD inner loop: + * - `int rows() const` / `int cols() const` + * - `void apply(handle, X, Y) const` computes `Y = A @ X` + * - `void apply_transpose(handle, X, Z) const` computes `Z = A^T @ X` + * + * where `X`, `Y`, `Z` are `raft::device_matrix_view<..., uint32_t, raft::col_major>`. 
+ * This is intentionally a narrow, SVD-specific operator: only the two matrix products + * above are required, and no other operations (addition, scaling, inverse, eigensolves, + * etc.) are assumed or supported — it is not a general-purpose linear operator like + * `scipy.sparse.linalg.LinearOperator`. + * + * @tparam ValueTypeT Data type (float or double) + * @tparam OperatorT Linear operator type satisfying the interface above + * + * @param[in] handle raft resources handle + * @param[in] config SVD configuration parameters + * @param[in] op linear operator representing the matrix to decompose + * @param[out] singular_values output singular values of shape (n_components,) in descending order + * @param[out] U optional output left singular vectors of shape (m, n_components), col-major. + * Pass `std::nullopt` to skip computing U. + * @param[out] Vt optional output right singular vectors of shape (n_components, n), col-major. + * Pass `std::nullopt` to skip computing Vt. + * + * @note References: + * [1] Halko, Martinsson, Tropp (2009) "Finding structure with randomness" + * https://arxiv.org/abs/0909.4061 + * [2] Tomás, Quintana-Ortí, Anzt (2024) "Fast Truncated SVD of Sparse and Dense Matrices + * on Graphics Processors" https://arxiv.org/abs/2403.06218 + */ +template +void sparse_randomized_svd( + raft::resources const& handle, + sparse_svd_config const& config, + OperatorT const& op, + raft::device_vector_view singular_values, + detail::nondeduced_optional_matrix_view_t< + raft::device_matrix_view> U = std::nullopt, + detail::nondeduced_optional_matrix_view_t< + raft::device_matrix_view> Vt = std::nullopt) +{ + detail::sparse_randomized_svd(handle, config, op, singular_values, U, Vt); +} + +/** + * @brief Compute truncated SVD of a sparse CSR matrix using randomized algorithm. + * + * Convenience overload that accepts a CSR matrix view directly. 
+ * + * @tparam ValueTypeT Data type (float or double) + * @tparam NNZTypeT Type for number of non-zeros + * + * @param[in] handle raft resources handle + * @param[in] config SVD configuration parameters + * @param[in] A input sparse CSR matrix of shape (m, n) + * @param[out] singular_values output singular values of shape (n_components,) in descending order + * @param[out] U optional output left singular vectors of shape (m, n_components), col-major. + * Pass `std::nullopt` to skip computing U. + * @param[out] Vt optional output right singular vectors of shape (n_components, n), col-major. + * Pass `std::nullopt` to skip computing Vt. + */ +template +void sparse_randomized_svd( + raft::resources const& handle, + sparse_svd_config const& config, + raft::device_csr_matrix_view A, + raft::device_vector_view singular_values, + detail::nondeduced_optional_matrix_view_t< + raft::device_matrix_view> U = std::nullopt, + detail::nondeduced_optional_matrix_view_t< + raft::device_matrix_view> Vt = std::nullopt) +{ + detail::csr_linear_operator op(A); + detail::sparse_randomized_svd(handle, config, op, singular_values, U, Vt); +} + +/** @} */ + +} // namespace raft::sparse::solver diff --git a/cpp/include/raft/sparse/solver/svds_config.hpp b/cpp/include/raft/sparse/solver/svds_config.hpp new file mode 100644 index 0000000000..32077b97cd --- /dev/null +++ b/cpp/include/raft/sparse/solver/svds_config.hpp @@ -0,0 +1,35 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +namespace raft::sparse::solver { + +/** + * @brief Configuration parameters for the sparse randomized SVD solver + * + * @tparam ValueTypeT Data type for values (float or double) + */ +template +struct sparse_svd_config { + /** @brief Number of singular values/vectors to compute. Must be set by the user. */ + int n_components = 0; + + /** @brief Number of extra random vectors for better approximation. 
+ * Total subspace dimension is n_components + n_oversamples. */ + int n_oversamples = 10; + + /** @brief Number of power iteration passes. More iterations improve accuracy + * for matrices with slowly decaying singular values. */ + int n_power_iters = 2; + + /** @brief Random seed for reproducibility */ + std::optional seed = std::nullopt; +}; + +} // namespace raft::sparse::solver diff --git a/cpp/include/raft_runtime/solver/randomized_svds.hpp b/cpp/include/raft_runtime/solver/randomized_svds.hpp new file mode 100644 index 0000000000..afb23035e7 --- /dev/null +++ b/cpp/include/raft_runtime/solver/randomized_svds.hpp @@ -0,0 +1,43 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace raft::runtime::solver { + +/** + * @defgroup sparse_randomized_svd_runtime Sparse Randomized SVD Runtime API + * @{ + */ + +#define FUNC_DECL(ValueType) \ + void sparse_randomized_svd( \ + const raft::resources& handle, \ + const raft::sparse::solver::sparse_svd_config& config, \ + raft::device_vector_view indptr, \ + raft::device_vector_view indices, \ + raft::device_vector_view data, \ + int n_rows, \ + int n_cols, \ + int nnz, \ + raft::device_vector_view singular_values, \ + std::optional> U, \ + std::optional> Vt) + +FUNC_DECL(float); +FUNC_DECL(double); + +#undef FUNC_DECL + +/** @} */ + +} // namespace raft::runtime::solver diff --git a/cpp/src/raft_runtime/solver/randomized_svds.cuh b/cpp/src/raft_runtime/solver/randomized_svds.cuh new file mode 100644 index 0000000000..61e60fe5ec --- /dev/null +++ b/cpp/src/raft_runtime/solver/randomized_svds.cuh @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include + +#define FUNC_DEF(ValueType) \ + void sparse_randomized_svd( \ + const raft::resources& handle, \ + const raft::sparse::solver::sparse_svd_config& config, \ + raft::device_vector_view indptr, \ + raft::device_vector_view indices, \ + raft::device_vector_view data, \ + int n_rows, \ + int n_cols, \ + int nnz, \ + raft::device_vector_view singular_values, \ + std::optional> U, \ + std::optional> Vt) \ + { \ + auto csr_structure = raft::make_device_compressed_structure_view( \ + indptr.data_handle(), indices.data_handle(), n_rows, n_cols, nnz); \ + auto csr_matrix = raft::make_device_csr_matrix_view( \ + data.data_handle(), csr_structure); \ + raft::sparse::solver::sparse_randomized_svd( \ + handle, config, csr_matrix, singular_values, U, Vt); \ + } diff --git a/cpp/src/raft_runtime/solver/randomized_svds_double.cu b/cpp/src/raft_runtime/solver/randomized_svds_double.cu new file mode 100644 index 0000000000..a6b3984e96 --- /dev/null +++ b/cpp/src/raft_runtime/solver/randomized_svds_double.cu @@ -0,0 +1,12 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "randomized_svds.cuh" + +namespace raft::runtime::solver { + +FUNC_DEF(double); + +} // namespace raft::runtime::solver diff --git a/cpp/src/raft_runtime/solver/randomized_svds_float.cu b/cpp/src/raft_runtime/solver/randomized_svds_float.cu new file mode 100644 index 0000000000..8364c27f0e --- /dev/null +++ b/cpp/src/raft_runtime/solver/randomized_svds_float.cu @@ -0,0 +1,12 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "randomized_svds.cuh" + +namespace raft::runtime::solver { + +FUNC_DEF(float); + +} // namespace raft::runtime::solver diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 47ac6fc286..41b6e7ee3c 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -242,7 +242,7 @@ if(BUILD_TESTS) ConfigureTest( NAME SOLVERS_TEST PATH linalg/eigen_solvers.cu lap/lap.cu sparse/mst.cu - sparse/solver/lanczos.cu LIB EXPLICIT_INSTANTIATE_ONLY + sparse/solver/lanczos.cu sparse/solver/randomized_svds.cu LIB EXPLICIT_INSTANTIATE_ONLY ) ConfigureTest( diff --git a/cpp/tests/sparse/solver/randomized_svds.cu b/cpp/tests/sparse/solver/randomized_svds.cu new file mode 100644 index 0000000000..978f836eac --- /dev/null +++ b/cpp/tests/sparse/solver/randomized_svds.cu @@ -0,0 +1,679 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "../../test_utils.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace raft::sparse::solver { + +// ============================================================================ +// Golden data: 20x15 sparse matrix (nnz=120), generated with cupy seed=42 +// Expected singular values computed via cupy.linalg.svd +// ============================================================================ + +template +std::vector golden_indptr() +{ + return {0, 4, 12, 17, 21, 27, 35, 41, 48, 55, 61, 67, 74, 79, 84, 90, 97, 102, 110, 114, 120}; +} + +template +std::vector golden_indices() +{ + return {0, 4, 11, 14, 0, 2, 4, 5, 6, 12, 13, 14, 1, 6, 7, 8, 10, 0, 1, 6, + 11, 0, 3, 4, 5, 13, 14, 0, 1, 2, 3, 8, 10, 12, 14, 1, 4, 8, 9, 11, + 12, 0, 3, 4, 7, 10, 11, 13, 0, 2, 4, 8, 9, 11, 13, 0, 4, 8, 9, 10, + 11, 3, 4, 5, 12, 13, 14, 0, 1, 2, 5, 7, 10, 13, 0, 2, 7, 13, 14, 3, + 8, 10, 12, 
14, 0, 3, 4, 5, 9, 11, 0, 3, 7, 9, 11, 12, 14, 0, 4, 9, + 10, 12, 0, 1, 5, 6, 8, 9, 10, 13, 4, 6, 9, 11, 2, 4, 6, 7, 8, 14}; +} + +template +std::vector golden_values() +{ + return {0.1887315, 0.1368716, 0.1613618, 0.4839568, 0.4151455, 0.9012728, 0.7806592, 0.6713938, + 0.4281978, 0.5782541, 0.4538292, 0.0404218, 0.1264595, 0.3271811, 0.0725956, 0.0031886, + 0.6529871, 0.7025284, 0.1509823, 0.2051058, 0.6743041, 0.7965932, 0.5618279, 0.3343601, + 0.8330396, 0.6796557, 0.5684967, 0.2353606, 0.8159586, 0.3035201, 0.7145222, 0.7080131, + 0.4448774, 0.8879446, 0.2896976, 0.5502138, 0.3758983, 0.6988429, 0.6113330, 0.3722922, + 0.4468638, 0.8808504, 0.0408660, 0.6595733, 0.5429990, 0.7350115, 0.3736090, 0.9308268, + 0.8861471, 0.3089533, 0.3642297, 0.4673888, 0.8658971, 0.8499948, 0.6036414, 0.0184879, + 0.6970348, 0.1920955, 0.0650081, 0.7614104, 0.1568318, 0.3703384, 0.5869733, 0.1282817, + 0.2451350, 0.9711478, 0.5114062, 0.5030190, 0.5759066, 0.8097631, 0.3580396, 0.4049093, + 0.1618905, 0.2125666, 0.1163929, 0.4818056, 0.7425613, 0.7851025, 0.7386931, 0.7113866, + 0.5566008, 0.5112362, 0.5538779, 0.0846058, 0.3478690, 0.2713018, 0.5058042, 0.4784714, + 0.6136972, 0.5514663, 0.6401569, 0.1399612, 0.3880760, 0.6163719, 0.5653080, 0.4741685, + 0.3447469, 0.8718122, 0.8952977, 0.7043414, 0.7425023, 0.4490941, 0.3434426, 0.4157445, + 0.2318376, 0.6219735, 0.0641180, 0.8132253, 0.7734252, 0.0820893, 0.1556361, 0.1998085, + 0.2101485, 0.0405817, 0.2176879, 0.9811354, 0.2048657, 0.4163201, 0.3513670, 0.2649395}; +} + +template +std::vector golden_singular_values() +{ + return {3.819689, 2.097294, 1.926829}; +} + +// ============================================================================ +// Test: randomized SVD on golden data +// ============================================================================ + +template +class RandomizedSvdsTest : public ::testing::Test { + public: + RandomizedSvdsTest() + : stream(resource::get_cuda_stream(handle)), + m(20), + n(15), + 
k(3), + nnz(120), + d_indptr(raft::make_device_vector(handle, m + 1)), + d_indices(raft::make_device_vector(handle, nnz)), + d_values(raft::make_device_vector(handle, nnz)), + expected_S(raft::make_device_vector(handle, k)) + { + } + + protected: + void SetUp() override + { + auto h_indptr = golden_indptr(); + auto h_indices = golden_indices(); + auto h_values = golden_values(); + auto h_S = golden_singular_values(); + + raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream); + raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream); + raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream); + raft::update_device(expected_S.data_handle(), h_S.data(), k, stream); + } + + void Run() + { + auto csr_structure = + raft::make_device_compressed_structure_view( + d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz); + auto csr_matrix = + raft::make_device_csr_matrix_view( + d_values.data_handle(), csr_structure); + + sparse_svd_config config; + config.n_components = k; + config.n_oversamples = 10; + config.n_power_iters = 4; + config.seed = 42; + + auto S = raft::make_device_vector(handle, k); + auto U = raft::make_device_matrix(handle, m, k); + auto Vt = raft::make_device_matrix(handle, k, n); + + sparse_randomized_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view()); + + // Singular values must match golden ground truth + ASSERT_TRUE(raft::devArrMatch( + S.data_handle(), expected_S.data_handle(), k, raft::CompareApprox(0.05), stream)); + + // U must be orthogonal: U^T U ~ I_k + auto UtU = raft::make_device_matrix(handle, k, k); + ValueType one = 1, zero = 0; + raft::linalg::gemm(handle, + U.data_handle(), + m, + k, + U.data_handle(), + UtU.data_handle(), + k, + k, + CUBLAS_OP_T, + CUBLAS_OP_N, + one, + zero, + stream); + + std::vector I_k(k * k, 0); + for (int i = 0; i < k; i++) + I_k[i * k + i] = 1; + auto I_k_dev = raft::make_device_matrix(handle, k, k); + 
raft::update_device(I_k_dev.data_handle(), I_k.data(), k * k, stream); + + ASSERT_TRUE(raft::devArrMatch(UtU.data_handle(), + I_k_dev.data_handle(), + k * k, + raft::CompareApprox(1e-4), + stream)); + + // Vt must have orthonormal rows: Vt Vt^T ~ I_k + auto VVt = raft::make_device_matrix(handle, k, k); + raft::linalg::gemm(handle, + Vt.data_handle(), + k, + n, + Vt.data_handle(), + VVt.data_handle(), + k, + k, + CUBLAS_OP_N, + CUBLAS_OP_T, + one, + zero, + stream); + + ASSERT_TRUE(raft::devArrMatch(VVt.data_handle(), + I_k_dev.data_handle(), + k * k, + raft::CompareApprox(1e-4), + stream)); + } + + raft::resources handle; + cudaStream_t stream; + int m, n, k, nnz; + raft::device_vector d_indptr; + raft::device_vector d_indices; + raft::device_vector d_values; + raft::device_vector expected_S; +}; + +using RandomizedSvdsTestF = RandomizedSvdsTest; +TEST_F(RandomizedSvdsTestF, GoldenData) { Run(); } + +using RandomizedSvdsTestD = RandomizedSvdsTest; +TEST_F(RandomizedSvdsTestD, GoldenData) { Run(); } + +// ============================================================================ +// Test: optional U / Vt outputs (skip flag) +// +// Verifies that passing std::nullopt for U or Vt (or both) skips that output: +// - singular values match the all-outputs reference exactly +// - when U is requested, it equals the reference U (sign-fixing uses U) +// - when Vt is requested without U, signs are derived from Vt — values are +// deterministic but may differ from the U+Vt reference; we check the +// row-norms (each Vt row should remain unit-norm) +// ============================================================================ + +template +class OptionalUVtTest : public ::testing::Test { + public: + OptionalUVtTest() + : stream(resource::get_cuda_stream(handle)), + m(20), + n(15), + k(3), + nnz(120), + d_indptr(raft::make_device_vector(handle, m + 1)), + d_indices(raft::make_device_vector(handle, nnz)), + d_values(raft::make_device_vector(handle, nnz)) + { + auto h_indptr = 
golden_indptr(); + auto h_indices = golden_indices(); + auto h_values = golden_values(); + raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream); + raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream); + raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream); + } + + template + void run_one(UViewT U_opt, VtViewT Vt_opt, ValueType* S_out) + { + auto csr_structure = + raft::make_device_compressed_structure_view( + d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz); + auto csr_matrix = + raft::make_device_csr_matrix_view( + d_values.data_handle(), csr_structure); + + sparse_svd_config config; + config.n_components = k; + config.n_oversamples = 10; + config.n_power_iters = 4; + config.seed = 42; + + auto S = raft::make_device_vector(handle, k); + sparse_randomized_svd(handle, config, csr_matrix, S.view(), U_opt, Vt_opt); + raft::update_host(S_out, S.data_handle(), k, stream); + resource::sync_stream(handle); + } + + void Run() + { + using mat_view = raft::device_matrix_view; + + // Reference: ask for everything + auto U_ref = raft::make_device_matrix(handle, m, k); + auto Vt_ref = raft::make_device_matrix(handle, k, n); + std::vector S_ref(k); + run_one, std::optional>( + U_ref.view(), Vt_ref.view(), S_ref.data()); + + // Mode: S only — both nullopt + { + std::vector S_only(k); + run_one, std::optional>( + std::nullopt, std::nullopt, S_only.data()); + for (int i = 0; i < k; i++) { + ASSERT_NEAR(S_only[i], S_ref[i], 1e-4) << "S mismatch at i=" << i << " (S-only)"; + } + } + + // Mode: U + S + { + auto U_only = raft::make_device_matrix(handle, m, k); + std::vector S_u(k); + run_one, std::optional>( + U_only.view(), std::nullopt, S_u.data()); + for (int i = 0; i < k; i++) { + ASSERT_NEAR(S_u[i], S_ref[i], 1e-4) << "S mismatch at i=" << i << " (U-only)"; + } + // U is sign-corrected from U → must match reference U exactly + ASSERT_TRUE(raft::devArrMatch(U_only.data_handle(), + U_ref.data_handle(), + m 
* k, + raft::CompareApprox(1e-5), + stream)) + << "U should match reference (same sign convention)"; + } + + // Mode: Vt + S + { + auto Vt_only = raft::make_device_matrix(handle, k, n); + std::vector S_v(k); + run_one, std::optional>( + std::nullopt, Vt_only.view(), S_v.data()); + for (int i = 0; i < k; i++) { + ASSERT_NEAR(S_v[i], S_ref[i], 1e-4) << "S mismatch at i=" << i << " (Vt-only)"; + } + // In Vt-only mode, signs come from Vt instead of U; rows may flip relative to + // reference Vt. Verify row-orthonormality is preserved. + auto VVt = raft::make_device_matrix(handle, k, k); + ValueType one = 1, zero = 0; + raft::linalg::gemm(handle, + Vt_only.data_handle(), + k, + n, + Vt_only.data_handle(), + VVt.data_handle(), + k, + k, + CUBLAS_OP_N, + CUBLAS_OP_T, + one, + zero, + stream); + std::vector I_k(k * k, 0); + for (int i = 0; i < k; i++) + I_k[i * k + i] = 1; + auto I_k_dev = raft::make_device_matrix(handle, k, k); + raft::update_device(I_k_dev.data_handle(), I_k.data(), k * k, stream); + ASSERT_TRUE(raft::devArrMatch(VVt.data_handle(), + I_k_dev.data_handle(), + k * k, + raft::CompareApprox(1e-4), + stream)) + << "Vt rows must remain orthonormal in Vt-only mode"; + } + + // Determinism: same mode + same seed → identical output across runs + { + auto Vt_a = raft::make_device_matrix(handle, k, n); + auto Vt_b = raft::make_device_matrix(handle, k, n); + std::vector S_a(k), S_b(k); + run_one, std::optional>( + std::nullopt, Vt_a.view(), S_a.data()); + run_one, std::optional>( + std::nullopt, Vt_b.view(), S_b.data()); + ASSERT_TRUE(raft::devArrMatch(Vt_a.data_handle(), + Vt_b.data_handle(), + k * n, + raft::CompareApprox(1e-5), + stream)) + << "Vt-only mode must be deterministic"; + } + } + + raft::resources handle; + cudaStream_t stream; + int m, n, k, nnz; + raft::device_vector d_indptr; + raft::device_vector d_indices; + raft::device_vector d_values; +}; + +using OptionalUVtTestF = OptionalUVtTest; +TEST_F(OptionalUVtTestF, AllModes) { Run(); } + +using 
OptionalUVtTestD = OptionalUVtTest; +TEST_F(OptionalUVtTestD, AllModes) { Run(); } + +// ============================================================================ +// Test: reconstruction error ||A - U diag(S) Vt||_F +// ============================================================================ + +struct ReconstructionErrorTest : public ::testing::Test { + raft::resources handle; + cudaStream_t stream; + ReconstructionErrorTest() : stream(resource::get_cuda_stream(handle)) {} + + void Run() + { + using ValueType = float; + int m = 20, n = 15, k = 3, nnz = 120; + + auto h_indptr = golden_indptr(); + auto h_indices = golden_indices(); + auto h_values = golden_values(); + + auto d_indptr = raft::make_device_vector(handle, m + 1); + auto d_indices = raft::make_device_vector(handle, nnz); + auto d_values = raft::make_device_vector(handle, nnz); + raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream); + raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream); + raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream); + + auto csr_structure = raft::make_device_compressed_structure_view( + d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz); + auto csr_matrix = raft::make_device_csr_matrix_view( + d_values.data_handle(), csr_structure); + + sparse_svd_config config; + config.n_components = k; + config.n_oversamples = 10; + config.n_power_iters = 4; + config.seed = 42; + + auto S = raft::make_device_vector(handle, k); + auto U = raft::make_device_matrix(handle, m, k); + auto Vt = raft::make_device_matrix(handle, k, n); + sparse_randomized_svd(handle, config, csr_matrix, S.view(), U.view(), Vt.view()); + + // Reconstruct: recon = U @ diag(S) @ Vt + // First: US = U * S (scale columns of U by S) + auto US = raft::make_device_matrix(handle, m, k); + raft::copy(US.data_handle(), U.data_handle(), m * k, stream); + std::vector h_S(k); + raft::update_host(h_S.data(), S.data_handle(), k, stream); + 
resource::sync_stream(handle); + for (int j = 0; j < k; j++) { + auto cublas_h = raft::resource::get_cublas_handle(handle); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasscal( + cublas_h, m, &h_S[j], US.data_handle() + j * m, 1, stream)); + } + + // recon = US @ Vt + auto recon = raft::make_device_matrix(handle, m, n); + ValueType one = 1, zero = 0; + raft::linalg::gemm(handle, + US.data_handle(), + m, + k, + Vt.data_handle(), + recon.data_handle(), + m, + n, + CUBLAS_OP_N, + CUBLAS_OP_N, + one, + zero, + stream); + + // Build dense A on host + std::vector h_dense(m * n, 0); + int vi = 0; + for (int i = 0; i < m; i++) + for (int jj = h_indptr[i]; jj < h_indptr[i + 1]; jj++) + h_dense[h_indices[jj] * m + i] = h_values[vi++]; + + auto dense_A = raft::make_device_matrix(handle, m, n); + raft::update_device(dense_A.data_handle(), h_dense.data(), m * n, stream); + + // error = A - recon + // Compute ||A - recon||_F / ||A||_F on host + std::vector h_recon(m * n); + raft::update_host(h_recon.data(), recon.data_handle(), m * n, stream); + resource::sync_stream(handle); + + double err_sq = 0, norm_sq = 0; + for (int i = 0; i < m * n; i++) { + double diff = h_dense[i] - h_recon[i]; + err_sq += diff * diff; + norm_sq += (double)h_dense[i] * h_dense[i]; + } + double rel_err = std::sqrt(err_sq / norm_sq); + + // With k=3 out of min(20,15)=15 components, relative error should be < 1.0 + ASSERT_LT(rel_err, 1.0) << "Reconstruction relative error too large: " << rel_err; + } +}; + +TEST_F(ReconstructionErrorTest, RelativeError) { Run(); } + +// ============================================================================ +// Test: mean-centered linear operator +// Ground truth: dense SVD of explicitly centered matrix +// ============================================================================ + +template +struct mean_centered_operator { + detail::csr_linear_operator base_op_; + ValueType* col_means_; + int m_, n_; + + mean_centered_operator(raft::device_csr_matrix_view A, + 
ValueType* col_means, + int m, + int n) + : base_op_(A), col_means_(col_means), m_(m), n_(n) + { + } + + int rows() const { return m_; } + int cols() const { return n_; } + auto csr_view() const { return base_op_.csr_view(); } + + // Y = (A - 1*mean^T) @ X = A@X - ones * (mean^T @ X) + void apply(raft::resources const& handle, + raft::device_matrix_view X, + raft::device_matrix_view Y) const + { + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas = raft::resource::get_cublas_handle(handle); + int bk = X.extent(1); + base_op_.apply(handle, X, Y); + rmm::device_uvector corr(bk, stream); + rmm::device_uvector ones(m_, stream); + std::vector h(m_, 1); + raft::update_device(ones.data(), h.data(), m_, stream); + ValueType a1 = 1, a0 = 0, am1 = -1; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas, + CUBLAS_OP_T, + n_, + bk, + &a1, + X.data_handle(), + n_, + col_means_, + 1, + &a0, + corr.data(), + 1, + stream)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas, + CUBLAS_OP_N, + CUBLAS_OP_N, + m_, + bk, + 1, + &am1, + ones.data(), + m_, + corr.data(), + 1, + &a1, + Y.data_handle(), + m_, + stream)); + } + + // Z = (A - 1*mean^T)^T @ X = A^T@X - mean * (1^T @ X) + void apply_transpose(raft::resources const& handle, + raft::device_matrix_view X, + raft::device_matrix_view Z) const + { + auto stream = raft::resource::get_cuda_stream(handle); + auto cublas = raft::resource::get_cublas_handle(handle); + int bk = X.extent(1); + base_op_.apply_transpose(handle, X, Z); + rmm::device_uvector sums(bk, stream); + rmm::device_uvector ones(m_, stream); + std::vector h(m_, 1); + raft::update_device(ones.data(), h.data(), m_, stream); + ValueType a1 = 1, a0 = 0, am1 = -1; + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas, + CUBLAS_OP_T, + m_, + bk, + &a1, + X.data_handle(), + m_, + ones.data(), + 1, + &a0, + sums.data(), + 1, + stream)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas, + CUBLAS_OP_N, + CUBLAS_OP_N, + n_, + bk, + 1, + 
&am1, + col_means_, + n_, + sums.data(), + 1, + &a1, + Z.data_handle(), + n_, + stream)); + } +}; + +class MeanCenteredOperatorTest : public ::testing::Test { + public: + MeanCenteredOperatorTest() : stream(resource::get_cuda_stream(handle)) {} + + protected: + void Run() + { + using ValueType = float; + using IndexType = int; + + int m = 20, n = 15, k = 3, nnz = 120; + + auto h_indptr = golden_indptr(); + auto h_indices = golden_indices(); + auto h_values = golden_values(); + + auto d_indptr = raft::make_device_vector(handle, m + 1); + auto d_indices = raft::make_device_vector(handle, nnz); + auto d_values = raft::make_device_vector(handle, nnz); + raft::update_device(d_indptr.data_handle(), h_indptr.data(), m + 1, stream); + raft::update_device(d_indices.data_handle(), h_indices.data(), nnz, stream); + raft::update_device(d_values.data_handle(), h_values.data(), nnz, stream); + + auto csr_structure = + raft::make_device_compressed_structure_view( + d_indptr.data_handle(), d_indices.data_handle(), m, n, nnz); + auto csr_matrix = + raft::make_device_csr_matrix_view( + d_values.data_handle(), csr_structure); + + // Reconstruct dense matrix on host to compute column means + std::vector dense(m * n, 0); + int vi = 0; + for (int i = 0; i < m; i++) { + for (int jj = h_indptr[i]; jj < h_indptr[i + 1]; jj++) { + dense[h_indices[jj] * m + i] = h_values[vi++]; + } + } + + std::vector h_means(n, 0); + for (int j = 0; j < n; j++) { + for (int i = 0; i < m; i++) + h_means[j] += dense[j * m + i]; + h_means[j] /= m; + } + auto d_means = raft::make_device_vector(handle, n); + raft::update_device(d_means.data_handle(), h_means.data(), n, stream); + + // Ground truth: dense SVD of (A - 1*mean^T) + std::vector centered(m * n); + for (int j = 0; j < n; j++) + for (int i = 0; i < m; i++) + centered[j * m + i] = dense[j * m + i] - h_means[j]; + + auto centered_dev = + raft::make_device_matrix(handle, m, n); + raft::update_device(centered_dev.data_handle(), centered.data(), m * n, 
stream); + auto ref_S = raft::make_device_vector(handle, n); + auto ref_U = raft::make_device_matrix(handle, m, n); + auto ref_Vt = raft::make_device_matrix(handle, n, n); + raft::linalg::svdQR(handle, + centered_dev.data_handle(), + m, + n, + ref_S.data_handle(), + ref_U.data_handle(), + ref_Vt.data_handle(), + false, + true, + true, + stream); + + // Operator-based SVD + mean_centered_operator op(csr_matrix, d_means.data_handle(), m, n); + + sparse_svd_config config; + config.n_components = k; + config.n_oversamples = 10; + config.n_power_iters = 4; + config.seed = 42; + + auto S = raft::make_device_vector(handle, k); + auto U = raft::make_device_matrix(handle, m, k); + auto Vt = raft::make_device_matrix(handle, k, n); + sparse_randomized_svd(handle, config, op, S.view(), U.view(), Vt.view()); + + // Singular values must match dense centered ground truth + ASSERT_TRUE(raft::devArrMatch( + S.data_handle(), ref_S.data_handle(), k, raft::CompareApprox(0.1), stream)); + } + + raft::resources handle; + cudaStream_t stream; +}; + +TEST_F(MeanCenteredOperatorTest, OperatorVsDenseGroundTruth) { Run(); } + +} // namespace raft::sparse::solver diff --git a/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt b/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt index 516b8d61f1..969b9b68b7 100644 --- a/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt +++ b/python/pylibraft/pylibraft/sparse/linalg/CMakeLists.txt @@ -1,12 +1,12 @@ # ============================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 # cmake-format: on # ============================================================================= # Set the list of Cython files to build -set(cython_sources lanczos.pyx) +set(cython_sources lanczos.pyx svds.pyx) # TODO: should finally be replaced with 'compiled' library to be more generic, when that is # available diff --git a/python/pylibraft/pylibraft/sparse/linalg/__init__.py b/python/pylibraft/pylibraft/sparse/linalg/__init__.py index edf9b39e9c..e632a723d3 100644 --- a/python/pylibraft/pylibraft/sparse/linalg/__init__.py +++ b/python/pylibraft/pylibraft/sparse/linalg/__init__.py @@ -1,7 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # from .lanczos import eigsh +from .svds import svds -__all__ = ["eigsh"] +__all__ = ["eigsh", "svds"] diff --git a/python/pylibraft/pylibraft/sparse/linalg/svds.pyx b/python/pylibraft/pylibraft/sparse/linalg/svds.pyx new file mode 100644 index 0000000000..b9a2778190 --- /dev/null +++ b/python/pylibraft/pylibraft/sparse/linalg/svds.pyx @@ -0,0 +1,284 @@ +# +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +# +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True +# cython: language_level = 3 + +import warnings + +import cupy as cp +import cupyx.scipy.sparse as cupy_sparse +import numpy as np + +from cython.operator cimport dereference as deref +from libc.stdint cimport int64_t, uint32_t, uint64_t, uintptr_t + +from pylibraft.common import Handle, cai_wrapper, device_ndarray +from pylibraft.common.handle import auto_sync_handle + +from pylibraft.common.cpp.mdspan cimport ( + col_major, + device_matrix_view, + device_vector_view, + make_device_matrix_view, + make_device_vector_view, + row_major, +) +from pylibraft.common.cpp.optional cimport optional +from pylibraft.common.handle cimport device_resources + + +cdef extern from "raft/sparse/solver/svds_config.hpp" \ + namespace "raft::sparse::solver" nogil: + + cdef cppclass sparse_svd_config[ValueTypeT]: + int n_components + int n_oversamples + int n_power_iters + optional[uint64_t] seed + + +cdef extern from "raft_runtime/solver/randomized_svds.hpp" \ + namespace "raft::runtime::solver" nogil: + + cdef void sparse_randomized_svd_f \ + "raft::runtime::solver::sparse_randomized_svd"( + const device_resources &handle, + const sparse_svd_config[float] &config, + device_vector_view[int, uint32_t] indptr, + device_vector_view[int, uint32_t] indices, + device_vector_view[float, uint32_t] data, + int n_rows, int n_cols, int nnz, + device_vector_view[float, uint32_t] singular_values, + optional[device_matrix_view[float, uint32_t, col_major]] U, + optional[device_matrix_view[float, uint32_t, col_major]] Vt) except + + + cdef void sparse_randomized_svd_d \ + "raft::runtime::solver::sparse_randomized_svd"( + const device_resources &handle, + const sparse_svd_config[double] &config, + device_vector_view[int, uint32_t] indptr, + device_vector_view[int, uint32_t] indices, + device_vector_view[double, uint32_t] data, + int n_rows, int n_cols, int nnz, + 
device_vector_view[double, uint32_t] singular_values, + optional[device_matrix_view[double, uint32_t, col_major]] U, + optional[device_matrix_view[double, uint32_t, col_major]] Vt) except + + + +@auto_sync_handle +def svds(A, k=6, n_oversamples=10, n_power_iters=2, + seed=None, return_singular_vectors=True, handle=None): + """ + Compute the largest ``k`` singular values and corresponding singular + vectors of a sparse matrix using randomized SVD. + + Computes the truncated SVD: ``A ~ U @ diag(S) @ Vt``. + + Args: + A (cupyx.scipy.sparse.csr_matrix): Sparse CSR matrix of shape + ``(m, n)``. Must be of type + :class:`cupyx.scipy.sparse.csr_matrix`. + k (int): Number of singular values and vectors to compute. Must be + ``1 <= k < min(m, n)``. Default 6. + n_oversamples (int): Number of extra random vectors for better + approximation. Total subspace dimension is ``k + n_oversamples``. + Default 10. + n_power_iters (int): Number of power iteration passes. More + iterations improve accuracy for matrices with slowly decaying + singular values. Default 2. + seed (int or None): Random seed for reproducibility. If ``None``, + a non-deterministic seed is used. + return_singular_vectors (bool or {{"u", "vh"}}): Controls which + singular vectors are returned (matches + :func:`scipy.sparse.linalg.svds`). + + - ``True`` (default): return ``(U, S, Vt)``. + - ``False``: skip both vector matrices, return ``(None, S, None)``. + - ``"u"``: skip ``Vt``, return ``(U, S, None)``. + - ``"vh"``: skip ``U``, return ``(None, S, Vt)``. + + Skipping a side avoids the corresponding output buffer and + final matrix multiplication. + handle: RAFT resource handle. If ``None``, a default is created. + + Returns: + tuple: + ``(U, S, Vt)`` where ``U`` is left singular vectors ``(m, k)``, + ``S`` is singular values ``(k,)`` in descending order, and + ``Vt`` is right singular vectors ``(k, n)``. ``U`` and/or ``Vt`` + may be ``None`` depending on ``return_singular_vectors``. + + .. 
seealso:: + :func:`scipy.sparse.linalg.svds` + + .. note:: + This function uses randomized SVD (Halko et al. 2009) with + CholeskyQR2 orthogonalization (Tomas et al. 2024) for efficient + GPU execution. + + """ + + if A is None: + raise ValueError("'A' cannot be None!") + + if not cupy_sparse.issparse(A): + raise TypeError( + "Expected a cupyx.scipy.sparse matrix, got %s" % type(A).__name__ + ) + + if not isinstance(A, cupy_sparse.csr_matrix): + raise TypeError( + "Expected a cupyx.scipy.sparse.csr_matrix, got %s. " + "Use A.tocsr() to convert." % type(A).__name__ + ) + + if not ( + return_singular_vectors is True + or return_singular_vectors is False + or return_singular_vectors == "u" + or return_singular_vectors == "vh" + ): + raise ValueError( + "return_singular_vectors must be True, False, 'u', or 'vh', " + "got %r" % (return_singular_vectors,) + ) + want_U = (return_singular_vectors is True) or (return_singular_vectors == "u") + want_Vt = (return_singular_vectors is True) or (return_singular_vectors == "vh") + + m, n = A.shape + + if k < 1 or k >= min(m, n): + raise ValueError( + f"k must satisfy 1 <= k < min(m, n), got k={k}, m={m}, n={n}" + ) + + # Extract CSR arrays and ensure int32 indices. raft's runtime layer takes + # `int` (int32) for nnz / indptr / indices, so any overflow on conversion + # cannot be recovered downstream — error out before the cast. + indptr = A.indptr + indices = A.indices + data = A.data + + INT32_MAX = (1 << 31) - 1 + + IndexType = indptr.dtype + if IndexType == np.int64: + if A.nnz > INT32_MAX: + raise OverflowError( + f"nnz={A.nnz} exceeds int32 max ({INT32_MAX}); " + "raft sparse SVD requires nnz to fit in int32." + ) + if n > INT32_MAX: + raise OverflowError( + f"n_cols={n} exceeds int32 max ({INT32_MAX}); " + "raft sparse SVD requires column indices to fit in int32." + ) + warnings.warn( + "Input matrix has int64 indices which will be converted to " + "int32. 
The conversion is safe for this matrix (nnz and " + "column indices fit in int32).", + UserWarning, + stacklevel=2, + ) + indptr = indptr.astype(np.int32) + indices = indices.astype(np.int32) + elif IndexType != np.int32: + raise TypeError("dtype IndexType=%s not supported, " + "expected int32 or int64" % IndexType) + + indptr = cai_wrapper(indptr) + indices = cai_wrapper(indices) + data = cai_wrapper(data) + + ValueType = data.dtype + nnz = A.nnz + + indptr_ptr = indptr.data + indices_ptr = indices.data + data_ptr = data.data + + cdef optional[uint64_t] seed_opt + if seed is not None: + seed_opt = seed + + # Allocate outputs + S_out = device_ndarray.empty((k,), dtype=ValueType, order='F') + S_cai = cai_wrapper(S_out) + S_ptr = S_cai.data + + U_out = None + Vt_out = None + cdef uintptr_t U_ptr = 0 + cdef uintptr_t Vt_ptr = 0 + if want_U: + U_out = device_ndarray.empty((m, k), dtype=ValueType, order='F') + U_ptr = cai_wrapper(U_out).data + if want_Vt: + Vt_out = device_ndarray.empty((k, n), dtype=ValueType, order='F') + Vt_ptr = cai_wrapper(Vt_out).data + + handle = handle if handle is not None else Handle() + cdef device_resources *h = handle.getHandle() + + cdef sparse_svd_config[float] cfg_float + cdef sparse_svd_config[double] cfg_double + cdef optional[device_matrix_view[float, uint32_t, col_major]] U_opt_f + cdef optional[device_matrix_view[float, uint32_t, col_major]] Vt_opt_f + cdef optional[device_matrix_view[double, uint32_t, col_major]] U_opt_d + cdef optional[device_matrix_view[double, uint32_t, col_major]] Vt_opt_d + + if ValueType == np.float32: + cfg_float.n_components = k + cfg_float.n_oversamples = n_oversamples + cfg_float.n_power_iters = n_power_iters + cfg_float.seed = seed_opt + if want_U: + U_opt_f = make_device_matrix_view[float, uint32_t, col_major]( + U_ptr, m, k) + if want_Vt: + Vt_opt_f = make_device_matrix_view[float, uint32_t, col_major]( + Vt_ptr, k, n) + sparse_randomized_svd_f( + deref(h), + cfg_float, + 
make_device_vector_view(indptr_ptr, (m + 1)), + make_device_vector_view(indices_ptr, nnz), + make_device_vector_view(data_ptr, nnz), + m, n, nnz, + make_device_vector_view(S_ptr, k), + U_opt_f, + Vt_opt_f, + ) + elif ValueType == np.float64: + cfg_double.n_components = k + cfg_double.n_oversamples = n_oversamples + cfg_double.n_power_iters = n_power_iters + cfg_double.seed = seed_opt + if want_U: + U_opt_d = make_device_matrix_view[double, uint32_t, col_major]( + U_ptr, m, k) + if want_Vt: + Vt_opt_d = make_device_matrix_view[double, uint32_t, col_major]( + Vt_ptr, k, n) + sparse_randomized_svd_d( + deref(h), + cfg_double, + make_device_vector_view(indptr_ptr, (m + 1)), + make_device_vector_view(indices_ptr, nnz), + make_device_vector_view(data_ptr, nnz), + m, n, nnz, + make_device_vector_view(S_ptr, k), + U_opt_d, + Vt_opt_d, + ) + else: + raise TypeError("dtype ValueType=%s not supported, " + "expected float32 or float64" % ValueType) + + U_ret = cp.asarray(U_out) if U_out is not None else None + Vt_ret = cp.asarray(Vt_out) if Vt_out is not None else None + return (U_ret, cp.asarray(S_out), Vt_ret) diff --git a/python/pylibraft/pylibraft/tests/test_sparse.py b/python/pylibraft/pylibraft/tests/test_sparse.py index 61cc4d6e62..b6b66721a4 100644 --- a/python/pylibraft/pylibraft/tests/test_sparse.py +++ b/python/pylibraft/pylibraft/tests/test_sparse.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 # @@ -9,7 +9,7 @@ import scipy from cupyx.scipy import sparse -from pylibraft.sparse.linalg import eigsh +from pylibraft.sparse.linalg import eigsh, svds def shaped_random( @@ -160,3 +160,194 @@ def test_reproducibility(self, dtype, seed): # Results should be identical assert cupy.allclose(w1, w2), "Eigenvalues differ with same seed" assert cupy.allclose(v1, v2), "Eigenvectors differ with same seed" + + +class TestSvds: + """Tests for sparse randomized SVD.""" + + @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64]) + @pytest.mark.parametrize("k", [3, 10]) + def test_singular_values_positive_descending(self, dtype, k): + """Singular values must be positive and in descending order.""" + m, n = 100, 60 + cupy.random.seed(42) + A = sparse.random(m, n, density=0.3, format="csr", dtype=dtype) + U, S, Vt = svds(A, k=k, seed=42) + S_host = cupy.asnumpy(S) + assert all(S_host > 0), "Singular values must be positive" + assert all( + S_host[i] >= S_host[i + 1] for i in range(len(S_host) - 1) + ), "Singular values must be descending" + + @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64]) + @pytest.mark.parametrize("k", [3, 10]) + def test_orthogonality(self, dtype, k): + """U columns and Vt rows must be orthonormal.""" + m, n = 100, 60 + cupy.random.seed(42) + A = sparse.random(m, n, density=0.3, format="csr", dtype=dtype) + U, S, Vt = svds(A, k=k, seed=42) + + tol = 1e-4 if dtype == numpy.float32 else 1e-8 + I_k = cupy.eye(k, dtype=dtype) + + UtU = U.T @ U + assert cupy.allclose(UtU, I_k, atol=tol), "U is not orthogonal" + + VtVtT = Vt @ Vt.T + assert cupy.allclose(VtVtT, I_k, atol=tol), "Vt rows not orthonormal" + + @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64]) + def test_reconstruction_error(self, dtype): + """||A - U @ diag(S) @ Vt|| must be bounded by the (k+1)-th singular value.""" + m, n = 80, 50 + k = 5 + cupy.random.seed(42) + A = sparse.random(m, n, density=0.4, format="csr", 
dtype=dtype)
+        U, S, Vt = svds(A, k=k, n_power_iters=4, seed=42)
+
+        A_dense = A.toarray()
+        recon = (U * S[None, :]) @ Vt
+        err = cupy.linalg.norm(A_dense - recon)
+        norm_A = cupy.linalg.norm(A_dense)
+
+        # Loose bound by design: a rank-k factorization of a dense-spectrum
+        # random matrix cannot reconstruct A closely, so only require the
+        # relative Frobenius error to be strictly below 1.0.
+        assert err / norm_A < 1.0, (
+            f"Reconstruction error too large: {float(err / norm_A):.4f}"
+        )
+
+    def test_shapes(self):
+        """Output shapes must match (m,k), (k,), (k,n)."""
+        m, n, k = 100, 60, 5
+        A = sparse.random(m, n, density=0.3, format="csr", dtype=numpy.float32)
+        U, S, Vt = svds(A, k=k, seed=42)
+        assert U.shape == (m, k)
+        assert S.shape == (k,)
+        assert Vt.shape == (k, n)
+
+    def test_int64_indices(self):
+        """int64 indices should be auto-converted and work, with a warning."""
+        m, n, k = 50, 30, 3
+        cupy.random.seed(42)
+        A = sparse.random(m, n, density=0.3, format="csr", dtype=numpy.float32)
+
+        # cupyx CSR's constructor normalizes index dtype back to int32, so
+        # bypass it via field reassignment to actually hold int64 indices.
+        A.indptr = A.indptr.astype(cupy.int64)
+        A.indices = A.indices.astype(cupy.int64)
+        assert A.indptr.dtype == numpy.int64
+
+        with pytest.warns(UserWarning, match="int64"):
+            U, S, Vt = svds(A, k=k, seed=42)
+        assert S.shape == (k,)
+        assert all(cupy.asnumpy(S) > 0)
+
+    def test_invalid_k(self):
+        """k out of range (k <= 0 or k >= min(m, n)) must raise ValueError."""
+        A = sparse.random(20, 15, density=0.3, format="csr")
+        with pytest.raises(ValueError):
+            svds(A, k=0)
+        with pytest.raises(ValueError):
+            svds(A, k=15)
+
+    def test_invalid_input_matrix(self):
+        """Non-CSR / non-sparse / None inputs must be rejected with a clear error."""
+        # None
+        with pytest.raises(ValueError, match="'A' cannot be None"):
+            svds(None, k=3)
+
+        # Dense array (not sparse at all)
+        dense = cupy.random.rand(20, 15).astype(numpy.float32)
+        with pytest.raises(TypeError, match="cupyx.scipy.sparse"):
+            svds(dense, k=3)
+
+        # Sparse but wrong layout (COO / CSC)
+        A_coo = sparse.random(
+            20, 15, density=0.3, format="coo", dtype=numpy.float32
+        )
+        with pytest.raises(TypeError, match="csr_matrix"):
+            svds(A_coo, k=3)
+
+        A_csc = sparse.random(
+            20, 15, density=0.3, format="csc", dtype=numpy.float32
+        )
+        with pytest.raises(TypeError, match="csr_matrix"):
+            svds(A_csc, k=3)
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_reproducibility(self, dtype):
+        """Same seed must give same results."""
+        m, n, k = 80, 50, 5
+        cupy.random.seed(42)
+        A = sparse.random(m, n, density=0.3, format="csr", dtype=dtype)
+
+        _, S1, _ = svds(A, k=k, seed=123)
+        _, S2, _ = svds(A, k=k, seed=123)
+
+        tol = 1e-5 if dtype == numpy.float32 else 1e-10
+        assert cupy.allclose(S1, S2, atol=tol), (
+            "Not reproducible with same seed"
+        )
+
+    @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64])
+    def test_return_singular_vectors_modes(self, dtype):
+        """All four scipy-style modes: True (default, reference), False, "u", "vh"."""
+        m, n, k = 60, 40, 4
+        cupy.random.seed(42)
+        A = sparse.random(m, n, density=0.3, format="csr", 
dtype=dtype) + + # Reference: full output + U_ref, S_ref, Vt_ref = svds(A, k=k, seed=42) + assert U_ref.shape == (m, k) + assert Vt_ref.shape == (k, n) + + # False: singular values only + U_f, S_f, Vt_f = svds(A, k=k, seed=42, return_singular_vectors=False) + assert U_f is None + assert Vt_f is None + assert cupy.allclose(S_f, S_ref) + + # "u": U + S + U_u, S_u, Vt_u = svds(A, k=k, seed=42, return_singular_vectors="u") + assert U_u.shape == (m, k) + assert Vt_u is None + assert cupy.allclose(S_u, S_ref) + # U-based sign correction matches the full-output reference exactly + tol = 1e-4 if dtype == numpy.float32 else 1e-10 + assert cupy.allclose(U_u, U_ref, atol=tol) + + # "vh": Vt + S + U_v, S_v, Vt_v = svds(A, k=k, seed=42, return_singular_vectors="vh") + assert U_v is None + assert Vt_v.shape == (k, n) + assert cupy.allclose(S_v, S_ref) + # Vt rows should remain orthonormal even though signs may differ + I_k = cupy.eye(k, dtype=dtype) + assert cupy.allclose(Vt_v @ Vt_v.T, I_k, atol=tol) + + def test_return_singular_vectors_invalid(self): + """Invalid return_singular_vectors must raise ValueError.""" + A = sparse.random(20, 15, density=0.3, format="csr") + with pytest.raises(ValueError): + svds(A, k=3, return_singular_vectors="bogus") + with pytest.raises(ValueError): + svds(A, k=3, return_singular_vectors=1) + + @pytest.mark.parametrize("dtype", [numpy.float32, numpy.float64]) + def test_return_singular_vectors_deterministic(self, dtype): + """Each mode must be deterministic across runs with the same seed.""" + m, n, k = 50, 30, 3 + cupy.random.seed(42) + A = sparse.random(m, n, density=0.3, format="csr", dtype=dtype) + + # cuSOLVER gesvd inside the algorithm is not bit-deterministic across + # runs even with a fixed RNG seed, so allow a small numerical tolerance. 
+ atol = 1e-4 if dtype == numpy.float32 else 1e-10 + for mode in (False, "u", "vh"): + res_a = svds(A, k=k, seed=7, return_singular_vectors=mode) + res_b = svds(A, k=k, seed=7, return_singular_vectors=mode) + assert cupy.allclose(res_a[1], res_b[1], atol=atol) + if mode in (True, "u"): + assert cupy.allclose(res_a[0], res_b[0], atol=atol) + if mode in (True, "vh"): + assert cupy.allclose(res_a[2], res_b[2], atol=atol)