diff --git a/mlx/backend/cuda/allocator.cpp b/mlx/backend/cuda/allocator.cpp
index 718ae33e9c..244218d4db 100644
--- a/mlx/backend/cuda/allocator.cpp
+++ b/mlx/backend/cuda/allocator.cpp
@@ -318,7 +318,7 @@ void CudaAllocator::move_to_unified_memory(
   buf.device = -1;
 }
 
-// This must be called with mutex_ aquired
+// This must be called with mutex_ acquired
 void CudaAllocator::free_cuda_buffer(CudaBuffer* buf) {
   if (scalar_pool_.in_pool(buf)) {
     scalar_pool_.free(buf);
diff --git a/mlx/backend/cuda/eval.cpp b/mlx/backend/cuda/eval.cpp
index ef9ee20cfa..c5bfe2fa85 100644
--- a/mlx/backend/cuda/eval.cpp
+++ b/mlx/backend/cuda/eval.cpp
@@ -13,7 +13,7 @@
 namespace mlx::core::gpu {
 
 void init() {
-  // Force initalization of CUDA, so CUDA runtime get destroyed last.
+  // Force initialization of CUDA, so CUDA runtime gets destroyed last.
   cudaFree(nullptr);
   // Make sure CUDA event pool get destroyed after device and stream.
   mlx::core::cu::CudaEvent::init_pool();
diff --git a/mlx/backend/metal/kernels/fft.h b/mlx/backend/metal/kernels/fft.h
index e478a85b6c..3cce29c574 100644
--- a/mlx/backend/metal/kernels/fft.h
+++ b/mlx/backend/metal/kernels/fft.h
@@ -229,7 +229,7 @@ template <int tg_mem_size, typename in_T, typename out_T>
     uint3 grid [[threads_per_grid]]) {
   // Use Rader's algorithm to compute fast FFTs
   // when a prime factor `p` of `n` is greater than 13 but
-  // has `p - 1` Stockham decomposable into to prime factors <= 13.
+  // has `p - 1` Stockham decomposable into prime factors <= 13.
   //
   // E.g. n = 102
   //        = 2 * 3 * 17