weecology · henrykironde · Feb 13, 2026 · Feb 13, 2026 · Feb 15, 2026 · Feb 15, 2026
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,7 +3,7 @@ name = "deepforest"
 version = "2.0.1dev0"
 description = "Platform for individual detection from airborne remote sensing including trees, birds, and livestock. Supports multiple detection models, adding models for species classification, and easy fine tuning to particular ecosystems."
 readme = "README.md"
-requires-python = ">=3.10,<3.13"
+requires-python = ">=3.10,<3.14"
 license = {text = "MIT"}
 keywords = ["deep-learning", "forest", "ecology", "computer-vision"]
 classifiers = [

diff --git a/src/deepforest/datasets/prediction.py b/src/deepforest/datasets/prediction.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 
 import numpy as np
 import pandas as pd
@@ -14,6 +15,77 @@
 from deepforest.utilities import format_geometry, read_file
 
 
+def _load_image_array(
+    image_path: str | None = None, image: np.ndarray | Image.Image | None = None
+) -> np.ndarray:
+    """Return pixel data as an ndarray; ``image`` takes precedence over
+    ``image_path``, and only the file branch is converted to RGB."""
+    if image is not None:
+        return image if isinstance(image, np.ndarray) else np.asarray(image)
+    if image_path is None:
+        raise ValueError("Either image_path or image must be provided")
+    rgb = Image.open(image_path).convert("RGB")
+    return np.asarray(rgb)
+
+
+def _ensure_rgb_chw(image: np.ndarray) -> np.ndarray:
+    """Validate a 3-channel image and return it channels-first (C, H, W).
+
+    Values and dtype are untouched. A leading axis of length 3 is taken as
+    already CHW (checked before the trailing axis); else HWC is transposed.
+    Raises ValueError for 2D (grayscale), non-3D, or non-3-channel input.
+    """
+    n_dims = image.ndim
+    if n_dims == 2:
+        raise ValueError("Grayscale images are not supported (expected 3-channel RGB)")
+    if n_dims != 3:
+        raise ValueError(f"Expected 3D image array, got shape {image.shape}")
+
+    first_axis, last_axis = image.shape[0], image.shape[-1]
+    if first_axis != 3 and last_axis != 3:
+        raise ValueError(f"Expected image with 3 channels, got shape {image.shape}")
+
+    channels_first = image if first_axis == 3 else np.moveaxis(image, -1, 0)
+    return np.ascontiguousarray(channels_first)
+
+
+def _ensure_rgb_chw_float32(image: np.ndarray) -> np.ndarray:
+    """Return the image as contiguous CHW float32 scaled to [0, 1].
+
+    uint8 input is divided by 255. Float input is cast to float32, kept as-is
+    when its maximum is <= 1, and divided by 255 when its maximum is <= 255.
+    The caller's array is never modified in place. Raises ValueError for
+    negative or > 255 float values, dtypes other than uint8/float, bad shapes.
+    """
+    chw = _ensure_rgb_chw(image)
+
+    if chw.dtype == np.uint8:
+        chw = chw.astype(np.float32)
+        chw /= 255.0
+    elif np.issubdtype(chw.dtype, np.floating):
+        if chw.dtype != np.float32:
+            chw = chw.astype(np.float32)
+        max_val = float(chw.max())
+        min_val = float(chw.min())
+        if min_val < 0:
+            raise ValueError(
+                f"Expected float image in [0, 1] or [0, 255], got min {min_val}"
+            )
+        if max_val > 1.0:
+            if max_val > 255.0:
+                raise ValueError(
+                    f"Expected float image in [0, 1] or [0, 255], got max {max_val}"
+                )
+            # float32 input can still share memory with the caller's array here,
+            # so divide out of place rather than with `/=`.
+            chw = chw / np.float32(255.0)
+    else:
+        # Integers other than uint8 are ambiguous; be explicit.
+        raise ValueError(f"Unsupported image dtype {chw.dtype}. Expected uint8 or float.")
+
+    return np.ascontiguousarray(chw)
+
+
 # Base prediction class
 class PredictionDataset(Dataset):
     """Base class for prediction datasets. Defines the common interface and
@@ -48,32 +120,9 @@ def __init__(
     def load_and_preprocess_image(
         self, image_path: str = None, image: np.ndarray | Image.Image = None
     ):
-        if image is None:
-            if image_path is None:
-                raise ValueError("Either image_path or image must be provided")
-            image = np.array(Image.open(image_path).convert("RGB"))
-        else:
-            image = np.array(image)
-        # If dtype is not float32, convert to float32
-        if image.dtype != "float32":
-            image = image.astype("float32")
-
-        # If image is not normalized, normalize to [0, 1]
-        if image.max() > 1 or image.min() < 0:
-            image = image / 255.0
-
-        # If image is not in CHW format, convert to CHW
-        if image.shape[0] != 3:
-            if image.shape[-1] != 3:
-                raise ValueError(
-                    f"Expected 3 channel image, got image shape {image.shape}"
-                )
-            else:
-                image = np.rollaxis(image, 2, 0)
-
-        image = torch.from_numpy(image)
-
-        return image
+        image_arr = _load_image_array(image_path=image_path, image=image)
+        image_arr = _ensure_rgb_chw_float32(image_arr)
+        return torch.from_numpy(image_arr)
 
     def prepare_items(self):
         """Prepare the items for the dataset.
@@ -169,7 +218,11 @@ def __init__(self, path=None, image=None, patch_size=400, patch_overlap=0):
         )
 
     def prepare_items(self):
-        self.image = self.load_and_preprocess_image(self.path, image=self.image)
+        image_arr = _load_image_array(image_path=self.path, image=self.image)
+        image = _ensure_rgb_chw(image_arr)
+
+        # Keep as uint8/float in CHW; normalize per-crop to avoid full-image float copy
+        self.image = image
         self.windows = preprocess.compute_windows(
             self.image, self.patch_size, self.patch_overlap
         )
@@ -182,8 +235,12 @@ def window_list(self):
 
     def get_crop(self, idx):
         crop = self.image[self.windows[idx].indices()]
-
-        return crop
+        # Copy to avoid in-place modification corrupting self.image when crop is a
+        # view (e.g. overlapping windows or float32 input). Reuse dtype-based
+        # normalization to avoid heuristic edge cases (e.g. uint8 all-0/1 crops).
+        crop = np.array(crop, copy=True)
+        crop = _ensure_rgb_chw_float32(crop)
+        return torch.from_numpy(crop)
 
     def get_image_basename(self, idx):
         if self.path is not None:
@@ -433,24 +490,25 @@ class TiledRaster(PredictionDataset):
     def __init__(self, path, patch_size, patch_overlap):
         if path is None:
             raise ValueError("path is required for a memory raster dataset")
+        self._src = None
         super().__init__(path=path, patch_size=patch_size, patch_overlap=patch_overlap)
 
     def prepare_items(self):
-        # Get raster shape without keeping file open
-        with rio.open(self.path) as src:
-            width = src.shape[0]
-            height = src.shape[1]
-
-            # Check is tiled
-            if not src.is_tiled:
-                raise ValueError(
-                    "Out-of-memory dataset is selected, but raster is not tiled, "
-                    "leading to entire raster being read into memory and defeating "
-                    "the purpose of an out-of-memory dataset. "
-                    "\nPlease run: "
-                    "\ngdal_translate -of GTiff -co TILED=YES <input> <output> "
-                    "to create a tiled raster"
-                )
+        # Open once; workers=0 is enforced by caller for this dataset.
+        self._src = rio.open(self.path)
+        height = self._src.height
+        width = self._src.width
+
+        # Warn on non-tiled rasters: window reads may still be efficient (strip-based),
+        # but performance can degrade depending on driver/strip layout.
+        if not self._src.is_tiled:
+            warnings.warn(
+                "dataloader_strategy='window' is selected, but raster is not tiled. "
+                "Windowed reads may be slower depending on file layout. If needed, "
+                "create a tiled GeoTIFF with: "
+                "gdal_translate -of GTiff -co TILED=YES <input> <output>",
+                stacklevel=2,
+            )
 
         # Generate sliding windows
         self.windows = slidingwindow.generateForSize(
@@ -469,12 +527,15 @@ def window_list(self):
 
     def get_crop(self, idx):
         window = self.windows[idx]
-        with rio.open(self.path) as src:
-            window_data = src.read(window=Window(window.x, window.y, window.w, window.h))
+        assert self._src is not None, "Raster dataset is not open"
+        window_data = self._src.read(
+            window=Window(window.x, window.y, window.w, window.h)
+        )
 
         # Rasterio already returns (C, H, W), just normalize and convert
-        window_data = window_data.astype("float32") / 255.0
-        window_data = torch.from_numpy(window_data).float()
+        window_data = window_data.astype(np.float32)
+        window_data /= 255.0
+        window_data = torch.from_numpy(window_data)
         if window_data.shape[0] != 3:
             raise ValueError(
                 f"Expected 3 channel image, got {window_data.shape[0]} channels"
@@ -487,3 +548,16 @@ def get_image_basename(self, idx):
 
     def get_crop_bounds(self, idx):
         return self.window_list()[idx]
+
+    def close(self) -> None:
+        """Close ``self._src`` if set and reset it to None; safe to call repeatedly."""
+        if self._src is not None:
+            self._src.close()
+            self._src = None
+
+    def __del__(self):
+        # Best-effort cleanup: any Exception raised by close() is deliberately ignored.
+        try:
+            self.close()
+        except Exception:
+            pass
diff --git a/src/deepforest/main.py b/src/deepforest/main.py
@@ -241,6 +241,26 @@ def create_trainer(self, logger=None, callbacks=None, **kwargs):
 
         self.trainer = pl.Trainer(**trainer_args)
 
+        # Helpful warning: CUDA visible but trainer not using it.
+        # This commonly happens if accelerator/devices were overridden to CPU, or
+        # if the trainer wasn't recreated after changing config.
+        try:
+            accel_name = type(self.trainer.accelerator).__name__.lower()
+        except Exception:
+            accel_name = ""
+
+        requested_accel = str(trainer_args.get("accelerator", "")).lower()
+        if torch.cuda.is_available() and requested_accel in {"auto", "gpu", "cuda"}:
+            if "cuda" not in accel_name and "gpu" not in accel_name:
+                warnings.warn(
+                    "CUDA appears to be available, but the Lightning trainer is not "
+                    f"using a GPU accelerator (accelerator={trainer_args.get('accelerator')}, "
+                    f"devices={trainer_args.get('devices')}). "
+                    "To force GPU inference, call create_trainer(accelerator='gpu', devices=1) "
+                    "or set config.accelerator='gpu' and config.devices=1, then recreate the trainer.",
+                    stacklevel=2,
+                )
+
     def on_fit_start(self):
         if self.config.train.csv_file is None:
             raise AttributeError(
@@ -600,6 +620,10 @@ def predict_tile(
                         image_results.append(formatted_result)
                         global_window_idx += 1
 
+                # Ensure raster datasets are closed promptly
+                if hasattr(ds, "close"):
+                    ds.close()
+
             if not image_results:
                 results = pd.DataFrame()
             else:
@@ -895,8 +919,7 @@ def predict_step(self, batch, batch_idx):
 
         self.model.eval()
         with torch.no_grad():
-            preds = self.model.forward(images)
-        return preds
+            return self.model.forward(images)
 
     def predict_batch(self, images, preprocess_fn=None):
         """Predict a batch of images with the deepforest model.

diff --git a/tests/hpc_multi_gpu_train.py b/tests/hpc_multi_gpu_train.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+"""HPC-only multi-GPU training smoke test (DDP).
+
+Run with:
+  torchrun --nproc_per_node=2 tests/hpc_multi_gpu_train.py
+"""
+from __future__ import annotations
+
+import os
+import sys
+
+import torch
+
+from deepforest import get_data
+from deepforest.main import deepforest
+
+
+def _require_hpc() -> None:
+    """Exit via SystemExit when a CI flag is set or no HPC marker is found."""
+    ci_flags = ("GITHUB_ACTIONS", "CI")
+    if any(os.environ.get(name) for name in ci_flags):
+        raise SystemExit("CI environment detected; skip HPC-only test.")
+    if not any(os.environ.get(name) for name in ("HIPERGATOR", "SLURM_JOB_ID")):
+        hint = "Set HIPERGATOR=1 or run under SLURM."
+        raise SystemExit(f"This script is intended for HPC use only. {hint}")
+
+
+def _require_ddp() -> None:
+    """Exit via SystemExit unless LOCAL_RANK or RANK is present in the environment."""
+    launched = any(key in os.environ for key in ("LOCAL_RANK", "RANK"))
+    if not launched:
+        usage = "  torchrun --nproc_per_node=2 tests/hpc_multi_gpu_train.py"
+        raise SystemExit(f"DDP environment not detected. Run with:\n{usage}")
+
+
+def main() -> int:
+    _require_hpc()
+    _require_ddp()
+
+    if torch.cuda.device_count() < 2:
+        raise SystemExit("Need at least 2 GPUs for this test.")
+
+    m = deepforest()
+    m.config.workers = 0
+    m.config.batch_size = 1
+    m.config.num_classes = 1
+    m.config.label_dict = {"Tree": 0}
+    train_csv = get_data("example.csv")
+    m.config.train.csv_file = train_csv
+    m.config.train.root_dir = os.path.dirname(train_csv)
+    m.config.validation.csv_file = train_csv
+    m.config.validation.root_dir = os.path.dirname(train_csv)
+    m.create_model(initialize_model=True)
+
+    # Keep it fast via batch limits; fast_dev_run reportedly warns (zero-length) in DDP.
+    m.create_trainer(
+        accelerator="gpu",
+        devices=2,
+        strategy="ddp",
+        fast_dev_run=False,
+        max_epochs=1,
+        limit_train_batches=2,
+        limit_val_batches=2,
+        log_every_n_steps=1,
+    )
+    m.trainer.fit(m)
+
+    # Validation pass on the same trainer; validation.csv_file is the same example.csv.
+    m.trainer.validate(m)
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_gpu_inference_uses_cuda.py b/tests/test_gpu_inference_uses_cuda.py
@@ -0,0 +1,39 @@
+import os
+
+import pytest
+import torch
+
+from deepforest import get_data
+from deepforest.main import deepforest
+
+
+@pytest.mark.skipif(
+    not os.environ.get("HIPERGATOR"),
+    reason="Only run on HIPERGATOR (requires GPU + model downloads).",
+)
+@pytest.mark.skipif(
+    not torch.cuda.is_available(),
+    reason="CUDA not available in this test environment.",
+)
+def test_predict_tile_uses_cuda_when_requested():
+    """Check that predict_tile with accelerator="gpu", devices=1 yields detections
+    and leaves a trainer whose accelerator class name contains "cuda" or "gpu".
+    Regression guard against silent CPU fallback; skipped without HIPERGATOR/CUDA.
+    """
+    m = deepforest(config_args={"accelerator": "gpu", "devices": 1, "workers": 0})
+    m.load_model(model_name="weecology/deepforest-tree", revision="main")
+    m.create_trainer(accelerator="gpu", devices=1)
+
+    results = m.predict_tile(
+        path=get_data("OSBS_029.png"),
+        patch_size=400,
+        patch_overlap=0.0,
+        iou_threshold=0.15,
+        dataloader_strategy="single",
+    )
+    assert results is not None and not results.empty
+
+    # Inspect the accelerator class name after prediction (no silent CPU fallback).
+    assert m.trainer is not None
+    accel_name = type(m.trainer.accelerator).__name__.lower()
+    assert "cuda" in accel_name or "gpu" in accel_name