Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/sweep-api-consistency-state.csv
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
module,last_inspected,issue,severity_max,categories_found,notes
geotiff,2026-05-13,1775,MEDIUM,3,"Sweep v4 (deep-sweep-api-consistency-geotiff-2026-05-13). 1 MEDIUM finding filed and fixed: #1775 reader trio dtype kwarg lacks str|np.dtype|None annotation (Cat 3). Annotation added on open_geotiff, read_geotiff_dask, read_geotiff_gpu, and read_vrt; pinning tests extended in test_signature_annotations_1654.py. Cross-sibling return-type drift surfaced for follow-up but not fixed (Cat 2): write_vrt returns str while to_geotiff and write_geotiff_gpu return None -- behaviour change deferred. Prior sweep findings (#1654 #1683 #1684 #1685 #1705 #1754) all confirmed fixed. cuda-validated."
geotiff,2026-05-13,1810,MEDIUM,5,"Sweep v5 (deep-sweep-api-consistency-geotiff-2026-05-13). 1 MEDIUM finding filed and fixed: #1810 open_geotiff dispatcher dropped missing_sources kwarg when routing to read_vrt (Cat 5, same class as #1561/#1605/#1685/#1795). Fix mirrors the on_gpu_failure pattern: sentinel default, forward to read_vrt for .vrt sources, reject for non-VRT sources. Regression test in test_open_geotiff_missing_sources_1810.py. Prior sweep findings (#1654 #1683 #1684 #1685 #1705 #1715 #1754 #1775) all confirmed fixed. Cross-sibling return-type drift (Cat 2): write_vrt returns str while to_geotiff and write_geotiff_gpu return None -- still deferred (LOW, callers do not substitute these writers). cuda-validated."
reproject,2026-05-10,1570,HIGH,2;5,"Filed cross-module attrs['vertical_crs'] type collision (string vs EPSG int) vs xrspatial.geotiff. Fixed in PR (TBD): reproject now writes EPSG int and preserves friendly token under vertical_datum. MEDIUM kwarg-order drift (transform_precision vs chunk_size) and missing type hints vs geotiff documented but not fixed (cosmetic, kwarg-only)."
38 changes: 36 additions & 2 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@
# None is itself a value a caller could supply alongside crs=. See
# issue #1715.
_CRS_WKT_DEPRECATED_SENTINEL = object()
# ``open_geotiff`` needs to tell "caller never set missing_sources" (default
# sentinel: skip forwarding so the read_vrt default applies, and reject the
# kwarg up front for non-VRT sources) from "caller set missing_sources=<value>"
# (forward verbatim to read_vrt). Mirrors the on_gpu_failure pattern. See
# issue #1810.
_MISSING_SOURCES_SENTINEL = object()

# Names of dims that ``to_geotiff`` / ``write_geotiff_gpu`` treat as the
# non-spatial band axis. Used both to remap ``(band, y, x)`` inputs to
Expand Down Expand Up @@ -692,6 +698,7 @@ def open_geotiff(source: str | BinaryIO, *,
gpu: bool = False,
max_pixels: int | None = None,
on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL,
missing_sources: str = _MISSING_SOURCES_SENTINEL,
) -> xr.DataArray:
"""Read a GeoTIFF, COG, or VRT file into an xarray.DataArray.

Expand Down Expand Up @@ -737,6 +744,14 @@ def open_geotiff(source: str | BinaryIO, *,
Passing this kwarg with ``gpu=False`` raises ``ValueError``
because the policy only applies to the GPU pipeline. See
``read_geotiff_gpu`` for the full description.
missing_sources : {'warn', 'raise'}, optional
Forwarded to ``read_vrt`` when the source is a ``.vrt`` file.
``'warn'`` preserves the historical behavior: emit
``GeoTIFFFallbackWarning``, record ``attrs['vrt_holes']``, and
return a partial mosaic. ``'raise'`` fails immediately. Passing
this kwarg with a non-VRT source raises ``ValueError`` because
the policy only applies to the VRT pipeline. See ``read_vrt``
for the full description.

Returns
-------
Expand Down Expand Up @@ -782,8 +797,24 @@ def open_geotiff(source: str | BinaryIO, *,
"Pass gpu=True to enable the GPU pipeline, or drop "
"on_gpu_failure to keep the default CPU path.")

# ``missing_sources`` is VRT-only. Reject it up front when the source
# is not a ``.vrt`` file so callers learn the policy is being ignored
# instead of getting a silent drop -- same pattern ``on_gpu_failure``
# uses above for the GPU-only kwarg, and the same class of dispatcher
# silently-drops-backend-kwarg bug #1561 / #1605 / #1685 / #1795 fixed
# for the other VRT/GPU kwargs. See issue #1810.
missing_sources_passed = (
missing_sources is not _MISSING_SOURCES_SENTINEL)
_is_vrt_source = (
isinstance(source, str) and source.lower().endswith('.vrt'))
if missing_sources_passed and not _is_vrt_source:
raise ValueError(
"missing_sources only applies to VRT sources. "
"Pass a .vrt path to enable the VRT pipeline, or drop "
"missing_sources to keep the default GeoTIFF path.")

# VRT files (string paths only -- VRT XML references other files on disk)
if isinstance(source, str) and source.lower().endswith('.vrt'):
if _is_vrt_source:
# ``read_vrt`` does not accept ``overview_level`` (the VRT XML
# references its own source files; overview selection would need
# to apply to each one). Silently dropping the kwarg was the same
Expand All @@ -810,9 +841,12 @@ def open_geotiff(source: str | BinaryIO, *,
"VRT reads do not go through the GPU decoder pipeline; "
"drop the kwarg or call read_geotiff_gpu directly on a "
".tif source.")
vrt_kwargs = {}
if missing_sources_passed:
vrt_kwargs['missing_sources'] = missing_sources
return read_vrt(source, dtype=dtype, window=window, band=band,
name=name, chunks=chunks, gpu=gpu,
max_pixels=max_pixels)
max_pixels=max_pixels, **vrt_kwargs)

# File-like buffers don't support the GPU or dask code paths because
# those re-open the source by path from worker tasks or device-side
Expand Down
19 changes: 14 additions & 5 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1560,9 +1560,14 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader,
raise ValueError(
f"Invalid tile dimensions: TileWidth={tw}, TileLength={th}")

# Reject crafted tile dims that would force huge per-tile allocations.
# A single tile's decoded bytes must also fit under the pixel budget.
_check_dimensions(tw, th, samples, max_pixels)
# Reject crafted tile dims (e.g. TileWidth = 2**31). This guards the
# TIFF header against malformed values; it is not the caller's output
# budget. The output-window check below uses ``max_pixels`` and is
# what enforces the user's per-call memory cap. The source-read path
# under ``read_vrt`` (#1796) relies on that output check to honour a
# small caller ``max_pixels`` against a normal-tile source; see
# #1823.
_check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT)
Comment on lines +1569 to +1570

# Per-tile compressed-byte cap (issue #1664). Same env var as the
# HTTP path. mmap slicing is bounded by the file size, but the slice
Expand Down Expand Up @@ -2016,10 +2021,14 @@ def _fetch_decode_cog_http_tiles(
# A windowed HTTP read of a multi-billion-pixel COG only allocates
# the window, so capping the full image would reject legitimate
# tiled reads. The full-image cap still applies for whole-file
# reads (window is None). The single-tile budget always applies.
# reads (window is None). The per-tile dim check below guards the
# TIFF header against absurd ``TileWidth`` / ``TileLength`` values
# (e.g. 2**31) and uses ``MAX_PIXELS_DEFAULT`` so a caller's small
# ``max_pixels`` -- intended as an output-window budget -- does not
# reject normal 256x256 tiles. See #1823.
if window is None:
_check_dimensions(width, height, samples, max_pixels)
_check_dimensions(tw, th, samples, max_pixels)
_check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT)

# Reject malformed TIFFs whose declared tile grid exceeds the supplied
# TileOffsets length. See issue #1219.
Expand Down
146 changes: 146 additions & 0 deletions xrspatial/geotiff/tests/test_open_geotiff_missing_sources_1810.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""Regression test for #1810: ``open_geotiff`` did not accept
``missing_sources`` and did not forward it to ``read_vrt`` when the
source was a VRT.

The api-consistency sweep on 2026-05-13 flagged that ``read_vrt`` had
gained a ``missing_sources='warn'|'raise'`` policy kwarg (#1806) but
the documented dispatcher entry point ``open_geotiff`` did not expose
it. Callers wanting strict failure had to call ``read_vrt`` directly.
Same class of dispatcher-drops-backend-kwarg bug as #1561, #1605,
#1685, #1795.
"""
from __future__ import annotations

import inspect

import numpy as np
import pytest
import xarray as xr

from xrspatial.geotiff import (
GeoTIFFFallbackWarning,
open_geotiff,
to_geotiff,
write_vrt,
)


def _write_missing_source_vrt(path):
"""Write a VRT pointing at a non-existent source file.

Mirrors the helper in ``test_vrt_missing_sources_policy_1799.py`` so
the warn-vs-raise branches surface the same way through the
dispatcher as they do via the direct ``read_vrt`` call.
"""
path.write_text(
'<VRTDataset rasterXSize="2" rasterYSize="2">\n'
' <VRTRasterBand dataType="Byte" band="1">\n'
' <SimpleSource>\n'
' <SourceFilename relativeToVRT="1">missing.tif'
'</SourceFilename>\n'
' <SourceBand>1</SourceBand>\n'
' <SrcRect xOff="0" yOff="0" xSize="2" ySize="2"/>\n'
' <DstRect xOff="0" yOff="0" xSize="2" ySize="2"/>\n'
' </SimpleSource>\n'
' </VRTRasterBand>\n'
'</VRTDataset>\n'
)


def test_open_geotiff_accepts_missing_sources():
"""The signature exposes ``missing_sources`` so IDE autocomplete and
``inspect.signature`` see the kwarg without parsing the docstring.
"""
sig = inspect.signature(open_geotiff)
assert 'missing_sources' in sig.parameters


def test_open_geotiff_vrt_missing_sources_warn(tmp_path):
"""``missing_sources='warn'`` matches the historic behavior:
``GeoTIFFFallbackWarning`` is emitted and ``attrs['vrt_holes']`` is
populated. Going through the dispatcher must produce the same
result as calling ``read_vrt`` directly.
"""
vrt = tmp_path / "tmp_1810_missing_warn.vrt"
_write_missing_source_vrt(vrt)

with pytest.warns(GeoTIFFFallbackWarning, match="could not be read"):
da = open_geotiff(str(vrt), missing_sources='warn')

assert 'vrt_holes' in da.attrs
assert da.attrs['vrt_holes'][0]['source'].endswith('missing.tif')


def test_open_geotiff_vrt_missing_sources_raise(tmp_path):
"""``missing_sources='raise'`` propagates through the dispatcher and
fails fast instead of silently returning a partial mosaic.
"""
vrt = tmp_path / "tmp_1810_missing_raise.vrt"
_write_missing_source_vrt(vrt)

with pytest.raises((OSError, ValueError)):
open_geotiff(str(vrt), missing_sources='raise')


def test_open_geotiff_vrt_missing_sources_validates_policy(tmp_path):
"""The validator on ``read_vrt`` fires through the dispatcher path
so callers get the same parameter-named error from either entry
point.
"""
vrt = tmp_path / "tmp_1810_missing_bad_policy.vrt"
_write_missing_source_vrt(vrt)

with pytest.raises(ValueError, match="missing_sources"):
open_geotiff(str(vrt), missing_sources='ignore')


def test_open_geotiff_rejects_missing_sources_on_tif(tmp_path):
"""``missing_sources`` is VRT-only; passing it with a ``.tif`` source
raises rather than silently doing nothing. Mirrors the
``on_gpu_failure`` rejection pattern.
"""
arr = np.arange(16, dtype=np.uint16).reshape(4, 4)
da = xr.DataArray(
arr, dims=['y', 'x'],
coords={'y': np.arange(4, dtype=np.float64),
'x': np.arange(4, dtype=np.float64)},
attrs={'crs': 4326},
)
tif_path = tmp_path / "single_tile.tif"
to_geotiff(da, str(tif_path))

with pytest.raises(ValueError, match="missing_sources only applies"):
open_geotiff(str(tif_path), missing_sources='raise')


def test_open_geotiff_vrt_without_missing_sources_kwarg_still_works(tmp_path):
"""Omitting ``missing_sources`` is a no-op: behaviour matches the
pre-fix dispatcher (and the ``read_vrt`` default of ``'warn'``).
The existing two-tile VRT exercises the happy path so callers
upgrading from #1806 do not see a regression.
"""
arr_a = np.arange(16, dtype=np.uint16).reshape(4, 4)
da_a = xr.DataArray(
arr_a, dims=['y', 'x'],
coords={'y': np.array([0.5, 1.5, 2.5, 3.5]),
'x': np.array([0.5, 1.5, 2.5, 3.5])},
attrs={'crs': 4326},
)
tile_a = tmp_path / "tile_a.tif"
to_geotiff(da_a, str(tile_a))

arr_b = np.arange(16, 32, dtype=np.uint16).reshape(4, 4)
da_b = xr.DataArray(
arr_b, dims=['y', 'x'],
coords={'y': np.array([0.5, 1.5, 2.5, 3.5]),
'x': np.array([4.5, 5.5, 6.5, 7.5])},
attrs={'crs': 4326},
)
tile_b = tmp_path / "tile_b.tif"
to_geotiff(da_b, str(tile_b))

vrt_path = tmp_path / "mosaic_1810.vrt"
write_vrt(str(vrt_path), [str(tile_a), str(tile_b)])

da = open_geotiff(str(vrt_path))
assert da.shape == (4, 8)
113 changes: 113 additions & 0 deletions xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Regression tests for #1823.

PR #1803 forwarded the caller's ``max_pixels`` to ``read_to_array`` inside
the VRT source loop so that a tiny VRT output could not force a huge
source decode (#1796). The output-window check enforces that. A separate
per-tile dimension check was incorrectly using the same ``max_pixels``
value, so a caller setting ``max_pixels`` as an output budget (e.g.
10,000) would also fail the per-tile sanity check on every normal source
whose default tile size is 256x256 (= 65,536 pixels).

The #1796 protection remains: the output-window check still catches a
tiny VRT output that asks for a large source window.
"""
from __future__ import annotations

import os
import tempfile

import numpy as np
import pytest

from xrspatial.geotiff import to_geotiff
from xrspatial.geotiff._reader import PixelSafetyLimitError
from xrspatial.geotiff._vrt import read_vrt


def _write_normal_tile_source(td: str) -> str:
"""10x10 uint8 source -- ``to_geotiff`` pads to a 256x256 tile."""
src = os.path.join(td, 'src.tif')
to_geotiff(np.zeros((10, 10), dtype=np.uint8), src, compression='none')
return src


def _write_vrt(td: str, *, dst_x_size: int, dst_y_size: int,
raster_x: int = 100, raster_y: int = 100,
src_x_size: int = 10, src_y_size: int = 10) -> str:
vrt = os.path.join(td, 'mosaic.vrt')
xml = (
f'<VRTDataset rasterXSize="{raster_x}" rasterYSize="{raster_y}">\n'
f' <VRTRasterBand dataType="Byte" band="1">\n'
f' <SimpleSource>\n'
f' <SourceFilename relativeToVRT="1">src.tif</SourceFilename>\n'
f' <SourceBand>1</SourceBand>\n'
f' <SrcRect xOff="0" yOff="0" '
f'xSize="{src_x_size}" ySize="{src_y_size}"/>\n'
f' <DstRect xOff="0" yOff="0" '
f'xSize="{dst_x_size}" ySize="{dst_y_size}"/>\n'
f' </SimpleSource>\n'
f' </VRTRasterBand>\n'
f'</VRTDataset>\n'
)
with open(vrt, 'w') as f:
f.write(xml)
return vrt


class TestPerTileCheckDoesNotUseCallerBudget:
"""Per-tile dim sanity must not reject normal 256x256 source tiles
when the caller's ``max_pixels`` is a small output-budget value."""

def test_normal_tile_source_with_small_max_pixels(self):
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_normal_tile_source(td)
vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100)
arr, _ = read_vrt(vrt, max_pixels=10_000)
assert arr.shape == (100, 100)

def test_normal_tile_source_with_tiny_max_pixels(self):
"""An output budget below a single tile must still succeed when
the requested output window itself fits."""
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_normal_tile_source(td)
# Output 5x5 = 25 pixels; max_pixels = 100 fits 25 with room.
vrt = _write_vrt(td, dst_x_size=5, dst_y_size=5,
raster_x=5, raster_y=5)
arr, _ = read_vrt(vrt, max_pixels=100)
assert arr.shape == (5, 5)


class TestOutputWindowCheckStillEnforced:
"""The output-window check at the source read still rejects an
over-budget read; the #1796 protection is preserved."""

def test_output_window_exceeds_max_pixels_still_rejected(self):
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
src = os.path.join(td, 'src.tif')
to_geotiff(np.arange(16, dtype=np.uint8).reshape(4, 4),
src, compression='none')
vrt = _write_vrt(td, dst_x_size=1, dst_y_size=1,
raster_x=1, raster_y=1,
src_x_size=4, src_y_size=4)
# SrcRect 4x4 = 16 pixels > max_pixels=1 → output check fires.
with pytest.raises(ValueError, match="exceed"):
read_vrt(vrt, max_pixels=1)


class TestPerTileCheckStillRejectsCraftedHeader:
"""A pathological ``TileWidth``/``TileLength`` must still fail at
the per-tile sanity check, which uses ``MAX_PIXELS_DEFAULT``."""

def test_per_tile_check_caps_at_default(self, monkeypatch):
"""Lower ``MAX_PIXELS_DEFAULT`` to verify the per-tile call site
is wired to it (rather than to the caller's budget)."""
from xrspatial.geotiff import _reader as reader_mod

monkeypatch.setattr(reader_mod, "MAX_PIXELS_DEFAULT", 100)
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_normal_tile_source(td)
vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100)
# 256x256 tile > patched MAX_PIXELS_DEFAULT=100 → per-tile
# check fires regardless of caller's max_pixels (1e9 here).
with pytest.raises(PixelSafetyLimitError, match="65,536"):
read_vrt(vrt, max_pixels=1_000_000_000)
Loading