Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 14 additions & 5 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1560,9 +1560,14 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader,
raise ValueError(
f"Invalid tile dimensions: TileWidth={tw}, TileLength={th}")

# Reject crafted tile dims that would force huge per-tile allocations.
# A single tile's decoded bytes must also fit under the pixel budget.
_check_dimensions(tw, th, samples, max_pixels)
# Reject crafted tile dims (e.g. TileWidth = 2**31). This guards the
# TIFF header against malformed values; it is not the caller's output
# budget. The output-window check below uses ``max_pixels`` and is
# what enforces the user's per-call memory cap. The source-read path
# under ``read_vrt`` (#1796) relies on that output check to honour a
# small caller ``max_pixels`` against a normal-tile source; see
# #1823.
_check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT)

# Per-tile compressed-byte cap (issue #1664). Same env var as the
# HTTP path. mmap slicing is bounded by the file size, but the slice
Expand Down Expand Up @@ -2016,10 +2021,14 @@ def _fetch_decode_cog_http_tiles(
# A windowed HTTP read of a multi-billion-pixel COG only allocates
# the window, so capping the full image would reject legitimate
# tiled reads. The full-image cap still applies for whole-file
# reads (window is None). The single-tile budget always applies.
# reads (window is None). The per-tile dim check below guards the
# TIFF header against absurd ``TileWidth`` / ``TileLength`` values
# (e.g. 2**31) and uses ``MAX_PIXELS_DEFAULT`` so a caller's small
# ``max_pixels`` -- intended as an output-window budget -- does not
# reject normal 256x256 tiles. See #1823.
if window is None:
_check_dimensions(width, height, samples, max_pixels)
_check_dimensions(tw, th, samples, max_pixels)
_check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT)

# Reject malformed TIFFs whose declared tile grid exceeds the supplied
# TileOffsets length. See issue #1219.
Expand Down
50 changes: 48 additions & 2 deletions xrspatial/geotiff/_vrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,53 @@ def _codec_decode_exceptions() -> tuple[type[BaseException], ...]:
_ALLOWED_ROOTS_ENV = 'XRSPATIAL_VRT_ALLOWED_ROOTS'


# Cap on the VRT XML file read. VRTs are XML metadata only (pixel data lives
# in source TIFFs); a 50k-source VRT is around 25 MB, so 64 MiB is a safe
# default that still rejects pathological inputs without scanning the whole
# file into memory.
_MAX_XML_BYTES_ENV = 'XRSPATIAL_VRT_MAX_XML_BYTES'
_DEFAULT_MAX_XML_BYTES = 64 * 1024 * 1024


def _get_vrt_max_xml_bytes() -> int:
"""Return the cap on VRT XML file size, in bytes."""
raw = os.environ.get(_MAX_XML_BYTES_ENV)
if raw is None or raw == '':
return _DEFAULT_MAX_XML_BYTES
try:
value = int(raw)
except (TypeError, ValueError):
raise ValueError(
f"{_MAX_XML_BYTES_ENV} must be a positive integer, got "
f"{raw!r}")
if value <= 0:
raise ValueError(
f"{_MAX_XML_BYTES_ENV} must be a positive integer, got "
f"{value}")
return value


def _read_vrt_xml(vrt_path: str) -> str:
"""Read a VRT XML file with a bounded total size."""
cap = _get_vrt_max_xml_bytes()
chunks = []
total = 0
with open(vrt_path, 'rb') as f:
while True:
remaining = cap + 1 - total
chunk = f.read(min(65536, remaining))
if not chunk:
break
total += len(chunk)
if total > cap:
raise ValueError(
f"VRT XML at {vrt_path!r} exceeds the "
f"{cap:,}-byte cap. Raise the cap by setting "
f"{_MAX_XML_BYTES_ENV} if this file is legitimate.")
chunks.append(chunk)
return b''.join(chunks).decode('utf-8')


def _allowed_source_roots() -> list[str]:
"""Return the operator-supplied allowlist of trusted source roots.

Expand Down Expand Up @@ -637,8 +684,7 @@ def read_vrt(vrt_path: str, *, window=None,
"""
from ._reader import PixelSafetyLimitError, read_to_array

with open(vrt_path, 'r') as f:
xml_str = f.read()
xml_str = _read_vrt_xml(vrt_path)

vrt_dir = os.path.dirname(os.path.abspath(vrt_path))
vrt = parse_vrt(xml_str, vrt_dir)
Expand Down
113 changes: 113 additions & 0 deletions xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""Regression tests for #1823.

PR #1803 forwarded the caller's ``max_pixels`` to ``read_to_array`` inside
the VRT source loop so that a tiny VRT output could not force a huge
source decode (#1796). The output-window check enforces that. A separate
per-tile dimension check was incorrectly using the same ``max_pixels``
value, so a caller setting ``max_pixels`` as an output budget (e.g.
10,000) would also fail the per-tile sanity check on every normal source
whose default tile size is 256x256 (= 65,536 pixels).

The #1796 protection remains: the output-window check still catches a
tiny VRT output that asks for a large source window.
"""
from __future__ import annotations

import os
import tempfile

import numpy as np
import pytest

from xrspatial.geotiff import to_geotiff
from xrspatial.geotiff._reader import PixelSafetyLimitError
from xrspatial.geotiff._vrt import read_vrt


def _write_normal_tile_source(td: str) -> str:
"""10x10 uint8 source -- ``to_geotiff`` pads to a 256x256 tile."""
src = os.path.join(td, 'src.tif')
to_geotiff(np.zeros((10, 10), dtype=np.uint8), src, compression='none')
return src


def _write_vrt(td: str, *, dst_x_size: int, dst_y_size: int,
raster_x: int = 100, raster_y: int = 100,
src_x_size: int = 10, src_y_size: int = 10) -> str:
vrt = os.path.join(td, 'mosaic.vrt')
xml = (
f'<VRTDataset rasterXSize="{raster_x}" rasterYSize="{raster_y}">\n'
f' <VRTRasterBand dataType="Byte" band="1">\n'
f' <SimpleSource>\n'
f' <SourceFilename relativeToVRT="1">src.tif</SourceFilename>\n'
f' <SourceBand>1</SourceBand>\n'
f' <SrcRect xOff="0" yOff="0" '
f'xSize="{src_x_size}" ySize="{src_y_size}"/>\n'
f' <DstRect xOff="0" yOff="0" '
f'xSize="{dst_x_size}" ySize="{dst_y_size}"/>\n'
f' </SimpleSource>\n'
f' </VRTRasterBand>\n'
f'</VRTDataset>\n'
)
with open(vrt, 'w') as f:
f.write(xml)
return vrt


class TestPerTileCheckDoesNotUseCallerBudget:
"""Per-tile dim sanity must not reject normal 256x256 source tiles
when the caller's ``max_pixels`` is a small output-budget value."""

def test_normal_tile_source_with_small_max_pixels(self):
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_normal_tile_source(td)
vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100)
arr, _ = read_vrt(vrt, max_pixels=10_000)
assert arr.shape == (100, 100)

def test_normal_tile_source_with_tiny_max_pixels(self):
"""An output budget below a single tile must still succeed when
the requested output window itself fits."""
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_normal_tile_source(td)
# Output 5x5 = 25 pixels; max_pixels = 100 fits 25 with room.
vrt = _write_vrt(td, dst_x_size=5, dst_y_size=5,
raster_x=5, raster_y=5)
arr, _ = read_vrt(vrt, max_pixels=100)
assert arr.shape == (5, 5)


class TestOutputWindowCheckStillEnforced:
"""The output-window check at the source read still rejects an
over-budget read; the #1796 protection is preserved."""

def test_output_window_exceeds_max_pixels_still_rejected(self):
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
src = os.path.join(td, 'src.tif')
to_geotiff(np.arange(16, dtype=np.uint8).reshape(4, 4),
src, compression='none')
vrt = _write_vrt(td, dst_x_size=1, dst_y_size=1,
raster_x=1, raster_y=1,
src_x_size=4, src_y_size=4)
# SrcRect 4x4 = 16 pixels > max_pixels=1 → output check fires.
with pytest.raises(ValueError, match="exceed"):
read_vrt(vrt, max_pixels=1)


class TestPerTileCheckStillRejectsCraftedHeader:
"""A pathological ``TileWidth``/``TileLength`` must still fail at
the per-tile sanity check, which uses ``MAX_PIXELS_DEFAULT``."""

def test_per_tile_check_caps_at_default(self, monkeypatch):
"""Lower ``MAX_PIXELS_DEFAULT`` to verify the per-tile call site
is wired to it (rather than to the caller's budget)."""
from xrspatial.geotiff import _reader as reader_mod

monkeypatch.setattr(reader_mod, "MAX_PIXELS_DEFAULT", 100)
with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
_write_normal_tile_source(td)
vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100)
# 256x256 tile > patched MAX_PIXELS_DEFAULT=100 → per-tile
# check fires regardless of caller's max_pixels (1e9 here).
with pytest.raises(PixelSafetyLimitError, match="65,536"):
read_vrt(vrt, max_pixels=1_000_000_000)
94 changes: 94 additions & 0 deletions xrspatial/geotiff/tests/test_vrt_xml_size_cap_1815.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""VRT XML reads must be bounded to avoid unbounded memory allocation.

Regression test for issue #1815: ``read_vrt`` previously called
``f.read()`` on the VRT XML path with no size limit, so a multi-gigabyte
XML file would consume all that memory before parsing. The fix adds a
64 MiB default cap, overridable via ``XRSPATIAL_VRT_MAX_XML_BYTES``.
"""
from __future__ import annotations

import os

import numpy as np
import pytest

from xrspatial.geotiff import to_geotiff
from xrspatial.geotiff._vrt import read_vrt


def _write_source(td: str) -> str:
src_path = os.path.join(td, 'tmp_1815_src.tif')
to_geotiff(np.zeros((10, 10), dtype=np.uint8), src_path,
compression='none')
return src_path


def _write_vrt(td: str, *, pad_bytes: int = 0) -> str:
"""Write a VRT, optionally padded with a large XML comment."""
vrt_path = os.path.join(td, 'tmp_1815_mosaic.vrt')
comment = ''
if pad_bytes > 0:
comment = '<!-- ' + ('x' * pad_bytes) + ' -->\n'
vrt_xml = (
'<VRTDataset rasterXSize="10" rasterYSize="10">\n'
+ comment +
' <VRTRasterBand dataType="Byte" band="1">\n'
' <SimpleSource>\n'
' <SourceFilename relativeToVRT="1">'
'tmp_1815_src.tif</SourceFilename>\n'
' <SourceBand>1</SourceBand>\n'
' <SrcRect xOff="0" yOff="0" xSize="10" ySize="10"/>\n'
' <DstRect xOff="0" yOff="0" xSize="10" ySize="10"/>\n'
' </SimpleSource>\n'
' </VRTRasterBand>\n'
'</VRTDataset>\n'
)
with open(vrt_path, 'w') as f:
f.write(vrt_xml)
return vrt_path


def test_small_vrt_parses_under_default_cap(tmp_path):
"""A normal-sized VRT parses successfully with the default cap."""
td = str(tmp_path)
_write_source(td)
vrt_path = _write_vrt(td)
arr, _ = read_vrt(vrt_path)
assert arr.shape == (10, 10)


def test_oversized_vrt_raises_value_error(tmp_path, monkeypatch):
"""A VRT padded past the cap raises ValueError naming the cap and env var."""
td = str(tmp_path)
_write_source(td)
# Set a small cap (1 KiB) and pad the comment past it.
monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', '1024')
vrt_path = _write_vrt(td, pad_bytes=4096)
with pytest.raises(ValueError) as exc_info:
read_vrt(vrt_path)
msg = str(exc_info.value)
assert 'XRSPATIAL_VRT_MAX_XML_BYTES' in msg
assert '1,024' in msg


def test_raising_cap_lets_padded_vrt_parse(tmp_path, monkeypatch):
"""Setting the env var higher allows a padded VRT to parse."""
td = str(tmp_path)
_write_source(td)
vrt_path = _write_vrt(td, pad_bytes=4096)
# Default cap of 64 MiB is more than enough; verify with an explicit
# higher cap too.
monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', str(1024 * 1024))
arr, _ = read_vrt(vrt_path)
assert arr.shape == (10, 10)


@pytest.mark.parametrize('bad_value', ['not_a_number', '0', '-1', '-1024'])
def test_invalid_cap_raises_value_error(tmp_path, monkeypatch, bad_value):
"""Non-numeric, zero, or negative cap values produce a clear error."""
td = str(tmp_path)
_write_source(td)
vrt_path = _write_vrt(td)
monkeypatch.setenv('XRSPATIAL_VRT_MAX_XML_BYTES', bad_value)
with pytest.raises(ValueError, match='XRSPATIAL_VRT_MAX_XML_BYTES'):
read_vrt(vrt_path)
Loading