From d81496506b0a51858e1b1bc78586d2a0062a9951 Mon Sep 17 00:00:00 2001
From: d-laub <david@standardmodel.bio>
Date: Thu, 25 Jun 2026 01:53:47 +0000
Subject: [PATCH] fix(indexing): SpliceIndexer.parse_idx double-applies
 sample-subset map
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a Dataset has been sample-subsetted (non-identity sample_subset_idxs)
AND output is spliced, SpliceIndexer.parse_idx applied the sample-subset
map twice:

1. Via self.dsi._s_idx[s_idx] within parse_idx, which maps output-space
   sample positions to on-disk positions.
2. Again via self.dsi.parse_idx((r_idx, s_idx)) at the end, which applies
   _s_idx a second time to what are already on-disk positions.

With the MMRF consensus dataset this caused sample MMRF_2702 (svar pos 54,
GVL sorted 625) to receive MMRF_1395's NRAS G12D mutation because
sample_subset_idxs[625] mapped to MMRF_1395.

Fix: after the unravel/exon-expand step, r_idx and s_idx already hold
full-dataset positions. Compute the flat storage index directly via
ravel_multi_index using full_region_idxs, bypassing the second _s_idx
application that parse_idx would otherwise inject.

Adds regression test test_splice_indexer_subset.py that constructs a
SpliceIndexer with sample_subset_idxs=[2,4] over 5 on-disk samples,
verifies all three index paths (slice×slice, scalar×scalar, no-subset)
return the correct on-disk sample positions.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/genvarloader/_dataset/_indexing.py     |   9 +-
 .../dataset/test_splice_indexer_subset.py     | 276 ++++++++++++++++++
 2 files changed, 284 insertions(+), 1 deletion(-)
 create mode 100644 tests/unit/dataset/test_splice_indexer_subset.py

diff --git a/python/genvarloader/_dataset/_indexing.py b/python/genvarloader/_dataset/_indexing.py
index 696e9b3f..6d728bec 100644
--- a/python/genvarloader/_dataset/_indexing.py
+++ b/python/genvarloader/_dataset/_indexing.py
@@ -448,7 +448,14 @@ def parse_idx(self, idx: StrIdx | tuple[StrIdx] | tuple[StrIdx, StrIdx]):
         r_idx = r_idx.to_packed().data
         s_idx = s_idx.repeat(lengths)
 
-        ds_idx, *_ = self.dsi.parse_idx((r_idx, s_idx))
+        # At this point r_idx contains per-exon region positions (indices into
+        # full_region_idxs) and s_idx contains already-mapped on-disk sample
+        # indices (full_sample_idxs[sample_subset_idxs[...]]).  Calling
+        # self.dsi.parse_idx here would apply _s_idx a second time, corrupting
+        # the sample mapping when sample_subset_idxs is non-identity.  Instead,
+        # compute the flat storage index directly:
+        r_flat = self.dsi.full_region_idxs[r_idx]
+        ds_idx = np.ravel_multi_index((r_flat, s_idx), self.dsi.full_shape).ravel()
 
         return ds_idx, squeeze, out_reshape, offsets, n_rows_sel, n_samples_sel
 
diff --git a/tests/unit/dataset/test_splice_indexer_subset.py b/tests/unit/dataset/test_splice_indexer_subset.py
new file mode 100644
index 00000000..84d1cf95
--- /dev/null
+++ b/tests/unit/dataset/test_splice_indexer_subset.py
@@ -0,0 +1,276 @@
+"""Regression test for SpliceIndexer.parse_idx double-applying the sample-subset map.
+
+Bug: when a Dataset has been sample-subsetted (non-identity ``sample_subset_idxs``)
+AND output is spliced, ``SpliceIndexer.parse_idx`` applied the sample-subset map
+twice:
+
+1. Once inside the body of ``parse_idx`` via ``self.dsi._s_idx[s_idx]``.
+2. Again inside the ``self.dsi.parse_idx((r_idx, s_idx))`` call at the end,
+   which applies ``_s_idx`` a second time.
+
+Concrete symptom: requesting sample at output position *p* fetches the data for
+``full_sample_idxs[sample_subset_idxs[full_sample_idxs[sample_subset_idxs[p]]]]``
+instead of the correct ``full_sample_idxs[sample_subset_idxs[p]]``.
+
+With the MMRF consensus dataset, MMRF_2702 (svar pos 54, GVL sorted 625) received
+MMRF_1395's NRAS G12D mutation because ``sample_subset_idxs[625]`` mapped to
+MMRF_1395.
+
+This test builds a tiny ``SpliceIndexer`` with a non-identity sample subset and
+verifies that ``parse_idx`` resolves each requested (row, sample) pair to the
+correct flat storage index — i.e. that the sample-subset map is applied exactly
+once, and that each (row, sample) pair resolves to the *correct* sample's storage
+position.
+"""
+
+from __future__ import annotations
+
+import awkward as ak
+import numpy as np
+import pytest
+from hirola import HashTable
+from seqpro.rag import Ragged
+
+from genvarloader._dataset._indexing import DatasetIndexer, SpliceIndexer
+from genvarloader._dataset._splice import SpliceMap
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_ragged(exon_lists: list[list[int]]) -> Ragged:
+    """Build a seqpro Ragged from a list-of-lists of integer region indices."""
+    return Ragged(ak.Array(exon_lists, check_valid=True))
+
+
+def _make_splice_indexer(
+    *,
+    n_full_regions: int,
+    n_full_samples: int,
+    # Which on-disk sample positions are in the subset (output pos -> on-disk pos)
+    sample_subset_idxs: list[int],
+    # Ragged list of per-transcript exon region indices (0-indexed into regions)
+    exon_lists: list[list[int]],
+    # On-disk region → storage row (identity here for simplicity)
+    region_storage_idxs: list[int] | None = None,
+    # On-disk sample → storage col (identity here for simplicity)
+    sample_storage_idxs: list[int] | None = None,
+) -> SpliceIndexer:
+    """Construct a ``SpliceIndexer`` with explicit on-disk layouts and a
+    non-trivial sample subset."""
+
+    if region_storage_idxs is None:
+        region_storage_idxs = list(range(n_full_regions))
+    if sample_storage_idxs is None:
+        sample_storage_idxs = list(range(n_full_samples))
+
+    full_r = np.array(region_storage_idxs, dtype=np.intp)
+    full_s = np.array(sample_storage_idxs, dtype=np.intp)
+    sample_names = [f"s{i}" for i in range(n_full_samples)]
+    _samples = np.array(sample_names, dtype=np.str_)
+    s2i_map = HashTable(max=len(_samples) * 2, dtype=_samples.dtype)
+    s2i_map.add(_samples)
+
+    dsi = DatasetIndexer(
+        full_region_idxs=full_r,
+        full_sample_idxs=full_s,
+        s2i_map=s2i_map,
+        r2i_map=None,
+        region_subset_idxs=None,
+        sample_subset_idxs=np.array(sample_subset_idxs, dtype=np.intp),
+    )
+
+    # Build the SpliceMap from exon_lists.
+    n_transcripts = len(exon_lists)
+    transcript_names = np.array([f"T{i}" for i in range(n_transcripts)], dtype=np.str_)
+    transcript_name_map = HashTable(max=n_transcripts * 2, dtype=transcript_names.dtype)
+    transcript_name_map.add(transcript_names)
+
+    splice_map_ragged = _make_ragged(exon_lists)
+
+    sm = SpliceMap(
+        names=transcript_name_map,
+        splice_map=splice_map_ragged,
+        full_splice_map=splice_map_ragged,
+        row_idxs=np.arange(n_transcripts, dtype=np.intp),
+        row_subset_idxs=None,
+    )
+
+    return SpliceIndexer(map=sm, dsi=dsi)
+
+
+# ---------------------------------------------------------------------------
+# Fixture
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def splice_idxer_with_subset():
+    """A SpliceIndexer with:
+
+    * 4 full regions (region positions 0-3 map to storage rows 0-3, identity)
+    * 5 full samples (sample positions 0-4 map to storage cols 0-4, identity)
+    * sample subset: output positions [0, 1] map to on-disk positions [2, 4]
+      (i.e. sample_subset_idxs = [2, 4])
+    * 2 transcripts:
+        T0 = exons at region positions [0, 1]
+        T1 = exons at region positions [2, 3]
+    """
+    return _make_splice_indexer(
+        n_full_regions=4,
+        n_full_samples=5,
+        sample_subset_idxs=[2, 4],
+        exon_lists=[[0, 1], [2, 3]],
+    )
+
+
+# ---------------------------------------------------------------------------
+# Core regression test — sample-subset map applied exactly once
+# ---------------------------------------------------------------------------
+
+
+def test_parse_idx_sample_subset_applied_once(splice_idxer_with_subset):
+    """SpliceIndexer.parse_idx must apply the sample-subset map exactly once.
+
+    When user requests all transcripts × all subset-samples (slice, slice), each
+    (transcript, sample) pair should resolve to the correct flat storage index:
+
+        storage_idx = ravel_multi_index(
+            (full_region_idxs[exon_region_pos], full_sample_idxs[sample_subset_idxs[output_pos]]),
+            full_shape,
+        )
+
+    With the bug, the second call to ``dsi.parse_idx`` re-applies _s_idx,
+    turning ``full_sample_idxs[sample_subset_idxs[output_pos]]`` into
+    ``full_sample_idxs[sample_subset_idxs[full_sample_idxs[sample_subset_idxs[output_pos]]]]``
+    — fetching the wrong sample entirely (or causing an IndexError if the
+    double-mapped value is out of bounds for the 2-element subset).
+    """
+    si = splice_idxer_with_subset
+
+    # full_shape = (n_regions=4, n_full_samples=5)
+    assert si.dsi.full_shape == (4, 5)
+    # shape = (n_transcripts=2, n_subset_samples=2)
+    assert si.shape == (2, 2)
+
+    # Request all transcripts, all subset samples.
+    ds_idx, squeeze, out_reshape, offsets, n_rows, n_samples = si.parse_idx(
+        (slice(None), slice(None))
+    )
+
+    # Decode the returned flat storage indices.
+    n_full_regions = 4
+    n_full_samples = 5
+    full_shape = (n_full_regions, n_full_samples)
+
+    r_flat, s_flat = np.unravel_index(ds_idx, full_shape)
+
+    # Every returned region index must be in {0, 1, 2, 3}.
+    assert set(r_flat.tolist()).issubset({0, 1, 2, 3}), (
+        f"Unexpected region storage indices: {r_flat.tolist()!r}"
+    )
+
+    # KEY ASSERTION: every returned on-disk sample index must be 2 or 4.
+    # sample_subset_idxs = [2, 4] maps output pos 0 → on-disk 2, pos 1 → on-disk 4.
+    # With the double-application bug, the code re-applies _s_idx (which is
+    # [2, 4]) to the already-mapped values 2 and 4:
+    #   _s_idx[2] is out-of-range for a 2-element array → IndexError or wrap.
+    # Even if it doesn't crash, the on-disk indices would be wrong.
+    unexpected_samples = set(s_flat.tolist()) - {2, 4}
+    assert not unexpected_samples, (
+        f"SpliceIndexer.parse_idx returned wrong on-disk sample indices {set(s_flat.tolist())}; "
+        f"expected only {{2, 4}} (subset_idxs=[2,4] maps output positions [0,1] → on-disk [2,4]). "
+        f"Got: {s_flat.tolist()!r}.  "
+        f"This indicates the sample-subset map was applied more than once."
+    )
+
+    # Also check that BOTH on-disk samples appear (not just one).
+    assert 2 in s_flat, "On-disk sample 2 missing from output"
+    assert 4 in s_flat, "On-disk sample 4 missing from output"
+
+    # Region correctness: T0 exons {0,1} and T1 exons {2,3} must appear.
+    assert 0 in r_flat and 1 in r_flat, "T0 exon regions 0 and 1 must both appear"
+    assert 2 in r_flat and 3 in r_flat, "T1 exon regions 2 and 3 must both appear"
+
+
+def test_parse_idx_single_sample_from_subset(splice_idxer_with_subset):
+    """Selecting one subset sample by output index returns that sample's storage index."""
+    si = splice_idxer_with_subset
+
+    # Request transcript T0 (idx 0), output sample 1 (→ on-disk sample 4).
+    ds_idx, squeeze, out_reshape, offsets, n_rows, n_samples = si.parse_idx((0, 1))
+
+    n_full_regions = 4
+    n_full_samples = 5
+    full_shape = (n_full_regions, n_full_samples)
+
+    r_flat, s_flat = np.unravel_index(ds_idx, full_shape)
+
+    # All returned on-disk sample indices must be 4 (output pos 1 → on-disk 4).
+    assert (s_flat == 4).all(), (
+        f"Expected all on-disk sample indices to be 4, got {s_flat.tolist()!r}.  "
+        f"Double-applying the subset map would produce a wrong value or IndexError."
+    )
+    # T0 has exons at region positions 0 and 1.
+    assert set(r_flat.tolist()) == {0, 1}, (
+        f"Expected region indices {{0, 1}} for T0, got {r_flat.tolist()!r}"
+    )
+
+
+def test_parse_idx_no_subset_unaffected():
+    """Without a sample subset the fix must not change existing behaviour.
+
+    Uses an identity-subset (sample_subset_idxs = None effectively, achieved by
+    constructing the DSI without subset) to confirm the non-subset path still works.
+    """
+    # Build a DSI *without* any sample_subset_idxs (the None path).
+    n_full_regions = 3
+    n_full_samples = 3
+    full_r = np.arange(n_full_regions, dtype=np.intp)
+    full_s = np.arange(n_full_samples, dtype=np.intp)
+    sample_names = [f"s{i}" for i in range(n_full_samples)]
+    _samples = np.array(sample_names, dtype=np.str_)
+    s2i_map = HashTable(max=len(_samples) * 2, dtype=_samples.dtype)
+    s2i_map.add(_samples)
+
+    dsi = DatasetIndexer(
+        full_region_idxs=full_r,
+        full_sample_idxs=full_s,
+        s2i_map=s2i_map,
+        r2i_map=None,
+        region_subset_idxs=None,
+        sample_subset_idxs=None,  # no subset
+    )
+
+    exon_lists = [[0, 1], [1, 2]]
+    n_transcripts = 2
+    transcript_names = np.array([f"T{i}" for i in range(n_transcripts)], dtype=np.str_)
+    transcript_name_map = HashTable(max=n_transcripts * 2, dtype=transcript_names.dtype)
+    transcript_name_map.add(transcript_names)
+
+    splice_map_ragged = _make_ragged(exon_lists)
+    sm = SpliceMap(
+        names=transcript_name_map,
+        splice_map=splice_map_ragged,
+        full_splice_map=splice_map_ragged,
+        row_idxs=np.arange(n_transcripts, dtype=np.intp),
+        row_subset_idxs=None,
+    )
+    si = SpliceIndexer(map=sm, dsi=dsi)
+
+    ds_idx, squeeze, out_reshape, offsets, n_rows, n_samples = si.parse_idx(
+        (slice(None), slice(None))
+    )
+
+    full_shape = (n_full_regions, n_full_samples)
+    r_flat, s_flat = np.unravel_index(ds_idx, full_shape)
+
+    # All samples must appear and only valid on-disk samples (0, 1, 2).
+    unexpected = set(s_flat.tolist()) - {0, 1, 2}
+    assert not unexpected, f"Unexpected on-disk sample indices: {unexpected}"
+    # T0 exons {0,1}, T1 exons {1,2}
+    assert set(r_flat.tolist()).issubset({0, 1, 2}), (
+        f"Unexpected region indices: {r_flat.tolist()!r}"
+    )