Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions python/python/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,49 @@ def test_take_with_projection(tmp_path: Path):
assert table3 == table2


def test_take_with_json_column(tmp_path: Path):
"""Test that take/take_rows return JSON columns in Arrow JSON format (Utf8).

Previously, take would return lance.json (LargeBinary) instead of
arrow.json (Utf8), which is the user-facing format.
"""
json_type = pa.json_()
data = pa.table(
{
"id": pa.array(range(10), type=pa.int64()),
"meta": pa.array(
[f'{{"val":{i}}}' for i in range(10)],
type=json_type,
),
}
)
base_dir = tmp_path / "test_take_json"
lance.write_dataset(data, base_dir)
dataset = lance.dataset(base_dir)

# Dataset.take should return arrow.json type
result = dataset.take([2, 5, 8])
meta_field = result.schema.field("meta")
assert meta_field.type == pa.utf8() or meta_field.type == pa.json_(), (
f"Expected arrow.json (Utf8), got {meta_field.type}"
)
metas = result.column("meta").to_pylist()
assert metas[0] == '{"val":2}'
assert metas[1] == '{"val":5}'
assert metas[2] == '{"val":8}'

# Dataset._take_rows should also return arrow.json type
result = dataset._take_rows([0, 3, 9])
meta_field = result.schema.field("meta")
assert meta_field.type == pa.utf8() or meta_field.type == pa.json_(), (
f"Expected arrow.json (Utf8), got {meta_field.type}"
)
metas = result.column("meta").to_pylist()
assert metas[0] == '{"val":0}'
assert metas[1] == '{"val":3}'
assert metas[2] == '{"val":9}'


def test_filter(tmp_path: Path):
table = pa.Table.from_pydict({"a": range(100), "b": range(100)})
base_dir = tmp_path / "test"
Expand Down
30 changes: 30 additions & 0 deletions python/python/tests/test_fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,3 +835,33 @@ def test_fragment_delete_rows(tmp_path: Path):
frag.delete_rows([100])
with pytest.raises(ValueError, match="out of range"):
frag.delete_rows([0, 50, 1000])


def test_fragment_take_with_json_column(tmp_path):
"""Test that FileFragment.take returns JSON columns in Arrow JSON format."""
json_type = pa.json_()
data = pa.table(
{
"id": pa.array(range(10), type=pa.int64()),
"meta": pa.array(
[f'{{"val":{i}}}' for i in range(10)],
type=json_type,
),
}
)
dataset_uri = tmp_path / "test_frag_take_json"
dataset = lance.write_dataset(data, dataset_uri)

fragment = dataset.get_fragment(0)
result = fragment.take([1, 4, 7])

# Should return arrow.json type (Utf8), not lance.json (LargeBinary)
meta_field = result.schema.field("meta")
assert meta_field.type == pa.utf8() or meta_field.type == pa.json_(), (
f"Expected arrow.json (Utf8), got {meta_field.type}"
)

metas = result.column("meta").to_pylist()
assert metas[0] == '{"val":1}'
assert metas[1] == '{"val":4}'
assert metas[2] == '{"val":7}'
18 changes: 16 additions & 2 deletions rust/lance/src/dataset/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1572,8 +1572,22 @@ impl FileFragment {
};

// Then call take rows
self.take_rows(&row_ids, projection, false, false, false, false)
.await
let batch = self
.take_rows(&row_ids, projection, false, false, false, false)
.await?;

// Convert Lance JSON columns (LargeBinary/JSONB) back to Arrow JSON (Utf8)
// for user-facing output.
if batch
.schema()
.fields()
.iter()
.any(|f| lance_arrow::json::is_json_field(f) || lance_arrow::json::has_json_fields(f))
{
Ok(lance_arrow::json::convert_lance_json_to_arrow(&batch)?)
} else {
Ok(batch)
}
}

/// Get the deletion vector for this fragment, using the cache if available.
Expand Down
Loading