diff --git a/python/python/tests/test_dataset.py b/python/python/tests/test_dataset.py index a72a7613241..218cdbb9c54 100644 --- a/python/python/tests/test_dataset.py +++ b/python/python/tests/test_dataset.py @@ -964,6 +964,49 @@ def test_take_with_projection(tmp_path: Path): assert table3 == table2 +def test_take_with_json_column(tmp_path: Path): + """Test that take/take_rows return JSON columns in Arrow JSON format (Utf8). + + Previously, take would return lance.json (LargeBinary) instead of + arrow.json (Utf8), which is the user-facing format. + """ + json_type = pa.json_() + data = pa.table( + { + "id": pa.array(range(10), type=pa.int64()), + "meta": pa.array( + [f'{{"val":{i}}}' for i in range(10)], + type=json_type, + ), + } + ) + base_dir = tmp_path / "test_take_json" + lance.write_dataset(data, base_dir) + dataset = lance.dataset(base_dir) + + # Dataset.take should return arrow.json type + result = dataset.take([2, 5, 8]) + meta_field = result.schema.field("meta") + assert meta_field.type == pa.utf8() or meta_field.type == pa.json_(), ( + f"Expected arrow.json (Utf8), got {meta_field.type}" + ) + metas = result.column("meta").to_pylist() + assert metas[0] == '{"val":2}' + assert metas[1] == '{"val":5}' + assert metas[2] == '{"val":8}' + + # Dataset._take_rows should also return arrow.json type + result = dataset._take_rows([0, 3, 9]) + meta_field = result.schema.field("meta") + assert meta_field.type == pa.utf8() or meta_field.type == pa.json_(), ( + f"Expected arrow.json (Utf8), got {meta_field.type}" + ) + metas = result.column("meta").to_pylist() + assert metas[0] == '{"val":0}' + assert metas[1] == '{"val":3}' + assert metas[2] == '{"val":9}' + + def test_filter(tmp_path: Path): table = pa.Table.from_pydict({"a": range(100), "b": range(100)}) base_dir = tmp_path / "test" diff --git a/python/python/tests/test_fragment.py b/python/python/tests/test_fragment.py index bdbbac96ac5..b05888df31f 100644 --- a/python/python/tests/test_fragment.py +++ b/python/python/tests/test_fragment.py @@ -835,3 +835,33 @@ def test_fragment_delete_rows(tmp_path: Path): frag.delete_rows([100]) with pytest.raises(ValueError, match="out of range"): frag.delete_rows([0, 50, 1000]) + + +def test_fragment_take_with_json_column(tmp_path): + """Test that FileFragment.take returns JSON columns in Arrow JSON format.""" + json_type = pa.json_() + data = pa.table( + { + "id": pa.array(range(10), type=pa.int64()), + "meta": pa.array( + [f'{{"val":{i}}}' for i in range(10)], + type=json_type, + ), + } + ) + dataset_uri = tmp_path / "test_frag_take_json" + dataset = lance.write_dataset(data, dataset_uri) + + fragment = dataset.get_fragment(0) + result = fragment.take([1, 4, 7]) + + # Should return arrow.json type (Utf8), not lance.json (LargeBinary) + meta_field = result.schema.field("meta") + assert meta_field.type == pa.utf8() or meta_field.type == pa.json_(), ( + f"Expected arrow.json (Utf8), got {meta_field.type}" + ) + + metas = result.column("meta").to_pylist() + assert metas[0] == '{"val":1}' + assert metas[1] == '{"val":4}' + assert metas[2] == '{"val":7}' diff --git a/rust/lance/src/dataset/fragment.rs b/rust/lance/src/dataset/fragment.rs index 4ea1b51f073..175b1d7d2de 100644 --- a/rust/lance/src/dataset/fragment.rs +++ b/rust/lance/src/dataset/fragment.rs @@ -1572,8 +1572,22 @@ impl FileFragment { }; // Then call take rows - self.take_rows(&row_ids, projection, false, false, false, false) - .await + let batch = self + .take_rows(&row_ids, projection, false, false, false, false) + .await?; + + // Convert Lance JSON columns (LargeBinary/JSONB) back to Arrow JSON (Utf8) + // for user-facing output. + if batch + .schema() + .fields() + .iter() + .any(|f| lance_arrow::json::is_json_field(f) || lance_arrow::json::has_json_fields(f)) + { + Ok(lance_arrow::json::convert_lance_json_to_arrow(&batch)?) + } else { + Ok(batch) + } } /// Get the deletion vector for this fragment, using the cache if available.