diff --git a/src/visitor/jsonvectorassembler.ts b/src/visitor/jsonvectorassembler.ts index f89dbdda..fc0ddc62 100644 --- a/src/visitor/jsonvectorassembler.ts +++ b/src/visitor/jsonvectorassembler.ts @@ -207,8 +207,8 @@ function* binaryToString(vector: Vector | Vector | Vector( + Ctor: new (n: number) => T, + values: ArrayLike, +): T { + const pad = 3; + const backing = new Ctor(values.length + pad * 2); + backing.set(values as any, pad); + return backing.subarray(pad, pad + values.length); +} + +describeHelper('bigNumsToStrings', () => { + test('formats unsigned 64-bit values (stride 2)', () => { + const values = [0n, 1n, 42n, 0xFFFFFFFFn, 0x1_0000_0000n, 1234567890123456789n, 0xFFFFFFFFFFFFFFFFn]; + const arr = BigUint64Array.from(values); + expect([...bigNumsToStrings(arr, 2)]).toEqual(values.map(String)); + }); + + test('formats signed 64-bit values as their unsigned two\'s-complement (stride 2)', () => { + const values = [-1n, -2n, -1234567890n, 5n]; + const arr = BigInt64Array.from(values); + const expected = values.map((v) => String(BigInt.asUintN(64, v))); + expect([...bigNumsToStrings(arr, 2)]).toEqual(expected); + // sanity: -1 is the all-ones 64-bit word + expect(expected[0]).toBe('18446744073709551615'); + }); + + test('formats unsigned 128-bit values (stride 4)', () => { + // little-endian u32 words: value = w0 + w1*2^32 + w2*2^64 + w3*2^96 + const one = [1, 0, 0, 0]; + const twoTo96 = [0, 0, 0, 1]; + const arr = Uint32Array.from([...one, ...twoTo96]); + expect([...bigNumsToStrings(arr, 4)]).toEqual([ + '1', + String(2n ** 96n), + ]); + }); + + test('yields one string per group of `stride` words', () => { + const arr = BigUint64Array.from([10n, 20n, 30n]); + expect([...bigNumsToStrings(arr, 2)]).toHaveLength(3); + }); + + test('honours byteOffset for 64-bit views (regression)', () => { + const values = [0n, 1n, 1234567890123456789n, 0xFFFFFFFFFFFFFFFFn]; + const view = atOffset(BigUint64Array, values); + + // precondition: the view really starts past the buffer origin + expect(view.byteOffset).toBeGreaterThan(0); + + expect([...bigNumsToStrings(view, 2)]).toEqual(values.map(String)); + }); + + test('offset view and fresh array agree for signed 64-bit', () => { + const values = [-1n, 9223372036854775807n, -9223372036854775808n, 0n]; + const fresh = BigInt64Array.from(values); + const view = atOffset(BigInt64Array, values); + + expect(view.byteOffset).toBeGreaterThan(0); + expect([...bigNumsToStrings(view, 2)]).toEqual([...bigNumsToStrings(fresh, 2)]); + }); +}); diff --git a/test/unit/ipc/writer/json-offset-views-tests.ts b/test/unit/ipc/writer/json-offset-views-tests.ts new file mode 100644 index 00000000..b368f5d1 --- /dev/null +++ b/test/unit/ipc/writer/json-offset-views-tests.ts @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// JSONVectorAssembler must serialize a vector the same way regardless of where +// its backing typed arrays sit in their buffers. For each affected type we take +// a freshly-generated Vector (byteOffset === 0), rewrap every backing typed +// array in a subarray view at byteOffset > 0, and assert both serialize to +// identical JSON. + +import { + Data, DataType, Field, RecordBatch, Schema, Struct, +} from 'apache-arrow'; +import { JSONVectorAssembler } from 'apache-arrow/visitor/jsonvectorassembler'; +import * as generate from '../../../generate-test-data.js'; + +/** + * Return a copy of `arr` whose data lives at a nonzero byteOffset inside a + * larger underlying buffer. Equivalent to the state of a typed array that + * came out of `tableFromIPC` — same logical contents, shifted memory layout. + */ +function padBuffer(arr: T | undefined): T | undefined { + if (!arr) return arr; + const Ctor = arr.constructor as new (n: number) => any; + const padding = 4; + const padded = new Ctor((arr as any).length + padding * 2); + padded.set(arr, padding); + return padded.subarray(padding, padding + (arr as any).length) as T; +} + +/** + * Clone a Data so every backing typed array sits at byteOffset > 0. + * Recurses into children so nested types stay consistent. + */ +function withOffsetViews(d: Data): Data { + const newBuffers = Array.from(d.buffers as unknown as Array, padBuffer); + const newChildren = d.children.map((c) => withOffsetViews(c)); + return d.clone(d.type, d.offset, d.length, d.nullCount, newBuffers as any, newChildren); +} + +/** + * Wrap a Data in a single-field RecordBatch so we can hand it to assemble(). + */ +function batchOf(type: T, data: Data): RecordBatch { + const field = new Field('x', type, true); + const schema = new Schema([field]); + const structType = new Struct([field]); + const root = data.clone(structType as any, 0, data.length, 0, [undefined, undefined, undefined, undefined] as any, [data]); + return new RecordBatch(schema, root as Data); +} + +// Every affected callsite of bigNumsToStrings, one entry per type. Adding +// a new affected type is a one-line addition here. +const affectedTypeCases: Array<{ name: string; gen: () => generate.GeneratedVector }> = [ + { name: 'Int64', gen: () => generate.int64() }, + { name: 'Uint64', gen: () => generate.uint64() }, + { name: 'DateMillisecond', gen: () => generate.dateMillisecond() }, + { name: 'TimestampSecond', gen: () => generate.timestampSecond() }, + { name: 'TimestampMillisecond', gen: () => generate.timestampMillisecond() }, + { name: 'TimestampMicrosecond', gen: () => generate.timestampMicrosecond() }, + { name: 'TimestampNanosecond', gen: () => generate.timestampNanosecond() }, + { name: 'TimeMicrosecond', gen: () => generate.timeMicrosecond() }, + { name: 'TimeNanosecond', gen: () => generate.timeNanosecond() }, + { name: 'DurationSecond', gen: () => generate.durationSecond() }, + { name: 'DurationMillisecond', gen: () => generate.durationMillisecond() }, + { name: 'DurationMicrosecond', gen: () => generate.durationMicrosecond() }, + { name: 'DurationNanosecond', gen: () => generate.durationNanosecond() }, + { name: 'Decimal', gen: () => generate.decimal() }, + { name: 'LargeUtf8', gen: () => generate.largeUtf8() }, + { name: 'LargeBinary', gen: () => generate.largeBinary() }, +]; + +// JSONVectorAssembler is an internal (non-public) export. Single-bundle build +// targets such as the UMD bundle collapse every `apache-arrow/*` deep import to +// the public bundle, which doesn't re-export it. +// So, skip the test on UMD builds (covered there via the public API in +// json-writer-tests.ts). +if (!JSONVectorAssembler) { + console.warn( + 'Skipping "JSONVectorAssembler offset-view safety" test under UMD build.' + ); +} +const describeAssembler = JSONVectorAssembler ? describe : describe.skip; + +describeAssembler('JSONVectorAssembler offset-view safety', () => { + for (const { name, gen } of affectedTypeCases) { + test(name, () => { + const { vector } = gen(); + const fresh = vector.data[0]; + const viewed = withOffsetViews(fresh); + + // Sanity: the rewrap really did shift at least one buffer + // past byteOffset 0, so this run actually exercises the bug + // precondition. + const buffers = Array.from(viewed.buffers as unknown as Array); + expect(buffers.some((b) => b && b.byteOffset > 0)).toBe(true); + + const [[jsonFresh]] = JSONVectorAssembler.assemble(batchOf(fresh.type, fresh)); + const [[jsonViewed]] = JSONVectorAssembler.assemble(batchOf(fresh.type, viewed)); + expect(jsonViewed).toEqual(jsonFresh); + }); + } +}); diff --git a/test/unit/ipc/writer/json-writer-tests.ts b/test/unit/ipc/writer/json-writer-tests.ts index 29b66515..c7d48ee5 100644 --- a/test/unit/ipc/writer/json-writer-tests.ts +++ b/test/unit/ipc/writer/json-writer-tests.ts @@ -24,7 +24,9 @@ import { ArrowJSONLike, RecordBatchJSONWriter, RecordBatchReader, - Table + Table, + tableFromIPC, + tableToIPC, } from 'apache-arrow'; describe('RecordBatchJSONWriter', () => { @@ -38,12 +40,16 @@ describe('RecordBatchJSONWriter', () => { function testJSONWriter(table: Table, name: string) { describe(`should write the Arrow IPC JSON format (${name})`, () => { - test(`Table`, validateTable.bind(0, table)); + test(`Table`, validateJSONWriter.bind(0, table)); }); } -async function validateTable(source: Table) { - const writer = RecordBatchJSONWriter.writeAll(source); +async function validateJSONWriter(source: Table) { + // Route through binary IPC so the JSON writer sees realistic typed-array + // views with byteOffset > 0 (as `tableFromIPC` produces), not just fresh + // allocations from the generator. + const loaded = tableFromIPC(tableToIPC(source, 'stream')); + const writer = RecordBatchJSONWriter.writeAll(loaded); const string = await writer.toString(); const json = JSON.parse(string) as ArrowJSONLike; const result = new Table(RecordBatchReader.from(json));