diff --git a/packages/compiler/scripts/parseLiquid.js b/packages/compiler/scripts/parseLiquid.js index 4c2f35d7..b6372281 100644 --- a/packages/compiler/scripts/parseLiquid.js +++ b/packages/compiler/scripts/parseLiquid.js @@ -16,10 +16,9 @@ import {fileURLToPath} from 'node:url'; import * as ohm from 'ohm-js-legacy'; import {Bench} from 'tinybench'; -import {Grammar} from 'ohm-js'; -import {compileGrammars} from '../src/api.ts'; import {unparse} from '../test/_helpers.js'; -import {createReader} from '../../runtime/src/cstReader.ts'; +import {compileGrammars} from '../src/api.ts'; +import {Grammar} from '../../runtime/src/miniohm.ts'; const __dirname = dirname(fileURLToPath(import.meta.url)); const datadir = join(__dirname, '../test/data'); @@ -35,8 +34,7 @@ const positionalArgs = process.argv.slice(2).filter(a => !a.startsWith('--')); // https://matklad.github.io/2024/03/22/basic-things.html const smallSize = flags.has('--small-size'); const includeUnparse = flags.has('--include-unparse'); -const useCstReader = flags.has('--cst-reader'); -const useCstReaderPacked = flags.has('--cst-reader-packed'); +const useCstView = flags.has('--use-cstview'); // Get pattern from command line arguments const pattern = positionalArgs[0]; @@ -105,52 +103,26 @@ const pattern = positionalArgs[0]; opts ); - // Walk CST using CstReader (raw handles), collecting terminal text. - function unparseCstReaderRaw(matchResult) { - const reader = createReader(matchResult); - const inp = reader.input; - let ans = ''; - function walk(handle, startIdx) { - if (reader.isTerminal(handle)) { - ans += inp.slice(startIdx, startIdx + reader.matchLength(handle)); - return; - } - reader.forEachChild( - handle, - (child, _leadingSpaces, offset) => { - walk(child, startIdx + offset); - }, - startIdx - ); - } - walk(reader.rootHandle, reader.rootStartIdx); - return ans; - } - - // Walk CST using CstReader (handles with startIdx), collecting terminal text. - function unparseCstReaderPacked(matchResult) { - const reader = createReader(matchResult, {packStartIdx: true}); + // Walk CST using CstView (handles with startIdx), collecting terminal text. + function unparseCstView(matchResult) { + const {cst} = matchResult; let ans = ''; function walk(handle) { - if (reader.isTerminal(handle)) { - ans += reader.sourceString(handle); + if (cst.kind(handle) === '_terminal') { + ans += cst.sourceString(handle); return; } - reader.forEachChild(handle, (child, _leadingSpaces) => { + cst.forEachChild(handle, (child, _leadingSpaces) => { walk(child); }); } - walk(reader.root); + walk(cst.root); return ans; } const wasmLabel = includeUnparse ? 'Wasm parse+unparse' : 'Wasm parse'; bench.add( - useCstReaderPacked - ? `${wasmLabel} (CstReader packed)` - : useCstReader - ? `${wasmLabel} (CstReader)` - : wasmLabel, + useCstView ? `${wasmLabel} (CstView)` : wasmLabel, () => { let overriddenDuration = 0; for (const {input} of files) { @@ -167,11 +139,7 @@ const pattern = positionalArgs[0]; peakWasmMemoryBytes, exports.memory.buffer.byteLength ); - return useCstReaderPacked - ? unparseCstReaderPacked(m) - : useCstReader - ? unparseCstReaderRaw(m) - : unparse(g); + return useCstView ? unparseCstView(m) : unparse(g); }); if (includeUnparse) overriddenDuration += bench.now() - start; } diff --git a/packages/compiler/test/test-cstView.js b/packages/compiler/test/test-cstView.js new file mode 100644 index 00000000..58016810 --- /dev/null +++ b/packages/compiler/test/test-cstView.js @@ -0,0 +1,188 @@ +import test from 'ava'; +import {readFileSync} from 'node:fs'; + +import {CstKind} from 'ohm-js/cstView'; + +import {compileAndLoad, scriptRel} from './_helpers.js'; + +const jsonSource = readFileSync(scriptRel('../../lang-json/json.ohm'), 'utf-8'); + +/** + * Walk the CST via CstView and reconstruct a JS value, exercising unpack, + * forEachGroup, and the basic accessors (kind, sourceString). + */ +function toJS(cst, handle) { + const k = cst.kind(handle); + switch (k) { + case 'Object_empty': + return {}; + + case 'Object_nonEmpty': + // "{" Pair ("," Pair)* "}" + return cst.unpack(handle, (_open, firstPair, restPairs, _close) => { + const obj = {}; + const [key, val] = parsePair(cst, firstPair); + obj[key] = val; + cst.forEachGroup(restPairs, (_comma, pair) => { + const [k2, v2] = parsePair(cst, pair); + obj[k2] = v2; + }); + return obj; + }); + + case 'Array_empty': + return []; + + case 'Array_nonEmpty': + // "[" Value ("," Value)* "]" + return cst.unpack(handle, (_open, firstValue, restValues, _close) => { + const arr = [toJS(cst, firstValue)]; + cst.forEachGroup(restValues, (_comma, value) => { + arr.push(toJS(cst, value)); + }); + return arr; + }); + + case 'stringLiteral': + // "\"" doubleStringCharacter* "\"" + return cst.unpack(handle, (_open, chars, _close) => { + const parts = []; + cst.forEachGroup(chars, char => { + parts.push(toJS(cst, char)); + }); + return parts.join(''); + }); + + case 'doubleStringCharacter_nonEscaped': + return cst.sourceString(handle); + + case 'doubleStringCharacter_escaped': + // "\\" escapeSequence + return cst.unpack(handle, (_backslash, escSeq) => toJS(cst, escSeq)); + + case 'escapeSequence_doubleQuote': + return '"'; + case 'escapeSequence_reverseSolidus': + return '\\'; + case 'escapeSequence_solidus': + return '/'; + case 'escapeSequence_backspace': + return '\b'; + case 'escapeSequence_formfeed': + return '\f'; + case 'escapeSequence_newline': + return '\n'; + case 'escapeSequence_carriageReturn': + return '\r'; + case 'escapeSequence_horizontalTab': + return '\t'; + case 'escapeSequence_codePoint': + // "u" fourHexDigits + return cst.unpack(handle, (_u, fourHex) => { + return String.fromCharCode(parseInt(cst.sourceString(fourHex), 16)); + }); + + case 'numberLiteral_withExponent': + case 'numberLiteral_withoutExponent': + case 'decimal_withFract': + case 'decimal_withoutFract': + return Number(cst.sourceString(handle)); + + case 'True': + return true; + case 'False': + return false; + case 'Null': + return null; + + default: + // Wrapper nonterminals (Value, Object, Array, String, Number, Pair, + // doubleStringCharacter, escapeSequence, etc.) — pass through to child. + if (k === CstKind.Terminal) { + return cst.sourceString(handle); + } + return toJS(cst, cst.onlyChild(handle)); + } +} + +function parsePair(cst, handle) { + // Pair = String ":" Value + return cst.unpack(handle, (key, _colon, value) => { + return [toJS(cst, key), toJS(cst, value)]; + }); +} + +function parse(g, input) { + return g.match(input).use(r => { + if (r.failed()) throw new Error(r.message); + return toJS(r.cst, r.cst.root); + }); +} + +let jsonGrammar; + +test.before(async () => { + jsonGrammar = await compileAndLoad(jsonSource); +}); + +test('empty object', t => { + t.deepEqual(parse(jsonGrammar, '{}'), {}); +}); + +test('empty array', t => { + t.deepEqual(parse(jsonGrammar, '[]'), []); +}); + +test('strings', t => { + t.is(parse(jsonGrammar, '"hello"'), 'hello'); + t.is(parse(jsonGrammar, '""'), ''); +}); + +test('numbers', t => { + t.is(parse(jsonGrammar, '0'), 0); + t.is(parse(jsonGrammar, '42'), 42); + t.is(parse(jsonGrammar, '-1'), -1); + t.is(parse(jsonGrammar, '3.14'), 3.14); + t.is(parse(jsonGrammar, '1e10'), 1e10); + t.is(parse(jsonGrammar, '2.5E-3'), 2.5e-3); +}); + +test('booleans and null', t => { + t.is(parse(jsonGrammar, 'true'), true); + t.is(parse(jsonGrammar, 'false'), false); + t.is(parse(jsonGrammar, 'null'), null); +}); + +test('simple object', t => { + t.deepEqual(parse(jsonGrammar, '{"key": "value", "num": 42}'), { + key: 'value', + num: 42, + }); +}); + +test('nested structures', t => { + const input = '{"a": [1, 2, {"b": true}], "c": null}'; + t.deepEqual(parse(jsonGrammar, input), { + a: [1, 2, {b: true}], + c: null, + }); +}); + +test('string escape sequences', t => { + t.is(parse(jsonGrammar, '"hello\\nworld"'), 'hello\nworld'); + t.is(parse(jsonGrammar, '"tab\\there"'), 'tab\there'); + t.is(parse(jsonGrammar, '"quote\\"end"'), 'quote"end'); + t.is(parse(jsonGrammar, '"slash\\\\end"'), 'slash\\end'); + t.is(parse(jsonGrammar, '"\\u0041"'), 'A'); +}); + +test('array with mixed types', t => { + t.deepEqual(parse(jsonGrammar, '[1, "two", true, null, [3]]'), [1, 'two', true, null, [3]]); +}); + +test('deeply nested', t => { + const input = '{"a": {"b": {"c": [1, 2, 3]}}}'; + t.deepEqual(parse(jsonGrammar, input), { + a: {b: {c: [1, 2, 3]}}, + }); +}); diff --git a/packages/runtime/ohm-js.api.md b/packages/runtime/ohm-js.api.md index 08ccfeca..7954e71f 100644 --- a/packages/runtime/ohm-js.api.md +++ b/packages/runtime/ohm-js.api.md @@ -163,6 +163,8 @@ export interface SeqNode ex // @public (undocumented) export class SucceededMatchResult extends MatchResult { + // Warning: (ae-forgotten-export) The symbol "CstView" needs to be exported by the entry point index.d.ts + get cst(): CstView; // (undocumented) getCstRoot(): CstNode; } diff --git a/packages/runtime/package.json b/packages/runtime/package.json index db8d1eb0..06acd807 100644 --- a/packages/runtime/package.json +++ b/packages/runtime/package.json @@ -24,9 +24,9 @@ "types": "./dist/src/unstableDebug.d.ts", "default": "./dist/src/unstableDebug.js" }, - "./cstReader": { - "types": "./dist/src/cstReader.d.ts", - "default": "./dist/src/cstReader.js" + "./cstView": { + "types": "./dist/src/cstView.d.ts", + "default": "./dist/src/cstView.js" } }, "files": ["dist"], diff --git a/packages/runtime/src/cstReader.ts b/packages/runtime/src/cstReader.ts deleted file mode 100644 index f497f8d8..00000000 --- a/packages/runtime/src/cstReader.ts +++ /dev/null @@ -1,329 +0,0 @@ -import {isTaggedTerminal, MATCH_RECORD_TYPE_MASK, MatchRecordType} from './miniohm.ts'; - -import type {MatchContext, SucceededMatchResult} from './miniohm.ts'; - -const HANDLE_BITS = 27; -const SHIFT = 2 ** HANDLE_BITS; // 134217728 -const MASK = SHIFT - 1; // 0x7FFFFFF - -/** - * Null handle — indicates no node (e.g. no leading spaces). - * 0 is safe because heap allocations start from __heap_base (always > 0), - * so no CST node pointer is ever 0. - */ -export const NULL_HANDLE = 0; - -/** - * Spaces handle encoding: (spacesLen << 2) | 0b10. - * Bit 1 set, bit 0 clear — distinct from heap pointers (low 2 bits = 00) - * and tagged terminals (bit 0 = 1). - * - * In packed mode, startIdx is embedded in the upper bits just like other - * handles: startIdx * SHIFT + rawSpacesHandle. - * - * These are internal — consumers use the normal accessor methods on CstReader. - */ -function makeSpacesHandle(spacesLen: number): number { - return (spacesLen << 2) | 2; -} - -function isSpacesHandle(raw: number): boolean { - return (raw & 3) === 2; -} - -/** - * Pack a raw CST handle and startIdx into a single Number handle. - * Uses 53 of the available integer-precision bits in an IEEE 754 double - * (27 bits for the pointer, 26 bits for startIdx). Accessor methods - * (isTerminal, matchLength, etc.) transparently accept either a raw handle - * or a handle with startIdx — they extract the low 27 bits via `& MASK`, - * which is an identity operation for raw handles (< 2^27). - */ -export function pack(rawHandle: number, startIdx: number): number { - return startIdx * SHIFT + rawHandle; -} - -export function unpackHandle(handle: number): number { - return handle & MASK; -} - -export function unpackStartIdx(handle: number): number { - const raw = handle & MASK; - return (handle - raw) / SHIFT; -} - -/** - * Zero-allocation access to the CST stored in Wasm linear memory. - * - * Accessor methods (isTerminal, matchLength, childCount, ctorName, details) - * accept either a raw handle or a handle with startIdx. - * - * forEachChild(handle, fn) iterates visible children. The callback receives - * (childHandle, leadingSpaces, pos, index). leadingSpaces is a handle for - * the leading spaces node (use accessor methods to inspect), or NULL_HANDLE - * if none. - */ -export class CstReader { - /** @internal */ - private _ctx: MatchContext; - /** @internal — whether handles have startIdx packed in. */ - private _packed: boolean; - - /** Handle for the root node (with startIdx packed in if packStartIdx was set). */ - readonly root: number; - /** Handle for leading spaces before the root, or NULL_HANDLE if none. */ - readonly rootLeadingSpaces: number; - - /** Raw handle for the root node (without startIdx). */ - get rootHandle(): number { - return this.root & MASK; - } - /** startIdx for the root node. */ - get rootStartIdx(): number { - return unpackStartIdx(this.root); - } - - /** @internal */ - constructor(ctx: MatchContext, root: number, rootLeadingSpaces: number, packed: boolean) { - this._ctx = ctx; - this.root = root; - this.rootLeadingSpaces = rootLeadingSpaces; - this._packed = packed; - } - - /** Extract the startIdx from a handle. */ - startIdx(handle: number): number { - return unpackStartIdx(handle); - } - - isTerminal(handle: number): boolean { - const raw = handle & MASK; - if (isSpacesHandle(raw)) return false; - if (isTaggedTerminal(raw)) return true; - return ( - ((this._ctx.view.getInt32(raw + 8, true) & - MATCH_RECORD_TYPE_MASK) as MatchRecordType) === MatchRecordType.TERMINAL - ); - } - - isNonterminal(handle: number): boolean { - const raw = handle & MASK; - if (isSpacesHandle(raw)) return true; - if (isTaggedTerminal(raw)) return false; - return ( - ((this._ctx.view.getInt32(raw + 8, true) & - MATCH_RECORD_TYPE_MASK) as MatchRecordType) === MatchRecordType.NONTERMINAL - ); - } - - isList(handle: number): boolean { - const raw = handle & MASK; - if (isSpacesHandle(raw) || isTaggedTerminal(raw)) return false; - return ( - ((this._ctx.view.getInt32(raw + 8, true) & - MATCH_RECORD_TYPE_MASK) as MatchRecordType) === MatchRecordType.ITER_FLAG - ); - } - - isOptional(handle: number): boolean { - const raw = handle & MASK; - if (isSpacesHandle(raw) || isTaggedTerminal(raw)) return false; - return ( - ((this._ctx.view.getInt32(raw + 8, true) & - MATCH_RECORD_TYPE_MASK) as MatchRecordType) === MatchRecordType.OPTIONAL - ); - } - - /** Number of raw children stored in this match record. */ - childCount(handle: number): number { - const raw = handle & MASK; - if (isSpacesHandle(raw) || isTaggedTerminal(raw)) return 0; - return this._ctx.view.getUint32(raw, true); - } - - /** Length of matched input (in UTF-16 code units). */ - matchLength(handle: number): number { - const raw = handle & MASK; - if (isSpacesHandle(raw)) return raw >>> 2; - if (isTaggedTerminal(raw)) return raw >>> 1; - return this._ctx.view.getUint32(raw + 4, true); - } - - /** - * Constructor name. For nonterminals, the rule name (without parameterization). - * For other types: '_terminal', '_list', '_opt'. - */ - ctorName(handle: number): string { - const raw = handle & MASK; - if (isSpacesHandle(raw)) return 'spaces'; - if (isTaggedTerminal(raw)) return '_terminal'; - const type = (this._ctx.view.getInt32(raw + 8, true) & - MATCH_RECORD_TYPE_MASK) as MatchRecordType; - if (type === MatchRecordType.NONTERMINAL) { - const ruleId = this._ctx.view.getInt32(raw + 8, true) >>> 2; - return this._ctx.ruleNames[ruleId].split('<')[0]; - } - if (type === MatchRecordType.TERMINAL) return '_terminal'; - if (type === MatchRecordType.ITER_FLAG) return '_list'; - return '_opt'; - } - - /** - * Upper bits of typeAndDetails. For NONTERMINAL: the ruleId. - * For ITER_FLAG: the arity (children per iteration). - */ - details(handle: number): number { - const raw = handle & MASK; - if (isSpacesHandle(raw) || isTaggedTerminal(raw)) return 0; - return this._ctx.view.getInt32(raw + 8, true) >>> 2; - } - - /** Handle (Wasm pointer) of the i-th raw child. */ - childAt(handle: number, i: number): number { - const raw = handle & MASK; - return this._ctx.view.getUint32(raw + 16 + i * 4, true); - } - - /** Source string for a node. If startIdx is omitted, it is extracted from the handle. */ - sourceString(handle: number, startIdx?: number): string { - if (startIdx === undefined) { - const raw = handle & MASK; - startIdx = (handle - raw) / SHIFT; - handle = raw; - } - return this._ctx.input.slice(startIdx, startIdx + this.matchLength(handle)); - } - - /** The full input string that was parsed. */ - get input(): string { - return this._ctx.input; - } - - /** - * Iterate over children. The callback receives (childHandle, leadingSpaces, - * pos, index). leadingSpaces is a handle for the leading spaces node - * (works with accessor methods like matchLength, ctorName, sourceString), - * or NULL_HANDLE (0) if none. - * - * The meaning of `pos` depends on the mode: - * - Raw mode: offset from the parent's start position. - * `parentStartIdx` is the parent's absolute position, needed to - * query the memo table for spaces lengths. - * - Packed mode: the child's absolute startIdx (parentStartIdx is - * ignored since startIdx is embedded in the handle). - */ - forEachChild( - handle: number, - fn: (child: number, leadingSpaces: number, pos: number, index: number) => void, - parentStartIdx = 0 - ): void { - if (this._packed) { - this._forEachChildPacked(handle, fn); - } else { - this._forEachChildRaw(handle, parentStartIdx, fn); - } - } - - /** Check whether a raw child handle has parent-level space skipping. */ - private _hasParentSpaces(rawChild: number): boolean { - if (isTaggedTerminal(rawChild)) return true; - const type = (this._ctx.view.getInt32(rawChild + 8, true) & - MATCH_RECORD_TYPE_MASK) as MatchRecordType; - return type === MatchRecordType.NONTERMINAL || type === MatchRecordType.TERMINAL; - } - - private _forEachChildRaw( - handle: number, - parentStartIdx: number, - fn: (child: number, leadingSpaces: number, offset: number, index: number) => void - ): void { - if (isTaggedTerminal(handle)) return; - const count = this._ctx.view.getUint32(handle, true); - const {getSpacesLenAt} = this._ctx; - let offset = 0; - for (let i = 0; i < count; i++) { - const child = this._ctx.view.getUint32(handle + 16 + i * 4, true); - const rawSpacesLen = - getSpacesLenAt && this._hasParentSpaces(child) - ? Math.max(0, getSpacesLenAt(parentStartIdx + offset)) - : 0; - const leadingSpaces = rawSpacesLen > 0 ? makeSpacesHandle(rawSpacesLen) : 0; - offset += rawSpacesLen; - const len = isTaggedTerminal(child) - ? child >>> 1 - : this._ctx.view.getUint32(child + 4, true); - fn(child, leadingSpaces, offset, i); - offset += len; - } - } - - private _forEachChildPacked( - handle: number, - fn: (child: number, leadingSpaces: number, pos: number, index: number) => void - ): void { - const raw = handle & MASK; - if (isTaggedTerminal(raw)) return; - const count = this._ctx.view.getUint32(raw, true); - let childStart = (handle - raw) / SHIFT; - const {getSpacesLenAt} = this._ctx; - for (let i = 0; i < count; i++) { - const rawChild = this._ctx.view.getUint32(raw + 16 + i * 4, true); - const rawSpacesLen = - getSpacesLenAt && this._hasParentSpaces(rawChild) - ? Math.max(0, getSpacesLenAt(childStart)) - : 0; - // Pack the spaces handle with startIdx so sourceString() works. - const spacesStartIdx = childStart; - const leadingSpaces = - rawSpacesLen > 0 ? spacesStartIdx * SHIFT + makeSpacesHandle(rawSpacesLen) : 0; - childStart += rawSpacesLen; - const childHandle = childStart * SHIFT + rawChild; - const len = isTaggedTerminal(rawChild) - ? rawChild >>> 1 - : this._ctx.view.getUint32(rawChild + 4, true); - fn(childHandle, leadingSpaces, childStart, i); - childStart += len; - } - } -} - -interface CreateReaderOptions { - /** If true, handles include startIdx (see pack()). Default: false. */ - packStartIdx?: boolean; -} - -export function createReader( - result: SucceededMatchResult, - options?: CreateReaderOptions -): CstReader { - const exports = (result.grammar as any)._instance.exports; - const ctx = result._ctx; - const doPack = options?.packStartIdx ?? false; - - if (doPack) { - const heapTop = exports.__offset.value; - if (heapTop >= SHIFT) { - throw new Error( - `Wasm heap too large for CstReader: ${heapTop} bytes exceeds ${HANDLE_BITS}-bit limit (${SHIFT} bytes)` - ); - } - const startIdxLimit = 2 ** (53 - HANDLE_BITS); - if (ctx.input.length >= startIdxLimit) { - throw new Error( - `Input too long for CstReader: ${ctx.input.length} chars exceeds ${53 - HANDLE_BITS}-bit limit (${startIdxLimit} chars)` - ); - } - } - - const spacesLen = Math.max(0, exports.getSpacesLenAt(0)); - const rootPtr = exports.bindingsAt(0); - const p = doPack ? pack : (h: number, _s: number) => h; - // Pack the spaces handle with startIdx=0 (root leading spaces always start at 0). - const rootLeadingSpaces = - spacesLen > 0 - ? doPack - ? 0 * SHIFT + makeSpacesHandle(spacesLen) - : makeSpacesHandle(spacesLen) - : 0; - return new CstReader(ctx, p(rootPtr, spacesLen), rootLeadingSpaces, doPack); -} diff --git a/packages/runtime/src/cstView.ts b/packages/runtime/src/cstView.ts new file mode 100644 index 00000000..de3d0d60 --- /dev/null +++ b/packages/runtime/src/cstView.ts @@ -0,0 +1,343 @@ +import {isTaggedTerminal, MATCH_RECORD_TYPE_MASK, MatchRecordType} from './miniohm.ts'; + +import type {MatchContext, SucceededMatchResult} from './miniohm.ts'; + +const HANDLE_BITS = 27; +const SHIFT = 2 ** HANDLE_BITS; // 134217728 +const MASK = SHIFT - 1; // 0x7FFFFFF + +/** + * Null handle — indicates no node (e.g. no leading spaces). + * 0 is safe because heap allocations start from __heap_base (always > 0), + * so no CST node pointer is ever 0. + */ +export const NULL_HANDLE = 0; + +/** + * Special kind values for non-nonterminal nodes. + * For nonterminal nodes, `kind()` returns the rule name (a string). + */ +export const CstKind = { + Terminal: '_terminal', + List: '_list', + Optional: '_opt', + Spaces: 'spaces', +} as const; + +export type CstKindValue = (typeof CstKind)[keyof typeof CstKind]; +export type CstKind = CstKindValue | string; + +/** + * Spaces handle encoding: (spacesLen << 2) | 0b10. + * Bit 1 set, bit 0 clear — distinct from heap pointers (low 2 bits = 00) + * and tagged terminals (bit 0 = 1). + * + * These are internal — consumers use the normal accessor methods on CstView. + */ +function makeSpacesHandle(spacesLen: number): number { + return (spacesLen << 2) | 2; +} + +function isSpacesHandle(raw: number): boolean { + return (raw & 3) === 2; +} + +/** + * Pack a raw CST handle and startIdx into a single Number handle. + * Uses 53 of the available integer-precision bits in an IEEE 754 double + * (27 bits for the pointer, 26 bits for startIdx). + */ +function pack(rawHandle: number, startIdx: number): number { + return startIdx * SHIFT + rawHandle; +} + +function unpackStartIdx(handle: number): number { + const raw = handle & MASK; + return (handle - raw) / SHIFT; +} + +/** + * Zero-allocation access to the CST stored in Wasm linear memory. + * + * All handles have startIdx packed in. + * + * Use `kind(handle)` to get the node kind — returns the rule name for + * nonterminals, or a CstKind constant for other node types. + */ +export class CstView { + /** @internal */ + private _ctx: MatchContext; + /** @internal — precomputed rule ctor names (without parameterization). */ + private _ruleCtorNames: string[]; + /** @internal — mutable cursor used by _decodeChild to avoid allocation. */ + private _pos = 0; + + /** Handle for the root node (with startIdx packed in). */ + readonly root: number; + /** Handle for leading spaces before the root, or NULL_HANDLE if none. */ + readonly rootLeadingSpaces: number; + + /** @internal */ + constructor( + ctx: MatchContext, + root: number, + rootLeadingSpaces: number, + ruleCtorNames: string[] + ) { + this._ctx = ctx; + this.root = root; + this.rootLeadingSpaces = rootLeadingSpaces; + this._ruleCtorNames = ruleCtorNames; + } + + /** Create a CstView from a successful match result. */ + static from(result: SucceededMatchResult): CstView { + const exports = (result.grammar as any)._instance.exports; + const ctx = result._ctx; + + const heapTop = exports.__offset.value; + if (heapTop >= SHIFT) { + throw new Error( + `Wasm heap too large for CstView: ${heapTop} bytes exceeds ${HANDLE_BITS}-bit limit (${SHIFT} bytes)` + ); + } + const startIdxLimit = 2 ** (53 - HANDLE_BITS); + if (ctx.input.length >= startIdxLimit) { + throw new Error( + `Input too long for CstView: ${ctx.input.length} chars exceeds ${53 - HANDLE_BITS}-bit limit (${startIdxLimit} chars)` + ); + } + + const ruleCtorNames = (result.grammar as any)._ruleCtorNames as string[]; + + const spacesLen = Math.max(0, exports.getSpacesLenAt(0)); + const rootPtr = exports.bindingsAt(0); + const rootLeadingSpaces = spacesLen > 0 ? 0 * SHIFT + makeSpacesHandle(spacesLen) : 0; + return new CstView(ctx, pack(rootPtr, spacesLen), rootLeadingSpaces, ruleCtorNames); + } + + /** Extract the startIdx from a handle. */ + startIdx(handle: number): number { + this._checkNull(handle); + return unpackStartIdx(handle); + } + + /** + * Node kind. Returns the rule name (without parameterization) for + * nonterminals, or one of the CstKind constants ('_terminal', '_list', + * '_opt', 'spaces') for other node types. + */ + kind(handle: number): CstKind { + this._checkNull(handle); + const raw = handle & MASK; + if (isSpacesHandle(raw)) return CstKind.Spaces; + if (isTaggedTerminal(raw)) return CstKind.Terminal; + const type = (this._ctx.view.getInt32(raw + 8, true) & + MATCH_RECORD_TYPE_MASK) as MatchRecordType; + switch (type) { + case MatchRecordType.NONTERMINAL: { + const ruleId = this._ctx.view.getInt32(raw + 8, true) >>> 2; + return this._ruleCtorNames[ruleId]; + } + case MatchRecordType.TERMINAL: + return CstKind.Terminal; + case MatchRecordType.ITER_FLAG: + return CstKind.List; + case MatchRecordType.OPTIONAL: + return CstKind.Optional; + } + } + + /** + * Iterate over logical groups. Works for both list and optional nodes. + * + * - List (`_list`): groups children by the list's arity and calls `cb` + * per group. + * - Optional (`_opt`): all children form a single group. Calls `cb` + * once if present, not at all if absent. + */ + forEachGroup(handle: number, cb: (...children: number[]) => void): void { + this._checkNull(handle); + const k = this.kind(handle); + if (k !== CstKind.List && k !== CstKind.Optional) { + throw new Error(`forEachGroup() is only valid for list and optional nodes, got ${k}`); + } + const raw = handle & MASK; + const count = this._ctx.view.getUint32(raw, true); + if (count === 0) return; + + const type = (this._ctx.view.getInt32(raw + 8, true) & + MATCH_RECORD_TYPE_MASK) as MatchRecordType; + // For lists, arity is in the details field; for optionals, all children are one group. + const arity = + type === MatchRecordType.ITER_FLAG + ? this._ctx.view.getInt32(raw + 8, true) >>> 2 + : count; + + const savedPos = this._pos; + this._pos = (handle - raw) / SHIFT; + try { + const args = new Array(arity); + for (let i = 0; i < count; i += arity) { + for (let j = 0; j < arity; j++) args[j] = this._decodeChild(raw, i + j); + const p = this._pos; + cb.apply(undefined, args); + this._pos = p; + } + } finally { + this._pos = savedPos; + } + } + + /** + * Unpack a node's children into the callback arguments. + * + * For nonterminals: always calls `cb` with all children (even if zero), + * returns the result. + * For optionals: calls `cb` once if present (returns `T`), or returns + * `undefined` if absent. + * + * Not valid for terminals or lists — use `sourceString` for terminals + * and `forEachGroup` for lists. + */ + unpack(handle: number, cb: (...children: number[]) => T): T | undefined { + this._checkNull(handle); + const k = this.kind(handle); + if (k === CstKind.Terminal || k === CstKind.Spaces) { + throw new Error(`unpack() is not valid for ${k} nodes; use sourceString() instead`); + } + if (k === CstKind.List) { + throw new Error(`unpack() is not valid for list nodes; use forEachGroup() instead`); + } + const raw = handle & MASK; + const count = this._ctx.view.getUint32(raw, true); + // For optionals, absent means no callback. + if (k === CstKind.Optional && count === 0) return undefined; + if (cb.length > count) { + throw new Error( + `unpack(): callback expects ${cb.length} args but ${k} has ${count} children` + ); + } + this._pos = (handle - raw) / SHIFT; + const children = new Array(count); + for (let i = 0; i < count; i++) children[i] = this._decodeChild(raw, i); + return cb.apply(undefined, children); + } + + /** + * Return the only child of a wrapper nonterminal. Throws if the node + * doesn't have exactly one child. + */ + onlyChild(handle: number): number { + const raw = handle & MASK; + const count = this._ctx.view.getUint32(raw, true); + if (count !== 1) { + throw new Error( + `onlyChild(): expected 1 child, got ${count} (kind: ${this.kind(handle)})` + ); + } + this._pos = (handle - raw) / SHIFT; + return this._decodeChild(raw, 0); + } + + /** Number of raw children stored in this match record. */ + childCount(handle: number): number { + this._checkNull(handle); + const raw = handle & MASK; + if (isSpacesHandle(raw) || isTaggedTerminal(raw)) return 0; + return this._ctx.view.getUint32(raw, true); + } + + /** Length of matched input (in UTF-16 code units). */ + matchLength(handle: number): number { + this._checkNull(handle); + const raw = handle & MASK; + if (isSpacesHandle(raw)) return raw >>> 2; + if (isTaggedTerminal(raw)) return raw >>> 1; + return this._ctx.view.getUint32(raw + 4, true); + } + + /** Source string for a node. startIdx is extracted from the handle. */ + sourceString(handle: number): string { + this._checkNull(handle); + const raw = handle & MASK; + const si = (handle - raw) / SHIFT; + return this._ctx.input.slice(si, si + this.matchLength(raw)); + } + + /** The full input string that was parsed. */ + get input(): string { + return this._ctx.input; + } + + /** + * Iterate over children. The callback receives (childHandle, leadingSpaces, + * pos, index). leadingSpaces is a handle for the leading spaces node + * (works with accessor methods like matchLength, kind, sourceString), + * or NULL_HANDLE (0) if none. `pos` is the child's absolute startIdx. + */ + forEachChild( + handle: number, + fn: (child: number, leadingSpaces: number, pos: number, index: number) => void + ): void { + this._checkNull(handle); + const raw = handle & MASK; + if (isSpacesHandle(raw) || isTaggedTerminal(raw)) return; + const count = this._ctx.view.getUint32(raw, true); + let childStart = (handle - raw) / SHIFT; + const {getSpacesLenAt} = this._ctx; + for (let i = 0; i < count; i++) { + const rawChild = this._ctx.view.getUint32(raw + 16 + i * 4, true); + const rawSpacesLen = + getSpacesLenAt && this._hasParentSpaces(rawChild) + ? Math.max(0, getSpacesLenAt(childStart)) + : 0; + // Pack the spaces handle with startIdx so sourceString() works. + const spacesStartIdx = childStart; + const leadingSpaces = + rawSpacesLen > 0 ? spacesStartIdx * SHIFT + makeSpacesHandle(rawSpacesLen) : 0; + childStart += rawSpacesLen; + const childHandle = childStart * SHIFT + rawChild; + const len = isTaggedTerminal(rawChild) + ? rawChild >>> 1 + : this._ctx.view.getUint32(rawChild + 4, true); + fn(childHandle, leadingSpaces, childStart, i); + childStart += len; + } + } + + /** + * @internal — decode child at slot `i` of raw pointer `raw`. + * Reads and advances `this._pos`. Must be called sequentially for i = 0, 1, 2, ... + */ + private _decodeChild(raw: number, i: number): number { + const rawChild = this._ctx.view.getUint32(raw + 16 + i * 4, true); + const {getSpacesLenAt} = this._ctx; + const rawSpacesLen = + getSpacesLenAt && this._hasParentSpaces(rawChild) + ? Math.max(0, getSpacesLenAt(this._pos)) + : 0; + this._pos += rawSpacesLen; + const childHandle = this._pos * SHIFT + rawChild; + const len = isTaggedTerminal(rawChild) + ? rawChild >>> 1 + : this._ctx.view.getUint32(rawChild + 4, true); + this._pos += len; + return childHandle; + } + + /** @internal — throw on NULL_HANDLE to catch bugs early. */ + private _checkNull(handle: number): void { + if (handle === NULL_HANDLE) { + throw new Error('NULL_HANDLE passed to CstView method'); + } + } + + /** Check whether a raw child handle has parent-level space skipping. */ + private _hasParentSpaces(rawChild: number): boolean { + if (isTaggedTerminal(rawChild)) return true; + const type = (this._ctx.view.getInt32(rawChild + 8, true) & + MATCH_RECORD_TYPE_MASK) as MatchRecordType; + return type === MatchRecordType.NONTERMINAL || type === MatchRecordType.TERMINAL; + } +} diff --git a/packages/runtime/src/miniohm.ts b/packages/runtime/src/miniohm.ts index 4e8b905a..8a11ccec 100644 --- a/packages/runtime/src/miniohm.ts +++ b/packages/runtime/src/miniohm.ts @@ -1,4 +1,5 @@ import {assert, checkNotNull} from './assert.ts'; +import {CstView} from './cstView.ts'; import {getLineAndColumn, getLineAndColumnMessage} from './extras.ts'; export const MATCH_RECORD_TYPE_MASK = 0b11; @@ -142,6 +143,8 @@ export class Grammar { /** @internal */ private _instance?: WebAssembly.Instance = undefined; + /** @internal — precomputed rule constructor names (without parameterization). */ + _ruleCtorNames: string[] = []; /** @internal */ private _imports = { // System-level AssemblyScript imports. @@ -317,6 +320,10 @@ export class Grammar { this._ruleIds.set(ruleName, this._ruleIds.size); this._ruleNames.push(ruleName); } + this._ruleCtorNames = this._ruleNames.map(n => { + const i = n.indexOf('<'); + return i === -1 ? n : n.slice(0, i); + }); for (const str of parseStringTable(module, 'strings')) { this._strings.push(str); } @@ -1064,8 +1071,10 @@ function createMatchResult( } export class SucceededMatchResult extends MatchResult { - /** @internal */ - _cst: CstNode; + /** @internal — lazily materialized CstNode tree (compatibility adapter). */ + private _cstRoot?: CstNode; + /** @internal — lazily created CstView. */ + private _cstView?: CstView; /** @internal */ protected constructor( @@ -1075,7 +1084,16 @@ export class SucceededMatchResult extends MatchResult { succeeded: boolean ) { super(grammar, startExpr, ctx, succeeded); - this._cst = grammar._getCstRoot(ctx); + } + + /** Zero-allocation handle-based view of the CST. */ + get cst(): CstView { + return (this._cstView ??= CstView.from(this)); + } + + /** @internal — for backward compat: lazily access _cst. */ + get _cst(): CstNode { + return (this._cstRoot ??= this.grammar._getCstRoot(this._ctx)); } getCstRoot(): CstNode { diff --git a/packages/runtime/tsdown.config.ts b/packages/runtime/tsdown.config.ts index f50d49f1..6f60fdac 100644 --- a/packages/runtime/tsdown.config.ts +++ b/packages/runtime/tsdown.config.ts @@ -4,7 +4,7 @@ export default defineConfig({ entry: { index: 'index.ts', 'src/unstableDebug': 'src/unstableDebug.ts', - 'src/cstReader': 'src/cstReader.ts', + 'src/cstView': 'src/cstView.ts', }, format: 'esm', fixedExtension: false,