diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7c17ef0d8e95..ee3514bd7e0e 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -326,7 +326,8 @@ Optimizations * GITHUB#15938: Optimize ReaderUtil#partitionByLeaf to use binary search on leaf boundaries instead of linear scan. (Greg Miller) -* GITHUB#15970: Reduce memory usage of fields with long terms during segment merges. (Alan Woodward) +* GITHUB#15970, GITHUB#15977: Speed up TrieBuilder and reduce its memory footprint by replacing the in-memory object + tree with a compact prefix-coded byte buffer. (Alan Woodward, Guo Feng) Bug Fixes --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java index 8d290cafe37e..0a042571d1d7 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java @@ -17,20 +17,27 @@ package org.apache.lucene.codecs.lucene103.blocktree; import java.io.IOException; -import java.util.ArrayDeque; +import java.io.UncheckedIOException; import java.util.Arrays; -import java.util.Deque; import java.util.function.BiConsumer; +import org.apache.lucene.store.ByteBuffersDataInput; +import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; /** * A builder to build prefix tree (trie) as the index of block tree, and can be saved to disk. * - *

TODO make this trie builder a more memory efficient structure. + *

Entries are stored in a compact prefix-coded byte buffer during building. The first non-empty + * key (minKey) is stored separately so that {@link #append} can re-encode only that one entry and + * bulk-copy the remaining bytes. At {@link #save} time the trie structure is reconstructed + * on-the-fly using a frontier array and serialized to disk in a single pass. + * + *

Memory usage is O(total encoded bytes) during building and O(max key depth) during save. */ class TrieBuilder { @@ -54,343 +61,400 @@ class TrieBuilder { */ record Output(long fp, boolean hasTerms, BytesRef floorData) {} - private enum Status { - BUILDING, - SAVED, - DESTROYED - } + // ====== Prefix-coded buffer entry format ====== + // Each entry: + // [prefixLen: vInt] [suffixLen: vInt] [suffix: bytes] + // [fp: vLong] [hasTerms: byte (0/1)] [floorDataLen: vInt] [floorData: bytes] + // + // The first non-empty-key entry (minKey) is stored separately and NOT in the buffer. + // Buffer entries start from the second entry onward, prefix-encoded against their predecessor. - private static class Node { - - // The utf8 digit that leads to this Node, 0 for root node - private final int label; - // The output of this node. - private Output output; - // The number of children of this node. - private int childrenNum; - // Pointers to relative nodes - private Node next; - private Node firstChild; - private Node lastChild; - - Node(int label, Output output) { - this.label = label; - this.output = output; - } - } + /** Output for the empty ("") key, if any. */ + private Output emptyOutput; - /** - * Transient state used only during {@link #saveNodes}. Keeps save-time bookkeeping off of {@link - * Node} so that Nodes are smaller during the (much longer) building phase. - */ - private static class SaveFrame { - final Node node; - // Iteration cursor: the latest child that has been pushed for saving. - Node savedTo; - // File pointer assigned when this node is written. -1 until then. - long fp = -1; - // File pointers of children, collected as they are popped. Null for leaf nodes. - long[] childFps; - int numChildFps; + /** The first non-empty key, stored separately from the buffer. */ + private final BytesRef minKey; - SaveFrame(Node node) { - this.node = node; - if (node.childrenNum > 0) { - this.childFps = new long[node.childrenNum]; - } - } + /** Output for minKey. Null if there is no non-empty key entry. */ + private Output minOutput; - void addChildFp(long childFp) { - childFps[numChildFps++] = childFp; - } - } + /** Buffer holding all entries after minKey, prefix-encoded. */ + private final ByteBuffersDataOutput buffer = new ByteBuffersDataOutput(); - private final Node root = new Node(0, null); - private final BytesRef minKey; - private BytesRef maxKey; - private Status status = Status.BUILDING; + /** + * The last key appended. Since entries are always appended in sorted order this doubles as the + * lexicographically largest key for ordered-append assertions. + */ + private final BytesRefBuilder lastKey = new BytesRefBuilder(); + + /** The maximum key length across all entries. */ + private int maxKeyDepth; static TrieBuilder bytesRefToTrie(BytesRef k, Output v) { return new TrieBuilder(k, v); } private TrieBuilder(BytesRef k, Output v) { - minKey = maxKey = BytesRef.deepCopyOf(k); + minKey = BytesRef.deepCopyOf(k); + maxKeyDepth = k.length; + if (k.length == 0) { - root.output = v; - return; - } - Node parent = root; - for (int i = 0; i < k.length; i++) { - int b = k.bytes[i + k.offset] & 0xFF; - Output output = i == k.length - 1 ? v : null; - Node node = new Node(b, output); - parent.firstChild = parent.lastChild = node; - parent.childrenNum = 1; - parent = node; + emptyOutput = v; + } else { + minOutput = v; + lastKey.copyBytes(k); } } /** * Append all (K, V) pairs from the given trie into this one. The given trie builder need to * ensure its keys greater or equals than max key of this one. - * - *

Note: the given trie will be destroyed after appending. */ - void append(TrieBuilder trieBuilder) { - if (status != Status.BUILDING || trieBuilder.status != Status.BUILDING) { - throw new IllegalStateException( - "tries have wrong status, got this: " + status + ", append: " + trieBuilder.status); - } - assert this.maxKey.compareTo(trieBuilder.minKey) < 0; - - int mismatch = - Arrays.mismatch( - this.maxKey.bytes, - this.maxKey.offset, - this.maxKey.offset + this.maxKey.length, - trieBuilder.minKey.bytes, - trieBuilder.minKey.offset, - trieBuilder.minKey.offset + trieBuilder.minKey.length); - Node a = this.root; - Node b = trieBuilder.root; - - for (int i = 0; i < mismatch; i++) { - final Node aLast = a.lastChild; - final Node bFirst = b.firstChild; - assert aLast.label == bFirst.label; - - if (b.childrenNum > 1) { - aLast.next = bFirst.next; - a.childrenNum += b.childrenNum - 1; - a.lastChild = b.lastChild; - assert assertChildrenLabelInOrder(a); - } + void append(TrieBuilder other) { + assert this.lastKey.get().compareTo(other.minKey) < 0; - a = aLast; - b = bFirst; + if (other.emptyOutput != null && this.emptyOutput == null) { + this.emptyOutput = other.emptyOutput; } - assert b.childrenNum > 0; - if (a.childrenNum == 0) { - a.firstChild = b.firstChild; - a.lastChild = b.lastChild; - a.childrenNum = b.childrenNum; - } else { - assert a.lastChild.label < b.firstChild.label; - a.lastChild.next = b.firstChild; - a.lastChild = b.lastChild; - a.childrenNum += b.childrenNum; + if (other.minOutput != null) { + try { + + // Encode and append the min entry in 'other' into the buffer, prefix-encoded against + // lastKey. + BytesRef lastKeyRef = lastKey.get(); + int mismatch = + Arrays.mismatch( + lastKeyRef.bytes, + lastKeyRef.offset, + lastKeyRef.offset + lastKeyRef.length, + other.minKey.bytes, + other.minKey.offset, + other.minKey.offset + other.minKey.length); + if (mismatch == -1) { + mismatch = Math.min(lastKeyRef.length, other.minKey.length); + } + int suffixLen = other.minKey.length - mismatch; + buffer.writeVInt(mismatch); + buffer.writeVInt(suffixLen); + buffer.writeBytes(other.minKey.bytes, other.minKey.offset + mismatch, suffixLen); + buffer.writeVLong(other.minOutput.fp); + buffer.writeByte((byte) (other.minOutput.hasTerms ? 1 : 0)); + if (other.minOutput.floorData != null) { + BytesRef floorData = other.minOutput.floorData; + buffer.writeVInt(floorData.length); + buffer.writeBytes(floorData.bytes, floorData.offset, floorData.length); + } else { + buffer.writeVInt(0); + } + + // Remaining buffer entries in 'other' are prefix-encoded against other.minKey which now + // equals our lastKey, so bulk-copy is safe without re-encoding. + if (other.buffer.size() > 0) { + ByteBuffersDataInput otherIn = other.buffer.toDataInput(); + buffer.copyBytes(otherIn, otherIn.length()); + } + + } catch (IOException e) { + throw new UncheckedIOException("should not happen on in-memory buffer", e); + } } - assert assertChildrenLabelInOrder(a); - this.maxKey = trieBuilder.maxKey; - trieBuilder.status = Status.DESTROYED; + lastKey.copyBytes(other.lastKey); + this.maxKeyDepth = Math.max(this.maxKeyDepth, other.maxKeyDepth); } Output getEmptyOutput() { - return root.output; + return emptyOutput; } + // ====== Entry iteration (shared by visit and saveNodes) ====== + /** - * Used for tests only. The recursive impl need to be avoided if someone plans to use for - * production one day. + * Iterates over all non-empty-key entries: first the separately-stored minKey, then the + * prefix-coded buffer entries. Maintains a reusable key buffer and exposes the prefix length from + * the encoding, which equals the common prefix length with the preceding key. + * + *

Supports two-phase iteration via {@link #readHeader()} and {@link #readBody()}: after + * readHeader(), {@link #key} still holds the previous key's bytes (the new suffix has not been + * written yet), and {@link #prefixLen} gives the boundary for freezing. After readBody(), {@link + * #key} is updated to the current key. */ - void visit(BiConsumer consumer) { - assert status == Status.BUILDING; - if (root.output != null) { - consumer.accept(new BytesRef(), root.output); + class EntryIterator { + private final ByteBuffersDataInput in; + private boolean minKeyConsumed; + boolean headerRead = false; + + // header + int suffixLen; + int prefixLen; + + // body + byte[] key; + int keyLen; + Output output; + + EntryIterator() { + key = new byte[Math.max(maxKeyDepth, 1)]; + System.arraycopy(minKey.bytes, minKey.offset, key, 0, minKey.length); + keyLen = 0; + minKeyConsumed = (minOutput == null); + in = buffer.size() > 0 ? buffer.toDataInput() : null; } - visit(root.firstChild, new BytesRefBuilder(), consumer); - } - private void visit(Node first, BytesRefBuilder key, BiConsumer consumer) { - while (first != null) { - key.append((byte) first.label); - if (first.output != null) { - consumer.accept(key.toBytesRef(), first.output); + boolean hasNext() { + if (!minKeyConsumed) return true; + return in != null && in.position() < in.length(); + } + + /** + * Phase 1: Read only the header (prefixLen and suffixLen). + * + *

After this call, {@link #key}[0..keyLen) still contains the previous key's bytes + * because the new suffix has not been written yet. This allows the caller to use key[] safely + * for freezing nodes between depth keyLen and prefixLen. + */ + void readHeader() throws IOException { + assert headerRead == false; + headerRead = true; + + if (!minKeyConsumed) { + prefixLen = 0; + suffixLen = minKey.length; + return; + } + + prefixLen = in.readVInt(); + suffixLen = in.readVInt(); + } + + /** + * Phase 2: Read the suffix bytes (overwriting key[prefixLen..]) and the output. + * + *

After this call, {@link #key}[0..keyLen) holds the current key and {@link + * #output} holds the current entry's output. + */ + void readBody() throws IOException { + assert headerRead == true; + headerRead = false; + + if (!minKeyConsumed) { + keyLen = minKey.length; + output = minOutput; + minKeyConsumed = true; + return; + } + + keyLen = prefixLen + suffixLen; + in.readBytes(key, prefixLen, suffixLen); + long fp = in.readVLong(); + boolean hasTerms = in.readByte() == 1; + int floorDataLen = in.readVInt(); + BytesRef floorData = null; + if (floorDataLen > 0) { + byte[] fd = new byte[floorDataLen]; + in.readBytes(fd, 0, floorDataLen); + floorData = new BytesRef(fd); } - visit(first.firstChild, key, consumer); - key.setLength(key.length() - 1); - first = first.next; + output = new Output(fp, hasTerms, floorData); } } void save(DataOutput meta, IndexOutput index) throws IOException { - if (status != Status.BUILDING) { - throw new IllegalStateException("only unsaved trie can be saved, got: " + status); - } meta.writeVLong(index.getFilePointer()); meta.writeVLong(saveNodes(index)); index.writeLong(0L); // additional 8 bytes for over-reading meta.writeVLong(index.getFilePointer()); - status = Status.SAVED; } + // ====== Frontier-based save ====== + + /** + * A frontier slot for one depth level. frontier[d] represents the trie node at depth d on the + * path from root to the last inserted key. + */ + private static class FrontierNode { + Output output; + int childrenNum; + int[] childLabels; + long[] childFps; + + FrontierNode() { + childLabels = new int[4]; + childFps = new long[4]; + } + + void reset() { + output = null; + childrenNum = 0; + } + + void addChild(int label, long fp) { + childLabels = ArrayUtil.grow(childLabels, childrenNum + 1); + childFps = ArrayUtil.grow(childFps, childrenNum + 1); + childLabels[childrenNum] = label; + childFps[childrenNum] = fp; + childrenNum++; + } + } + + /** + * Reconstruct the trie from the prefix-coded entries and serialize it to disk. + * + *

Uses a two-phase iteration: readHeader() exposes the prefixLen (which is the common prefix + * length with the previous key, already encoded in the buffer at append time) while key[] still + * holds the previous key's content — this allows freezeFrom() to read the correct labels without + * maintaining a separate prevKey copy. readBody() then overwrites key[] with the current entry. + */ long saveNodes(IndexOutput index) throws IOException { final long startFP = index.getFilePointer(); - Deque stack = new ArrayDeque<>(); - SaveFrame rootFrame = new SaveFrame(root); - stack.push(rootFrame); - - // Visit and save nodes of this trie in a post-order depth-first traversal. - while (stack.isEmpty() == false) { - SaveFrame frame = stack.peek(); - Node node = frame.node; - assert frame.fp == -1; - assert assertChildrenLabelInOrder(node); - - final int childrenNum = node.childrenNum; - - if (childrenNum == 0) { // leaf node - assert node.output != null : "leaf nodes should have output."; - - frame.fp = index.getFilePointer() - startFP; - stack.pop(); - if (stack.isEmpty() == false) { - stack.peek().addChildFp(frame.fp); - } - // [n bytes] floor data - // [n bytes] output fp - // [1bit] x | [1bit] has floor | [1bit] has terms | [3bit] output fp bytes | [2bit] sign + FrontierNode[] frontier = new FrontierNode[maxKeyDepth + 1]; + for (int i = 0; i <= maxKeyDepth; i++) { + frontier[i] = new FrontierNode(); + } + frontier[0].output = emptyOutput; + + EntryIterator iter = new EntryIterator(); + while (iter.hasNext()) { + // Phase 1: read prefixLen only; iter.key still holds the previous key's bytes. + int prevKeyLen = iter.keyLen; + iter.readHeader(); + + // Freeze frontier nodes from prevKeyLen down to iter.prefixLen. + // On the first entry prevKeyLen == 0 and prefixLen == 0, so the loop is a no-op. + freezeFrom(iter.key, prevKeyLen, iter.prefixLen, frontier, startFP, index); + + // Phase 2: read suffix + output; iter.key now holds the current key. + iter.readBody(); + frontier[iter.keyLen].output = iter.output; + } + + // Freeze all remaining nodes for the last key. + freezeFrom(iter.key, iter.keyLen, 0, frontier, startFP, index); + + return freezeNode(frontier[0], startFP, index); + } + /** + * Freeze frontier nodes from depth {@code keyLen} down to depth {@code toDepth + 1}. Each frozen + * node's fp is registered as a child of its parent (one level up). + */ + private void freezeFrom( + byte[] key, int keyLen, int toDepth, FrontierNode[] frontier, long startFP, IndexOutput index) + throws IOException { + for (int d = keyLen; d > toDepth; d--) { + long fp = freezeNode(frontier[d], startFP, index); + frontier[d - 1].addChild(key[d - 1] & 0xFF, fp); + frontier[d].reset(); + } + } + + /** + * Serialize a single frontier node to the index output and return its fp (relative to startFP). + */ + private long freezeNode(FrontierNode node, long startFP, IndexOutput index) throws IOException { + int childrenNum = node.childrenNum; + + if (childrenNum == 0) { + assert node.output != null : "leaf nodes should have output."; + long bottomFp = index.getFilePointer() - startFP; + Output output = node.output; + int outputFpBytes = bytesRequiredVLong(output.fp); + int header = + SIGN_NO_CHILDREN + | ((outputFpBytes - 1) << 2) + | (output.hasTerms ? LEAF_NODE_HAS_TERMS : 0) + | (output.floorData != null ? LEAF_NODE_HAS_FLOOR : 0); + index.writeByte(((byte) header)); + writeLongNBytes(output.fp, outputFpBytes, index); + if (output.floorData != null) { + index.writeBytes(output.floorData.bytes, output.floorData.offset, output.floorData.length); + } + return bottomFp; + } + + if (childrenNum == 1) { + long bottomFp = index.getFilePointer() - startFP; + long childDeltaFp = bottomFp - node.childFps[0]; + assert childDeltaFp > 0 : "parent node is always written after children: " + childDeltaFp; + int childFpBytes = bytesRequiredVLong(childDeltaFp); + int encodedOutputFpBytes = node.output == null ? 0 : bytesRequiredVLong(node.output.fp << 2); + + int sign = + node.output != null ? SIGN_SINGLE_CHILD_WITH_OUTPUT : SIGN_SINGLE_CHILD_WITHOUT_OUTPUT; + int header = sign | ((childFpBytes - 1) << 2) | ((encodedOutputFpBytes - 1) << 5); + index.writeByte((byte) header); + index.writeByte((byte) node.childLabels[0]); + writeLongNBytes(childDeltaFp, childFpBytes, index); + + if (node.output != null) { Output output = node.output; - int outputFpBytes = bytesRequiredVLong(output.fp); - int header = - SIGN_NO_CHILDREN - | ((outputFpBytes - 1) << 2) - | (output.hasTerms ? LEAF_NODE_HAS_TERMS : 0) - | (output.floorData != null ? LEAF_NODE_HAS_FLOOR : 0); - index.writeByte(((byte) header)); - writeLongNBytes(output.fp, outputFpBytes, index); + long encodedFp = encodeFP(output); + writeLongNBytes(encodedFp, encodedOutputFpBytes, index); if (output.floorData != null) { index.writeBytes( output.floorData.bytes, output.floorData.offset, output.floorData.length); } - continue; - } - - // If there are any children have not been saved, push the first one into stack and continue. - // We want to ensure saving children before parent. - - if (frame.savedTo == null) { - frame.savedTo = node.firstChild; - stack.push(new SaveFrame(frame.savedTo)); - continue; - } - if (frame.savedTo.next != null) { - assert frame.numChildFps > 0 && frame.childFps[frame.numChildFps - 1] >= 0; - frame.savedTo = frame.savedTo.next; - stack.push(new SaveFrame(frame.savedTo)); - continue; } + return bottomFp; + } - // All children have been written, now it's time to write the parent! - - assert assertNonLeafFramePreparingSaving(frame); - frame.fp = index.getFilePointer() - startFP; - stack.pop(); - if (stack.isEmpty() == false) { - stack.peek().addChildFp(frame.fp); + // Multi-children + long bottomFp = index.getFilePointer() - startFP; + + final int minLabel = node.childLabels[0]; + final int maxLabel = node.childLabels[childrenNum - 1]; + assert maxLabel > minLabel; + ChildSaveStrategy childSaveStrategy = ChildSaveStrategy.choose(minLabel, maxLabel, childrenNum); + int strategyBytes = childSaveStrategy.needBytes(minLabel, maxLabel, childrenNum); + assert strategyBytes > 0 && strategyBytes <= 32; + + long maxChildDeltaFp = bottomFp - node.childFps[0]; + assert maxChildDeltaFp > 0 : "parent always written after all children"; + + int childrenFpBytes = bytesRequiredVLong(maxChildDeltaFp); + int encodedOutputFpBytes = node.output == null ? 1 : bytesRequiredVLong(node.output.fp << 2); + int header = + SIGN_MULTI_CHILDREN + | ((childrenFpBytes - 1) << 2) + | ((node.output != null ? 1 : 0) << 5) + | ((encodedOutputFpBytes - 1) << 6) + | (childSaveStrategy.code << 9) + | ((strategyBytes - 1) << 11) + | (minLabel << 16); + + writeLongNBytes(header, 3, index); + + if (node.output != null) { + Output output = node.output; + long encodedFp = encodeFP(output); + writeLongNBytes(encodedFp, encodedOutputFpBytes, index); + if (output.floorData != null) { + index.writeByte((byte) (childrenNum - 1)); } + } - if (childrenNum == 1) { - - // [n bytes] floor data - // [n bytes] encoded output fp | [n bytes] child fp | [1 byte] label - // [3bit] encoded output fp bytes | [3bit] child fp bytes | [2bit] sign - - long childDeltaFp = frame.fp - frame.childFps[0]; - assert childDeltaFp > 0 : "parent node is always written after children: " + childDeltaFp; - int childFpBytes = bytesRequiredVLong(childDeltaFp); - int encodedOutputFpBytes = - node.output == null ? 0 : bytesRequiredVLong(node.output.fp << 2); - - // TODO if we have only one child and no output, we can store child labels in this node. - // E.g. for a single term trie [foobar], we can save only two nodes [fooba] and [r] - - int sign = - node.output != null ? SIGN_SINGLE_CHILD_WITH_OUTPUT : SIGN_SINGLE_CHILD_WITHOUT_OUTPUT; - int header = sign | ((childFpBytes - 1) << 2) | ((encodedOutputFpBytes - 1) << 5); - index.writeByte((byte) header); - index.writeByte((byte) node.firstChild.label); - writeLongNBytes(childDeltaFp, childFpBytes, index); - - if (node.output != null) { - Output output = node.output; - long encodedFp = encodeFP(output); - writeLongNBytes(encodedFp, encodedOutputFpBytes, index); - if (output.floorData != null) { - index.writeBytes( - output.floorData.bytes, output.floorData.offset, output.floorData.length); - } - } - } else { - - // [n bytes] floor data - // [n bytes] children fps | [n bytes] strategy data - // [1 byte] children count (if floor data) | [n bytes] encoded output fp | [1 byte] label - // [5bit] strategy bytes | 2bit children strategy | [3bit] encoded output fp bytes - // [1bit] has output | [3bit] children fp bytes | [2bit] sign - - final int minLabel = node.firstChild.label; - final int maxLabel = node.lastChild.label; - assert maxLabel > minLabel; - ChildSaveStrategy childSaveStrategy = - ChildSaveStrategy.choose(minLabel, maxLabel, childrenNum); - int strategyBytes = childSaveStrategy.needBytes(minLabel, maxLabel, childrenNum); - assert strategyBytes > 0 && strategyBytes <= 32; - - // children fps are in order, so the first child's fp is min, then delta is max. - long maxChildDeltaFp = frame.fp - frame.childFps[0]; - assert maxChildDeltaFp > 0 : "parent always written after all children"; - - int childrenFpBytes = bytesRequiredVLong(maxChildDeltaFp); - int encodedOutputFpBytes = - node.output == null ? 1 : bytesRequiredVLong(node.output.fp << 2); - int header = - SIGN_MULTI_CHILDREN - | ((childrenFpBytes - 1) << 2) - | ((node.output != null ? 1 : 0) << 5) - | ((encodedOutputFpBytes - 1) << 6) - | (childSaveStrategy.code << 9) - | ((strategyBytes - 1) << 11) - | (minLabel << 16); - - writeLongNBytes(header, 3, index); - - if (node.output != null) { - Output output = node.output; - long encodedFp = encodeFP(output); - writeLongNBytes(encodedFp, encodedOutputFpBytes, index); - if (output.floorData != null) { - // We need this childrenNum to compute where the floor data start. - index.writeByte((byte) (childrenNum - 1)); - } - } - - long strategyStartFp = index.getFilePointer(); - childSaveStrategy.save(node, childrenNum, strategyBytes, index); - assert index.getFilePointer() == strategyStartFp + strategyBytes - : childSaveStrategy.name() - + " strategy bytes compute error, computed: " - + strategyBytes - + " actual: " - + (index.getFilePointer() - strategyStartFp); - - for (int i = 0; i < frame.numChildFps; i++) { - assert frame.fp > frame.childFps[i] : "parent always written after all children"; - writeLongNBytes(frame.fp - frame.childFps[i], childrenFpBytes, index); - } + long strategyStartFp = index.getFilePointer(); + childSaveStrategy.save(node.childLabels, childrenNum, strategyBytes, index); + assert index.getFilePointer() == strategyStartFp + strategyBytes + : childSaveStrategy.name() + + " strategy bytes compute error, computed: " + + strategyBytes + + " actual: " + + (index.getFilePointer() - strategyStartFp); + + for (int i = 0; i < childrenNum; i++) { + assert bottomFp > node.childFps[i] : "parent always written after all children"; + writeLongNBytes(bottomFp - node.childFps[i], childrenFpBytes, index); + } - if (node.output != null && node.output.floorData != null) { - BytesRef floorData = node.output.floorData; - index.writeBytes(floorData.bytes, floorData.offset, floorData.length); - } - } + if (node.output != null && node.output.floorData != null) { + BytesRef floorData = node.output.floorData; + index.writeBytes(floorData.bytes, floorData.offset, floorData.length); } - return rootFrame.fp; + + return bottomFp; } private long encodeFP(Output output) { @@ -412,59 +476,12 @@ private static int bytesRequiredVLong(long v) { */ private static void writeLongNBytes(long v, int n, DataOutput out) throws IOException { for (int i = 0; i < n; i++) { - // Note that we sometimes write trailing 0 bytes here, when the incoming int n is bigger than - // would be required for a "normal" vLong out.writeByte((byte) v); v >>>= 8; } assert v == 0; } - private static boolean assertChildrenLabelInOrder(Node node) { - if (node.childrenNum == 0) { - assert node.firstChild == null; - assert node.lastChild == null; - } else if (node.childrenNum == 1) { - assert node.firstChild == node.lastChild; - assert node.firstChild.next == null; - } else if (node.childrenNum > 1) { - int n = 0; - for (Node child = node.firstChild; child != null; child = child.next) { - n++; - assert child.next == null || child.label < child.next.label - : " the label of children nodes should always be in strictly increasing order."; - } - assert node.childrenNum == n; - } - return true; - } - - private static boolean assertNonLeafFramePreparingSaving(SaveFrame frame) { - Node node = frame.node; - assert assertChildrenLabelInOrder(node); - assert node.childrenNum != 0; - assert frame.numChildFps == node.childrenNum - : "expected " + node.childrenNum + " child fps, got " + frame.numChildFps; - if (node.childrenNum == 1) { - assert node.firstChild == node.lastChild; - assert node.firstChild.next == null; - assert frame.savedTo == node.firstChild; - assert frame.childFps[0] >= 0; - } else { - int n = 0; - for (int i = 0; i < frame.numChildFps; i++) { - n++; - assert frame.childFps[i] >= 0; - assert i == frame.numChildFps - 1 || frame.childFps[i] < frame.childFps[i + 1] - : " the fp of children nodes should always be in order."; - } - assert node.childrenNum == n; - assert node.lastChild == frame.savedTo; - assert frame.savedTo.next == null; - } - return true; - } - enum ChildSaveStrategy { /** @@ -479,13 +496,13 @@ int needBytes(int minLabel, int maxLabel, int labelCnt) { } @Override - void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output) + void save(int[] childLabels, int labelCnt, int strategyBytes, IndexOutput output) throws IOException { byte presenceBits = 1; // The first arc is always present. int presenceIndex = 0; - int previousLabel = parent.firstChild.label; - for (Node child = parent.firstChild.next; child != null; child = child.next) { - int label = child.label; + int previousLabel = childLabels[0]; + for (int i = 1; i < labelCnt; i++) { + int label = childLabels[i]; assert label > previousLabel; presenceIndex += label - previousLabel; while (presenceIndex >= Byte.SIZE) { @@ -493,13 +510,9 @@ void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output) presenceBits = 0; presenceIndex -= Byte.SIZE; } - // Set the bit at presenceIndex to flag that the corresponding arc is present. presenceBits |= 1 << presenceIndex; previousLabel = label; } - assert presenceIndex == (parent.lastChild.label - parent.firstChild.label) % 8; - assert presenceBits != 0; // The last byte is not 0. - assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. output.writeByte(presenceBits); } @@ -535,14 +548,14 @@ int lookup( ARRAY(1) { @Override int needBytes(int minLabel, int maxLabel, int labelCnt) { - return labelCnt - 1; // min label saved + return labelCnt - 1; } @Override - void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output) + void save(int[] childLabels, int labelCnt, int strategyBytes, IndexOutput output) throws IOException { - for (Node child = parent.firstChild.next; child != null; child = child.next) { - output.writeByte((byte) child.label); + for (int i = 1; i < labelCnt; i++) { + output.writeByte((byte) childLabels[i]); } } @@ -560,7 +573,7 @@ int lookup( } else if (midLabel > targetLabel) { high = mid - 1; } else { - return mid + 1; // min label not included, plus 1 + return mid + 1; } } return -1; @@ -582,12 +595,12 @@ int needBytes(int minLabel, int maxLabel, int labelCnt) { } @Override - void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output) + void save(int[] childLabels, int labelCnt, int strategyBytes, IndexOutput output) throws IOException { - output.writeByte((byte) parent.lastChild.label); - int lastLabel = parent.firstChild.label; - for (Node child = parent.firstChild.next; child != null; child = child.next) { - while (++lastLabel < child.label) { + output.writeByte((byte) childLabels[labelCnt - 1]); + int lastLabel = childLabels[0]; + for (int i = 1; i < labelCnt; i++) { + while (++lastLabel < childLabels[i]) { output.writeByte((byte) lastLabel); } } @@ -642,7 +655,7 @@ int lookup( abstract int needBytes(int minLabel, int maxLabel, int labelCnt); - abstract void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output) + abstract void save(int[] childLabels, int labelCnt, int strategyBytes, IndexOutput output) throws IOException; abstract int lookup( @@ -668,4 +681,22 @@ static ChildSaveStrategy choose(int minLabel, int maxLabel, int labelCnt) { return childSaveStrategy; } } + + /** Used for tests only. */ + void visit(BiConsumer consumer) { + if (emptyOutput != null) { + consumer.accept(new BytesRef(), emptyOutput); + } + try { + EntryIterator iter = new EntryIterator(); + while (iter.hasNext()) { + iter.readHeader(); + iter.readBody(); + consumer.accept( + new BytesRef(ArrayUtil.copyOfSubArray(iter.key, 0, iter.keyLen)), iter.output); + } + } catch (IOException e) { + throw new UncheckedIOException("should not happen on in-memory buffer", e); + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java index f1bc495f656e..e6f022df54e8 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java @@ -95,8 +95,6 @@ private void testTrieBuilder(Supplier randomBytesSupplier, int count) { } TrieBuilder add = TrieBuilder.bytesRefToTrie(entry.getKey(), entry.getValue()); trieBuilder.append(add); - Assert.assertThrows(IllegalStateException.class, () -> add.append(trieBuilder)); - Assert.assertThrows(IllegalStateException.class, () -> trieBuilder.append(add)); } Map actual = new TreeMap<>(); trieBuilder.visit(actual::put); @@ -128,21 +126,12 @@ private void testTrieLookup(Supplier randomBytesSupplier, int round) thr } TrieBuilder add = TrieBuilder.bytesRefToTrie(entry.getKey(), entry.getValue()); trieBuilder.append(add); - Assert.assertThrows(IllegalStateException.class, () -> add.append(trieBuilder)); - Assert.assertThrows(IllegalStateException.class, () -> trieBuilder.append(add)); } try (Directory directory = newDirectory()) { try (IndexOutput index = directory.createOutput("index", IOContext.DEFAULT); IndexOutput meta = directory.createOutput("meta", IOContext.DEFAULT)) { trieBuilder.save(meta, index); - assertThrows(IllegalStateException.class, () -> trieBuilder.save(meta, index)); - assertThrows( - IllegalStateException.class, - () -> - trieBuilder.append( - TrieBuilder.bytesRefToTrie( - new BytesRef(), new TrieBuilder.Output(0L, true, null)))); } try (IndexInput indexIn = directory.openInput("index", IOContext.DEFAULT);