diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 7c17ef0d8e95..ee3514bd7e0e 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -326,7 +326,8 @@ Optimizations
* GITHUB#15938: Optimize ReaderUtil#partitionByLeaf to use binary search on leaf
boundaries instead of linear scan. (Greg Miller)
-* GITHUB#15970: Reduce memory usage of fields with long terms during segment merges. (Alan Woodward)
+* GITHUB#15970, GITHUB#15977: Speed up TrieBuilder and reduce its memory footprint by replacing the in-memory object
+ tree with a compact prefix-coded byte buffer. (Alan Woodward, Guo Feng)
Bug Fixes
---------------------
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java
index 8d290cafe37e..0a042571d1d7 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java
@@ -17,20 +17,27 @@
package org.apache.lucene.codecs.lucene103.blocktree;
import java.io.IOException;
-import java.util.ArrayDeque;
+import java.io.UncheckedIOException;
import java.util.Arrays;
-import java.util.Deque;
import java.util.function.BiConsumer;
+import org.apache.lucene.store.ByteBuffersDataInput;
+import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
/**
* A builder to build prefix tree (trie) as the index of block tree, and can be saved to disk.
*
- *
- * <p>TODO make this trie builder a more memory efficient structure.
+ *
+ * <p>Entries are stored in a compact prefix-coded byte buffer during building. The first non-empty
+ * key (minKey) is stored separately so that {@link #append} can re-encode only that one entry and
+ * bulk-copy the remaining bytes. At {@link #save} time the trie structure is reconstructed
+ * on-the-fly using a frontier array and serialized to disk in a single pass.
+ *
+ *
+ * <p>Memory usage is O(total encoded bytes) during building and O(max key depth) during save.
*/
class TrieBuilder {
@@ -54,343 +61,400 @@ class TrieBuilder {
*/
record Output(long fp, boolean hasTerms, BytesRef floorData) {}
- private enum Status {
- BUILDING,
- SAVED,
- DESTROYED
- }
+ // ====== Prefix-coded buffer entry format ======
+ // Each entry:
+ // [prefixLen: vInt] [suffixLen: vInt] [suffix: bytes]
+ // [fp: vLong] [hasTerms: byte (0/1)] [floorDataLen: vInt] [floorData: bytes]
+ //
+ // The first non-empty-key entry (minKey) is stored separately and NOT in the buffer.
+ // Buffer entries start from the second entry onward, prefix-encoded against their predecessor.
- private static class Node {
-
- // The utf8 digit that leads to this Node, 0 for root node
- private final int label;
- // The output of this node.
- private Output output;
- // The number of children of this node.
- private int childrenNum;
- // Pointers to relative nodes
- private Node next;
- private Node firstChild;
- private Node lastChild;
-
- Node(int label, Output output) {
- this.label = label;
- this.output = output;
- }
- }
+ /** Output for the empty ("") key, if any. */
+ private Output emptyOutput;
- /**
- * Transient state used only during {@link #saveNodes}. Keeps save-time bookkeeping off of {@link
- * Node} so that Nodes are smaller during the (much longer) building phase.
- */
- private static class SaveFrame {
- final Node node;
- // Iteration cursor: the latest child that has been pushed for saving.
- Node savedTo;
- // File pointer assigned when this node is written. -1 until then.
- long fp = -1;
- // File pointers of children, collected as they are popped. Null for leaf nodes.
- long[] childFps;
- int numChildFps;
+ /** The first non-empty key, stored separately from the buffer. */
+ private final BytesRef minKey;
- SaveFrame(Node node) {
- this.node = node;
- if (node.childrenNum > 0) {
- this.childFps = new long[node.childrenNum];
- }
- }
+ /** Output for minKey. Null if there is no non-empty key entry. */
+ private Output minOutput;
- void addChildFp(long childFp) {
- childFps[numChildFps++] = childFp;
- }
- }
+ /** Buffer holding all entries after minKey, prefix-encoded. */
+ private final ByteBuffersDataOutput buffer = new ByteBuffersDataOutput();
- private final Node root = new Node(0, null);
- private final BytesRef minKey;
- private BytesRef maxKey;
- private Status status = Status.BUILDING;
+ /**
+ * The last key appended. Since entries are always appended in sorted order this doubles as the
+ * lexicographically largest key for ordered-append assertions.
+ */
+ private final BytesRefBuilder lastKey = new BytesRefBuilder();
+
+ /** The maximum key length across all entries. */
+ private int maxKeyDepth;
static TrieBuilder bytesRefToTrie(BytesRef k, Output v) {
return new TrieBuilder(k, v);
}
private TrieBuilder(BytesRef k, Output v) {
- minKey = maxKey = BytesRef.deepCopyOf(k);
+ minKey = BytesRef.deepCopyOf(k);
+ maxKeyDepth = k.length;
+
if (k.length == 0) {
- root.output = v;
- return;
- }
- Node parent = root;
- for (int i = 0; i < k.length; i++) {
- int b = k.bytes[i + k.offset] & 0xFF;
- Output output = i == k.length - 1 ? v : null;
- Node node = new Node(b, output);
- parent.firstChild = parent.lastChild = node;
- parent.childrenNum = 1;
- parent = node;
+ emptyOutput = v;
+ } else {
+ minOutput = v;
+ lastKey.copyBytes(k);
}
}
/**
* Append all (K, V) pairs from the given trie into this one. The given trie builder need to
* ensure its keys greater or equals than max key of this one.
- *
- *
- * <p>Note: the given trie will be destroyed after appending.
*/
- void append(TrieBuilder trieBuilder) {
- if (status != Status.BUILDING || trieBuilder.status != Status.BUILDING) {
- throw new IllegalStateException(
- "tries have wrong status, got this: " + status + ", append: " + trieBuilder.status);
- }
- assert this.maxKey.compareTo(trieBuilder.minKey) < 0;
-
- int mismatch =
- Arrays.mismatch(
- this.maxKey.bytes,
- this.maxKey.offset,
- this.maxKey.offset + this.maxKey.length,
- trieBuilder.minKey.bytes,
- trieBuilder.minKey.offset,
- trieBuilder.minKey.offset + trieBuilder.minKey.length);
- Node a = this.root;
- Node b = trieBuilder.root;
-
- for (int i = 0; i < mismatch; i++) {
- final Node aLast = a.lastChild;
- final Node bFirst = b.firstChild;
- assert aLast.label == bFirst.label;
-
- if (b.childrenNum > 1) {
- aLast.next = bFirst.next;
- a.childrenNum += b.childrenNum - 1;
- a.lastChild = b.lastChild;
- assert assertChildrenLabelInOrder(a);
- }
+ void append(TrieBuilder other) {
+ assert this.lastKey.get().compareTo(other.minKey) < 0;
- a = aLast;
- b = bFirst;
+ if (other.emptyOutput != null && this.emptyOutput == null) {
+ this.emptyOutput = other.emptyOutput;
}
- assert b.childrenNum > 0;
- if (a.childrenNum == 0) {
- a.firstChild = b.firstChild;
- a.lastChild = b.lastChild;
- a.childrenNum = b.childrenNum;
- } else {
- assert a.lastChild.label < b.firstChild.label;
- a.lastChild.next = b.firstChild;
- a.lastChild = b.lastChild;
- a.childrenNum += b.childrenNum;
+ if (other.minOutput != null) {
+ try {
+
+ // Encode and append the min entry in 'other' into the buffer, prefix-encoded against
+ // lastKey.
+ BytesRef lastKeyRef = lastKey.get();
+ int mismatch =
+ Arrays.mismatch(
+ lastKeyRef.bytes,
+ lastKeyRef.offset,
+ lastKeyRef.offset + lastKeyRef.length,
+ other.minKey.bytes,
+ other.minKey.offset,
+ other.minKey.offset + other.minKey.length);
+ if (mismatch == -1) {
+ mismatch = Math.min(lastKeyRef.length, other.minKey.length);
+ }
+ int suffixLen = other.minKey.length - mismatch;
+ buffer.writeVInt(mismatch);
+ buffer.writeVInt(suffixLen);
+ buffer.writeBytes(other.minKey.bytes, other.minKey.offset + mismatch, suffixLen);
+ buffer.writeVLong(other.minOutput.fp);
+ buffer.writeByte((byte) (other.minOutput.hasTerms ? 1 : 0));
+ if (other.minOutput.floorData != null) {
+ BytesRef floorData = other.minOutput.floorData;
+ buffer.writeVInt(floorData.length);
+ buffer.writeBytes(floorData.bytes, floorData.offset, floorData.length);
+ } else {
+ buffer.writeVInt(0);
+ }
+
+ // Remaining buffer entries in 'other' are each prefix-encoded against their predecessor,
+ // starting from other.minKey (just appended above), so bulk-copy is safe without re-encoding.
+ if (other.buffer.size() > 0) {
+ ByteBuffersDataInput otherIn = other.buffer.toDataInput();
+ buffer.copyBytes(otherIn, otherIn.length());
+ }
+
+ } catch (IOException e) {
+ throw new UncheckedIOException("should not happen on in-memory buffer", e);
+ }
}
- assert assertChildrenLabelInOrder(a);
- this.maxKey = trieBuilder.maxKey;
- trieBuilder.status = Status.DESTROYED;
+ lastKey.copyBytes(other.lastKey);
+ this.maxKeyDepth = Math.max(this.maxKeyDepth, other.maxKeyDepth);
}
Output getEmptyOutput() {
- return root.output;
+ return emptyOutput;
}
+ // ====== Entry iteration (shared by visit and saveNodes) ======
+
/**
- * Used for tests only. The recursive impl need to be avoided if someone plans to use for
- * production one day.
+ * Iterates over all non-empty-key entries: first the separately-stored minKey, then the
+ * prefix-coded buffer entries. Maintains a reusable key buffer and exposes the prefix length from
+ * the encoding, which equals the common prefix length with the preceding key.
+ *
+ *
+ * <p>Supports two-phase iteration via {@link #readHeader()} and {@link #readBody()}: after
+ * readHeader(), {@link #key} still holds the previous key's bytes (the new suffix has not been
+ * written yet), and {@link #prefixLen} gives the boundary for freezing. After readBody(), {@link
+ * #key} is updated to the current key.
*/
- void visit(BiConsumer consumer) {
- assert status == Status.BUILDING;
- if (root.output != null) {
- consumer.accept(new BytesRef(), root.output);
+ class EntryIterator {
+ private final ByteBuffersDataInput in;
+ private boolean minKeyConsumed;
+ boolean headerRead = false;
+
+ // header
+ int suffixLen;
+ int prefixLen;
+
+ // body
+ byte[] key;
+ int keyLen;
+ Output output;
+
+ EntryIterator() {
+ key = new byte[Math.max(maxKeyDepth, 1)];
+ System.arraycopy(minKey.bytes, minKey.offset, key, 0, minKey.length);
+ keyLen = 0;
+ minKeyConsumed = (minOutput == null);
+ in = buffer.size() > 0 ? buffer.toDataInput() : null;
}
- visit(root.firstChild, new BytesRefBuilder(), consumer);
- }
- private void visit(Node first, BytesRefBuilder key, BiConsumer consumer) {
- while (first != null) {
- key.append((byte) first.label);
- if (first.output != null) {
- consumer.accept(key.toBytesRef(), first.output);
+ boolean hasNext() {
+ if (!minKeyConsumed) return true;
+ return in != null && in.position() < in.length();
+ }
+
+ /**
+ * Phase 1: Read only the header (prefixLen and suffixLen).
+ *
+ * <p>After this call, {@link #key}[0..keyLen) still contains the previous key's bytes
+ * because the new suffix has not been written yet. This allows the caller to use key[] safely
+ * for freezing nodes between depth keyLen and prefixLen.
+ */
+ void readHeader() throws IOException {
+ assert headerRead == false;
+ headerRead = true;
+
+ if (!minKeyConsumed) {
+ prefixLen = 0;
+ suffixLen = minKey.length;
+ return;
+ }
+
+ prefixLen = in.readVInt();
+ suffixLen = in.readVInt();
+ }
+
+ /**
+ * Phase 2: Read the suffix bytes (overwriting key[prefixLen..]) and the output.
+ *
+ *
+ * <p>After this call, {@link #key}[0..keyLen) holds the current key and {@link
+ * #output} holds the current entry's output.
+ */
+ void readBody() throws IOException {
+ assert headerRead == true;
+ headerRead = false;
+
+ if (!minKeyConsumed) {
+ keyLen = minKey.length;
+ output = minOutput;
+ minKeyConsumed = true;
+ return;
+ }
+
+ keyLen = prefixLen + suffixLen;
+ in.readBytes(key, prefixLen, suffixLen);
+ long fp = in.readVLong();
+ boolean hasTerms = in.readByte() == 1;
+ int floorDataLen = in.readVInt();
+ BytesRef floorData = null;
+ if (floorDataLen > 0) {
+ byte[] fd = new byte[floorDataLen];
+ in.readBytes(fd, 0, floorDataLen);
+ floorData = new BytesRef(fd);
}
- visit(first.firstChild, key, consumer);
- key.setLength(key.length() - 1);
- first = first.next;
+ output = new Output(fp, hasTerms, floorData);
}
}
void save(DataOutput meta, IndexOutput index) throws IOException {
- if (status != Status.BUILDING) {
- throw new IllegalStateException("only unsaved trie can be saved, got: " + status);
- }
meta.writeVLong(index.getFilePointer());
meta.writeVLong(saveNodes(index));
index.writeLong(0L); // additional 8 bytes for over-reading
meta.writeVLong(index.getFilePointer());
- status = Status.SAVED;
}
+ // ====== Frontier-based save ======
+
+ /**
+ * A frontier slot for one depth level. frontier[d] represents the trie node at depth d on the
+ * path from root to the last inserted key.
+ */
+ private static class FrontierNode {
+ Output output;
+ int childrenNum;
+ int[] childLabels;
+ long[] childFps;
+
+ FrontierNode() {
+ childLabels = new int[4];
+ childFps = new long[4];
+ }
+
+ void reset() {
+ output = null;
+ childrenNum = 0;
+ }
+
+ void addChild(int label, long fp) {
+ childLabels = ArrayUtil.grow(childLabels, childrenNum + 1);
+ childFps = ArrayUtil.grow(childFps, childrenNum + 1);
+ childLabels[childrenNum] = label;
+ childFps[childrenNum] = fp;
+ childrenNum++;
+ }
+ }
+
+ /**
+ * Reconstruct the trie from the prefix-coded entries and serialize it to disk.
+ *
+ *
+ * <p>Uses a two-phase iteration: readHeader() exposes the prefixLen (which is the common prefix
+ * length with the previous key, already encoded in the buffer at append time) while key[] still
+ * holds the previous key's content — this allows freezeFrom() to read the correct labels without
+ * maintaining a separate prevKey copy. readBody() then overwrites key[] with the current entry.
+ */
long saveNodes(IndexOutput index) throws IOException {
final long startFP = index.getFilePointer();
- Deque stack = new ArrayDeque<>();
- SaveFrame rootFrame = new SaveFrame(root);
- stack.push(rootFrame);
-
- // Visit and save nodes of this trie in a post-order depth-first traversal.
- while (stack.isEmpty() == false) {
- SaveFrame frame = stack.peek();
- Node node = frame.node;
- assert frame.fp == -1;
- assert assertChildrenLabelInOrder(node);
-
- final int childrenNum = node.childrenNum;
-
- if (childrenNum == 0) { // leaf node
- assert node.output != null : "leaf nodes should have output.";
-
- frame.fp = index.getFilePointer() - startFP;
- stack.pop();
- if (stack.isEmpty() == false) {
- stack.peek().addChildFp(frame.fp);
- }
- // [n bytes] floor data
- // [n bytes] output fp
- // [1bit] x | [1bit] has floor | [1bit] has terms | [3bit] output fp bytes | [2bit] sign
+ FrontierNode[] frontier = new FrontierNode[maxKeyDepth + 1];
+ for (int i = 0; i <= maxKeyDepth; i++) {
+ frontier[i] = new FrontierNode();
+ }
+ frontier[0].output = emptyOutput;
+
+ EntryIterator iter = new EntryIterator();
+ while (iter.hasNext()) {
+ // Phase 1: read prefixLen only; iter.key still holds the previous key's bytes.
+ int prevKeyLen = iter.keyLen;
+ iter.readHeader();
+
+ // Freeze frontier nodes from prevKeyLen down to iter.prefixLen.
+ // On the first entry prevKeyLen == 0 and prefixLen == 0, so the loop is a no-op.
+ freezeFrom(iter.key, prevKeyLen, iter.prefixLen, frontier, startFP, index);
+
+ // Phase 2: read suffix + output; iter.key now holds the current key.
+ iter.readBody();
+ frontier[iter.keyLen].output = iter.output;
+ }
+
+ // Freeze all remaining nodes for the last key.
+ freezeFrom(iter.key, iter.keyLen, 0, frontier, startFP, index);
+
+ return freezeNode(frontier[0], startFP, index);
+ }
+ /**
+ * Freeze frontier nodes from depth {@code keyLen} down to depth {@code toDepth + 1}. Each frozen
+ * node's fp is registered as a child of its parent (one level up).
+ */
+ private void freezeFrom(
+ byte[] key, int keyLen, int toDepth, FrontierNode[] frontier, long startFP, IndexOutput index)
+ throws IOException {
+ for (int d = keyLen; d > toDepth; d--) {
+ long fp = freezeNode(frontier[d], startFP, index);
+ frontier[d - 1].addChild(key[d - 1] & 0xFF, fp);
+ frontier[d].reset();
+ }
+ }
+
+ /**
+ * Serialize a single frontier node to the index output and return its fp (relative to startFP).
+ */
+ private long freezeNode(FrontierNode node, long startFP, IndexOutput index) throws IOException {
+ int childrenNum = node.childrenNum;
+
+ if (childrenNum == 0) {
+ assert node.output != null : "leaf nodes should have output.";
+ long bottomFp = index.getFilePointer() - startFP;
+ Output output = node.output;
+ int outputFpBytes = bytesRequiredVLong(output.fp);
+ int header =
+ SIGN_NO_CHILDREN
+ | ((outputFpBytes - 1) << 2)
+ | (output.hasTerms ? LEAF_NODE_HAS_TERMS : 0)
+ | (output.floorData != null ? LEAF_NODE_HAS_FLOOR : 0);
+ index.writeByte(((byte) header));
+ writeLongNBytes(output.fp, outputFpBytes, index);
+ if (output.floorData != null) {
+ index.writeBytes(output.floorData.bytes, output.floorData.offset, output.floorData.length);
+ }
+ return bottomFp;
+ }
+
+ if (childrenNum == 1) {
+ long bottomFp = index.getFilePointer() - startFP;
+ long childDeltaFp = bottomFp - node.childFps[0];
+ assert childDeltaFp > 0 : "parent node is always written after children: " + childDeltaFp;
+ int childFpBytes = bytesRequiredVLong(childDeltaFp);
+ int encodedOutputFpBytes = node.output == null ? 0 : bytesRequiredVLong(node.output.fp << 2);
+
+ int sign =
+ node.output != null ? SIGN_SINGLE_CHILD_WITH_OUTPUT : SIGN_SINGLE_CHILD_WITHOUT_OUTPUT;
+ int header = sign | ((childFpBytes - 1) << 2) | ((encodedOutputFpBytes - 1) << 5);
+ index.writeByte((byte) header);
+ index.writeByte((byte) node.childLabels[0]);
+ writeLongNBytes(childDeltaFp, childFpBytes, index);
+
+ if (node.output != null) {
Output output = node.output;
- int outputFpBytes = bytesRequiredVLong(output.fp);
- int header =
- SIGN_NO_CHILDREN
- | ((outputFpBytes - 1) << 2)
- | (output.hasTerms ? LEAF_NODE_HAS_TERMS : 0)
- | (output.floorData != null ? LEAF_NODE_HAS_FLOOR : 0);
- index.writeByte(((byte) header));
- writeLongNBytes(output.fp, outputFpBytes, index);
+ long encodedFp = encodeFP(output);
+ writeLongNBytes(encodedFp, encodedOutputFpBytes, index);
if (output.floorData != null) {
index.writeBytes(
output.floorData.bytes, output.floorData.offset, output.floorData.length);
}
- continue;
- }
-
- // If there are any children have not been saved, push the first one into stack and continue.
- // We want to ensure saving children before parent.
-
- if (frame.savedTo == null) {
- frame.savedTo = node.firstChild;
- stack.push(new SaveFrame(frame.savedTo));
- continue;
- }
- if (frame.savedTo.next != null) {
- assert frame.numChildFps > 0 && frame.childFps[frame.numChildFps - 1] >= 0;
- frame.savedTo = frame.savedTo.next;
- stack.push(new SaveFrame(frame.savedTo));
- continue;
}
+ return bottomFp;
+ }
- // All children have been written, now it's time to write the parent!
-
- assert assertNonLeafFramePreparingSaving(frame);
- frame.fp = index.getFilePointer() - startFP;
- stack.pop();
- if (stack.isEmpty() == false) {
- stack.peek().addChildFp(frame.fp);
+ // Multi-children
+ long bottomFp = index.getFilePointer() - startFP;
+
+ final int minLabel = node.childLabels[0];
+ final int maxLabel = node.childLabels[childrenNum - 1];
+ assert maxLabel > minLabel;
+ ChildSaveStrategy childSaveStrategy = ChildSaveStrategy.choose(minLabel, maxLabel, childrenNum);
+ int strategyBytes = childSaveStrategy.needBytes(minLabel, maxLabel, childrenNum);
+ assert strategyBytes > 0 && strategyBytes <= 32;
+
+ long maxChildDeltaFp = bottomFp - node.childFps[0];
+ assert maxChildDeltaFp > 0 : "parent always written after all children";
+
+ int childrenFpBytes = bytesRequiredVLong(maxChildDeltaFp);
+ int encodedOutputFpBytes = node.output == null ? 1 : bytesRequiredVLong(node.output.fp << 2);
+ int header =
+ SIGN_MULTI_CHILDREN
+ | ((childrenFpBytes - 1) << 2)
+ | ((node.output != null ? 1 : 0) << 5)
+ | ((encodedOutputFpBytes - 1) << 6)
+ | (childSaveStrategy.code << 9)
+ | ((strategyBytes - 1) << 11)
+ | (minLabel << 16);
+
+ writeLongNBytes(header, 3, index);
+
+ if (node.output != null) {
+ Output output = node.output;
+ long encodedFp = encodeFP(output);
+ writeLongNBytes(encodedFp, encodedOutputFpBytes, index);
+ if (output.floorData != null) {
+ index.writeByte((byte) (childrenNum - 1));
}
+ }
- if (childrenNum == 1) {
-
- // [n bytes] floor data
- // [n bytes] encoded output fp | [n bytes] child fp | [1 byte] label
- // [3bit] encoded output fp bytes | [3bit] child fp bytes | [2bit] sign
-
- long childDeltaFp = frame.fp - frame.childFps[0];
- assert childDeltaFp > 0 : "parent node is always written after children: " + childDeltaFp;
- int childFpBytes = bytesRequiredVLong(childDeltaFp);
- int encodedOutputFpBytes =
- node.output == null ? 0 : bytesRequiredVLong(node.output.fp << 2);
-
- // TODO if we have only one child and no output, we can store child labels in this node.
- // E.g. for a single term trie [foobar], we can save only two nodes [fooba] and [r]
-
- int sign =
- node.output != null ? SIGN_SINGLE_CHILD_WITH_OUTPUT : SIGN_SINGLE_CHILD_WITHOUT_OUTPUT;
- int header = sign | ((childFpBytes - 1) << 2) | ((encodedOutputFpBytes - 1) << 5);
- index.writeByte((byte) header);
- index.writeByte((byte) node.firstChild.label);
- writeLongNBytes(childDeltaFp, childFpBytes, index);
-
- if (node.output != null) {
- Output output = node.output;
- long encodedFp = encodeFP(output);
- writeLongNBytes(encodedFp, encodedOutputFpBytes, index);
- if (output.floorData != null) {
- index.writeBytes(
- output.floorData.bytes, output.floorData.offset, output.floorData.length);
- }
- }
- } else {
-
- // [n bytes] floor data
- // [n bytes] children fps | [n bytes] strategy data
- // [1 byte] children count (if floor data) | [n bytes] encoded output fp | [1 byte] label
- // [5bit] strategy bytes | 2bit children strategy | [3bit] encoded output fp bytes
- // [1bit] has output | [3bit] children fp bytes | [2bit] sign
-
- final int minLabel = node.firstChild.label;
- final int maxLabel = node.lastChild.label;
- assert maxLabel > minLabel;
- ChildSaveStrategy childSaveStrategy =
- ChildSaveStrategy.choose(minLabel, maxLabel, childrenNum);
- int strategyBytes = childSaveStrategy.needBytes(minLabel, maxLabel, childrenNum);
- assert strategyBytes > 0 && strategyBytes <= 32;
-
- // children fps are in order, so the first child's fp is min, then delta is max.
- long maxChildDeltaFp = frame.fp - frame.childFps[0];
- assert maxChildDeltaFp > 0 : "parent always written after all children";
-
- int childrenFpBytes = bytesRequiredVLong(maxChildDeltaFp);
- int encodedOutputFpBytes =
- node.output == null ? 1 : bytesRequiredVLong(node.output.fp << 2);
- int header =
- SIGN_MULTI_CHILDREN
- | ((childrenFpBytes - 1) << 2)
- | ((node.output != null ? 1 : 0) << 5)
- | ((encodedOutputFpBytes - 1) << 6)
- | (childSaveStrategy.code << 9)
- | ((strategyBytes - 1) << 11)
- | (minLabel << 16);
-
- writeLongNBytes(header, 3, index);
-
- if (node.output != null) {
- Output output = node.output;
- long encodedFp = encodeFP(output);
- writeLongNBytes(encodedFp, encodedOutputFpBytes, index);
- if (output.floorData != null) {
- // We need this childrenNum to compute where the floor data start.
- index.writeByte((byte) (childrenNum - 1));
- }
- }
-
- long strategyStartFp = index.getFilePointer();
- childSaveStrategy.save(node, childrenNum, strategyBytes, index);
- assert index.getFilePointer() == strategyStartFp + strategyBytes
- : childSaveStrategy.name()
- + " strategy bytes compute error, computed: "
- + strategyBytes
- + " actual: "
- + (index.getFilePointer() - strategyStartFp);
-
- for (int i = 0; i < frame.numChildFps; i++) {
- assert frame.fp > frame.childFps[i] : "parent always written after all children";
- writeLongNBytes(frame.fp - frame.childFps[i], childrenFpBytes, index);
- }
+ long strategyStartFp = index.getFilePointer();
+ childSaveStrategy.save(node.childLabels, childrenNum, strategyBytes, index);
+ assert index.getFilePointer() == strategyStartFp + strategyBytes
+ : childSaveStrategy.name()
+ + " strategy bytes compute error, computed: "
+ + strategyBytes
+ + " actual: "
+ + (index.getFilePointer() - strategyStartFp);
+
+ for (int i = 0; i < childrenNum; i++) {
+ assert bottomFp > node.childFps[i] : "parent always written after all children";
+ writeLongNBytes(bottomFp - node.childFps[i], childrenFpBytes, index);
+ }
- if (node.output != null && node.output.floorData != null) {
- BytesRef floorData = node.output.floorData;
- index.writeBytes(floorData.bytes, floorData.offset, floorData.length);
- }
- }
+ if (node.output != null && node.output.floorData != null) {
+ BytesRef floorData = node.output.floorData;
+ index.writeBytes(floorData.bytes, floorData.offset, floorData.length);
}
- return rootFrame.fp;
+
+ return bottomFp;
}
private long encodeFP(Output output) {
@@ -412,59 +476,12 @@ private static int bytesRequiredVLong(long v) {
*/
private static void writeLongNBytes(long v, int n, DataOutput out) throws IOException {
for (int i = 0; i < n; i++) {
- // Note that we sometimes write trailing 0 bytes here, when the incoming int n is bigger than
- // would be required for a "normal" vLong
out.writeByte((byte) v);
v >>>= 8;
}
assert v == 0;
}
- private static boolean assertChildrenLabelInOrder(Node node) {
- if (node.childrenNum == 0) {
- assert node.firstChild == null;
- assert node.lastChild == null;
- } else if (node.childrenNum == 1) {
- assert node.firstChild == node.lastChild;
- assert node.firstChild.next == null;
- } else if (node.childrenNum > 1) {
- int n = 0;
- for (Node child = node.firstChild; child != null; child = child.next) {
- n++;
- assert child.next == null || child.label < child.next.label
- : " the label of children nodes should always be in strictly increasing order.";
- }
- assert node.childrenNum == n;
- }
- return true;
- }
-
- private static boolean assertNonLeafFramePreparingSaving(SaveFrame frame) {
- Node node = frame.node;
- assert assertChildrenLabelInOrder(node);
- assert node.childrenNum != 0;
- assert frame.numChildFps == node.childrenNum
- : "expected " + node.childrenNum + " child fps, got " + frame.numChildFps;
- if (node.childrenNum == 1) {
- assert node.firstChild == node.lastChild;
- assert node.firstChild.next == null;
- assert frame.savedTo == node.firstChild;
- assert frame.childFps[0] >= 0;
- } else {
- int n = 0;
- for (int i = 0; i < frame.numChildFps; i++) {
- n++;
- assert frame.childFps[i] >= 0;
- assert i == frame.numChildFps - 1 || frame.childFps[i] < frame.childFps[i + 1]
- : " the fp of children nodes should always be in order.";
- }
- assert node.childrenNum == n;
- assert node.lastChild == frame.savedTo;
- assert frame.savedTo.next == null;
- }
- return true;
- }
-
enum ChildSaveStrategy {
/**
@@ -479,13 +496,13 @@ int needBytes(int minLabel, int maxLabel, int labelCnt) {
}
@Override
- void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output)
+ void save(int[] childLabels, int labelCnt, int strategyBytes, IndexOutput output)
throws IOException {
byte presenceBits = 1; // The first arc is always present.
int presenceIndex = 0;
- int previousLabel = parent.firstChild.label;
- for (Node child = parent.firstChild.next; child != null; child = child.next) {
- int label = child.label;
+ int previousLabel = childLabels[0];
+ for (int i = 1; i < labelCnt; i++) {
+ int label = childLabels[i];
assert label > previousLabel;
presenceIndex += label - previousLabel;
while (presenceIndex >= Byte.SIZE) {
@@ -493,13 +510,9 @@ void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output)
presenceBits = 0;
presenceIndex -= Byte.SIZE;
}
- // Set the bit at presenceIndex to flag that the corresponding arc is present.
presenceBits |= 1 << presenceIndex;
previousLabel = label;
}
- assert presenceIndex == (parent.lastChild.label - parent.firstChild.label) % 8;
- assert presenceBits != 0; // The last byte is not 0.
- assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present.
output.writeByte(presenceBits);
}
@@ -535,14 +548,14 @@ int lookup(
ARRAY(1) {
@Override
int needBytes(int minLabel, int maxLabel, int labelCnt) {
- return labelCnt - 1; // min label saved
+ return labelCnt - 1;
}
@Override
- void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output)
+ void save(int[] childLabels, int labelCnt, int strategyBytes, IndexOutput output)
throws IOException {
- for (Node child = parent.firstChild.next; child != null; child = child.next) {
- output.writeByte((byte) child.label);
+ for (int i = 1; i < labelCnt; i++) {
+ output.writeByte((byte) childLabels[i]);
}
}
@@ -560,7 +573,7 @@ int lookup(
} else if (midLabel > targetLabel) {
high = mid - 1;
} else {
- return mid + 1; // min label not included, plus 1
+ return mid + 1;
}
}
return -1;
@@ -582,12 +595,12 @@ int needBytes(int minLabel, int maxLabel, int labelCnt) {
}
@Override
- void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output)
+ void save(int[] childLabels, int labelCnt, int strategyBytes, IndexOutput output)
throws IOException {
- output.writeByte((byte) parent.lastChild.label);
- int lastLabel = parent.firstChild.label;
- for (Node child = parent.firstChild.next; child != null; child = child.next) {
- while (++lastLabel < child.label) {
+ output.writeByte((byte) childLabels[labelCnt - 1]);
+ int lastLabel = childLabels[0];
+ for (int i = 1; i < labelCnt; i++) {
+ while (++lastLabel < childLabels[i]) {
output.writeByte((byte) lastLabel);
}
}
@@ -642,7 +655,7 @@ int lookup(
abstract int needBytes(int minLabel, int maxLabel, int labelCnt);
- abstract void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output)
+ abstract void save(int[] childLabels, int labelCnt, int strategyBytes, IndexOutput output)
throws IOException;
abstract int lookup(
@@ -668,4 +681,22 @@ static ChildSaveStrategy choose(int minLabel, int maxLabel, int labelCnt) {
return childSaveStrategy;
}
}
+
+ /** Used for tests only. */
+ void visit(BiConsumer consumer) {
+ if (emptyOutput != null) {
+ consumer.accept(new BytesRef(), emptyOutput);
+ }
+ try {
+ EntryIterator iter = new EntryIterator();
+ while (iter.hasNext()) {
+ iter.readHeader();
+ iter.readBody();
+ consumer.accept(
+ new BytesRef(ArrayUtil.copyOfSubArray(iter.key, 0, iter.keyLen)), iter.output);
+ }
+ } catch (IOException e) {
+ throw new UncheckedIOException("should not happen on in-memory buffer", e);
+ }
+ }
}
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java
index f1bc495f656e..e6f022df54e8 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java
@@ -95,8 +95,6 @@ private void testTrieBuilder(Supplier randomBytesSupplier, int count) {
}
TrieBuilder add = TrieBuilder.bytesRefToTrie(entry.getKey(), entry.getValue());
trieBuilder.append(add);
- Assert.assertThrows(IllegalStateException.class, () -> add.append(trieBuilder));
- Assert.assertThrows(IllegalStateException.class, () -> trieBuilder.append(add));
}
Map actual = new TreeMap<>();
trieBuilder.visit(actual::put);
@@ -128,21 +126,12 @@ private void testTrieLookup(Supplier randomBytesSupplier, int round) thr
}
TrieBuilder add = TrieBuilder.bytesRefToTrie(entry.getKey(), entry.getValue());
trieBuilder.append(add);
- Assert.assertThrows(IllegalStateException.class, () -> add.append(trieBuilder));
- Assert.assertThrows(IllegalStateException.class, () -> trieBuilder.append(add));
}
try (Directory directory = newDirectory()) {
try (IndexOutput index = directory.createOutput("index", IOContext.DEFAULT);
IndexOutput meta = directory.createOutput("meta", IOContext.DEFAULT)) {
trieBuilder.save(meta, index);
- assertThrows(IllegalStateException.class, () -> trieBuilder.save(meta, index));
- assertThrows(
- IllegalStateException.class,
- () ->
- trieBuilder.append(
- TrieBuilder.bytesRefToTrie(
- new BytesRef(), new TrieBuilder.Output(0L, true, null))));
}
try (IndexInput indexIn = directory.openInput("index", IOContext.DEFAULT);