Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion haystack/dataclasses/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# SPDX-License-Identifier: Apache-2.0

import hashlib
import json
from dataclasses import asdict, dataclass, field, fields
from typing import Any

Expand Down Expand Up @@ -113,7 +114,9 @@ def _create_id(self) -> str:
dataframe = None # this allows the ID creation to remain unchanged even if the dataframe field has been removed
blob = self.blob.data if self.blob is not None else None
mime_type = self.blob.mime_type if self.blob is not None else None
meta = self.meta or {}
# Sort keys so meta order doesn't affect the hash. Keep "{}" for empty meta
# so existing IDs stay stable.
meta = json.dumps(self.meta, sort_keys=True, default=str) if self.meta else "{}"
embedding = self.embedding if self.embedding is not None else None
sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
data = f"{text}{dataframe}{blob!r}{mime_type}{meta}{embedding}{sparse_embedding}"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
upgrade:
- |
The hash used to auto-generate ``Document.id`` is now computed from a
canonical (key-sorted) JSON serialization of ``meta``. Documents with
empty ``meta`` are unaffected, but all other documents will get different
IDs than they did before:

* documents with non-empty ``meta`` (the serialization changes from
``dict``'s repr to JSON);
* documents whose ``meta`` contains non-JSON-serializable values such as
``datetime`` or custom classes (these are now serialized via ``str(...)``
rather than ``repr(...)``, e.g. ``"2024-01-01 00:00:00"`` instead of
``"datetime.datetime(2024, 1, 1, 0, 0)"``).

If you rely on auto-generated IDs to match documents already persisted in a
``DocumentStore``, you will need to re-ingest the affected documents (or
pass the previous ``id`` explicitly when constructing the ``Document``).
fixes:
- |
``Document.id`` is now deterministic regardless of the insertion order of
keys in ``meta``. Previously the hash was built from ``dict``'s repr, which
reflects insertion order, so two documents with the same content and the
same ``meta`` could get different IDs depending on how the ``meta`` dict was
constructed. This silently broke ``DuplicatePolicy.SKIP`` / ``FAIL`` and
any cache or dedup table keyed on the document ID whenever upstream code
produced ``meta`` in different orders.
8 changes: 4 additions & 4 deletions test/core/pipeline/features/test_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3104,7 +3104,7 @@ def run(self, query: str) -> dict[str, list[Document]]:
("rag_prompt", 1): {
"documents": [
Document(
id="969664d0cf76e52b0ffb719d00d3e5a6b1c90bb29e56f6107dfd87bf2f5388ed",
id="366a10745500c26f1177f434c74513daacaa7f9d2e09ba892cfcd48652eb80c1",
content="This is a document potentially answering the question.",
meta={"access_group": 1},
)
Expand Down Expand Up @@ -4381,7 +4381,7 @@ def pipeline_that_converts_files(pipeline_class):
content="Some test content",
meta={
"file_type": "json",
"source_id": "0c6c5951d18da2935c7af3e24d417a9f94ca85403866dcfee1de93922504e1e5",
"source_id": "7eead7200d4ecead81a174a1da6512d8955f3a23acdc3f8431885d4793a63a74",
"page_number": 1,
"split_id": 0,
"split_idx_start": 0,
Expand All @@ -4391,7 +4391,7 @@ def pipeline_that_converts_files(pipeline_class):
content="Text file content ",
meta={
"file_type": "txt",
"source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42",
"source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27",
"page_number": 1,
"split_id": 0,
"split_idx_start": 0,
Expand All @@ -4401,7 +4401,7 @@ def pipeline_that_converts_files(pipeline_class):
content="for testing this.",
meta={
"file_type": "txt",
"source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42",
"source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27",
"page_number": 1,
"split_id": 1,
"split_idx_start": 18,
Expand Down
19 changes: 17 additions & 2 deletions test/dataclasses/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_init_with_parameters():
embedding=[0.1, 0.2, 0.3],
sparse_embedding=sparse_embedding,
)
assert doc.id == "1aa43af57c1dbc317241bf55d3067049f334d3b458d95dc72f71a7111f6c1a56"
assert doc.id == "c31efd4986b1f2424e5058482c6f668ccad2309043c2346524cd81d255e159fe"
assert doc.content == "test text"
assert doc.blob is not None
assert doc.blob.data == blob_data
Expand Down Expand Up @@ -95,7 +95,7 @@ def test_init_with_legacy_field():
embedding=[0.1, 0.2, 0.3],
meta={"date": "10-10-2023", "type": "article"},
)
assert doc.id == "a2c0321b34430cc675294611e55529fceb56140ca3202f1c59a43a8cecac1f43"
assert doc.id == "dcd4914f727544e89ce8082f6f2e298d244dd0803a4dc167f19d24e7d43b28ac"
assert doc.content == "test text"
assert doc.meta == {"date": "10-10-2023", "type": "article"}
assert doc.score == 0.812
Expand Down Expand Up @@ -123,6 +123,21 @@ def test_basic_equality_id():
assert doc1 != doc2


def test_id_is_independent_of_meta_key_order():
    """Documents whose ``meta`` differs only in key insertion order share an ID."""
    forward_meta = {"a": 1, "b": 2}
    reversed_meta = {"b": 2, "a": 1}
    first = Document(content="hello", meta=forward_meta)
    second = Document(content="hello", meta=reversed_meta)

    # Sanity check: the two dicts compare equal even though their key order differs.
    assert first.meta == second.meta
    assert first.id == second.id


def test_id_is_independent_of_nested_meta_key_order():
    """Key order inside a nested ``meta`` dict must not change the document ID."""
    inner_forward = {"a": 1, "b": 2}
    inner_reversed = {"b": 2, "a": 1}
    first = Document(content="hello", meta={"outer": inner_forward})
    second = Document(content="hello", meta={"outer": inner_reversed})

    assert first.id == second.id


def test_to_dict():
doc = Document()
assert doc.to_dict() == {
Expand Down
Loading