deepset-ai · Aarkin7 · May 31, 2026 · May 31, 2026 · Jun 2, 2026
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import hashlib
+import json
 from dataclasses import asdict, dataclass, field, fields
 from typing import Any
 
@@ -113,7 +114,9 @@ def _create_id(self) -> str:
         dataframe = None  # this allows the ID creation to remain unchanged even if the dataframe field has been removed
         blob = self.blob.data if self.blob is not None else None
         mime_type = self.blob.mime_type if self.blob is not None else None
-        meta = self.meta or {}
+        # Sort keys so meta order doesn't affect the hash. Keep "{}" for empty meta
+        # so existing IDs stay stable.
+        meta = json.dumps(self.meta, sort_keys=True, default=str) if self.meta else "{}"
         embedding = self.embedding if self.embedding is not None else None
         sparse_embedding = self.sparse_embedding.to_dict() if self.sparse_embedding is not None else ""
         data = f"{text}{dataframe}{blob!r}{mime_type}{meta}{embedding}{sparse_embedding}"

@@ -0,0 +1,27 @@
+---
+upgrade:
+  - |
+    The hash used to auto-generate ``Document.id`` is now computed from a
+    canonical (key-sorted) JSON serialization of ``meta``. Documents with
+    empty ``meta`` keep their IDs, but nearly every document with non-empty
+    ``meta`` will get a different ID than before, for two reasons:
+
+    * the serialization of ``meta`` changes from ``dict``'s repr to JSON
+      (for example, strings are quoted with ``"`` instead of ``'``);
+    * ``meta`` values that are not JSON-serializable, such as ``datetime`` or
+      custom classes, are now serialized via ``str(...)`` rather than
+      ``repr(...)``, e.g. ``"2024-01-01 00:00:00"`` instead of
+      ``"datetime.datetime(2024, 1, 1, 0, 0)"``.
+
+    If you rely on auto-generated IDs to match documents already persisted in a
+    ``DocumentStore``, you will need to re-ingest the affected documents (or
+    pass the previous ``id`` explicitly when constructing the ``Document``).
+fixes:
+  - |
+    ``Document.id`` is now deterministic regardless of the insertion order of
+    keys in ``meta``. Previously the hash was built from ``dict``'s repr, which
+    reflects insertion order, so two documents with the same content and the
+    same ``meta`` could get different IDs depending on how the ``meta`` dict was
+    constructed. This silently broke ``DuplicatePolicy.SKIP`` / ``FAIL`` and
+    any cache or dedup table keyed on the document ID whenever upstream code
+    produced ``meta`` in different orders.
@@ -3104,7 +3104,7 @@ def run(self, query: str) -> dict[str, list[Document]]:
                     ("rag_prompt", 1): {
                         "documents": [
                             Document(
-                                id="969664d0cf76e52b0ffb719d00d3e5a6b1c90bb29e56f6107dfd87bf2f5388ed",
+                                id="366a10745500c26f1177f434c74513daacaa7f9d2e09ba892cfcd48652eb80c1",
                                 content="This is a document potentially answering the question.",
                                 meta={"access_group": 1},
                             )
@@ -4381,7 +4381,7 @@ def pipeline_that_converts_files(pipeline_class):
             content="Some test content",
             meta={
                 "file_type": "json",
-                "source_id": "0c6c5951d18da2935c7af3e24d417a9f94ca85403866dcfee1de93922504e1e5",
+                "source_id": "7eead7200d4ecead81a174a1da6512d8955f3a23acdc3f8431885d4793a63a74",
                 "page_number": 1,
                 "split_id": 0,
                 "split_idx_start": 0,
@@ -4391,7 +4391,7 @@ def pipeline_that_converts_files(pipeline_class):
             content="Text file content ",
             meta={
                 "file_type": "txt",
-                "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42",
+                "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27",
                 "page_number": 1,
                 "split_id": 0,
                 "split_idx_start": 0,
@@ -4401,7 +4401,7 @@ def pipeline_that_converts_files(pipeline_class):
             content="for testing this.",
             meta={
                 "file_type": "txt",
-                "source_id": "41cb91740f6e64ab542122936ea746c238ae0a92fd29b698efabbe23d0ba4c42",
+                "source_id": "696d5c046b58b24bf806ff94f6b529fb3a08f068b6bf39e572683537736a0c27",
                 "page_number": 1,
                 "split_id": 1,
                 "split_idx_start": 18,

@@ -52,7 +52,7 @@ def test_init_with_parameters():
         embedding=[0.1, 0.2, 0.3],
         sparse_embedding=sparse_embedding,
     )
-    assert doc.id == "1aa43af57c1dbc317241bf55d3067049f334d3b458d95dc72f71a7111f6c1a56"
+    assert doc.id == "c31efd4986b1f2424e5058482c6f668ccad2309043c2346524cd81d255e159fe"
     assert doc.content == "test text"
     assert doc.blob is not None
     assert doc.blob.data == blob_data
@@ -95,7 +95,7 @@ def test_init_with_legacy_field():
         embedding=[0.1, 0.2, 0.3],
         meta={"date": "10-10-2023", "type": "article"},
     )
-    assert doc.id == "a2c0321b34430cc675294611e55529fceb56140ca3202f1c59a43a8cecac1f43"
+    assert doc.id == "dcd4914f727544e89ce8082f6f2e298d244dd0803a4dc167f19d24e7d43b28ac"
     assert doc.content == "test text"
     assert doc.meta == {"date": "10-10-2023", "type": "article"}
     assert doc.score == 0.812
@@ -123,6 +123,21 @@ def test_basic_equality_id():
     assert doc1 != doc2
 
 
+def test_id_is_independent_of_meta_key_order():
+    doc1 = Document(content="hello", meta={"a": 1, "b": 2})
+    doc2 = Document(content="hello", meta={"b": 2, "a": 1})  # same pairs, reversed insertion order
+
+    assert doc1.meta == doc2.meta  # dict equality ignores insertion order
+    assert doc1.id == doc2.id  # equal content and meta must yield the same ID
+
+
+def test_id_is_independent_of_nested_meta_key_order():
+    doc1 = Document(content="hello", meta={"outer": {"a": 1, "b": 2}})
+    doc2 = Document(content="hello", meta={"outer": {"b": 2, "a": 1}})  # only the nested dict's order differs
+
+    assert doc1.id == doc2.id  # key order inside nested dicts must not change the ID
+
+
 def test_to_dict():
     doc = Document()
     assert doc.to_dict() == {