From 78b9d7a74a4042e368005d714c1fa1e63784b3a1 Mon Sep 17 00:00:00 2001
From: Jonathan Rhyne <jonathan@pspdfkit.com>
Date: Sun, 26 Apr 2026 21:02:58 -0400
Subject: [PATCH] Add Python PDF RAG ingestion pipeline example

Companion code for the tutorial "Build a PDF ingestion pipeline for AI
apps in Python" on nutrient.io. Walks through PDF -> Markdown via the
DWS Processor API -> heading-aware chunking -> OpenAI embeddings ->
Chroma -> Claude answer with cited sources.

Modified BSD license matching the rest of the repo.
Includes Makefile, .env.example, pytest unit tests for the chunker,
and a 5-minute quickstart.
---
 .../.env.example                              |  10 ++
 pdf-rag-ingestion-pipeline-python/.gitignore  |  14 +++
 pdf-rag-ingestion-pipeline-python/LICENSE     |  42 ++++++++
 pdf-rag-ingestion-pipeline-python/Makefile    |  28 +++++
 pdf-rag-ingestion-pipeline-python/README.md   | 102 ++++++++++++++++++
 .../ingestion/__init__.py                     |   0
 .../ingestion/chunk.py                        |  37 +++++++
 .../ingestion/embed.py                        |  15 +++
 .../ingestion/extract.py                      |  35 ++++++
 .../ingestion/store.py                        |  24 +++++
 .../pdfs/.gitkeep                             |   0
 .../pyproject.toml                            |  49 +++++++++
 .../retrieval/__init__.py                     |   0
 .../retrieval/ask.py                          |  43 ++++++++
 pdf-rag-ingestion-pipeline-python/run.py      |  46 ++++++++
 .../tests/__init__.py                         |   0
 .../tests/test_chunk.py                       |  46 ++++++++
 17 files changed, 491 insertions(+)
 create mode 100644 pdf-rag-ingestion-pipeline-python/.env.example
 create mode 100644 pdf-rag-ingestion-pipeline-python/.gitignore
 create mode 100644 pdf-rag-ingestion-pipeline-python/LICENSE
 create mode 100644 pdf-rag-ingestion-pipeline-python/Makefile
 create mode 100644 pdf-rag-ingestion-pipeline-python/README.md
 create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/__init__.py
 create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/chunk.py
 create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/embed.py
 create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/extract.py
 create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/store.py
 create mode 100644 pdf-rag-ingestion-pipeline-python/pdfs/.gitkeep
 create mode 100644 pdf-rag-ingestion-pipeline-python/pyproject.toml
 create mode 100644 pdf-rag-ingestion-pipeline-python/retrieval/__init__.py
 create mode 100644 pdf-rag-ingestion-pipeline-python/retrieval/ask.py
 create mode 100644 pdf-rag-ingestion-pipeline-python/run.py
 create mode 100644 pdf-rag-ingestion-pipeline-python/tests/__init__.py
 create mode 100644 pdf-rag-ingestion-pipeline-python/tests/test_chunk.py

diff --git a/pdf-rag-ingestion-pipeline-python/.env.example b/pdf-rag-ingestion-pipeline-python/.env.example
new file mode 100644
index 00000000..47e36db2
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/.env.example
@@ -0,0 +1,10 @@
+# Required
+NUTRIENT_API_KEY=
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+
+# Optional
+EMBEDDING_MODEL=text-embedding-3-small
+LLM_MODEL=claude-sonnet-4-6
+VECTOR_DB_PATH=.chroma
+PDF_FOLDER=pdfs
diff --git a/pdf-rag-ingestion-pipeline-python/.gitignore b/pdf-rag-ingestion-pipeline-python/.gitignore
new file mode 100644
index 00000000..dec9c4dd
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/.gitignore
@@ -0,0 +1,14 @@
+.env
+.venv/
+__pycache__/
+*.pyc
+*.pyo
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+.chroma/
+dist/
+build/
+*.egg-info/
+pdfs/*
+!pdfs/.gitkeep
diff --git a/pdf-rag-ingestion-pipeline-python/LICENSE b/pdf-rag-ingestion-pipeline-python/LICENSE
new file mode 100644
index 00000000..7f815afc
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/LICENSE
@@ -0,0 +1,42 @@
+The Nutrient Sample applications are licensed with a modified BSD
+license. In plain language: you're allowed to do whatever you wish
+with the code, modify, redistribute, embed in your products (free or
+commercial), but you must include copyright, terms of usage and
+disclaimer as stated in the license.
+
+You will require a commercial Nutrient License to run these examples
+in non-demo mode. Please refer to sales@nutrient.io for details.
+
+Copyright © 2017-present PSPDFKit GmbH d/b/a Nutrient.
+All rights reserved.
+
+Redistribution and use in source or binary forms,
+with or without modification, are permitted provided
+that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the
+  distribution.
+
+- Redistributions of Nutrient Samples must include attribution to
+  Nutrient, either in documentation or other appropriate media.
+
+- Neither the name of the Nutrient, PSPDFKit GmbH, nor its developers
+  may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/pdf-rag-ingestion-pipeline-python/Makefile b/pdf-rag-ingestion-pipeline-python/Makefile
new file mode 100644
index 00000000..cfcc84c4
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/Makefile
@@ -0,0 +1,28 @@
+.PHONY: install ingest ask demo lint test clean
+
+install:
+	python -m venv .venv && . .venv/bin/activate && pip install -e '.[dev]'
+
+ingest:
+	python run.py
+
+ask:
+	@if [ -z "$(Q)" ]; then echo "Usage: make ask Q=\"your question\""; exit 1; fi
+	python -m retrieval.ask "$(Q)"
+
+demo:
+	cp samples/*.pdf pdfs/ 2>/dev/null || true
+	python run.py
+	python -m retrieval.ask "What does this document describe?"
+
+lint:
+	ruff check .
+	ruff format --check .
+	mypy ingestion retrieval
+
+test:
+	pytest -q
+
+clean:
+	rm -rf .chroma .venv .pytest_cache .ruff_cache .mypy_cache
+	find . -name __pycache__ -type d -exec rm -rf {} +
diff --git a/pdf-rag-ingestion-pipeline-python/README.md b/pdf-rag-ingestion-pipeline-python/README.md
new file mode 100644
index 00000000..dbf85194
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/README.md
@@ -0,0 +1,102 @@
+# PDF RAG Ingestion Pipeline (Python)
+
+End-to-end ingestion pipeline for AI document apps: **PDF → Markdown → chunks → embeddings → retrieval → LLM answer**, powered by [Nutrient's PDF-to-Markdown API](https://www.nutrient.io/api/pdf-to-md-api/).
+
+Companion code for the tutorial:
+[Build a PDF ingestion pipeline for AI apps in Python](https://www.nutrient.io/blog/build-pdf-rag-ingestion-pipeline-python/).
+
+This example lives inside the [awesome-nutrient](https://github.com/PSPDFKit/awesome-nutrient) examples repo. See the root [README](../README.md) for other Nutrient examples.
+
+## Why this exists
+
+Most "I built a RAG app" tutorials skip the part that breaks in production: getting clean text out of real PDFs. Page-noise, broken tables, and lost reading order quietly degrade every chunk that hits your vector DB — and that shows up as bad retrievals, not as crashes.
+
+This repo turns a folder of PDFs into a queryable index for an LLM in five minutes, using:
+
+- **[Nutrient DWS Processor API](https://www.nutrient.io/api/pdf-to-md-api/)** for PDF → Markdown
+- **OpenAI `text-embedding-3-small`** for embeddings
+- **Chroma** for local vector storage (swap to Pinecone, pgvector, Weaviate, or Qdrant by replacing `ingestion/store.py`)
+- **Anthropic Claude** for the answer step (swap to OpenAI by editing `retrieval/ask.py`)
+
+## Quickstart (5 minutes)
+
+```bash
+git clone https://github.com/PSPDFKit/awesome-nutrient.git
+cd awesome-nutrient/pdf-rag-ingestion-pipeline-python
+python -m venv .venv && source .venv/bin/activate
+pip install -e '.[dev]'
+cp .env.example .env
+# add your NUTRIENT_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY to .env
+cp /path/to/your/*.pdf pdfs/   # copy in the PDFs you want to index
+python run.py
+python -m retrieval.ask "What does this document describe?"
+```
+
+## Choosing the right Nutrient path
+
+There isn't one Nutrient path for AI document ingestion — there are four. Pick by data residency, document type, and output shape:
+
+| Use case                                          | Nutrient path                                                                          | Tradeoff                                          |
+| ------------------------------------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------- |
+| Born-digital PDFs, fastest path to working RAG    | [PDF-to-Markdown API](https://www.nutrient.io/api/pdf-to-md-api/) (this repo)          | Cloud API. Documents are POSTed to Nutrient.      |
+| Born-digital PDFs, must run locally / no upload   | [`@pspdfkit/pdf-to-markdown` CLI / Claude Code skill](https://www.nutrient.io/ai/skills/pdf-to-markdown/) | Node CLI. No OCR yet.                             |
+| Scanned, image-only, or handwriting PDFs          | [Nutrient Python SDK with OCR/ICR engines](https://www.nutrient.io/sdk/python/pdf-data-extraction/) | On-prem. Heavier setup, supports tougher inputs.  |
+| Tables and key-value pairs as JSON, not Markdown  | [Data-extraction API](https://www.nutrient.io/api/data-extraction-api/)                | Different output shape. Better for forms.         |
+
+## Repo structure
+
+```
+pdf-rag-ingestion-pipeline-python/
+├─ README.md
+├─ LICENSE
+├─ pyproject.toml
+├─ Makefile
+├─ .env.example
+├─ pdfs/                     # drop your PDFs here
+├─ samples/                  # one small public-domain PDF for `make demo`
+├─ ingestion/
+│  ├─ extract.py             # PDF -> Markdown via Nutrient
+│  ├─ chunk.py               # Markdown -> chunks
+│  ├─ embed.py               # chunks -> vectors
+│  └─ store.py               # vectors -> Chroma
+├─ retrieval/
+│  └─ ask.py                 # query -> top-k context -> LLM answer
+├─ run.py                    # end-to-end CLI
+└─ tests/
+```
+
+## Configuration
+
+All knobs live in `.env`:
+
+```env
+# Required
+NUTRIENT_API_KEY=
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+
+# Optional
+EMBEDDING_MODEL=text-embedding-3-small
+LLM_MODEL=claude-sonnet-4-6
+VECTOR_DB_PATH=.chroma
+PDF_FOLDER=pdfs
+```
+
+## Production checklist
+
+- **Cache extraction.** Hash PDF bytes and skip re-extraction on unchanged files.
+- **Build an evaluation set.** ~20 questions per document type. Track retrieval hit-rate and answer correctness over time.
+- **Scanned PDFs.** Route through [Nutrient's Python SDK with OCR/ICR engines](https://www.nutrient.io/sdk/python/pdf-data-extraction/) before this pipeline.
+- **Tables and key-values as data**, not Markdown — see [data-extraction API](https://www.nutrient.io/api/data-extraction-api/).
+- **Observability.** Log Markdown length, chunk count, embedding model, retrieval top-k.
+
+
+## Benchmarks
+
+For a published, reproducible benchmark of Nutrient's open-source [PDF-to-Markdown CLI](https://www.nutrient.io/ai/skills/pdf-to-markdown/) against Docling, MarkItDown, pypdf, pymupdf4llm, and liteparse — including reading order, table structure, heading detection, and speed — see the [PDF-to-Markdown skill page](https://www.nutrient.io/ai/skills/pdf-to-markdown/). Numbers there were measured with the local CLI; if you need a head-to-head against the cloud Markdown endpoint specifically, run the comparison on your own document mix.
+
+## License
+
+See [LICENSE](./LICENSE) — modified BSD, same terms as other examples in [awesome-nutrient](https://github.com/PSPDFKit/awesome-nutrient).
+
+The Nutrient PDF-to-Markdown API and Processor backend are proprietary; usage is governed by your DWS subscription. Free trial available at [dashboard.nutrient.io](https://dashboard.nutrient.io/).
diff --git a/pdf-rag-ingestion-pipeline-python/ingestion/__init__.py b/pdf-rag-ingestion-pipeline-python/ingestion/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pdf-rag-ingestion-pipeline-python/ingestion/chunk.py b/pdf-rag-ingestion-pipeline-python/ingestion/chunk.py
new file mode 100644
index 00000000..8e924dfb
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/ingestion/chunk.py
@@ -0,0 +1,61 @@
+"""Heading-aware Markdown chunking for RAG ingestion."""
+from __future__ import annotations
+
+import re
+from typing import TypedDict
+
+# ATX headings ("#" to "######") at line start; group 2 captures the title text.
+HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$", re.MULTILINE)
+
+# Title given to text that does not sit under any heading.
+UNTITLED = "(untitled)"
+
+
+class Chunk(TypedDict):
+    """One retrievable unit: the heading it belongs to and its text."""
+
+    title: str
+    text: str
+
+
+def _windows(title: str, text: str, max_chars: int) -> list[Chunk]:
+    """Cut text into consecutive slices of at most max_chars, each tagged with title."""
+    # max(len(text), 1) still yields one (empty) chunk when text is empty.
+    return [
+        {"title": title, "text": text[i : i + max_chars]}
+        for i in range(0, max(len(text), 1), max_chars)
+    ]
+
+
+def split_by_heading(md: str, max_chars: int = 1800) -> list[Chunk]:
+    """Split Markdown into chunks by heading boundaries, capped at max_chars.
+
+    Each heading opens a section that runs to the next heading of any level.
+    A section is rendered as "# <title>", a blank line, then its body, and is
+    cut into consecutive slices of at most max_chars characters.
+
+    Text that is not under a heading (a document without headings, or the
+    preamble before the first heading) is kept under the "(untitled)" title
+    and sliced the same way, so no content is silently dropped.
+
+    Raises:
+        ValueError: if max_chars is not positive.
+    """
+    if max_chars <= 0:
+        raise ValueError("max_chars must be positive")
+
+    headings = list(HEADING_RE.finditer(md))
+    if not headings:
+        return _windows(UNTITLED, md, max_chars)
+
+    chunks: list[Chunk] = []
+    preamble = md[: headings[0].start()].strip()
+    if preamble:
+        chunks.extend(_windows(UNTITLED, preamble, max_chars))
+
+    for i, h in enumerate(headings):
+        end = headings[i + 1].start() if i + 1 < len(headings) else len(md)
+        title = h.group(2).strip()
+        body = md[h.end() : end].strip()
+        chunks.extend(_windows(title, f"# {title}\n\n{body}", max_chars))
+    return chunks
diff --git a/pdf-rag-ingestion-pipeline-python/ingestion/embed.py b/pdf-rag-ingestion-pipeline-python/ingestion/embed.py
new file mode 100644
index 00000000..b4e85a3a
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/ingestion/embed.py
@@ -0,0 +1,27 @@
+"""Embed text chunks with OpenAI."""
+from __future__ import annotations
+
+import os
+
+from dotenv import load_dotenv
+from openai import OpenAI
+
+# The client and model below are configured at import time, and run.py imports
+# this module before ingestion.extract calls load_dotenv(), so load .env here
+# or keys that live only in .env are missed.
+load_dotenv()
+
+_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
+_model = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-small")
+
+
+def embed(texts: list[str]) -> list[list[float]]:
+    """Embed a batch of texts with the configured OpenAI model.
+
+    Returns the embedding vector of every item in the API response. An empty
+    batch returns an empty list without calling the API.
+    """
+    if not texts:
+        return []
+    response = _client.embeddings.create(model=_model, input=texts)
+    return [d.embedding for d in response.data]
diff --git a/pdf-rag-ingestion-pipeline-python/ingestion/extract.py b/pdf-rag-ingestion-pipeline-python/ingestion/extract.py
new file mode 100644
index 00000000..990f714e
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/ingestion/extract.py
@@ -0,0 +1,35 @@
+"""Convert a PDF to Markdown using the Nutrient DWS Processor API."""
+from __future__ import annotations
+
+import asyncio
+import pathlib
+
+from dotenv import load_dotenv
+from nutrient_dws import NutrientClient
+
+load_dotenv()
+
+
+async def pdf_to_markdown(pdf_path: pathlib.Path) -> str:
+    """Return the Markdown rendering of one PDF, produced by Nutrient DWS.
+
+    The client is created without arguments; it is expected to pick up
+    ``NUTRIENT_API_KEY`` from the environment (TODO confirm for your version).
+    """
+    async with NutrientClient() as dws:
+        converted = await dws.convert(str(pdf_path), "markdown")
+        return converted.buffer.decode("utf-8")
+
+
+async def _main() -> None:
+    """Smoke test: print the first 500 characters of one PDF from ./pdfs."""
+    first_pdf = next(pathlib.Path("pdfs").glob("*.pdf"), None)
+    if first_pdf is not None:
+        markdown = await pdf_to_markdown(first_pdf)
+        print(markdown[:500])
+        return
+    print("Drop a PDF into ./pdfs first.")
+
+
+if __name__ == "__main__":
+    asyncio.run(_main())
diff --git a/pdf-rag-ingestion-pipeline-python/ingestion/store.py b/pdf-rag-ingestion-pipeline-python/ingestion/store.py
new file mode 100644
index 00000000..dfef6c0b
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/ingestion/store.py
@@ -0,0 +1,43 @@
+"""Persist chunk embeddings in a local Chroma collection."""
+from __future__ import annotations
+
+import os
+
+import chromadb
+from dotenv import load_dotenv
+
+# VECTOR_DB_PATH is read at import time; load .env first so the setting is
+# honoured on entry points that never import ingestion.extract, such as
+# `python -m retrieval.ask`.
+load_dotenv()
+
+_client = chromadb.PersistentClient(path=os.environ.get("VECTOR_DB_PATH", ".chroma"))
+_collection = _client.get_or_create_collection("pdf-rag")
+
+
+def upsert(
+    ids: list[str],
+    docs: list[str],
+    metas: list[dict],
+    embeddings: list[list[float]],
+) -> None:
+    """Upsert a batch of chunks into the local Chroma collection.
+
+    The lists are parallel: item i of each describes the same chunk.
+    """
+    _collection.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embeddings)
+
+
+def query(question: str, k: int = 6) -> dict:
+    """Query the collection for the top-k chunks matching the question.
+
+    The question is embedded with ingestion.embed.embed, the same function
+    run.py uses for the stored chunks. Passing it as query_texts instead would
+    make Chroma embed it with the collection's own embedding function, whose
+    vectors do not match the stored OpenAI ones.
+    """
+    # Imported lazily so the OpenAI client in ingestion.embed is only created
+    # after load_dotenv() above has run, and only when a query is made.
+    from ingestion.embed import embed
+
+    return _collection.query(query_embeddings=embed([question]), n_results=k)
diff --git a/pdf-rag-ingestion-pipeline-python/pdfs/.gitkeep b/pdf-rag-ingestion-pipeline-python/pdfs/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/pdf-rag-ingestion-pipeline-python/pyproject.toml b/pdf-rag-ingestion-pipeline-python/pyproject.toml
new file mode 100644
index 00000000..3989be7c
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/pyproject.toml
@@ -0,0 +1,49 @@
+[project]
+name = "pdf-rag-ingestion-pipeline-python"
+version = "0.1.0"
+description = "End-to-end Python pipeline: PDFs to Markdown to embeddings to RAG, powered by Nutrient PDF-to-Markdown."
+readme = "README.md"
+requires-python = ">=3.10"
+license = { file = "LICENSE" }
+authors = [
+  { name = "Nutrient", email = "support@nutrient.io" },
+]
+dependencies = [
+  "nutrient-dws>=1.0.0",
+  "chromadb>=0.5",
+  "openai>=1.40",
+  "anthropic>=0.40",
+  "python-dotenv>=1.0",
+  "tqdm>=4.66",
+]
+
+[project.optional-dependencies]
+dev = [
+  "ruff>=0.6",
+  "mypy>=1.10",
+  "pytest>=8",
+  "pytest-asyncio>=0.23",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["ingestion", "retrieval"]
+
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP"]
+
+[tool.mypy]
+python_version = "3.10"
+ignore_missing_imports = true
+disallow_untyped_defs = false
+
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]
diff --git a/pdf-rag-ingestion-pipeline-python/retrieval/__init__.py b/pdf-rag-ingestion-pipeline-python/retrieval/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pdf-rag-ingestion-pipeline-python/retrieval/ask.py b/pdf-rag-ingestion-pipeline-python/retrieval/ask.py
new file mode 100644
index 00000000..2e929368
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/retrieval/ask.py
@@ -0,0 +1,61 @@
+"""Retrieve top-k context from Chroma and ask Claude."""
+from __future__ import annotations
+
+import os
+import sys
+
+import anthropic
+from dotenv import load_dotenv
+
+# Load .env before importing ingestion.store: that module reads VECTOR_DB_PATH
+# at import time, and this entry point never imports ingestion.extract, the
+# module that otherwise calls load_dotenv().
+load_dotenv()
+
+from ingestion.store import query  # noqa: E402
+
+_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY", ""))
+_model = os.environ.get("LLM_MODEL", "claude-sonnet-4-6")
+
+
+def ask(question: str, k: int = 6) -> str:
+    """Answer a question using top-k retrieved chunks and Claude.
+
+    Returns the model's answer followed by a "Sources:" list with one
+    "<source> — <section>" line per distinct retrieved section, in retrieval
+    order. When nothing is retrieved, returns a hint to run the ingestion
+    step instead of calling the model with empty context.
+    """
+    res = query(question, k=k)
+    docs = res["documents"][0]
+    if not docs:
+        return "No indexed content found. Run `python run.py` to ingest PDFs first."
+
+    # dict.fromkeys drops repeated entries while keeping first-seen order.
+    sources = list(
+        dict.fromkeys(f"{m['source']} — {m['section']}" for m in res["metadatas"][0])
+    )
+    context = "\n\n---\n\n".join(docs)
+    msg = _client.messages.create(
+        model=_model,
+        max_tokens=800,
+        messages=[
+            {
+                "role": "user",
+                "content": (
+                    "Answer the question using only the context. "
+                    "Cite sources by section name when relevant.\n\n"
+                    f"CONTEXT:\n{context}\n\nQUESTION: {question}"
+                ),
+            }
+        ],
+    )
+    answer = msg.content[0].text if msg.content else ""
+    return answer + "\n\nSources:\n" + "\n".join(sources)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print('Usage: python -m retrieval.ask "your question"')
+        raise SystemExit(2)
+    print(ask(sys.argv[1]))
diff --git a/pdf-rag-ingestion-pipeline-python/run.py b/pdf-rag-ingestion-pipeline-python/run.py
new file mode 100644
index 00000000..c91f3c1f
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/run.py
@@ -0,0 +1,58 @@
+"""End-to-end ingestion CLI: PDFs in `./pdfs/` → Chroma collection."""
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import os
+import pathlib
+
+from tqdm import tqdm
+
+from ingestion.chunk import split_by_heading
+from ingestion.embed import embed
+from ingestion.extract import pdf_to_markdown
+from ingestion.store import upsert
+
+# Number of chunks embedded and upserted per request.
+BATCH_SIZE = 64
+
+
+async def ingest(folder: str | None = None) -> None:
+    """Extract, chunk, embed and store every ``*.pdf`` directly inside folder.
+
+    folder defaults to the PDF_FOLDER environment variable, then to "pdfs".
+    Chunk ids are the SHA-1 of "<file name>-<chunk index>", so re-running on
+    an unchanged file produces the same ids. Chunks whose text is blank are
+    skipped: the OpenAI embeddings endpoint rejects empty strings, so one
+    text-less PDF would otherwise abort the whole run.
+    """
+    folder = folder or os.environ.get("PDF_FOLDER", "pdfs")
+    # Sorted so files are processed in a stable order across runs.
+    pdf_paths = sorted(pathlib.Path(folder).glob("*.pdf"))
+    if not pdf_paths:
+        print(f"No PDFs in ./{folder}/. Drop one in and re-run.")
+        return
+
+    for pdf in pdf_paths:
+        md = await pdf_to_markdown(pdf)
+        # Pair each chunk with its original index so ids do not shift when
+        # blank chunks are dropped.
+        kept = [(i, c) for i, c in enumerate(split_by_heading(md)) if c["text"].strip()]
+        if not kept:
+            print(f"No text extracted from {pdf.name}; skipping.")
+            continue
+        ids = [hashlib.sha1(f"{pdf.name}-{i}".encode()).hexdigest() for i, _ in kept]
+        docs = [c["text"] for _, c in kept]
+        metas = [{"source": pdf.name, "section": c["title"]} for _, c in kept]
+        for start in tqdm(range(0, len(docs), BATCH_SIZE), desc=pdf.name):
+            batch = slice(start, start + BATCH_SIZE)
+            upsert(
+                ids=ids[batch],
+                docs=docs[batch],
+                metas=metas[batch],
+                embeddings=embed(docs[batch]),
+            )
+
+
+if __name__ == "__main__":
+    asyncio.run(ingest())
diff --git a/pdf-rag-ingestion-pipeline-python/tests/__init__.py b/pdf-rag-ingestion-pipeline-python/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pdf-rag-ingestion-pipeline-python/tests/test_chunk.py b/pdf-rag-ingestion-pipeline-python/tests/test_chunk.py
new file mode 100644
index 00000000..1e0fbf77
--- /dev/null
+++ b/pdf-rag-ingestion-pipeline-python/tests/test_chunk.py
@@ -0,0 +1,46 @@
+"""Unit tests for the heading-aware Markdown splitter."""
+from __future__ import annotations
+
+from ingestion.chunk import split_by_heading
+
+
+def test_no_headings_returns_single_chunk() -> None:
+    """Text without any heading yields exactly one "(untitled)" chunk."""
+    result = split_by_heading("Just a paragraph of text without any headings.")
+    assert len(result) == 1
+    assert result[0]["title"] == "(untitled)"
+    assert "paragraph" in result[0]["text"]
+
+
+def test_single_heading_returns_one_chunk() -> None:
+    """One heading plus a short body stays together in a single titled chunk."""
+    result = split_by_heading("# Section A\n\nSome body text.")
+    assert len(result) == 1
+    assert result[0]["title"] == "Section A"
+    assert "Some body text" in result[0]["text"]
+
+
+def test_multiple_headings_split() -> None:
+    """Every heading, whatever its level, opens a new chunk in document order."""
+    parts = [
+        "# Intro\n\nIntro body.\n\n",
+        "## Background\n\nBackground body.\n\n",
+        "# Conclusion\n\nConcluding body.",
+    ]
+    found = [chunk["title"] for chunk in split_by_heading("".join(parts))]
+    assert found == ["Intro", "Background", "Conclusion"]
+
+
+def test_long_section_is_soft_capped() -> None:
+    """A 5000-character body is sliced into 3+ chunks of at most 1800 characters."""
+    limit = 1800
+    pieces = split_by_heading("# Long\n\n" + "x" * 5000, max_chars=limit)
+    assert len(pieces) >= 3
+    for piece in pieces:
+        assert len(piece["text"]) <= limit
+
+
+def test_heading_levels_are_respected() -> None:
+    """A level-3 heading is recognised and its title extracted."""
+    first = split_by_heading("### Deep Heading\n\nbody")[0]
+    assert first["title"] == "Deep Heading"