From 78b9d7a74a4042e368005d714c1fa1e63784b3a1 Mon Sep 17 00:00:00 2001 From: Jonathan Rhyne Date: Sun, 26 Apr 2026 21:02:58 -0400 Subject: [PATCH] Add Python PDF RAG ingestion pipeline example Companion code for the tutorial "Build a PDF ingestion pipeline for AI apps in Python" on nutrient.io. Walks through PDF -> Markdown via the DWS Processor API -> heading-aware chunking -> OpenAI embeddings -> Chroma -> Claude answer with cited sources. Modified BSD license matching the rest of the repo. Includes Makefile, .env.example, pytest unit tests for the chunker, and a 5-minute quickstart. --- .../.env.example | 10 ++ pdf-rag-ingestion-pipeline-python/.gitignore | 14 +++ pdf-rag-ingestion-pipeline-python/LICENSE | 42 ++++++++ pdf-rag-ingestion-pipeline-python/Makefile | 28 +++++ pdf-rag-ingestion-pipeline-python/README.md | 102 ++++++++++++++++++ .../ingestion/__init__.py | 0 .../ingestion/chunk.py | 37 +++++++ .../ingestion/embed.py | 15 +++ .../ingestion/extract.py | 35 ++++++ .../ingestion/store.py | 24 +++++ .../pdfs/.gitkeep | 0 .../pyproject.toml | 49 +++++++++ .../retrieval/__init__.py | 0 .../retrieval/ask.py | 43 ++++++++ pdf-rag-ingestion-pipeline-python/run.py | 46 ++++++++ .../tests/__init__.py | 0 .../tests/test_chunk.py | 46 ++++++++ 17 files changed, 491 insertions(+) create mode 100644 pdf-rag-ingestion-pipeline-python/.env.example create mode 100644 pdf-rag-ingestion-pipeline-python/.gitignore create mode 100644 pdf-rag-ingestion-pipeline-python/LICENSE create mode 100644 pdf-rag-ingestion-pipeline-python/Makefile create mode 100644 pdf-rag-ingestion-pipeline-python/README.md create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/__init__.py create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/chunk.py create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/embed.py create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/extract.py create mode 100644 pdf-rag-ingestion-pipeline-python/ingestion/store.py 
create mode 100644 pdf-rag-ingestion-pipeline-python/pdfs/.gitkeep create mode 100644 pdf-rag-ingestion-pipeline-python/pyproject.toml create mode 100644 pdf-rag-ingestion-pipeline-python/retrieval/__init__.py create mode 100644 pdf-rag-ingestion-pipeline-python/retrieval/ask.py create mode 100644 pdf-rag-ingestion-pipeline-python/run.py create mode 100644 pdf-rag-ingestion-pipeline-python/tests/__init__.py create mode 100644 pdf-rag-ingestion-pipeline-python/tests/test_chunk.py diff --git a/pdf-rag-ingestion-pipeline-python/.env.example b/pdf-rag-ingestion-pipeline-python/.env.example new file mode 100644 index 00000000..47e36db2 --- /dev/null +++ b/pdf-rag-ingestion-pipeline-python/.env.example @@ -0,0 +1,10 @@ +# Required +NUTRIENT_API_KEY= +OPENAI_API_KEY= +ANTHROPIC_API_KEY= + +# Optional +EMBEDDING_MODEL=text-embedding-3-small +LLM_MODEL=claude-sonnet-4-6 +VECTOR_DB_PATH=.chroma +PDF_FOLDER=pdfs diff --git a/pdf-rag-ingestion-pipeline-python/.gitignore b/pdf-rag-ingestion-pipeline-python/.gitignore new file mode 100644 index 00000000..dec9c4dd --- /dev/null +++ b/pdf-rag-ingestion-pipeline-python/.gitignore @@ -0,0 +1,14 @@ +.env +.venv/ +__pycache__/ +*.pyc +*.pyo +.pytest_cache/ +.ruff_cache/ +.mypy_cache/ +.chroma/ +dist/ +build/ +*.egg-info/ +pdfs/* +!pdfs/.gitkeep diff --git a/pdf-rag-ingestion-pipeline-python/LICENSE b/pdf-rag-ingestion-pipeline-python/LICENSE new file mode 100644 index 00000000..7f815afc --- /dev/null +++ b/pdf-rag-ingestion-pipeline-python/LICENSE @@ -0,0 +1,42 @@ +The Nutrient Sample applications are licensed with a modified BSD +license. In plain language: you're allowed to do whatever you wish +with the code, modify, redistribute, embed in your products (free or +commercial), but you must include copyright, terms of usage and +disclaimer as stated in the license. + +You will require a commercial Nutrient License to run these examples +in non-demo mode. Please refer to sales@nutrient.io for details. 
+ +Copyright © 2017-present PSPDFKit GmbH d/b/a Nutrient. +All rights reserved. + +Redistribution and use in source or binary forms, +with or without modification, are permitted provided +that the following conditions are met: + +- Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +- Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + +- Redistributions of Nutrient Samples must include attribution to + Nutrient, either in documentation or other appropriate media. + +- Neither the name of the Nutrient, PSPDFKit GmbH, nor its developers + may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/pdf-rag-ingestion-pipeline-python/Makefile b/pdf-rag-ingestion-pipeline-python/Makefile new file mode 100644 index 00000000..cfcc84c4 --- /dev/null +++ b/pdf-rag-ingestion-pipeline-python/Makefile @@ -0,0 +1,28 @@ +.PHONY: install ingest ask demo lint test clean + +install: + python -m venv .venv && . 
.venv/bin/activate && pip install -e '.[dev]' + +ingest: + python run.py + +ask: + @if [ -z "$(Q)" ]; then echo "Usage: make ask Q=\"your question\""; exit 1; fi + python -m retrieval.ask "$(Q)" + +demo: + cp samples/*.pdf pdfs/ 2>/dev/null || true + python run.py + python -m retrieval.ask "What does this document describe?" + +lint: + ruff check . + ruff format --check . + mypy ingestion retrieval + +test: + pytest -q + +clean: + rm -rf .chroma .venv .pytest_cache .ruff_cache .mypy_cache + find . -name __pycache__ -type d -exec rm -rf {} + diff --git a/pdf-rag-ingestion-pipeline-python/README.md b/pdf-rag-ingestion-pipeline-python/README.md new file mode 100644 index 00000000..dbf85194 --- /dev/null +++ b/pdf-rag-ingestion-pipeline-python/README.md @@ -0,0 +1,102 @@ +# PDF RAG Ingestion Pipeline (Python) + +End-to-end ingestion pipeline for AI document apps: **PDF → Markdown → chunks → embeddings → retrieval → LLM answer**, powered by [Nutrient's PDF-to-Markdown API](https://www.nutrient.io/api/pdf-to-md-api/). + +Companion code for the tutorial: +[Build a PDF ingestion pipeline for AI apps in Python](https://www.nutrient.io/blog/build-pdf-rag-ingestion-pipeline-python/). + +This example lives inside the [awesome-nutrient](https://github.com/PSPDFKit/awesome-nutrient) examples repo. See the root [README](../README.md) for other Nutrient examples. + +## Why this exists + +Most "I built a RAG app" tutorials skip the part that breaks in production: getting clean text out of real PDFs. Page-noise, broken tables, and lost reading order quietly degrade every chunk that hits your vector DB — and that shows up as bad retrievals, not as crashes. 
+ +This repo turns a folder of PDFs into a queryable index for an LLM in five minutes, using: + +- **[Nutrient DWS Processor API](https://www.nutrient.io/api/pdf-to-md-api/)** for PDF → Markdown +- **OpenAI `text-embedding-3-small`** for embeddings +- **Chroma** for local vector storage (swap to Pinecone, pgvector, Weaviate, or Qdrant by replacing `ingestion/store.py`) +- **Anthropic Claude** for the answer step (swap to OpenAI by editing `retrieval/ask.py`) + +## Quickstart (5 minutes) + +```bash +git clone https://github.com/PSPDFKit/awesome-nutrient.git +cd awesome-nutrient/pdf-rag-ingestion-pipeline-python +python -m venv .venv && source .venv/bin/activate +pip install -e '.[dev]' +cp .env.example .env +# add your NUTRIENT_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY to .env +cp samples/*.pdf pdfs/ +python run.py +python -m retrieval.ask "What does this document describe?" +``` + +## Choosing the right Nutrient path + +There isn't one Nutrient path for AI document ingestion — there are three. Pick by data residency, document type, and output shape: + +| Use case | Nutrient path | Tradeoff | +| ------------------------------------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------- | +| Born-digital PDFs, fastest path to working RAG | [PDF-to-Markdown API](https://www.nutrient.io/api/pdf-to-md-api/) (this repo) | Cloud API. Documents are POSTed to Nutrient. | +| Born-digital PDFs, must run locally / no upload | [`@pspdfkit/pdf-to-markdown` CLI / Claude Code skill](https://www.nutrient.io/ai/skills/pdf-to-markdown/) | Node CLI. No OCR yet. | +| Scanned, image-only, or handwriting PDFs | [Nutrient Python SDK with OCR/ICR engines](https://www.nutrient.io/sdk/python/pdf-data-extraction/) | On-prem. Heavier setup, supports tougher inputs. 
| +| Tables and key-value pairs as JSON, not Markdown | [Data-extraction API](https://www.nutrient.io/api/data-extraction-api/) | Different output shape. Better for forms. | + +## Repo structure + +``` +pdf-rag-ingestion-pipeline-python/ +├─ README.md +├─ LICENSE +├─ pyproject.toml +├─ Makefile +├─ .env.example +├─ pdfs/ # drop your PDFs here +├─ samples/ # one small public-domain PDF for `make demo` +├─ ingestion/ +│ ├─ extract.py # PDF -> Markdown via Nutrient +│ ├─ chunk.py # Markdown -> chunks +│ ├─ embed.py # chunks -> vectors +│ └─ store.py # vectors -> Chroma +├─ retrieval/ +│ └─ ask.py # query -> top-k context -> LLM answer +├─ run.py # end-to-end CLI +└─ tests/ +``` + +## Configuration + +All knobs live in `.env`: + +```env +# Required +NUTRIENT_API_KEY= +OPENAI_API_KEY= +ANTHROPIC_API_KEY= + +# Optional +EMBEDDING_MODEL=text-embedding-3-small +LLM_MODEL=claude-sonnet-4-6 +VECTOR_DB_PATH=.chroma +PDF_FOLDER=pdfs +``` + +## Production checklist + +- **Cache extraction.** Hash PDF bytes and skip re-extraction on unchanged files. +- **Build an evaluation set.** ~20 questions per document type. Track retrieval hit-rate and answer correctness over time. +- **Scanned PDFs.** Route through [Nutrient's Python SDK with OCR/ICR engines](https://www.nutrient.io/sdk/python/pdf-data-extraction/) before this pipeline. +- **Tables and key-values as data**, not Markdown — see [data-extraction API](https://www.nutrient.io/api/data-extraction-api/). +- **Observability.** Log Markdown length, chunk count, embedding model, retrieval top-k. + + +## Benchmarks + +For a published, reproducible benchmark of Nutrient's open-source [PDF-to-Markdown CLI](https://www.nutrient.io/ai/skills/pdf-to-markdown/) against Docling, MarkItDown, pypdf, pymupdf4llm, and liteparse — including reading order, table structure, heading detection, and speed — see the [PDF-to-Markdown skill page](https://www.nutrient.io/ai/skills/pdf-to-markdown/). 
# ATX heading: 1-6 '#' characters, then spaces/tabs on the SAME line, then the
# title. `[ \t]+` (rather than `\s+`) keeps a bare "#" line from swallowing the
# following line as its title.
HEADING_RE = re.compile(r"^(#{1,6})[ \t]+(.*)$", re.MULTILINE)

# Opening/closing marker of a fenced code block (``` or ~~~, three or more).
_FENCE_RE = re.compile(r"^[ \t]*(`{3,}|~{3,})", re.MULTILINE)

# Title given to text that is not under any heading.
_UNTITLED = "(untitled)"


class Chunk(TypedDict):
    """One retrievable slice of a Markdown document."""

    title: str  # heading text of the section the slice belongs to
    text: str  # slice content, at most ``max_chars`` characters long


def _fenced_spans(md: str) -> list[tuple[int, int]]:
    """Return ``(start, end)`` offsets of fenced code blocks in *md*.

    This is a simplified fence matcher: a fence closes at the next marker line
    that uses the same character and is at least as long as the opener. An
    unterminated fence extends to the end of the document.
    """
    spans: list[tuple[int, int]] = []
    opener: tuple[str, int, int] | None = None  # (marker char, marker length, offset)
    for m in _FENCE_RE.finditer(md):
        marker = m.group(1)
        if opener is None:
            opener = (marker[0], len(marker), m.start())
        elif marker[0] == opener[0] and len(marker) >= opener[1]:
            spans.append((opener[2], m.end()))
            opener = None
    if opener is not None:
        spans.append((opener[2], len(md)))
    return spans


def split_by_heading(md: str, max_chars: int = 1800) -> list[Chunk]:
    """Split Markdown into chunks at heading boundaries, capped at *max_chars*.

    Every heading outside a fenced code block starts a new section. Each
    section is rendered as ``"# <title>\\n\\n<body>"`` (the heading level is
    flattened to one ``#``) and then sliced into pieces of at most *max_chars*
    characters, so no text is discarded. Text that precedes the first heading,
    or a document with no headings at all, is emitted under the title
    ``"(untitled)"`` and sliced the same way.

    Args:
        md: Markdown source text.
        max_chars: Maximum length of each chunk's ``text``; must be positive.

    Returns:
        Chunks in document order. Blank input (empty or whitespace-only)
        yields an empty list, so callers never receive an empty chunk to embed.

    Raises:
        ValueError: If *max_chars* is not a positive integer.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    fenced = _fenced_spans(md)
    # A line such as "# comment" inside a code fence is code, not a heading.
    headings = [
        h
        for h in HEADING_RE.finditer(md)
        if not any(lo <= h.start() < hi for lo, hi in fenced)
    ]

    sections: list[tuple[str, str]] = []  # (title, rendered section text)

    # Keep whatever comes before the first heading (or the whole document when
    # there are no headings) instead of dropping it.
    preamble = (md[: headings[0].start()] if headings else md).strip()
    if preamble:
        sections.append((_UNTITLED, preamble))

    for i, h in enumerate(headings):
        end = headings[i + 1].start() if i + 1 < len(headings) else len(md)
        title = h.group(2).strip()
        body = md[h.end() : end].strip()
        sections.append((title, f"# {title}\n\n{body}"))

    chunks: list[Chunk] = []
    for title, text in sections:
        for start in range(0, len(text), max_chars):
            chunks.append({"title": title, "text": text[start : start + max_chars]})
    return chunks
def query(question: str, k: int = 6) -> dict:
    """Return the top-*k* stored chunks most similar to *question*.

    The question is embedded with the same function used at ingestion time
    (``ingestion.embed.embed``) and passed as ``query_embeddings``. Passing
    ``query_texts`` instead would make Chroma embed the question with the
    collection's own embedding function; the collection is created without
    one, so that would be Chroma's default model rather than the OpenAI model
    that produced the stored vectors, and the two vector spaces do not match.

    Args:
        question: Natural-language query text.
        k: Number of results to request.

    Returns:
        The raw result mapping from ``Collection.query``.
    """
    # Imported here so that merely importing this module does not construct
    # the OpenAI client.
    from ingestion.embed import embed

    return _collection.query(query_embeddings=embed([question]), n_results=k)
from dotenv import load_dotenv

# Load `.env` before the environment is read below. On the
# `python -m retrieval.ask` path nothing else calls load_dotenv(), so without
# this the keys placed in `.env` would never reach the client.
load_dotenv()

_client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY", ""))
_model = os.environ.get("LLM_MODEL", "claude-sonnet-4-6")

# Returned when retrieval yields nothing, instead of asking the model to
# answer from an empty context.
_NO_CONTEXT = (
    "No indexed content was found for this question. "
    "Ingest PDFs with `python run.py` first."
)


def _label(meta: dict | None) -> str:
    """Format a chunk's metadata as ``"<source> — <section>"``."""
    meta = meta or {}
    return f"{meta.get('source', 'unknown')} — {meta.get('section', '(untitled)')}"


def ask(question: str, k: int = 6) -> str:
    """Answer *question* from the top-*k* retrieved chunks using Claude.

    Each retrieved chunk is prefixed with its ``[source — section]`` label so
    the model has something concrete to cite; the same labels, de-duplicated
    in retrieval order, are appended to the answer as a ``Sources`` list.

    Args:
        question: The user's question.
        k: Number of chunks to retrieve as context.

    Returns:
        The model's answer followed by the list of sources, or a short notice
        when the index returned no chunks (no model call is made then).
    """
    res = query(question, k=k)
    # The result holds one inner list per query; a single question was sent,
    # so only index 0 is read.
    docs = (res.get("documents") or [[]])[0]
    metas = list((res.get("metadatas") or [[]])[0])
    if not docs:
        return _NO_CONTEXT

    # Keep labels parallel to docs even if some metadata entries are missing.
    metas += [None] * (len(docs) - len(metas))
    labels = [_label(m) for m in metas]
    context = "\n\n---\n\n".join(
        f"[{label}]\n{doc}" for label, doc in zip(labels, docs)
    )

    msg = _client.messages.create(
        model=_model,
        max_tokens=800,
        messages=[
            {
                "role": "user",
                "content": (
                    "Answer the question using only the context. "
                    "Each context block starts with its [file — section] label. "
                    "Cite sources by section name when relevant.\n\n"
                    f"CONTEXT:\n{context}\n\nQUESTION: {question}"
                ),
            }
        ],
    )
    # Join every text block rather than assuming the first block is text.
    answer = "".join(
        block.text for block in msg.content if getattr(block, "type", None) == "text"
    )
    # dict.fromkeys de-duplicates while preserving retrieval order.
    sources = list(dict.fromkeys(labels))
    return answer + "\n\nSources:\n" + "\n".join(sources)
Drop one in and re-run.") + return + + for pdf in pdf_paths: + md = await pdf_to_markdown(pdf) + chunks = split_by_heading(md) + ids = [ + hashlib.sha1(f"{pdf.name}-{i}".encode()).hexdigest() + for i in range(len(chunks)) + ] + docs = [c["text"] for c in chunks] + metas = [{"source": pdf.name, "section": c["title"]} for c in chunks] + for i in tqdm(range(0, len(docs), 64), desc=pdf.name): + batch_ids = ids[i : i + 64] + batch_docs = docs[i : i + 64] + batch_metas = metas[i : i + 64] + upsert( + ids=batch_ids, + docs=batch_docs, + metas=batch_metas, + embeddings=embed(batch_docs), + ) + + +if __name__ == "__main__": + asyncio.run(ingest()) diff --git a/pdf-rag-ingestion-pipeline-python/tests/__init__.py b/pdf-rag-ingestion-pipeline-python/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pdf-rag-ingestion-pipeline-python/tests/test_chunk.py b/pdf-rag-ingestion-pipeline-python/tests/test_chunk.py new file mode 100644 index 00000000..1e0fbf77 --- /dev/null +++ b/pdf-rag-ingestion-pipeline-python/tests/test_chunk.py @@ -0,0 +1,46 @@ +"""Unit tests for the heading-aware Markdown splitter.""" +from __future__ import annotations + +from ingestion.chunk import split_by_heading + + +def test_no_headings_returns_single_chunk() -> None: + md = "Just a paragraph of text without any headings." + chunks = split_by_heading(md) + assert len(chunks) == 1 + assert chunks[0]["title"] == "(untitled)" + assert "paragraph" in chunks[0]["text"] + + +def test_single_heading_returns_one_chunk() -> None: + md = "# Section A\n\nSome body text." + chunks = split_by_heading(md) + assert len(chunks) == 1 + assert chunks[0]["title"] == "Section A" + assert "Some body text" in chunks[0]["text"] + + +def test_multiple_headings_split() -> None: + md = ( + "# Intro\n\nIntro body.\n\n" + "## Background\n\nBackground body.\n\n" + "# Conclusion\n\nConcluding body." 
def test_long_section_is_soft_capped() -> None:
    """A section longer than ``max_chars`` is sliced, not truncated."""
    body = "x" * 5000
    md = f"# Long\n\n{body}"
    chunks = split_by_heading(md, max_chars=1800)
    assert len(chunks) >= 3
    for c in chunks:
        # Every slice stays attributed to its section and respects the cap.
        assert c["title"] == "Long"
        assert len(c["text"]) <= 1800
    # Checking only the count and the cap would let a chunker that drops text
    # pass; the slices must reassemble into the full section.
    assert "".join(c["text"] for c in chunks) == md