PSPDFKit · jdrhyne · Apr 27, 2026 · miguelcalderon · May 21, 2026 · miguelcalderon
@@ -0,0 +1,10 @@
+# Required
+NUTRIENT_API_KEY=
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+
+# Optional
+EMBEDDING_MODEL=text-embedding-3-small
+LLM_MODEL=claude-sonnet-4-6
+VECTOR_DB_PATH=.chroma
+PDF_FOLDER=pdfs
@@ -0,0 +1,14 @@
+.env
+.venv/
+__pycache__/
+*.pyc
+*.pyo
+.pytest_cache/
+.ruff_cache/
+.mypy_cache/
+.chroma/
+dist/
+build/
+*.egg-info/
+pdfs/*
+!pdfs/.gitkeep
@@ -0,0 +1,42 @@
+The Nutrient Sample applications are licensed with a modified BSD
+license. In plain language: you're allowed to do whatever you wish
+with the code, modify, redistribute, embed in your products (free or
+commercial), but you must include copyright, terms of usage and
+disclaimer as stated in the license.
+
+You will require a commercial Nutrient License to run these examples
+in non-demo mode. Please refer to sales@nutrient.io for details.
+
+Copyright © 2017-present PSPDFKit GmbH d/b/a Nutrient.
+All rights reserved.
+
+Redistribution and use in source or binary forms,
+with or without modification, are permitted provided
+that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+- Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the
+  distribution.
+
+- Redistributions of Nutrient Samples must include attribution to
+  Nutrient, either in documentation or other appropriate media.
+
+- Neither the name of the Nutrient, PSPDFKit GmbH, nor its developers
+  may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,28 @@
+.PHONY: install ingest ask demo lint test clean
+
+# Create a virtualenv in .venv, activate it for this recipe line, and install
+# the project in editable mode with the dev extras.
+install:
+	python -m venv .venv && . .venv/bin/activate && pip install -e '.[dev]'
+
+# Run the ingestion entry point (run.py).
+ingest:
+	python run.py
+
+# Run retrieval.ask with the question passed as Q: make ask Q="your question".
+# Prints a usage line and exits 1 when Q is empty.
+ask:
+	@if [ -z "$(Q)" ]; then echo "Usage: make ask Q=\"your question\""; exit 1; fi
+	python -m retrieval.ask "$(Q)"
+
+# Copy the sample PDFs into pdfs/ (a failed copy is ignored on purpose), run
+# the ingestion entry point, then ask one fixed question.
+demo:
+	cp samples/*.pdf pdfs/ 2>/dev/null || true
+	python run.py
+	python -m retrieval.ask "What does this document describe?"
+
+# Static checks: ruff lint, ruff formatting in check-only mode, and mypy over
+# the ingestion and retrieval packages.
+lint:
+	ruff check .
+	ruff format --check .
+	mypy ingestion retrieval
+
+# Run the test suite in quiet mode.
+test:
+	pytest -q
+
+# Remove the vector store directory, the virtualenv and the tool caches, then
+# every __pycache__ directory below the current one.
+clean:
+	rm -rf .chroma .venv .pytest_cache .ruff_cache .mypy_cache
+	find . -name __pycache__ -type d -exec rm -rf {} +
@@ -0,0 +1,102 @@
+# PDF RAG Ingestion Pipeline (Python)
+
+End-to-end ingestion pipeline for AI document apps: **PDF → Markdown → chunks → embeddings → retrieval → LLM answer**, powered by [Nutrient's PDF-to-Markdown API](https://www.nutrient.io/api/pdf-to-md-api/).
+
+Companion code for the tutorial:
+[Build a PDF ingestion pipeline for AI apps in Python](https://www.nutrient.io/blog/build-pdf-rag-ingestion-pipeline-python/).
+
+This example lives inside the [awesome-nutrient](https://github.com/PSPDFKit/awesome-nutrient) examples repo. See the root [README](../README.md) for other Nutrient examples.
+
+## Why this exists
+
+Most "I built a RAG app" tutorials skip the part that breaks in production: getting clean text out of real PDFs. Page-noise, broken tables, and lost reading order quietly degrade every chunk that hits your vector DB — and that shows up as bad retrievals, not as crashes.
+
+This repo turns a folder of PDFs into a queryable index for an LLM in five minutes, using:
+
+- **[Nutrient DWS Processor API](https://www.nutrient.io/api/pdf-to-md-api/)** for PDF → Markdown
+- **OpenAI `text-embedding-3-small`** for embeddings
+- **Chroma** for local vector storage (swap to Pinecone, pgvector, Weaviate, or Qdrant by replacing `ingestion/store.py`)
+- **Anthropic Claude** for the answer step (swap to OpenAI by editing `retrieval/ask.py`)
+
+## Quickstart (5 minutes)
+
+```bash
+git clone https://github.com/PSPDFKit/awesome-nutrient.git
+cd awesome-nutrient/pdf-rag-ingestion-pipeline-python
+python -m venv .venv && source .venv/bin/activate
+pip install -e '.[dev]'
+cp .env.example .env
+# add your NUTRIENT_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY to .env
+cp samples/*.pdf pdfs/
+python run.py
+python -m retrieval.ask "What does this document describe?"
+```
+
+## Choosing the right Nutrient path
+
+There isn't one Nutrient path for AI document ingestion — there are four. Pick by data residency, document type, and output shape:
+
+| Use case                                          | Nutrient path                                                                          | Tradeoff                                          |
+| ------------------------------------------------- | -------------------------------------------------------------------------------------- | ------------------------------------------------- |
+| Born-digital PDFs, fastest path to working RAG    | [PDF-to-Markdown API](https://www.nutrient.io/api/pdf-to-md-api/) (this repo)          | Cloud API. Documents are POSTed to Nutrient.      |
+| Born-digital PDFs, must run locally / no upload   | [`@pspdfkit/pdf-to-markdown` CLI / Claude Code skill](https://www.nutrient.io/ai/skills/pdf-to-markdown/) | Node CLI. No OCR yet.                             |
+| Scanned, image-only, or handwriting PDFs          | [Nutrient Python SDK with OCR/ICR engines](https://www.nutrient.io/sdk/python/pdf-data-extraction/) | On-prem. Heavier setup, supports tougher inputs.  |
+| Tables and key-value pairs as JSON, not Markdown  | [Data-extraction API](https://www.nutrient.io/api/data-extraction-api/)                | Different output shape. Better for forms.         |
+
+## Repo structure
+
+```
+pdf-rag-ingestion-pipeline-python/
+├─ README.md
+├─ LICENSE
+├─ pyproject.toml
+├─ Makefile
+├─ .env.example
+├─ pdfs/                     # drop your PDFs here
+├─ samples/                  # one small public-domain PDF for `make demo`
+├─ ingestion/
+│  ├─ extract.py             # PDF -> Markdown via Nutrient
+│  ├─ chunk.py               # Markdown -> chunks
+│  ├─ embed.py               # chunks -> vectors
+│  └─ store.py               # vectors -> Chroma
+├─ retrieval/
+│  └─ ask.py                 # query -> top-k context -> LLM answer
+├─ run.py                    # end-to-end CLI
+└─ tests/
+```
+
+## Configuration
+
+All knobs live in `.env`:
+
+```env
+# Required
+NUTRIENT_API_KEY=
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+
+# Optional
+EMBEDDING_MODEL=text-embedding-3-small
+LLM_MODEL=claude-sonnet-4-6
+VECTOR_DB_PATH=.chroma
+PDF_FOLDER=pdfs
+```
+
+## Production checklist
+
+- **Cache extraction.** Hash PDF bytes and skip re-extraction on unchanged files.
+- **Build an evaluation set.** ~20 questions per document type. Track retrieval hit-rate and answer correctness over time.
+- **Scanned PDFs.** Route through [Nutrient's Python SDK with OCR/ICR engines](https://www.nutrient.io/sdk/python/pdf-data-extraction/) before this pipeline.
+- **Tables and key-values as data**, not Markdown — see [data-extraction API](https://www.nutrient.io/api/data-extraction-api/).
+- **Observability.** Log Markdown length, chunk count, embedding model, retrieval top-k.
+
+
+## Benchmarks
+
+For a published, reproducible benchmark of Nutrient's open-source [PDF-to-Markdown CLI](https://www.nutrient.io/ai/skills/pdf-to-markdown/) against Docling, MarkItDown, pypdf, pymupdf4llm, and liteparse — including reading order, table structure, heading detection, and speed — see the [PDF-to-Markdown skill page](https://www.nutrient.io/ai/skills/pdf-to-markdown/). Numbers there were measured with the local CLI; if you need a head-to-head against the cloud Markdown endpoint specifically, run the comparison on your own document mix.
+
+## License
+
+See [LICENSE](./LICENSE) — modified BSD, same terms as other examples in [awesome-nutrient](https://github.com/PSPDFKit/awesome-nutrient).
+
+The Nutrient PDF-to-Markdown API and Processor backend are proprietary; usage is governed by your DWS subscription. Free trial available at [dashboard.nutrient.io](https://dashboard.nutrient.io/).
@@ -0,0 +1,37 @@
+"""Heading-aware Markdown chunking for RAG ingestion."""
+from __future__ import annotations
+
+import re
+from typing import TypedDict
+
+HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)$", re.MULTILINE)
+
+
+class Chunk(TypedDict):
+    """One unit of text returned by ``split_by_heading``."""
+    # Heading text of the section the chunk was cut from, or "(untitled)" when
+    # no heading applies.
+    title: str
+    # The chunk's text, at most ``max_chars`` characters long.
+    text: str
+
+
+def split_by_heading(md: str, max_chars: int = 1800) -> list[Chunk]:
+    """Split Markdown into chunks by heading boundaries, soft-capped at max_chars.
+
+    Falls back to a single chunk capped at max_chars when no headings are found
+    so very small or unstructured documents still yield retrievable text.
+    """
+    headings = list(HEADING_RE.finditer(md))
+    if not headings:
+        return [{"title": "(untitled)", "text": md[:max_chars]}]
+
+    sections: list[dict[str, str]] = []
+    for i, h in enumerate(headings):
+        end = headings[i + 1].start() if i + 1 < len(headings) else len(md)
+        title = h.group(2).strip()
+        body = md[h.end() : end].strip()
+        sections.append({"title": title, "body": body})
+
+    chunks: list[Chunk] = []
+    for s in sections:
+        text = f"# {s['title']}\n\n{s['body']}"
+        for i in range(0, len(text), max_chars):
+            chunks.append({"title": s["title"], "text": text[i : i + max_chars]})
+    return chunks
@@ -0,0 +1,15 @@
+"""Embed text chunks with OpenAI."""
+from __future__ import annotations
+
+import os
+
+from openai import OpenAI
+
+# Shared OpenAI client, built once when this module is imported. A missing
+# OPENAI_API_KEY falls back to an empty string here.
+# NOTE(review): this module does not call load_dotenv() itself, so values from
+# .env are presumably only visible if another module loaded them first — confirm.
+_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))
+# Embedding model name; EMBEDDING_MODEL overrides the default.
+_model = os.environ.get("EMBEDDING_MODEL", "text-embedding-3-small")
+
+
+def embed(texts: list[str]) -> list[list[float]]:
+    """Embed a batch of texts with the configured OpenAI model."""
+    response = _client.embeddings.create(model=_model, input=texts)
+    return [d.embedding for d in response.data]
@@ -0,0 +1,35 @@
+"""Convert a PDF to Markdown using the Nutrient DWS Processor API."""
+from __future__ import annotations
+
+import asyncio
+import pathlib
+
+from dotenv import load_dotenv
+from nutrient_dws import NutrientClient
+
+load_dotenv()
+
+
+async def pdf_to_markdown(pdf_path: pathlib.Path) -> str:
+    """Convert a single PDF file to Markdown via Nutrient DWS.
+
+    Uses the official `nutrient-dws` Python client. Reads
+    ``NUTRIENT_API_KEY`` from the environment.
+    """
+    async with NutrientClient() as client:
+        result = await client.convert(str(pdf_path), "markdown")
+        return result.buffer.decode("utf-8")
+
+
+async def _main() -> None:
+    sample = pathlib.Path("pdfs").glob("*.pdf")
+    pdf = next(sample, None)
+    if pdf is None:
+        print("Drop a PDF into ./pdfs first.")
+        return
+    md = await pdf_to_markdown(pdf)
+    print(md[:500])
+
+
+if __name__ == "__main__":
+    asyncio.run(_main())
@@ -0,0 +1,24 @@
+"""Persist chunk embeddings in a local Chroma collection."""
+from __future__ import annotations
+
+import os
+
+import chromadb
+
+# Persistent Chroma client rooted at VECTOR_DB_PATH (default ".chroma"),
+# opened when this module is imported.
+_client = chromadb.PersistentClient(path=os.environ.get("VECTOR_DB_PATH", ".chroma"))
+# The one collection used by upsert() and query(), fetched or created by name.
+_collection = _client.get_or_create_collection("pdf-rag")
+
+
+def upsert(
+    ids: list[str],
+    docs: list[str],
+    metas: list[dict],
+    embeddings: list[list[float]],
+) -> None:
+    """Upsert a batch of chunks into the local Chroma collection."""
+    _collection.upsert(ids=ids, documents=docs, metadatas=metas, embeddings=embeddings)
+
+
+def query(question: str, k: int = 6) -> dict:
+    """Query the collection for top-k chunks matching the question."""
+    return _collection.query(query_texts=[question], n_results=k)
@@ -0,0 +1,49 @@
+[project]
+name = "pdf-rag-ingestion-pipeline-python"
+version = "0.1.0"
+description = "End-to-end Python pipeline: PDFs to Markdown to embeddings to RAG, powered by Nutrient PDF-to-Markdown."
+readme = "README.md"
+requires-python = ">=3.10"
+# The example ships under the modified BSD terms in the LICENSE file (as the
+# README's License section states), not Apache-2.0, so reference that file
+# rather than naming a different license.
+license = { file = "LICENSE" }
+authors = [
+  { name = "Nutrient", email = "support@nutrient.io" },
+]
+dependencies = [
+  "nutrient-dws>=1.0.0",
+  "chromadb>=0.5",
+  "openai>=1.40",
+  "anthropic>=0.40",
+  "python-dotenv>=1.0",
+  "tqdm>=4.66",
+]
+
+# Development tooling, pulled in by `pip install -e '.[dev]'` (the command used
+# by the Makefile's install target and the README quickstart).
+[project.optional-dependencies]
+dev = [
+  "ruff>=0.6",
+  "mypy>=1.10",
+  "pytest>=8",
+  "pytest-asyncio>=0.23",
+]
+
+# Built with hatchling.
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+# Only the ingestion and retrieval packages are listed for the wheel; run.py
+# is not part of this list.
+[tool.hatch.build.targets.wheel]
+packages = ["ingestion", "retrieval"]
+
+# Linter/formatter settings; the target version matches requires-python (>=3.10).
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+
+[tool.ruff.lint]
+select = ["E", "F", "I", "B", "UP"]
+
+# Type checking is kept lenient: imports without type information are ignored
+# and untyped function definitions are allowed.
+[tool.mypy]
+python_version = "3.10"
+ignore_missing_imports = true
+disallow_untyped_defs = false
+
+# Tests live under tests/. asyncio_mode is a pytest-asyncio setting
+# (pytest-asyncio is part of the dev extras).
+[tool.pytest.ini_options]
+asyncio_mode = "auto"
+testpaths = ["tests"]