CogStack · vladd-bit · May 28, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -84,11 +84,11 @@ RUN apt-get -y --no-install-recommends -o Dpkg::Options::="--force-confold" -y -
 ##### utils for python and TESSERACT
 RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections
 
-RUN apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
+RUN apt-get update && apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
     libsm6 libxext6 gstreamer1.0-libav fonts-deva fonts-dejavu fonts-gfs-didot fonts-gfs-didot-classic fonts-junicode fonts-ebgaramond fonts-noto-cjk fonts-takao-gothic fonts-vlgothic \
     ghostscript ghostscript-x gsfonts gsfonts-other gsfonts-x11 fonts-croscore fonts-crosextra-caladea fonts-crosextra-carlito fonts-liberation fonts-open-sans fonts-noto-core fonts-ibm-plex fonts-urw-base35 \
     fonts-noto fonts-noto-cjk fonts-noto-extra xfonts-terminus fonts-font-awesome fonts-hack fonts-inconsolata fonts-liberation2 fonts-mononoki \
-    libpcre3 libpcre3-dev \
+    libpcre3 libpcre3-dev libxml2 libxml2-dev libxslt1.1 libxslt-dev \
     mesa-opencl-icd pocl-opencl-icd libvips-tools libvips libvips-dev \
     imagemagick libcairo2-dev tesseract-ocr tesseract-ocr-all libtesseract5 libtesseract-dev libleptonica-dev liblept5
 
@@ -168,5 +168,4 @@ RUN groupadd --system --gid "$OCR_SERVICE_GID" ocrsvc && \
 ENV HOME=/home/ocrsvc
 USER ocrsvc
 
-# Now run the simple api
-CMD ["/bin/bash", "start_service_production.sh"]
+CMD ["./start_service_production.sh"]
diff --git a/Makefile b/Makefile
@@ -0,0 +1,24 @@
+# Convenience targets for working with the OCR service Helm chart.
+# Every variable assigned with ?= can be overridden from the environment or on
+# the command line, e.g. `make helm-install HELM_RELEASE=my-ocr`.
+
+# Run recipes through bash and let a pipeline fail when any stage in it fails.
+SHELL := /usr/bin/env bash
+.SHELLFLAGS := -o pipefail -c
+
+# Release name handed to helm upgrade/template/uninstall.
+HELM_RELEASE ?= ocr-service
+# Path of the chart to install, render or lint.
+HELM_CHART ?= ./charts/ocr-service
+# Extra values file layered on top of the chart defaults by helm-install-text-only.
+HELM_TEXT_ONLY_VALUES ?= ./charts/ocr-service/values-text-only.yaml
+# Free-form extra arguments appended to the helm command line (empty by default).
+HELM_ARGS ?=
+
+# None of the targets produce a file named after themselves.
+.PHONY: helm-install helm-install-text-only helm-template helm-lint helm-uninstall
+
+# Install the release, or upgrade it in place when it already exists.
+helm-install:
+	helm upgrade --install $(HELM_RELEASE) $(HELM_CHART) $(HELM_ARGS)
+
+# Same as helm-install, with the values file from HELM_TEXT_ONLY_VALUES applied.
+helm-install-text-only:
+	helm upgrade --install $(HELM_RELEASE) $(HELM_CHART) -f $(HELM_TEXT_ONLY_VALUES) $(HELM_ARGS)
+
+# Render the chart manifests locally.
+helm-template:
+	helm template $(HELM_RELEASE) $(HELM_CHART) $(HELM_ARGS)
+
+# Lint the chart; HELM_ARGS is not passed to this target.
+helm-lint:
+	helm lint $(HELM_CHART)
+
+# Remove the release.
+helm-uninstall:
+	helm uninstall $(HELM_RELEASE) $(HELM_ARGS)
diff --git a/export_env_vars.sh b/export_env_vars.sh
diff --git a/ocr_service/app/app.py b/ocr_service/app/app.py
@@ -141,7 +141,7 @@ def create_app() -> FastAPI:
     global _started
 
     try:
-        app = FastAPI(title="OCR Service",
+        app = FastAPI(title="OCR_Service",
                       description="OCR Service API",
                       version=settings.OCR_SERVICE_VERSION,
                       default_response_class=ORJSONResponse,

diff --git a/ocr_service/processor/converter.py b/ocr_service/processor/converter.py
@@ -3,9 +3,12 @@
 import atexit
 import multiprocessing
 import os
+import re
 import time
 import traceback
 import uuid
+import zipfile
+from html import unescape
 from io import BytesIO
 from subprocess import PIPE, Popen
 from threading import Timer
@@ -19,12 +22,21 @@
 
 from ocr_service.dto.process_context import ProcessContext
 from ocr_service.settings import settings
-from ocr_service.utils.utils import INPUT_FILTERS, delete_tmp_files, terminate_hanging_process
+from ocr_service.utils.utils import (
+    INPUT_FILTERS,
+    delete_tmp_files,
+    is_encrypted_office_document,
+    terminate_hanging_process,
+)
 
 CURRENT_PDF_FILE: pdfium.PdfDocument | None = None
 
 
 class DocumentConverter:
+
+    MULTI_WHITESPACE = re.compile(r"[ \t]+")
+    MULTI_NEWLINES = re.compile(r"\n{3,}")
+
     def __init__(self, log, loffice_process_list: dict[str, Any]) -> None:
         self.log = log
         self.loffice_process_list = loffice_process_list
@@ -45,8 +57,15 @@ def resolve_content_type(file_type: object | None) -> str:
 
     @staticmethod
     def finalize_output_text(output_text: str) -> str:
-        output_text = output_text.translate({'\\n': '', '\\t': '', '\n\n': '\n'})  # type: ignore
-        return str(output_text).encode("utf-8", errors="replace").decode("utf-8")
+        """Normalize whitespace in extracted text and make it UTF-8 safe.
+
+        Line endings are unified to a single line feed, runs of spaces/tabs
+        become one space, the space left dangling at the end of a line is
+        dropped, and three or more consecutive line feeds are reduced to two.
+        Characters that cannot be encoded as UTF-8 are replaced with ``?``.
+
+        Args:
+            output_text: raw text produced by OCR or a text fallback.
+
+        Returns:
+            The cleaned text with leading/trailing whitespace stripped.
+        """
+        # normalize line endings
+        output_text = output_text.replace("\r\n", "\n").replace("\r", "\n")
+        # collapse runs of spaces/tabs into a single space
+        output_text = DocumentConverter.MULTI_WHITESPACE.sub(" ", output_text)
+        # After the collapse any end-of-line whitespace is exactly one space.
+        # Drop it, otherwise whitespace-only lines ("\n \n \n") hide consecutive
+        # blank lines from the newline collapse below.
+        output_text = output_text.replace(" \n", "\n")
+        # collapse three or more newlines into a single blank line
+        output_text = DocumentConverter.MULTI_NEWLINES.sub("\n\n", output_text)
+
+        return output_text.encode("utf-8", errors="replace").decode("utf-8").strip()
 
     def _extract_text_fallback(self, 
                                stream: bytes, *,
@@ -57,7 +76,7 @@ def _extract_text_fallback(self,
         text = ""
 
         if is_html or is_xml:
-            parser = "html.parser" if is_html else "xml"
+            parser = "html.parser" if is_html else "lxml-xml"
             try:
                 soup = BeautifulSoup(stream, parser)
             except Exception:
@@ -70,16 +89,34 @@ def _extract_text_fallback(self,
             else:
                 text = soup.get_text(separator="\n")
 
+            # remove XML-ish self-closing tags
+            text = re.sub(r"<[^>]+/>", "", text)
+            # remove empty XML tags
+            text = re.sub(r"</?[\w:.-]+>", "", text)        
+
         if not text and is_rtf:
             try:
                 text = rtf_to_text(stream.decode("utf-8", "ignore"))
             except Exception:
                 self.log.warning("Failed to parse RTF during fallback; using raw decode")
 
         if not text:
-            text = stream.decode("utf-8", "ignore")
+            text = stream.decode("utf-8", "ignore") 
+
+        return unescape(text)
 
-        return text.strip()
+    def _extract_office_zip_text_fallback(self, stream: bytes, file_name: str) -> str:
+        """Best-effort text extraction from the main XML part of a zipped Office file.
+
+        Only ``.docx`` (``word/document.xml``) and ``.odt`` (``content.xml``) are
+        handled; the selected part is passed to ``_extract_text_fallback`` as XML.
+
+        Args:
+            stream: raw bytes of the document.
+            file_name: name whose extension selects which XML part to read.
+
+        Returns:
+            The extracted text, or ``""`` when the extension is not supported or
+            the archive cannot be opened, read or parsed.
+        """
+        ext = os.path.splitext(file_name)[1].lower()
+        xml_path = {".docx": "word/document.xml", ".odt": "content.xml"}.get(ext)
+        if not xml_path:
+            return ""
+
+        try:
+            with zipfile.ZipFile(BytesIO(stream)) as archive:
+                # NOTE(review): the part is read fully into memory with no size cap;
+                # consider checking archive.getinfo(xml_path).file_size if the
+                # input can be untrusted.
+                return self._extract_text_fallback(archive.read(xml_path), is_xml=True)
+        except Exception:
+            # Deliberately best-effort, but keep the traceback so a corrupt archive
+            # can be told apart from a missing part or a parser failure.
+            self.log.warning(
+                "Failed to extract %s from %s during fallback",
+                xml_path,
+                file_name,
+                exc_info=True,
+            )
+            return ""
 
     @staticmethod
     def initialize_pdf_worker(stream: bytes) -> None:
@@ -111,11 +148,14 @@ def render_page(page_num: int) -> Image.Image:
             crop=(0, 0, 0, 0),
             grayscale=settings.OCR_CONVERT_GRAYSCALE_IMAGES,
         ).to_pil()
+
+
         page.close()
 
         return img
 
     def _pdf_to_img(self, stream: bytes) -> tuple[list[Image.Image], dict]:
+
         pdf_image_pages = []
         doc_metadata: dict[str, Any] = {}
 
@@ -351,6 +391,35 @@ def _xml_to_text(self, ctx: ProcessContext) -> str:
         return " ".join(parts)
 
 
+    def _apply_text_fallback(
+        self,
+        ctx: ProcessContext,
+        *,
+        is_html: bool = False,
+        is_xml: bool = False,
+        is_rtf: bool = False,
+        reason: str,
+    ) -> None:
+        """Replace the PDF/image state on ``ctx`` with directly extracted text.
+
+        The Office zip extraction is tried first; when it yields nothing the
+        generic text extraction runs with the given format hints. The context is
+        then marked as a single page of ``text/plain`` and ``reason`` is stored
+        under ``fallback_reason``.
+
+        Args:
+            ctx: processing context, mutated in place.
+            is_html: forwarded to ``_extract_text_fallback``.
+            is_xml: forwarded to ``_extract_text_fallback``.
+            is_rtf: forwarded to ``_extract_text_fallback``.
+            reason: short tag describing why the fallback was taken.
+        """
+        self.log.warning("Falling back to text extraction for %s after %s", ctx.file_name, reason)
+
+        # Discard conversion artefacts so only the text result remains on ctx.
+        ctx.pdf_stream, ctx.images = b"", []
+
+        # An empty result from the zip path means "not applicable or failed".
+        ctx.output_text = self._extract_office_zip_text_fallback(ctx.stream, ctx.file_name)
+        if not ctx.output_text:
+            format_hints = {"is_html": is_html, "is_xml": is_xml, "is_rtf": is_rtf}
+            ctx.output_text = self._extract_text_fallback(ctx.stream, **format_hints)
+
+        fallback_metadata = (
+            ("pages", 1),
+            ("content-type", "text/plain"),
+            ("fallback_reason", reason),
+        )
+        for key, value in fallback_metadata:
+            ctx.metadata[key] = value
+
+
     def _handle_pdf_stream(self, ctx: ProcessContext) -> None:
         if settings.OPERATION_MODE == "OCR":
             ctx.images, pdf_metadata = self._preprocess_pdf_to_img(ctx.pdf_stream)
@@ -364,11 +433,24 @@ def prepare(self, ctx: ProcessContext) -> None:
 
         self.log.info("Checking file type for doc id: %s", ctx.file_name)
 
+        if is_encrypted_office_document(ctx.stream):
+            self.log.warning(
+                "Encrypted Office document detected for %s; skipping LibreOffice conversion",
+                ctx.file_name,
+            )
+            ctx.metadata["content-type"] = "application/vnd.openxmlformats-officedocument"
+            ctx.metadata["encrypted"] = True
+            ctx.metadata["unsupported_reason"] = "encrypted_office_document"
+            ctx.metadata["pages"] = 0
+            return
+
         _is_pdf = type(ctx.file_type) is archive.Pdf
         _is_rtf = type(ctx.file_type) is archive.Rtf or ctx.checks.is_rtf()
         _is_html = ctx.checks.is_html()
         _is_xml = ctx.checks.is_xml() and not _is_html
         _is_plain = ctx.checks.is_plain_text()
+        _has_office_zip_fallback = os.path.splitext(ctx.file_name)[1].lower() in {".docx", ".odt"}
+        text_fallback_allowed = _is_xml or _is_rtf or _has_office_zip_fallback
 
         if _is_pdf:
             ctx.pdf_stream = ctx.stream
@@ -427,19 +509,38 @@ def prepare(self, ctx: ProcessContext) -> None:
             self.log.info("Unknown file type; attempting to convert to PDF via unoserver/LO")
             ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name)
 
-        if not ctx.pdf_stream and not ctx.output_text and ctx.checks.is_text_like():
-            self.log.warning(
-                "No PDF produced for %s; falling back to plain-text extraction",
-                ctx.file_name,
-            )
-            ctx.output_text = self._extract_text_fallback(
-                ctx.stream,
+        if not ctx.pdf_stream and not ctx.output_text and (ctx.checks.is_text_like() or _has_office_zip_fallback):
+            self._apply_text_fallback(
+                ctx,
                 is_html=_is_html,
                 is_xml=_is_xml,
                 is_rtf=_is_rtf,
+                reason="no_pdf_produced",
             )
-            ctx.metadata["pages"] = 1
-            ctx.metadata["content-type"] = "text/plain"
 
         if ctx.pdf_stream:
-            self._handle_pdf_stream(ctx) 
+            try:
+                self._handle_pdf_stream(ctx)
+            except Exception:
+                if not text_fallback_allowed:
+                    raise
+                self.log.exception(
+                    "Converted PDF handling failed for %s; trying text fallback",
+                    ctx.file_name,
+                )
+                self._apply_text_fallback(
+                    ctx,
+                    is_html=_is_html,
+                    is_xml=_is_xml,
+                    is_rtf=_is_rtf,
+                    reason="converted_pdf_handling_failed",
+                )
+            else:
+                if text_fallback_allowed and not ctx.output_text and not ctx.images:
+                    self._apply_text_fallback(
+                        ctx,
+                        is_html=_is_html,
+                        is_xml=_is_xml,
+                        is_rtf=_is_rtf,
+                        reason="converted_pdf_handling_failed",
+                    )
diff --git a/ocr_service/processor/ocr_engine.py b/ocr_service/processor/ocr_engine.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from enum import Enum
 import logging
 import time
 import traceback
@@ -11,11 +12,16 @@
 from ocr_service.settings import settings
 
 
+class OcrPipeline(str, Enum):
+    """String-valued identifiers for OCR pipelines.
+
+    Mixing in ``str`` makes every member compare equal to its plain string value.
+    """
+
+    # NOTE(review): no consumer of this enum is visible in this change — confirm
+    # where the values are read before relying on the member meanings below.
+    TESSERACT = "tesseract"
+    VLM = "vlm"
+    # presumably a combination of the two pipelines above — TODO confirm
+    MIXED = "mixed"
+
+
 class OcrEngine:
     def __init__(self, log: logging.Logger) -> None:
         self.log = log
 
-
     def _init_tesseract_api_worker(self) -> PyTessBaseAPI:
         tesseract_api = PyTessBaseAPI(path=settings.TESSDATA_PREFIX, lang=settings.TESSERACT_LANGUAGE)  # type: ignore
         self.log.debug("Initialised pytesseract api worker for language:" + str(settings.TESSERACT_LANGUAGE))

diff --git a/ocr_service/settings.py b/ocr_service/settings.py
@@ -1,6 +1,7 @@
 # mypy: disable-error-code=prop-decorator
 
 import ast
+from enum import Enum
 import logging
 import multiprocessing
 import os
@@ -89,6 +90,10 @@ def validate_lo_port_range(cls, value: str | None) -> str | None:
         return value
 
     def model_post_init(self, __context: Any) -> None:
+        """
+            Performs additional actions after the model is instantiated and all field validators are applied.
+        """
+
         default_lo_python = "/Applications/LibreOffice.app/Contents/Resources/python"
         default_lo_exec = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
         tessdata_prefix = self.OCR_TESSDATA_PREFIX