Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,11 @@ RUN apt-get -y --no-install-recommends -o Dpkg::Options::="--force-confold" -y -
##### utils for python and TESSERACT
RUN echo "ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true" | debconf-set-selections

RUN apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
RUN apt-get update && apt-get install -y --no-install-recommends fontconfig ttf-mscorefonts-installer libimage-exiftool-perl libtcnative-1 \
libsm6 libxext6 gstreamer1.0-libav fonts-deva fonts-dejavu fonts-gfs-didot fonts-gfs-didot-classic fonts-junicode fonts-ebgaramond fonts-noto-cjk fonts-takao-gothic fonts-vlgothic \
ghostscript ghostscript-x gsfonts gsfonts-other gsfonts-x11 fonts-croscore fonts-crosextra-caladea fonts-crosextra-carlito fonts-liberation fonts-open-sans fonts-noto-core fonts-ibm-plex fonts-urw-base35 \
fonts-noto fonts-noto-cjk fonts-noto-extra xfonts-terminus fonts-font-awesome fonts-hack fonts-inconsolata fonts-liberation2 fonts-mononoki \
libpcre3 libpcre3-dev \
libpcre3 libpcre3-dev libxml2 libxml2-dev libxslt1.1 libxslt-dev \
mesa-opencl-icd pocl-opencl-icd libvips-tools libvips libvips-dev \
imagemagick libcairo2-dev tesseract-ocr tesseract-ocr-all libtesseract5 libtesseract-dev libleptonica-dev liblept5

Expand Down Expand Up @@ -168,5 +168,4 @@ RUN groupadd --system --gid "$OCR_SERVICE_GID" ocrsvc && \
ENV HOME=/home/ocrsvc
USER ocrsvc

# Now run the simple api
CMD ["/bin/bash", "start_service_production.sh"]
CMD ["./start_service_production.sh"]
24 changes: 24 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Helm convenience targets for the OCR service chart.
#
# Every variable below is assigned with `?=`, so it can be overridden from the
# environment or the command line, e.g.:
#   make helm-install HELM_RELEASE=my-ocr HELM_ARGS="--namespace ocr"

# Run recipes through bash and make a pipeline fail when any stage fails.
SHELL := /usr/bin/env bash
.SHELLFLAGS := -o pipefail -c

# Release name handed to helm.
HELM_RELEASE ?= ocr-service
# Path of the chart to install / template / lint.
HELM_CHART ?= ./charts/ocr-service
# Extra values file layered on top by `helm-install-text-only`.
HELM_TEXT_ONLY_VALUES ?= ./charts/ocr-service/values-text-only.yaml
# Additional arguments appended verbatim to the helm command line.
HELM_ARGS ?=

.PHONY: helm-install helm-install-text-only helm-template helm-lint helm-uninstall

# NOTE: recipe lines must start with a TAB character; without it make aborts
# with "missing separator".

# Install the release, or upgrade it when it already exists.
helm-install:
	helm upgrade --install $(HELM_RELEASE) $(HELM_CHART) $(HELM_ARGS)

# Same as helm-install, additionally passing the text-only values file via -f.
helm-install-text-only:
	helm upgrade --install $(HELM_RELEASE) $(HELM_CHART) -f $(HELM_TEXT_ONLY_VALUES) $(HELM_ARGS)

# Render the chart templates locally.
helm-template:
	helm template $(HELM_RELEASE) $(HELM_CHART) $(HELM_ARGS)

# Lint the chart.
helm-lint:
	helm lint $(HELM_CHART)

# Remove the release.
helm-uninstall:
	helm uninstall $(HELM_RELEASE) $(HELM_ARGS)
Empty file modified export_env_vars.sh
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion ocr_service/app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def create_app() -> FastAPI:
global _started

try:
app = FastAPI(title="OCR Service",
app = FastAPI(title="OCR_Service",
description="OCR Service API",
version=settings.OCR_SERVICE_VERSION,
default_response_class=ORJSONResponse,
Expand Down
133 changes: 117 additions & 16 deletions ocr_service/processor/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@
import atexit
import multiprocessing
import os
import re
import time
import traceback
import uuid
import zipfile
from html import unescape
from io import BytesIO
from subprocess import PIPE, Popen
from threading import Timer
Expand All @@ -19,12 +22,21 @@

from ocr_service.dto.process_context import ProcessContext
from ocr_service.settings import settings
from ocr_service.utils.utils import INPUT_FILTERS, delete_tmp_files, terminate_hanging_process
from ocr_service.utils.utils import (
INPUT_FILTERS,
delete_tmp_files,
is_encrypted_office_document,
terminate_hanging_process,
)

CURRENT_PDF_FILE: pdfium.PdfDocument | None = None


class DocumentConverter:

MULTI_WHITESPACE = re.compile(r"[ \t]+")
MULTI_NEWLINES = re.compile(r"\n{3,}")

def __init__(self, log, loffice_process_list: dict[str, Any]) -> None:
self.log = log
self.loffice_process_list = loffice_process_list
Expand All @@ -45,8 +57,15 @@ def resolve_content_type(file_type: object | None) -> str:

@staticmethod
def finalize_output_text(output_text: str) -> str:
output_text = output_text.translate({'\\n': '', '\\t': '', '\n\n': '\n'}) # type: ignore
return str(output_text).encode("utf-8", errors="replace").decode("utf-8")

# normalize line endings
output_text = output_text.replace("\r\n", "\n").replace("\r", "\n")
# remove multiple whitespaces
output_text = DocumentConverter.MULTI_WHITESPACE.sub(" ", output_text)
# remove multiple new-lines
output_text = DocumentConverter.MULTI_NEWLINES.sub("\n\n", output_text)

return output_text.encode("utf-8", errors="replace").decode("utf-8").strip()

def _extract_text_fallback(self,
stream: bytes, *,
Expand All @@ -57,7 +76,7 @@ def _extract_text_fallback(self,
text = ""

if is_html or is_xml:
parser = "html.parser" if is_html else "xml"
parser = "html.parser" if is_html else "lxml-xml"
try:
soup = BeautifulSoup(stream, parser)
except Exception:
Expand All @@ -70,16 +89,34 @@ def _extract_text_fallback(self,
else:
text = soup.get_text(separator="\n")

# remove XML-ish self-closing tags
text = re.sub(r"<[^>]+/>", "", text)
# remove empty XML tags
text = re.sub(r"</?[\w:.-]+>", "", text)

if not text and is_rtf:
try:
text = rtf_to_text(stream.decode("utf-8", "ignore"))
except Exception:
self.log.warning("Failed to parse RTF during fallback; using raw decode")

if not text:
text = stream.decode("utf-8", "ignore")
text = stream.decode("utf-8", "ignore")

return unescape(text)

return text.strip()
def _extract_office_zip_text_fallback(self, stream: bytes, file_name: str) -> str:
ext = os.path.splitext(file_name)[1].lower()
xml_path = {".docx": "word/document.xml", ".odt": "content.xml"}.get(ext)
if not xml_path:
return ""

try:
with zipfile.ZipFile(BytesIO(stream)) as archive:
return self._extract_text_fallback(archive.read(xml_path), is_xml=True)
except Exception:
self.log.warning("Failed to extract %s from %s during fallback", xml_path, file_name)
return ""

@staticmethod
def initialize_pdf_worker(stream: bytes) -> None:
Expand Down Expand Up @@ -111,11 +148,14 @@ def render_page(page_num: int) -> Image.Image:
crop=(0, 0, 0, 0),
grayscale=settings.OCR_CONVERT_GRAYSCALE_IMAGES,
).to_pil()


page.close()

return img

def _pdf_to_img(self, stream: bytes) -> tuple[list[Image.Image], dict]:

pdf_image_pages = []
doc_metadata: dict[str, Any] = {}

Expand Down Expand Up @@ -351,6 +391,35 @@ def _xml_to_text(self, ctx: ProcessContext) -> str:
return " ".join(parts)


def _apply_text_fallback(
self,
ctx: ProcessContext,
*,
is_html: bool = False,
is_xml: bool = False,
is_rtf: bool = False,
reason: str,
) -> None:
self.log.warning(
"Falling back to text extraction for %s after %s",
ctx.file_name,
reason,
)
ctx.pdf_stream = b""
ctx.images = []
ctx.output_text = self._extract_office_zip_text_fallback(ctx.stream, ctx.file_name)
if not ctx.output_text:
ctx.output_text = self._extract_text_fallback(
ctx.stream,
is_html=is_html,
is_xml=is_xml,
is_rtf=is_rtf,
)
ctx.metadata["pages"] = 1
ctx.metadata["content-type"] = "text/plain"
ctx.metadata["fallback_reason"] = reason


def _handle_pdf_stream(self, ctx: ProcessContext) -> None:
if settings.OPERATION_MODE == "OCR":
ctx.images, pdf_metadata = self._preprocess_pdf_to_img(ctx.pdf_stream)
Expand All @@ -364,11 +433,24 @@ def prepare(self, ctx: ProcessContext) -> None:

self.log.info("Checking file type for doc id: %s", ctx.file_name)

if is_encrypted_office_document(ctx.stream):
self.log.warning(
"Encrypted Office document detected for %s; skipping LibreOffice conversion",
ctx.file_name,
)
ctx.metadata["content-type"] = "application/vnd.openxmlformats-officedocument"
ctx.metadata["encrypted"] = True
ctx.metadata["unsupported_reason"] = "encrypted_office_document"
ctx.metadata["pages"] = 0
return

_is_pdf = type(ctx.file_type) is archive.Pdf
_is_rtf = type(ctx.file_type) is archive.Rtf or ctx.checks.is_rtf()
_is_html = ctx.checks.is_html()
_is_xml = ctx.checks.is_xml() and not _is_html
_is_plain = ctx.checks.is_plain_text()
_has_office_zip_fallback = os.path.splitext(ctx.file_name)[1].lower() in {".docx", ".odt"}
text_fallback_allowed = _is_xml or _is_rtf or _has_office_zip_fallback

if _is_pdf:
ctx.pdf_stream = ctx.stream
Expand Down Expand Up @@ -427,19 +509,38 @@ def prepare(self, ctx: ProcessContext) -> None:
self.log.info("Unknown file type; attempting to convert to PDF via unoserver/LO")
ctx.pdf_stream = self._preprocess_doc(ctx.stream, file_name=ctx.file_name)

if not ctx.pdf_stream and not ctx.output_text and ctx.checks.is_text_like():
self.log.warning(
"No PDF produced for %s; falling back to plain-text extraction",
ctx.file_name,
)
ctx.output_text = self._extract_text_fallback(
ctx.stream,
if not ctx.pdf_stream and not ctx.output_text and (ctx.checks.is_text_like() or _has_office_zip_fallback):
self._apply_text_fallback(
ctx,
is_html=_is_html,
is_xml=_is_xml,
is_rtf=_is_rtf,
reason="no_pdf_produced",
)
ctx.metadata["pages"] = 1
ctx.metadata["content-type"] = "text/plain"

if ctx.pdf_stream:
self._handle_pdf_stream(ctx)
try:
self._handle_pdf_stream(ctx)
except Exception:
if not text_fallback_allowed:
raise
self.log.exception(
"Converted PDF handling failed for %s; trying text fallback",
ctx.file_name,
)
self._apply_text_fallback(
ctx,
is_html=_is_html,
is_xml=_is_xml,
is_rtf=_is_rtf,
reason="converted_pdf_handling_failed",
)
else:
if text_fallback_allowed and not ctx.output_text and not ctx.images:
self._apply_text_fallback(
ctx,
is_html=_is_html,
is_xml=_is_xml,
is_rtf=_is_rtf,
reason="converted_pdf_handling_failed",
)
8 changes: 7 additions & 1 deletion ocr_service/processor/ocr_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from enum import Enum
import logging
import time
import traceback
Expand All @@ -11,11 +12,16 @@
from ocr_service.settings import settings


class OcrPipeline(str, Enum):
    """String-valued identifiers for the available OCR pipelines.

    Because of the ``str`` mixin every member compares equal to its plain
    string value (``OcrPipeline.VLM == "vlm"``) and a member can be looked up
    from that string (``OcrPipeline("vlm")``).
    """

    TESSERACT = "tesseract"
    VLM = "vlm"
    MIXED = "mixed"


class OcrEngine:
def __init__(self, log: logging.Logger) -> None:
self.log = log


def _init_tesseract_api_worker(self) -> PyTessBaseAPI:
tesseract_api = PyTessBaseAPI(path=settings.TESSDATA_PREFIX, lang=settings.TESSERACT_LANGUAGE) # type: ignore
self.log.debug("Initialised pytesseract api worker for language:" + str(settings.TESSERACT_LANGUAGE))
Expand Down
5 changes: 5 additions & 0 deletions ocr_service/settings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# mypy: disable-error-code=prop-decorator

import ast
from enum import Enum
import logging
import multiprocessing
import os
Expand Down Expand Up @@ -89,6 +90,10 @@ def validate_lo_port_range(cls, value: str | None) -> str | None:
return value

def model_post_init(self, __context: Any) -> None:
"""
Performs additional actions after the model is instantiated and all field validators are applied.
"""

default_lo_python = "/Applications/LibreOffice.app/Contents/Resources/python"
default_lo_exec = "/Applications/LibreOffice.app/Contents/MacOS/soffice"
tessdata_prefix = self.OCR_TESSDATA_PREFIX
Expand Down
Loading
Loading