From 082c23692e2d94cba60e6e9d450ac2d4a6e23bb7 Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Sat, 23 May 2026 00:48:05 +0800
Subject: [PATCH] fix: preserve underlined docx text
---
.../markitdown/converters/_docx_converter.py | 9 ++++
.../src/markitdown/converters/_markdownify.py | 12 ++++++
packages/markitdown/tests/test_module_misc.py | 42 +++++++++++++++++++
3 files changed, 63 insertions(+)
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 3975107b1..509ab887f 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -27,6 +27,8 @@
ACCEPTED_FILE_EXTENSIONS = [".docx"]
+DEFAULT_STYLE_MAP = "u => u"  # mammoth style-map rule: emit underlined runs as <u> elements
+
class DocxConverter(HtmlConverter):
"""
@@ -76,6 +78,13 @@ def convert(
)
style_map = kwargs.get("style_map", None)
+ if style_map is None:
+ style_map = DEFAULT_STYLE_MAP
+ elif isinstance(style_map, str) and not any(
+ line.strip().startswith("u =>") for line in style_map.splitlines()
+ ):
+ style_map = f"{style_map}\n{DEFAULT_STYLE_MAP}"
+
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 19e8a2984..1a0b831be 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -122,5 +122,17 @@ def convert_input(
return "[x] " if el.has_attr("checked") else "[ ] "
return ""
+    def convert_u(
+        self,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ) -> str:
+        """Render a <u> element as its bare text; no Markdown markup is added."""
+        lead, trail, core = markdownify.chomp(text)  # type: ignore
+        # Nothing left after chomp -> the element contributes no output at all.
+        return f"{lead}{core}{trail}" if core else ""
+
def convert_soup(self, soup: Any) -> str:
return super().convert_soup(soup) # type: ignore
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..b69632298 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
import os
import re
import shutil
+import zipfile
import pytest
from unittest.mock import MagicMock
@@ -274,6 +275,40 @@ def test_docx_equations() -> None:
assert block_equations, "No block equations found in the document."
+def test_docx_underline(tmp_path) -> None:
+ docx_file = tmp_path / "underlined.docx"
+ files = {
+ "[Content_Types].xml": """
+
plain underlined
" + result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html") + assert "plain underlined" in result.text_content + + def test_deeply_nested_html_fallback() -> None: """Large, deeply nested HTML should fall back to plain-text extraction instead of silently returning unconverted HTML (issue #1636).