From 082c23692e2d94cba60e6e9d450ac2d4a6e23bb7 Mon Sep 17 00:00:00 2001
From: Yufeng He <40085740+he-yufeng@users.noreply.github.com>
Date: Sat, 23 May 2026 00:48:05 +0800
Subject: [PATCH] fix: preserve underlined docx text

---
 .../markitdown/converters/_docx_converter.py  |  9 ++++
 .../src/markitdown/converters/_markdownify.py | 12 ++++++
 packages/markitdown/tests/test_module_misc.py | 42 +++++++++++++++++++
 3 files changed, 63 insertions(+)
diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 3975107b1..509ab887f 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -27,6 +27,8 @@
 
 ACCEPTED_FILE_EXTENSIONS = [".docx"]
 
+DEFAULT_STYLE_MAP = "u => u"  # mammoth style rule: keep underline as <u>
+
 
 class DocxConverter(HtmlConverter):
     """
@@ -76,6 +78,13 @@ def convert(
             )
 
         style_map = kwargs.get("style_map", None)
+        if style_map is None:
+            style_map = DEFAULT_STYLE_MAP
+        elif isinstance(style_map, str) and not any(
+            line.strip().startswith("u =>") for line in style_map.splitlines()
+        ):
+            style_map = f"{style_map}\n{DEFAULT_STYLE_MAP}"
+
         pre_process_stream = pre_process_docx(file_stream)
         return self._html_converter.convert_string(
             mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 19e8a2984..1a0b831be 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -122,5 +122,17 @@ def convert_input(
             return "[x] " if el.has_attr("checked") else "[ ] "
         return ""
 
+    def convert_u(
+        self,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ) -> str:
+        """Emit underlined content as an inline HTML ``<u>`` element."""
+        # chomp() splits edge whitespace off so it stays outside the <u> tags.
+        lead, trail, body = markdownify.chomp(text)  # type: ignore
+        return f"{lead}<u>{body}</u>{trail}" if body else ""
+
     def convert_soup(self, soup: Any) -> str:
         return super().convert_soup(soup)  # type: ignore
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..b69632298 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
 import os
 import re
 import shutil
+import zipfile
 import pytest
 from unittest.mock import MagicMock
 
@@ -274,6 +275,40 @@ def test_docx_equations() -> None:
     assert block_equations, "No block equations found in the document."
 
 
+def test_docx_underline(tmp_path) -> None:
+    """A DOCX run with single underline should convert to inline ``<u>``."""
+    docx_path = tmp_path / "underlined.docx"
+    package_parts = {
+        "[Content_Types].xml": """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+</Types>""",
+        "_rels/.rels": """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+</Relationships>""",
+        "word/_rels/document.xml.rels": """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"/>""",
+        "word/document.xml": """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:r><w:t>plain </w:t></w:r>
+      <w:r><w:rPr><w:u w:val="single"/></w:rPr><w:t>underlined</w:t></w:r>
+    </w:p>
+    <w:sectPr/>
+  </w:body>
+</w:document>""",
+    }
+    with zipfile.ZipFile(docx_path, "w", zipfile.ZIP_DEFLATED) as archive:
+        for part_name, xml in package_parts.items():
+            archive.writestr(part_name, xml)
+    converted = MarkItDown().convert(str(docx_path))
+    assert "plain <u>underlined</u>" in converted.text_content
+
+
 def test_input_as_strings() -> None:
     markitdown = MarkItDown()
 
@@ -288,6 +323,13 @@ def test_input_as_strings() -> None:
     assert "# Test" in result.text_content
 
 
+def test_html_underline() -> None:
+    """HTML ``<u>`` elements should survive conversion as inline ``<u>``."""
+    stream = io.BytesIO(b"<html><body><p>plain <u>underlined</u></p></body></html>")
+    result = MarkItDown().convert_stream(stream, file_extension=".html")
+    assert "plain <u>underlined</u>" in result.text_content
+
+
 def test_deeply_nested_html_fallback() -> None:
     """Large, deeply nested HTML should fall back to plain-text extraction
     instead of silently returning unconverted HTML (issue #1636).