From 082c23692e2d94cba60e6e9d450ac2d4a6e23bb7 Mon Sep 17 00:00:00 2001 From: Yufeng He <40085740+he-yufeng@users.noreply.github.com> Date: Sat, 23 May 2026 00:48:05 +0800 Subject: [PATCH] fix: preserve underlined docx text --- .../markitdown/converters/_docx_converter.py | 9 ++++ .../src/markitdown/converters/_markdownify.py | 12 ++++++ packages/markitdown/tests/test_module_misc.py | 42 +++++++++++++++++++ 3 files changed, 63 insertions(+) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 3975107b1..509ab887f 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -27,6 +27,8 @@ ACCEPTED_FILE_EXTENSIONS = [".docx"] +DEFAULT_STYLE_MAP = "u => u" + class DocxConverter(HtmlConverter): """ @@ -76,6 +78,13 @@ def convert( ) style_map = kwargs.get("style_map", None) + if style_map is None: + style_map = DEFAULT_STYLE_MAP + elif isinstance(style_map, str) and not any( + line.strip().startswith("u =>") for line in style_map.splitlines() + ): + style_map = f"{style_map}\n{DEFAULT_STYLE_MAP}" + pre_process_stream = pre_process_docx(file_stream) return self._html_converter.convert_string( mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index 19e8a2984..1a0b831be 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -122,5 +122,17 @@ def convert_input( return "[x] " if el.has_attr("checked") else "[ ] " return "" + def convert_u( + self, + el: Any, + text: str, + convert_as_inline: Optional[bool] = False, + **kwargs, + ) -> str: + prefix, suffix, text = markdownify.chomp(text) # type: ignore + if not text: + return "" + return f"{prefix}{text}{suffix}" + def convert_soup(self, soup: Any) -> str: return super().convert_soup(soup) # type: ignore diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..b69632298 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -3,6 +3,7 @@ import os import re import shutil +import zipfile import pytest from unittest.mock import MagicMock @@ -274,6 +275,40 @@ def test_docx_equations() -> None: assert block_equations, "No block equations found in the document." +def test_docx_underline(tmp_path) -> None: + docx_file = tmp_path / "underlined.docx" + files = { + "[Content_Types].xml": """ + + + + +""", + "_rels/.rels": """ + + +""", + "word/_rels/document.xml.rels": """ +""", + "word/document.xml": """ + + + + plain + underlined + + + +""", + } + with zipfile.ZipFile(docx_file, "w", zipfile.ZIP_DEFLATED) as docx: + for name, content in files.items(): + docx.writestr(name, content) + + result = MarkItDown().convert(str(docx_file)) + assert "plain underlined" in result.text_content + + def test_input_as_strings() -> None: markitdown = MarkItDown() @@ -288,6 +323,13 @@ def test_input_as_strings() -> None: assert "# Test" in result.text_content +def test_html_underline() -> None: + markitdown = MarkItDown() + input_data = b"

plain underlined

" + result = markitdown.convert_stream(io.BytesIO(input_data), file_extension=".html") + assert "plain underlined" in result.text_content + + def test_deeply_nested_html_fallback() -> None: """Large, deeply nested HTML should fall back to plain-text extraction instead of silently returning unconverted HTML (issue #1636).