Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

# File extensions handled by this converter (presumably matched against the
# incoming file's extension by code outside this view -- confirm at the caller).
ACCEPTED_FILE_EXTENSIONS = [".docx"]

# Style-map rule handed to mammoth so that underlined runs are emitted as
# <u> elements in the intermediate HTML. It is used as the whole style map
# when the caller supplies none, and appended to a caller-supplied string
# style map that has no line starting with "u =>".
DEFAULT_STYLE_MAP = "u => u"


class DocxConverter(HtmlConverter):
"""
Expand Down Expand Up @@ -76,6 +78,13 @@ def convert(
)

style_map = kwargs.get("style_map", None)
if style_map is None:
style_map = DEFAULT_STYLE_MAP
elif isinstance(style_map, str) and not any(
line.strip().startswith("u =>") for line in style_map.splitlines()
):
style_map = f"{style_map}\n{DEFAULT_STYLE_MAP}"

pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
Expand Down
12 changes: 12 additions & 0 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,5 +122,17 @@ def convert_input(
return "[x] " if el.has_attr("checked") else "[ ] "
return ""

def convert_u(
    self,
    el: Any,
    text: str,
    convert_as_inline: Optional[bool] = False,
    **kwargs,
) -> str:
    """Render an underline element as a literal ``<u>...</u>`` pair.

    Markdown has no native underline syntax, so the tag is passed through
    as inline HTML around the already-converted child text.

    Args:
        el: The element being converted (not inspected here).
        text: The converted content of the element.
        convert_as_inline: Accepted for signature compatibility; unused.
        **kwargs: Accepted for signature compatibility; unused.

    Returns:
        The wrapped text, or an empty string when nothing is left to wrap.
    """
    # chomp() splits the text into (prefix, suffix, core); presumably the
    # prefix/suffix carry the surrounding whitespace so it can stay outside
    # the tag -- see the markdownify documentation.
    leading, trailing, inner = markdownify.chomp(text)  # type: ignore
    if inner:
        return f"{leading}<u>{inner}</u>{trailing}"
    return ""

def convert_soup(self, soup: Any) -> str:
    """Convert a parsed soup object by delegating to the parent class."""
    markdown = super().convert_soup(soup)  # type: ignore
    return markdown
42 changes: 42 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import shutil
import zipfile
import pytest
from unittest.mock import MagicMock

Expand Down Expand Up @@ -274,6 +275,40 @@ def test_docx_equations() -> None:
assert block_equations, "No block equations found in the document."


def test_docx_underline(tmp_path) -> None:
    """Underlined runs in a DOCX file must survive conversion as ``<u>`` tags.

    A minimal DOCX package is assembled by hand (content types, package
    relationships, document relationships and the main document part) so the
    test does not depend on a binary fixture file.
    """
    content_types_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"""
    package_rels_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"""
    document_rels_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"/>"""
    # One paragraph: a plain run followed by a run with single underline.
    document_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>plain </w:t></w:r>
<w:r><w:rPr><w:u w:val="single"/></w:rPr><w:t>underlined</w:t></w:r>
</w:p>
<w:sectPr/>
</w:body>
</w:document>"""

    # Insertion order is preserved, so members are written in this order.
    parts = {
        "[Content_Types].xml": content_types_xml,
        "_rels/.rels": package_rels_xml,
        "word/_rels/document.xml.rels": document_rels_xml,
        "word/document.xml": document_xml,
    }

    docx_path = tmp_path / "underlined.docx"
    with zipfile.ZipFile(docx_path, "w", zipfile.ZIP_DEFLATED) as archive:
        for member_name in parts:
            archive.writestr(member_name, parts[member_name])

    converted = MarkItDown().convert(str(docx_path))
    assert "plain <u>underlined</u>" in converted.text_content


def test_input_as_strings() -> None:
markitdown = MarkItDown()

Expand All @@ -288,6 +323,13 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content


def test_html_underline() -> None:
    """A ``<u>`` element in HTML input must be kept verbatim in the output."""
    html = b"<html><body><p>plain <u>underlined</u></p></body></html>"
    converted = MarkItDown().convert_stream(io.BytesIO(html), file_extension=".html")
    assert "plain <u>underlined</u>" in converted.text_content


def test_deeply_nested_html_fallback() -> None:
"""Large, deeply nested HTML should fall back to plain-text extraction
instead of silently returning unconverted HTML (issue #1636).
Expand Down