diff --git a/HISTORY.md b/HISTORY.md index 464771f..79d89e6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,9 @@ - Declare support for Python 3.10 - 3.14 [ale-rt] +- Do not break `title` and `textarea` in page templates. + (Fixes #198) + [ale-rt] ## 3.1.1 (2025-06-23) diff --git a/zpretty/elements.py b/zpretty/elements.py index e32ba09..5eb2053 100644 --- a/zpretty/elements.py +++ b/zpretty/elements.py @@ -77,7 +77,19 @@ class PrettyElement: ) escaper = EntitySubstitution() preserve_text_whitespace_elements = ["pre"] - skip_text_escaping_elements = ["script", "style"] + skip_text_escaping_elements = [ + # Do not fiddle with the content of script tags, + # as it may contain html entities that we do not want to be escaped + "script", + # Do not fiddle with the content of style tags, + # as it may contain html entities that we do not want to be escaped + "style", + # The title and textarea tags may contain markup-like text + # that the HTML parser would escape into text, + # but that we want to be rendered as markup in page templates. 
+ "title", + "textarea", + ] def __init__(self, context, level=0): """Take something a (bs4) element and an indentation level""" diff --git a/zpretty/prettifier.py b/zpretty/prettifier.py index c4cea39..097d89d 100644 --- a/zpretty/prettifier.py +++ b/zpretty/prettifier.py @@ -1,6 +1,7 @@ from bs4 import BeautifulSoup from bs4.element import Doctype from bs4.element import ProcessingInstruction +from bs4.element import Tag from logging import getLogger from uuid import uuid4 from zpretty.elements import PrettyElement @@ -26,6 +27,7 @@ class ZPrettifier: _doctype_pattern = re.compile( r"([]*(\[[^]]*\])?>)", re.IGNORECASE | re.DOTALL ) + _rcdata_tags = ("title", "textarea") _cdatas = [] _doctype = None @@ -52,6 +54,12 @@ def __init__(self, filename="", text="", encoding="utf8"): if self._ampersand_marker in value: attrs[key] = value.replace(self._ampersand_marker, "&") + if self.parser == "html.parser": + # Page templates are parsed with the html.parser, + # but can contain invalid markup inside RCDATA tags, + # see https://github.com/collective/zpretty/issues/198 + self.fix_rcdata_markup(soup) + self.soup = soup # Cleanup all spurious self._newlines_marker attributes, see #35 @@ -61,6 +69,43 @@ def __init__(self, filename="", text="", encoding="utf8"): self.root = self.pretty_element(self.soup, -1) + def fix_rcdata_markup(self, soup): + """Parse markup-like text inside RCDATA tags as child nodes. + + In page templates we might have elements inside these fields, + e.g. inside a