diff --git a/HISTORY.md b/HISTORY.md index 464771f..79d89e6 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -5,6 +5,9 @@ - Declare support for Python 3.10 - 3.14 [ale-rt] +- Do not break `title` and `textarea` in page templates. + (Fixes #198) + [ale-rt] ## 3.1.1 (2025-06-23) diff --git a/zpretty/elements.py b/zpretty/elements.py index e32ba09..5eb2053 100644 --- a/zpretty/elements.py +++ b/zpretty/elements.py @@ -77,7 +77,19 @@ class PrettyElement: ) escaper = EntitySubstitution() preserve_text_whitespace_elements = ["pre"] - skip_text_escaping_elements = ["script", "style"] + skip_text_escaping_elements = [ + # Do not fiddle with the content of script tags, + # as it may contain html entities that we do not want to be escaped + "script", + # Do not fiddle with the content of style tags, + # as it may contain html entities that we do not want to be escaped + "style", + # The title and textarea tags may contain markup-like text + # that the HTML parser escapes into text, + # but that we want to be rendered as markup in page templates. 
+ "title", + "textarea", + ] def __init__(self, context, level=0): """Take something a (bs4) element and an indentation level""" diff --git a/zpretty/prettifier.py b/zpretty/prettifier.py index c4cea39..097d89d 100644 --- a/zpretty/prettifier.py +++ b/zpretty/prettifier.py @@ -1,6 +1,7 @@ from bs4 import BeautifulSoup from bs4.element import Doctype from bs4.element import ProcessingInstruction +from bs4.element import Tag from logging import getLogger from uuid import uuid4 from zpretty.elements import PrettyElement @@ -26,6 +27,7 @@ class ZPrettifier: _doctype_pattern = re.compile( r"([]*(\[[^]]*\])?>)", re.IGNORECASE | re.DOTALL ) + _rcdata_tags = ("title", "textarea") _cdatas = [] _doctype = None @@ -52,6 +54,12 @@ def __init__(self, filename="", text="", encoding="utf8"): if self._ampersand_marker in value: attrs[key] = value.replace(self._ampersand_marker, "&") + if self.parser == "html.parser": + # Page templates are parsed with the html.parser, + # but can contain invalid markup inside RCDATA tags, + # see https://github.com/collective/zpretty/issues/198 + self.fix_rcdata_markup(soup) + self.soup = soup # Cleanup all spurious self._newlines_marker attributes, see #35 @@ -61,6 +69,43 @@ def __init__(self, filename="", text="", encoding="utf8"): self.root = self.pretty_element(self.soup, -1) + def fix_rcdata_markup(self, soup): + """Parse markup-like text inside RCDATA tags as child nodes. + + In page templates we might have elements inside these fields, + e.g. inside a <title> or <textarea>. + The html.parser used by BeautifulSoup escapes the markup inside these tags + and does not parse it as tags, but we want to prettify it as well. + + This method applies a workaround for this problem, + by prettifying the content of these tags as if it were an XML fragment + and then replacing the content of the tag with the prettified version. + + The content of the RCDATA elements is then rendered as is. 
+ """ + for tag in soup.find_all(self._rcdata_tags): + raw_content = "".join(str(node) for node in tag.contents) + + null_tag_name = self.pretty_element.null_tag_name + fragment_soup = BeautifulSoup( + f"<{null_tag_name}>{raw_content}</{null_tag_name}>", + self.parser, + ) + fragment_root = getattr(fragment_soup, null_tag_name, None) + if not fragment_root: + continue + + parsed_children = list(fragment_root.children) + # Check if the tag contains some markup-like text; + # if not, we can skip it and avoid messing with the content + if not any(isinstance(child, Tag) for child in parsed_children): + continue + + # Replace the content of the tag with the parsed prettified children + tag.clear() + for child in parsed_children: + tag.append(child) + def _prepare_text(self): """This tweaks the text passed to the prettifier to overcome some limitations of the BeautifulSoup parser diff --git a/zpretty/tests/original/sample_pt.pt b/zpretty/tests/original/sample_pt.pt index 71b6e3a..c563553 100644 --- a/zpretty/tests/original/sample_pt.pt +++ b/zpretty/tests/original/sample_pt.pt @@ -7,6 +7,16 @@ xml:lang="en" i18n:domain="plone" ><![CDATA[ <>& ]]> + <head> + <title> + <!--! Check https://github.com/collective/zpretty/issues/198 --> + <i18n:translate> + Should stay like this + </i18n:translate> + + <i18n:translate>Sample Page</i18n:translate> + + @@ -29,6 +39,21 @@ Foo Bar +
+ +
]> + + This is a title with an entity reference: &name; + <subtitle>This is a subtitle with another entity reference: &another;</subtitle> + 1