diff --git a/graalpython/com.oracle.graal.python.test/src/tests/test_pyexpat_entity_expansion_limit.py b/graalpython/com.oracle.graal.python.test/src/tests/test_pyexpat_entity_expansion_limit.py
new file mode 100644
index 0000000000..9ae80f9a29
--- /dev/null
+++ b/graalpython/com.oracle.graal.python.test/src/tests/test_pyexpat_entity_expansion_limit.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2026, 2026, Oracle and/or its affiliates. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# The Universal Permissive License (UPL), Version 1.0
+#
+# Subject to the condition set forth below, permission is hereby granted to any
+# person obtaining a copy of this software, associated documentation and/or
+# data (collectively the "Software"), free of charge and under any and all
+# copyright rights in the Software, and any and all patent rights owned or
+# freely licensable by each licensor hereunder covering either (i) the
+# unmodified Software as contributed to or provided by such licensor, or (ii)
+# the Larger Works (as defined below), to deal in both
+#
+# (a) the Software, and
+#
+# (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
+# one is included with the Software each a "Larger Work" to which the Software
+# is contributed by such licensors),
+#
+# without restriction, including without limitation the rights to copy, create
+# derivative works of, display, perform, and distribute the Software and make,
+# use, sell, offer for sale, import, export, have made, and have sold the
+# Software and the Larger Work(s), and to sublicense the foregoing rights on
+# either these or other terms.
+#
+# This license is subject to the following condition:
+#
+# The above copyright notice and either this complete permission notice or at a
+# minimum a reference to the UPL must be included in all copies or substantial
+# portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import subprocess
+import sys
+import textwrap
+import unittest
+
+
+def entity_expansion_payload(levels=5, fanout=8):
+    """Build a nested-entity XML document ("billion laughs" shape) as bytes.
+
+    ``e0`` is a one-character entity and each ``e<k>`` references ``e<k-1>``
+    ``fanout`` times, so ``&e<levels>;`` in the root element expands to
+    ``fanout ** levels`` characters while the document itself stays small.
+    """
+    entities = ['<!ENTITY e0 "x">']
+    for level in range(1, levels + 1):
+        value = f"&e{level - 1};" * fanout
+        entities.append(f'<!ENTITY e{level} "{value}">')
+    dtd = "\n".join(entities)
+    return f"<!DOCTYPE root [\n{dtd}\n]>\n<root>&e{levels};</root>".encode()
+
+
+@unittest.skipUnless(sys.implementation.name == "graalpy", "needs the GraalPy launcher and its Java pyexpat backend")
+class PyExpatEntityExpansionLimitTest(unittest.TestCase):
+    """Entity-expansion limits of the Java pyexpat backend."""
+
+    def test_java_backend_internal_entity_expansion_is_limited(self):
+        # The child sets the jdk.xml.* limits to 0 on the command line; the
+        # larger payload must still be rejected, and with the expat
+        # amplification-limit error code and message.
+        code = textwrap.dedent(f"""
+            from xml.parsers import expat
+
+            expat.ParserCreate().Parse({entity_expansion_payload(levels=6)!r}, True)
+
+            parser = expat.ParserCreate()
+            try:
+                parser.Parse({entity_expansion_payload(levels=7)!r}, True)
+            except expat.ExpatError as e:
+                expected_code = expat.errors.codes[expat.errors.XML_ERROR_AMPLIFICATION_LIMIT_BREACH]
+                if e.code != expected_code:
+                    raise SystemExit(f"unexpected error code: {{e.code}} != {{expected_code}}")
+                if expat.ErrorString(e.code) != expat.errors.XML_ERROR_AMPLIFICATION_LIMIT_BREACH:
+                    raise SystemExit(f"unexpected error string: {{expat.ErrorString(e.code)!r}}")
+                raise SystemExit(0)
+            raise SystemExit("entity expansion was not limited")
+        """)
+
+        result = subprocess.run([
+            sys.executable,
+            "--vm.Djdk.xml.entityExpansionLimit=0",
+            "--vm.Djdk.xml.totalEntitySizeLimit=0",
+            "--vm.Djdk.xml.entityReplacementLimit=0",
+            "--python.PyExpatModuleBackend=java",
+            "-c",
+            code,
+        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+
+        self.assertEqual(0, result.returncode, result.stdout + result.stderr)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/PXMLParser.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/PXMLParser.java
index 66b50a7265..eb5e92c661 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/PXMLParser.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/PXMLParser.java
@@ -53,6 +53,8 @@ public final class PXMLParser extends PythonBuiltinObject {
     public static final int XML_ERROR_FINISHED = 1;
     public static final int XML_ERROR_SYNTAX = 2;
     public static final int XML_ERROR_UNCLOSED_TOKEN = 3;
+    public static final int XML_ERROR_AMPLIFICATION_LIMIT_BREACH = 43;
+    public static final String XML_ERROR_AMPLIFICATION_LIMIT_BREACH_MESSAGE = "limit on input amplification factor (from DTD and entities) breached";
 
     private final TruffleString namespaceSeparator;
 
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/PyExpatModuleBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/PyExpatModuleBuiltins.java
index a6d3382371..032f2a4cbe 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/PyExpatModuleBuiltins.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/PyExpatModuleBuiltins.java
@@ -111,6 +111,8 @@ Interface to pyexpat parser (Java backend).
         addError(errors, codes, messages, "XML_ERROR_FINISHED", "parsing finished", PXMLParser.XML_ERROR_FINISHED);
         addError(errors, codes, messages, "XML_ERROR_SYNTAX", "syntax error", PXMLParser.XML_ERROR_SYNTAX);
         addError(errors, codes, messages, "XML_ERROR_UNCLOSED_TOKEN", "unclosed token", PXMLParser.XML_ERROR_UNCLOSED_TOKEN);
+        addError(errors, codes, messages, "XML_ERROR_AMPLIFICATION_LIMIT_BREACH", PXMLParser.XML_ERROR_AMPLIFICATION_LIMIT_BREACH_MESSAGE,
+                        PXMLParser.XML_ERROR_AMPLIFICATION_LIMIT_BREACH);
         errors.setAttribute(T_CODES, codes);
         errors.setAttribute(T_MESSAGES, messages);
         addBuiltinConstant("errors", errors);
@@ -141,12 +143,14 @@ abstract static class ErrorStringNode extends PythonBuiltinNode {
         private static final TruffleString T_PARSING_FINISHED = tsLiteral("parsing finished");
         private static final TruffleString T_UNCLOSED_TOKEN = tsLiteral("unclosed token");
         private static final TruffleString T_SYNTAX_ERROR = tsLiteral("syntax error");
+        private static final TruffleString T_AMPLIFICATION_LIMIT_BREACH = tsLiteral(PXMLParser.XML_ERROR_AMPLIFICATION_LIMIT_BREACH_MESSAGE);
 
         @Specialization
         static TruffleString doIt(int code) {
             return switch (code) {
                 case PXMLParser.XML_ERROR_FINISHED -> T_PARSING_FINISHED;
                 case PXMLParser.XML_ERROR_UNCLOSED_TOKEN -> T_UNCLOSED_TOKEN;
+                case PXMLParser.XML_ERROR_AMPLIFICATION_LIMIT_BREACH -> T_AMPLIFICATION_LIMIT_BREACH;
                 default -> T_SYNTAX_ERROR;
             };
         }
diff --git a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/XMLParserBuiltins.java b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/XMLParserBuiltins.java
index 71e81a420b..809860dfd1 100644
--- a/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/XMLParserBuiltins.java
+++ b/graalpython/com.oracle.graal.python/src/com/oracle/graal/python/builtins/modules/pyexpat/XMLParserBuiltins.java
@@ -60,6 +60,8 @@
 import java.util.Map;
 import java.util.Set;
 
+import javax.xml.XMLConstants;
+import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 
 import org.xml.sax.Attributes;
@@ -124,6 +126,28 @@ public final class XMLParserBuiltins extends PythonBuiltins {
 
     public static final TpSlots SLOTS = XMLParserBuiltinsSlotsGen.SLOTS;
 
+    /*
+     * JAXP exposes entity-expansion protection through separate per-parser properties. The matching jdk.xml.* system
+     * properties are read once into static limit values so stricter process-wide limits remain effective without
+     * re-reading properties for every parse. The SAX feature URIs below disable external entity and DTD loading where
+     * the selected SAX implementation supports them.
+     */
+    private static final int ENTITY_EXPANSION_LIMIT = 1_000_000;
+    private static final int TOTAL_ENTITY_SIZE_LIMIT = 1_000_000;
+    private static final int ENTITY_REPLACEMENT_LIMIT = 1_000_000;
+    private static final String JDK_ENTITY_EXPANSION_LIMIT_PROPERTY = "jdk.xml.entityExpansionLimit";
+    private static final String JDK_TOTAL_ENTITY_SIZE_LIMIT_PROPERTY = "jdk.xml.totalEntitySizeLimit";
+    private static final String JDK_ENTITY_REPLACEMENT_LIMIT_PROPERTY = "jdk.xml.entityReplacementLimit";
+    private static final String JAXP_ENTITY_EXPANSION_LIMIT = "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit";
+    private static final String JAXP_TOTAL_ENTITY_SIZE_LIMIT = "http://www.oracle.com/xml/jaxp/properties/totalEntitySizeLimit";
+    private static final String JAXP_ENTITY_REPLACEMENT_LIMIT = "http://www.oracle.com/xml/jaxp/properties/entityReplacementLimit";
+    private static final String ENTITY_EXPANSION_LIMIT_VALUE = getParserLimit(JDK_ENTITY_EXPANSION_LIMIT_PROPERTY, ENTITY_EXPANSION_LIMIT);
+    private static final String TOTAL_ENTITY_SIZE_LIMIT_VALUE = getParserLimit(JDK_TOTAL_ENTITY_SIZE_LIMIT_PROPERTY, TOTAL_ENTITY_SIZE_LIMIT);
+    private static final String ENTITY_REPLACEMENT_LIMIT_VALUE = getParserLimit(JDK_ENTITY_REPLACEMENT_LIMIT_PROPERTY, ENTITY_REPLACEMENT_LIMIT);
+    private static final String SAX_EXTERNAL_GENERAL_ENTITIES = "http://xml.org/sax/features/external-general-entities";
+    private static final String SAX_EXTERNAL_PARAMETER_ENTITIES = "http://xml.org/sax/features/external-parameter-entities";
+    private static final String SAX_LOAD_EXTERNAL_DTD = "http://apache.org/xml/features/nonvalidating/load-external-dtd";
+
     @Override
     protected List> getNodeFactories() {
         return XMLParserBuiltinsFactory.getFactories();
@@ -1187,11 +1211,16 @@ private Object elementModel(String model) {
         try {
             SAXParserFactory factory = SAXParserFactory.newInstance();
             factory.setNamespaceAware(parser.getNamespaceSeparator() != null);
-            XMLReader reader = factory.newSAXParser().getXMLReader();
-            try {
-                reader.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
-            } catch (Exception ignored) {
-            }
+            factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+            SAXParser saxParser = factory.newSAXParser();
+            saxParser.setProperty(JAXP_ENTITY_EXPANSION_LIMIT, ENTITY_EXPANSION_LIMIT_VALUE);
+            saxParser.setProperty(JAXP_TOTAL_ENTITY_SIZE_LIMIT, TOTAL_ENTITY_SIZE_LIMIT_VALUE);
+            saxParser.setProperty(JAXP_ENTITY_REPLACEMENT_LIMIT, ENTITY_REPLACEMENT_LIMIT_VALUE);
+            saxParser.setProperty(XMLConstants.ACCESS_EXTERNAL_DTD, ""); // empty protocol list: see the XMLConstants.ACCESS_EXTERNAL_DTD Javadoc
+            XMLReader reader = saxParser.getXMLReader();
+            setFeatureIfSupported(reader, SAX_EXTERNAL_GENERAL_ENTITIES, false);
+            setFeatureIfSupported(reader, SAX_EXTERNAL_PARAMETER_ENTITIES, false);
+            setFeatureIfSupported(reader, SAX_LOAD_EXTERNAL_DTD, false);
             reader.setEntityResolver(new EntityResolver2() {
                 @Override
                 public InputSource getExternalSubset(String name, String baseURI) {
@@ -1257,6 +1286,28 @@ public InputSource resolveEntity(String publicId, String systemId) {
         }
     }
 
+    private static String getParserLimit(String property, int defaultLimit) {
+        String configuredLimit = System.getProperty(property);
+        if (configuredLimit != null) {
+            try {
+                int limit = Integer.parseInt(configuredLimit);
+                if (limit > 0 && limit < defaultLimit) {
+                    // Preserve a user-configured process-wide limit when it is stricter than GraalPy's default.
+                    return configuredLimit;
+                }
+            } catch (NumberFormatException ignored) { // unparsable value: fall through to the default below
+            }
+        }
+        return Integer.toString(defaultLimit);
+    }
+
+    private static void setFeatureIfSupported(XMLReader reader, String feature, boolean value) {
+        try {
+            reader.setFeature(feature, value);
+        } catch (Exception ignored) { // best effort: a reader that rejects the feature is used without it
+        }
+    }
+
     @FunctionalInterface
     private interface ByteIndexSupplier {
         int get();
@@ -1452,6 +1503,9 @@ private static String formatErrorMessage(SAXParseException e) {
         if (message == null) {
             return "syntax error";
         }
+        if (isEntityAmplificationLimitError(message)) { // checked before the generic "entity ... not declared" rewrite below
+            return PXMLParser.XML_ERROR_AMPLIFICATION_LIMIT_BREACH_MESSAGE + ": line " + e.getLineNumber() + ", column " + Math.max(0, e.getColumnNumber() - 1);
+        }
         if (message.contains("entity") && message.contains("not declared")) {
             int firstQuote = message.indexOf('"');
             int secondQuote = firstQuote >= 0 ? message.indexOf('"', firstQuote + 1) : -1;
@@ -1466,10 +1520,18 @@ private static String formatErrorMessage(SAXParseException e) {
     private static int mapErrorCode(SAXParseException e) {
         String message = e.getMessage();
         if (message != null) {
+            if (isEntityAmplificationLimitError(message)) {
+                return PXMLParser.XML_ERROR_AMPLIFICATION_LIMIT_BREACH;
+            }
             if (message.contains("start and end within the same entity") || message.contains("premature end of file") || message.contains("must be terminated")) {
                 return PXMLParser.XML_ERROR_UNCLOSED_TOKEN;
             }
         }
         return PXMLParser.XML_ERROR_SYNTAX;
     }
+
+    private static boolean isEntityAmplificationLimitError(String message) { // plain substring match on the SAX exception message
+        return message.contains("JAXP00010001") || message.contains("JAXP00010004") || message.contains("JAXP00010007") || message.contains("entityExpansionLimit") ||
+                        message.contains("totalEntitySizeLimit") || message.contains("entityReplacementLimit");
+    }
 }