NatLabRockies · ppinchuk · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
@@ -91,10 +91,10 @@
   },
   "$core_principles": {
     "scope_context": {
-      "description": "Only extract requirements that apply to geothermal heat pump / ground source heat pump systems, including closed-loop and open-loop systems where relevant. If text uses generic terms like 'geothermal well' or 'geothermal system', treat it as GHP only when section context clearly indicates heating/cooling or heat-pump exchange wells/loops. Do NOT consider unrelated geothermal electricity generation rules unless the text explicitly governs heat pump exchange wells/loops. Do not extract proposed, future, draft, or contingent requirements that are not currently effective as written."
+      "description": "Only extract requirements that apply to geothermal heat pump / ground source heat pump / geothermal heat exchange systems (or equivalent terms), including closed-loop and open-loop systems where relevant. If the text uses generic terms like 'geothermal well' or 'geothermal system', treat it as GHP only when section context clearly indicates heating/cooling or heat-pump exchange wells/loops. Do NOT consider rules for geothermal temperature gradient exploration/development/test wells, geothermal well field gathering systems, geothermal power generation systems, or any other geothermal electricity generation systems unless the text **explicitly** also governs geothermal heat exchange wells/loops. Do not extract proposed, future, draft, or contingent requirements that are not currently effective as written."
     },
     "strict_evidence_gate": {
-      "description": "Extract a feature only when the ordinance text explicitly states a requirement, definition, or prohibition for that same feature. Never infer, assume, extrapolate, or guess from related context, implications, headings, or nearby provisions. If the ordinance points to an outside document or standard without restating the controlling requirement in the ordinance text itself, do not import missing values from that outside source."
+      "description": "Extract a feature only when the ordinance text explicitly states a requirement, definition, or prohibition for that same feature. Never infer, assume, extrapolate, or guess from related context, implications, headings, or nearby provisions. Chapter titles, section titles, tables of contents, navigation lists, and cross-reference lists are not valid evidence by themselves unless they also quote or restate the operative requirement, definition, or prohibition. If the ordinance points to an outside document or standard without restating the controlling requirement in the ordinance text itself, do not import missing values from that outside source."
     },
     "data_omission": {
       "description": "Emit only positively matched features. If a feature is not explicitly present, omit it entirely rather than returning placeholder text. For qualitative features, use value=null and units=null only when an enacted, explicit requirement or definition for that same feature is present. For numeric features, extract only when an explicit numeric threshold is stated in the ordinance text; otherwise omit the feature instead of returning null, empty, or qualitative-only values. Never emit absence placeholders such as 'not found', 'no explicit requirement', 'none', or similar text in any field."
@@ -207,7 +207,7 @@
           "description": "Extract only explicit requirements for visual or physical concealment of above-ground geothermal equipment. Valid concealment mechanisms include barriers or treatments such as screening, fences, walls, landscaping, berms, opaque enclosures, parapets, louvers, or screening panels. A screening row is valid only when the ordinance excerpt itself explicitly states at least one concealment mechanism. Exclude pure siting/location/setback/front-yard/side-yard/rear-yard placement rules, visibility assumptions, or indirect inference from location constraints unless the same requirement explicitly adds a concealment mechanism."
         },
         "permit": {
-          "description": "Extract permit requirements that explicitly govern GHP installation, drilling, or operation (for example mechanical, geothermal well, environmental, zoning, or health approvals), including triggering conditions. Do not extract generic non-GHP permits unless the ordinance explicitly ties them to geothermal systems."
+          "description": "Extract permit requirements that explicitly govern GHP installation, drilling, or operation (for example mechanical, geothermal well, environmental, zoning, or health approvals), including triggering conditions. Do not extract generic non-GHP permits unless the ordinance explicitly ties them to geothermal heat pump systems."
         },
         "inspection": {
           "description": "Extract required inspections (pre-drilling, during installation, final inspection, pressure/flow testing, health/environmental inspections). Include inspection type(s) and responsible authority details in summary when stated."

@@ -5,9 +5,9 @@ data_type_short_desc: geothermal ground source heat pump ordinance
 query_templates:
     - "{jurisdiction} geothermal ground source heat pump code"
     - "filetype:pdf {jurisdiction} geothermal heat pump ordinance"
-    - "{jurisdiction} geothermal energy system ordinance"
+    - "{jurisdiction} private geothermal energy system ordinance"
     - "Where can I find the legal text for geothermal heat pump zoning ordinances in {jurisdiction}?"
-    - "What is the specific legal information regarding zoning ordinances for geothermal heat pumps in {jurisdiction}?"
+    - "What is the specific legal information regarding zoning ordinances for private heat exchange wells in {jurisdiction}?"
 
 website_keywords:
     # These still need massaging
@@ -30,11 +30,14 @@ heuristic_keywords:
   good_tech_keywords:
     - "geoexchange"
     - "geo-exchange"
+    - "heat-exchange"
     - "wellfield"
     - "direct-use"
     - "closed-loop"
   good_tech_phrases:
+    - "water well"
     - "well field"
+    - "geothermal heat"
     - "geothermal resource"
     - "geothermal drilling"
     - "geothermal well"
@@ -46,6 +49,7 @@ heuristic_keywords:
     - "open loop"
     - "vertical loop"
     - "horizontal loop"
+    - "heat exchange"
     - "heating and cooling"
     - "space heating"
     - "direct use"

@@ -100,7 +100,7 @@ async def get_heuristic(self):  # noqa: PLR6301
         """
         return WaterRightsHeuristic()
 
-    async def filter_docs(self, extraction_context):
+    async def filter_docs(self, extraction_context, __):
         """Filter down candidate documents before parsing
 
         Parameters

@@ -76,7 +76,9 @@ async def collect(self, workflow):  # noqa: PLR6301
             docs = await load_known_docs(
                 workflow.jurisdiction,
                 [info["source_fp"] for info in workflow.known_local_docs],
-                local_file_loader_kwargs=workflow.runtime.local_file_loader_kwargs,
+                local_file_loader_kwargs=(
+                    workflow.runtime.local_file_loader_kwargs
+                ),
             )
         except Exception:
             logger.exception(

@@ -311,6 +311,10 @@ async def run(self, jurisdictions_df):
             content of the message may vary depending on the results of
             the processing.
         """
+        logger.info("Loading collection manifest...")
+        logger.debug(
+            "Manifest path(s): %s", self.runtime.request.collection_manifest_fp
+        )
         manifest = await load_collection_manifest(
             self.runtime.request.collection_manifest_fp, self.runtime.tech
         )

@@ -299,6 +299,22 @@ def se_kwargs(self):
         return extra_kwargs
 
 
+class DocParsingParams:
+    """Value Object for document parsing settings
+
+    Holds the per-jurisdiction cap on how many documents get parsed.
+    """
+
+    def __init__(self, max_num_docs_per_jurisdiction=None):
+        """
+
+        Parameters
+        ----------
+        max_num_docs_per_jurisdiction : int, optional
+            Maximum number of documents to parse for each jurisdiction
+            (regardless of the collection method). If ``None``, all
+            collected documents are parsed. By default, ``None``.
+        """
+        # Stored exactly as given (no validation here); ``None`` = no cap
+        self.max_num_docs_per_jurisdiction = max_num_docs_per_jurisdiction
+
+
 class BaseRequest:
     """Parameter Object base class for pipeline requests"""
 
@@ -314,6 +330,7 @@ def __init__(  # noqa: PLR0913
         model="gpt-4o-mini",
         llm_costs=None,
         num_urls_to_check_per_jurisdiction=5,
+        max_num_docs_to_parse_per_jurisdiction=None,
         max_num_concurrent_browsers=10,
         max_num_concurrent_website_searches=10,
         max_num_concurrent_jurisdictions=25,
@@ -439,6 +456,10 @@ def __init__(  # noqa: PLR0913
             Number of unique Google search result URLs to check for each
             jurisdiction when attempting to locate ordinance documents.
             By default, ``5``.
+        max_num_docs_to_parse_per_jurisdiction : int, optional
+            Maximum number of documents to parse for each jurisdiction
+            (regardless of the collection method). If ``None``, all
+            collected documents are parsed. By default, ``None``.
         max_num_concurrent_browsers : int, default=10
             Maximum number of browser instances to launch concurrently
             for retrieving information from the web. Increasing this
@@ -619,6 +640,11 @@ def __init__(  # noqa: PLR0913
             simple_se_result_sort=simple_se_result_sort,
             pytesseract_exe_fp=pytesseract_exe_fp,
         )
+        self.parsing_settings = DocParsingParams(
+            max_num_docs_per_jurisdiction=(
+                max_num_docs_to_parse_per_jurisdiction
+            )
+        )
         self.runtime_settings = RuntimeSettings(
             td_kwargs=td_kwargs,
             tpe_kwargs=tpe_kwargs,
@@ -963,6 +989,7 @@ def __init__(  # noqa: PLR0913
         collection_manifest_fp,
         *,
         model="gpt-4o-mini",
+        max_num_docs_to_parse_per_jurisdiction=None,
         max_num_concurrent_jurisdictions=25,
         file_loader_kwargs=None,
         td_kwargs=None,
@@ -1052,6 +1079,10 @@ def __init__(  # noqa: PLR0913
                 set ``"model": "gpt-4o-mini-2025-04-11"``.
 
             By default, ``"gpt-4o-mini"``.
+        max_num_docs_to_parse_per_jurisdiction : int, optional
+            Maximum number of documents to parse for each jurisdiction
+            (regardless of the collection method). If ``None``, all
+            collected documents are parsed. By default, ``None``.
         max_num_concurrent_jurisdictions : int, default=25
             Maximum number of jurisdictions to process concurrently.
             Limiting this can help manage memory usage when dealing with
@@ -1139,6 +1170,9 @@ def __init__(  # noqa: PLR0913
             tech=tech,
             jurisdiction_fp=jurisdiction_fp,
             model=model,
+            max_num_docs_to_parse_per_jurisdiction=(
+                max_num_docs_to_parse_per_jurisdiction
+            ),
             max_num_concurrent_jurisdictions=max_num_concurrent_jurisdictions,
             file_loader_kwargs=file_loader_kwargs,
             td_kwargs=td_kwargs,

@@ -16,6 +16,12 @@ class DocumentExtraction:
     def __init__(self, workflow):
         self.workflow = workflow
 
+    @property
+    def max_docs_to_parse(self):
+        """int or None: Per-jurisdiction cap on documents to parse"""
+        request = self.workflow.runtime.request
+        return request.parsing_settings.max_num_docs_per_jurisdiction
+
     async def extract_from_docs(self, docs):
         """Filter and extract data from a set of docs
 
@@ -35,7 +41,7 @@ async def extract_from_docs(self, docs):
 
         extraction_context = ExtractionContext(documents=docs)
         extraction_context = await self.workflow.extractor.filter_docs(
-            extraction_context
+            extraction_context, self.max_docs_to_parse
         )
         if not extraction_context:
             return None

@@ -194,6 +194,10 @@ async def extract_from_collection_info(self, collection_info):
         )
         self.jurisdiction_website = collection_info.get("jurisdiction_website")
         try:
+            COMPASS_PB.update_jurisdiction_task(
+                self.jurisdiction.full_name,
+                description="Loading pre-parsed documents...",
+            )
             docs = await load_collected_docs(
                 collection_info, task_name=self.jurisdiction.full_name
             )

@@ -103,7 +103,7 @@ async def get_heuristic(self):
         raise NotImplementedError
 
     @abstractmethod
-    async def filter_docs(self, extraction_context):
+    async def filter_docs(self, extraction_context, max_num_docs=None):
         """Filter down candidate documents before parsing
 
         Parameters
@@ -112,6 +112,10 @@ async def filter_docs(self, extraction_context):
             Context containing candidate documents to be filtered.
             Set the ``.documents`` attribute of this object to be the
             iterable of documents that should be kept for parsing.
+        max_num_docs : int, optional
+            Maximum number of documents to parse (regardless of the
+            collection method). If ``None``, all collected documents are
+            parsed. By default, ``None``.
 
         Returns
         -------

@@ -278,13 +278,17 @@ async def get_heuristic(self):
         """
         return self.HEURISTIC()
 
-    async def filter_docs(self, extraction_context):
+    async def filter_docs(self, extraction_context, max_num_docs=None):
         """Filter down candidate documents before parsing
 
         Parameters
         ----------
         extraction_context : ExtractionContext
             Context containing candidate documents to be filtered.
+        max_num_docs : int, optional
+            Maximum number of documents to parse (regardless of the
+            collection method). If ``None``, all collected documents are
+            parsed. By default, ``None``.
 
         Returns
         -------
@@ -340,6 +344,12 @@ async def filter_docs(self, extraction_context):
         if not docs:
             return None
 
+        if max_num_docs is not None:
+            logger.debug(
+                "Sub-setting to %d document(s) for parsing", max_num_docs
+            )
+            docs = docs[:max_num_docs]
+
         extraction_context.documents = docs
         return extraction_context
 

@@ -25,9 +25,14 @@
 Determine whether the chunk contains content that matches any of the \
 schema's criteria. Be strict and literal: only mark relevant if the chunk \
 clearly addresses the specific technology and document scope described in \
-the schema. Do not infer beyond the text. If relevant, summarize the \
-specific matching content; if not, state why it does not meet the schema's \
-requirements. Keep the response concise and consistent.\
+the schema. Do not infer beyond the text. Do not treat chapter titles, \
+section titles, tables of contents, navigation links, cross-reference \
+lists, or citation-only indexes as relevant on their own; mark them \
+relevant only when they also include operative provisions, definitions, \
+prohibitions, or other substantive regulatory text that matches the \
+schema. If relevant, summarize the specific matching content; if not, state \
+why it does not meet the schema's requirements. Keep the response concise \
+and consistent.\
 """
 _TEXT_COLLECTION_MAIN_PROMPT = """\
 Determine whether this text excerpt contains any information relevant to \

@@ -15,6 +15,7 @@
 from logging.handlers import QueueHandler
 
 import numpy as np
+import pandas as pd
 from elm.web.document import PDFDocument, MDDocument
 from elm.utilities.parse import read_pdf, read_pdf_ocr
 from docling.datamodel.backend_options import HTMLBackendOptions
@@ -369,8 +370,10 @@ def _read_docling(
 
     with warnings.catch_warnings():
         warnings.simplefilter("ignore", RuntimeWarning)
-        mean_confidence = conv_result.confidence.mean_score
-        low_score_confidence = conv_result.confidence.low_score
+        mean_confidence = _none_if_missing(conv_result.confidence.mean_score)
+        low_score_confidence = _none_if_missing(
+            conv_result.confidence.low_score
+        )
 
     attrs = {
         "doc_filename": conv_result.input.file.stem,
@@ -407,6 +410,11 @@ def _configure_pytesseract(tesseract_cmd):
     pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
 
 
+def _none_if_missing(value):
+    """Convert a missing scalar confidence value into ``None``"""
+    if pd.isna(value):
+        return None
+    return value
+
+
 def _try_decode_ocr_pages(pages):
     """Try to decode pages into strings"""
     decoded_pages = []

@@ -55,9 +55,29 @@ def setup_graph_correct_document_type(**kwargs):
         ),
     )
 
-    G.add_edge("init", "is_model", condition=llm_response_starts_with_yes)
     G.add_edge(
-        "check_for_laws", "is_model", condition=llm_response_starts_with_yes
+        "init", "has_substantive_text", condition=llm_response_starts_with_yes
+    )
+    G.add_edge(
+        "check_for_laws",
+        "has_substantive_text",
+        condition=llm_response_starts_with_yes,
+    )
+    G.add_node(
+        "has_substantive_text",
+        prompt=(
+            "Does this excerpt include substantive legal content such as "
+            "operative provisions, definitions, or other regulatory text, "
+            "rather than only a table of contents, chapter/section heading "
+            "list, navigation links, or a citation-only index? "
+            "{YES_NO_PROMPT}"
+        ),
+    )
+
+    G.add_edge(
+        "has_substantive_text",
+        "is_model",
+        condition=llm_response_starts_with_yes,
     )
     G.add_node(
         "is_model",
@@ -265,8 +285,10 @@ def setup_graph_correct_document_type(**kwargs):
             "2. **'type'** (string) - The best-fitting category for the "
             "source of the text.\n"
             "3. **'{key}'** (boolean) -\n"
-            "\t- `true` if the text is a **legally binding regulation**.\n"
-            "\t- `false` if the text belongs to any other type of document or "
+            "\t- `true` if the text is a **legally binding regulation** "
+            "with substantive legal content.\n"
+            "\t- `false` if the text belongs to any other type of document, "
+            "if it does not contain substantive legal content, or "
             "if you cannot tell for certain one way or another.\n\n"
         ),
     )

@@ -181,7 +181,7 @@ The contract
 ``get_heuristic()``
   Return a heuristic instance for initial document screening.
 
-``filter_docs(extraction_context)``
+``filter_docs(extraction_context, max_num_docs=None)``
   The heart of customization. Take ``extraction_context.docs`` and reduce
   them to relevant content. Store anything you want in
   ``extraction_context.attrs`` for later stages.
@@ -251,7 +251,7 @@ for data centers:
         def get_heuristic(cls):
             return NoOpHeuristic()
 
-        async def filter_docs(self, extraction_context):
+        async def filter_docs(self, extraction_context, max_num_docs=None):
             docs = extraction_context.docs
 
             page_texts = [
@@ -801,7 +801,7 @@ parsing, and output. Here is the complete plugin class:
        def get_heuristic(cls):
            return NoOpHeuristic()
 
-       async def filter_docs(self, extraction_context):
+       async def filter_docs(self, extraction_context, max_num_docs=None):
            docs = extraction_context.docs
 
            page_texts = [

@@ -79,7 +79,7 @@ def check(self, text):
 
         return _KeepEverything()
 
-    async def filter_docs(self, extraction_context):
+    async def filter_docs(self, extraction_context, __):
         """Keep all docs for deterministic round-trip tests"""
         if not extraction_context:
             return None