diff --git a/.circleci/build-and-test/jobs.yml b/.circleci/build-and-test/jobs.yml
index 4b9e2c796..6105bc2ab 100644
--- a/.circleci/build-and-test/jobs.yml
+++ b/.circleci/build-and-test/jobs.yml
@@ -77,7 +77,7 @@
name: Setup cypress test data
command: |
cd tdrs-backend
- docker-compose exec web python manage.py loaddata cypress/users cypress/data_files cypress/regions cypress/profile_editing_regions cypress/profile_editing_users
+ docker-compose exec web python manage.py loaddata cypress/users cypress/data_files cypress/regions cypress/profile_editing_regions cypress/profile_editing_users cypress/feature_flags
- run:
name: Run Cypress e2e tests
command: cd tdrs-frontend; yarn test:e2e-ci
diff --git a/.github/ISSUE_TEMPLATE/bug-template.md b/.github/ISSUE_TEMPLATE/bug-template.md
index 0b371534d..019d7d52a 100644
--- a/.github/ISSUE_TEMPLATE/bug-template.md
+++ b/.github/ISSUE_TEMPLATE/bug-template.md
@@ -3,7 +3,7 @@ name: Bug Report template
about: Template for bug reporting
title: ''
labels: bug, dev
-assignees: ''
+assignees: kennymcnett, reitermb, victoriaatraft, elipe17
---
diff --git a/.github/ISSUE_TEMPLATE/release-tracker-issue-template.md b/.github/ISSUE_TEMPLATE/release-tracker-issue-template.md
index e912da957..8243289ed 100644
--- a/.github/ISSUE_TEMPLATE/release-tracker-issue-template.md
+++ b/.github/ISSUE_TEMPLATE/release-tracker-issue-template.md
@@ -39,6 +39,15 @@ https://github.com/raft-tech/TANF-app/releases/tag/vX.X.X
### 🧪 2. Staging Validation & QASP (ACF / Alex)
*Tracking the status once ACF takes over deployment and testing.*
+### Before you Deploy
+- [ ] **Requires base image updates**:
+ - [ ] Re-tag `ghcr.io/raft-tech/tdp-frontend-base:vX.X.X` for the HHS GHCR instance
+ - [ ] Re-tag `ghcr.io/raft-tech/tdp-backend-base:vX.X.X` for the HHS GHCR instance
+- [ ] **Requires HHS CircleCI config updates**:
+ - [ ]
+- [ ] **Requires PLG deployment**
+
+### Staging Deployment
- [ ] **Staging Cleared:** Team notified that Staging is about to be updated/restarted.
- [ ] **Deployed to Staging:** PR merged and deployed to the Staging environment.
- [ ] **Feature Validation:** Testing instructions from the linked PRs have been executed and passed.
diff --git a/.gitignore b/.gitignore
index 447283e14..d80201e44 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
# UI
+tdrs-frontend/.yarn/
tdrs-frontend/node_modules/
tdrs-frontend/build/
tdrs-frontend/coverage/
@@ -125,4 +126,3 @@ cypress.env.json
# DB seeds
tdrs-backend/*.pg
tdrs-backend/django.log
-
diff --git a/Taskfile.yml b/Taskfile.yml
index 99c9a216c..16855ea9f 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -294,7 +294,7 @@ tasks:
cmds:
- export CYPRESS_TOKEN=local-cypress-token
- docker compose -f docker-compose.yml exec web python manage.py delete_cypress_users -usernames new-cypress@teamraft.com cypress-admin@teamraft.com cypress-data-analyst-dana@teamraft.com cypress-fra-data-analyst-derek@teamraft.com cypress-data-analyst-donna@teamraft.com cypress-fra-data-analyst-david@teamraft.com cypress-fra-ofa-regional-staff-rachel@acf.hhs.gov cypress-fra-ofa-regional-staff-robert@acf.hhs.gov cypress-fra-ofa-regional-staff-rita@acf.hhs.gov cypress-fra-ofa-regional-staff-ryan@acf.hhs.gov
- - docker compose -f docker-compose.yml exec web python manage.py loaddata cypress/users cypress/data_files cypress/regions cypress/profile_editing_regions cypress/profile_editing_users
+ - docker compose -f docker-compose.yml exec web python manage.py loaddata cypress/users cypress/data_files cypress/regions cypress/profile_editing_regions cypress/profile_editing_users cypress/feature_flags
frontend-e2e-local:
desc: Run Cypress E2E tests locally (Cypress on host, app in docker)
diff --git a/docs/Technical-Documentation/tech-memos/parsing-refactor-plan.md b/docs/Technical-Documentation/tech-memos/parsing-refactor-plan.md
new file mode 100644
index 000000000..1ef32846a
--- /dev/null
+++ b/docs/Technical-Documentation/tech-memos/parsing-refactor-plan.md
@@ -0,0 +1,304 @@
+# Technical Memo: Refactoring Parsing and Reparsing in TANF Data Portal Backend
+
+## 1. Purpose
+
+This memo proposes a refactor of the **parsing** and **reparsing** pipelines in the TANF Data Portal backend (`tdrs-backend`). The goal is to:
+
+- Reduce duplicated orchestration logic between initial parsing and reparsing
+- Make parsing behavior easier to reason about, test, and extend
+- Improve performance and observability for large-scale reparsing operations
+- Clearly separate “what to parse” (selection) from “how to parse” (pipeline)
+- Establish a stable contract (service + state machine) so new file types and policy changes do not require touching Celery tasks or ad hoc utilities.
+
+### Why this refactor is beneficial
+- **Single source of truth for parsing logic:** Today, behavior is split across parser classes, the Celery task, and reparse utilities. Moving to a ParsingService collapses side effects (status updates, summaries, error reports) into one place, reducing drift and regression risk.
+- **Testability:** A service with clear inputs/outputs can be unit-tested without Celery, making it easier to cover both happy paths and failure modes. Reparsing can reuse the same entry point with explicit context.
+- **Extensibility:** A factory-driven, class-based parser plus decoder abstraction makes it straightforward to add file types (for example, new CSV/XLSX variants) or program types without rewriting orchestration. SchemaManager and decoders become the main extension points.
+- **Operational clarity:** With the submission state machine and centralized transitions, operators and users see consistent states (for example, uploaded, virus_scan_started, parse_started, parsed_with_errors, parsed_completed, completed) instead of implicit flags scattered across models.
+- **Safer reparsing:** Consolidating reparse behavior (backups, deletions, status updates) through a dedicated reparse service and shared state transitions improves idempotency, makes resume/rollback safer, and keeps ReparseMeta/ReparseFileMeta in sync.
+- **Observability:** Centralized logging and optional metrics around a single service boundary make it easier to trace a file journey, correlate errors, and measure performance (parse durations, error counts).
+
+The recommendations are based on the current implementation in:
+
+- `tdpservice/parsers/…`
+- `tdpservice/scheduling/parser_task.py`
+- `tdpservice/search_indexes/reparse.py`
+- `tdpservice/search_indexes/utils.py`
+- `tdpservice/search_indexes/models/reparse_meta.py`
+- `tdpservice/data_files/models.py` (especially `DataFile` and `ReparseFileMeta`)
+
+
+## 2. Current Architecture (High-Level)
+
+### 2.1 Initial parsing flow
+
+**Entry point:** a TANF/SSP/TRIBAL/FRA data file is uploaded (`DataFile` instance created).
+
+**Core components:**
+
+- **Celery task:** `tdpservice/scheduling/parser_task.parse_data_file`
+ - Looks up the `DataFile`
+ - Uses `ParserFactory` to get the correct parser class for the file’s program type
+ - Calls parser methods (e.g., `parse_and_validate()`)
+ - Updates `DataFileSummary` / status flags
+ - Generates error reports via `ErrorReportFactory`
+ - Sends notification emails (`send_data_submitted_email`)
+ - If the parse is part of a reparse run, it also updates `ReparseFileMeta`
+
+- **Parser infrastructure:**
+ - `tdpservice/parsers/factory.ParserFactory`
+ - `tdpservice/parsers/parser_classes/base_parser.BaseParser`
+ - Concrete parsers:
+ - `TanfDataReportParser`
+ - `FRAParser`
+ - `ProgramAuditParser`
+ - `SchemaManager` (`schema_manager.py`) to manage program- and section-specific schema
+ - `ErrorGeneratorFactory` and `ParserError` to generate and persist row-level errors
+ - `DataFileSummary` to track high-level outcomes (record counts, error counts)
+
+**Characteristics:**
+- “Initial parse” logic is **implicitly defined** by the behavior inside `parse_data_file` and the individual parser classes.
+- The Celery task contains non-trivial orchestration logic: logging, error handling, and special cases for reparse runs.
+
+
+### 2.2 Reparsing flow
+
+Reparsing is used to “clean and reprocess” existing data files, usually when schemas or validation logic change.
+
+**Core components:**
+
+- **Reparse orchestration:** `tdpservice/search_indexes/reparse.py`
+ - User / admin triggers some “clean and reparse” behavior (fiscal year, quarter, optional filters)
+ - Uses helpers from `tdpservice/search_indexes/utils.py` and `tdpservice/search_indexes/util.py`:
+ - `backup(...)` → creates a DB backup
+ - `delete_associated_models(...)` → deletes records associated with the selected files (ParserError, DataFileSummary, index rows, etc.)
+ - `calculate_timeout(...)` / `assert_sequential_execution(...)` → safety checks for long-running reparses
+ - `count_total_num_records(...)` / `count_all_records(...)` → record-count snapshots
+
+- **Meta tracking models:**
+ - `ReparseMeta` (`tdpservice/search_indexes/models/reparse_meta.py`)
+ - Represents a single “reparse run” (with fields like `timeout_at`, `finished`, `success`, `total_num_records_initial`, `total_num_records_after`, etc.)
+ - Aggregates related `ReparseFileMeta` records
+ - `ReparseFileMeta` (`tdpservice/data_files/models.py`)
+ - Represents the parse status of a single `DataFile` within a reparse run (finished, success, record counts, error counts)
+
+- **Scheduling reparses:**
+ - Once backup and deletion are done, `reparse.py` calls into `parser_task` for each `DataFile` to schedule Celery tasks for reparsing.
+ - The same `parse_data_file` task is used, but with additional `reparse_id` context that ties the parse to a `ReparseMeta` / `ReparseFileMeta` pair.
+
+**Characteristics:**
+- Reparsing logic is spread across:
+ - `search_indexes/reparse.py`
+ - `search_indexes/utils.py`
+ - `parser_task.parse_data_file`
+ - The parsers and error-reporting logic
+- `ReparseMeta` / `ReparseFileMeta` add an extra dimension of lifecycle and state, but much of the behavior to maintain them lives in the Celery task.
+
+
+## 3. Pain Points & Risks
+
+From a maintainability and performance perspective, several issues stand out:
+
+### 3.1 Duplicated orchestration between “parse” and “reparse”
+
+- The **initial parsing path** and the **reparsing path** both:
+ - Determine which files to parse
+ - Manage database state (deleting old records, updating summaries)
+ - Schedule Celery tasks
+ - Generate logs and metrics
+- However, the logic to do this is split across multiple modules, and the reparsing path has its own backup / cleanup orchestration.
+- When schemas or error-handling rules change, engineers must remember to update both flows, which increases the risk of subtle inconsistencies.
+
+### 3.2 Tight coupling to Celery and logging
+
+- `parse_data_file` is both a Celery task and a “business service”:
+ - It contains domain logic (how we parse, validate, update summaries, etc.)
+ - It also contains infrastructure concerns (Celery wiring, logging, email notifications, file rotation).
+- This makes unit testing harder and encourages direct calls to the Celery task instead of to a clear, reusable parsing service.
+
+### 3.3 Reparsing logic is scattered and not obviously idempotent
+
+- `search_indexes/reparse.py` performs several responsibilities:
+ - Safety checks (sequential execution, timeouts)
+ - Backup orchestration
+ - Bulk deletion of associated records
+ - Scheduling reparse tasks for each `DataFile`
+- Much of this logic operates directly on the DB and logging functions, which makes it harder to reason about/retry safely.
+- While `ReparseMeta` / `ReparseFileMeta` provide state tracking, the actual transitions are implemented in different modules, making it non-obvious how to safely resume or inspect a partially finished reparse run.
+
+### 3.4 Performance & operational concerns on large datasets
+
+- `count_total_num_records(...)`, `count_all_records(...)`, and bulk deletions may become expensive as datasets grow.
+- Parsing and reparsing touch several related tables (DataFile, ParserError, DataFileSummary, index tables, etc.), so the order and batching of operations matters for performance and locking behavior.
+- Today, these concerns are embedded in the reparse utilities and Celery task without a single place to tune or monitor the pipeline behavior.
+
+
+## 4. Proposed Refactor: Single-File Parsing Service + Reparse Orchestration Service
+
+The core idea is to have a **single-file parsing service** that owns all parsing side effects for one `DataFile`, and a **reparse orchestration service** that manages reparse runs and delegates per-file work to the parsing service. This keeps SRP and makes the hierarchy explicit:
+
+- The Celery task (`parse_data_file`) only wires arguments/logging and calls the parsing service.
+- The reparse service manages `ReparseMeta` / `ReparseFileMeta` lifecycle and calls the parsing service for each file in a reparse run.
+- No other caller should reach directly into parser classes or touch reparse metadata.
+
+
+### 4.1 Introduce a `ParsingService` (single file only)
+
+Create a new module, for example:
+
+- `tdpservice/parsing/service.py` or
+- `tdpservice/parsers/service.py`
+
+with a class like:
+
+```python
+class ParsingService:
+ def __init__(self, *, logger, now_fn=timezone.now):
+ self.logger = logger
+ self.now_fn = now_fn
+
+ def parse_data_file(self, data_file_id: int) -> DataFileSummary:
+ """
+ Orchestrate the full lifecycle for parsing a single DataFile.
+
+ - Load DataFile and related metadata
+ - Select parser via ParserFactory
+ - Invoke parser.parse_and_validate()
+ - Update DataFileSummary / DataFile status
+ - Generate error report
+ - Return the refreshed DataFileSummary (no awareness of reparse metadata)
+ """
+```
+
+This service should encapsulate **what it means** to fully process a `DataFile`, regardless of why it is being parsed (initial submit or reparse).
+
+
+### 4.2 Make Celery task a thin wrapper around the service
+
+Refactor `tdpservice/scheduling/parser_task.parse_data_file` to:
+
+- Handle only its Celery-specific concerns (arguments, retries, logging context)
+- Delegate the core work to `ParsingService.parse_data_file`
+
+Example (conceptually):
+
+```python
+@shared_task(bind=True)
+def parse_data_file(self, data_file_id: int, reparse_id: int | None = None):
+ logger = get_task_logger(__name__)
+ service = ParsingService(logger=logger)
+ service.parse_data_file(data_file_id)
+```
+
+This keeps Celery wiring and logging but moves domain logic into the service.
+
+
+### 4.3 Introduce a `ReparseService` (orchestration + metadata)
+
+Refactor `tdpservice/search_indexes/reparse.py` and `search_indexes/utils.py` so that they:
+
+1. Determine **which DataFiles** should be reparsed (by fiscal year, quarter, program type, STT, etc.).
+2. Perform backup operations (if needed).
+3. Clean out associated records (ParserError, summaries, index rows) in a coherent, batched way.
+4. For each file, create/update `ReparseFileMeta`, then invoke `ParsingService.parse_data_file(...)`.
+5. Aggregate per-file outcomes back into `ReparseMeta` (finished, success, counts).
+
+`ReparseService` owns all `ReparseMeta` / `ReparseFileMeta` transitions; `ParsingService` stays focused on parsing a single file.
+
+This makes state transitions explicit and easier to test, and avoids scattering them between the Celery task and reparse utilities.
+
+
+### 4.4 Improve testability & observability
+
+Once parsing and reparsing are routed through a single service class:
+- **Unit tests** can exercise `ParsingService` directly using a small in-memory or test DB dataset.
+- **Integration tests** can cover:
+ - Initial parse of a sample file
+ - Reparse run across a few files
+ - Recovery from a mid-run failure
+- Logging can be standardized around a single logger/context, making it easier to trace parsing results in logs or APM tooling.
+
+
+## 5. Suggested Implementation Phases
+
+To de-risk the refactor, implement in small, incremental steps:
+
+
+### Phase 1 – Document & stabilize current behavior
+
+- Capture current parsing and reparsing flows in sequence diagrams or text:
+ - How `parse_data_file` is called
+ - Which tables are touched and in what order
+ - How ReparseMeta / ReparseFileMeta are created and updated
+- Add any missing indexes or small performance improvements that are clearly safe (see separate performance tickets).
+
+
+### Phase 2 – Introduce `ParsingService` without changing behavior
+
+- Extract the body of `parse_data_file` into `ParsingService.parse_data_file`, keeping behavior identical.
+- The Celery task delegates to the service, but inputs/outputs remain unchanged.
+- Add tests that assert the service produces the same side effects as the existing Celery task for a small fixture.
+
+
+### Phase 3 – Wire reparsing through `ParsingService`
+
+- Refactor `search_indexes/reparse.py` and `search_indexes/utils.py` so that they:
+  - Only perform backup + selection + deletion + Celery scheduling
+ - Never run parsing logic directly
+- Ensure that `ReparseMeta` / `ReparseFileMeta` updates are driven by `ReparseService` and not by ad-hoc logic outside the service.
+
+
+### Phase 4 – Clean up and harden
+
+- Remove now-dead code paths or duplicated logic.
+- Add better failure handling and idempotency guarantees for reparse runs.
+
+#### Phase 4 – Reparse hardening checklist (explicit requirements)
+
+- **Separate batch vs single-file logic**
+ - `ParsingService` owns exactly one `DataFile` parse end-to-end.
+ - `ReparseService` owns reparse orchestration for *N files* and is the only place that updates `ReparseMeta` / `ReparseFileMeta`.
+- **Idempotent reparse runs**
+ - Define what happens if the same reparse is triggered twice (default: skip finished files; optional: force-restart with a new attempt id).
+ - Ensure a reparse run does not double-create or double-count `DataFileSummary`, `ParserError`, or error reports.
+- **Explicit reparse attempt tracking**
+ - Record an `attempt` number (or `run_id`) per `DataFile` within a reparse so we can distinguish first-run vs retry outputs.
+ - Avoid overwriting debugging signals (timestamps, success/failure) without keeping attempt history.
+- **Batch progress aggregation**
+ - Store or compute counts by child state: `pending`, `in_progress`, `succeeded`, `failed`, `stuck`, `canceled`.
+ - Provide a derived overall reparse status for admin visibility and logs (with explicit precedence rules).
+- **Better failure handling**
+ - Decide policy for partial failures (recommended: continue processing remaining files; batch completes with failures).
+ - Persist failure context on file meta (stage, exception type/message) and surface a summarized view on the reparse meta.
+- **Stuck detection + recovery**
+ - Track `started_at` and a “last progress” timestamp per file (e.g., `last_state_change_at` / `heartbeat_at`).
+ - If a file remains in an active stage past a threshold, mark it `stuck` and provide a clear operator action (retry, fail, or cancel).
+- **Concurrency controls**
+ - Prevent two reparses from processing the same `DataFile` concurrently (DB guard/locking or explicit “in progress” ownership).
+ - Optional: chunk large reparses to avoid overwhelming workers and to improve progress reporting.
+- **Make side effects configurable**
+ - Allow `ReparseService` to control whether to send submission emails and whether/when to regenerate error reports.
+ - Default behavior: do not send “data submitted” emails for reparses unless explicitly requested.
+- **Transactional boundaries**
+ - Ensure per-file “start” and “finish” updates are atomic and durable even if a worker crashes mid-run.
+ - Prefer explicit transactions around metadata updates so reparse progress cannot end up partially written.
+- Add regression tests for:
+ - Incremental reparsing (subset of files, STT-specific)
+ - Large batches (e.g., dozens/hundreds of files)
+
+
+## 6. Expected Benefits
+
+1. **Reduced duplication**
+ Single source of truth for parsing logic, regardless of whether it is an initial parse or a reparse.
+
+2. **Easier reasoning and debugging**
+ Parsing behavior is concentrated in `ParsingService`; reparse orchestration just chooses “what” to parse.
+
+3. **Better testability**
+ Service-oriented design enables targeted unit tests instead of having to go through Celery + management commands for every behavior change.
+
+4. **Improved performance tuning**
+ Backup, deletion, and parsing steps are separated more clearly, making it easier to profile and optimize the heaviest operations.
+
+5. **Lower operational risk**
+ A more explicit lifecycle for ReparseMeta / ReparseFileMeta, with a single place where their states are updated, makes it easier to detect and recover from partial failures.
diff --git a/docs/Technical-Documentation/tech-memos/submission-state-machine.md b/docs/Technical-Documentation/tech-memos/submission-state-machine.md
new file mode 100644
index 000000000..74fc3ab1a
--- /dev/null
+++ b/docs/Technical-Documentation/tech-memos/submission-state-machine.md
@@ -0,0 +1,88 @@
+# Submission State Machine for File Processing
+
+## Purpose
+Define and enforce a clear lifecycle for uploaded files so parsing and triage share a consistent contract. This is a precursor to the parser refactor to avoid churn and make status handling predictable.
+
+## Why a state machine (and what it adds)
+- **Guardrails for future changes:** Even though end users cannot alter parsing, developers can. An explicit transition map prevents drift when we add steps (AV scan, retries, change requests) or touch the parser/reparser code paths. Instead of silently landing in an inconsistent state, we fail fast on illegal transitions.
+- **Durable, user-visible lifecycle:** `DataFileSummary` is per-parse and can be deleted/recreated during reparses. `DataFile.state` is a durable record of the submission lifecycle (upload -> scan -> parse) that survives reparses and exists even before a summary is created.
+- **Better triage and alerts:** Granular states (for example `virus_scan_started` vs `parse_started`) make it obvious where a file stalled without scraping logs. They enable targeted alerts (for example, "stuck in `parse_started` > 15m") and safer retries.
+
+## States
+`uploaded` -> `virus_scan_started` -> (`virus_scan_failed` | `virus_scan_successful`) -> `parse_started` -> (`parsed_with_errors` | `parsed_completed`) -> `completed`.
+
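+Any active state can transition to `canceled`. A file that exceeds time thresholds in an active state is marked `stuck` (and may later be escalated to `failed` by policy; note that `failed` is not yet one of the states defined below and would need to be added to the enum and the transition map).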
+
+Note: parsing can write records in batches. This proposal does not model a separate ingest phase yet.
+
+## Allowed transitions (code sketch)
+```python
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Dict, Iterable
+
+
+class SubmissionState(str, Enum):
+ UPLOADED = "uploaded"
+ VIRUS_SCAN_STARTED = "virus_scan_started"
+ VIRUS_SCAN_FAILED = "virus_scan_failed"
+ VIRUS_SCAN_SUCCESSFUL = "virus_scan_successful"
+ PARSE_STARTED = "parse_started"
+ PARSE_COMPLETED = "parse_completed"
+ STUCK = "stuck"
+ COMPLETED = "completed"
+ CANCELED = "canceled"
+
+
+ALLOWED_TRANSITIONS: Dict[SubmissionState, Iterable[SubmissionState]] = {
+ SubmissionState.UPLOADED: {
+ SubmissionState.VIRUS_SCAN_STARTED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.VIRUS_SCAN_STARTED: {
+ SubmissionState.VIRUS_SCAN_FAILED,
+ SubmissionState.VIRUS_SCAN_SUCCESSFUL,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.VIRUS_SCAN_FAILED: {
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.VIRUS_SCAN_SUCCESSFUL: {
+ SubmissionState.PARSE_STARTED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.PARSE_STARTED: {
+ SubmissionState.PARSED_WITH_ERRORS,
+ SubmissionState.PARSED_COMPLETED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.PARSED_WITH_ERRORS: {
+ SubmissionState.COMPLETED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.PARSED_COMPLETED: {
+ SubmissionState.COMPLETED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.STUCK: {
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.COMPLETED: set(),
+ SubmissionState.CANCELED: set(),
+}
+
+
+class InvalidTransition(Exception):
+ ...
+
+
+@dataclass
+class SubmissionLifecycle:
+ state: SubmissionState
+ history: list[str] = field(default_factory=list)
+
+ def transition(self, next_state: SubmissionState, note: str = "") -> None:
+ if next_state not in ALLOWED_TRANSITIONS[self.state]:
+ raise InvalidTransition(f"{self.state} -> {next_state} not allowed")
+ self.history.append(f"{self.state} -> {next_state}: {note}")
+ self.state = next_state
\ No newline at end of file
diff --git a/scripts/apply-database-config.sh b/scripts/apply-database-config.sh
index f5fe8da87..deadbccbc 100644
--- a/scripts/apply-database-config.sh
+++ b/scripts/apply-database-config.sh
@@ -87,7 +87,7 @@ echo "Done."
if [[ $app == "tdp-backend-develop" || $space == "tanf-dev" ]]; then
echo "Applying e2e test data"
python manage.py populate_stts
- python manage.py loaddata cypress/users cypress/data_files cypress/regions cypress/profile_editing_regions cypress/profile_editing_users
+ python manage.py loaddata cypress/users cypress/data_files cypress/regions cypress/profile_editing_regions cypress/profile_editing_users cypress/feature_flags
echo "Done."
fi
diff --git a/tdrs-backend/plg/alertmanager/alertmanager.yml b/tdrs-backend/plg/alertmanager/alertmanager.yml
index 888c8e27c..6b61725d9 100644
--- a/tdrs-backend/plg/alertmanager/alertmanager.yml
+++ b/tdrs-backend/plg/alertmanager/alertmanager.yml
@@ -39,7 +39,8 @@ route:
- matchers:
- alertname=~"UpTime"
receiver: dev-team-emails
- repeat_interval: 24h
+ repeat_interval: 48h
+ continue: true
# Send all severity CRITICAL/ERROR alerts to OFA admin emails
- matchers:
- severity=~"ERROR|CRITICAL"
@@ -47,11 +48,15 @@ route:
continue: true
# Send severity CRITICAL/ERROR alerts to mattermost and dev team emails; WARNING alerts go to the dev mattermost channel only
- matchers:
- - severity=~"ERROR|CRITICAL|WARNING"
+ - severity=~"ERROR|CRITICAL"
receiver: mattermost
continue: true
- matchers:
- - severity=~"ERROR|CRITICAL|WARNING"
+ - severity=~"WARNING"
+ receiver: dev-mattermost
+ continue: true
+ - matchers:
+ - severity=~"ERROR|CRITICAL"
receiver: dev-team-emails
continue: true
# Inhibition rules allow to mute a set of alerts given that another alert is
@@ -90,11 +95,29 @@ receivers:
{{ if or (eq .Labels.severity "CRITICAL") (eq .Labels.severity "ERROR") }}
@here
{{ end }}
- *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+ *Alert:* {{ .Labels.alertname }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+
+ *Description:* {{ .Annotations.description }}
+
+ *Details:*
+ • *Job:* `{{ .Labels.job }}`
+ • *Instance:* `{{ .Labels.instance }}`
+ • *Env:* `{{ .Labels.env }}`
+ {{ end }}
+ - name: 'dev-mattermost'
+ slack_configs:
+ - channel: 'tdp-dev-alerts'
+ username: 'alertmanager'
+ send_resolved: true
+ api_url: 'https://fake.mattermost.com'
+ text: |-
+ {{ range .Alerts -}}
+ *Alert:* {{ .Labels.alertname }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
*Description:* {{ .Annotations.description }}
*Details:*
- {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
- {{ end }}
+ • *Job:* `{{ .Labels.job }}`
+ • *Instance:* `{{ .Labels.instance }}`
+ • *Env:* `{{ .Labels.env }}`
{{ end }}
diff --git a/tdrs-backend/plg/deploy.sh b/tdrs-backend/plg/deploy.sh
index c6febf762..24631ea0a 100755
--- a/tdrs-backend/plg/deploy.sh
+++ b/tdrs-backend/plg/deploy.sh
@@ -105,6 +105,7 @@ deploy_alertmanager() {
yq eval -i ".global.slack_api_url = \"$MATTERMOST_WEBHOOK_URL\"" $CONFIG
yq eval -i ".receivers[0].email_configs[0].to = \"${ADMIN_EMAILS}\"" $CONFIG
yq eval -i ".receivers[1].email_configs[0].to = \"${DEV_EMAILS}\"" $CONFIG
+ yq eval -i ".receivers[3].slack_configs[0].api_url = \"${DEV_MATTERMOST_WEBHOOK_URL}\"" $CONFIG
cf push --no-route -f manifest.yml -t 180 --strategy rolling
cf map-route alertmanager apps.internal --hostname alertmanager
rm $CONFIG
diff --git a/tdrs-backend/plg/grafana/dashboards/health.json b/tdrs-backend/plg/grafana/dashboards/health.json
index 182e54487..c47c8a766 100644
--- a/tdrs-backend/plg/grafana/dashboards/health.json
+++ b/tdrs-backend/plg/grafana/dashboards/health.json
@@ -12,13 +12,28 @@
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${datasource}"
+ },
+ "enable": true,
+ "hide": true,
+ "expr": "pg_up{job=~\"postgres-$pg_env\"} == 0 or absent(pg_up{job=~\"postgres-$pg_env\"})",
+ "iconColor": "red",
+ "name": "Database Unreachable",
+ "step": "60s",
+ "tagKeys": "job",
+ "titleFormat": "Database Unreachable",
+ "textFormat": "The postgres exporter cannot reach the database (pg_up=0 or absent)"
}
]
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
- "id": 7,
+ "id": 5,
"links": [],
"panels": [
{
@@ -47,7 +62,7 @@
},
"id": 1,
"options": {
- "alertInstanceLabelFilter": "",
+ "alertInstanceLabelFilter": "{job=~\"tdp-backend-$app_env\"}",
"alertName": "backend",
"dashboardAlerts": false,
"datasource": "Prometheus",
@@ -66,7 +81,7 @@
},
"viewMode": "list"
},
- "pluginVersion": "12.0.1",
+ "pluginVersion": "12.0.2",
"title": "Backend - Active Alerts",
"type": "alertlist"
},
@@ -131,7 +146,7 @@
"textMode": "value",
"wideLayout": true
},
- "pluginVersion": "12.0.1",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -205,7 +220,7 @@
"textMode": "value",
"wideLayout": true
},
- "pluginVersion": "12.0.1",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -285,7 +300,7 @@
"showThresholdMarkers": true,
"sizing": "auto"
},
- "pluginVersion": "12.0.1",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -366,7 +381,7 @@
"sizing": "auto",
"valueMode": "color"
},
- "pluginVersion": "12.0.1",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -388,63 +403,13 @@
"title": "Errors Rate (Backend)",
"type": "bargauge"
},
- {
- "fieldConfig": {
- "defaults": {},
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 0,
- "y": 10
- },
- "id": 29,
- "options": {
- "code": {
- "language": "plaintext",
- "showLineNumbers": false,
- "showMiniMap": false
- },
- "content": "# Add backend CPU here",
- "mode": "markdown"
- },
- "pluginVersion": "12.0.1",
- "title": "Average CPU Usage (Backend)",
- "type": "text"
- },
- {
- "fieldConfig": {
- "defaults": {},
- "overrides": []
- },
- "gridPos": {
- "h": 8,
- "w": 12,
- "x": 12,
- "y": 10
- },
- "id": 30,
- "options": {
- "code": {
- "language": "plaintext",
- "showLineNumbers": false,
- "showMiniMap": false
- },
- "content": "# Add backend Memory here",
- "mode": "markdown"
- },
- "pluginVersion": "12.0.1",
- "title": "Average Memory Usage (Backend)",
- "type": "text"
- },
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
- "y": 18
+ "y": 10
},
"id": 31,
"panels": [],
@@ -460,11 +425,11 @@
"h": 4,
"w": 24,
"x": 0,
- "y": 19
+ "y": 11
},
"id": 34,
"options": {
- "alertInstanceLabelFilter": "",
+ "alertInstanceLabelFilter": "{job=~\"tdp-celery-$app_env|tdp-celery-exporter-$app_env\"}",
"alertName": "celery",
"dashboardAlerts": false,
"datasource": "Prometheus",
@@ -483,7 +448,7 @@
},
"viewMode": "list"
},
- "pluginVersion": "12.0.1",
+ "pluginVersion": "12.0.2",
"title": "Celery - Active Alerts",
"type": "alertlist"
},
@@ -527,7 +492,7 @@
"h": 5,
"w": 12,
"x": 0,
- "y": 23
+ "y": 15
},
"id": 35,
"options": {
@@ -547,7 +512,7 @@
"textMode": "value",
"wideLayout": true
},
- "pluginVersion": "11.2.0",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -601,7 +566,7 @@
"h": 5,
"w": 12,
"x": 12,
- "y": 23
+ "y": 15
},
"id": 24,
"options": {
@@ -621,7 +586,7 @@
"textMode": "value",
"wideLayout": true
},
- "pluginVersion": "11.2.0",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -689,7 +654,7 @@
"h": 7,
"w": 24,
"x": 0,
- "y": 28
+ "y": 20
},
"id": 25,
"options": {
@@ -711,7 +676,7 @@
}
]
},
- "pluginVersion": "11.2.0",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -882,7 +847,7 @@
"h": 6,
"w": 24,
"x": 0,
- "y": 35
+ "y": 27
},
"id": 26,
"options": {
@@ -898,7 +863,7 @@
},
"showHeader": true
},
- "pluginVersion": "11.2.0",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -1000,7 +965,7 @@
"h": 7,
"w": 24,
"x": 0,
- "y": 41
+ "y": 33
},
"id": 28,
"options": {
@@ -1011,11 +976,12 @@
"showLegend": true
},
"tooltip": {
+ "hideZeros": false,
"mode": "multi",
"sort": "desc"
}
},
- "pluginVersion": "11.2.0",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -1039,7 +1005,7 @@
"h": 1,
"w": 24,
"x": 0,
- "y": 48
+ "y": 40
},
"id": 33,
"panels": [],
@@ -1055,27 +1021,30 @@
"h": 4,
"w": 24,
"x": 0,
- "y": 49
+ "y": 41
},
"id": 6,
"options": {
- "alertInstanceLabelFilter": "",
+ "alertInstanceLabelFilter": "{job=~\"postgres-$pg_env\"}",
"alertName": "database",
"dashboardAlerts": false,
"datasource": "Prometheus",
"groupBy": [],
"groupMode": "default",
"maxItems": 20,
+ "showInactiveAlerts": false,
"sortOrder": 3,
"stateFilter": {
"error": true,
"firing": true,
"noData": true,
"normal": true,
- "pending": true
+ "pending": true,
+ "recovering": true
},
"viewMode": "list"
},
+ "pluginVersion": "12.0.2",
"title": "Database - Active Alerts",
"type": "alertlist"
},
@@ -1119,7 +1088,7 @@
"h": 5,
"w": 12,
"x": 0,
- "y": 53
+ "y": 45
},
"id": 18,
"options": {
@@ -1139,7 +1108,7 @@
"textMode": "value",
"wideLayout": true
},
- "pluginVersion": "11.2.0",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -1193,7 +1162,7 @@
"h": 5,
"w": 12,
"x": 12,
- "y": 53
+ "y": 45
},
"id": 36,
"options": {
@@ -1213,7 +1182,7 @@
"textMode": "value",
"wideLayout": true
},
- "pluginVersion": "11.2.0",
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -1297,7 +1266,7 @@
"h": 8,
"w": 12,
"x": 0,
- "y": 58
+ "y": 50
},
"id": 20,
"options": {
@@ -1313,10 +1282,12 @@
"showLegend": true
},
"tooltip": {
+ "hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -1401,7 +1372,7 @@
"h": 8,
"w": 12,
"x": 12,
- "y": 58
+ "y": 50
},
"id": 22,
"options": {
@@ -1417,10 +1388,12 @@
"showLegend": true
},
"tooltip": {
+ "hideZeros": false,
"mode": "multi",
"sort": "none"
}
},
+ "pluginVersion": "12.0.2",
"targets": [
{
"datasource": {
@@ -1454,7 +1427,7 @@
}
],
"preload": false,
- "refresh": "5s",
+ "refresh": "30s",
"schemaVersion": 41,
"tags": [
"Backend",
@@ -1552,27 +1525,6 @@
"query": "production, staging, dev, local",
"type": "custom"
},
- {
- "current": {
- "isNone": true,
- "text": "None",
- "value": ""
- },
- "datasource": {
- "type": "prometheus",
- "uid": "${datasource}"
- },
- "definition": "",
- "includeAll": false,
- "label": "Namespace",
- "name": "namespace",
- "options": [],
- "query": "label_values(celery_worker_up{}, namespace)",
- "refresh": 2,
- "regex": "",
- "sort": 1,
- "type": "query"
- },
{
"current": {
"isNone": true,
@@ -1604,5 +1556,5 @@
"timezone": "browser",
"title": "Uptime/Health",
"uid": "aeh7ymwdwpvk0e",
- "version": 1
+ "version": 4
}
diff --git a/tdrs-backend/plg/grafana/dashboards/postgres_dashboard.json b/tdrs-backend/plg/grafana/dashboards/postgres_dashboard.json
index 1a3a4109d..0ed41d6aa 100644
--- a/tdrs-backend/plg/grafana/dashboards/postgres_dashboard.json
+++ b/tdrs-backend/plg/grafana/dashboards/postgres_dashboard.json
@@ -49,6 +49,23 @@
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
+ },
+ {
+ "datasource": {
+ "type": "prometheus",
+ "uid": "${DS_PROMETHEUS}"
+ },
+ "enable": true,
+ "expr": "pg_up{instance=~\"$instance\"} == 0 or absent(pg_up{instance=~\"$instance\"})",
+ "hide": false,
+ "iconColor": "red",
+ "name": "Database Unreachable",
+ "step": "60s",
+ "tagKeys": "instance",
+ "titleFormat": "Database Unreachable",
+ "textFormat": "The postgres exporter cannot reach the database (pg_up=0 or absent)",
+ "type": "tags",
+ "useValueForTime": false
}
]
},
@@ -235,12 +252,13 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "pg_static{release=\"$release\", instance=\"$instance\"}",
+ "expr": "pg_static{instance=\"$instance\"}",
"format": "time_series",
- "instant": true,
+ "instant": false,
"intervalFactor": 1,
"legendFormat": "{{short_version}}",
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Version",
@@ -314,11 +332,12 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "pg_postmaster_start_time_seconds{release=\"$release\", instance=\"$instance\"} * 1000",
+ "expr": "pg_postmaster_start_time_seconds{instance=\"$instance\"} * 1000",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "",
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Start Time",
@@ -391,11 +410,12 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "SUM(pg_stat_database_tup_inserted{release=\"$release\", datname=~\"$datname\", instance=~\"$instance\"})",
+ "expr": "SUM(pg_stat_database_tup_inserted{datname=~\"$datname\", instance=~\"$instance\"})",
"format": "time_series",
"intervalFactor": 2,
"refId": "A",
- "step": 4
+ "step": 4,
+ "range": true
}
],
"title": "Current insert data",
@@ -472,7 +492,8 @@
"format": "time_series",
"intervalFactor": 2,
"refId": "A",
- "step": 4
+ "step": 4,
+ "range": true
}
],
"title": "Current fetch data",
@@ -549,7 +570,8 @@
"format": "time_series",
"intervalFactor": 2,
"refId": "A",
- "step": 4
+ "step": 4,
+ "range": true
}
],
"title": "Current update data",
@@ -622,10 +644,11 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "pg_settings_max_connections{release=\"$release\", instance=\"$instance\"}",
+ "expr": "pg_settings_max_connections{instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Max Connections",
@@ -723,7 +746,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "avg(rate(process_cpu_seconds_total{release=\"$release\", instance=\"$instance\"}[5m]) * 1000)",
+ "expr": "avg(rate(process_cpu_seconds_total{instance=\"$instance\"}[5m]) * 1000)",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "CPU Time",
@@ -825,7 +848,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "avg(rate(process_resident_memory_bytes{release=\"$release\", instance=\"$instance\"}[5m]))",
+ "expr": "avg(rate(process_resident_memory_bytes{instance=\"$instance\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Resident Mem",
@@ -836,7 +859,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "avg(rate(process_virtual_memory_bytes{release=\"$release\", instance=\"$instance\"}[5m]))",
+ "expr": "avg(rate(process_virtual_memory_bytes{instance=\"$instance\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Virtual Mem",
@@ -938,7 +961,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "expr": "process_open_fds{release=\"$release\", instance=\"$instance\"}",
+ "expr": "process_open_fds{instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Open FD",
@@ -1044,7 +1067,8 @@
"expr": "pg_settings_shared_buffers_bytes{instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Shared Buffers",
@@ -1120,7 +1144,8 @@
"expr": "pg_settings_effective_cache_size_bytes{instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Effective Cache",
@@ -1196,7 +1221,8 @@
"expr": "pg_settings_maintenance_work_mem_bytes{instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Maintenance Work Mem",
@@ -1273,7 +1299,8 @@
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "",
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Work Mem",
@@ -1350,7 +1377,8 @@
"expr": "pg_settings_max_wal_size_bytes{instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Max WAL Size",
@@ -1426,7 +1454,8 @@
"expr": "pg_settings_random_page_cost{instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Random Page Cost",
@@ -1502,7 +1531,8 @@
"expr": "pg_settings_seq_page_cost",
"format": "time_series",
"intervalFactor": 1,
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Seq Page Cost",
@@ -1578,7 +1608,8 @@
"expr": "pg_settings_max_worker_processes{instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Max Worker Processes",
@@ -1654,7 +1685,8 @@
"expr": "pg_settings_max_parallel_workers{instance=\"$instance\"}",
"format": "time_series",
"intervalFactor": 1,
- "refId": "A"
+ "refId": "A",
+ "range": true
}
],
"title": "Max Parallel Workers",
@@ -3261,62 +3293,16 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "definition": "",
- "hide": 0,
- "includeAll": false,
- "label": "Namespace",
- "multi": false,
- "name": "namespace",
- "options": [],
- "query": "query_result(pg_exporter_last_scrape_duration_seconds)",
- "refresh": 2,
- "regex": "/.*kubernetes_namespace=\"([^\"]+).*/",
- "skipUrlSync": false,
- "sort": 1,
- "tagValuesQuery": "",
- "tagsQuery": "",
- "type": "query",
- "useTags": false
- },
- {
- "current": {},
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "definition": "",
- "hide": 0,
- "includeAll": false,
- "label": "Release",
- "multi": false,
- "name": "release",
- "options": [],
- "query": "query_result(pg_exporter_last_scrape_duration_seconds{kubernetes_namespace=\"$namespace\"})",
- "refresh": 2,
- "regex": "/.*release=\"([^\"]+)/",
- "skipUrlSync": false,
- "sort": 1,
- "tagValuesQuery": "",
- "tagsQuery": "",
- "type": "query",
- "useTags": false
- },
- {
- "current": {},
- "datasource": {
- "type": "prometheus",
- "uid": "${DS_PROMETHEUS}"
- },
- "definition": "",
+ "definition": "label_values(pg_exporter_last_scrape_duration_seconds, instance)",
"hide": 0,
"includeAll": false,
"label": "Instance",
"multi": false,
"name": "instance",
"options": [],
- "query": "query_result(pg_up{release=\"$release\"})",
- "refresh": 1,
- "regex": "/.*instance=\"([^\"]+).*/",
+ "query": "label_values(pg_exporter_last_scrape_duration_seconds, instance)",
+ "refresh": 2,
+ "regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
@@ -3330,15 +3316,15 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "definition": "",
+ "definition": "label_values(pg_stat_database_tup_fetched{instance=~\"$instance\"}, datname)",
"hide": 0,
"includeAll": true,
"label": "Database",
"multi": true,
"name": "datname",
"options": [],
- "query": "label_values(datname)",
- "refresh": 1,
+ "query": "label_values(pg_stat_database_tup_fetched{instance=~\"$instance\"}, datname)",
+ "refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
@@ -3353,7 +3339,7 @@
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
- "definition": "",
+ "definition": "label_values({mode=~\"accessexclusivelock|accesssharelock|exclusivelock|rowexclusivelock|rowsharelock|sharelock|sharerowexclusivelock|shareupdateexclusivelock\"}, mode)",
"hide": 0,
"includeAll": true,
"label": "Lock table",
@@ -3361,7 +3347,7 @@
"name": "mode",
"options": [],
"query": "label_values({mode=~\"accessexclusivelock|accesssharelock|exclusivelock|rowexclusivelock|rowsharelock|sharelock|sharerowexclusivelock|shareupdateexclusivelock\"}, mode)",
- "refresh": 1,
+ "refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 0,
@@ -3406,4 +3392,4 @@
"uid": "000000039",
"version": 8,
"weekStart": ""
-}
\ No newline at end of file
+}
diff --git a/tdrs-backend/plg/prometheus/alerts.local.yml b/tdrs-backend/plg/prometheus/alerts.local.yml
index e7ab2a770..3b5facc44 100644
--- a/tdrs-backend/plg/prometheus/alerts.local.yml
+++ b/tdrs-backend/plg/prometheus/alerts.local.yml
@@ -5,47 +5,46 @@ groups:
expr: last_over_time(pg_up{job="postgres-local"}[1m]) == 0 or last_over_time(up{job="postgres-local"}[1m]) == 0
for: 1m
labels:
- severity: CRITICAL
+ severity: WARNING
annotations:
- summary: "The {{ $labels.service }} service is down."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
- name: backend.alerts
rules:
- alert: LocalBackendDown
expr: last_over_time(up{job=~"tdp-backend-local"}[1m]) == 0
- for: 10m
+ for: 2m
labels:
- severity: ERROR
+ severity: WARNING
annotations:
- summary: "The {{ $labels.service }} service is down."
+ summary: "The {{ $labels.job }} service is down."
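- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 2 minutes."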
- name: plg.alerts
rules:
- alert: LocalLokiDown
expr: last_over_time(up{job="loki"}[1m]) == 0
labels:
- severity: ERROR
+ severity: WARNING
annotations:
summary: "The {{ $labels.service }} service is down."
description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
- name: app.alerts
rules:
- - alert: UpTime
+ - alert: LocalUpTime
expr: avg_over_time(up[1m]) < 0.95
for: 30m
labels:
severity: WARNING
annotations:
- summary: "The {{ $labels.service }} service has a uptime warning."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment is not maintaining 95% uptime."
+ summary: "The {{ $labels.job }} service has an uptime warning."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment is not maintaining 95% uptime."
- name: celery.alerts
rules:
- alert: LocalCeleryWorkerDown
- expr: last_over_time(up{job=~"celery-.*|celery-exporter.*"}[1m]) == 0
- for: 10m
+ expr: last_over_time(up{job="tdp-celery-local"}[1m]) == 0 or last_over_time(celery_active_worker_count{job="tdp-celery-local"}[1m]) < 1
+ for: 2m
labels:
- severity: ERROR
+ severity: WARNING
annotations:
- summary: "The {{ $labels.service }} service is down."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
-
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 2 minutes."
diff --git a/tdrs-backend/plg/prometheus/alerts.yml b/tdrs-backend/plg/prometheus/alerts.yml
index 1a7493262..9c5049699 100644
--- a/tdrs-backend/plg/prometheus/alerts.yml
+++ b/tdrs-backend/plg/prometheus/alerts.yml
@@ -4,50 +4,50 @@ groups:
- alert: DevDatabaseDown
expr: last_over_time(pg_up{job="postgres-dev"}[1m]) == 0 or last_over_time(up{job="postgres-dev"}[1m]) == 0
labels:
- severity: CRITICAL
+ severity: WARNING
annotations:
- summary: "The {{ $labels.service }} service is down."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
- alert: StagingDatabaseDown
expr: last_over_time(pg_up{job="postgres-staging"}[1m]) == 0 or last_over_time(up{job="postgres-staging"}[1m]) == 0
labels:
- severity: ERROR
+ severity: WARNING
annotations:
- summary: "The {{ $labels.service }} service is down."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
- alert: ProductionDatabaseDown
expr: last_over_time(pg_up{job="postgres-production"}[1m]) == 0 or last_over_time(up{job="postgres-production"}[1m]) == 0
labels:
severity: CRITICAL
annotations:
- summary: "The {{ $labels.service }} service is down."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 1 minute."
- name: backend.alerts
rules:
- alert: DevEnvironmentBackendDown
expr: last_over_time(up{job=~"tdp-backend.*", job!~".*prod", job!~".*staging"}[1m]) == 0
for: 10m
labels:
- severity: ERROR
+ severity: WARNING
annotations:
- summary: "The {{ $labels.service }} service is down."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
- alert: StagingBackendDown
expr: last_over_time(up{job=~"tdp-backend-staging"}[1m]) == 0
for: 10m
labels:
- severity: ERROR
+ severity: WARNING
annotations:
- summary: "The {{ $labels.service }} service is down."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
- alert: ProductionBackendDown
expr: last_over_time(up{job=~"tdp-backend-prod"}[1m]) == 0
for: 10m
labels:
severity: CRITICAL
annotations:
- summary: "The {{ $labels.service }} service is down."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
- name: plg.alerts
rules:
- alert: LokiDown
@@ -72,8 +72,8 @@ groups:
labels:
severity: WARNING
annotations:
- summary: "The {{ $labels.service }} service has a uptime warning."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment is not maintaining 95% uptime."
+ summary: "The {{ $labels.job }} service has an uptime warning."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment is not maintaining 95% uptime."
- name: celery.alerts
rules:
- alert: CeleryTaskHighFailRate
@@ -131,12 +131,51 @@ groups:
for: 20m
labels:
severity: WARNING
- - alert: CeleryWorkerDown
- expr: last_over_time(up{job=~"celery-.*|celery-exporter.*"}[1m]) == 0
+ - alert: A11yCeleryWorkerDown
+ expr: last_over_time(up{job="tdp-celery-a11y"}[1m]) == 0 or last_over_time(celery_active_worker_count{job="tdp-celery-a11y"}[1m]) < 1
+ for: 10m
+ labels:
+ severity: WARNING
+ annotations:
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ - alert: RaftCeleryWorkerDown
+ expr: last_over_time(up{job="tdp-celery-raft"}[1m]) == 0 or last_over_time(celery_active_worker_count{job="tdp-celery-raft"}[1m]) < 1
+ for: 10m
+ labels:
+ severity: WARNING
+ annotations:
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ - alert: QaspCeleryWorkerDown
+ expr: last_over_time(up{job="tdp-celery-qasp"}[1m]) == 0 or last_over_time(celery_active_worker_count{job="tdp-celery-qasp"}[1m]) < 1
+ for: 10m
+ labels:
+ severity: WARNING
+ annotations:
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ - alert: DevCeleryWorkerDown
+ expr: last_over_time(up{job="tdp-celery-develop"}[1m]) == 0 or last_over_time(celery_active_worker_count{job="tdp-celery-develop"}[1m]) < 1
+ for: 10m
+ labels:
+ severity: WARNING
+ annotations:
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ - alert: StagingCeleryWorkerDown
+ expr: last_over_time(up{job="tdp-celery-staging"}[1m]) == 0 or last_over_time(celery_active_worker_count{job="tdp-celery-staging"}[1m]) < 1
+ for: 10m
+ labels:
+ severity: WARNING
+ annotations:
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ - alert: ProdCeleryWorkerDown
+ expr: last_over_time(up{job="tdp-celery-prod"}[1m]) == 0 or last_over_time(celery_active_worker_count{job="tdp-celery-prod"}[1m]) < 1
for: 10m
labels:
severity: ERROR
annotations:
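- summary: "The {{ $labels.service }} service is down."
- description: "The {{ $labels.service }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."
+ summary: "The {{ $labels.job }} service is down."
+ description: "The {{ $labels.job }} service in the {{ $labels.env }} environment has been down for more than 10 minutes."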
-
diff --git a/tdrs-backend/plg/prometheus/manifest.yml b/tdrs-backend/plg/prometheus/manifest.yml
index d5c72d72f..c6f1fc680 100644
--- a/tdrs-backend/plg/prometheus/manifest.yml
+++ b/tdrs-backend/plg/prometheus/manifest.yml
@@ -11,6 +11,6 @@ applications:
mv ./prometheus-2.54.1.linux-amd64/prometheus ./prometheus &&
mv ./prometheus-2.54.1.linux-amd64/promtool ./promtool &&
rm -rf ./prometheus-2.54.1.linux-amd64 && rm -rf prometheus-2.54.1.linux-amd64.tar.gz &&
- ./prometheus --config.file=/home/vcap/app/prometheus.yml --storage.tsdb.path=/home/vcap/app/prometheus-data --storage.tsdb.retention.time=30d --storage.tsdb.retention.size=6GB --web.listen-address="0.0.0.0:8080" --web.enable-lifecycle
+ ./prometheus --config.file=/home/vcap/app/prometheus.yml --storage.tsdb.path=/home/vcap/app/prometheus-data --storage.tsdb.retention.time=30d --storage.tsdb.retention.size=6GB --web.listen-address="0.0.0.0:8080" --web.enable-lifecycle --web.enable-remote-write-receiver --enable-feature=exemplar-storage --enable-feature=native-histograms
buildpacks:
- https://github.com/cloudfoundry/binary-buildpack
diff --git a/tdrs-backend/scripts/create_grafana_postgres_role.py b/tdrs-backend/scripts/create_grafana_postgres_role.py
index bbccdf3b9..d6682289d 100644
--- a/tdrs-backend/scripts/create_grafana_postgres_role.py
+++ b/tdrs-backend/scripts/create_grafana_postgres_role.py
@@ -13,7 +13,7 @@
$$;
GRANT CONNECT ON DATABASE {db_name} TO {role};
GRANT USAGE ON SCHEMA public TO {role};
-{select_stmt}
+{revoke_create}{select_stmt}
"""
SELECT_STATEMENT = "GRANT SELECT ON {tables} TO {role};"
@@ -64,7 +64,9 @@ def run(*args): # noqa: C901
if remaining == ("all",):
select_stmt = ADMIN_SELECT_STATEMENT.format(role=role)
- sql = sql_tmpl.format(role=role, db_name=db_name, select_stmt=select_stmt)
+ sql = sql_tmpl.format(
+ role=role, db_name=db_name, revoke_create="", select_stmt=select_stmt
+ )
else:
tables: list[str] = []
for arg in remaining:
@@ -83,8 +85,15 @@ def run(*args): # noqa: C901
tables_str = ",".join(tables)
select_stmt = SELECT_STATEMENT.format(tables=tables_str, role=role)
+ revoke_create = "REVOKE CREATE ON SCHEMA public FROM {role};\n".format(
+ role=role
+ )
sql = sql_tmpl.format(
- role=role, tables=tables_str, db_name=db_name, select_stmt=select_stmt
+ role=role,
+ tables=tables_str,
+ db_name=db_name,
+ revoke_create=revoke_create,
+ select_stmt=select_stmt,
)
with connection.cursor() as cursor:
diff --git a/tdrs-backend/tdpservice/data_files/enums.py b/tdrs-backend/tdpservice/data_files/enums.py
new file mode 100644
index 000000000..3525718f5
--- /dev/null
+++ b/tdrs-backend/tdpservice/data_files/enums.py
@@ -0,0 +1,19 @@
+"""Enums for the data_files app."""
+
+from django.db import models
+
+
+class SubmissionState(models.TextChoices):
+ """Lifecycle states for a submitted data file."""
+
+ UPLOADED = "uploaded", "Uploaded"
+ VIRUS_SCAN_STARTED = "virus_scan_started", "Virus scan started"
+ VIRUS_SCAN_FAILED = "virus_scan_failed", "Virus scan failed"
+ VIRUS_SCAN_COMPLETED = "virus_scan_completed", "Virus scan completed"
+ PARSE_STARTED = "parse_started", "Parse started"
+ PARSE_FAILED = "parse_failed", "Parse failed"
+ PARSED_WITH_ERRORS = "parsed_with_errors", "Parsed with errors"
+ PARSE_COMPLETED = "parse_completed", "Parse completed"
+ STUCK = "stuck", "Stuck"
+ COMPLETED = "completed", "Completed"
+ CANCELED = "canceled", "Canceled"
diff --git a/tdrs-backend/tdpservice/data_files/migrations/0025_datafile_state.py b/tdrs-backend/tdpservice/data_files/migrations/0025_datafile_state.py
new file mode 100644
index 000000000..fcfdd22e8
--- /dev/null
+++ b/tdrs-backend/tdpservice/data_files/migrations/0025_datafile_state.py
@@ -0,0 +1,34 @@
+# Generated by Django 3.2.15 on 2026-03-10 15:00
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+ dependencies = [
+ ("data_files", "0024_alter_datafile_file"),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name="datafile",
+ name="state",
+ field=models.CharField(
+ choices=[
+ ("uploaded", "Uploaded"),
+ ("virus_scan_started", "Virus scan started"),
+ ("virus_scan_failed", "Virus scan failed"),
+ ("virus_scan_completed", "Virus scan completed"),
+ ("parse_started", "Parse started"),
+ ("parse_failed", "Parse failed"),
+ ("parsed_with_errors", "Parsed with errors"),
+ ("parse_completed", "Parse completed"),
+ ("stuck", "Stuck"),
+ ("completed", "Completed"),
+ ("canceled", "Canceled"),
+ ],
+ default="uploaded",
+ max_length=32,
+ ),
+ preserve_default=True,
+ ),
+ ]
diff --git a/tdrs-backend/tdpservice/data_files/models.py b/tdrs-backend/tdpservice/data_files/models.py
index 6d0544a47..46bd3bf5d 100644
--- a/tdrs-backend/tdpservice/data_files/models.py
+++ b/tdrs-backend/tdpservice/data_files/models.py
@@ -17,6 +17,7 @@
from tdpservice.backends import DataFilesS3Storage
from tdpservice.common.fields import S3VersionedFileField
from tdpservice.common.models import FileRecord
+from tdpservice.data_files.enums import SubmissionState
from tdpservice.data_files.util import (
create_legacy_s3_log_file_path,
create_s3_log_file_path,
@@ -171,6 +172,13 @@ class Meta:
is_program_audit = models.BooleanField(default=False)
version = models.IntegerField()
+ state = models.CharField(
+ max_length=32,
+ blank=False,
+ null=False,
+ choices=SubmissionState.choices,
+ default=SubmissionState.UPLOADED,
+ )
user = models.ForeignKey(
User, on_delete=models.CASCADE, related_name="user", blank=False, null=False
diff --git a/tdrs-backend/tdpservice/data_files/submission_lifecycle.py b/tdrs-backend/tdpservice/data_files/submission_lifecycle.py
new file mode 100644
index 000000000..9594a238b
--- /dev/null
+++ b/tdrs-backend/tdpservice/data_files/submission_lifecycle.py
@@ -0,0 +1,126 @@
+"""Helpers for DataFile submission state transitions."""
+
+import logging
+from dataclasses import dataclass
+from typing import Callable, Dict, Iterable
+
+from tdpservice.data_files.enums import SubmissionState
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class TransitionRecord:
+ """In-memory record of a single submission state transition."""
+
+ previous_state: SubmissionState
+ next_state: SubmissionState
+ note: str = ""
+
+
+class InvalidTransition(ValueError):
+ """Raised when an invalid submission state transition is attempted."""
+
+
+ALLOWED_TRANSITIONS: Dict[SubmissionState, Iterable[SubmissionState]] = {
+ SubmissionState.UPLOADED: {
+ SubmissionState.VIRUS_SCAN_STARTED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.VIRUS_SCAN_STARTED: {
+ SubmissionState.VIRUS_SCAN_FAILED,
+ SubmissionState.VIRUS_SCAN_COMPLETED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.VIRUS_SCAN_FAILED: {
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.VIRUS_SCAN_COMPLETED: {
+ SubmissionState.PARSE_STARTED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.PARSE_STARTED: {
+ SubmissionState.PARSE_FAILED,
+ SubmissionState.PARSED_WITH_ERRORS,
+ SubmissionState.PARSE_COMPLETED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.PARSE_FAILED: {
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.PARSED_WITH_ERRORS: {
+ SubmissionState.COMPLETED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.PARSE_COMPLETED: {
+ SubmissionState.COMPLETED,
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.STUCK: {
+ SubmissionState.CANCELED,
+ },
+ SubmissionState.COMPLETED: set(),
+ SubmissionState.CANCELED: set(),
+}
+
+
+def coerce_submission_state(state) -> SubmissionState:
+ """Normalize a state value into a SubmissionState enum."""
+ if isinstance(state, SubmissionState):
+ return state
+ return SubmissionState(state)
+
+
+def allowed_next_states(current_state) -> set[SubmissionState]:
+ """Return the allowed next states for the given current state."""
+ normalized_current_state = coerce_submission_state(current_state)
+ return set(ALLOWED_TRANSITIONS[normalized_current_state])
+
+
+def validate_transition(current_state, next_state) -> TransitionRecord:
+ """Validate a transition request and return a transition record."""
+ normalized_current_state = coerce_submission_state(current_state)
+ normalized_next_state = coerce_submission_state(next_state)
+
+ if normalized_next_state not in allowed_next_states(normalized_current_state):
+ raise InvalidTransition(
+ f"Cannot transition submission from {normalized_current_state.value} "
+ + f"to {normalized_next_state.value}."
+ )
+
+ return TransitionRecord(
+ previous_state=normalized_current_state,
+ next_state=normalized_next_state,
+ )
+
+
+def transition_datafile(
+ data_file,
+ next_state,
+ note="",
+ logger_hook: Callable | None = None,
+):
+ """Safely transition a DataFile.state value and persist the new state."""
+ transition = validate_transition(data_file.state, next_state)
+ transition = TransitionRecord(
+ previous_state=transition.previous_state,
+ next_state=transition.next_state,
+ note=note,
+ )
+
+ data_file.state = transition.next_state
+ data_file.save(update_fields=["state"])
+
+ log_payload = {
+ "data_file_id": data_file.id,
+ "previous_state": transition.previous_state.value,
+ "next_state": transition.next_state.value,
+ "note": note,
+ }
+
+ if logger_hook is not None:
+ logger_hook(log_payload)
+ else:
+ logger.info("DataFile submission state transition", extra=log_payload)
+
+ return data_file
diff --git a/tdrs-backend/tdpservice/data_files/test/factories.py b/tdrs-backend/tdpservice/data_files/test/factories.py
index cce404d84..0dcd94b81 100644
--- a/tdrs-backend/tdpservice/data_files/test/factories.py
+++ b/tdrs-backend/tdpservice/data_files/test/factories.py
@@ -2,6 +2,7 @@
import factory
+from tdpservice.data_files.enums import SubmissionState
from tdpservice.stts.test.factories import STTFactory
from tdpservice.users.test.factories import UserFactory
@@ -22,6 +23,7 @@ class Meta:
quarter = "Q1"
year = 2020
version = 1
+ state = SubmissionState.UPLOADED
user = factory.SubFactory(UserFactory)
stt = factory.SubFactory(STTFactory)
file = factory.django.FileField(data=b"test", filename="my_data_file.txt")
diff --git a/tdrs-backend/tdpservice/data_files/test/test_serializers.py b/tdrs-backend/tdpservice/data_files/test/test_serializers.py
index c19c56afe..112a9d4c5 100644
--- a/tdrs-backend/tdpservice/data_files/test/test_serializers.py
+++ b/tdrs-backend/tdpservice/data_files/test/test_serializers.py
@@ -1,4 +1,5 @@
"""Test data file serializers."""
+
from django.core.exceptions import ValidationError
import pytest
@@ -10,6 +11,7 @@
validate_file_infection,
)
from tdpservice.security.clients import ClamAVClient
+from tdpservice.security.models import ClamAVFileScan
@pytest.mark.django_db
@@ -55,6 +57,14 @@ def test_immutability_of_data_file(data_file_instance):
@pytest.mark.django_db
def test_created_at(data_file_data, data_analyst):
"""If a serializer has valid data it will return a valid object."""
+ ClamAVFileScan.objects.record_scan(
+ data_file_data["file"],
+ data_file_data["file"].name,
+ f"File scan marked as CLEAN for file: {data_file_data['file'].name}",
+ ClamAVFileScan.Result.CLEAN,
+ data_analyst,
+ )
+
create_serializer = DataFileSerializer(
context={"user": data_analyst}, data=data_file_data
)
@@ -65,6 +75,14 @@ def test_created_at(data_file_data, data_analyst):
assert data_file.av_scans.exists()
+@pytest.mark.django_db
+def test_state_not_exposed_by_serializer(data_file_instance):
+    """Verify the serialized DataFile output carries no "state" key."""
+    serializer = DataFileSerializer(data_file_instance)
+    exposed_fields = serializer.data
+    assert "state" not in exposed_fields
+
+
@pytest.mark.django_db
def test_data_file_still_created_if_av_scan_fails_to_create(
data_file_data, mocker, data_analyst
@@ -131,17 +149,19 @@ def test_rejects_invalid_file_extensions(file_name):
@pytest.mark.django_db
-def test_rejects_infected_file(infected_file, fake_file_name, user):
+def test_rejects_infected_file(infected_file, fake_file_name, user, settings):
"""Test infected files are rejected by serializer validation."""
+ settings.CLAMAV_NEEDED = True
with pytest.raises(ValidationError):
validate_file_infection(infected_file, fake_file_name, user)
@pytest.mark.django_db
def test_rejects_uploads_on_clamav_connection_error(
- fake_file, fake_file_name, mocker, user
+ fake_file, fake_file_name, mocker, user, settings
):
"""Test that DataFiles cannot pass validation if ClamAV is down."""
+ settings.CLAMAV_NEEDED = True
mocker.patch(
"tdpservice.security.clients.ClamAVClient.scan_file",
side_effect=ClamAVClient.ServiceUnavailable(),
diff --git a/tdrs-backend/tdpservice/data_files/test/test_submission_lifecycle.py b/tdrs-backend/tdpservice/data_files/test/test_submission_lifecycle.py
new file mode 100644
index 000000000..bab6f3b36
--- /dev/null
+++ b/tdrs-backend/tdpservice/data_files/test/test_submission_lifecycle.py
@@ -0,0 +1,144 @@
+"""Tests for submission lifecycle helpers."""
+
+import pytest
+
+from tdpservice.data_files.enums import SubmissionState
+from tdpservice.data_files.submission_lifecycle import (
+ InvalidTransition,
+ allowed_next_states,
+ transition_datafile,
+ validate_transition,
+)
+from tdpservice.data_files.test.factories import DataFileFactory
+
+
+def test_valid_transitions_succeed():
+    """Check permitted transitions validate and echo the states they were given."""
+    uploaded = SubmissionState.UPLOADED
+    scan_started = SubmissionState.VIRUS_SCAN_STARTED
+    scan_completed = SubmissionState.VIRUS_SCAN_COMPLETED
+    to_started = validate_transition(uploaded, scan_started)
+    to_completed = validate_transition(scan_started, scan_completed)
+
+    assert (to_started.previous_state, to_started.next_state) == (
+        uploaded,
+        scan_started,
+    )
+    assert (to_completed.previous_state, to_completed.next_state) == (
+        scan_started,
+        scan_completed,
+    )
+    assert allowed_next_states(uploaded) == {scan_started, SubmissionState.CANCELED}
+
+
+def test_invalid_transition_raises():
+ """Test that an uploaded -> parse_completed jump raises InvalidTransition."""
+ with pytest.raises(InvalidTransition, match="uploaded to parse_completed"):
+ validate_transition(SubmissionState.UPLOADED, SubmissionState.PARSE_COMPLETED)
+
+
+@pytest.mark.parametrize(
+    "state",
+    (SubmissionState.COMPLETED, SubmissionState.CANCELED),
+)
+def test_terminal_states_cannot_transition(state):
+    """Ensure COMPLETED and CANCELED refuse a transition back to UPLOADED."""
+    # The error text is expected to name both endpoints of the rejected move.
+    expected_message = f"{state.value} to uploaded"
+    target = SubmissionState.UPLOADED
+    with pytest.raises(InvalidTransition, match=expected_message):
+        validate_transition(state, target)
+
+
+@pytest.mark.django_db
+def test_transition_datafile_updates_state():
+    """Confirm a valid transition is written through to the database row."""
+    target_state = SubmissionState.VIRUS_SCAN_STARTED
+    submission = DataFileFactory(state=SubmissionState.UPLOADED)
+
+    transition_datafile(submission, target_state, note="Picked up by AV scan worker")
+
+    # Reload so the assertion reads the stored value, not the in-memory one.
+    submission.refresh_from_db()
+    persisted_state = submission.state
+
+    assert persisted_state == target_state
+
+
+@pytest.mark.django_db
+def test_transition_datafile_calls_logger_hook():
+    """Check the logger hook receives exactly one structured transition payload."""
+    note = "Parser completed successfully"
+    received = []
+    data_file = DataFileFactory(state=SubmissionState.PARSE_STARTED)
+
+    transition_datafile(
+        data_file,
+        SubmissionState.PARSE_COMPLETED,
+        note=note,
+        logger_hook=received.append,
+    )
+
+    expected_payload = {
+        "data_file_id": data_file.id,
+        "previous_state": SubmissionState.PARSE_STARTED.value,
+        "next_state": SubmissionState.PARSE_COMPLETED.value,
+        "note": note,
+    }
+    assert received == [expected_payload]
+
+
+@pytest.mark.django_db
+def test_transition_datafile_integration_persists_sequential_state_changes():
+    """Walk a stored DataFile through two transitions, checking state and payloads."""
+    data_file = DataFileFactory(state=SubmissionState.UPLOADED)
+    payloads = []
+    # (previous state, next state, note) for each step, in execution order.
+    steps = [
+        (
+            SubmissionState.UPLOADED,
+            SubmissionState.VIRUS_SCAN_STARTED,
+            "Virus scan worker picked up the file",
+        ),
+        (
+            SubmissionState.VIRUS_SCAN_STARTED,
+            SubmissionState.VIRUS_SCAN_COMPLETED,
+            "Virus scan passed",
+        ),
+    ]
+
+    for _previous, target, note in steps:
+        transition_datafile(
+            data_file,
+            target,
+            note=note,
+            logger_hook=payloads.append,
+        )
+        # Reload after every step so each assertion reads the stored state.
+        data_file.refresh_from_db()
+        assert data_file.state == target
+
+    assert payloads == [
+        {
+            "data_file_id": data_file.id,
+            "previous_state": previous.value,
+            "next_state": target.value,
+            "note": note,
+        }
+        for previous, target, note in steps
+    ]
+
+
+@pytest.mark.django_db
+def test_transition_datafile_supports_parse_failed_state():
+    """Confirm PARSE_STARTED -> PARSE_FAILED is accepted and stored."""
+    failed = SubmissionState.PARSE_FAILED
+    data_file = DataFileFactory(state=SubmissionState.PARSE_STARTED)
+
+    transition_datafile(
+        data_file,
+        failed,
+        note="Parser raised an unexpected exception",
+    )
+    data_file.refresh_from_db()
+    assert data_file.state == failed
diff --git a/tdrs-backend/tdpservice/email/helpers/data_file.py b/tdrs-backend/tdpservice/email/helpers/data_file.py
index 3dc494bab..b7ea13037 100644
--- a/tdrs-backend/tdpservice/email/helpers/data_file.py
+++ b/tdrs-backend/tdpservice/email/helpers/data_file.py
@@ -1,5 +1,7 @@
"""Helper functions for sending data file submission emails."""
+from zoneinfo import ZoneInfo
+
from django.conf import settings
from tdpservice.data_files.models import DataFile
@@ -79,6 +81,19 @@ def get_tanf_total_errors_context_count(datafile_summary):
return {"total_errors": total_errors}
+def get_pia_quarter_label(quarter):
+    """Return the PIA quarter label for ``quarter``, or None if it is not Q1-Q4."""
+    if quarter == DataFile.Quarter.Q1:
+        return "Quarter 1 (October - December)"
+    elif quarter == DataFile.Quarter.Q2:
+        return "Quarter 2 (January - March)"
+    elif quarter == DataFile.Quarter.Q3:
+        return "Quarter 3 (April - June)"
+    elif quarter == DataFile.Quarter.Q4:
+        return "Quarter 4 (July - September)"
+    return None
+
+
def get_fra_aggregates_context_count(datafile_summary):
"""Return the relevant context data from case aggregates for FRA files."""
case_aggregates = datafile_summary.case_aggregates or {}
@@ -113,10 +128,20 @@ def send_data_submitted_email(
prog_type = datafile.program_type
section_name = get_program_section_str(prog_type, datafile.section)
+ is_program_audit = datafile.is_program_audit
- file_type = get_friendly_program_type(prog_type)
+ file_type = (
+ "TANF Program Integrity Audit"
+ if is_program_audit
+ else get_friendly_program_type(prog_type)
+ )
stt_name = datafile.stt.name
- submission_date = datafile.created_at
+ if datafile.created_at is not None:
+ stt_tz = ZoneInfo(datafile.stt.timezone or "UTC")
+ local_time = datafile.created_at.astimezone(stt_tz)
+ submission_date = local_time.strftime("%m/%d/%Y %I:%M %p %Z")
+ else:
+ submission_date = datafile.created_at
fiscal_year = datafile.fiscal_year
submitted_by = datafile.submitted_by
@@ -135,51 +160,80 @@ def send_data_submitted_email(
"status": datafile_summary.status,
"has_errors": datafile_summary.status != DataFileSummary.Status.ACCEPTED,
"is_aggregate": is_aggregate,
+ "is_program_audit": is_program_audit,
"url": settings.FRONTEND_BASE_URL,
}
if datafile_summary.status == DataFileSummary.Status.PENDING:
return
- elif datafile_summary.status == DataFileSummary.Status.ACCEPTED:
- subject = f"{section_name} Successfully Submitted Without Errors"
- text_message = f"{file_type} has been submitted and processed without errors."
- else:
- subject = f"Action Required: {section_name} Contains Errors"
- text_message = f"{file_type} has been submitted and processed with errors."
- context.update({"subject": subject})
+ text_message = (
+ f"{file_type} has been submitted and processed without errors."
+ if datafile_summary.status == DataFileSummary.Status.ACCEPTED
+ else f"{file_type} has been submitted and processed with errors."
+ )
- match prog_type:
- case (
- DataFile.ProgramType.TANF
- | DataFile.ProgramType.SSP
- | DataFile.ProgramType.TRIBAL
- ):
- if is_aggregate:
- context.update(get_tanf_total_errors_context_count(datafile_summary))
- else:
- context.update(get_tanf_aggregates_context_count(datafile_summary))
-
- template_options = {
- DataFileSummary.Status.ACCEPTED: TanfDataFileEmail.ACCEPTED.value,
- DataFileSummary.Status.ACCEPTED_WITH_ERRORS: TanfDataFileEmail.ACCEPTED_WITH_ERRORS.value,
- DataFileSummary.Status.PARTIALLY_ACCEPTED: TanfDataFileEmail.PARTIALLY_ACCEPTED.value,
- DataFileSummary.Status.REJECTED: TanfDataFileEmail.REJECTED.value,
- }
-
- template_path = template_options[datafile_summary.status]
+ if is_program_audit:
+ quarter_label = get_pia_quarter_label(datafile.quarter)
+ context.update({"quarter_label": quarter_label})
+ context.update(get_tanf_aggregates_context_count(datafile_summary))
- case DataFile.ProgramType.FRA:
- context.update(get_fra_aggregates_context_count(datafile_summary))
+ if datafile_summary.status == DataFileSummary.Status.ACCEPTED:
+ subject = (
+ f"{file_type}: {quarter_label} Successfully Submitted Without Errors"
+ )
+ else:
+ subject = f"Action Required: {file_type}: {quarter_label} Contains Errors"
- template_options = {
- DataFileSummary.Status.ACCEPTED: FraDataFileEmail.ACCEPTED.value,
- DataFileSummary.Status.ACCEPTED_WITH_ERRORS: FraDataFileEmail.ACCEPTED_WITH_ERRORS.value,
- DataFileSummary.Status.PARTIALLY_ACCEPTED: FraDataFileEmail.PARTIALLY_ACCEPTED.value,
- DataFileSummary.Status.REJECTED: FraDataFileEmail.REJECTED.value,
- }
+ template_options = {
+ DataFileSummary.Status.ACCEPTED: TanfDataFileEmail.ACCEPTED.value,
+ DataFileSummary.Status.ACCEPTED_WITH_ERRORS: TanfDataFileEmail.ACCEPTED_WITH_ERRORS.value,
+ DataFileSummary.Status.PARTIALLY_ACCEPTED: TanfDataFileEmail.PARTIALLY_ACCEPTED.value,
+ DataFileSummary.Status.REJECTED: TanfDataFileEmail.REJECTED.value,
+ }
- template_path = template_options[datafile_summary.status]
+ template_path = template_options[datafile_summary.status]
+ else:
+ if datafile_summary.status == DataFileSummary.Status.ACCEPTED:
+ subject = f"{section_name} Successfully Submitted Without Errors"
+ else:
+ subject = f"Action Required: {section_name} Contains Errors"
+
+ match prog_type:
+ case (
+ DataFile.ProgramType.TANF
+ | DataFile.ProgramType.SSP
+ | DataFile.ProgramType.TRIBAL
+ ):
+ if is_aggregate:
+ context.update(
+ get_tanf_total_errors_context_count(datafile_summary)
+ )
+ else:
+ context.update(get_tanf_aggregates_context_count(datafile_summary))
+
+ template_options = {
+ DataFileSummary.Status.ACCEPTED: TanfDataFileEmail.ACCEPTED.value,
+ DataFileSummary.Status.ACCEPTED_WITH_ERRORS: TanfDataFileEmail.ACCEPTED_WITH_ERRORS.value,
+ DataFileSummary.Status.PARTIALLY_ACCEPTED: TanfDataFileEmail.PARTIALLY_ACCEPTED.value,
+ DataFileSummary.Status.REJECTED: TanfDataFileEmail.REJECTED.value,
+ }
+
+ template_path = template_options[datafile_summary.status]
+
+ case DataFile.ProgramType.FRA:
+ context.update(get_fra_aggregates_context_count(datafile_summary))
+
+ template_options = {
+ DataFileSummary.Status.ACCEPTED: FraDataFileEmail.ACCEPTED.value,
+ DataFileSummary.Status.ACCEPTED_WITH_ERRORS: FraDataFileEmail.ACCEPTED_WITH_ERRORS.value,
+ DataFileSummary.Status.PARTIALLY_ACCEPTED: FraDataFileEmail.PARTIALLY_ACCEPTED.value,
+ DataFileSummary.Status.REJECTED: FraDataFileEmail.REJECTED.value,
+ }
+
+ template_path = template_options[datafile_summary.status]
+
+ context.update({"subject": subject})
log(
f"Data file submitted; emailing Data Analysts {list(recipients)}",
diff --git a/tdrs-backend/tdpservice/email/helpers/feedback_report.py b/tdrs-backend/tdpservice/email/helpers/feedback_report.py
index ce5a87943..cf12d5414 100644
--- a/tdrs-backend/tdpservice/email/helpers/feedback_report.py
+++ b/tdrs-backend/tdpservice/email/helpers/feedback_report.py
@@ -9,7 +9,7 @@
def send_feedback_report_available_email(report_file: ReportFile, recipients):
"""
- Send an email to Data Analysts when a feedback report is available.
+ Send an email when a feedback report is available.
Parameters
----------
@@ -53,7 +53,7 @@ def send_feedback_report_available_email(report_file: ReportFile, recipients):
}
log(
- f"Feedback report available; emailing Data Analysts {list(recipients)}",
+ f"Feedback report available; emailing recipients {list(recipients)}",
logger_context=logger_context,
)
diff --git a/tdrs-backend/tdpservice/email/helpers/test/test_data_file.py b/tdrs-backend/tdpservice/email/helpers/test/test_data_file.py
index bf952d9d6..e799f072e 100644
--- a/tdrs-backend/tdpservice/email/helpers/test/test_data_file.py
+++ b/tdrs-backend/tdpservice/email/helpers/test/test_data_file.py
@@ -1,11 +1,14 @@
"""Test functions for data_file email helper."""
+from datetime import datetime, timezone
+
from django.core import mail
import pytest
from tdpservice.data_files.models import DataFile
from tdpservice.email.helpers.data_file import (
+ get_pia_quarter_label,
get_tanf_aggregates_context_count,
get_tanf_total_errors_context_count,
send_data_submitted_email,
@@ -202,6 +205,76 @@ def test_send_data_submitted_email(
assert mail.outbox[0].body == msg
+_PIA_Q1_LABEL = "Quarter 1 (October - December)"
+_PIA_FILE_TYPE = "TANF Program Integrity Audit"
+
+
+@pytest.mark.django_db
+@pytest.mark.parametrize(
+ "status,expected_subject,expected_text",
+ [
+ (
+ DataFileSummary.Status.ACCEPTED,
+ f"{_PIA_FILE_TYPE}: {_PIA_Q1_LABEL} Successfully Submitted Without Errors",
+ f"{_PIA_FILE_TYPE} has been submitted and processed without errors.",
+ ),
+ (
+ DataFileSummary.Status.ACCEPTED_WITH_ERRORS,
+ f"Action Required: {_PIA_FILE_TYPE}: {_PIA_Q1_LABEL} Contains Errors",
+ f"{_PIA_FILE_TYPE} has been submitted and processed with errors.",
+ ),
+ (
+ DataFileSummary.Status.PARTIALLY_ACCEPTED,
+ f"Action Required: {_PIA_FILE_TYPE}: {_PIA_Q1_LABEL} Contains Errors",
+ f"{_PIA_FILE_TYPE} has been submitted and processed with errors.",
+ ),
+ (
+ DataFileSummary.Status.REJECTED,
+ f"Action Required: {_PIA_FILE_TYPE}: {_PIA_Q1_LABEL} Contains Errors",
+ f"{_PIA_FILE_TYPE} has been submitted and processed with errors.",
+ ),
+ ],
+)
+def test_send_data_submitted_email_pia(
+ user, stt, status, expected_subject, expected_text
+):
+ """Test that PIA submissions get quarter-labelled subjects and PIA body text."""
+ df = DataFile(
+ user=user,
+ section=DataFile.Section.ACTIVE_CASE_DATA,
+ program_type=DataFile.ProgramType.TANF,
+ quarter=DataFile.Quarter.Q1,
+ year=2021,
+ stt=stt,
+ is_program_audit=True,
+ )
+
+ dfs = DataFileSummary(datafile=df, status=status)
+
+ send_data_submitted_email(dfs, ["test@not-real.com"])
+
+ assert len(mail.outbox) == 1
+ assert mail.outbox[0].subject == expected_subject
+ assert mail.outbox[0].body == expected_text
+
+
+class TestGetPiaQuarterLabel:
+    """Checks for get_pia_quarter_label."""
+
+    @pytest.mark.parametrize(
+        ("quarter", "expected"),
+        (
+            (DataFile.Quarter.Q1, "Quarter 1 (October - December)"),
+            (DataFile.Quarter.Q2, "Quarter 2 (January - March)"),
+            (DataFile.Quarter.Q3, "Quarter 3 (April - June)"),
+            (DataFile.Quarter.Q4, "Quarter 4 (July - September)"),
+        ),
+    )
+    def test_quarter_labels(self, quarter, expected):
+        """Each fiscal quarter resolves to its numbered, month-ranged label."""
+        assert expected == get_pia_quarter_label(quarter)
+
+
class TestGetTanfAggregatesContextCount:
"""Tests for get_tanf_aggregates_context_count."""
@@ -348,3 +421,101 @@ def test_send_stuck_file_email(user, stt):
== "List of submitted files with pending status after 1 hour"
)
assert mail.outbox[0].body == "The system has detected stuck files."
+
+
+@pytest.mark.django_db
+def test_submission_date_formatted_in_stt_timezone(user, stt):
+    """Check the HTML email shows created_at converted to the STT's Central time."""
+    stt.timezone = "America/Chicago"
+    stt.save()
+    # 18:00 UTC on 2024-01-15 is 12:00 PM CST.
+    submitted_at = datetime(2024, 1, 15, 18, 0, 0, tzinfo=timezone.utc)
+
+    data_file = DataFile.objects.create(
+        user=user,
+        section=DataFile.Section.ACTIVE_CASE_DATA,
+        program_type=DataFile.ProgramType.TANF,
+        quarter="Q1",
+        year=2021,
+        version=1,
+        stt=stt,
+    )
+    # Write created_at with a queryset update, then reload the instance.
+    DataFile.objects.filter(pk=data_file.pk).update(created_at=submitted_at)
+    data_file.refresh_from_db()
+
+    summary = DataFileSummary.objects.create(
+        datafile=data_file,
+        status=DataFileSummary.Status.ACCEPTED,
+    )
+    send_data_submitted_email(summary, ["test@not-real.com"])
+
+    sent_messages = mail.outbox
+    assert len(sent_messages) == 1
+    html_body = sent_messages[0].alternatives[0][0]
+    assert "01/15/2024 12:00 PM CST" in html_body
+
+
+@pytest.mark.django_db
+def test_submission_date_formatted_in_eastern_timezone(user, stt):
+    """Check a January timestamp for an Eastern STT is rendered as 01:00 PM EST."""
+    stt.timezone = "America/New_York"
+    stt.save()
+    # 18:00 UTC on 2024-01-15 is 1:00 PM EST (January, so EST rather than EDT).
+    submitted_at = datetime(2024, 1, 15, 18, 0, 0, tzinfo=timezone.utc)
+
+    data_file = DataFile.objects.create(
+        user=user,
+        section=DataFile.Section.ACTIVE_CASE_DATA,
+        program_type=DataFile.ProgramType.TANF,
+        quarter="Q1",
+        year=2021,
+        version=1,
+        stt=stt,
+    )
+    # Write created_at with a queryset update, then reload the instance.
+    DataFile.objects.filter(pk=data_file.pk).update(created_at=submitted_at)
+    data_file.refresh_from_db()
+
+    summary = DataFileSummary.objects.create(
+        datafile=data_file,
+        status=DataFileSummary.Status.ACCEPTED,
+    )
+    send_data_submitted_email(summary, ["test@not-real.com"])
+
+    sent_messages = mail.outbox
+    assert len(sent_messages) == 1
+    html_body = sent_messages[0].alternatives[0][0]
+    assert "01/15/2024 01:00 PM EST" in html_body
+
+
+@pytest.mark.django_db
+def test_submission_date_utc_fallback_when_no_timezone(user, stt):
+    """Check an STT with an empty timezone string gets the timestamp shown in UTC."""
+    stt.timezone = ""
+    stt.save()
+    submitted_at = datetime(2024, 1, 15, 18, 0, 0, tzinfo=timezone.utc)
+
+    data_file = DataFile.objects.create(
+        user=user,
+        section=DataFile.Section.ACTIVE_CASE_DATA,
+        program_type=DataFile.ProgramType.TANF,
+        quarter="Q1",
+        year=2021,
+        version=1,
+        stt=stt,
+    )
+    # Write created_at with a queryset update, then reload the instance.
+    DataFile.objects.filter(pk=data_file.pk).update(created_at=submitted_at)
+    data_file.refresh_from_db()
+
+    summary = DataFileSummary.objects.create(
+        datafile=data_file,
+        status=DataFileSummary.Status.ACCEPTED,
+    )
+    send_data_submitted_email(summary, ["test@not-real.com"])
+
+    sent_messages = mail.outbox
+    assert len(sent_messages) == 1
+    html_body = sent_messages[0].alternatives[0][0]
+    assert "01/15/2024 06:00 PM UTC" in html_body
diff --git a/tdrs-backend/tdpservice/email/templates/tanf/accepted.html b/tdrs-backend/tdpservice/email/templates/tanf/accepted.html
index e883cad80..f28b50af1 100644
--- a/tdrs-backend/tdpservice/email/templates/tanf/accepted.html
+++ b/tdrs-backend/tdpservice/email/templates/tanf/accepted.html
@@ -1,7 +1,7 @@
{% extends '../datafile_base.html' %}
{% block message %}
-Your {{ file_type }} data files submitted for {{ stt_name }} on {{ submission_date }} in Fiscal Year {{ fiscal_year }} have been processed and accepted.
+Your {{ file_type }} data file submitted for {{ stt_name }} on {{ submission_date }} in Fiscal Year {{ fiscal_year }} has been processed and accepted.
{% endblock %}
{% block table %}
@@ -11,7 +11,7 @@
-
Section
+ {% if is_program_audit %}Fiscal Quarter {% else %}Section {% endif %}
Submitted By
Status
{% if is_aggregate %}
@@ -23,7 +23,7 @@