Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/dataset-input-contract.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added dataset input contract helpers for data-generation packages.
10 changes: 10 additions & 0 deletions policyengine_us/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
from .dataset_schema import USSingleYearDataset, USMultiYearDataset
from .economic_assumptions import extend_single_year_dataset, get_parameter_last_year
from .dataset_input_contract import (
DatasetInputKind,
DatasetInputMetadata,
dataset_input_metadata,
dataset_input_variables,
get_dataset_input_metadata,
is_dataset_exportable_variable,
is_dataset_input_variable,
is_formula_owned_variable,
)
233 changes: 233 additions & 0 deletions policyengine_us/data/dataset_input_contract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Literal


# Closed vocabulary of reasons a dataset may supply a variable directly.
# Each registry entry in this module is tagged with exactly one of these.
DatasetInputKind = Literal[
    "stochastic_status",  # take-up / claiming flags (takes_up_*, would_claim_wic)
    "medical_status",  # medical criteria (WIC nutritional risk, SSI disability)
    "geographic_status",  # directly supplied geography flags (in_nyc)
    "identifier_status",  # taxpayer identification status (has_tin)
    "income_override",  # measured or imputed income replacing an approximation
    "deprecated_alias",  # old variable names still accepted during a migration
]


@dataclass(frozen=True)
class DatasetInputMetadata:
    """Immutable record describing one variable a dataset may supply on purpose.

    Attributes:
        variable: Name of the model variable the record is about.
        kind: Category of dataset input (one of ``DatasetInputKind``).
        rationale: Human-readable explanation of why datasets may provide it.
    """

    variable: str
    kind: DatasetInputKind
    rationale: str


# Registry rows as (variable name, input kind, rationale). Each name is written
# once here and reused as both the dict key and the ``variable`` field below,
# so the two can never drift apart.
_DATASET_INPUT_ROWS: tuple[tuple[str, DatasetInputKind, str], ...] = (
    # Take-up / claiming flags.
    (
        "takes_up_aca_if_eligible",
        "stochastic_status",
        "Dataset builders may model ACA take-up among eligible tax units.",
    ),
    (
        "takes_up_basic_health_program_if_eligible",
        "stochastic_status",
        "Dataset builders may model Basic Health Program take-up among "
        "eligible people.",
    ),
    (
        "takes_up_chip_if_eligible",
        "stochastic_status",
        "Dataset builders may model CHIP take-up among eligible people.",
    ),
    (
        "takes_up_dc_ptc",
        "stochastic_status",
        "Dataset builders may model DC property tax credit take-up among "
        "eligible tax units.",
    ),
    (
        "takes_up_early_head_start_if_eligible",
        "stochastic_status",
        "Dataset builders may model Early Head Start take-up among eligible people.",
    ),
    (
        "takes_up_eitc",
        "stochastic_status",
        "Dataset builders may model EITC take-up among eligible tax units.",
    ),
    (
        "takes_up_head_start_if_eligible",
        "stochastic_status",
        "Dataset builders may model Head Start take-up among eligible people.",
    ),
    (
        "takes_up_housing_assistance_if_eligible",
        "stochastic_status",
        "Dataset builders may model housing assistance take-up among "
        "eligible SPM units.",
    ),
    (
        "takes_up_medicaid_if_eligible",
        "stochastic_status",
        "Dataset builders may model Medicaid take-up among eligible people.",
    ),
    (
        "takes_up_medicare_if_eligible",
        "stochastic_status",
        "Dataset builders may model Medicare take-up among eligible people.",
    ),
    (
        "takes_up_snap_if_eligible",
        "stochastic_status",
        "Dataset builders may model SNAP take-up among eligible SPM units.",
    ),
    (
        "takes_up_ssi_if_eligible",
        "stochastic_status",
        "Dataset builders may model SSI take-up among eligible people.",
    ),
    (
        "takes_up_tanf_if_eligible",
        "stochastic_status",
        "Dataset builders may model TANF take-up among eligible SPM units.",
    ),
    (
        "would_claim_wic",
        "stochastic_status",
        "Dataset builders may model WIC claiming among eligible people.",
    ),
    # Medical criteria.
    (
        "is_wic_at_nutritional_risk",
        "medical_status",
        "Dataset builders may model WIC nutritional-risk status; the model "
        "uses the input directly.",
    ),
    (
        "meets_ssi_disability_criteria",
        "medical_status",
        "Dataset builders may provide the SSI medical-disability criterion "
        "separately from broad disability flags.",
    ),
    # Identifier status and its deprecated alias.
    (
        "has_tin",
        "identifier_status",
        "Dataset builders may provide taxpayer identification status; the "
        "fallback formula defaults to True when no data are supplied.",
    ),
    (
        "has_itin",
        "deprecated_alias",
        "Deprecated alias accepted during migration from has_itin to has_tin.",
    ),
    # Geography.
    (
        "in_nyc",
        "geographic_status",
        "Dataset builders may provide NYC residency directly when county "
        "geography is unavailable or deliberately suppressed.",
    ),
    # Income overrides.
    (
        "fsla_overtime_premium",
        "income_override",
        "Dataset builders may provide measured or imputed FLSA overtime "
        "premium income instead of relying on weekly-hours approximations.",
    ),
)

# Variable name -> metadata, in the same order as the rows above.
_DATASET_INPUT_METADATA: dict[str, DatasetInputMetadata] = {
    name: DatasetInputMetadata(variable=name, kind=kind, rationale=rationale)
    for name, kind, rationale in _DATASET_INPUT_ROWS
}


def dataset_input_metadata() -> dict[str, DatasetInputMetadata]:
    """Return a fresh name-to-metadata mapping of the explicit dataset inputs.

    The result is a shallow copy, so adding or removing keys on it does not
    touch the module-level registry.
    """
    snapshot = _DATASET_INPUT_METADATA.copy()
    return snapshot


def dataset_input_variables(
    *,
    kind: DatasetInputKind | None = None,
) -> frozenset[str]:
    """Return the names of the explicit dataset inputs, optionally by kind.

    Args:
        kind: When given, only variables registered with this kind are
            returned. When ``None`` (the default), every registered variable
            name is returned.

    Returns:
        An immutable set of variable names.
    """
    registry = _DATASET_INPUT_METADATA
    if kind is not None:
        matching = [name for name, entry in registry.items() if entry.kind == kind]
        return frozenset(matching)
    # No filter requested: iterating the registry yields all of its keys.
    return frozenset(registry)


def get_dataset_input_metadata(
    variable_name: str,
) -> DatasetInputMetadata | None:
    """Look up the registry entry for ``variable_name``.

    Returns:
        The matching ``DatasetInputMetadata``, or ``None`` when the variable
        is not registered as an explicit dataset input.
    """
    try:
        return _DATASET_INPUT_METADATA[variable_name]
    except KeyError:
        return None


def is_dataset_input_variable(variable_name: str) -> bool:
    """Tell whether ``variable_name`` is registered as an explicit dataset input."""
    registered_names = _DATASET_INPUT_METADATA.keys()
    return variable_name in registered_names


def variable_has_formula(variable) -> bool:
    """Tell whether ``variable`` carries any computation logic.

    A variable counts as computed when at least one of its ``formulas``,
    ``adds`` or ``subtracts`` attributes exists and is truthy. Missing
    attributes are treated the same as empty ones.
    """
    for attribute_name in ("formulas", "adds", "subtracts"):
        if getattr(variable, attribute_name, None):
            return True
    return False


# Default tax-benefit system, built on first use when a caller passes no
# ``system`` and then reused, so checking many variable names does not rebuild
# the whole system once per call.
_DEFAULT_SYSTEM = None


def _lookup_variable(variable_name: str, system=None):
    """Return the variable registered under ``variable_name`` in ``system``.

    Args:
        variable_name: Name to look up in ``system.variables``.
        system: Tax-benefit system to search. When ``None``, a default
            ``CountryTaxBenefitSystem`` is created once and cached.

    Raises:
        KeyError: If the system has no variable with that name.
    """
    global _DEFAULT_SYSTEM
    if system is None:
        if _DEFAULT_SYSTEM is None:
            # Imported inside the function, as the original helpers did —
            # presumably to avoid a circular import; TODO confirm.
            from policyengine_us import CountryTaxBenefitSystem

            _DEFAULT_SYSTEM = CountryTaxBenefitSystem()
        system = _DEFAULT_SYSTEM

    variable = system.variables.get(variable_name)
    if variable is None:
        raise KeyError(f"Unknown variable: {variable_name}")
    return variable


def is_formula_owned_variable(variable_name: str, *, system=None) -> bool:
    """Return whether datasets should normally leave a variable to formulas.

    Ordinary input variables are not formula-owned. Formula-backed variables
    listed in ``dataset_input_metadata`` are deliberate dataset overrides and
    therefore also return ``False`` here.

    Args:
        variable_name: Name of the model variable to classify.
        system: Optional tax-benefit system to look the variable up in. When
            omitted, a shared default ``CountryTaxBenefitSystem`` is used.

    Raises:
        KeyError: If ``variable_name`` is not defined in the system.
    """
    variable = _lookup_variable(variable_name, system)
    return variable_has_formula(variable) and not is_dataset_input_variable(
        variable_name
    )


def is_dataset_exportable_variable(variable_name: str, *, system=None) -> bool:
    """Return whether a dataset may export the variable as an input column.

    This helper is intended for data-generation packages. It treats ordinary
    model input variables as exportable and also allows the explicit override
    variables documented in ``dataset_input_metadata``. Formula-owned outputs
    should be calculated by PolicyEngine-US rather than persisted in datasets.

    Args:
        variable_name: Name of the model variable to classify.
        system: Optional tax-benefit system to look the variable up in. When
            omitted, a shared default ``CountryTaxBenefitSystem`` is used.

    Raises:
        KeyError: If ``variable_name`` is not defined in the system.
    """
    variable = _lookup_variable(variable_name, system)
    # NOTE(review): this relies on the variable's own ``is_input_variable()``,
    # while ``is_formula_owned_variable`` relies on ``variable_has_formula``.
    # Whether the two agree for variables defined only through adds/subtracts
    # depends on the core library — confirm.
    return variable.is_input_variable() or is_dataset_input_variable(variable_name)
79 changes: 79 additions & 0 deletions policyengine_us/tests/core/test_dataset_input_contract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import pytest

from policyengine_us import CountryTaxBenefitSystem
from policyengine_us.data import (
dataset_input_metadata,
dataset_input_variables,
get_dataset_input_metadata,
is_dataset_exportable_variable,
is_dataset_input_variable,
is_formula_owned_variable,
)


def test_dataset_input_contract_marks_stochastic_status_inputs():
    """A core set of take-up flags is registered as ``stochastic_status``."""
    stochastic_inputs = dataset_input_variables(kind="stochastic_status")
    required = {
        "takes_up_aca_if_eligible",
        "takes_up_dc_ptc",
        "takes_up_eitc",
        "takes_up_head_start_if_eligible",
        "takes_up_early_head_start_if_eligible",
        "takes_up_medicaid_if_eligible",
        "takes_up_snap_if_eligible",
        "takes_up_tanf_if_eligible",
        "would_claim_wic",
    }

    # Subset check only: the registry may hold more stochastic inputs.
    assert required <= stochastic_inputs


def test_dataset_input_contract_marks_known_formula_overrides():
    """Known override variables are inputs, exportable, and not formula-owned."""
    tax_benefit_system = CountryTaxBenefitSystem()
    override_names = ("has_tin", "has_itin", "in_nyc", "fsla_overtime_premium")

    for name in override_names:
        assert is_dataset_input_variable(name)
        assert is_dataset_exportable_variable(name, system=tax_benefit_system)
        assert not is_formula_owned_variable(name, system=tax_benefit_system)


def test_dataset_input_contract_marks_medical_inputs():
    """The SSI disability and WIC risk variables are tagged ``medical_status``."""
    ssi_entry = get_dataset_input_metadata("meets_ssi_disability_criteria")

    assert ssi_entry is not None
    assert ssi_entry.kind == "medical_status"
    assert "SSI" in ssi_entry.rationale

    medical_inputs = dataset_input_variables(kind="medical_status")
    assert "is_wic_at_nutritional_risk" in medical_inputs


def test_formula_owned_helper_rejects_computed_outputs():
    """``wic`` is reported as formula-owned and is not exportable."""
    tax_benefit_system = CountryTaxBenefitSystem()
    computed_output = "wic"

    assert is_formula_owned_variable(computed_output, system=tax_benefit_system)
    assert not is_dataset_exportable_variable(
        computed_output, system=tax_benefit_system
    )


def test_dataset_input_contract_is_consistent_with_model_variables():
    """The registry and the model's variables agree with each other.

    Every registered name must exist in the model, and every model variable
    named ``takes_up_*`` / ``would_claim_*`` whose default value is ``True``
    must be registered.
    """
    tax_benefit_system = CountryTaxBenefitSystem()
    registry = dataset_input_metadata()
    model_variables = tax_benefit_system.variables

    unknown_names = sorted(name for name in registry if name not in model_variables)
    assert unknown_names == []

    take_up_prefixes = ("takes_up_", "would_claim_")
    unregistered = set()
    for name, variable in model_variables.items():
        if not name.startswith(take_up_prefixes):
            continue
        if getattr(variable, "default_value", None) is not True:
            continue
        if name in registry:
            continue
        unregistered.add(name)
    assert unregistered == set()


def test_dataset_contract_helpers_raise_for_unknown_variables():
    """Both classification helpers raise ``KeyError`` for an unknown name."""
    for helper in (is_formula_owned_variable, is_dataset_exportable_variable):
        with pytest.raises(KeyError):
            helper("not_a_variable")
2 changes: 1 addition & 1 deletion policyengine_us/variables/gov/usda/wic/wic_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def formula(person, period, parameters):
mother = person("is_mother", period)
breastfeeding = person("is_breastfeeding", period)
age = person("age", period)
# Categorize mothers based on the minimum age of children in the SPM unit.
# Categorize mothers based on the minimum age of children in the family.
min_age_family = person.family.min(age)
return select(
[
Expand Down
Loading