Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions econml/_lazy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.

"""Lazy module loading to avoid expensive imports at package load time."""

import importlib


class _LazyModule:
    """Proxy that delays importing a module until an attribute is accessed.

    Use at module level as a drop-in replacement for ``import heavy_lib``::

        heavy_lib = _LazyModule("heavy_lib")

    The real module is imported on first attribute access, so the cost is
    deferred until the functionality is actually needed.

    Parameters
    ----------
    module_name : str
        Absolute dotted name of the module, as accepted by
        :func:`importlib.import_module`.

    Notes
    -----
    Copying or pickling a proxy produces a fresh, not-yet-loaded proxy for
    the same module name; the imported module object itself is never
    serialized.
    """

    def __init__(self, module_name):
        self._module_name = module_name
        self._module = None

    def _load(self):
        """Import the target module on first use and return it."""
        if self._module is None:
            self._module = importlib.import_module(self._module_name)
        return self._module

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails.  The proxy's own
        # state normally lives in the instance __dict__, so reaching this
        # point for one of these names means the instance was created
        # without __init__.  Delegating would call _load(), which reads
        # self._module again and recurses until RecursionError.
        if name in ("_module", "_module_name"):
            raise AttributeError(name)
        return getattr(self._load(), name)

    def __reduce__(self):
        # Rebuild from the module name only: module objects cannot be
        # pickled, so serializing the instance __dict__ would fail as soon
        # as the proxy has been loaded.
        return (type(self), (self._module_name,))

    def __repr__(self):
        if self._module is not None:
            return repr(self._module)
        return f"<_LazyModule '{self._module_name}' (not yet loaded)>"
7 changes: 4 additions & 3 deletions econml/_ortho_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ class in this module implements the general logic in a very versatile way
filter_none_kwargs, one_hot_encoder, strata_from_discrete_arrays,
jacify_featurizer, reshape, shape)
from .sklearn_extensions.model_selection import ModelSelector
from ._lazy import _LazyModule

_rlearner = _LazyModule("econml.dml._rlearner") # lazy: avoid circular import

try:
import ray
Expand Down Expand Up @@ -1149,9 +1152,7 @@ def score(self, Y, T, X=None, W=None, Z=None, sample_weight=None, groups=None, s
}
# If using an _rlearner, the scoring parameter can be passed along, if provided
if scoring is not None:
# Cannot import in header, or circular imports
from .dml._rlearner import _ModelFinal
if isinstance(self._ortho_learner_model_final, _ModelFinal):
if isinstance(self._ortho_learner_model_final, _rlearner._ModelFinal):
score_kwargs['scoring'] = scoring
else:
raise NotImplementedError("scoring parameter only implemented for "
Expand Down
4 changes: 3 additions & 1 deletion econml/_shap.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@
"""

import inspect
import shap
from collections import defaultdict
import numpy as np
from ._lazy import _LazyModule
from .utilities import broadcast_unit_treatments, cross_product, get_feature_names_or_default

shap = _LazyModule("shap") # lazy: heavy dependency only needed when shap_values() is called


def _shap_explain_cme(cme_model, X, d_t, d_y,
feature_names=None, treatment_names=None, output_names=None,
Expand Down
3 changes: 1 addition & 2 deletions econml/data/dynamic_panel_dgp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
from econml.utilities import cross_product
from statsmodels.tools.tools import add_constant
from econml.utilities import cross_product, add_constant
import pandas as pd
import scipy as sp
from scipy.stats import expon
Expand Down
5 changes: 4 additions & 1 deletion econml/dml/causal_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@
from .._cate_estimator import LinearCateEstimator
from .._shap import _shap_explain_multitask_model_cate
from .._ortho_learner import _OrthoLearner
from .._lazy import _LazyModule
from ..validate.sensitivity_analysis import (sensitivity_interval, RV, dml_sensitivity_values,
sensitivity_summary)

_score = _LazyModule("econml.score") # lazy: avoid circular import


class _CausalForestFinalWrapper:

Expand Down Expand Up @@ -757,7 +760,7 @@ def tune(self, Y, T, *, X=None, W=None,
The tuned causal forest object. This is the same object (not a copy) as the original one, but where
all parameters of the object have been set to the best performing parameters from the tuning grid.
"""
from ..score import RScorer # import here to avoid circular import issue
RScorer = _score.RScorer
Y, T, X, sample_weight, groups = check_input_arrays(Y, T, X, sample_weight, groups)
W, = check_input_arrays(W, force_all_finite='allow-nan' if 'W' in self._gen_allowed_missing_vars() else True,
ensure_2d=True)
Expand Down
7 changes: 4 additions & 3 deletions econml/inference/_bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
from joblib import Parallel, delayed
from sklearn.base import clone
from scipy.stats import norm
from .._lazy import _LazyModule

_cate_estimator = _LazyModule("econml._cate_estimator") # lazy: avoid circular import


class BootstrapEstimator:
Expand Down Expand Up @@ -83,10 +86,8 @@ def fit(self, *args, **named_args):

The full signature of this method is the same as that of the wrapped object's `fit` method.
"""
from .._cate_estimator import BaseCateEstimator # need to nest this here to avoid circular import

index_chunks = None
if isinstance(self._instances[0], BaseCateEstimator):
if isinstance(self._instances[0], _cate_estimator.BaseCateEstimator):
index_chunks = self._instances[0]._strata(*args, **named_args)
if index_chunks is not None:
index_chunks = self.__stratified_indices(index_chunks)
Expand Down
20 changes: 10 additions & 10 deletions econml/sklearn_extensions/linear_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import warnings
from collections.abc import Iterable
from scipy.stats import norm
from ..utilities import ndim, shape, reshape, _safe_norm_ppf, check_input_arrays
from ..utilities import ndim, shape, reshape, _safe_norm_ppf, check_input_arrays, add_constant
import sklearn
from sklearn import clone
from sklearn.linear_model import LinearRegression, LassoCV, MultiTaskLassoCV, Lasso, MultiTaskLasso
Expand All @@ -33,12 +33,14 @@
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted
from sklearn.base import BaseEstimator
from statsmodels.tools.tools import add_constant
from statsmodels.api import RLM
import statsmodels
from .._lazy import _LazyModule
from joblib import Parallel, delayed
from typing import List

_statsmodels_api = _LazyModule("statsmodels.api") # lazy: only needed for RLM
_statsmodels = _LazyModule("statsmodels") # lazy: only needed for RLM robust norms
_model_selection = _LazyModule("econml.sklearn_extensions.model_selection") # lazy: avoid circular import


class _WeightedCVIterableWrapper(_CVIterableWrapper):
def __init__(self, cv):
Expand All @@ -56,15 +58,13 @@ def split(self, X=None, y=None, groups=None, sample_weight=None):


def _weighted_check_cv(cv=5, y=None, classifier=False, random_state=None):
# local import to avoid circular imports
from .model_selection import WeightedKFold, WeightedStratifiedKFold
cv = 5 if cv is None else cv
if isinstance(cv, numbers.Integral):
if (classifier and (y is not None) and
(type_of_target(y) in ('binary', 'multiclass'))):
return WeightedStratifiedKFold(cv, random_state=random_state)
return _model_selection.WeightedStratifiedKFold(cv, random_state=random_state)
else:
return WeightedKFold(cv, random_state=random_state)
return _model_selection.WeightedKFold(cv, random_state=random_state)

if not hasattr(cv, 'split') or isinstance(cv, str):
if not isinstance(cv, Iterable) or isinstance(cv, str):
Expand Down Expand Up @@ -2041,9 +2041,9 @@ def fit(self, X, y):
self._n_out = 0 if len(y.shape) == 1 else (y.shape[1],)

def model_gen(y):
return RLM(endog=y,
return _statsmodels_api.RLM(endog=y,
exog=X,
M=statsmodels.robust.norms.HuberT(t=self.t)).fit(cov=self.cov_type,
M=_statsmodels.robust.norms.HuberT(t=self.t)).fit(cov=self.cov_type,
maxiter=self.maxiter,
tol=self.tol)
if y.ndim < 2:
Expand Down
58 changes: 57 additions & 1 deletion econml/tests/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import pytest
from econml.utilities import (check_high_dimensional, einsum_sparse, todense, tocoo, transpose,
inverse_onehot, cross_product, transpose_dictionary, deprecated, _deprecate_positional,
strata_from_discrete_arrays)
strata_from_discrete_arrays, add_constant)
from sklearn.preprocessing import OneHotEncoder, SplineTransformer


Expand Down Expand Up @@ -197,3 +197,59 @@ def test_single_strata_from_discrete_array(self):
assert set(strata_from_discrete_arrays([T, Z])) == set(np.arange(6))
assert set(strata_from_discrete_arrays([T])) == set(np.arange(3))
assert strata_from_discrete_arrays([]) is None

def test_add_constant(self):
    """Check ``add_constant`` against statsmodels and across input types and policies."""
    import pandas as pd
    from statsmodels.tools.tools import add_constant as sm_add_constant

    rng = np.random.default_rng(0)
    design = rng.standard_normal((6, 3))

    # ndarray input must agree with the statsmodels reference for both placements.
    for kwargs in ({}, {'prepend': False}):
        np.testing.assert_allclose(add_constant(design, **kwargs),
                                   sm_add_constant(design, **kwargs))

    # A 1D vector becomes a single column with the ones column in front of it.
    vector = np.array([1.0, 2.0, 3.0])
    expected_pair = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])
    np.testing.assert_array_equal(add_constant(vector), expected_pair)

    # Anything with more than two dimensions is refused.
    cube = np.zeros((2, 2, 2))
    with self.assertRaises(ValueError):
        add_constant(cube)

    # The three has_constant policies, exercised on data whose first column is all ones.
    with_ones = np.column_stack([np.ones(5), rng.standard_normal(5)])
    skipped = add_constant(with_ones, has_constant='skip')
    np.testing.assert_array_equal(skipped, with_ones)
    with self.assertRaises(ValueError):
        add_constant(with_ones, has_constant='raise')
    # 'add' puts another ones column in front regardless.
    forced = add_constant(with_ones, has_constant='add')
    assert forced.shape == (5, 3)
    np.testing.assert_array_equal(forced[:, 0], np.ones(5))

    # Nested lists are handled the same way as ndarrays.
    from_list = add_constant([[1.0, 2.0], [3.0, 4.0]])
    np.testing.assert_array_equal(from_list,
                                  np.array([[1.0, 1.0, 2.0], [1.0, 3.0, 4.0]]))

    # pandas inputs are accepted but come back as plain ndarrays, unlike
    # statsmodels, which keeps the pandas type.
    frame = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [4.0, 5.0, 6.0]})
    from_frame = add_constant(frame)
    assert isinstance(from_frame, np.ndarray)
    expected_frame = np.array([[1.0, 1.0, 4.0],
                               [1.0, 2.0, 5.0],
                               [1.0, 3.0, 6.0]])
    np.testing.assert_array_equal(from_frame, expected_frame)

    # A shuffled row index must not reorder the values (statsmodels agrees).
    shuffled = pd.DataFrame({'a': [10.0, 20.0, 30.0]}, index=[7, 2, 5])
    np.testing.assert_array_equal(add_constant(shuffled),
                                  np.array([[1.0, 10.0], [1.0, 20.0], [1.0, 30.0]]))

    series = pd.Series([1.0, 2.0, 3.0], name='x')
    from_series = add_constant(series)
    assert isinstance(from_series, np.ndarray)
    np.testing.assert_array_equal(from_series, expected_pair)
68 changes: 61 additions & 7 deletions econml/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,69 @@
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, LabelEncoder
import warnings
from warnings import warn
from statsmodels.iolib.table import SimpleTable
from statsmodels.iolib.summary import summary_return
from ._lazy import _LazyModule
from inspect import signature
from packaging.version import parse

_statsmodels_table = _LazyModule("statsmodels.iolib.table") # lazy: only needed for Summary output
_statsmodels_summary = _LazyModule("statsmodels.iolib.summary") # lazy: only needed for Summary output


MAX_RAND_SEED = np.iinfo(np.int32).max


def add_constant(data, prepend=True, has_constant='skip'):
    """Add a column of ones to an array.

    Parameters
    ----------
    data : array_like
        A column-ordered design matrix. Any input accepted by
        :func:`numpy.asarray` is allowed, including pandas
        ``DataFrame`` and ``Series`` objects. One-dimensional input is
        treated as a single column.
    prepend : bool, default True
        If True the constant is in the first column, else appended.
    has_constant : {'skip', 'add', 'raise'}, default 'skip'
        Behavior when *data* already contains a constant, non-zero column.
        ``'skip'`` returns *data* converted to a 2-D ndarray with no column
        added, ``'raise'`` raises ``ValueError``, ``'add'`` adds another
        column of ones anyway.

    Returns
    -------
    ndarray
        The array with a ones column prepended (or appended), or the
        converted input itself when an existing constant column is skipped.

    Raises
    ------
    ValueError
        If *has_constant* is not one of the recognized policies, if *data*
        is not one- or two-dimensional, or if ``has_constant='raise'`` and
        *data* already contains a constant column.

    Notes
    -----
    This differs from :func:`statsmodels.tools.add_constant` in that the
    return value is always a NumPy ``ndarray``. ``statsmodels`` preserves
    pandas input types (``DataFrame`` in → ``DataFrame`` out with a
    ``'const'`` column; ``Series`` in → ``DataFrame`` out). Here, pandas
    inputs are converted via :func:`numpy.asarray`, so the values keep
    their stored row order but column names and the row index are not
    carried through to the result. Callers that need to preserve pandas
    metadata should reattach it after the call.

    Input with zero rows cannot contain a constant column, so a ones
    column (of length zero) is always added to it.
    """
    # Reject typos up front; otherwise an unknown policy would silently act
    # like 'raise' when a constant column exists and like 'add' when not.
    if has_constant not in ('skip', 'add', 'raise'):
        raise ValueError("has_constant must be one of 'skip', 'add' or 'raise', "
                         f"got {has_constant!r}")

    x = np.asarray(data)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim != 2:
        # Covers both 0-d scalars and inputs with more than two dimensions.
        raise ValueError('Only implemented for 2-dimensional arrays')

    n_rows = x.shape[0]
    # np.ptp has no identity for a zero-length reduction, so constant
    # detection is only meaningful (and only safe) when there are rows.
    if has_constant != 'add' and n_rows > 0:
        # A column counts as an existing constant only if it is flat and
        # non-zero; an all-zero column cannot serve as an intercept.
        is_const = (np.ptp(x, axis=0) == 0) & np.all(x != 0.0, axis=0)
        if is_const.any():
            if has_constant == 'skip':
                return x
            cols = ",".join(str(c) for c in np.where(is_const)[0])
            raise ValueError(f"Column(s) {cols} are constant.")

    ones = np.ones(n_rows)
    parts = [ones, x] if prepend else [x, ones]
    return np.column_stack(parts)


class IdentityFeatures(TransformerMixin):
"""Featurizer that just returns the input data."""

Expand Down Expand Up @@ -1147,7 +1201,7 @@ def _repr_html_(self):
return self.as_html()

def add_table(self, res, header, index, title):
table = SimpleTable(res, header, index, title)
table = _statsmodels_table.SimpleTable(res, header, index, title)
self.tables.append(table)

def add_extra_txt(self, etext):
Expand All @@ -1170,7 +1224,7 @@ def as_text(self):
summary tables and extra text as one string

"""
txt = summary_return(self.tables, return_fmt='text')
txt = _statsmodels_summary.summary_return(self.tables, return_fmt='text')
if self.extra_txt is not None:
txt = txt + '\n\n' + self.extra_txt
return txt
Expand All @@ -1190,7 +1244,7 @@ def as_latex(self):
tables.

"""
latex = summary_return(self.tables, return_fmt='latex')
latex = _statsmodels_summary.summary_return(self.tables, return_fmt='latex')
if self.extra_txt is not None:
latex = latex + '\n\n' + self.extra_txt.replace('\n', ' \\newline\n ')
return latex
Expand All @@ -1204,7 +1258,7 @@ def as_csv(self):
concatenated summary tables in comma delimited format

"""
csv = summary_return(self.tables, return_fmt='csv')
csv = _statsmodels_summary.summary_return(self.tables, return_fmt='csv')
if self.extra_txt is not None:
csv = csv + '\n\n' + self.extra_txt
return csv
Expand All @@ -1218,7 +1272,7 @@ def as_html(self):
concatenated summary tables in HTML format

"""
html = summary_return(self.tables, return_fmt='html')
html = _statsmodels_summary.summary_return(self.tables, return_fmt='html')
if self.extra_txt is not None:
html = html + '<br/><br/>' + self.extra_txt.replace('\n', '<br/>')
return html
Expand Down
Loading
Loading