Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions econml/dml/causal_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -680,10 +680,12 @@ def _gen_featurizer(self):
return clone(self.featurizer, safe=False)

def _gen_model_y(self):
return _make_first_stage_selector(self.model_y, self.discrete_outcome, self.random_state)
return _make_first_stage_selector(self.model_y, self.discrete_outcome, self.random_state,
n_jobs=self.n_jobs)

def _gen_model_t(self):
return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state)
return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state,
n_jobs=self.n_jobs)

def _gen_model_final(self):
return MultiOutputGRF(CausalForest(n_estimators=self.n_estimators,
Expand Down
17 changes: 11 additions & 6 deletions econml/dml/dml.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,13 @@ def best_score(self):
return self._model.best_score


def _make_first_stage_selector(model, is_discrete, random_state):
def _make_first_stage_selector(model, is_discrete, random_state, n_jobs=None):
if model == 'auto':
model = ['forest', 'linear']
return _FirstStageSelector(get_selector(model,
is_discrete=is_discrete,
random_state=random_state),
random_state=random_state,
n_jobs=n_jobs),
discrete_target=is_discrete)


Expand Down Expand Up @@ -561,10 +562,12 @@ def _gen_featurizer(self):
return clone(self.featurizer, safe=False)

def _gen_model_y(self):
return _make_first_stage_selector(self.model_y, self.discrete_outcome, self.random_state)
return _make_first_stage_selector(self.model_y, self.discrete_outcome, self.random_state,
n_jobs=getattr(self, 'n_jobs', None))

def _gen_model_t(self):
return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state)
return _make_first_stage_selector(self.model_t, self.discrete_treatment, self.random_state,
n_jobs=getattr(self, 'n_jobs', None))

def _gen_model_final(self):
return clone(self.model_final, safe=False)
Expand Down Expand Up @@ -1647,11 +1650,13 @@ def _gen_featurizer(self):

def _gen_model_y(self):
return _make_first_stage_selector(self.model_y, is_discrete=self.discrete_outcome,
random_state=self.random_state)
random_state=self.random_state,
n_jobs=getattr(self, 'n_jobs', None))

def _gen_model_t(self):
return _make_first_stage_selector(self.model_t, is_discrete=self.discrete_treatment,
random_state=self.random_state)
random_state=self.random_state,
n_jobs=getattr(self, 'n_jobs', None))

def _gen_model_final(self):
return clone(self.model_final, safe=False)
Expand Down
4 changes: 2 additions & 2 deletions econml/dr/_drlearner.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,10 @@ def predict(self, Y, T, X=None, W=None, *, sample_weight=None, groups=None):
return Y_pred.reshape(Y.shape + (T.shape[1] + 1,)), propensities, raw_propensities


def _make_first_stage_selector(model, is_discrete, random_state):
def _make_first_stage_selector(model, is_discrete, random_state, n_jobs=None):
if model == "auto":
model = ['linear', 'forest']
return get_selector(model, is_discrete=is_discrete, random_state=random_state)
return get_selector(model, is_discrete=is_discrete, random_state=random_state, n_jobs=n_jobs)


class _ModelFinal:
Expand Down
32 changes: 17 additions & 15 deletions econml/sklearn_extensions/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,12 +437,13 @@ def _to_logisticRegression(model: LogisticRegressionCV):
_copy_to(model, lr, ["penalty", "dual", "intercept_scaling",
"class_weight",
"solver",
"verbose", "n_jobs",
"verbose",
"tol", "max_iter", "random_state", "n_iter_"])
# if sklearn version < 1.8, copy multi_class as well
# if sklearn version < 1.8, copy multi_class and n_jobs as well
# (sklearn 1.8 deprecated n_jobs on LogisticRegression; it has no effect post-fit)
from packaging import version
if version.parse(sklearn.__version__) < version.parse("1.8"):
_copy_to(model, lr, ["multi_class"])
_copy_to(model, lr, ["multi_class", "n_jobs"])
_copy_to(model, lr, ["classes_"])

_copy_to(model, lr, ["C", "l1_ratio"], True) # these are arrays in LogisticRegressionCV, need to convert them next
Expand Down Expand Up @@ -616,38 +617,39 @@ def best_score(self):
return self._best_score


def get_selector(input, is_discrete, *, random_state=None, cv=None, wrapper=GridSearchCV, needs_scoring=False):
def get_selector(input, is_discrete, *, random_state=None, cv=None, wrapper=GridSearchCV, needs_scoring=False,
n_jobs=None):
named_models = {
'linear': (LogisticRegressionCV(random_state=random_state, cv=cv) if is_discrete
else WeightedLassoCVWrapper(random_state=random_state, cv=cv)),
'linear': (LogisticRegressionCV(random_state=random_state, cv=cv, n_jobs=n_jobs) if is_discrete
else WeightedLassoCVWrapper(random_state=random_state, cv=cv, n_jobs=n_jobs)),
'poly': ([make_pipeline(PolynomialFeatures(d),
(LogisticRegressionCV(random_state=random_state, cv=cv) if is_discrete
else WeightedLassoCVWrapper(random_state=random_state, cv=cv)))
(LogisticRegressionCV(random_state=random_state, cv=cv, n_jobs=n_jobs) if is_discrete
else WeightedLassoCVWrapper(random_state=random_state, cv=cv, n_jobs=n_jobs)))
for d in range(1, 4)]),
'forest': (GridSearchCV(RandomForestClassifier(random_state=random_state) if is_discrete
else RandomForestRegressor(random_state=random_state),
param_grid={}, cv=cv)),
'forest': (GridSearchCV(RandomForestClassifier(random_state=random_state, n_jobs=n_jobs) if is_discrete
else RandomForestRegressor(random_state=random_state, n_jobs=n_jobs),
param_grid={}, cv=cv, n_jobs=n_jobs)),
'gbf': (GridSearchCV(GradientBoostingClassifier(random_state=random_state) if is_discrete
else GradientBoostingRegressor(random_state=random_state),
param_grid={}, cv=cv)),
param_grid={}, cv=cv, n_jobs=n_jobs)),
'nnet': (GridSearchCV(MLPClassifier(random_state=random_state) if is_discrete
else MLPRegressor(random_state=random_state),
param_grid={}, cv=cv)),
param_grid={}, cv=cv, n_jobs=n_jobs)),
'automl': ["poly", "forest", "gbf", "nnet"],
}
if isinstance(input, ModelSelector): # we've already got a model selector, don't need to do anything
return input
elif isinstance(input, list): # we've got a list; call get_selector on each element, then wrap in a ListSelector
models = [get_selector(model, is_discrete,
random_state=random_state, cv=cv, wrapper=wrapper,
needs_scoring=True) # we need to score to compare outputs to each other
needs_scoring=True, n_jobs=n_jobs) # we need to score to compare outputs to each other
for model in input]
return ListSelector(models)
elif isinstance(input, str): # we've got a string; look it up
if input in named_models:
return get_selector(named_models[input], is_discrete,
random_state=random_state, cv=cv, wrapper=wrapper,
needs_scoring=needs_scoring)
needs_scoring=needs_scoring, n_jobs=n_jobs)
else:
raise ValueError(f"Unknown model type: {input}, must be one of {named_models.keys()}")
elif SklearnCVSelector.can_wrap(input):
Expand Down
49 changes: 49 additions & 0 deletions econml/tests/test_dml.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,6 +819,55 @@ def true_fn(x):
sn6 = est.score_nuisances(Y=y, T=T, X=X, W=W, t_scoring='log_loss')
np.testing.assert_allclose(sn6['T_log_loss'], [17.4,17.4], rtol=0, atol=0.1)

def test_n_jobs_propagates_to_first_stage_auto_selector(self):
# Regression test for #1009: SparseLinearDML / CausalForestDML accept
# n_jobs but previously only threaded it into the second-stage Lasso /
# final forest. The 'auto' first-stage selector built RandomForest +
# GridSearchCV + LogisticRegressionCV + WeightedLassoCVWrapper without
# n_jobs, so first-stage fits ran single-core.
def collect_n_jobs(obj, seen=None):
if seen is None:
seen = set()
oid = id(obj)
if oid in seen:
return []
seen.add(oid)
out = []
if hasattr(obj, "n_jobs"):
out.append((type(obj).__name__, obj.n_jobs))
for attr in ("_model", "models", "searcher", "estimator", "_best_model"):
v = getattr(obj, attr, None)
if v is None:
continue
children = v if isinstance(v, (list, tuple)) else [v]
for c in children:
out.extend(collect_n_jobs(c, seen))
return out

sentinel = 3 # any non-default int; -1 also works but is harder to assert against
propagating_types = {
'GridSearchCV', 'RandomForestRegressor', 'RandomForestClassifier',
'LogisticRegressionCV', 'WeightedLassoCVWrapper',
}

for est in (
SparseLinearDML(model_y='auto', model_t='auto', n_jobs=sentinel,
random_state=0),
SparseLinearDML(model_y='auto', model_t='auto', n_jobs=sentinel,
discrete_treatment=True, random_state=0),
CausalForestDML(model_y='auto', model_t='auto', n_jobs=sentinel,
random_state=0),
):
for selector in (est._gen_model_y(), est._gen_model_t()):
seen_any = False
for name, n_jobs in collect_n_jobs(selector):
if name in propagating_types:
seen_any = True
assert n_jobs == sentinel, \
f"{type(est).__name__} -> {name}.n_jobs = {n_jobs}, expected {sentinel}"
assert seen_any, \
f"selector tree for {type(est).__name__} had no n_jobs-bearing leaves"

def test_aaforest_pandas(self):
"""Test that we can use CausalForest with pandas inputs."""
df = pd.DataFrame({'a': np.random.normal(size=500),
Expand Down