# Source code for wraquant.causal.integrations

"""External package wrappers for causal inference.

Functions in this module require the ``causal`` optional dependency group
(DoWhy, EconML, DoubleML) and are guarded by ``@requires_extra('causal')``.
"""

from __future__ import annotations

from typing import Any

import numpy as np
import pandas as pd

from wraquant.core.decorators import requires_extra

# Public API of this module: the wrapper functions exported via
# ``from wraquant.causal.integrations import *``.
__all__ = [
    "dowhy_causal_model",
    "econml_dml",
    "econml_forest",
    "doubleml_plr",
]


def _extract_p_value(estimate: Any) -> float | None:
    """Return the ``"p_value"`` entry exposed by an estimate, if any.

    The ``test_stat_significance`` attribute of *estimate* is looked up and
    handled according to what it turns out to be:

    * a callable -- it is invoked with no arguments, and the ``"p_value"``
      entry of the returned dict is used (any exception raised along the
      way is swallowed on purpose, as this is a best-effort lookup);
    * a dict -- its ``"p_value"`` entry is used directly;
    * anything else, or a missing attribute -- ``None`` is returned.

    Parameters
    ----------
    estimate : Any
        Object that may carry a ``test_stat_significance`` attribute.

    Returns
    -------
    float or None
        Whatever is stored under ``"p_value"``, or ``None`` when it cannot
        be obtained.
    """
    significance = getattr(estimate, "test_stat_significance", None)

    if callable(significance):
        try:
            outcome = significance()
            return outcome.get("p_value") if isinstance(outcome, dict) else None
        except Exception:
            # Best-effort: a failing significance test must not break the
            # caller, it simply means no p-value is available.
            return None

    if isinstance(significance, dict):
        return significance.get("p_value")

    # Covers both a missing attribute (None) and unsupported value types.
    return None


# ---------------------------------------------------------------------------
# DoWhy
# ---------------------------------------------------------------------------


@requires_extra("causal")
def dowhy_causal_model(
    data: pd.DataFrame,
    treatment: str,
    outcome: str,
    graph: str | None = None,
    common_causes: list[str] | None = None,
    method: str = "backdoor.propensity_score_matching",
) -> dict[str, Any]:
    """Identify and estimate a causal effect with DoWhy.

    A ``dowhy.CausalModel`` is built from the inputs, the effect is
    identified with ``identify_effect()`` and then estimated with
    ``estimate_effect()`` using the requested *method*.

    Parameters
    ----------
    data : pd.DataFrame
        Observational data.
    treatment : str
        Name of the treatment column.
    outcome : str
        Name of the outcome column.
    graph : str or None
        Causal graph in GML or DOT format. When omitted, *common_causes*
        should be supplied so DoWhy can construct a simple graph itself.
    common_causes : list[str] or None
        Names of the confounder columns. Needed when *graph* is not given.
    method : str
        Estimation method name forwarded to DoWhy as ``method_name``
        (e.g. ``'backdoor.propensity_score_matching'``,
        ``'backdoor.linear_regression'``, ``'iv.instrumental_variable'``).

    Returns
    -------
    dict
        ``estimate``: float -- estimated causal effect,
        ``p_value``: float or None -- p-value if available,
        ``method``: str -- method used,
        ``model``: the DoWhy ``CausalModel`` object,
        ``identified_estimand``: the identified estimand,
        ``causal_estimate``: the full DoWhy estimate object.
    """
    import dowhy

    causal_model = dowhy.CausalModel(
        data=data,
        treatment=treatment,
        outcome=outcome,
        graph=graph,
        common_causes=common_causes,
    )

    estimand = causal_model.identify_effect()
    effect = causal_model.estimate_effect(estimand, method_name=method)

    effect_size = float(effect.value)
    p_value = _extract_p_value(effect)

    return {
        "estimate": effect_size,
        "p_value": p_value,
        "method": method,
        "model": causal_model,
        "identified_estimand": estimand,
        "causal_estimate": effect,
    }
# ---------------------------------------------------------------------------
# EconML — Double Machine Learning
# ---------------------------------------------------------------------------
@requires_extra("causal")
def econml_dml(
    outcome: np.ndarray | pd.Series,
    treatment: np.ndarray | pd.Series,
    covariates: np.ndarray | pd.DataFrame,
    model_y: Any = None,
    model_t: Any = None,
    n_splits: int = 3,
) -> dict[str, Any]:
    """Estimate causal effects using EconML's LinearDML.

    The outcome and treatment are flattened to 1-D arrays, the covariates
    are coerced to a 2-D matrix, and a ``LinearDML`` estimator is fitted
    with the covariates as effect modifiers (``X``). The average treatment
    effect and its uncertainty are then summarised over the same covariate
    rows that were used for fitting.

    Parameters
    ----------
    outcome : array-like
        Outcome variable.
    treatment : array-like
        Treatment variable (can be continuous).
    covariates : array-like
        Covariate matrix. A 1-D input is treated as a single column.
    model_y : estimator or None
        Nuisance model for the outcome. Defaults to ``LassoCV()``.
    model_t : estimator or None
        Nuisance model for the treatment. Defaults to ``LassoCV()``.
    n_splits : int
        Number of cross-fitting splits (passed to ``LinearDML`` as ``cv``).

    Returns
    -------
    dict
        ``ate``: float -- average treatment effect over the sample,
        ``se``: float -- standard error of the ATE,
        ``ci_lower``: float -- lower bound of the ATE confidence interval,
        ``ci_upper``: float -- upper bound of the ATE confidence interval
        (at EconML's default confidence level),
        ``model``: fitted LinearDML object.
    """
    from econml.dml import LinearDML
    from sklearn.linear_model import LassoCV

    Y = np.asarray(outcome).ravel()
    T = np.asarray(treatment).ravel()
    X = np.asarray(covariates)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    dml = LinearDML(
        model_y=LassoCV() if model_y is None else model_y,
        model_t=LassoCV() if model_t is None else model_t,
        cv=n_splits,
        # Fixed seed keeps the cross-fitting folds reproducible.
        random_state=42,
    )
    dml.fit(Y, T, X=X)

    # The estimator was fitted with effect modifiers, so EconML requires X
    # again when evaluating effects; calling ate()/ate_inference() without
    # it fails its fitted-dimension check.
    ate_value = dml.ate(X)

    # ate_inference() yields a population summary whose ATE-level
    # statistics are `stderr_mean` and `conf_int_mean()`; it does not
    # expose the per-row `stderr` / `conf_int()` of InferenceResults.
    summary = dml.ate_inference(X=X)
    ci_lower, ci_upper = summary.conf_int_mean()

    # np.ravel(...)[0] tolerates both plain scalars and 1-element arrays.
    return {
        "ate": float(np.ravel(ate_value)[0]),
        "se": float(np.ravel(summary.stderr_mean)[0]),
        "ci_lower": float(np.ravel(ci_lower)[0]),
        "ci_upper": float(np.ravel(ci_upper)[0]),
        "model": dml,
    }
# ---------------------------------------------------------------------------
# EconML — Causal Forest
# ---------------------------------------------------------------------------
@requires_extra("causal")
def econml_forest(
    outcome: np.ndarray | pd.Series,
    treatment: np.ndarray | pd.Series,
    covariates: np.ndarray | pd.DataFrame,
    n_estimators: int = 100,
    min_samples_leaf: int = 5,
) -> dict[str, Any]:
    """Estimate heterogeneous treatment effects using EconML's CausalForestDML.

    The outcome and treatment are flattened to 1-D arrays, the covariates
    are coerced to a 2-D matrix, and a ``CausalForestDML`` is fitted with
    the covariates as effect modifiers (``X``). Conditional effects are
    predicted for every training row; the ATE is their mean.

    Parameters
    ----------
    outcome : array-like
        Outcome variable.
    treatment : array-like
        Treatment variable.
    covariates : array-like
        Covariate matrix. A 1-D input is treated as a single column.
    n_estimators : int
        Number of trees in the forest.
    min_samples_leaf : int
        Minimum number of samples per leaf.

    Returns
    -------
    dict
        ``ate``: float -- average treatment effect (mean of ``cate``),
        ``cate``: np.ndarray -- conditional ATE for each observation,
        ``se``: float -- standard error of the ATE,
        ``model``: fitted CausalForestDML object.
    """
    from econml.dml import CausalForestDML

    Y = np.asarray(outcome).ravel()
    T = np.asarray(treatment).ravel()
    X = np.asarray(covariates)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    forest = CausalForestDML(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        # Fixed seed keeps the fitted forest reproducible.
        random_state=42,
    )
    forest.fit(Y, T, X=X)

    cate = forest.effect(X).ravel()
    ate = float(np.mean(cate))

    # Summarise uncertainty over the same rows the ATE is averaged over.
    # The population summary returned by ate_inference() reports the
    # standard error of the mean effect as `stderr_mean` (it has no
    # `stderr` attribute), and X must be supplied because the forest was
    # fitted with effect modifiers.
    summary = forest.ate_inference(X=X)

    return {
        "ate": ate,
        "cate": cate,
        # np.ravel(...)[0] tolerates both scalars and 1-element arrays.
        "se": float(np.ravel(summary.stderr_mean)[0]),
        "model": forest,
    }
# ---------------------------------------------------------------------------
# DoubleML — Partially Linear Regression
# ---------------------------------------------------------------------------
@requires_extra("causal")
def doubleml_plr(
    outcome: np.ndarray | pd.Series,
    treatment: np.ndarray | pd.Series,
    covariates: np.ndarray | pd.DataFrame,
    n_folds: int = 5,
    ml_l: Any = None,
    ml_m: Any = None,
) -> dict[str, Any]:
    """Fit DoubleML's partially linear regression and report the effect.

    The inputs are packed into a DataFrame with generated covariate column
    names (``X0``, ``X1``, ...) plus ``Y`` and ``D``, wrapped in a
    ``DoubleMLData`` object and passed to ``DoubleMLPLR``.

    Parameters
    ----------
    outcome : array-like
        Outcome variable.
    treatment : array-like
        Treatment variable.
    covariates : array-like
        Covariate matrix. A 1-D input is treated as a single column.
    n_folds : int
        Number of cross-fitting folds.
    ml_l : estimator or None
        Nuisance learner for E[Y|X]. Defaults to ``LassoCV()``.
    ml_m : estimator or None
        Nuisance learner for E[D|X]. Defaults to ``LassoCV()``.

    Returns
    -------
    dict
        ``ate``: float -- treatment effect estimate (theta),
        ``se``: float -- standard error,
        ``ci_lower``: float -- lower CI bound (summary column ``"2.5 %"``),
        ``ci_upper``: float -- upper CI bound (summary column ``"97.5 %"``),
        ``t_stat``: float -- t-statistic,
        ``p_value``: float -- p-value,
        ``model``: fitted DoubleMLPLR object.
    """
    import doubleml as dml
    from sklearn.linear_model import LassoCV

    y_values = np.asarray(outcome).ravel()
    d_values = np.asarray(treatment).ravel()
    x_values = np.asarray(covariates)
    if x_values.ndim == 1:
        x_values = x_values.reshape(-1, 1)

    # DoubleMLData addresses variables by column name, so synthesise names
    # for the covariates and append the outcome / treatment columns.
    feature_cols = [f"X{idx}" for idx in range(x_values.shape[1])]
    frame = pd.DataFrame(x_values, columns=feature_cols)
    frame["Y"] = y_values
    frame["D"] = d_values
    dml_data = dml.DoubleMLData(
        frame,
        y_col="Y",
        d_cols="D",
        x_cols=feature_cols,
    )

    learner_l = LassoCV() if ml_l is None else ml_l
    learner_m = LassoCV() if ml_m is None else ml_m

    fitted = dml.DoubleMLPLR(
        dml_data,
        ml_l=learner_l,
        ml_m=learner_m,
        n_folds=n_folds,
    )
    fitted.fit()

    table = fitted.summary
    lower_bound = float(table["2.5 %"].iloc[0])
    upper_bound = float(table["97.5 %"].iloc[0])

    return {
        "ate": float(fitted.coef[0]),
        "se": float(fitted.se[0]),
        "ci_lower": lower_bound,
        "ci_upper": upper_bound,
        "t_stat": float(fitted.t_stat[0]),
        "p_value": float(fitted.pval[0]),
        "model": fitted,
    }