Source code for wraquant.causal.integrations

"""External package wrappers for causal inference.

Functions in this module require the ``causal`` optional dependency group
(DoWhy, EconML, DoubleML) and are guarded by ``@requires_extra('causal')``.
"""

from __future__ import annotations

from typing import Any

import numpy as np
import pandas as pd

from wraquant.core.decorators import requires_extra

__all__ = [
    "dowhy_causal_model",
    "econml_dml",
    "econml_forest",
    "doubleml_plr",
]


def _extract_p_value(estimate: Any) -> float | None:
    """Safely extract p-value from a DoWhy estimate object."""
    sig = getattr(estimate, "test_stat_significance", None)
    if sig is None:
        return None
    if callable(sig):
        try:
            result = sig()
            if isinstance(result, dict):
                return result.get("p_value")
        except Exception:
            pass
        return None
    if isinstance(sig, dict):
        return sig.get("p_value")
    return None


# ---------------------------------------------------------------------------
# DoWhy
# ---------------------------------------------------------------------------



[docs]
@requires_extra("causal")
def dowhy_causal_model(
    data: pd.DataFrame,
    treatment: str,
    outcome: str,
    graph: str | None = None,
    common_causes: list[str] | None = None,
    method: str = "backdoor.propensity_score_matching",
) -> dict[str, Any]:
    """Build and estimate a causal model using DoWhy.

    Parameters
    ----------
    data : pd.DataFrame
        Observational data.
    treatment : str
        Name of the treatment column.
    outcome : str
        Name of the outcome column.
    graph : str or None
        Causal graph in GML or DOT format. If None, common_causes must
        be provided and DoWhy will construct a simple graph.
    common_causes : list[str] or None
        List of common cause (confounder) column names. Required if
        graph is not provided.
    method : str
        Estimation method name (e.g., 'backdoor.propensity_score_matching',
        'backdoor.linear_regression', 'iv.instrumental_variable').

    Returns
    -------
    dict
        ``estimate``: float — estimated causal effect,
        ``p_value``: float or None — p-value if available,
        ``method``: str — method used,
        ``model``: DoWhy CausalModel object,
        ``identified_estimand``: the identified estimand,
        ``causal_estimate``: the full DoWhy estimate object.
    """
    import dowhy

    model = dowhy.CausalModel(
        data=data,
        treatment=treatment,
        outcome=outcome,
        graph=graph,
        common_causes=common_causes,
    )

    identified_estimand = model.identify_effect()
    estimate = model.estimate_effect(
        identified_estimand,
        method_name=method,
    )

    return {
        "estimate": float(estimate.value),
        "p_value": _extract_p_value(estimate),
        "method": method,
        "model": model,
        "identified_estimand": identified_estimand,
        "causal_estimate": estimate,
    }



# ---------------------------------------------------------------------------
# EconML — Double Machine Learning
# ---------------------------------------------------------------------------



[docs]
@requires_extra("causal")
def econml_dml(
    outcome: np.ndarray | pd.Series,
    treatment: np.ndarray | pd.Series,
    covariates: np.ndarray | pd.DataFrame,
    model_y: Any = None,
    model_t: Any = None,
    n_splits: int = 3,
) -> dict[str, Any]:
    """Estimate causal effects using EconML's LinearDML.

    Parameters
    ----------
    outcome : array-like
        Outcome variable.
    treatment : array-like
        Treatment variable (can be continuous).
    covariates : array-like
        Covariate matrix.
    model_y : estimator or None
        Nuisance model for the outcome. Defaults to Lasso.
    model_t : estimator or None
        Nuisance model for the treatment. Defaults to Lasso.
    n_splits : int
        Number of cross-fitting splits.

    Returns
    -------
    dict
        ``ate``: float — average treatment effect,
        ``se``: float — standard error,
        ``ci_lower``: float — lower CI bound,
        ``ci_upper``: float — upper CI bound,
        ``model``: fitted LinearDML object.
    """
    from econml.dml import LinearDML
    from sklearn.linear_model import LassoCV

    Y = np.asarray(outcome).ravel()
    T = np.asarray(treatment).ravel()
    X = np.asarray(covariates)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    if model_y is None:
        model_y = LassoCV()
    if model_t is None:
        model_t = LassoCV()

    dml = LinearDML(
        model_y=model_y,
        model_t=model_t,
        cv=n_splits,
        random_state=42,
    )
    dml.fit(Y, T, X=X)

    ate = float(dml.ate())
    ate_inference = dml.ate_inference()

    return {
        "ate": ate,
        "se": float(ate_inference.stderr),
        "ci_lower": float(ate_inference.conf_int()[0][0]),
        "ci_upper": float(ate_inference.conf_int()[1][0]),
        "model": dml,
    }



# ---------------------------------------------------------------------------
# EconML — Causal Forest
# ---------------------------------------------------------------------------



[docs]
@requires_extra("causal")
def econml_forest(
    outcome: np.ndarray | pd.Series,
    treatment: np.ndarray | pd.Series,
    covariates: np.ndarray | pd.DataFrame,
    n_estimators: int = 100,
    min_samples_leaf: int = 5,
) -> dict[str, Any]:
    """Estimate heterogeneous treatment effects using EconML's CausalForestDML.

    Parameters
    ----------
    outcome : array-like
        Outcome variable.
    treatment : array-like
        Treatment variable.
    covariates : array-like
        Covariate matrix.
    n_estimators : int
        Number of trees in the forest.
    min_samples_leaf : int
        Minimum number of samples per leaf.

    Returns
    -------
    dict
        ``ate``: float — average treatment effect,
        ``cate``: np.ndarray — conditional ATE for each observation,
        ``se``: float — standard error of ATE,
        ``model``: fitted CausalForestDML object.
    """
    from econml.dml import CausalForestDML

    Y = np.asarray(outcome).ravel()
    T = np.asarray(treatment).ravel()
    X = np.asarray(covariates)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    forest = CausalForestDML(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
    )
    forest.fit(Y, T, X=X)

    cate = forest.effect(X).ravel()
    ate = float(np.mean(cate))
    ate_inference = forest.ate_inference()

    return {
        "ate": ate,
        "cate": cate,
        "se": float(ate_inference.stderr),
        "model": forest,
    }



# ---------------------------------------------------------------------------
# DoubleML — Partially Linear Regression
# ---------------------------------------------------------------------------



[docs]
@requires_extra("causal")
def doubleml_plr(
    outcome: np.ndarray | pd.Series,
    treatment: np.ndarray | pd.Series,
    covariates: np.ndarray | pd.DataFrame,
    n_folds: int = 5,
    ml_l: Any = None,
    ml_m: Any = None,
) -> dict[str, Any]:
    """Estimate treatment effect using DoubleML's partially linear regression.

    Parameters
    ----------
    outcome : array-like
        Outcome variable.
    treatment : array-like
        Treatment variable.
    covariates : array-like
        Covariate matrix.
    n_folds : int
        Number of cross-fitting folds.
    ml_l : estimator or None
        Nuisance learner for E[Y|X]. Defaults to Lasso.
    ml_m : estimator or None
        Nuisance learner for E[D|X]. Defaults to Lasso.

    Returns
    -------
    dict
        ``ate``: float — treatment effect estimate (theta),
        ``se``: float — standard error,
        ``ci_lower``: float — lower CI bound,
        ``ci_upper``: float — upper CI bound,
        ``t_stat``: float — t-statistic,
        ``p_value``: float — p-value,
        ``model``: fitted DoubleMLPLR object.
    """
    import doubleml as dml
    from sklearn.linear_model import LassoCV

    Y = np.asarray(outcome).ravel()
    D = np.asarray(treatment).ravel()
    X = np.asarray(covariates)
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    col_names = [f"X{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=col_names)
    df["Y"] = Y
    df["D"] = D

    data = dml.DoubleMLData(df, y_col="Y", d_cols="D", x_cols=col_names)

    if ml_l is None:
        ml_l = LassoCV()
    if ml_m is None:
        ml_m = LassoCV()

    plr = dml.DoubleMLPLR(data, ml_l=ml_l, ml_m=ml_m, n_folds=n_folds)
    plr.fit()

    summary = plr.summary
    return {
        "ate": float(plr.coef[0]),
        "se": float(plr.se[0]),
        "ci_lower": float(summary["2.5 %"].iloc[0]),
        "ci_upper": float(summary["97.5 %"].iloc[0]),
        "t_stat": float(plr.t_stat[0]),
        "p_value": float(plr.pval[0]),
        "model": plr,
    }