Source code for wraquant.econometrics.diagnostics

"""Regression diagnostics for econometric models.

Provides tests for serial correlation, heteroskedasticity, normality,
functional form misspecification, and multicollinearity detection.
"""

from __future__ import annotations

from typing import Any

import numpy as np
import pandas as pd
from scipy import stats as sp_stats
from statsmodels.stats.diagnostic import (
    acorr_breusch_godfrey,
)


[docs] def durbin_watson(residuals: np.ndarray | pd.Series) -> float: """Compute the Durbin-Watson statistic for serial correlation. The statistic ranges from 0 to 4. A value near 2 indicates no first-order autocorrelation, values near 0 indicate positive autocorrelation, and values near 4 indicate negative autocorrelation. Delegates to ``wraquant.stats.tests.durbin_watson`` for the core computation. Parameters: residuals: OLS residuals. Returns: Durbin-Watson statistic (float in [0, 4]). """ from wraquant.stats.tests import durbin_watson as _stats_dw result = _stats_dw(residuals) return result["statistic"]
[docs] def breusch_godfrey( residuals: np.ndarray | pd.Series, X: np.ndarray | pd.DataFrame, nlags: int = 4, ) -> dict[str, Any]: """Breusch-Godfrey LM test for serial correlation. Regresses the residuals on the original regressors plus lagged residuals. A significant test statistic rejects the null of no serial correlation up to order *nlags*. Parameters: residuals: OLS residuals. X: Design matrix (with or without constant; a constant is added internally if needed). nlags: Number of lags to include. Returns: Dictionary with ``lm_statistic``, ``lm_p_value``, ``f_statistic``, ``f_p_value``, and ``is_autocorrelated`` (at 5 % level). """ import statsmodels.api as sm resid = np.asarray(residuals).ravel() X_arr = np.asarray(X) # The BG test needs an OLS results object. We reconstruct y so that # OLS(y, X).resid == resid. Since OLS residuals are orthogonal to X, # any vector of the form y = X @ b + resid works; pick b = 0 for # simplicity, then add back the projection to make the fit non-trivial. from wraquant.stats.regression import ols as _ols _diag_ols = _ols(resid, X_arr, add_constant=False) y_reconstructed = X_arr @ _diag_ols["coefficients"] + resid ols_result = sm.OLS(y_reconstructed, X_arr).fit() lm_stat, lm_p, f_stat, f_p = acorr_breusch_godfrey(ols_result, nlags=nlags) return { "lm_statistic": float(lm_stat), "lm_p_value": float(lm_p), "f_statistic": float(f_stat), "f_p_value": float(f_p), "is_autocorrelated": bool(lm_p < 0.05), }
[docs] def breusch_pagan( residuals: np.ndarray | pd.Series, X: np.ndarray | pd.DataFrame, ) -> dict[str, Any]: """Breusch-Pagan test for heteroskedasticity. Tests whether the variance of the residuals depends on the regressors. The null hypothesis is homoskedasticity. Delegates to ``wraquant.stats.tests.breusch_pagan`` for the core computation. Parameters: residuals: OLS residuals. X: Design matrix used in the original regression. Returns: Dictionary with ``lm_statistic``, ``lm_p_value``, ``f_statistic``, ``f_p_value``, and ``is_heteroskedastic`` (at 5 % level). """ from wraquant.stats.tests import breusch_pagan as _stats_bp result = _stats_bp(residuals, X) return { "lm_statistic": result["lm_stat"], "lm_p_value": result["p_value"], "f_statistic": result["f_stat"], "f_p_value": result["f_p_value"], "is_heteroskedastic": result["is_heteroskedastic"], }
[docs] def white_test( residuals: np.ndarray | pd.Series, X: np.ndarray | pd.DataFrame, ) -> dict[str, Any]: """White's general test for heteroskedasticity. Includes cross-product terms and squared regressors, making it more general than Breusch-Pagan. The null hypothesis is homoskedasticity. Delegates to ``wraquant.stats.tests.white_test`` for the core computation. Parameters: residuals: OLS residuals. X: Design matrix used in the original regression. Returns: Dictionary with ``lm_statistic``, ``lm_p_value``, ``f_statistic``, ``f_p_value``, and ``is_heteroskedastic`` (at 5 % level). """ from wraquant.stats.tests import white_test as _stats_white result = _stats_white(residuals, X) return { "lm_statistic": result["lm_stat"], "lm_p_value": result["p_value"], "f_statistic": result["f_stat"], "f_p_value": result["f_p_value"], "is_heteroskedastic": result["is_heteroskedastic"], }
[docs] def jarque_bera( residuals: np.ndarray | pd.Series, ) -> dict[str, Any]: """Jarque-Bera test for normality of residuals. The null hypothesis is that the residuals are normally distributed. Delegates to ``wraquant.stats.tests.test_normality`` for the core computation. Parameters: residuals: Model residuals. Returns: Dictionary with ``statistic``, ``p_value``, ``skewness``, ``kurtosis``, and ``is_normal`` (at 5 % level). """ from wraquant.stats.tests import test_normality as _test_normality resid_series = pd.Series(np.asarray(residuals).ravel()) result = _test_normality(resid_series, method="jarque_bera") resid_clean = resid_series.dropna().values return { "statistic": result["statistic"], "p_value": result["p_value"], "skewness": float(sp_stats.skew(resid_clean)), "kurtosis": float(sp_stats.kurtosis(resid_clean, fisher=True)), "is_normal": result["is_normal"], }
[docs] def ramsey_reset( y: np.ndarray | pd.Series, X: np.ndarray | pd.DataFrame, power: int = 3, ) -> dict[str, Any]: """Ramsey RESET test for functional form misspecification. Adds powers of the fitted values to the regression and tests their joint significance. Rejection suggests the linear specification is inadequate. Parameters: y: Dependent variable. X: Design matrix (should include a constant). power: Maximum power of fitted values to include (default 3 adds y_hat^2 and y_hat^3). Returns: Dictionary with ``f_statistic``, ``p_value``, ``df_num``, ``df_denom``, and ``is_misspecified`` (at 5 % level). """ import statsmodels.api as sm y_arr = np.asarray(y).ravel() X_arr = np.asarray(X) # Original regression ols_orig = sm.OLS(y_arr, X_arr).fit() y_hat = ols_orig.fittedvalues # Build augmented regressors with y_hat^2, ..., y_hat^power augmented_cols = [X_arr] for p in range(2, power + 1): augmented_cols.append((y_hat**p).reshape(-1, 1)) X_aug = np.hstack(augmented_cols) ols_aug = sm.OLS(y_arr, X_aug).fit() # F-test for the added variables n = len(y_arr) k_orig = X_arr.shape[1] k_aug = X_aug.shape[1] df_num = k_aug - k_orig df_denom = n - k_aug ssr_orig = np.sum(ols_orig.resid**2) ssr_aug = np.sum(ols_aug.resid**2) f_stat = ((ssr_orig - ssr_aug) / df_num) / (ssr_aug / df_denom) p_value = 1.0 - sp_stats.f.cdf(f_stat, df_num, df_denom) return { "f_statistic": float(f_stat), "p_value": float(p_value), "df_num": int(df_num), "df_denom": int(df_denom), "is_misspecified": bool(p_value < 0.05), }
[docs] def vif(X: np.ndarray | pd.DataFrame) -> pd.Series: """Compute Variance Inflation Factors for each regressor. VIF > 10 is a common rule-of-thumb threshold indicating problematic multicollinearity. The input should *not* include a constant column. Parameters: X: Design matrix **without** an intercept. Returns: Series of VIF values indexed by column name (or integer index). """ from statsmodels.stats.outliers_influence import variance_inflation_factor X_arr = np.asarray(X, dtype=float) if X_arr.ndim == 1: X_arr = X_arr.reshape(-1, 1) n_cols = X_arr.shape[1] if isinstance(X, pd.DataFrame): names = list(X.columns) else: names = list(range(n_cols)) # Add constant for VIF calculation import statsmodels.api as sm X_with_const = sm.add_constant(X_arr) vif_values = [ variance_inflation_factor(X_with_const, i + 1) for i in range(n_cols) ] return pd.Series(vif_values, index=names, name="VIF")
[docs] def condition_number(X: np.ndarray | pd.DataFrame) -> float: """Compute the condition number of the X'X matrix. A condition number exceeding 30 suggests harmful multicollinearity (Belsley, Kuh, and Welsch, 1980). Parameters: X: Design matrix. Returns: Condition number (ratio of largest to smallest singular value of X). """ X_arr = np.asarray(X, dtype=float) sv = np.linalg.svd(X_arr, compute_uv=False) return float(sv.max() / sv.min())