Source code for wraquant.risk.factor

"""Factor risk models for return attribution and risk decomposition.

Factor models decompose portfolio risk into systematic (factor-driven) and
idiosyncratic (asset-specific) components. This is fundamental to
understanding *where* portfolio risk comes from and whether factor
exposures are intentional or accidental.

This module provides four approaches:

1. **Fundamental factor model** (``factor_risk_model``) -- regress returns
   on user-supplied factors (e.g., Fama-French, macro factors). Use when
   you know which factors matter.
2. **Statistical factor model** (``statistical_factor_model``) -- extract
   latent factors via PCA. Use when you do not have a prior on which
   factors drive returns.
3. **Fama-French regression** (``fama_french_regression``) -- specialised
   for the classic Fama-French framework with named factors (MKT, SMB,
   HML, etc.).
4. **Factor contribution** (``factor_contribution``) -- given portfolio
   weights and factor exposures, decompose portfolio risk into factor
   contributions.

References:
    - Fama & French (1993), "Common Risk Factors in the Returns on Stocks
      and Bonds"
    - Connor & Korajczyk (1986), "Performance Measurement with the
      Arbitrage Pricing Theory"
    - Menchero (2011), "The Barra Risk Model Handbook"
"""

from __future__ import annotations

from typing import Any

import numpy as np
import pandas as pd
from scipy import stats as sp_stats



[docs]
def factor_risk_model(
    returns: pd.Series | pd.DataFrame,
    factors: pd.DataFrame,
    *,
    periods_per_year: int = 252,
) -> dict[str, Any]:
    """Regress asset returns on factors and decompose total risk.

    Fits an OLS regression (with intercept) of the asset's returns on the
    supplied factor returns, then splits the sample variance of the asset
    into the part explained by the factors (systematic risk) and the
    residual part (specific / idiosyncratic risk).

    When to use:
        Use this function when you have a set of candidate factors (market,
        value, momentum, macro variables) and want to understand how much
        of the return variation they explain. The ``factor_risk`` /
        ``specific_risk`` split guides hedging decisions: hedge systematic
        risk with factor instruments; accept specific risk if you believe
        in the alpha.

    Mathematical formulation:
        r_t = alpha + B * f_t + eps_t

        Total variance = B' * Sigma_f * B + sigma_eps^2
        Factor risk share = (Total variance - sigma_eps^2) / Total variance
        Specific risk share = 1 - Factor risk share

    Parameters:
        returns: Asset return series (pd.Series for one asset,
            pd.DataFrame for multiple assets -- only the first column is
            used).
        factors: DataFrame of factor returns with columns as factor names
            and an index compatible with ``returns``. Rows with a missing
            value in the asset or in any factor are dropped.
        periods_per_year: Number of return observations per year, used
            only to annualize ``residual_vol``. The default of 252 assumes
            daily data; pass 12 for monthly returns, 52 for weekly.

    Returns:
        Dictionary containing:
        - **betas** (*dict[str, float]*) -- Factor loadings (regression
          coefficients). Positive beta = positive exposure.
        - **alpha** (*float*) -- Regression intercept (per-period return
          not explained by factors).
        - **factor_risk** (*float*) -- Fraction of total variance explained
          by factors, floored at 0. Reported as 0.0 when the total
          variance is not positive.
        - **specific_risk** (*float*) -- ``1 - factor_risk``.
        - **r_squared** (*float*) -- R-squared reported by the regression.
        - **residual_vol** (*float*) -- Annualized volatility of residuals
          (sample standard deviation times ``sqrt(periods_per_year)``).
        - **contributions** (*dict[str, float]*) -- Per-factor term
          ``beta_i * (Sigma_f @ beta)_i`` expressed as a fraction of the
          asset's *total* variance. A factor that offsets the others can
          have a negative entry. All zeros when the total variance is not
          positive.

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> mkt = np.random.normal(0.0005, 0.01, 252)
        >>> smb = np.random.normal(0, 0.005, 252)
        >>> stock = 1.1 * mkt + 0.3 * smb + np.random.normal(0, 0.005, 252)
        >>> result = factor_risk_model(
        ...     pd.Series(stock),
        ...     pd.DataFrame({"MKT": mkt, "SMB": smb}),
        ... )
        >>> result["factor_risk"] > 0.5
        True

    See Also:
        statistical_factor_model: PCA-based (no prior on factors).
        fama_french_regression: Specialised Fama-French interface.

    References:
        - Menchero (2011), "The Barra Risk Model Handbook"
    """
    y = returns.iloc[:, 0] if isinstance(returns, pd.DataFrame) else returns

    # Align on the index and drop incomplete rows. The asset is always the
    # first column of the combined frame, so everything below selects by
    # position; selecting by the label "y" would break if a factor were
    # itself named "y".
    aligned = pd.concat([y.rename("y"), factors], axis=1).dropna()
    y_vals = aligned.iloc[:, 0].to_numpy()
    factor_names = list(aligned.columns[1:])
    X = aligned.iloc[:, 1:].to_numpy()

    # OLS via shared regression module (imported lazily, as elsewhere in
    # this module). The result is indexed with the intercept first.
    from wraquant.stats.regression import ols as _ols

    ols_result = _ols(y_vals, X, add_constant=True)
    coeffs = ols_result["coefficients"]
    alpha_val = float(coeffs[0])
    beta_vec = np.array([float(coeffs[i + 1]) for i in range(len(factor_names))])
    betas = {name: float(b) for name, b in zip(factor_names, beta_vec)}
    residuals = ols_result["residuals"]
    r_squared = ols_result["r_squared"]

    # Risk decomposition: explained variance is total minus residual.
    total_var = float(np.var(y_vals, ddof=1))
    residual_var = float(np.var(residuals, ddof=1))
    has_variance = total_var > 0

    factor_var = total_var - residual_var
    factor_risk = max(0.0, factor_var / total_var) if has_variance else 0.0
    specific_risk = 1.0 - factor_risk

    # np.cov returns a 0-d array for a single factor; promote it to 1x1 so
    # the matrix product below works for any number of factors.
    factor_cov = np.atleast_2d(np.cov(X, rowvar=False, ddof=1))

    # Per-factor term beta_i * (Sigma_f @ beta)_i, with the matrix-vector
    # product computed once rather than once per factor.
    cov_beta = factor_cov @ beta_vec
    if has_variance:
        shares = beta_vec * cov_beta / total_var
    else:
        shares = np.zeros_like(beta_vec)
    contributions = {name: float(s) for name, s in zip(factor_names, shares)}

    residual_vol = float(np.sqrt(residual_var) * np.sqrt(periods_per_year))

    return {
        "betas": betas,
        "alpha": alpha_val,
        "factor_risk": factor_risk,
        "specific_risk": specific_risk,
        "r_squared": float(r_squared),
        "residual_vol": residual_vol,
        "contributions": contributions,
    }




[docs]
def statistical_factor_model(
    returns: pd.DataFrame,
    n_factors: int = 3,
) -> dict[str, Any]:
    """PCA-based statistical factor model with risk decomposition.

    Extracts latent factors from the cross-section of asset returns using
    Principal Component Analysis (PCA), computed through an SVD of the
    demeaned return matrix. The first principal component typically
    captures market-wide movements; subsequent components capture sector,
    style, and other systematic effects.

    When to use:
        Use statistical factor models when you do not have a prior on
        which factors drive returns. PCA discovers the dominant sources
        of covariation. Useful for:
        - Constructing factor-mimicking portfolios.
        - Dimensionality reduction before portfolio optimisation.
        - Identifying hidden risk concentrations.

    Parameters:
        returns: DataFrame of asset returns (columns = assets, rows = dates).
            Rows containing any missing value are dropped. Should have at
            least ``n_factors + 1`` columns.
        n_factors: Number of principal components to extract (>= 1). 3-5 is
            typical for equity portfolios. Values larger than the number
            of available components are reduced to
            ``min(n_observations, n_assets)``.

    Returns:
        Dictionary containing:
        - **factors** (*pd.DataFrame*) -- Extracted factor return series
          (columns: PC1, PC2, ...), indexed like the cleaned input.
        - **loadings** (*np.ndarray*) -- Factor loadings matrix (n_assets x
          n_factors).
        - **explained_variance** (*np.ndarray*) -- Variance explained by each
          factor.
        - **explained_variance_ratio** (*np.ndarray*) -- Fraction of total
          variance explained by each factor. All zeros when the data has
          no variance at all.
        - **cumulative_variance_ratio** (*np.ndarray*) -- Cumulative fraction
          of variance explained.
        - **factor_risk** (*float*) -- Total fraction of variance explained by
          all extracted factors, capped at 1.0.
        - **specific_risk** (*float*) -- ``1 - factor_risk`` (never negative).

    Raises:
        ValueError: If ``n_factors`` is smaller than 1, if ``returns`` has
            no columns, or if fewer than two complete (NaN-free) rows
            remain -- a sample variance cannot be estimated from less.

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> market = np.random.normal(0, 0.01, 252)
        >>> returns = pd.DataFrame({
        ...     f"asset_{i}": market * (0.5 + i * 0.2) + np.random.normal(0, 0.005, 252)
        ...     for i in range(5)
        ... })
        >>> result = statistical_factor_model(returns, n_factors=2)
        >>> result["factor_risk"] > 0.3
        True

    See Also:
        factor_risk_model: When you know which factors to use.

    References:
        - Connor & Korajczyk (1986), "Performance Measurement with the
          Arbitrage Pricing Theory"
    """
    if n_factors < 1:
        raise ValueError(f"n_factors must be at least 1, got {n_factors}")

    clean = returns.dropna()
    X = clean.values
    n_obs, n_assets = X.shape
    if n_assets < 1:
        raise ValueError("returns must contain at least one asset column")
    if n_obs < 2:
        raise ValueError(
            "returns must contain at least two complete observations, "
            f"got {n_obs}"
        )

    # Demean each asset so the SVD yields principal components.
    X_centered = X - X.mean(axis=0)

    # SVD-based PCA: rows of Vt are the principal directions.
    _, S, Vt = np.linalg.svd(X_centered, full_matrices=False)

    # Squared singular values scaled by (n - 1) are the eigenvalues of the
    # sample covariance matrix, i.e. the variance along each component.
    eigenvalues = (S**2) / (n_obs - 1)
    total_var = float(eigenvalues.sum())

    # The SVD cannot supply more than min(n_obs, n_assets) components.
    n_factors = min(n_factors, n_obs, n_assets)

    # Loadings: one column per retained component (n_assets x n_factors).
    loadings = Vt[:n_factors].T

    # Factor returns: projections of the centered data onto the components.
    factor_names = [f"PC{i+1}" for i in range(n_factors)]
    factors_df = pd.DataFrame(
        X_centered @ loadings, index=clean.index, columns=factor_names
    )

    explained_var = eigenvalues[:n_factors]
    if total_var > 0:
        explained_ratio = explained_var / total_var
    else:
        # Constant data: nothing to explain. Report zeros rather than the
        # NaNs a 0/0 division would produce (same convention as
        # factor_risk_model).
        explained_ratio = np.zeros_like(explained_var)
    cumulative_ratio = np.cumsum(explained_ratio)

    # Round-off can push the cumulative sum a hair above 1; cap it so the
    # two shares stay within [0, 1].
    factor_risk = min(1.0, float(cumulative_ratio[-1]))
    specific_risk = 1.0 - factor_risk

    return {
        "factors": factors_df,
        "loadings": loadings,
        "explained_variance": explained_var,
        "explained_variance_ratio": explained_ratio,
        "cumulative_variance_ratio": cumulative_ratio,
        "factor_risk": factor_risk,
        "specific_risk": specific_risk,
    }




[docs]
def fama_french_regression(
    returns: pd.Series,
    factors_df: pd.DataFrame,
    *,
    periods_per_year: int = 252,
) -> dict[str, Any]:
    """Fama-French factor regression with full diagnostics.

    Regresses asset returns on named Fama-French factors (e.g., Mkt-RF,
    SMB, HML, RMW, CMA, Mom). Reports alpha, betas, t-statistics,
    p-values and R-squared. The alpha represents the return not explained
    by factor exposures -- a positive, statistically significant alpha
    indicates genuine skill.

    When to use:
        Use for performance attribution and alpha measurement. The classic
        3-factor model (Mkt, SMB, HML) is the minimum; the 5-factor model
        adds RMW (profitability) and CMA (investment). Add Mom (momentum)
        for the 6-factor model.

    Parameters:
        returns: Asset or portfolio return series (excess of risk-free rate
            if the factors are excess returns).
        factors_df: DataFrame of factor returns. Column names should be
            descriptive (e.g., "Mkt-RF", "SMB", "HML"). Rows with a
            missing value in the asset or in any factor are dropped after
            aligning on the index.
        periods_per_year: Number of return observations per year, used
            only to annualize ``residual_vol``. The default of 252 assumes
            daily data; pass 12 for monthly factor files.

    Returns:
        Dictionary containing:
        - **alpha** (*float*) -- Jensen's alpha (intercept, per period).
        - **betas** (*dict[str, float]*) -- Factor loadings.
        - **t_stats** (*dict[str, float]*) -- t-statistics for each
          coefficient (including alpha under key "alpha").
        - **p_values** (*dict[str, float]*) -- p-values for each
          coefficient (including alpha under key "alpha").
        - **r_squared** (*float*) -- R-squared.
        - **adj_r_squared** (*float*) -- Adjusted R-squared.
        - **residual_vol** (*float*) -- Annualized residual volatility
          (sample standard deviation times ``sqrt(periods_per_year)``).

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> mkt = np.random.normal(0.0005, 0.01, 252)
        >>> smb = np.random.normal(0, 0.005, 252)
        >>> hml = np.random.normal(0, 0.005, 252)
        >>> stock = (
        ...     0.0001 + 1.1 * mkt + 0.3 * smb - 0.2 * hml
        ...     + np.random.normal(0, 0.003, 252)
        ... )
        >>> factors = pd.DataFrame({"Mkt-RF": mkt, "SMB": smb, "HML": hml})
        >>> result = fama_french_regression(pd.Series(stock), factors)
        >>> abs(result["betas"]["Mkt-RF"] - 1.1) < 0.2
        True

    See Also:
        factor_risk_model: General factor regression with risk decomposition.

    References:
        - Fama & French (1993), "Common Risk Factors in the Returns on
          Stocks and Bonds"
        - Fama & French (2015), "A Five-Factor Asset Pricing Model"
    """
    # The asset is always the first column of the combined frame, so select
    # by position; selecting by the label "y" would break if a factor were
    # itself named "y".
    aligned = pd.concat([returns.rename("y"), factors_df], axis=1).dropna()
    y = aligned.iloc[:, 0].to_numpy()
    factor_names = list(aligned.columns[1:])
    X = aligned.iloc[:, 1:].to_numpy()

    # OLS via shared regression module (imported lazily, as elsewhere in
    # this module). Every per-coefficient array is indexed with the
    # intercept first, followed by the factors in column order.
    from wraquant.stats.regression import ols as _ols

    ols_result = _ols(y, X, add_constant=True)
    coeffs = ols_result["coefficients"]
    t_stats_arr = ols_result["t_stats"]
    p_values_arr = ols_result["p_values"]
    residuals = ols_result["residuals"]

    alpha_val = float(coeffs[0])
    betas = {name: float(coeffs[i + 1]) for i, name in enumerate(factor_names)}

    # Diagnostics share one label order: "alpha" first, then each factor.
    labels = ["alpha", *factor_names]
    t_stats_dict = {label: float(t_stats_arr[i]) for i, label in enumerate(labels)}
    p_values_dict = {label: float(p_values_arr[i]) for i, label in enumerate(labels)}

    residual_vol = float(np.std(residuals, ddof=1) * np.sqrt(periods_per_year))

    return {
        "alpha": alpha_val,
        "betas": betas,
        "t_stats": t_stats_dict,
        "p_values": p_values_dict,
        "r_squared": float(ols_result["r_squared"]),
        "adj_r_squared": float(ols_result["adj_r_squared"]),
        "residual_vol": residual_vol,
    }




[docs]
def factor_contribution(
    weights: np.ndarray,
    factor_betas: np.ndarray,
    factor_cov: np.ndarray,
) -> dict[str, Any]:
    """Split a portfolio's factor variance into one term per factor.

    The portfolio's exposure to each factor is the weighted sum of the
    assets' loadings. The factor variance of the portfolio is the
    quadratic form of that exposure vector with the factor covariance
    matrix, and each factor's contribution is its exposure times the
    matching entry of ``factor_cov @ exposure``.

    When to use:
        Run this after a factor model has been estimated to see which
        factors dominate portfolio risk. If most of the variance comes
        from one factor (say the market), hedging that factor alone
        removes most of the factor risk.

    Mathematical formulation:
        beta_p = B' * w
        Portfolio factor variance = beta_p' * Sigma_f * beta_p
        Factor i contribution = beta_p_i * (Sigma_f * beta_p)_i

    Parameters:
        weights: Portfolio weight vector (n_assets,).
        factor_betas: Factor loading matrix (n_assets x n_factors); row
            ``k`` holds asset ``k``'s exposures.
        factor_cov: Factor covariance matrix (n_factors x n_factors).

    Returns:
        Dictionary containing:
        - **total_factor_var** (*float*) -- Portfolio factor variance.
        - **total_factor_vol** (*float*) -- Square root of the factor
          variance, with a negative variance treated as zero.
        - **factor_contributions** (*np.ndarray*) -- Variance contribution
          of each factor (these add up to ``total_factor_var``).
        - **factor_pct_contributions** (*np.ndarray*) -- Contributions
          divided by ``total_factor_var`` (adding up to 1.0); all zeros
          when the factor variance is not positive.

    Example:
        >>> import numpy as np
        >>> weights = np.array([0.3, 0.3, 0.4])
        >>> betas = np.array([[1.0, 0.5], [1.2, -0.3], [0.8, 0.1]])
        >>> factor_cov = np.array([[0.0004, 0.00005], [0.00005, 0.0001]])
        >>> result = factor_contribution(weights, betas, factor_cov)
        >>> result["total_factor_var"] > 0
        True

    See Also:
        factor_risk_model: Estimate factor betas from return data.
        statistical_factor_model: Extract latent factors via PCA.
    """
    # Exposure of the whole portfolio to each factor, shape (n_factors,).
    exposure = factor_betas.T @ weights

    # Euler-style split: exposure_i * (Sigma_f @ exposure)_i per factor.
    cov_times_exposure = factor_cov @ exposure
    per_factor_var = exposure * cov_times_exposure

    # Quadratic form exposure' @ Sigma_f @ exposure.
    variance = float(exposure @ factor_cov @ exposure)
    volatility = float(np.sqrt(max(0, variance)))

    # Shares are only meaningful when there is variance to divide by.
    shares = (
        per_factor_var / variance
        if variance > 0
        else np.zeros_like(per_factor_var)
    )

    return {
        "total_factor_var": variance,
        "total_factor_vol": volatility,
        "factor_contributions": per_factor_var,
        "factor_pct_contributions": shares,
    }