# Source code for wraquant.ml.evaluation

"""Model evaluation utilities for financial machine learning.

Provides both standard classification metrics and finance-specific
performance measures such as Sharpe ratio from predictions and backtesting
with transaction costs.
"""

from __future__ import annotations

from typing import Any, Sequence

import numpy as np
import pandas as pd

from wraquant.core.decorators import requires_extra

__all__ = [
    "classification_metrics",
    "financial_metrics",
    "learning_curve",
    "backtest_predictions",
]


# ---------------------------------------------------------------------------
# Classification metrics
# ---------------------------------------------------------------------------



# [docs]
def classification_metrics(
    y_true: pd.Series | np.ndarray,
    y_pred: pd.Series | np.ndarray,
    y_prob: pd.Series | np.ndarray | None = None,
) -> dict[str, float]:
    """Compute standard classification metrics.

    Use classification metrics to evaluate direction-prediction models
    (e.g., predicting up/down/flat labels).  These metrics assess the
    statistical quality of the classifier independently of PnL; pair
    with ``financial_metrics`` for economic evaluation.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.
    y_prob : array-like or None
        Predicted probabilities.  When provided, log-loss (and, where
        applicable, AUC) are included.

        * 1-D: probability of the *positive* class.  If every true label
          is ``0`` or ``1`` the labels are used as-is; otherwise the
          largest label found in ``y_true``/``y_pred`` is treated as the
          positive class (so ``-1``/``+1`` labels work as expected).
        * 2-D: one column per class, with columns ordered like the
          sorted unique labels found in ``y_true`` and ``y_pred``.

    Returns
    -------
    dict[str, float]
        ``accuracy`` : float
            Fraction of correct predictions.
        ``precision`` : float
            Macro-averaged precision (unweighted mean of the per-class
            precisions; a class that is never predicted contributes 0).
        ``recall`` : float
            Macro-averaged recall (unweighted mean of the per-class
            recalls; a class with no true samples contributes 0).
        ``f1`` : float
            Macro-averaged F1 score (mean of the per-class harmonic
            means of precision and recall).
        ``log_loss`` : float (only if *y_prob* given)
            Cross-entropy loss.  Lower is better; measures calibration
            quality.  Probabilities are clipped to
            ``[1e-15, 1 - 1e-15]`` before taking logs.
        ``auc`` : float (only if *y_prob* is 1-D and exactly two labels
            are present)
            Area under the ROC curve.  0.5 = random, 1.0 = perfect.

    Example
    -------
    >>> import numpy as np
    >>> y_true = np.array([1, 0, 1, 1, 0, 1])
    >>> y_pred = np.array([1, 0, 0, 1, 0, 1])
    >>> metrics = classification_metrics(y_true, y_pred)
    >>> metrics['accuracy']
    0.8333333333333334
    >>> metrics['f1'] > 0.5
    True

    See Also
    --------
    financial_metrics : PnL-based evaluation of directional predictions.
    backtest_predictions : Full backtest with transaction costs.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    accuracy = float((y_true == y_pred).mean())

    # Per-class precision / recall / F1, then macro-average.  The label
    # set is the union of true and predicted labels so that a class that
    # only appears in the predictions still counts against precision.
    classes = np.unique(np.concatenate([y_true, y_pred]))
    precisions: list[float] = []
    recalls: list[float] = []
    f1s: list[float] = []

    for c in classes:
        is_pred = y_pred == c
        is_true = y_true == c
        tp = int(np.sum(is_pred & is_true))
        fp = int(np.sum(is_pred & ~is_true))
        fn = int(np.sum(~is_pred & is_true))

        # Undefined ratios (empty denominators) are scored as 0.
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0

        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)

    result: dict[str, float] = {
        "accuracy": accuracy,
        "precision": float(np.mean(precisions)),
        "recall": float(np.mean(recalls)),
        "f1": float(np.mean(f1s)),
    }

    if y_prob is not None:
        y_prob = np.asarray(y_prob)
        # Clip so that log(0) can never occur.
        eps = 1e-15
        p = np.clip(y_prob, eps, 1 - eps)

        if y_prob.ndim == 1:
            # Binary log-loss needs a 0/1 indicator of the positive class.
            # Raw labels are only usable directly when they already are
            # 0/1; for anything else (e.g. -1/+1) binarise against the
            # largest label, the same convention the AUC helper uses.
            if bool(np.all(np.isin(y_true, (0, 1)))):
                y_bin = y_true.astype(float)
            else:
                y_bin = (y_true == classes[-1]).astype(float)
            ll = -float(np.mean(y_bin * np.log(p) + (1 - y_bin) * np.log(1 - p)))
        else:
            # Multi-class: column i of y_prob is matched to classes[i].
            one_hot = np.zeros_like(p)
            for i, c in enumerate(classes):
                one_hot[y_true == c, i] = 1.0
            ll = -float(np.mean(np.sum(one_hot * np.log(p), axis=1)))
        result["log_loss"] = ll

        # AUC (binary only)
        if y_prob.ndim == 1 and len(classes) == 2:
            result["auc"] = _auc_binary(y_true, y_prob, classes)

    return result



def _auc_binary(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    classes: np.ndarray,
) -> float:
    """Compute the binary ROC AUC using the trapezoidal rule.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_prob : array-like
        Scores / probabilities for the positive class.
    classes : sequence
        The two labels; ``classes[1]`` is treated as the positive class.

    Returns
    -------
    float
        Area under the ROC curve.  When the positive or the negative
        class has no samples the corresponding rate is all zeros, so the
        result is ``0.0``.

    Notes
    -----
    ROC points are emitted only where the score changes, so samples with
    tied scores move the curve together and the result does not depend
    on the order of the input.
    """
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob, dtype=float)

    # Map labels to 0/1
    pos_label = classes[1]
    y_bin = (y_true == pos_label).astype(int)

    # Sort by descending score
    order = np.argsort(-y_prob, kind="stable")
    y_sorted = y_bin[order]
    p_sorted = y_prob[order]

    # Index of the last sample in every run of equal scores; a threshold
    # can only be placed between distinct scores.
    run_ends = np.flatnonzero(p_sorted[1:] != p_sorted[:-1])
    cut = np.concatenate([run_ends, [p_sorted.size - 1]])

    tps = np.cumsum(y_sorted)[cut]
    fps = (cut + 1) - tps  # samples seen so far minus true positives

    tpr = tps / tps[-1] if tps[-1] > 0 else np.zeros(tps.shape)
    fpr = fps / fps[-1] if fps[-1] > 0 else np.zeros(fps.shape)

    # Prepend origin
    tpr = np.concatenate([[0.0], tpr])
    fpr = np.concatenate([[0.0], fpr])

    # Explicit trapezoid sum: np.trapezoid only exists in NumPy >= 2.0 and
    # np.trapz is deprecated there, so avoid depending on either name.
    area = np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0)
    return float(area)


# ---------------------------------------------------------------------------
# Financial metrics
# ---------------------------------------------------------------------------



# [docs]
def financial_metrics(
    y_true: pd.Series | np.ndarray,
    y_pred: pd.Series | np.ndarray,
    returns: pd.Series | np.ndarray,
) -> dict[str, float]:
    """Score directional predictions by the PnL they would have earned.

    Accuracy alone can hide a model that is right on small moves and
    wrong on large ones.  This function instead treats each prediction
    as a position (``1`` long, ``-1`` short, ``0`` flat), multiplies it
    by the realised return of the same period, and summarises the
    resulting per-period strategy returns.  No transaction costs are
    applied here.

    Parameters
    ----------
    y_true : array-like
        True directional labels.
    y_pred : array-like
        Predicted directional labels, used directly as position signals.
    returns : array-like
        Realised period returns aligned with the observations.

    Returns
    -------
    dict[str, float]
        ``strategy_return`` : float
            NaN-ignoring sum of ``signal * return`` (additive, not
            compounded).
        ``sharpe`` : float
            Value returned by ``wraquant.risk.metrics.sharpe_ratio`` for
            the strategy returns, called with its default arguments;
            ``0.0`` when there are fewer than two observations.
            (Documented upstream as annualised over 252 trading days --
            confirm against that function.)
        ``hit_rate`` : float
            Fraction of periods where ``sign(y_pred)`` equals
            ``sign(y_true)``.  A hit rate above 0.5 is necessary but not
            sufficient for profitability.
        ``profit_factor`` : float
            Sum of positive strategy returns divided by the absolute sum
            of negative ones; ``inf`` when there are no losing periods.

    Example
    -------
    >>> import numpy as np
    >>> y_true = np.array([1, -1, 1, 1, -1])
    >>> y_pred = np.array([1, -1, -1, 1, 1])
    >>> returns = np.array([0.02, -0.01, 0.015, 0.005, -0.02])
    >>> metrics = financial_metrics(y_true, y_pred, returns)
    >>> metrics['hit_rate']
    0.6
    >>> metrics['sharpe'] != 0
    True

    See Also
    --------
    classification_metrics : Standard ML classification metrics.
    backtest_predictions : Full backtest with transaction costs.
    """
    labels = np.asarray(y_true, dtype=float)
    signals = np.asarray(y_pred, dtype=float)
    period_returns = np.asarray(returns, dtype=float)

    # Per-period PnL of holding `signal` units through each period.
    pnl = signals * period_returns
    total_pnl = float(np.nansum(pnl))

    # Imported lazily; the Sharpe computation is delegated to the shared
    # risk-metrics implementation rather than re-derived here.
    from wraquant.risk.metrics import sharpe_ratio as _sharpe_ratio

    if len(pnl) > 1:
        sharpe_value = _sharpe_ratio(pd.Series(pnl))
    else:
        sharpe_value = 0.0

    # Directional agreement between prediction and truth.
    same_sign = np.sign(signals) == np.sign(labels)
    fraction_hit = float(np.nanmean(same_sign))

    # Gross winners over gross losers.
    winners = pnl[pnl > 0]
    losers = pnl[pnl < 0]
    total_won = float(np.nansum(winners))
    total_lost = float(np.abs(np.nansum(losers)))
    if total_lost > 0:
        ratio = float(total_won / total_lost)
    else:
        ratio = float("inf")

    return {
        "strategy_return": total_pnl,
        "sharpe": sharpe_value,
        "hit_rate": fraction_hit,
        "profit_factor": ratio,
    }



# ---------------------------------------------------------------------------
# Learning curve
# ---------------------------------------------------------------------------



# [docs]
@requires_extra("ml")
def learning_curve(
    model: Any,
    X: pd.DataFrame | np.ndarray,
    y: pd.Series | np.ndarray,
    train_sizes: Sequence[int | float] | np.ndarray | None = None,
    cv: int = 5,
) -> dict[str, np.ndarray]:
    """Compute cross-validated train/test scores at growing training sizes.

    Learning curves help tell underfitting from overfitting: train and
    test scores that converge at a low value point to a model that is
    too simple (high bias), while a persistent gap between them points
    to overfitting (high variance), where more data or regularisation
    may help.

    Parameters
    ----------
    model : estimator
        A scikit-learn-compatible estimator.
    X : pd.DataFrame or np.ndarray
        Feature matrix; converted with ``np.asarray`` before use.
    y : pd.Series or np.ndarray
        Target vector; converted with ``np.asarray`` before use.
    train_sizes : Sequence or None
        Training set sizes (absolute counts or fractions).  Defaults to
        ``np.linspace(0.1, 1.0, 10)``.
    cv : int
        Number of cross-validation folds.

    Returns
    -------
    dict
        ``train_sizes`` : np.ndarray
            Absolute number of training samples at each point.
        ``train_scores`` : np.ndarray, shape ``(len(sizes), cv)``
            Training scores at each size/fold.  Plot the mean across
            folds to visualize training performance.
        ``test_scores`` : np.ndarray, shape ``(len(sizes), cv)``
            Test scores at each size/fold.  The gap between train and
            test mean scores indicates overfitting.

    Notes
    -----
    This is a thin wrapper around
    ``sklearn.model_selection.learning_curve`` and always runs with
    ``n_jobs=1``.  The ``requires_extra("ml")`` decorator presumably
    guards the optional scikit-learn dependency -- confirm in
    ``wraquant.core.decorators``.

    Example
    -------
    >>> from sklearn.linear_model import Ridge
    >>> import numpy as np
    >>> X = np.random.randn(300, 5)
    >>> y = X @ [1, 0.5, 0, 0, 0] + np.random.randn(300) * 0.1
    >>> result = learning_curve(Ridge(), X, y, cv=3)
    >>> result['train_sizes'].shape[0]  # 10 points by default
    10

    See Also
    --------
    classification_metrics : Evaluate classification quality.
    financial_metrics : Evaluate economic value of predictions.
    """
    from sklearn.model_selection import learning_curve as _lc

    # Ten evenly spaced fractions from 10% to 100% unless the caller
    # supplied explicit sizes.
    requested_sizes = (
        np.linspace(0.1, 1.0, 10) if train_sizes is None else train_sizes
    )

    features = np.asarray(X)
    targets = np.asarray(y)

    abs_sizes, fit_scores, holdout_scores = _lc(
        model,
        features,
        targets,
        train_sizes=np.asarray(requested_sizes),
        cv=cv,
        n_jobs=1,
    )

    curve: dict[str, np.ndarray] = {}
    curve["train_sizes"] = abs_sizes
    curve["train_scores"] = fit_scores
    curve["test_scores"] = holdout_scores
    return curve



# ---------------------------------------------------------------------------
# Backtest predictions
# ---------------------------------------------------------------------------



# [docs]
def backtest_predictions(
    predictions: pd.Series | np.ndarray,
    returns: pd.Series | np.ndarray,
    cost_bps: float = 10,
) -> dict[str, Any]:
    """Backtest a prediction signal against actual returns.

    Use backtest_predictions as a quick sanity check of a model's
    economic value before building a full backtest.  It applies
    transaction costs proportional to position changes and computes key
    performance metrics including Sharpe, max drawdown, and turnover.

    Parameters
    ----------
    predictions : array-like
        Predicted position signals (e.g. 1, 0, -1).  The signal is
        applied as a position: ``signal * return``.
    returns : array-like
        Actual period returns corresponding to each prediction.
    cost_bps : float
        Transaction cost in basis points applied on each position
        change (default 10 bps).  For equities, 5-10 bps is typical;
        for futures, 1-3 bps.

    Returns
    -------
    dict
        ``gross_returns`` : np.ndarray
            Per-period strategy returns before costs.
        ``net_returns`` : np.ndarray
            Per-period strategy returns after costs.
        ``cumulative_return`` : float
            NaN-ignoring sum of the net returns (additive, not
            compounded).  Positive = profitable.
        ``sharpe`` : float
            Mean over sample standard deviation (``ddof=1``) of the net
            returns, scaled by ``sqrt(252)``.  ``0.0`` when there are
            fewer than two observations or the deviation is not
            positive.
        ``max_drawdown`` : float
            Largest decline of the cumulative net PnL below its running
            peak, where the peak starts at the initial equity of 0.
            Always negative or zero; a strategy that loses from the
            very first period therefore reports that loss.
        ``turnover`` : float
            Mean absolute position change per period (the first period
            counts as a change from a flat position).  ``0.0`` for empty
            input.

    Example
    -------
    >>> import numpy as np
    >>> preds = np.array([1, 1, -1, 1, -1, 0, 1])
    >>> rets = np.array([0.01, -0.005, -0.02, 0.015, 0.01, 0.005, 0.008])
    >>> result = backtest_predictions(preds, rets, cost_bps=10)
    >>> result['cumulative_return'] != 0
    True
    >>> result['max_drawdown'] <= 0
    True

    See Also
    --------
    financial_metrics : Quick financial metrics without transaction costs.
    wraquant.ml.pipeline.walk_forward_backtest : Walk-forward backtest.
    """
    preds = np.asarray(predictions, dtype=float)
    rets = np.asarray(returns, dtype=float)

    gross = preds * rets

    # Transaction costs: charged on the absolute change in position.
    # prepend=0 means entering the first position from flat is charged.
    position_changes = np.abs(np.diff(preds, prepend=0))
    costs = position_changes * (cost_bps / 10_000.0)
    net = gross - costs

    cumulative = float(np.nansum(net))

    # A sample standard deviation needs at least two observations.
    sharpe = 0.0
    if len(net) > 1:
        mean_r = np.nanmean(net)
        std_r = np.nanstd(net, ddof=1)
        if std_r > 0:  # False for NaN as well as for zero
            sharpe = float(mean_r / std_r * np.sqrt(252))

    # Max drawdown on the cumulative curve.  The high-water mark is
    # floored at 0 (the starting equity); otherwise an initial loss
    # would become the first "peak" and never register as a drawdown.
    cum_curve = np.nancumsum(net)
    running_max = np.maximum(np.maximum.accumulate(cum_curve), 0.0)
    drawdowns = cum_curve - running_max
    max_dd = float(np.min(drawdowns)) if len(drawdowns) > 0 else 0.0

    # Avoid the NaN / RuntimeWarning of a mean over an empty array.
    turnover = float(np.nanmean(position_changes)) if position_changes.size else 0.0

    return {
        "gross_returns": gross,
        "net_returns": net,
        "cumulative_return": cumulative,
        "sharpe": sharpe,
        "max_drawdown": max_dd,
        "turnover": turnover,
    }