"""Model evaluation utilities for financial machine learning.
Provides both standard classification metrics and finance-specific
performance measures such as Sharpe ratio from predictions and backtesting
with transaction costs.
"""
from __future__ import annotations
from typing import Any, Sequence
import numpy as np
import pandas as pd
from wraquant.core.decorators import requires_extra
# Public API: the names exported by a star-import of this module.
# The private helper ``_auc_binary`` is not listed.
__all__ = [
"classification_metrics",
"financial_metrics",
"learning_curve",
"backtest_predictions",
]
# ---------------------------------------------------------------------------
# Classification metrics
# ---------------------------------------------------------------------------
[docs]
def classification_metrics(
    y_true: pd.Series | np.ndarray,
    y_pred: pd.Series | np.ndarray,
    y_prob: pd.Series | np.ndarray | None = None,
) -> dict[str, float]:
    """Compute standard classification metrics.

    Use classification metrics to evaluate direction-prediction models
    (e.g., predicting up/down/flat labels).  These metrics assess the
    statistical quality of the classifier independently of PnL; pair
    with ``financial_metrics`` for economic evaluation.

    Parameters
    ----------
    y_true : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels.
    y_prob : array-like or None
        Predicted probabilities.  A 1-D array is read as the probability
        of the positive class in a binary problem.  The positive class
        is ``1`` when the true labels are already within ``{0, 1}``;
        otherwise it is the largest label found in *y_true* (so ``1``
        for ``{-1, 1}`` labels).  A 2-D array is read as one column per
        class, with columns ordered like the sorted unique labels found
        in *y_true* and *y_pred*.

    Returns
    -------
    dict[str, float]
        ``accuracy`` : float
            Fraction of correct predictions.
        ``precision`` : float
            Macro-averaged precision (how many predicted positives are
            actually positive).
        ``recall`` : float
            Macro-averaged recall (how many actual positives are
            captured).
        ``f1`` : float
            Macro-averaged F1 score (harmonic mean of precision and
            recall).
        ``log_loss`` : float (only if *y_prob* given)
            Cross-entropy loss.  Lower is better; measures calibration
            quality.
        ``auc`` : float (only if *y_prob* is 1-D and exactly two labels occur)
            Area under the ROC curve.  0.5 = random, 1.0 = perfect.

    Example
    -------
    >>> import numpy as np
    >>> y_true = np.array([1, 0, 1, 1, 0, 1])
    >>> y_pred = np.array([1, 0, 0, 1, 0, 1])
    >>> metrics = classification_metrics(y_true, y_pred)
    >>> metrics['accuracy']
    0.8333333333333334
    >>> metrics['f1'] > 0.5
    True

    See Also
    --------
    financial_metrics : PnL-based evaluation of directional predictions.
    backtest_predictions : Full backtest with transaction costs.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    accuracy = float((y_true == y_pred).mean())

    # Per-class precision / recall / F1, then macro-average.  A class with
    # no predicted (or no actual) members contributes 0.0 rather than NaN.
    classes = np.unique(np.concatenate([y_true, y_pred]))
    precisions: list[float] = []
    recalls: list[float] = []
    f1s: list[float] = []
    for c in classes:
        pred_is_c = y_pred == c
        true_is_c = y_true == c
        tp = int(np.sum(pred_is_c & true_is_c))
        fp = int(np.sum(pred_is_c & ~true_is_c))
        fn = int(np.sum(~pred_is_c & true_is_c))
        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0
        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)

    result: dict[str, float] = {
        "accuracy": accuracy,
        "precision": float(np.mean(precisions)),
        "recall": float(np.mean(recalls)),
        "f1": float(np.mean(f1s)),
    }

    if y_prob is not None:
        y_prob = np.asarray(y_prob)
        # Clip so that log(0) can never occur.
        eps = 1e-15
        p = np.clip(y_prob, eps, 1 - eps)

        if y_prob.ndim == 1:
            # Binary classification.  The cross-entropy formula needs a 0/1
            # target, so labels such as {-1, 1} must be binarised first;
            # plugging them in raw would produce a meaningless number.
            true_labels = np.unique(y_true)
            if set(true_labels.tolist()) <= {0, 1}:
                y_bin = y_true.astype(float)
            else:
                y_bin = (y_true == true_labels[-1]).astype(float)
            ll = -float(
                np.mean(y_bin * np.log(p) + (1 - y_bin) * np.log(1 - p))
            )
        else:
            # Multi-class: column i is taken to belong to classes[i].
            one_hot = np.zeros_like(p)
            for i, c in enumerate(classes):
                one_hot[y_true == c, i] = 1.0
            ll = -float(np.mean(np.sum(one_hot * np.log(p), axis=1)))
        result["log_loss"] = ll

        # AUC (binary only)
        if y_prob.ndim == 1 and len(classes) == 2:
            result["auc"] = _auc_binary(y_true, y_prob, classes)

    return result
def _auc_binary(
y_true: np.ndarray,
y_prob: np.ndarray,
classes: np.ndarray,
) -> float:
"""Compute AUC for binary classification using the trapezoidal rule."""
# Map labels to 0/1
pos_label = classes[1]
y_bin = (y_true == pos_label).astype(int)
# Sort by descending probability
order = np.argsort(-y_prob)
y_sorted = y_bin[order]
tps = np.cumsum(y_sorted)
fps = np.cumsum(1 - y_sorted)
tpr = tps / tps[-1] if tps[-1] > 0 else tps
fpr = fps / fps[-1] if fps[-1] > 0 else fps
# Prepend origin
tpr = np.concatenate([[0], tpr])
fpr = np.concatenate([[0], fpr])
return float(np.trapezoid(tpr, fpr))
# ---------------------------------------------------------------------------
# Financial metrics
# ---------------------------------------------------------------------------
[docs]
def financial_metrics(
    y_true: pd.Series | np.ndarray,
    y_pred: pd.Series | np.ndarray,
    returns: pd.Series | np.ndarray,
) -> dict[str, float]:
    """Compute finance-specific evaluation metrics from predictions.

    Use financial metrics to evaluate whether a model's predictions
    translate into actual trading profits.  A model can have high
    accuracy but poor financial performance if it is right on small moves
    and wrong on large moves.  These metrics directly measure economic
    value.

    The predicted labels are interpreted as position signals: ``1`` for
    long, ``-1`` for short, ``0`` for flat.

    Parameters
    ----------
    y_true : array-like
        True directional labels.
    y_pred : array-like
        Predicted directional labels (used as signals).
    returns : array-like
        Actual period returns corresponding to each observation.

    Returns
    -------
    dict[str, float]
        ``strategy_return`` : float
            Cumulative strategy return (sum of signal * return, NaNs
            ignored).
        ``sharpe`` : float
            Sharpe ratio of the per-period strategy returns, as returned
            by ``wraquant.risk.metrics.sharpe_ratio`` called with its
            defaults (annualised over 252 trading days according to this
            module's documentation).  ``0.0`` when fewer than two
            observations are supplied.
        ``hit_rate`` : float
            Fraction of periods where predicted sign matches actual
            sign.  Periods where either label is NaN are skipped; the
            value is NaN when no period has both labels.  A hit rate
            above 0.5 is necessary but not sufficient for profitability.
        ``profit_factor`` : float
            Gross profit / gross loss.  ``inf`` when there is no losing
            period.  Values above 1.0 indicate a profitable strategy;
            above 2.0 is strong.

    Example
    -------
    >>> import numpy as np
    >>> y_true = np.array([1, -1, 1, 1, -1])
    >>> y_pred = np.array([1, -1, -1, 1, 1])
    >>> returns = np.array([0.02, -0.01, 0.015, 0.005, -0.02])
    >>> metrics = financial_metrics(y_true, y_pred, returns)
    >>> metrics['hit_rate']
    0.6
    >>> metrics['sharpe'] != 0
    True

    See Also
    --------
    classification_metrics : Standard ML classification metrics.
    backtest_predictions : Full backtest with transaction costs.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    returns = np.asarray(returns, dtype=float)

    strat_returns = y_pred * returns
    cumulative = float(np.nansum(strat_returns))

    # Use canonical Sharpe implementation (imported lazily, as before).
    from wraquant.risk.metrics import sharpe_ratio as _sharpe_ratio

    sharpe = (
        _sharpe_ratio(pd.Series(strat_returns))
        if len(strat_returns) > 1
        else 0.0
    )

    # Hit rate: how often the predicted direction matches the actual one.
    # A boolean comparison never yields NaN, so NaN labels have to be
    # masked out explicitly; otherwise they silently count as misses.
    observed = ~(np.isnan(y_true) | np.isnan(y_pred))
    if observed.any():
        hits = np.sign(y_pred[observed]) == np.sign(y_true[observed])
        hit_rate = float(hits.mean())
    else:
        hit_rate = float("nan")

    # Profit factor (NaN periods fail both comparisons and are excluded).
    gross_profit = float(np.nansum(strat_returns[strat_returns > 0]))
    gross_loss = float(np.abs(np.nansum(strat_returns[strat_returns < 0])))
    profit_factor = (
        float(gross_profit / gross_loss) if gross_loss > 0 else float("inf")
    )

    return {
        "strategy_return": cumulative,
        "sharpe": sharpe,
        "hit_rate": hit_rate,
        "profit_factor": profit_factor,
    }
# ---------------------------------------------------------------------------
# Learning curve
# ---------------------------------------------------------------------------
[docs]
@requires_extra("ml")
def learning_curve(
    model: Any,
    X: pd.DataFrame | np.ndarray,
    y: pd.Series | np.ndarray,
    train_sizes: Sequence[int | float] | np.ndarray | None = None,
    cv: int = 5,
) -> dict[str, np.ndarray]:
    """Score a model on growing training sets to build a learning curve.

    Learning curves help tell underfitting from overfitting: train and
    test scores that meet at a low level point to a model that is too
    simple, while a persistent gap between them points to overfitting
    (more data or stronger regularisation is needed).

    The work is delegated to
    ``sklearn.model_selection.learning_curve``, run with ``n_jobs=1``
    on array copies/views of *X* and *y*.

    Parameters
    ----------
    model : estimator
        A scikit-learn-compatible estimator.
    X : pd.DataFrame or np.ndarray
        Feature matrix.
    y : pd.Series or np.ndarray
        Target vector.
    train_sizes : Sequence or None
        Training set sizes (absolute counts or fractions).  ``None``
        selects ``np.linspace(0.1, 1.0, 10)``.
    cv : int
        Number of cross-validation folds.

    Returns
    -------
    dict
        ``train_sizes`` : np.ndarray
            Absolute number of training samples at each point.
        ``train_scores`` : np.ndarray, shape ``(len(sizes), cv)``
            Training scores per size and fold.
        ``test_scores`` : np.ndarray, shape ``(len(sizes), cv)``
            Test scores per size and fold.

    Example
    -------
    >>> from sklearn.linear_model import Ridge
    >>> import numpy as np
    >>> X = np.random.randn(300, 5)
    >>> y = X @ [1, 0.5, 0, 0, 0] + np.random.randn(300) * 0.1
    >>> result = learning_curve(Ridge(), X, y, cv=3)
    >>> result['train_sizes'].shape[0]  # 10 points by default
    10

    See Also
    --------
    classification_metrics : Evaluate classification quality.
    financial_metrics : Evaluate economic value of predictions.
    """
    from sklearn.model_selection import learning_curve as _sk_learning_curve

    size_grid = np.linspace(0.1, 1.0, 10) if train_sizes is None else train_sizes

    abs_sizes, fit_scores, holdout_scores = _sk_learning_curve(
        model,
        np.asarray(X),
        np.asarray(y),
        train_sizes=np.asarray(size_grid),
        cv=cv,
        n_jobs=1,
    )

    curve: dict[str, np.ndarray] = {}
    curve["train_sizes"] = abs_sizes
    curve["train_scores"] = fit_scores
    curve["test_scores"] = holdout_scores
    return curve
# ---------------------------------------------------------------------------
# Backtest predictions
# ---------------------------------------------------------------------------
[docs]
def backtest_predictions(
    predictions: pd.Series | np.ndarray,
    returns: pd.Series | np.ndarray,
    cost_bps: float = 10,
    periods_per_year: float = 252,
) -> dict[str, Any]:
    """Backtest a prediction signal against actual returns.

    Use backtest_predictions as a quick sanity check of a model's
    economic value before building a full backtest.  It applies
    transaction costs proportional to position changes and computes key
    performance metrics including Sharpe, max drawdown, and turnover.

    Parameters
    ----------
    predictions : array-like
        Predicted position signals (e.g. 1, 0, -1).  The signal is
        applied as a position: ``signal * return``.
    returns : array-like
        Actual period returns corresponding to each prediction.
    cost_bps : float
        Transaction cost in basis points applied on each position
        change (default 10 bps).  The book starts flat, so entering the
        first position is charged as well.  For equities, 5-10 bps is
        typical; for futures, 1-3 bps.
    periods_per_year : float
        Number of return periods per year used to annualise the Sharpe
        ratio (default 252, i.e. daily data).

    Returns
    -------
    dict
        ``gross_returns`` : np.ndarray
            Per-period strategy returns before costs.
        ``net_returns`` : np.ndarray
            Per-period strategy returns after costs.
        ``cumulative_return`` : float
            Total cumulative net return (sum, NaNs ignored).
            Positive = profitable.
        ``sharpe`` : float
            Annualised Sharpe ratio of net returns; ``0.0`` when the
            standard deviation is zero or undefined.  Above 1.0 is
            generally good; above 2.0 is excellent.
        ``max_drawdown`` : float
            Maximum peak-to-trough decline in cumulative PnL, where the
            starting PnL of zero counts as the first peak.  Always
            negative or zero.
        ``turnover`` : float
            Mean absolute position change per period.  Higher turnover
            means higher transaction costs.

    Example
    -------
    >>> import numpy as np
    >>> preds = np.array([1, 1, -1, 1, -1, 0, 1])
    >>> rets = np.array([0.01, -0.005, -0.02, 0.015, 0.01, 0.005, 0.008])
    >>> result = backtest_predictions(preds, rets, cost_bps=10)
    >>> result['cumulative_return'] != 0
    True
    >>> result['max_drawdown'] <= 0
    True

    See Also
    --------
    financial_metrics : Quick financial metrics without transaction costs.
    wraquant.ml.pipeline.walk_forward_backtest : Walk-forward backtest.
    """
    preds = np.asarray(predictions, dtype=float)
    rets = np.asarray(returns, dtype=float)

    gross = preds * rets

    # Transaction costs: ``prepend=0`` models a flat starting position.
    position_changes = np.abs(np.diff(preds, prepend=0))
    costs = position_changes * (cost_bps / 10_000.0)
    net = gross - costs

    cumulative = float(np.nansum(net))

    mean_r = np.nanmean(net)
    std_r = np.nanstd(net, ddof=1) if len(net) > 1 else np.nan
    # ``nan > 0`` is False, so an undefined deviation also yields 0.0.
    if std_r > 0:
        sharpe = float(mean_r / std_r * np.sqrt(periods_per_year))
    else:
        sharpe = 0.0

    # Max drawdown on the cumulative curve.  The peak is floored at the
    # starting PnL of 0; without the floor, losses taken before the first
    # gain would not register as a drawdown at all.
    cum_curve = np.nancumsum(net)
    running_max = np.maximum(np.maximum.accumulate(cum_curve), 0.0)
    drawdowns = cum_curve - running_max
    max_dd = float(np.min(drawdowns)) if len(drawdowns) > 0 else 0.0

    turnover = float(np.nanmean(position_changes))

    return {
        "gross_returns": gross,
        "net_returns": net,
        "cumulative_return": cumulative,
        "sharpe": sharpe,
        "max_drawdown": max_dd,
        "turnover": turnover,
    }