# Source code for wraquant.ml.features

"""Feature engineering utilities for financial machine learning.

All functions in this module use only numpy and pandas -- no external TA
libraries are required.
"""

from __future__ import annotations

from typing import Sequence

import numpy as np
import pandas as pd

# Public API: the names exported by ``from wraquant.ml.features import *``.
__all__ = [
    "rolling_features",
    "return_features",
    "technical_features",
    "ta_features",
    "volatility_features",
    "microstructure_features",
    "label_fixed_horizon",
    "label_triple_barrier",
    "interaction_features",
    "cross_asset_features",
    "regime_features",
]


# ---------------------------------------------------------------------------
# Rolling statistics
# ---------------------------------------------------------------------------


def rolling_features(
    data: pd.Series | pd.DataFrame,
    windows: Sequence[int] = (5, 10, 21, 63),
) -> pd.DataFrame:
    """Generate rolling statistical features for each window length.

    Use rolling features as a general-purpose feature engineering step
    before training ML models on time-series data.  For every window the
    following statistics are computed: mean, std, skew, kurtosis, min, and
    max.

    Parameters
    ----------
    data : pd.Series or pd.DataFrame
        Numeric time-series data.  If a DataFrame is passed, features are
        generated independently for each column.
    windows : Sequence[int]
        Rolling-window sizes (default ``(5, 10, 21, 63)``).

    Returns
    -------
    pd.DataFrame
        DataFrame whose columns are named ``{col}_{stat}_w{window}`` (or
        ``{stat}_w{window}`` when *data* is a Series), where ``stat`` is one
        of ``mean``, ``std``, ``skew``, ``kurt``, ``min``, ``max``.  Columns
        are ordered by window, then input column, then statistic.  Rows
        with fewer than ``window`` observations are NaN.  If *windows* is
        empty (or *data* has no columns) an empty DataFrame sharing the
        input index is returned.

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> np.random.seed(0)
    >>> returns = pd.Series(np.random.randn(100) * 0.01, name='ret')
    >>> feats = rolling_features(returns, windows=(5, 21))
    >>> feats.columns.tolist()[:3]
    ['mean_w5', 'std_w5', 'skew_w5']
    >>> feats.shape[1]  # 6 stats * 2 windows
    12

    See Also
    --------
    return_features : Lagged and cumulative return features.
    volatility_features : Realised volatility and vol-of-vol features.
    """
    if isinstance(data, pd.Series):
        data = data.to_frame(name=data.name or "value")
        was_series = True
    else:
        was_series = False

    frames: list[pd.DataFrame] = []
    for w in windows:
        # min_periods=w keeps partially-filled windows as NaN.
        roll = data.rolling(window=w, min_periods=w)
        # Insertion order of this dict fixes the column order of the output.
        stat_frames = {
            "mean": roll.mean(),
            "std": roll.std(),
            "skew": roll.apply(lambda x: x.skew(), raw=False),
            "kurt": roll.apply(lambda x: x.kurt(), raw=False),
            "min": roll.min(),
            "max": roll.max(),
        }
        for col in data.columns:
            prefix = "" if was_series else f"{col}_"
            for stat_name, stat_df in stat_frames.items():
                frames.append(
                    stat_df[[col]].rename(
                        columns={col: f"{prefix}{stat_name}_w{w}"}
                    )
                )

    if not frames:
        # pd.concat raises ValueError on an empty list; return an empty
        # feature frame aligned to the input instead.
        return pd.DataFrame(index=data.index)
    return pd.concat(frames, axis=1)
# ---------------------------------------------------------------------------
# Return-based features
# ---------------------------------------------------------------------------
def return_features(
    prices: pd.Series,
    lags: Sequence[int] = (1, 2, 3, 5, 10, 21),
) -> pd.DataFrame:
    """Build lagged and cumulative log-return features from a price series.

    Parameters
    ----------
    prices : pd.Series
        Price series (e.g. adjusted close).
    lags : Sequence[int]
        Lag periods (default ``(1, 2, 3, 5, 10, 21)``).

    Returns
    -------
    pd.DataFrame
        Indexed like *prices*.  For every lag ``l`` (in the order given)
        two columns are produced:

        - ``ret_lag{l}``: the one-period log return shifted back by ``l``
          rows, i.e. the return observed ``l`` periods ago.
        - ``cum_ret_{l}``: the log return over the last ``l`` periods,
          ``log(price_t / price_{t-l})``.

        Rows without enough history are NaN.

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> prices = pd.Series([100, 101, 102, 100, 103, 105, 104],
    ...                    name='close')
    >>> feats = return_features(prices, lags=(1, 3))
    >>> list(feats.columns)
    ['ret_lag1', 'cum_ret_1', 'ret_lag3', 'cum_ret_3']
    >>> feats['cum_ret_3'].iloc[-1] > 0  # cumulative 3-period return
    True

    See Also
    --------
    rolling_features : Rolling statistical features.
    technical_features : Technical analysis features (RSI, MACD, etc.).
    """
    one_period = np.log(prices / prices.shift(1))

    columns: dict[str, pd.Series] = {}
    for k in lags:
        # One-period log return as it stood k rows earlier.
        columns[f"ret_lag{k}"] = one_period.shift(k)
        # Log return accumulated across the most recent k periods.
        columns[f"cum_ret_{k}"] = np.log(prices / prices.shift(k))

    return pd.DataFrame(columns, index=prices.index)
# --------------------------------------------------------------------------- # Technical features (inline, no dependency on ta/ module) # --------------------------------------------------------------------------- def _rsi(close: pd.Series, period: int = 14) -> pd.Series: """Relative Strength Index.""" delta = close.diff() gain = delta.where(delta > 0, 0.0) loss = -delta.where(delta < 0, 0.0) avg_gain = gain.ewm(alpha=1.0 / period, min_periods=period, adjust=False).mean() avg_loss = loss.ewm(alpha=1.0 / period, min_periods=period, adjust=False).mean() rs = avg_gain / avg_loss.replace(0, np.nan) return 100.0 - 100.0 / (1.0 + rs) def _macd_histogram( close: pd.Series, fast: int = 12, slow: int = 26, signal: int = 9, ) -> pd.Series: """MACD histogram (MACD line minus signal line).""" ema_fast = close.ewm(span=fast, adjust=False).mean() ema_slow = close.ewm(span=slow, adjust=False).mean() macd_line = ema_fast - ema_slow signal_line = macd_line.ewm(span=signal, adjust=False).mean() return macd_line - signal_line def _bollinger_pctb( close: pd.Series, period: int = 20, n_std: float = 2.0 ) -> pd.Series: """Bollinger Band %B.""" sma = close.rolling(period).mean() std = close.rolling(period).std() upper = sma + n_std * std lower = sma - n_std * std return (close - lower) / (upper - lower).replace(0, np.nan) def _atr( high: pd.Series, low: pd.Series, close: pd.Series, period: int = 14 ) -> pd.Series: """Average True Range.""" prev_close = close.shift(1) tr = pd.concat( [high - low, (high - prev_close).abs(), (low - prev_close).abs()], axis=1, ).max(axis=1) return tr.rolling(period).mean()
def technical_features(
    high: pd.Series,
    low: pd.Series,
    close: pd.Series,
    volume: pd.Series | None = None,
) -> pd.DataFrame:
    """Assemble a small set of technical-analysis features.

    The indicators are computed by this module's private helpers, so no
    external TA package is needed.  RSI, MACD histogram, Bollinger %B and
    ATR are always produced; On-Balance Volume is appended when *volume*
    is supplied.

    Parameters
    ----------
    high : pd.Series
        High prices.
    low : pd.Series
        Low prices.
    close : pd.Series
        Close prices.
    volume : pd.Series or None
        Trade volume (optional).

    Returns
    -------
    pd.DataFrame
        Indexed like *close*, with columns in this order:

        - ``rsi``: Relative Strength Index (0-100).
        - ``macd_hist``: MACD line minus its signal line.
        - ``bb_pctb``: Bollinger Band %B.
        - ``atr``: Average True Range.
        - ``obv`` (only with *volume*): running sum of volume signed by
          the direction of the close-to-close change; the first bar and
          unchanged closes contribute zero.

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> np.random.seed(0)
    >>> n = 100
    >>> close = pd.Series(100 + np.cumsum(np.random.randn(n) * 0.5))
    >>> high = close + np.abs(np.random.randn(n) * 0.3)
    >>> low = close - np.abs(np.random.randn(n) * 0.3)
    >>> feats = technical_features(high, low, close)
    >>> list(feats.columns)
    ['rsi', 'macd_hist', 'bb_pctb', 'atr']

    See Also
    --------
    return_features : Lagged and cumulative return features.
    volatility_features : Realised volatility features.
    """
    columns: dict[str, pd.Series] = {}
    columns["rsi"] = _rsi(close)
    columns["macd_hist"] = _macd_histogram(close)
    columns["bb_pctb"] = _bollinger_pctb(close)
    columns["atr"] = _atr(high, low, close)

    if volume is None:
        return pd.DataFrame(columns, index=close.index)

    # +1 / 0 / -1 per bar; the undefined first difference counts as 0.
    bar_direction = np.sign(close.diff()).fillna(0)
    signed_volume = bar_direction * volume
    columns["obv"] = signed_volume.cumsum()
    return pd.DataFrame(columns, index=close.index)
# ---------------------------------------------------------------------------
# Volatility features
# ---------------------------------------------------------------------------
def volatility_features(
    returns: pd.Series,
    windows: Sequence[int] = (5, 10, 21, 63),
    *,
    periods_per_year: float = 252.0,
) -> pd.DataFrame:
    """Compute realised-volatility-related features.

    Parameters
    ----------
    returns : pd.Series
        Log or simple return series.
    windows : Sequence[int]
        Window sizes for rolling calculations (default
        ``(5, 10, 21, 63)``).
    periods_per_year : float, keyword-only
        Number of return observations per year, used to annualise the
        rolling standard deviation (default ``252`` for daily data; use
        e.g. ``52`` for weekly or ``12`` for monthly returns, or ``1`` to
        disable annualisation).

    Returns
    -------
    pd.DataFrame
        Indexed like *returns*, with columns:

        - ``realized_vol_w{w}``: rolling standard deviation scaled by
          ``sqrt(periods_per_year)``.
        - ``vol_of_vol_w{w}``: rolling std (same window) of the rolling
          vol.
        - ``vol_ratio_w{w1}_w{w2}``: ratio of the shorter-window vol to the
          next longer-window vol, for each adjacent pair of the sorted
          windows.  A zero denominator yields NaN.  The ratio does not
          depend on *periods_per_year*.

    Raises
    ------
    ValueError
        If *periods_per_year* is not positive.

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> np.random.seed(0)
    >>> rets = pd.Series(np.random.randn(200) * 0.01, name='daily_ret')
    >>> feats = volatility_features(rets, windows=(5, 21))
    >>> 'realized_vol_w5' in feats.columns
    True
    >>> 'vol_ratio_w5_w21' in feats.columns
    True

    See Also
    --------
    rolling_features : General rolling statistical features.
    """
    if periods_per_year <= 0:
        raise ValueError("periods_per_year must be positive")
    annualisation = np.sqrt(periods_per_year)

    result: dict[str, pd.Series] = {}
    vol_series: dict[int, pd.Series] = {}

    for w in windows:
        rv = returns.rolling(w).std() * annualisation
        vol_series[w] = rv
        result[f"realized_vol_w{w}"] = rv
        # Vol-of-vol: rolling std of the rolling vol.
        result[f"vol_of_vol_w{w}"] = rv.rolling(w).std()

    # Ratios compare each window with the next longer one.
    sorted_windows = sorted(windows)
    for w_short, w_long in zip(sorted_windows, sorted_windows[1:]):
        denominator = vol_series[w_long].replace(0, np.nan)
        result[f"vol_ratio_w{w_short}_w{w_long}"] = vol_series[w_short] / denominator

    return pd.DataFrame(result, index=returns.index)
# ---------------------------------------------------------------------------
# Microstructure features
# ---------------------------------------------------------------------------
def microstructure_features(
    high: pd.Series,
    low: pd.Series,
    close: pd.Series,
    volume: pd.Series,
) -> pd.DataFrame:
    """Compute market-microstructure features.

    Parameters
    ----------
    high : pd.Series
        High prices.  Currently unused by the computations; kept for
        interface compatibility.
    low : pd.Series
        Low prices.  Currently unused by the computations; kept for
        interface compatibility.
    close : pd.Series
        Close prices.
    volume : pd.Series
        Trade volume.

    Returns
    -------
    pd.DataFrame
        Indexed like *close*, with columns:

        - ``amihud_illiq``: 21-bar rolling mean of
          ``|return| / dollar_volume`` (zero dollar volume is treated as
          missing).
        - ``kyle_lambda``: OLS slope of ``|price change|`` on signed
          sqrt-volume over the trailing 21 bars *including the current
          bar*, consistent with the other rolling features.  Non-finite
          observations are dropped from each window; fewer than three
          usable points, or no variation in the regressor, gives NaN.
        - ``log_volume``: ``log(1 + volume)``.
        - ``volume_ma_ratio``: volume divided by its 21-bar moving
          average.
        - ``dollar_volume``: ``close * volume``.

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> np.random.seed(0)
    >>> n = 100
    >>> close = pd.Series(100 + np.cumsum(np.random.randn(n) * 0.5))
    >>> high = close + np.abs(np.random.randn(n) * 0.3)
    >>> low = close - np.abs(np.random.randn(n) * 0.3)
    >>> volume = pd.Series(np.random.randint(1_000_000, 5_000_000, n))
    >>> feats = microstructure_features(high, low, close, volume)
    >>> list(feats.columns)
    ['amihud_illiq', 'kyle_lambda', 'log_volume', 'volume_ma_ratio', 'dollar_volume']

    References
    ----------
    - Amihud (2002), "Illiquidity and stock returns"
    - Kyle (1985), "Continuous Auctions and Insider Trading"

    See Also
    --------
    technical_features : Price-based technical indicators.
    """
    window = 21

    returns = close.pct_change()
    dollar_volume = close * volume

    # Amihud illiquidity = |return| / dollar volume (rolling mean).
    amihud = (returns.abs() / dollar_volume.replace(0, np.nan)).rolling(window).mean()

    # Kyle's lambda estimate: rolling regression slope of |price change| on
    # signed sqrt-volume.
    abs_dp = close.diff().abs()
    signed_sqrt_vol = np.sign(returns) * np.sqrt(volume.abs())

    def _ols_slope(y: np.ndarray, x: np.ndarray) -> float:
        """Slope of the least-squares line of *y* on *x*, ignoring NaN/inf."""
        mask = np.isfinite(y) & np.isfinite(x)
        if mask.sum() < 3:
            return np.nan
        xm = x[mask]
        ym = y[mask]
        xm_dm = xm - xm.mean()
        denom = (xm_dm**2).sum()
        if denom == 0:
            return np.nan
        return float((xm_dm * (ym - ym.mean())).sum() / denom)

    n = len(close)
    kyle_lambda_vals = np.full(n, np.nan)
    dp_arr = abs_dp.values.astype(float)
    sv_arr = signed_sqrt_vol.values.astype(float)
    # ``end`` is exclusive, so the slice [start:end] ends ON the bar that
    # receives the estimate (index end - 1).  Slicing [i - window:i] and
    # writing to i would lag the feature by one bar.
    for end in range(window, n + 1):
        start = end - window
        kyle_lambda_vals[end - 1] = _ols_slope(dp_arr[start:end], sv_arr[start:end])

    kyle_lambda = pd.Series(kyle_lambda_vals, index=close.index, name="kyle_lambda")

    vol_ma = volume.rolling(window).mean().replace(0, np.nan)

    return pd.DataFrame(
        {
            "amihud_illiq": amihud,
            "kyle_lambda": kyle_lambda,
            "log_volume": np.log1p(volume),
            "volume_ma_ratio": volume / vol_ma,
            "dollar_volume": dollar_volume,
        },
        index=close.index,
    )
# ---------------------------------------------------------------------------
# Labelling
# ---------------------------------------------------------------------------
def label_fixed_horizon(
    returns: pd.Series,
    horizon: int = 5,
    threshold: float = 0.0,
) -> pd.Series:
    """Label the direction of the summed return over the next *horizon* periods.

    For row ``i`` the target is the sum of ``returns[i+1] ... returns[i+horizon]``;
    the return at ``i`` itself is not part of its own label.

    Parameters
    ----------
    returns : pd.Series
        Period (e.g. daily) returns.
    horizon : int
        Number of forward periods to accumulate (default 5).
    threshold : float
        If ``threshold > 0`` three classes are produced: ``1`` when the
        forward sum exceeds ``threshold``, ``-1`` when it is below
        ``-threshold``, ``0`` otherwise.  For any other value (including
        the default ``0.0``) the labels are binary: ``1`` for a strictly
        positive forward sum, else ``0``.

    Returns
    -------
    pd.Series
        Nullable-integer (``Int64``) labels on the index of *returns*.
        Rows whose forward window is incomplete or contains NaN, which
        includes the last *horizon* rows, are ``<NA>``.

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> rets = pd.Series([0.01, -0.005, 0.02, 0.01, -0.03, 0.015, 0.005])
    >>> labels = label_fixed_horizon(rets, horizon=3, threshold=0.0)
    >>> labels.iloc[0]  # sum of rets[1:4] = -0.005+0.02+0.01 > 0
    1

    Notes
    -----
    The threshold is fixed and does not adapt to volatility.  For
    volatility-adaptive labels, use ``label_triple_barrier``.

    See Also
    --------
    label_triple_barrier : Volatility-adaptive labelling (Lopez de Prado).
    """
    # shift(-1) drops the current return; the trailing rolling sum is then
    # pulled back by horizon - 1 rows so row i holds returns i+1 .. i+horizon.
    future_sum = (
        returns.shift(-1)
        .rolling(window=horizon, min_periods=horizon)
        .sum()
        .shift(-(horizon - 1))
    )

    if threshold > 0:
        went_up = (future_sum > threshold).to_numpy()
        went_down = (future_sum < -threshold).to_numpy()
        codes = np.select([went_up, went_down], [1, -1], default=0)
    else:
        codes = (future_sum > 0).to_numpy().astype(int)

    labels = pd.Series(codes, index=returns.index, dtype="Int64")
    # NaN comparisons evaluate False above, so blank those rows explicitly.
    labels[future_sum.isna()] = pd.NA
    return labels
def label_triple_barrier(
    close: pd.Series,
    upper: float | None = None,
    lower: float | None = None,
    max_holding: int = 10,
) -> pd.Series:
    """Triple-barrier labelling (Lopez de Prado).

    For each bar the method sets three barriers relative to that bar's
    close and labels the bar by whichever is touched first:

    * **Upper**: return reaches ``+upper``      -> label = 1
    * **Lower**: return reaches ``-lower``      -> label = -1
    * **Vertical**: *max_holding* bars elapse   -> label = sign of return

    If *upper* or *lower* is ``None`` the corresponding horizontal barrier
    is disabled.  When both horizontal barriers would be satisfied on the
    same bar, the upper barrier takes precedence.

    Parameters
    ----------
    close : pd.Series
        Close price series.
    upper : float or None
        Fractional distance for the upper barrier (e.g. ``0.02`` for 2 %).
    lower : float or None
        Fractional distance for the lower barrier (positive value; e.g.
        ``0.02`` for -2 %).
    max_holding : int
        Maximum holding period in bars (vertical barrier).

    Returns
    -------
    pd.Series
        Nullable-integer (``Int64``) labels in ``{-1, 0, 1}`` aligned to
        the input index.  A label is ``<NA>`` when

        - the entry price is NaN,
        - no horizontal barrier was hit and fewer than *max_holding*
          future bars exist (typically the last *max_holding* rows), or
        - no horizontal barrier was hit and the price at the vertical
          barrier is NaN, so the final return is unknown.

    Example
    -------
    >>> import pandas as pd
    >>> close = pd.Series([100, 101, 102, 103, 100, 97, 98, 99, 100, 101])
    >>> labels = label_triple_barrier(close, upper=0.03, lower=0.03, max_holding=5)
    >>> labels.iloc[0]  # price rises 3% by bar 3 (103/100 - 1 = 0.03)
    1

    Notes
    -----
    In practice, set ``upper`` and ``lower`` proportional to recent
    volatility (e.g., ``upper = lower = daily_vol * sqrt(max_holding)``).

    References
    ----------
    - Lopez de Prado (2018), "Advances in Financial Machine Learning", Ch. 3

    See Also
    --------
    label_fixed_horizon : Simpler fixed-horizon labelling.
    """
    n = len(close)
    last = n - 1
    prices = close.to_numpy(dtype=float, na_value=np.nan)
    # Collected as floats so NaN can mark "no label"; converted to Int64
    # once at the end instead of assigning through .iloc on every bar.
    out = np.full(n, np.nan)

    for i in range(n):
        entry = prices[i]
        if np.isnan(entry):
            continue

        end = min(i + max_holding, last)
        label: int | None = None
        for j in range(i + 1, end + 1):
            ret = (prices[j] - entry) / entry
            if upper is not None and ret >= upper:
                label = 1
                break
            if lower is not None and ret <= -lower:
                label = -1
                break

        if label is None:
            # No horizontal barrier touched: fall back to the vertical one,
            # but only if the full holding period is observable.
            if end <= i or i + max_holding > last:
                continue
            final_ret = (prices[end] - entry) / entry
            if np.isnan(final_ret):
                # Unknown outcome must not be reported as a flat (0) label.
                continue
            label = int(np.sign(final_ret))

        out[i] = label

    return pd.Series(out, index=close.index, dtype="Int64")
# ---------------------------------------------------------------------------
# Interaction features
# ---------------------------------------------------------------------------
def interaction_features(
    data: pd.DataFrame,
    columns: Sequence[str] | None = None,
) -> pd.DataFrame:
    """Create pairwise product and ratio terms between feature columns.

    Every unordered pair ``(A, B)`` of the selected columns, taken in the
    order the columns are listed, contributes two features:

    - ``A_x_B``: element-wise product ``A * B``
    - ``A_div_B``: element-wise ratio ``A / B``; a zero in ``B`` gives NaN

    Parameters
    ----------
    data : pd.DataFrame
        Feature DataFrame.
    columns : Sequence[str] or None
        Columns to combine.  If None, all columns of *data* are used.

    Returns
    -------
    pd.DataFrame
        Interaction features indexed like *data*.  The input columns
        themselves are not included.

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
    >>> result = interaction_features(df, columns=['a', 'b'])
    >>> 'a_x_b' in result.columns
    True
    >>> 'a_div_b' in result.columns
    True
    """
    from itertools import combinations

    selected = list(data.columns) if columns is None else columns

    features: dict[str, pd.Series] = {}
    for left, right in combinations(selected, 2):
        numerator = data[left]
        other = data[right]
        features[f"{left}_x_{right}"] = numerator * other
        # Zeros become NaN so the ratio never divides by zero.
        features[f"{left}_div_{right}"] = numerator / other.replace(0, np.nan)

    return pd.DataFrame(features, index=data.index)
# ---------------------------------------------------------------------------
# Cross-asset features
# ---------------------------------------------------------------------------
def cross_asset_features(
    asset: pd.Series,
    benchmark: pd.Series,
    windows: Sequence[int] = (10, 21, 63),
) -> pd.DataFrame:
    """Compute rolling co-movement features between an asset and a benchmark.

    The two return series are joined on their index and rows where either
    value is missing are dropped before any rolling statistic is computed.

    Parameters
    ----------
    asset : pd.Series
        Return series for the asset of interest.
    benchmark : pd.Series
        Return series for the benchmark or related asset.
    windows : Sequence[int]
        Rolling window sizes (default ``(10, 21, 63)``).

    Returns
    -------
    pd.DataFrame
        Indexed by the rows that survive the join, with three columns
        per window ``w``:

        - ``rolling_corr_w{w}``: rolling correlation of the two series.
        - ``rolling_beta_w{w}``: rolling covariance divided by the rolling
          variance of the benchmark (NaN when that variance is zero).
        - ``relative_strength_w{w}``: product of ``1 + asset`` over the
          window divided by the product of ``1 + benchmark`` (NaN when
          the benchmark product is zero).

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> np.random.seed(0)
    >>> asset = pd.Series(np.random.randn(200) * 0.01, name='asset')
    >>> bench = pd.Series(np.random.randn(200) * 0.01, name='bench')
    >>> result = cross_asset_features(asset, bench, windows=[10, 21])
    >>> 'rolling_corr_w10' in result.columns
    True
    >>> 'rolling_beta_w21' in result.columns
    True
    """
    joined = pd.DataFrame({"asset": asset, "benchmark": benchmark}).dropna()
    asset_ret = joined["asset"]
    bench_ret = joined["benchmark"]
    asset_growth = 1 + asset_ret
    bench_growth = 1 + bench_ret

    features: dict[str, pd.Series] = {}
    for w in windows:
        asset_roll = asset_ret.rolling(w)

        features[f"rolling_corr_w{w}"] = asset_roll.corr(bench_ret)

        # Beta = cov(asset, benchmark) / var(benchmark).
        bench_var = bench_ret.rolling(w).var().replace(0, np.nan)
        features[f"rolling_beta_w{w}"] = asset_roll.cov(bench_ret) / bench_var

        # Compounded growth of each leg across the window.
        asset_compound = asset_growth.rolling(w).apply(np.prod, raw=True)
        bench_compound = bench_growth.rolling(w).apply(np.prod, raw=True)
        features[f"relative_strength_w{w}"] = asset_compound / bench_compound.replace(
            0, np.nan
        )

    return pd.DataFrame(features, index=joined.index)
# ---------------------------------------------------------------------------
# Regime features
# ---------------------------------------------------------------------------
def regime_features(
    regime_probabilities: pd.DataFrame,
    regime_labels: pd.Series | None = None,
) -> pd.DataFrame:
    """Create features from regime probabilities or labels.

    Given regime probabilities (e.g., from an HMM or Markov-switching
    model), constructs the current regime identity, the regime duration
    (consecutive periods in the current regime), a regime-change flag, and
    rolling change frequencies.

    Parameters
    ----------
    regime_probabilities : pd.DataFrame
        DataFrame where each column is the probability of a regime (e.g.,
        columns ``['bull', 'bear']``).
    regime_labels : pd.Series or None
        Hard regime labels, cast to ``int``.  If None, the most probable
        regime at each step is used (positional argmax over the
        probability columns).

    Returns
    -------
    pd.DataFrame
        Indexed like *regime_probabilities*, with columns:

        - ``current_regime``: integer label of the current regime
        - ``regime_change``: 1 if the regime differs from the previous
          row, else 0 (the first row is always 0)
        - ``regime_duration``: number of consecutive rows in the current
          regime, starting at 1
        - ``transition_prob_w{w}``: rolling mean of ``regime_change`` for
          ``w`` in ``[5, 10, 21]`` (partial windows allowed)
        - ``prob_{col}``: one column per input probability column

        An input with zero rows yields an empty frame with these columns.

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> np.random.seed(42)
    >>> probs = pd.DataFrame({
    ...     'bull': np.random.dirichlet([5, 2], size=100)[:, 0],
    ...     'bear': np.random.dirichlet([5, 2], size=100)[:, 1],
    ... })
    >>> result = regime_features(probs)
    >>> 'current_regime' in result.columns
    True
    >>> 'regime_duration' in result.columns
    True
    """
    result: dict[str, pd.Series] = {}

    # Current regime: supplied hard labels, or the argmax of the probabilities.
    if regime_labels is not None:
        current = regime_labels.astype(int)
    else:
        current = pd.Series(
            regime_probabilities.values.argmax(axis=1),
            index=regime_probabilities.index,
            name="current_regime",
        )
    result["current_regime"] = current

    # Regime change indicator.  The first row has no predecessor, so the
    # comparison against the shifted NaN would flag it; reset it to 0.
    regime_change = (current != current.shift(1)).astype(int)
    if len(regime_change) > 0:
        regime_change.iloc[0] = 0
    result["regime_change"] = regime_change

    # Regime duration: distance from the most recent change (or from row 0
    # when no change has happened yet), counted from 1.
    change_flags = regime_change.to_numpy()
    positions = np.arange(len(change_flags))
    run_start = np.maximum.accumulate(np.where(change_flags == 1, positions, 0))
    duration = positions - run_start + 1
    result["regime_duration"] = pd.Series(duration, index=regime_probabilities.index)

    # Rolling transition probability (how frequently regimes change).
    for w in [5, 10, 21]:
        result[f"transition_prob_w{w}"] = regime_change.rolling(w, min_periods=1).mean()

    # Include the raw probabilities.
    for col in regime_probabilities.columns:
        result[f"prob_{col}"] = regime_probabilities[col]

    return pd.DataFrame(result, index=regime_probabilities.index)
# ---------------------------------------------------------------------------
# TA-integrated features (imports from wraquant.ta)
# ---------------------------------------------------------------------------
def ta_features(
    high: pd.Series,
    low: pd.Series,
    close: pd.Series,
    volume: pd.Series | None = None,
    include: Sequence[str] | None = None,
) -> pd.DataFrame:
    """Generate ML features from the indicators in ``wraquant.ta``.

    Unlike ``technical_features`` (which uses inline implementations), this
    function delegates to ``wraquant.ta``.  The ``wraquant.ta`` modules are
    imported when the function is called, not when this module is loaded.

    Parameters
    ----------
    high : pd.Series
        High prices.
    low : pd.Series
        Low prices.
    close : pd.Series
        Close prices.
    volume : pd.Series or None
        Trade volume (optional).  OBV is only computed when it is given.
    include : Sequence[str] or None
        Subset of indicators to compute, chosen from ``'rsi'``, ``'macd'``,
        ``'bbands'``, ``'atr'``, ``'obv'``.  Names outside this set are
        ignored.  If None, every indicator is selected.

    Returns
    -------
    pd.DataFrame
        Indexed like *close*, with up to one column per selected
        indicator, in this order: ``ta_rsi``, ``ta_macd_hist``,
        ``ta_bb_pctb``, ``ta_atr``, ``ta_obv``.

    Example
    -------
    >>> import pandas as pd, numpy as np
    >>> np.random.seed(0)
    >>> n = 100
    >>> close = pd.Series(100 + np.cumsum(np.random.randn(n) * 0.5))
    >>> high = close + np.abs(np.random.randn(n) * 0.3)
    >>> low = close - np.abs(np.random.randn(n) * 0.3)
    >>> feats = ta_features(high, low, close)
    >>> 'ta_rsi' in feats.columns
    True

    See Also
    --------
    technical_features : Inline implementation (no ta/ dependency).
    """
    from wraquant.ta.momentum import macd, rsi
    from wraquant.ta.overlap import bollinger_bands
    from wraquant.ta.volatility import atr

    known = {"rsi", "macd", "bbands", "atr", "obv"}
    wanted = set(known) if include is None else set(include) & known

    features: dict[str, pd.Series] = {}

    if "rsi" in wanted:
        features["ta_rsi"] = rsi(close, period=14)

    if "macd" in wanted:
        macd_out = macd(close)
        if not isinstance(macd_out, dict):
            # NOTE(review): a non-dict result is stored as-is; confirm
            # against wraquant.ta.momentum.macd what it returns.
            features["ta_macd_hist"] = macd_out
        elif "histogram" in macd_out:
            features["ta_macd_hist"] = macd_out["histogram"]
        else:
            features["ta_macd_hist"] = macd_out.get(
                "macd_hist", pd.Series(dtype=float)
            )

    if "bbands" in wanted:
        bands = bollinger_bands(close, period=20)
        if isinstance(bands, dict):
            band_hi = bands.get("upper", pd.Series(dtype=float))
            band_lo = bands.get("lower", pd.Series(dtype=float))
            # Zero band width becomes NaN to avoid dividing by zero.
            width = (band_hi - band_lo).replace(0, np.nan)
            features["ta_bb_pctb"] = (close - band_lo) / width
        else:
            features["ta_bb_pctb"] = bands

    if "atr" in wanted:
        features["ta_atr"] = atr(high, low, close, period=14)

    if "obv" in wanted and volume is not None:
        from wraquant.ta.volume import obv

        features["ta_obv"] = obv(close, volume)

    return pd.DataFrame(features, index=close.index)