Source code for wraquant.microstructure.liquidity

"""Liquidity analytics for market microstructure.

Liquidity measures how easily an asset can be traded without
significantly moving its price. Illiquid assets carry a liquidity risk
premium and pose execution challenges. This module provides the
standard toolkit for measuring liquidity from trade and quote data.

Measures provided:

**Illiquidity / price impact**:
    - ``amihud_illiquidity``: the Amihud (2002) ratio -- average daily
      |return| / dollar volume. Higher values indicate less liquid assets.
      The most widely used cross-sectional liquidity proxy because it
      only requires daily data.
    - ``kyle_lambda``: Kyle's lambda -- the permanent price impact
      coefficient estimated via rolling OLS of price changes on signed
      order flow. Higher lambda = more price impact per unit of volume.
    - ``price_impact``: per-trade permanent price impact.

**Spread estimators**:
    - ``roll_spread``: Roll (1984) implied spread from serial
      autocovariance of price changes. Requires only trade prices
      (no quote data needed).
    - ``effective_spread``: 2 * |trade_price - midpoint|. The
      standard measure of execution cost.
    - ``realized_spread``: spread earned by the liquidity provider
      after a delay, capturing adverse selection.

**Activity**:
    - ``turnover_ratio``: daily volume / shares outstanding. Measures
      trading activity relative to the number of shares outstanding.

How to choose:
    - **Cross-sectional liquidity ranking** (daily data only): use
      ``amihud_illiquidity``.
    - **Execution cost analysis** (trade + quote data): use
      ``effective_spread`` and ``realized_spread``.
    - **Price impact modeling**: use ``kyle_lambda`` for permanent
      impact; ``price_impact`` for per-trade measurement.
    - **No quote data available**: use ``roll_spread`` as a proxy
      for the bid-ask spread.

References:
    - Amihud (2002), "Illiquidity and Stock Returns"
    - Kyle (1985), "Continuous Auctions and Insider Trading"
    - Roll (1984), "A Simple Implicit Measure of the Effective
      Bid-Ask Spread"
"""

from __future__ import annotations

import numpy as np
import pandas as pd
from numpy.typing import NDArray

from wraquant.core._coerce import coerce_series



[docs]
def amihud_illiquidity(
    returns: pd.Series,
    volume: pd.Series,
    window: int | None = None,
) -> pd.Series | float:
    """Compute the Amihud (2002) illiquidity measure.

    The per-period ratio is ``|return| / dollar volume``; larger values
    mean a given amount of trading moves the price more, i.e. the asset
    is less liquid.

    Parameters:
        returns: Asset return series.
        volume: Dollar volume series (price * shares traded).
        window: Length of the rolling mean. When *None*, the ratio is
            averaged over the whole sample and a scalar is returned.

    Returns:
        A rolling-mean series when *window* is given, otherwise a single
        float (NaN observations are ignored in the scalar average).

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> returns = pd.Series(np.random.randn(252) * 0.01)
        >>> volume = pd.Series(np.random.uniform(1e6, 5e6, 252))
        >>> illiq = amihud_illiquidity(returns, volume)
        >>> illiq > 0
        True

    See Also:
        kyle_lambda: Price impact coefficient (regression-based alternative).
        amihud_rolling: Rolling version with normalization.
    """
    abs_ret = np.abs(coerce_series(returns, "returns"))
    dollar_vol = coerce_series(volume, "volume")
    # Division by zero volume produces +/-inf; treat those as missing.
    per_period = (abs_ret / dollar_vol).replace([np.inf, -np.inf], np.nan)
    if window is not None:
        return per_period.rolling(window).mean()
    return float(np.nanmean(per_period))




[docs]
def kyle_lambda(
    prices: pd.Series,
    volume: pd.Series,
    window: int = 20,
) -> pd.Series:
    """Estimate Kyle's lambda with a rolling univariate regression.

    The slope of price changes on signed volume is obtained in closed
    form as ``cov(dp, v) / var(v)`` over each rolling window.

    Parameters:
        prices: Price series.
        volume: Signed volume series (positive for buys, negative for sells).
        window: Number of observations in each rolling window.

    Returns:
        Series named ``"kyle_lambda"``.  Larger values mean more price
        movement per unit of signed volume.  Infinite ratios (zero
        volume variance) are reported as NaN.

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> prices = pd.Series(100 + np.cumsum(np.random.randn(100) * 0.5))
        >>> volume = pd.Series(np.random.randn(100) * 1000)
        >>> lam = kyle_lambda(prices, volume, window=20)
        >>> len(lam) == 100
        True

    See Also:
        amihud_illiquidity: Simpler illiquidity proxy (no signed volume needed).
        lambda_kyle_rolling: Kyle's lambda with confidence intervals.
    """
    px = coerce_series(prices, "prices")
    signed_vol = coerce_series(volume, "volume")
    price_change = px.diff()
    # OLS slope of a one-regressor model: covariance over regressor variance.
    numerator = price_change.rolling(window).cov(signed_vol)
    denominator = signed_vol.rolling(window).var()
    slope = (numerator / denominator).replace([np.inf, -np.inf], np.nan)
    slope.name = "kyle_lambda"
    return slope




[docs]
def roll_spread(prices: pd.Series) -> float:
    """Roll (1984) implied bid-ask spread from serial covariance.

    Estimates the effective spread from the negative first-order
    autocovariance of price changes: ``spread = 2 * sqrt(-cov)``.

    Parameters:
        prices: Price series.  Missing price changes are dropped before
            the lagged pairs are formed.

    Returns:
        Estimated implied spread as a plain ``float``.  *NaN* is
        returned when the serial covariance is non-negative or undefined
        (model assumption violated), and when fewer than three price
        changes are available, since a sample covariance needs at least
        two lagged pairs.

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> # Simulate trade prices with bid-ask bounce
        >>> mid = 100 + np.cumsum(np.random.randn(500) * 0.01)
        >>> bounce = np.random.choice([-0.05, 0.05], size=500)
        >>> prices = pd.Series(mid + bounce)
        >>> spread = roll_spread(prices)
        >>> spread > 0 or np.isnan(spread)  # positive spread or NaN
        True

    See Also:
        effective_spread: Direct spread from trade and quote data.
        corwin_schultz_spread: High-low spread estimator (OHLC data).
    """
    prices = coerce_series(prices, "prices")
    changes = prices.diff().dropna().values

    # With fewer than three changes the lagged slices hold at most one
    # pair, so np.cov would only emit warnings and yield NaN.
    if len(changes) < 3:
        return float("nan")

    serial_cov = np.cov(changes[:-1], changes[1:])[0, 1]
    # "not < 0" also routes a NaN covariance to the NaN return.
    if not serial_cov < 0:
        return float("nan")
    return float(2.0 * np.sqrt(-serial_cov))




[docs]
def effective_spread(
    trade_prices: pd.Series | NDArray[np.floating],
    midpoints: pd.Series | NDArray[np.floating],
) -> pd.Series | NDArray[np.floating]:
    """Per-trade effective spread, ``2 * |trade_price - midpoint|``.

    Parameters:
        trade_prices: Executed trade prices.
        midpoints: Prevailing bid-ask midpoints at time of each trade.

    Returns:
        Effective spread for each trade.
        NOTE(review): both inputs are passed through ``coerce_series``
        before the arithmetic, so array inputs presumably come back as a
        Series rather than an ndarray -- confirm against that helper.

    Example:
        >>> import pandas as pd, numpy as np
        >>> trades = pd.Series([100.05, 99.95, 100.03])
        >>> mids = pd.Series([100.0, 100.0, 100.0])
        >>> spreads = effective_spread(trades, mids)
        >>> round(float(spreads.iloc[0]), 2)
        0.1

    See Also:
        realized_spread: Post-trade spread (adverse selection component).
        roll_spread: Implied spread from price autocovariance.
    """
    executed = coerce_series(trade_prices, "trade_prices")
    quote_mid = coerce_series(midpoints, "midpoints")
    # Distance from the midpoint is the half-spread paid by the trade.
    half_spread = np.abs(executed - quote_mid)
    return 2.0 * half_spread




[docs]
def realized_spread(
    trade_prices: pd.Series,
    midpoints: pd.Series,
    delay: int = 5,
) -> pd.Series:
    """Realized spread measured against a later midpoint.

    Computes ``2 * direction * (trade_price - midpoint_{t+delay})``,
    where the trade direction is inferred as the sign of
    ``trade_price - midpoint`` at the time of the trade.

    Parameters:
        trade_prices: Executed trade prices.
        midpoints: Mid-quote series aligned to trades.
        delay: Number of observations ahead at which the midpoint is
            sampled.

    Returns:
        Per-trade realized spread.  With a positive *delay* the last
        *delay* rows are NaN because no future midpoint exists for them.

    Example:
        >>> import pandas as pd, numpy as np
        >>> trades = pd.Series([100.05, 99.95, 100.03, 100.01, 99.98])
        >>> mids = pd.Series([100.0, 100.0, 100.0, 100.0, 100.0])
        >>> rs = realized_spread(trades, mids, delay=2)
        >>> len(rs) == 5
        True

    See Also:
        effective_spread: Total execution cost (before adverse selection).
        spread_decomposition: Full Huang-Stoll decomposition.
    """
    executed = coerce_series(trade_prices, "trade_prices")
    quote_mid = coerce_series(midpoints, "midpoints")
    # +1 when the trade printed above the midpoint, -1 below, 0 at the mid.
    side = np.sign(executed - quote_mid)
    # Negative shift pulls the midpoint from `delay` rows later up to row t.
    markout = executed - quote_mid.shift(-delay)
    return 2.0 * side * markout




[docs]
def price_impact(
    trade_prices: pd.Series,
    volume: pd.Series,
    direction: pd.Series,
) -> pd.Series:
    """Signed next-trade price move per unit of volume.

    Evaluates ``direction_t * (price_{t+1} - price_t) / volume_t``, using
    successive trade prices as a stand-in for midpoint revisions.

    Parameters:
        trade_prices: Executed trade prices.
        volume: Volume for each trade.
        direction: Trade direction indicator (+1 buy, -1 sell).

    Returns:
        Series named ``"price_impact"``.  The final row is NaN (there is
        no following trade) and infinite values from zero volume are
        replaced by NaN.

    Example:
        >>> import pandas as pd, numpy as np
        >>> trades = pd.Series([100.0, 100.05, 100.10, 100.08])
        >>> vol = pd.Series([1000, 2000, 1500, 1800])
        >>> direction = pd.Series([1, 1, -1, 1])
        >>> impact = price_impact(trades, vol, direction)
        >>> len(impact) == 4
        True

    See Also:
        kyle_lambda: Aggregate price impact coefficient.
        wraquant.microstructure.market_quality.price_impact_regression:
            Permanent vs. temporary impact decomposition.
    """
    executed = coerce_series(trade_prices, "trade_prices")
    size = coerce_series(volume, "volume")
    side = coerce_series(direction, "direction")
    # diff() is the change into row t; shifting by -1 makes it the change
    # out of row t, i.e. the move that follows the trade.
    next_move = executed.diff().shift(-1)
    result = (side * next_move / size).replace([np.inf, -np.inf], np.nan)
    result.name = "price_impact"
    return result




[docs]
def turnover_ratio(
    volume: pd.Series,
    shares_outstanding: pd.Series | float,
) -> pd.Series:
    """Share turnover: traded volume divided by shares outstanding.

    Parameters:
        volume: Daily trading volume.
        shares_outstanding: Total shares outstanding (scalar or series).

    Returns:
        Series named ``"turnover_ratio"``.  Higher values indicate more
        active trading.

    Example:
        >>> import pandas as pd
        >>> volume = pd.Series([1e6, 1.5e6, 0.8e6])
        >>> ratio = turnover_ratio(volume, shares_outstanding=100e6)
        >>> float(ratio.iloc[0])
        0.01

    See Also:
        amihud_illiquidity: Price-impact-based liquidity measure.
    """
    traded = coerce_series(volume, "volume")
    # Only `volume` is coerced; the denominator is used as supplied.
    turnover = traded / shares_outstanding
    turnover.name = "turnover_ratio"
    return turnover



# ---------------------------------------------------------------------------
# Enhanced liquidity analytics
# ---------------------------------------------------------------------------



[docs]
def corwin_schultz_spread(
    high: pd.Series,
    low: pd.Series,
    window: int = 1,
) -> pd.Series:
    """Corwin & Schultz (2012) high-low spread estimator.

    Daily highs tend to be buyer-initiated (at the ask) and daily lows
    seller-initiated (at the bid), so the high/low ratio reflects both
    volatility and the spread.  Volatility grows with the horizon while
    the spread does not, so comparing one-day ranges with the two-day
    range separates the two.

    **When to use**: When only daily high/low data is available.  Unlike
    the Roll (1984) estimator it does not depend on a negative serial
    covariance of price changes.

    **Interpretation**: Prices enter only through high/low ratios, so
    the estimate is scale-free -- a proportional spread (fraction of
    price), not a price-unit quantity.  Negative raw estimates are
    floored at zero.

    Parameters:
        high: Daily high prices.
        low: Daily low prices.
        window: Averaging window for the spread estimate.  ``window=1``
            returns the raw daily estimate.

    Returns:
        Series named ``"corwin_schultz_spread"``, floored at zero.  The
        first row is NaN because it has no preceding day.

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> close = pd.Series(100 + np.cumsum(np.random.randn(100) * 0.5))
        >>> high = close + np.abs(np.random.randn(100)) * 0.3
        >>> low = close - np.abs(np.random.randn(100)) * 0.3
        >>> spread = corwin_schultz_spread(high, low)
        >>> bool((spread.dropna() >= 0).all())
        True

    References:
        Corwin, S. A. & Schultz, P. (2012). "A Simple Way to Estimate
        Bid-Ask Spreads from Daily High and Low Prices." *Journal of
        Finance*, 67(2), 719-760.

    See Also:
        roll_spread: Implied spread from trade prices only.
        effective_spread: Direct spread from trade and quote data.
    """
    hi = coerce_series(high, "high")
    lo = coerce_series(low, "low")

    # beta: squared one-day log range, summed over today and yesterday.
    sq_range_1d = np.log(hi / lo) ** 2
    beta_2d = sq_range_1d + sq_range_1d.shift(1)

    # gamma: squared log range of the combined two-day high and low.
    hi_2d = pd.concat([hi, hi.shift(1)], axis=1).max(axis=1)
    lo_2d = pd.concat([lo, lo.shift(1)], axis=1).min(axis=1)
    gamma_2d = np.log(hi_2d / lo_2d) ** 2

    # alpha = (sqrt(2*beta) - sqrt(beta)) / (3 - 2*sqrt(2))
    #         - sqrt(gamma / (3 - 2*sqrt(2)))
    denom = 3.0 - 2.0 * np.sqrt(2.0)
    alpha = (np.sqrt(2.0 * beta_2d) - np.sqrt(beta_2d)) / denom - np.sqrt(gamma_2d / denom)

    # S = 2 * (e^alpha - 1) / (1 + e^alpha); negative values are artefacts.
    e_alpha = np.exp(alpha)
    estimate = (2.0 * (e_alpha - 1.0) / (1.0 + e_alpha)).clip(lower=0.0)

    if window > 1:
        estimate = estimate.rolling(window).mean()

    estimate.name = "corwin_schultz_spread"
    return estimate




[docs]
def closing_quoted_spread(
    bid_close: pd.Series,
    ask_close: pd.Series,
) -> pd.Series:
    """Quoted bid-ask spread at the market close (``ask - bid``).

    **When to use**: For a daily spread time series built from closing
    quotes, e.g. when assessing costs for traders who execute at or near
    the close and intraday data is unavailable.

    **Interpretation**: A narrower spread means better end-of-day
    liquidity.

    Parameters:
        bid_close: Best bid price at market close.
        ask_close: Best ask price at market close.

    Returns:
        Series named ``"closing_quoted_spread"``, in price units.

    Example:
        >>> import pandas as pd
        >>> bid = pd.Series([99.90, 99.85, 99.95])
        >>> ask = pd.Series([100.10, 100.15, 100.05])
        >>> spread = closing_quoted_spread(bid, ask)
        >>> round(float(spread.iloc[0]), 2)
        0.2

    References:
        Chordia, T., Roll, R. & Subrahmanyam, A. (2001). "Market Liquidity
        and Trading Activity." *Journal of Finance*, 56(2), 501-530.

    See Also:
        effective_spread: Execution-weighted spread measure.
        relative_spread: Spread normalized by midpoint.
    """
    best_bid = coerce_series(bid_close, "bid_close")
    best_ask = coerce_series(ask_close, "ask_close")
    quoted = best_ask - best_bid
    quoted.name = "closing_quoted_spread"
    return quoted




[docs]
def depth_imbalance(
    bid_depth: pd.Series | NDArray[np.floating],
    ask_depth: pd.Series | NDArray[np.floating],
) -> pd.Series | NDArray[np.floating]:
    """Order book depth imbalance.

    Computes ``(bid_depth - ask_depth) / (bid_depth + ask_depth)`` to
    measure the directional imbalance in resting limit order volume.

    **When to use**: For assessing supply-demand imbalance in the limit
    order book.

    **Interpretation**:

    - **+1**: All depth is on the bid side.
    - **-1**: All depth is on the ask side.
    - **0**: Balanced book.  Rows whose total depth is not strictly
      positive (zero, negative or NaN) are also reported as 0.

    Parameters:
        bid_depth: Total volume at the best bid (or top-N bid levels).
        ask_depth: Total volume at the best ask (or top-N ask levels).

    Returns:
        Depth imbalance, in [-1, 1] for non-negative depths.  A Series
        named ``"depth_imbalance"`` (indexed like the coerced bid series)
        when *bid_depth* is a Series, otherwise a NumPy array.

    Example:
        >>> import pandas as pd
        >>> bid_depth = pd.Series([5000, 3000, 4000])
        >>> ask_depth = pd.Series([3000, 5000, 4000])
        >>> imb = depth_imbalance(bid_depth, ask_depth)
        >>> float(imb.iloc[0])  # more bids than asks -> positive
        0.25

    References:
        Cao, C., Hansch, O. & Wang, X. (2009). "The Information Content
        of an Open Limit-Order Book." *Journal of Futures Markets*, 29(1),
        16-41.

    See Also:
        wraquant.microstructure.toxicity.order_flow_imbalance:
            Volume-based imbalance measure.
        wraquant.microstructure.market_quality.depth: Total market depth.
    """
    is_series = isinstance(bid_depth, pd.Series)
    bid_series = coerce_series(bid_depth, "bid_depth")
    ask_series = coerce_series(ask_depth, "ask_depth")
    bid_arr = bid_series.to_numpy(dtype=np.float64)
    ask_arr = ask_series.to_numpy(dtype=np.float64)

    total = bid_arr + ask_arr
    has_depth = total > 0
    # Divide only where depth exists. np.where would evaluate the quotient
    # for every row and emit divide/invalid warnings on empty books; the
    # pre-filled zeros supply the value for those rows instead.
    imbalance = np.divide(
        bid_arr - ask_arr,
        total,
        out=np.full(total.shape, 0.0),
        where=has_depth,
    )

    if is_series:
        return pd.Series(imbalance, index=bid_series.index, name="depth_imbalance")
    return imbalance




[docs]
def lambda_kyle_rolling(
    prices: pd.Series,
    volume: pd.Series,
    window: int = 20,
) -> pd.DataFrame:
    """Rolling Kyle's lambda with standard errors and a 95% band.

    For each window, price changes are regressed on signed volume (with
    an intercept) by ordinary least squares.  The slope is reported
    together with its standard error and the interval
    ``slope +/- 1.96 * std_err``.

    **When to use**: When the precision of the price-impact estimate
    matters as well as its level, e.g. to judge whether lambda differs
    from zero at a given point in time.

    **Interpretation**: A positive lambda means buy volume is associated
    with rising prices and sell volume with falling prices.  An interval
    that spans zero suggests little measurable price impact.

    Parameters:
        prices: Price series.
        volume: Signed volume series (positive for buys, negative for
            sells).
        window: Rolling regression window.  A window is skipped unless
            it holds at least 5 complete observations, so values below
            5 produce an all-NaN result.

    Returns:
        DataFrame indexed like *prices* with columns ``'lambda'``,
        ``'std_err'``, ``'ci_lower'``, ``'ci_upper'``.  The first
        *window* rows, and any window that is skipped, are NaN.

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> prices = pd.Series(100 + np.cumsum(np.random.randn(50) * 0.1))
        >>> volume = pd.Series(np.random.randn(50) * 1000)
        >>> result = lambda_kyle_rolling(prices, volume, window=20)
        >>> list(result.columns)
        ['lambda', 'std_err', 'ci_lower', 'ci_upper']

    References:
        Kyle, A. S. (1985). "Continuous Auctions and Insider Trading."
        *Econometrica*, 53(6), 1315-1335.

    See Also:
        kyle_lambda: Simple point estimate without confidence intervals.
        amihud_rolling: Rolling Amihud illiquidity ratio.
    """
    prices = coerce_series(prices, "prices")
    volume = coerce_series(volume, "volume")

    # Pull the raw arrays once; the loop below only slices them.
    dp_all = prices.diff().values
    vol_all = volume.values
    n_rows = len(prices)
    slopes = np.full(n_rows, np.nan)
    std_errs = np.full(n_rows, np.nan)

    # Start at `window` so the first window skips the NaN from diff().
    for end in range(window, n_rows):
        start = end - window + 1
        dp_win = dp_all[start : end + 1]
        vol_win = vol_all[start : end + 1]

        # Keep only rows where both the response and regressor are present.
        usable = ~np.isnan(dp_win) & ~np.isnan(vol_win)
        if usable.sum() < 5:
            continue

        y_obs = dp_win[usable]
        x_obs = vol_win[usable]

        n_obs = len(y_obs)
        x_mean = np.mean(x_obs)
        y_mean = np.mean(y_obs)
        x_dev = x_obs - x_mean
        ss_x = np.sum(x_dev ** 2)

        # A (near-)constant regressor leaves the slope undefined.
        if ss_x < 1e-15:
            continue

        slope = np.sum(x_dev * (y_obs - y_mean)) / ss_x
        fitted = y_mean - slope * x_mean + slope * x_obs
        resid = y_obs - fitted
        # Residual variance with n - 2 degrees of freedom (floored at 1).
        resid_var = np.sum(resid ** 2) / max(n_obs - 2, 1)

        slopes[end] = slope
        std_errs[end] = np.sqrt(resid_var / ss_x)

    lam = pd.Series(slopes, index=prices.index, name="lambda")
    se = pd.Series(std_errs, index=prices.index, name="std_err")
    half_width = 1.96 * se

    return pd.DataFrame(
        {
            "lambda": lam,
            "std_err": se,
            "ci_lower": lam - half_width,
            "ci_upper": lam + half_width,
        },
        index=prices.index,
    )




[docs]
def amihud_rolling(
    returns: pd.Series,
    volume: pd.Series,
    window: int = 21,
    normalize: bool = True,
) -> pd.Series:
    """Rolling Amihud (2002) illiquidity ratio, optionally normalized.

    The per-period ratio ``|return| / dollar volume`` is averaged over a
    rolling window.  With *normalize* the rolling series is then divided
    by its own time-series mean, which removes the dependence on price
    level and volume scale.

    **When to use**: For tracking how one asset's liquidity evolves over
    time on a scale that is comparable across assets.

    **Interpretation**: Higher values indicate less liquidity (more
    price movement per unit of dollar volume).  After normalization a
    value of 1.0 is the sample-average level.

    Parameters:
        returns: Asset return series.
        volume: Dollar volume series (price * shares traded).
        window: Rolling window size (default 21 for ~1 month of trading
            days).
        normalize: If *True*, divide the rolling series by its mean over
            the full sample (NaNs ignored).  The mean uses every
            observation, including later ones, so normalized values are
            not point-in-time.  The division is skipped when that mean
            is not strictly positive.

    Returns:
        Series named ``"amihud_rolling"``.

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> returns = pd.Series(np.random.randn(100) * 0.01)
        >>> volume = pd.Series(np.random.uniform(1e6, 5e6, 100))
        >>> illiq = amihud_rolling(returns, volume, window=21)
        >>> illiq.name
        'amihud_rolling'

    References:
        Amihud, Y. (2002). "Illiquidity and Stock Returns: Cross-Section
        and Time-Series Effects." *Journal of Financial Markets*, 5(1),
        31-56.

    See Also:
        amihud_illiquidity: Static (full-sample) Amihud ratio.
        liquidity_commonality: How much liquidity co-moves with the market.
    """
    abs_ret = np.abs(coerce_series(returns, "returns"))
    dollar_vol = coerce_series(volume, "volume")
    # Zero volume produces +/-inf; treat those periods as missing.
    per_period = (abs_ret / dollar_vol).replace([np.inf, -np.inf], np.nan)
    smoothed = per_period.rolling(window).mean()

    if normalize:
        sample_mean = np.nanmean(smoothed)
        if sample_mean > 0:
            smoothed = smoothed / sample_mean

    smoothed.name = "amihud_rolling"
    return smoothed




[docs]
def liquidity_commonality(
    asset_illiquidity: pd.Series,
    market_illiquidity: pd.Series,
    window: int = 60,
) -> pd.Series:
    """Commonality in liquidity (Chordia, Roll & Subrahmanyam, 2000).

    Measures how closely changes in an asset's illiquidity track changes
    in market-wide illiquidity.  For each rolling window the R-squared
    of a univariate regression of the asset's first differences on the
    market's first differences is computed as
    ``ss_xy**2 / (ss_xx * ss_yy)``.

    **When to use**: For assessing systematic liquidity risk -- assets
    with high commonality lose liquidity at the same time as the market.

    **Interpretation**: Values closer to 1 indicate stronger co-movement
    with market liquidity; values near 0 indicate little co-movement.

    Parameters:
        asset_illiquidity: Individual asset's illiquidity measure (e.g.,
            Amihud ratio, effective spread) as a time series.
        market_illiquidity: Market-wide illiquidity aggregate (e.g.,
            equal-weighted average Amihud ratio across all stocks).
        window: Rolling regression window (default 60 for ~3 months).

    Returns:
        Series named ``"liquidity_commonality"`` indexed like
        *asset_illiquidity*.  The first *window* rows, and windows with
        fewer than 5 complete observations, are NaN.  A window in which
        either differenced series is (near-)constant is assigned 0.0.

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> asset = pd.Series(np.random.randn(200).cumsum())
        >>> market = pd.Series(np.random.randn(200).cumsum())
        >>> r2 = liquidity_commonality(asset, market, window=60)
        >>> r2.name
        'liquidity_commonality'

    References:
        Chordia, T., Roll, R. & Subrahmanyam, A. (2000). "Commonality in
        Liquidity." *Journal of Financial Economics*, 56(1), 3-28.

    See Also:
        amihud_rolling: Generate the illiquidity input for this function.
    """
    asset_illiquidity = coerce_series(asset_illiquidity, "asset_illiquidity")
    market_illiquidity = coerce_series(market_illiquidity, "market_illiquidity")

    # Work on first differences, extracted once as raw arrays.
    asset_chg = asset_illiquidity.diff().values
    market_chg = market_illiquidity.diff().values
    fit = np.full(len(asset_illiquidity), np.nan)

    for end in range(window, len(asset_chg)):
        start = end - window + 1
        y_win = asset_chg[start : end + 1]
        x_win = market_chg[start : end + 1]

        # Use only rows where both differences are available.
        usable = ~np.isnan(y_win) & ~np.isnan(x_win)
        if usable.sum() < 5:
            continue

        y_dev = y_win[usable]
        x_dev = x_win[usable]
        x_dev = x_dev - np.mean(x_dev)
        y_dev = y_dev - np.mean(y_dev)
        ss_xx = np.sum(x_dev ** 2)
        ss_yy = np.sum(y_dev ** 2)

        # No variation on either side means nothing can be explained.
        if ss_xx < 1e-15 or ss_yy < 1e-15:
            fit[end] = 0.0
            continue

        ss_xy = np.sum(x_dev * y_dev)
        fit[end] = (ss_xy ** 2) / (ss_xx * ss_yy)

    return pd.Series(fit, index=asset_illiquidity.index, name="liquidity_commonality")




[docs]
def spread_decomposition(
    trade_prices: pd.Series,
    bid: pd.Series,
    ask: pd.Series,
    direction: pd.Series,
    delay: int = 5,
) -> dict[str, float]:
    """Three-way spread decomposition in the spirit of Huang-Stoll (1997).

    Splits the mean effective half-spread into three fractions:

    1. **Adverse selection**: the permanent part, measured as the
       signed midpoint revision *delay* observations after the trade,
       relative to the effective half-spread (clipped to [0, 1]).
    2. **Inventory holding**: a share of the remaining (transitory)
       part equal to the absolute lag-1 autocorrelation of trade
       direction -- a simple proxy, not a structural estimate.
    3. **Order processing**: whatever remains of the transitory part.

    The three fractions sum to 1 whenever they are defined.

    **When to use**: For a quick read on *why* spreads are wide.  A
    large adverse-selection fraction points to information asymmetry; a
    large order-processing fraction points to mechanical costs.

    Parameters:
        trade_prices: Executed trade prices.
        bid: Best bid prices at time of each trade.
        ask: Best ask prices at time of each trade.
        direction: Trade direction indicator (+1 buy, -1 sell).
        delay: Number of observations to look ahead for measuring the
            permanent price impact (default 5).

    Returns:
        Dictionary with keys:

        - ``'adverse_selection'``: fraction of the spread due to
          information asymmetry.
        - ``'order_processing'``: fraction due to order handling costs.
        - ``'inventory_holding'``: fraction due to inventory risk.
        - ``'effective_spread_mean'``: average effective spread (twice
          the mean signed half-spread).

        The three fractions are NaN when the mean effective half-spread
        is not strictly positive or cannot be computed (for example when
        no row has both a half-spread and a future midpoint).

    Example:
        >>> import pandas as pd, numpy as np
        >>> np.random.seed(42)
        >>> n = 200
        >>> mid = 100 + np.cumsum(np.random.randn(n) * 0.01)
        >>> spread_half = 0.05
        >>> bid = pd.Series(mid - spread_half)
        >>> ask = pd.Series(mid + spread_half)
        >>> direction = pd.Series(np.random.choice([1, -1], n))
        >>> trades = pd.Series(np.where(direction > 0, ask, bid))
        >>> result = spread_decomposition(trades, bid, ask, direction)
        >>> 0 <= result['adverse_selection'] <= 1
        True

    References:
        Huang, R. D. & Stoll, H. R. (1997). "The Components of the
        Bid-Ask Spread: A General Approach." *Review of Financial Studies*,
        10(4), 995-1034.

    See Also:
        effective_spread: Total execution cost measure.
        realized_spread: Liquidity provider's revenue component.
    """
    trade_prices = coerce_series(trade_prices, "trade_prices")
    bid = coerce_series(bid, "bid")
    ask = coerce_series(ask, "ask")
    direction = coerce_series(direction, "direction")
    mid = (bid + ask) / 2.0

    # Signed effective half-spread per trade.
    eff_half = direction * (trade_prices - mid)

    # Permanent component: midpoint revision in the direction of the trade.
    permanent = direction * (mid.shift(-delay) - mid)

    # Keep rows where both quantities exist (the tail lacks a future mid).
    valid = ~(eff_half.isna() | permanent.isna())
    eff_valid = eff_half[valid]
    perm_valid = permanent[valid]

    # An empty sample has no mean; skip np.nanmean so it does not warn.
    mean_eff = float(np.nanmean(eff_valid)) if len(eff_valid) else float("nan")

    # "not > 0" covers non-positive and NaN means alike, so every
    # degenerate sample takes this early return.
    if not mean_eff > 0:
        return {
            "adverse_selection": float("nan"),
            "order_processing": float("nan"),
            "inventory_holding": float("nan"),
            "effective_spread_mean": float(mean_eff * 2.0),
        }

    mean_perm = float(np.nanmean(perm_valid))

    # Adverse selection fraction, bounded to a valid share.
    adverse_frac = np.clip(mean_perm / mean_eff, 0.0, 1.0)

    # Transitory remainder = order processing + inventory.
    transitory_frac = 1.0 - adverse_frac

    # Split the transitory part using the serial correlation of trade
    # direction as a proxy for inventory management.
    dir_arr = direction[valid].values.astype(np.float64)
    inventory_share = 0.0
    # Fewer than two lagged pairs has no defined correlation.
    if len(dir_arr) > 2:
        # A constant direction gives 0/0 inside corrcoef; silence the
        # floating-point warning and map the NaN to zero below.
        with np.errstate(invalid="ignore", divide="ignore"):
            autocorr = np.corrcoef(dir_arr[:-1], dir_arr[1:])[0, 1]
        if not np.isnan(autocorr):
            inventory_share = np.clip(abs(autocorr), 0.0, 1.0)

    inventory_frac = transitory_frac * inventory_share
    processing_frac = transitory_frac * (1.0 - inventory_share)

    return {
        "adverse_selection": float(adverse_frac),
        "order_processing": float(processing_frac),
        "inventory_holding": float(inventory_frac),
        "effective_spread_mean": float(mean_eff * 2.0),
    }