# Source code for wraquant.news.sentiment

"""Sentiment analysis and news-based signal generation.

Provides functions for scoring news sentiment, building sentiment time series,
and generating discrete trading signals from textual data.  Uses FMP as the
primary news data source and includes a built-in keyword-based sentiment
scorer that requires no NLP dependencies.  When ``textblob`` or ``vaderSentiment``
are installed, those engines can be used for higher-quality scoring.

The sentiment pipeline is:

1. **Fetch** -- Pull headlines/articles via ``FMPClient``.
2. **Score** -- Assign each headline a score in [-1, +1].
3. **Aggregate** -- Combine scores with recency weighting.
4. **Signal** -- Convert the aggregate into a discrete trading signal.

References:
    - Tetlock (2007), "Giving Content to Investor Sentiment"
    - Loughran & McDonald (2011), "When Is a Liability Not a Liability?"
    - Hutto & Gilbert (2014), "VADER: A Parsimonious Rule-based Model for
      Sentiment Analysis of Social Media Text"
"""

from __future__ import annotations

import math
import re
from typing import Any, Sequence

import numpy as np
import pandas as pd

from wraquant._lazy import is_available
from wraquant.core.decorators import requires_extra

# ---------------------------------------------------------------------------
# Built-in keyword lexicon (Loughran-McDonald inspired)
# ---------------------------------------------------------------------------

# Bullish vocabulary for the built-in keyword scorer.  Entries are lowercase
# because ``_keyword_score`` lowercases the text before looking tokens up.
# Words are grouped by stem/theme purely for readability; order is irrelevant.
_POSITIVE_WORDS: frozenset[str] = frozenset(
    """
    beat beats
    exceed exceeds exceeded
    top tops topped
    surge surges surged
    soar soars soared
    rally rallies rallied
    gain gains gained
    boost boosts boosted
    profit profitable profitability
    upgrade upgrades upgraded
    outperform outperforms outperformed
    bullish optimistic positive upside favorable
    strong stronger strongest
    robust solid impressive
    record high higher highest
    growth growing grew
    expand expands expanded expansion
    breakout breakthrough boom booming
    recover recovers recovered recovery rebound rebounds
    innovation innovative opportunity opportunities
    success successful win wins won
    dividend buyback repurchase
    approval approved approves
    launch launches launched
    partnership collaboration acquisition
    momentum accelerate accelerated accelerating
    """.split()
)

# Bearish vocabulary for the built-in keyword scorer.  Entries are lowercase
# because ``_keyword_score`` lowercases the text before looking tokens up.
# Hyphenated variants ("sell-off", "write-down") are listed explicitly since
# the tokenizer keeps hyphens inside a token.
_NEGATIVE_WORDS: frozenset[str] = frozenset(
    """
    miss misses missed
    decline declines declined
    drop drops dropped
    fall falls fell
    loss losses losing
    downgrade downgrades downgraded
    underperform underperforms underperformed
    bearish pessimistic negative downside
    weak weaker weakest
    low lower lowest
    risk risks risky
    crash crashes crashed
    selloff sell-off
    recession recessionary contraction
    shrink shrinks shrunk
    bankruptcy bankrupt
    default defaults defaulted
    fraud fraudulent scandal
    investigation lawsuit litigation
    fine fined penalty
    warning warns warned
    cut cuts
    layoff layoffs
    restructuring impairment
    writedown write-down
    volatility volatile
    uncertainty uncertain
    concern concerns
    worried worry
    fear fears
    plunge plunges plunged
    slump slumps slumped
    tumble tumbles tumbled
    deficit debt
    overvalued bubble
    inflation inflationary
    tariff tariffs sanctions
    shutdown
    delay delays delayed
    disappointing disappointed disappoint
    """.split()
)

# Tokens that flip the polarity of the token that follows them in
# ``_keyword_score``.  Contractions use the ASCII apostrophe, which is the
# only apostrophe the tokenizer pattern accepts.
_NEGATION_WORDS: frozenset[str] = frozenset(
    (
        "not no never neither nor hardly barely scarcely "
        "doesn't don't didn't isn't aren't wasn't weren't "
        "won't wouldn't couldn't shouldn't"
    ).split()
)

# Adverbs that amplify the token that follows them in ``_keyword_score``.
_INTENSIFIER_WORDS: frozenset[str] = frozenset(
    (
        "very extremely significantly substantially dramatically "
        "sharply strongly massively hugely remarkably"
    ).split()
)

# Tokenizer for the keyword scorer: a token is a run of lowercase ASCII
# letters, apostrophes and hyphens, so contractions ("don't") and hyphenated
# terms ("sell-off") stay in one piece.  The pattern has no uppercase class,
# so callers must lowercase the text first.
_WORD_RE = re.compile(r"[a-z'\-]+")


def _keyword_score(text: str) -> float:
    """Score a text with the built-in keyword lexicon.

    The text is lowercased and tokenized, then each token found in the
    positive or negative word list contributes +1 or -1.  A negation or
    intensifier token modifies only the token that comes directly after
    the modifier run: any other token, sentiment-bearing or not, clears
    both pending modifiers.

    Parameters:
        text: Raw text to score.

    Returns:
        Sentiment score in [-1.0, +1.0]; 0.0 when the text has no tokens.
    """
    tokens = _WORD_RE.findall(text.lower())
    if len(tokens) == 0:
        return 0.0

    total = 0.0
    pending_negation = False
    pending_boost = False

    for token in tokens:
        if token in _NEGATION_WORDS:
            pending_negation = True
        elif token in _INTENSIFIER_WORDS:
            pending_boost = True
        else:
            if token in _POSITIVE_WORDS:
                polarity = 1.0
            elif token in _NEGATIVE_WORDS:
                polarity = -1.0
            else:
                polarity = 0.0

            if polarity != 0.0:
                if pending_boost:
                    polarity *= 1.5
                if pending_negation:
                    # Negation flips the sign but damps the magnitude.
                    polarity *= -0.75
                total += polarity

            # Modifiers are cleared by every ordinary token, not only by
            # sentiment words, so they never reach past a neutral word.
            pending_negation = False
            pending_boost = False

    # Dividing by sqrt(token count) damps long texts without erasing the
    # effect of repeated sentiment words.
    scaled = total / math.sqrt(len(tokens))
    return float(min(1.0, max(-1.0, scaled)))


def _vader_score(text: str) -> float:
    """Score text using the VADER sentiment analyzer.

    The analyzer is created on first use and cached on the function
    object.  Scoring is called once per headline, and building a new
    ``SentimentIntensityAnalyzer`` each time repeats its lexicon setup
    for no benefit.

    Parameters:
        text: Raw text to score.

    Returns:
        VADER compound score in [-1.0, +1.0].
    """
    analyzer = getattr(_vader_score, "_analyzer", None)
    if analyzer is None:
        # Imported lazily so the module still loads without vaderSentiment.
        from vaderSentiment.vaderSentiment import (
            SentimentIntensityAnalyzer,  # type: ignore[import-untyped]
        )

        analyzer = SentimentIntensityAnalyzer()
        _vader_score._analyzer = analyzer  # type: ignore[attr-defined]

    return float(analyzer.polarity_scores(text)["compound"])


def _textblob_score(text: str) -> float:
    """Score text with TextBlob's polarity measure.

    Parameters:
        text: Raw text to score.

    Returns:
        The ``sentiment.polarity`` value TextBlob reports for *text*,
        converted to a plain float.
    """
    # Imported lazily so the module still loads without textblob.
    from textblob import TextBlob  # type: ignore[import-untyped]

    polarity = TextBlob(text).sentiment.polarity
    return float(polarity)


def _get_scorer(engine: str = "auto") -> tuple[str, Any]:
    """Resolve an engine name to a ``(name, scorer)`` pair.

    Parameters:
        engine: ``"auto"``, ``"keyword"``, ``"vader"`` or ``"textblob"``.
            ``"auto"`` picks VADER if it is installed, otherwise TextBlob
            if it is installed, otherwise the built-in keyword scorer.

    Returns:
        Tuple of (engine_name, scorer_function).

    Raises:
        ValueError: If the engine name is unknown, or names an optional
            engine whose package is not installed.
    """
    # (engine name, package to probe, scorer, error when missing), listed in
    # the order "auto" prefers them.
    optional_engines = (
        (
            "vader",
            "vaderSentiment",
            _vader_score,
            "vaderSentiment is not installed.  "
            "Install it with: pip install vaderSentiment",
        ),
        (
            "textblob",
            "textblob",
            _textblob_score,
            "textblob is not installed.  Install it with: pip install textblob",
        ),
    )

    if engine == "keyword":
        return "keyword", _keyword_score

    if engine == "auto":
        for name, package, scorer, _ in optional_engines:
            if is_available(package):
                return name, scorer
        return "keyword", _keyword_score

    for name, package, scorer, missing_msg in optional_engines:
        if engine == name:
            if not is_available(package):
                raise ValueError(missing_msg)
            return name, scorer

    msg = f"Unknown sentiment engine: {engine!r}.  Use 'auto', 'keyword', 'vader', or 'textblob'."
    raise ValueError(msg)


# ---------------------------------------------------------------------------
# Recency weighting
# ---------------------------------------------------------------------------


def _recency_weights(dates: pd.Series, half_life_days: float = 7.0) -> np.ndarray:
    """Compute exponential-decay weights based on recency.

    Each date is weighted by ``0.5 ** (age / half_life_days)``, where age
    is measured in days back from the most recent date in *dates*.

    Parameters:
        dates: Series of datetime values (anything ``pd.to_datetime``
            accepts).  Missing values are allowed.
        half_life_days: Half-life of the decay in days.  Must be positive.

    Returns:
        Float array aligned with *dates*.  Dated entries get a weight in
        (0, 1] with the most recent equal to 1.0; entries whose date is
        missing get 0.0 so they drop out of weighted averages instead of
        turning them into NaN.  Empty input yields an empty array.

    Raises:
        ValueError: If *half_life_days* is not a positive number.
    """
    # ``not x > 0`` also rejects NaN, which ``x <= 0`` would let through.
    if not half_life_days > 0:
        msg = f"half_life_days must be positive, got {half_life_days!r}."
        raise ValueError(msg)

    if dates.empty:
        return np.array([])

    parsed = pd.to_datetime(dates, utc=True)
    # ``max`` skips NaT, so the reference point is the newest real date.
    age_days = (parsed.max() - parsed).dt.total_seconds() / 86400.0
    decay_rate = np.log(2) / half_life_days
    weights = np.exp(-decay_rate * age_days.to_numpy(dtype=float))

    # NaT ages propagate as NaN; give those rows no weight at all.
    return np.where(np.isnan(weights), 0.0, weights).astype(float)


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------



# [docs]
@requires_extra("market-data")
def news_sentiment(
    symbol: str,
    limit: int = 50,
    *,
    engine: str = "auto",
    half_life_days: float = 7.0,
    include_press_releases: bool = True,
) -> dict[str, Any]:
    """Analyze sentiment of recent news for a stock.

    Fetches recent news headlines (and optionally press releases) from FMP,
    scores each headline using the specified sentiment engine, and computes
    aggregate statistics including recency-weighted sentiment and trend
    direction.

    The recency weighting uses exponential decay so that recent articles
    contribute more to the aggregate than older ones.  The trend compares
    the mean score of the older half of the articles with the mean score
    of the newer half, after ordering them by publication date, to detect
    whether coverage is improving or deteriorating.

    Parameters:
        symbol: Ticker symbol (e.g., ``"AAPL"``).
        limit: Maximum number of news articles to fetch.  Higher values
            give a more robust sentiment estimate but include older news.
        engine: Sentiment scoring engine.  ``"auto"`` tries VADER, then
            TextBlob, then falls back to the built-in keyword scorer.
            Options: ``"auto"``, ``"keyword"``, ``"vader"``, ``"textblob"``.
        half_life_days: Half-life for recency weighting in days.  Default
            of 7 means a one-week-old article gets half the weight of
            today's article.
        include_press_releases: If True, also fetch press releases and
            include them in the analysis.  A failure to fetch them is
            logged at debug level and otherwise ignored.

    Returns:
        Dictionary containing:
        - **symbol** (*str*) -- Ticker symbol.
        - **engine** (*str*) -- Sentiment engine used.
        - **article_count** (*int*) -- Total number of articles scored.
        - **articles** (*list[dict]*) -- One dict per article, in the order
          the data was fetched, with keys ``title`` and ``sentiment`` plus
          ``date``, ``source`` and ``url`` when the data has those columns.
        - **aggregate** (*dict*) -- Aggregate statistics:
          - **mean** (*float*) -- Simple mean sentiment.
          - **weighted_mean** (*float*) -- Recency-weighted mean (falls
            back to the simple mean when no usable dates exist).
          - **median** (*float*) -- Median sentiment.
          - **std** (*float*) -- Sample standard deviation of scores.
          - **bullish_pct** (*float*) -- Fraction of scores above 0.05.
          - **bearish_pct** (*float*) -- Fraction of scores below -0.05.
          - **neutral_pct** (*float*) -- Remaining fraction.
        - **trend** (*str*) -- ``"improving"``, ``"deteriorating"``, or
          ``"stable"`` (absolute delta of at most 0.1).
        - **trend_delta** (*float*) -- Newer-half mean minus older-half mean.
        - **news_volume** (*str*) -- ``"high"``, ``"medium"``, or ``"low"``
          based on article count relative to limit.

    Raises:
        ValueError: If the fetched data has no recognizable title column,
            or the engine is unknown or not installed.

    Example:
        >>> from wraquant.news.sentiment import news_sentiment
        >>> result = news_sentiment("AAPL", limit=30)
        >>> print(f"Weighted sentiment: {result['aggregate']['weighted_mean']:.3f}")
        >>> print(f"Trend: {result['trend']}")

    Notes:
        Reference: Tetlock (2007). "Giving Content to Investor Sentiment."
        *The Journal of Finance*, 62(3), 1139-1168.

    See Also:
        sentiment_timeseries: Build a daily time series of sentiment.
        sentiment_signal: Convert sentiment to a trading signal.
    """
    from wraquant.data.providers.fmp import FMPClient

    client = FMPClient()
    engine_name, scorer = _get_scorer(engine)

    # Fetch news data
    news_df = client.stock_news(symbol, limit=limit)
    frames = [news_df]

    if include_press_releases:
        try:
            frames.append(client.press_releases(symbol, limit=max(10, limit // 3)))
        except Exception:  # noqa: BLE001
            # Best-effort: press releases may not be available for every
            # symbol, so carry on with news only but leave a trace.
            import logging

            logging.getLogger(__name__).debug(
                "Press releases unavailable for %s; using news only.",
                symbol,
                exc_info=True,
            )

    combined = pd.concat(frames, ignore_index=True) if len(frames) > 1 else news_df

    if combined.empty:
        return {
            "symbol": symbol,
            "engine": engine_name,
            "article_count": 0,
            "articles": [],
            "aggregate": {
                "mean": 0.0,
                "weighted_mean": 0.0,
                "median": 0.0,
                "std": 0.0,
                "bullish_pct": 0.0,
                "bearish_pct": 0.0,
                "neutral_pct": 0.0,
            },
            "trend": "stable",
            "trend_delta": 0.0,
            "news_volume": "low",
        }

    # Identify column names (FMP may vary)
    title_col = _resolve_col(combined, ["title", "headline", "text"])
    date_col = _resolve_col(combined, ["publishedDate", "date", "published_date"])
    source_col = _resolve_col(combined, ["site", "source", "publisher"])
    url_col = _resolve_col(combined, ["url", "link"])

    if not title_col:
        msg = "News DataFrame missing required title column."
        raise ValueError(msg)

    # Score each article
    titles = combined[title_col].fillna("").astype(str)
    scores = np.array([scorer(t) for t in titles], dtype=float)

    # Build the per-article records in fetch order.
    optional_fields = (("date", date_col), ("source", source_col), ("url", url_col))
    articles: list[dict[str, Any]] = []
    for pos, (title, score) in enumerate(zip(titles, scores)):
        article: dict[str, Any] = {"title": title, "sentiment": float(score)}
        for key, col in optional_fields:
            if col:
                article[key] = str(combined[col].iloc[pos])
        articles.append(article)

    # Aggregate statistics
    mean_score = float(np.mean(scores))
    median_score = float(np.median(scores))
    std_score = float(np.std(scores, ddof=1)) if len(scores) > 1 else 0.0

    # Recency-weighted mean.  Rows without a usable weight (e.g. a missing
    # date) are left out rather than forcing a fall-back to the plain mean.
    weighted_mean = mean_score
    if date_col:
        weights = _recency_weights(combined[date_col], half_life_days)
        usable = np.isfinite(weights)
        if usable.any() and weights[usable].sum() > 0:
            weighted_mean = float(np.average(scores[usable], weights=weights[usable]))

    bullish_pct = float(np.mean(scores > 0.05))
    bearish_pct = float(np.mean(scores < -0.05))
    # Clamp: float round-off can push the remainder a hair below zero.
    neutral_pct = max(0.0, 1.0 - bullish_pct - bearish_pct)

    # Trend detection needs oldest-to-newest order.  Fetch order is not
    # reliable for that (press releases are appended after the news), so
    # sort by publication date and ignore rows whose date cannot be parsed.
    if date_col:
        published = pd.to_datetime(
            combined[date_col], utc=True, errors="coerce"
        ).reset_index(drop=True)
        dated = published.dropna().sort_values(kind="mergesort")
        chron_scores = scores[dated.index.to_numpy()]
    else:
        # NOTE(review): with no date column the fetch order is used as-is;
        # whether that is oldest-first depends on the provider -- confirm.
        chron_scores = scores

    mid = len(chron_scores) // 2
    if mid > 0:
        older_mean = float(np.mean(chron_scores[:mid]))
        newer_mean = float(np.mean(chron_scores[mid:]))
        trend_delta = newer_mean - older_mean
        if trend_delta > 0.1:
            trend = "improving"
        elif trend_delta < -0.1:
            trend = "deteriorating"
        else:
            trend = "stable"
    else:
        trend = "stable"
        trend_delta = 0.0

    # News volume assessment
    if len(combined) >= limit * 0.8:
        news_volume = "high"
    elif len(combined) >= limit * 0.3:
        news_volume = "medium"
    else:
        news_volume = "low"

    return {
        "symbol": symbol,
        "engine": engine_name,
        "article_count": len(combined),
        "articles": articles,
        "aggregate": {
            "mean": mean_score,
            "weighted_mean": weighted_mean,
            "median": median_score,
            "std": std_score,
            "bullish_pct": bullish_pct,
            "bearish_pct": bearish_pct,
            "neutral_pct": neutral_pct,
        },
        "trend": trend,
        "trend_delta": trend_delta,
        "news_volume": news_volume,
    }




# [docs]
@requires_extra("market-data")
def sentiment_timeseries(
    symbol: str,
    days: int = 90,
    *,
    engine: str = "auto",
    resample: str = "D",
) -> pd.Series:
    """Build a daily (or custom frequency) sentiment time series.

    Fetches up to ``days`` worth of news for a symbol, scores each article,
    and resamples into a regular time series by averaging sentiment within
    each period.  Periods without articles are forward-filled (and any
    leading gap back-filled) so the series can be used directly alongside
    price data.

    When no articles fall inside the window, an all-zero (neutral) series
    covering the last ``days`` days is returned at the requested frequency.

    Parameters:
        symbol: Ticker symbol (e.g., ``"MSFT"``).
        days: Number of calendar days of history to request.  FMP may
            return fewer articles than this span covers.
        engine: Sentiment scoring engine (see ``news_sentiment``).
        resample: Pandas resample frequency string.  ``"D"`` for daily,
            ``"W"`` for weekly, ``"B"`` for business days.

    Returns:
        pd.Series with a UTC DatetimeIndex and sentiment scores averaged
        per period.  Index name is ``"date"``, series name is
        ``"sentiment"``.

    Raises:
        ValueError: If the fetched data lacks a date or title column, or
            the engine is unknown or not installed.

    Example:
        >>> from wraquant.news.sentiment import sentiment_timeseries
        >>> ts = sentiment_timeseries("TSLA", days=30)
        >>> print(ts.tail())

    See Also:
        news_sentiment: Detailed sentiment analysis for a single snapshot.
        sentiment_signal: Convert the time series to a signal.
    """
    from wraquant.data.providers.fmp import FMPClient

    def _regularize(series: pd.Series) -> pd.Series:
        # One code path for real and fallback data, so both honour
        # ``resample`` and carry the documented index/series names.
        out = series.resample(resample).mean().ffill().bfill()
        out.index.name = "date"
        out.name = "sentiment"
        return out

    def _neutral_series() -> pd.Series:
        idx = pd.date_range(
            end=pd.Timestamp.now(tz="UTC").normalize(),
            periods=days,
            freq="D",
        )
        return _regularize(pd.Series(0.0, index=idx, name="sentiment"))

    client = FMPClient()
    _, scorer = _get_scorer(engine)

    # Budget roughly three articles per calendar day, never fewer than 50.
    estimated_limit = max(50, days * 3)
    news_df = client.stock_news(symbol, limit=estimated_limit)

    if news_df.empty:
        return _neutral_series()

    date_col = _resolve_col(news_df, ["publishedDate", "date", "published_date"])
    title_col = _resolve_col(news_df, ["title", "headline", "text"])

    if not date_col or not title_col:
        msg = "News DataFrame missing required date or title columns."
        raise ValueError(msg)

    df = news_df[[date_col, title_col]].copy()
    # Unparseable dates become NaT and are dropped by the cutoff filter
    # below instead of aborting the whole call.
    df["date"] = pd.to_datetime(df[date_col], utc=True, errors="coerce")
    df["sentiment"] = df[title_col].fillna("").astype(str).apply(scorer)

    # Filter to requested date range (NaT compares False and is dropped).
    cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days)
    df = df.loc[df["date"] >= cutoff]

    if df.empty:
        return _neutral_series()

    return _regularize(df.set_index("date")["sentiment"])




# [docs]
@requires_extra("market-data")
def sentiment_signal(
    symbol: str,
    threshold: float = 0.3,
    *,
    engine: str = "auto",
    half_life_days: float = 7.0,
    limit: int = 50,
) -> str:
    """Generate a discrete sentiment-based trading signal for a stock.

    Runs ``news_sentiment`` for the symbol and classifies its
    recency-weighted aggregate sentiment as bullish, bearish, or neutral
    based on the threshold.

    The signal logic is:
    - ``weighted_mean > threshold``  =>  ``"bullish"``
    - ``weighted_mean < -threshold`` =>  ``"bearish"``
    - Otherwise                      =>  ``"neutral"``

    Parameters:
        symbol: Ticker symbol (e.g., ``"GOOG"``).
        threshold: Absolute threshold for signal classification.
            Lower values produce more signals (more sensitive);
            higher values filter out weak sentiment.  Default of 0.3
            is moderately conservative.
        engine: Sentiment scoring engine (see ``news_sentiment``).
        half_life_days: Half-life for recency weighting (see
            ``news_sentiment``).
        limit: Maximum number of news articles to fetch (see
            ``news_sentiment``).  Defaults to 50, the value this function
            previously hard-coded.

    Returns:
        One of ``"bullish"``, ``"bearish"``, or ``"neutral"``.

    Example:
        >>> from wraquant.news.sentiment import sentiment_signal
        >>> signal = sentiment_signal("NVDA", threshold=0.2)
        >>> print(f"Sentiment signal: {signal}")

    See Also:
        news_sentiment: Full sentiment analysis with article-level detail.
        sentiment_timeseries: Historical sentiment time series.
    """
    result = news_sentiment(
        symbol,
        limit=limit,
        engine=engine,
        half_life_days=half_life_days,
    )

    weighted_mean = result["aggregate"]["weighted_mean"]

    if weighted_mean > threshold:
        return "bullish"
    if weighted_mean < -threshold:
        return "bearish"
    return "neutral"



# ---------------------------------------------------------------------------
# Legacy API (kept for backward compatibility)
# ---------------------------------------------------------------------------



# [docs]
def sentiment_score(
    texts: str | Sequence[str],
    *,
    engine: str = "auto",
) -> dict[str, Any]:
    """Score one or more text passages with a sentiment engine.

    This is the low-level scoring entry point: it does no data fetching
    or recency weighting.  Use ``news_sentiment`` for the full news
    pipeline.

    Parameters:
        texts: A single string, or a sequence of strings, to score.  A
            bare string is treated as a one-element sequence.
        engine: Sentiment engine: ``"auto"``, ``"keyword"``, ``"vader"``
            or ``"textblob"``.  ``"auto"`` tries VADER, then TextBlob,
            then the built-in keyword scorer.

    Returns:
        Dictionary containing:
        - **scores** (*list[float]*) -- One score per input text, in
          input order.
        - **mean_score** (*float*) -- Mean of the scores, or 0.0 when
          there are none.
        - **engine** (*str*) -- Name of the engine actually used.

    Example:
        >>> result = sentiment_score("Stock rallied on strong earnings")
        >>> print(f"Score: {result['scores'][0]:.3f}")
        >>> print(f"Engine: {result['engine']}")

    See Also:
        news_sentiment: Full news sentiment pipeline with data fetching.
        sentiment_signal: Discrete signal from sentiment.
    """
    passages = [texts] if isinstance(texts, str) else texts

    engine_name, scorer = _get_scorer(engine)
    scores = list(map(scorer, passages))

    if scores:
        mean_score = float(np.mean(scores))
    else:
        mean_score = 0.0

    return {
        "scores": scores,
        "mean_score": mean_score,
        "engine": engine_name,
    }




# [docs]
def news_impact(
    returns: pd.Series,
    event_dates: list | pd.DatetimeIndex,
    window: int = 5,
) -> dict[str, Any]:
    """Measure the impact of news events on returns via an event study.

    Thin wrapper around ``wraquant.causal.treatment.event_study``: the
    arguments are forwarded unchanged and the result's ``effect``
    attribute, when present and not ``None``, is reported as the
    cumulative abnormal return (CAR).

    When to use:
        Use news impact analysis to quantify whether specific news
        events (earnings releases, FDA approvals, geopolitical shocks)
        have a measurable effect on returns.

    Parameters:
        returns: Return series with a DatetimeIndex.
        event_dates: Event dates to study.
        window: Window size passed through to ``event_study``.

    Returns:
        Dictionary containing:
        - **car** (*float*) -- ``float(result.effect)``, or 0.0 when the
          event-study result has no ``effect`` value.
        - **event_results** -- The object returned by ``event_study``.

    Example:
        >>> import pandas as pd, numpy as np
        >>> rng = np.random.default_rng(42)
        >>> dates = pd.bdate_range("2023-01-01", periods=252)
        >>> rets = pd.Series(rng.normal(0.0005, 0.01, 252), index=dates)
        >>> events = [dates[50], dates[150]]
        >>> result = news_impact(rets, events, window=5)

    See Also:
        wraquant.causal.treatment.event_study: Underlying event study.
        earnings_surprise: Earnings-specific impact metric.
    """
    from wraquant.causal.treatment import event_study

    study = event_study(returns, event_dates, window=window)

    # A missing attribute and an explicit None both mean "no effect value".
    effect = getattr(study, "effect", None)
    car = 0.0 if effect is None else float(effect)

    return {"car": car, "event_results": study}




# [docs]
def earnings_surprise(
    actual: float,
    estimate: float,
) -> float:
    """Compute the standardized earnings surprise.

    Mathematical formulation:
        surprise = (actual - estimate) / |estimate|

    Dividing by the absolute estimate keeps the sign meaningful when the
    consensus itself is negative: beating a negative estimate is still a
    positive surprise.

    When to use:
        Use earnings surprise as an input to event-driven strategies.
        Combine with ``news_impact`` to quantify the return effect.

    Parameters:
        actual: Actual reported earnings per share.
        estimate: Consensus analyst estimate of earnings per share.

    Returns:
        Surprise as a float: positive for a beat, negative for a miss.
        When ``|estimate|`` is below 1e-12 the ratio is undefined and 0.0
        is returned instead.

    Example:
        >>> round(earnings_surprise(actual=2.50, estimate=2.30), 4)
        0.087
        >>> round(earnings_surprise(actual=1.80, estimate=2.00), 4)
        -0.1

    See Also:
        news_impact: Measure the return impact of events.
        sentiment_score: Score textual sentiment.
    """
    scale = abs(estimate)
    if scale < 1e-12:
        # Guard against dividing by a (near-)zero consensus.
        return 0.0
    return float((actual - estimate) / scale)




# [docs]
def sentiment_aggregate(
    scores: Sequence[float],
    method: str = "mean",
) -> float:
    """Aggregate multiple sentiment scores into a single composite.

    When to use:
        Use after collecting sentiment scores from multiple sources
        (multiple news articles, analyst reports, social media posts)
        to produce a single consensus sentiment for a given asset or
        time period.

    Parameters:
        scores: Sequence of sentiment scores (each in [-1, 1]).
        method: Aggregation method.  ``"mean"`` (default) computes the
            arithmetic mean.  ``"median"`` computes the median.

    Returns:
        Aggregated sentiment score as a float, or 0.0 when *scores* is
        empty.

    Raises:
        ValueError: If *method* is not ``"mean"`` or ``"median"``.  The
            method is validated before the data is inspected, so the
            error is raised for empty input as well.

    Example:
        >>> round(sentiment_aggregate([0.5, 0.3, -0.1, 0.7]), 4)
        0.35
        >>> round(sentiment_aggregate([0.5, 0.3, -0.1, 0.7], method="median"), 4)
        0.4

    See Also:
        sentiment_score: Generate individual scores.
        news_sentiment: Full sentiment pipeline.
    """
    # Validate the method first: an invalid method must be reported even
    # when there happens to be no data to aggregate.
    if method == "mean":
        reducer = np.mean
    elif method == "median":
        reducer = np.median
    else:
        msg = f"Unknown aggregation method: {method!r}. Use 'mean' or 'median'."
        raise ValueError(msg)

    arr = np.asarray(scores, dtype=float)
    if arr.size == 0:
        return 0.0

    return float(reducer(arr))




# [docs]
def news_signal(
    sentiment_series: pd.Series | Sequence[float],
    threshold: float = 0.5,
) -> pd.Series:
    """Convert continuous sentiment scores into discrete trading signals.

    Each score is mapped to +1 (bullish) when it is strictly above
    ``threshold``, to -1 (bearish) when it is strictly below
    ``-threshold``, and to 0 (neutral) otherwise.  The bullish test is
    applied first.

    When to use:
        Use as the final step in a sentiment pipeline, after scoring
        and aggregation, to generate position signals for a trading
        strategy.

    Parameters:
        sentiment_series: Series or sequence of sentiment scores.  A
            Series keeps its index; any other sequence gets a 0..n-1
            index.
        threshold: Absolute threshold for signal generation.

    Returns:
        pd.Series of integer signals (-1, 0, or +1).

    Example:
        >>> import pandas as pd
        >>> sent = pd.Series([0.8, 0.3, -0.6, 0.1, -0.9])
        >>> news_signal(sent, threshold=0.5).tolist()
        [1, 0, -1, 0, -1]

    See Also:
        sentiment_score: Generate sentiment scores.
        sentiment_aggregate: Combine multiple scores.
    """
    is_series = isinstance(sentiment_series, pd.Series)
    if is_series:
        values = sentiment_series.values.astype(float)
    else:
        values = np.asarray(sentiment_series, dtype=float)
    labels = sentiment_series.index if is_series else range(len(values))

    # np.select takes the first matching condition, so the bullish test
    # wins whenever both conditions hold.
    signals = np.select(
        [values > threshold, values < -threshold],
        [1, -1],
        default=0,
    )

    return pd.Series(signals.astype(int), index=labels)



# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _resolve_col(
    df: pd.DataFrame,
    candidates: list[str],
) -> str | None:
    """Pick the first candidate name that is a column of *df*.

    Parameters:
        df: DataFrame whose columns are searched.
        candidates: Possible column names, in order of preference.

    Returns:
        The earliest candidate present in ``df.columns``, or None when
        none of them is present.
    """
    return next((name for name in candidates if name in df.columns), None)