Source code for wraquant.data.cleaning_advanced

"""Advanced data cleaning integrations using optional packages.

Provides wrappers around pyjanitor, rapidfuzz, dateparser,
price-parser, country-converter, ftfy, and unidecode for column
name cleaning, fuzzy merging, flexible date parsing, price parsing,
country normalisation, and text encoding fixes.
"""

from __future__ import annotations

from typing import Any

import pandas as pd

from wraquant.core.decorators import requires_extra

__all__ = [
    "janitor_clean_names",
    "janitor_remove_empty",
    "fuzzy_merge",
    "parse_dates_flexible",
    "parse_prices",
    "normalize_countries",
    "fix_text",
]



[docs]
@requires_extra("cleaning")
def janitor_clean_names(df: pd.DataFrame) -> pd.DataFrame:
    """Clean DataFrame column names using pyjanitor.

    Converts column names to lowercase snake_case, strips whitespace,
    and replaces special characters with underscores.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with messy column names.

    Returns
    -------
    pd.DataFrame
        DataFrame with cleaned column names.
    """
    import janitor  # noqa: F401 — registers .clean_names accessor

    return df.clean_names(remove_special=True)




[docs]
@requires_extra("cleaning")
def janitor_remove_empty(df: pd.DataFrame) -> pd.DataFrame:
    """Remove empty rows and columns using pyjanitor.

    Drops rows and columns that are entirely NaN or empty.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame possibly containing empty rows/columns.

    Returns
    -------
    pd.DataFrame
        DataFrame with empty rows and columns removed.
    """
    import janitor  # noqa: F401

    return df.remove_empty()




[docs]
@requires_extra("cleaning")
def fuzzy_merge(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    left_col: str,
    right_col: str,
    threshold: float = 80.0,
) -> pd.DataFrame:
    """Merge two DataFrames using fuzzy string matching via rapidfuzz.

    For each value in *left_col* of *df1*, the best match above
    *threshold* in *right_col* of *df2* is found. Matched rows are
    joined; unmatched rows from *df1* are retained with NaN for *df2*
    columns.

    Parameters
    ----------
    df1 : pd.DataFrame
        Left DataFrame.
    df2 : pd.DataFrame
        Right DataFrame.
    left_col : str
        Column name in *df1* to match on.
    right_col : str
        Column name in *df2* to match on.
    threshold : float, default 80.0
        Minimum similarity score (0--100) to consider a match.

    Returns
    -------
    pd.DataFrame
        Merged DataFrame with an additional ``match_score`` column
        indicating the similarity score for each matched pair.
    """
    from rapidfuzz import fuzz, process

    right_values = df2[right_col].astype(str).tolist()
    matches: list[dict[str, Any]] = []

    for idx, left_val in df1[left_col].items():
        result = process.extractOne(
            str(left_val),
            right_values,
            scorer=fuzz.WRatio,
            score_cutoff=threshold,
        )
        if result is not None:
            match_str, score, match_idx = result
            matches.append({
                "left_idx": idx,
                "right_idx": df2.index[match_idx],
                "match_score": score,
            })
        else:
            matches.append({
                "left_idx": idx,
                "right_idx": None,
                "match_score": 0.0,
            })

    match_df = pd.DataFrame(matches)

    # Build result: left rows joined with matching right rows
    result = df1.copy()
    result["match_score"] = match_df["match_score"].values

    right_indices = match_df["right_idx"].values
    for col in df2.columns:
        if col == right_col:
            col_name = f"{right_col}_matched"
        else:
            col_name = col if col not in result.columns else f"{col}_right"
        values = []
        for ri in right_indices:
            if ri is not None:
                values.append(df2.loc[ri, col])
            else:
                values.append(None)
        result[col_name] = values

    return result




[docs]
@requires_extra("cleaning")
def parse_dates_flexible(series: pd.Series) -> pd.Series:
    """Parse mixed-format date strings using dateparser.

    Handles a wide variety of date formats and natural language dates
    (e.g. ``'yesterday'``, ``'3 days ago'``).

    Parameters
    ----------
    series : pd.Series
        Series of date strings in potentially mixed formats.

    Returns
    -------
    pd.Series
        Series of ``datetime`` objects. Values that cannot be parsed
        are set to ``NaT``.
    """
    import dateparser

    def _parse(val: Any) -> Any:
        if pd.isna(val):
            return pd.NaT
        parsed = dateparser.parse(str(val))
        return parsed if parsed is not None else pd.NaT

    return series.apply(_parse)




[docs]
@requires_extra("cleaning")
def parse_prices(series: pd.Series) -> pd.DataFrame:
    """Parse price strings into numeric amounts and currencies.

    Uses the ``price-parser`` library to extract amounts and currency
    codes from strings like ``'$1,234.56'`` or ``'EUR 99.99'``.

    Parameters
    ----------
    series : pd.Series
        Series of price strings.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:

        * **amount** -- extracted numeric price (float, NaN if unparseable).
        * **currency** -- extracted currency code (str or None).
    """
    from price_parser import Price

    amounts: list[float | None] = []
    currencies: list[str | None] = []

    for val in series:
        if pd.isna(val):
            amounts.append(None)
            currencies.append(None)
            continue
        price = Price.fromstring(str(val))
        amounts.append(float(price.amount) if price.amount is not None else None)
        currencies.append(price.currency)

    return pd.DataFrame(
        {"amount": amounts, "currency": currencies},
        index=series.index,
    )




[docs]
@requires_extra("cleaning")
def normalize_countries(series: pd.Series) -> pd.DataFrame:
    """Standardise country names and codes using country-converter.

    Parameters
    ----------
    series : pd.Series
        Series of country names, ISO codes, or other country
        identifiers in various formats.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:

        * **name_short** -- standardised short country name.
        * **iso3** -- ISO 3166-1 alpha-3 code.
        * **iso2** -- ISO 3166-1 alpha-2 code.
    """
    import country_converter as coco

    cc = coco.CountryConverter()
    values = series.astype(str).tolist()

    return pd.DataFrame(
        {
            "name_short": cc.convert(values, to="name_short"),
            "iso3": cc.convert(values, to="ISO3"),
            "iso2": cc.convert(values, to="ISO2"),
        },
        index=series.index,
    )




[docs]
@requires_extra("cleaning")
def fix_text(series: pd.Series) -> pd.Series:
    """Fix text encoding issues using ftfy and unidecode.

    Repairs mojibake, normalises Unicode, and transliterates non-ASCII
    characters to their closest ASCII equivalents.

    Parameters
    ----------
    series : pd.Series
        Series of strings that may contain encoding artefacts.

    Returns
    -------
    pd.Series
        Series with fixed text encoding. NaN values are preserved.
    """
    import ftfy
    from unidecode import unidecode

    def _fix(val: Any) -> Any:
        if pd.isna(val):
            return val
        text = str(val)
        text = ftfy.fix_text(text)
        text = unidecode(text)
        return text

    return series.apply(_fix)