"""Advanced data cleaning integrations using optional packages.
Provides wrappers around pyjanitor, rapidfuzz, dateparser,
price-parser, country-converter, ftfy, and unidecode for column
name cleaning, fuzzy merging, flexible date parsing, price parsing,
country normalisation, and text encoding fixes.
"""
from __future__ import annotations
from typing import Any
import pandas as pd
from wraquant.core.decorators import requires_extra
__all__ = [
"janitor_clean_names",
"janitor_remove_empty",
"fuzzy_merge",
"parse_dates_flexible",
"parse_prices",
"normalize_countries",
"fix_text",
]
[docs]
@requires_extra("cleaning")
def janitor_clean_names(df: pd.DataFrame) -> pd.DataFrame:
"""Clean DataFrame column names using pyjanitor.
Converts column names to lowercase snake_case, strips whitespace,
and replaces special characters with underscores.
Parameters
----------
df : pd.DataFrame
DataFrame with messy column names.
Returns
-------
pd.DataFrame
DataFrame with cleaned column names.
"""
import janitor # noqa: F401 — registers .clean_names accessor
return df.clean_names(remove_special=True)
[docs]
@requires_extra("cleaning")
def janitor_remove_empty(df: pd.DataFrame) -> pd.DataFrame:
"""Remove empty rows and columns using pyjanitor.
Drops rows and columns that are entirely NaN or empty.
Parameters
----------
df : pd.DataFrame
DataFrame possibly containing empty rows/columns.
Returns
-------
pd.DataFrame
DataFrame with empty rows and columns removed.
"""
import janitor # noqa: F401
return df.remove_empty()
[docs]
@requires_extra("cleaning")
def fuzzy_merge(
df1: pd.DataFrame,
df2: pd.DataFrame,
left_col: str,
right_col: str,
threshold: float = 80.0,
) -> pd.DataFrame:
"""Merge two DataFrames using fuzzy string matching via rapidfuzz.
For each value in *left_col* of *df1*, the best match above
*threshold* in *right_col* of *df2* is found. Matched rows are
joined; unmatched rows from *df1* are retained with NaN for *df2*
columns.
Parameters
----------
df1 : pd.DataFrame
Left DataFrame.
df2 : pd.DataFrame
Right DataFrame.
left_col : str
Column name in *df1* to match on.
right_col : str
Column name in *df2* to match on.
threshold : float, default 80.0
Minimum similarity score (0--100) to consider a match.
Returns
-------
pd.DataFrame
Merged DataFrame with an additional ``match_score`` column
indicating the similarity score for each matched pair.
"""
from rapidfuzz import fuzz, process
right_values = df2[right_col].astype(str).tolist()
matches: list[dict[str, Any]] = []
for idx, left_val in df1[left_col].items():
result = process.extractOne(
str(left_val),
right_values,
scorer=fuzz.WRatio,
score_cutoff=threshold,
)
if result is not None:
match_str, score, match_idx = result
matches.append({
"left_idx": idx,
"right_idx": df2.index[match_idx],
"match_score": score,
})
else:
matches.append({
"left_idx": idx,
"right_idx": None,
"match_score": 0.0,
})
match_df = pd.DataFrame(matches)
# Build result: left rows joined with matching right rows
result = df1.copy()
result["match_score"] = match_df["match_score"].values
right_indices = match_df["right_idx"].values
for col in df2.columns:
if col == right_col:
col_name = f"{right_col}_matched"
else:
col_name = col if col not in result.columns else f"{col}_right"
values = []
for ri in right_indices:
if ri is not None:
values.append(df2.loc[ri, col])
else:
values.append(None)
result[col_name] = values
return result
[docs]
@requires_extra("cleaning")
def parse_dates_flexible(series: pd.Series) -> pd.Series:
"""Parse mixed-format date strings using dateparser.
Handles a wide variety of date formats and natural language dates
(e.g. ``'yesterday'``, ``'3 days ago'``).
Parameters
----------
series : pd.Series
Series of date strings in potentially mixed formats.
Returns
-------
pd.Series
Series of ``datetime`` objects. Values that cannot be parsed
are set to ``NaT``.
"""
import dateparser
def _parse(val: Any) -> Any:
if pd.isna(val):
return pd.NaT
parsed = dateparser.parse(str(val))
return parsed if parsed is not None else pd.NaT
return series.apply(_parse)
[docs]
@requires_extra("cleaning")
def parse_prices(series: pd.Series) -> pd.DataFrame:
"""Parse price strings into numeric amounts and currencies.
Uses the ``price-parser`` library to extract amounts and currency
codes from strings like ``'$1,234.56'`` or ``'EUR 99.99'``.
Parameters
----------
series : pd.Series
Series of price strings.
Returns
-------
pd.DataFrame
DataFrame with columns:
* **amount** -- extracted numeric price (float, NaN if unparseable).
* **currency** -- extracted currency code (str or None).
"""
from price_parser import Price
amounts: list[float | None] = []
currencies: list[str | None] = []
for val in series:
if pd.isna(val):
amounts.append(None)
currencies.append(None)
continue
price = Price.fromstring(str(val))
amounts.append(float(price.amount) if price.amount is not None else None)
currencies.append(price.currency)
return pd.DataFrame(
{"amount": amounts, "currency": currencies},
index=series.index,
)
[docs]
@requires_extra("cleaning")
def normalize_countries(series: pd.Series) -> pd.DataFrame:
"""Standardise country names and codes using country-converter.
Parameters
----------
series : pd.Series
Series of country names, ISO codes, or other country
identifiers in various formats.
Returns
-------
pd.DataFrame
DataFrame with columns:
* **name_short** -- standardised short country name.
* **iso3** -- ISO 3166-1 alpha-3 code.
* **iso2** -- ISO 3166-1 alpha-2 code.
"""
import country_converter as coco
cc = coco.CountryConverter()
values = series.astype(str).tolist()
return pd.DataFrame(
{
"name_short": cc.convert(values, to="name_short"),
"iso3": cc.convert(values, to="ISO3"),
"iso2": cc.convert(values, to="ISO2"),
},
index=series.index,
)
[docs]
@requires_extra("cleaning")
def fix_text(series: pd.Series) -> pd.Series:
"""Fix text encoding issues using ftfy and unidecode.
Repairs mojibake, normalises Unicode, and transliterates non-ASCII
characters to their closest ASCII equivalents.
Parameters
----------
series : pd.Series
Series of strings that may contain encoding artefacts.
Returns
-------
pd.Series
Series with fixed text encoding. NaN values are preserved.
"""
import ftfy
from unidecode import unidecode
def _fix(val: Any) -> Any:
if pd.isna(val):
return val
text = str(val)
text = ftfy.fix_text(text)
text = unidecode(text)
return text
return series.apply(_fix)