"""Client-level scatter plots of order amounts versus average impressions per day.

The script reads the ``communications`` table from a SQLite database,
aggregates it per client and renders three density-shaded scatter plots
(all points, outliers removed, outliers removed plus a trend line).
"""

import sqlite3
import sys
from pathlib import Path
from typing import Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.signal import savgol_filter
from statsmodels.nonparametric.smoothers_lowess import lowess

sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 8)

project_root = Path(__file__).resolve().parent.parent
DB_PATH = project_root / "dataset" / "ds.sqlite"
BASE_OUT_DIR = project_root / "main_hypot"

# Data constants
CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]

# Visualisation / cleaning constants
X_COL = "avg_imp_per_day"  # x is always fixed
DEFAULT_X_MAX = 18
DEFAULT_SCATTER_COLOR = "#2c7bb6"
DEFAULT_POINT_SIZE = 20
DEFAULT_ALPHA = 0.08
DEFAULT_TREND_ALPHA = 0.1
DEFAULT_TREND_FRAC = 0.3
DEFAULT_TREND_COLOR = "red"
DEFAULT_TREND_LINEWIDTH = 2.5
DEFAULT_IQR_K = 1.5
DEFAULT_Q_LOW = 0.05
DEFAULT_Q_HIGH = 0.95
DEFAULT_ALPHA_MIN = 0.04
DEFAULT_ALPHA_MAX = 0.7
DEFAULT_BINS_X = 60
DEFAULT_BINS_Y = 60
DEFAULT_Y_MIN = -0.5
DEFAULT_Y_MAX = 10
DEFAULT_TREND_METHOD = "savgol"  # options: lowess, rolling, savgol
DEFAULT_ROLLING_WINDOW = 200
DEFAULT_SAVGOL_WINDOW = 501
DEFAULT_SAVGOL_POLY = 2

# (x, y) arrays of a fitted trend line.
TrendData = Tuple[np.ndarray, np.ndarray]


def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide two Series element-wise, yielding NaN where the denominator is 0.

    Args:
        numerator: Values to divide.
        denominator: Values to divide by; zeros are treated as missing.

    Returns:
        The quotient, with NaN at every position whose denominator was 0.
    """
    # Mask zeros with NaN rather than pd.NA: pd.NA would turn an integer
    # denominator into an object-dtype Series, and the object-dtype quotient
    # is not what the numeric steps downstream (quantiles, binning) expect.
    denom = denominator.where(denominator != 0)
    return numerator / denom


def load_client_level(db_path: Path) -> pd.DataFrame:
    """Build per-client aggregates without depending on eda_utils.

    Reads every row of the ``communications`` table, sums the active and
    passive impression columns and the order-amount columns per row, then
    groups by ``id``.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        One row per ``id`` with ``imp_total``, ``orders_amt_total``,
        ``contact_days`` (number of distinct ``business_dt`` values) and
        ``X_COL`` (``imp_total`` divided by ``contact_days``).
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(
            "select * from communications",
            conn,
            parse_dates=["business_dt"],
        )
    finally:
        # Release the connection even when the query itself fails.
        conn.close()

    df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
    df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)

    client = (
        df.groupby("id")
        .agg(
            imp_total=("imp_total", "sum"),
            orders_amt_total=("orders_amt_total", "sum"),
            contact_days=("business_dt", "nunique"),
        )
        .reset_index()
    )
    client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
    print(f"Loaded {len(client)} clients with {X_COL} computed.")
    return client


def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
    """Return (lower, upper) fences around the ``q_low``..``q_high`` quantile range.

    The spread is the distance between the two requested quantiles (a true
    IQR only when they are 0.25 and 0.75); each fence lies ``iqr_k`` spreads
    beyond its quantile.
    """
    q1, q3 = series.quantile([q_low, q_high])
    iqr = q3 - q1
    return q1 - iqr_k * iqr, q3 + iqr_k * iqr


def remove_outliers(
    df: pd.DataFrame,
    y_col: str,
    x_col: str = X_COL,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
) -> pd.DataFrame:
    """Drop outliers using IQR-style fences computed separately for x and y.

    Args:
        df: Source points.
        y_col: Name of the y column.
        x_col: Name of the x column.
        iqr_k: Fence multiplier passed to ``_bounds``.
        q_low: Lower quantile of the reference range.
        q_high: Upper quantile of the reference range.

    Returns:
        A copy of the rows whose x and y both fall inside their fences.
        Lower fences are clamped at 0, so negative values are always removed.
    """
    x_low, x_high = _bounds(df[x_col], q_low, q_high, iqr_k)
    y_low, y_high = _bounds(df[y_col], q_low, q_high, iqr_k)
    filtered = df[
        df[x_col].between(max(0, x_low), x_high)
        & df[y_col].between(max(0, y_low), y_high)
    ].copy()
    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
    return filtered


def compute_density_alpha(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    x_max: float,
    *,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    y_min: float = DEFAULT_Y_MIN,
    y_max_limit: float = DEFAULT_Y_MAX,
) -> np.ndarray:
    """Compute a per-point opacity from the point density in a 2D grid.

    Points are binned on a ``bins_x`` x ``bins_y`` grid. Each point's weight
    is ``(count in its bin / largest looked-up count) ** sqrt(1.5)`` and the
    opacity is interpolated between ``alpha_min`` and ``alpha_max``.

    Args:
        df: Points to shade.
        x_col: Name of the x column.
        y_col: Name of the y column.
        x_max: Right edge of the x grid.
        bins_x: Number of bins along x.
        bins_y: Number of bins along y.
        alpha_min: Opacity assigned to weight 0.
        alpha_max: Opacity assigned to weight 1.
        y_min: Lower edge of the y grid.
        y_max_limit: Cap on the upper edge of the y grid.

    Returns:
        One opacity per row of ``df``; an empty array when ``df`` is empty.
    """
    x_vals = df[x_col].to_numpy()
    y_vals = df[y_col].to_numpy()
    if len(x_vals) == 0:
        return np.array([])

    x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
    # Keep the upper y edge strictly positive even when every y is <= 0.
    y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
    y_edges = np.linspace(y_min, y_upper, bins_y + 1)

    x_bins = np.digitize(x_vals, x_edges) - 1
    y_bins = np.digitize(y_vals, y_edges) - 1

    # Only points that fall inside the grid contribute to the counts.
    valid = (
        (x_bins >= 0)
        & (x_bins < bins_x)
        & (y_bins >= 0)
        & (y_bins < bins_y)
    )
    counts = np.zeros((bins_x, bins_y), dtype=int)
    # Unbuffered add so that repeated (x, y) bin pairs accumulate correctly.
    np.add.at(counts, (x_bins[valid], y_bins[valid]), 1)

    # Points outside the grid borrow the count of the nearest edge bin.
    bin_counts = counts[
        np.clip(x_bins, 0, bins_x - 1),
        np.clip(y_bins, 0, bins_y - 1),
    ]
    max_count = bin_counts.max() if len(bin_counts) else 1
    if max_count == 0:
        weight = np.zeros_like(bin_counts, dtype=float)
    else:
        weight = (bin_counts / max_count) ** np.sqrt(1.5)
    weight = np.clip(weight, 0, 1)
    return alpha_min + (alpha_max - alpha_min) * weight


def compute_trend(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = X_COL,
    method: str = DEFAULT_TREND_METHOD,
    lowess_frac: float = DEFAULT_TREND_FRAC,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> Tuple[np.ndarray, np.ndarray]:
    """Return ``(x_sorted, trend_y)`` for the selected smoothing method.

    Rows with a missing x or y are dropped and the rest sorted by x first.

    Args:
        df: Points to smooth.
        y_col: Name of the y column.
        x_col: Name of the x column.
        method: ``"lowess"``, ``"rolling"`` or ``"savgol"`` (case-insensitive);
            any other value falls back to LOWESS.
        lowess_frac: ``frac`` passed to ``lowess``.
        rolling_window: Window of the centred rolling mean (at least 3, made odd).
        savgol_window: Savitzky-Golay window (at least 5, made odd, and shrunk
            to fit when there are fewer points than the window).
        savgol_poly: Savitzky-Golay polynomial order, capped at ``window - 1``.

    Returns:
        Two arrays of equal length; both empty when no complete rows exist.
    """
    d = df[[x_col, y_col]].dropna().sort_values(x_col)
    x_vals = d[x_col].to_numpy()
    y_vals = d[y_col].to_numpy()
    if len(x_vals) == 0:
        return np.array([]), np.array([])

    m = method.lower()
    if m == "rolling":
        w = max(3, rolling_window)
        if w % 2 == 0:
            w += 1
        y_trend = (
            pd.Series(y_vals)
            .rolling(window=w, center=True, min_periods=1)
            .mean()
            .to_numpy()
        )
        return x_vals, y_trend

    if m == "savgol":
        n_points = len(y_vals)
        w = max(5, savgol_window)
        if w % 2 == 0:
            w += 1
        if w > n_points:
            # mode="interp" rejects a window longer than the data, so use the
            # largest odd window that still fits.
            w = n_points if n_points % 2 == 1 else n_points - 1
        if w < 3:
            # Too few points to fit anything; the data is its own trend.
            return x_vals, y_vals.astype(float)
        poly = min(savgol_poly, w - 1)
        y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
        return x_vals, y_trend

    # "lowess" and, as a fallback, any unrecognised method name.
    trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
    return trend[:, 0], trend[:, 1]


def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
    """Return a copy of the rows with ``x_col <= x_max`` and print both counts."""
    subset = df[df[x_col] <= x_max].copy()
    print(f"{len(df)} points; {len(subset)} within x<={x_max}.")
    return subset


def plot_density_scatter(
    df: pd.DataFrame,
    y_col: str,
    title: str,
    out_path: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    with_trend: bool = False,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
    return_fig: bool = False,
) -> Optional[Tuple[plt.Figure, plt.Axes, Optional[TrendData]]]:
    """Draw a density-shaded scatter plot and save it to ``out_path``.

    Args:
        df: Points to draw.
        y_col: Name of the y column; also used as the y-axis label.
        title: Plot title.
        out_path: Destination image file; parent directories are created.
        x_col: Name of the x column.
        x_max: Right limit of the x axis and of the density grid.
        scatter_color: Marker colour.
        point_size: Marker size.
        alpha: Uniform opacity used only when no per-point opacity is
            available (empty ``df``).
        alpha_min: See ``compute_density_alpha``.
        alpha_max: See ``compute_density_alpha``.
        bins_x: See ``compute_density_alpha``.
        bins_y: See ``compute_density_alpha``.
        y_min: Lower limit of the y axis and of the density grid.
        y_max: Upper limit of the y axis and cap of the density grid.
        with_trend: Overlay a trend line from ``compute_trend``.
        trend_method: See ``compute_trend``.
        trend_frac: ``lowess_frac`` for ``compute_trend``.
        trend_color: Trend line colour.
        trend_linewidth: Trend line width.
        rolling_window: See ``compute_trend``.
        savgol_window: See ``compute_trend``.
        savgol_poly: See ``compute_trend``.
        return_fig: Keep the figure open and return it instead of closing it.

    Returns:
        ``(fig, ax, trend_data)`` when ``return_fig`` is true, where
        ``trend_data`` is ``(x, y)`` of the drawn trend or ``None``;
        otherwise ``None``.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    alpha_values = compute_density_alpha(
        df,
        x_col=x_col,
        y_col=y_col,
        x_max=x_max,
        bins_x=bins_x,
        bins_y=bins_y,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        y_min=y_min,
        y_max_limit=y_max,
    )
    ax.scatter(
        df[x_col],
        df[y_col],
        color=scatter_color,
        s=point_size,
        alpha=alpha_values if len(alpha_values) else alpha,
        linewidths=0,
    )

    trend_data = None
    if with_trend:
        tx, ty = compute_trend(
            df,
            y_col=y_col,
            x_col=x_col,
            method=trend_method,
            lowess_frac=trend_frac,
            rolling_window=rolling_window,
            savgol_window=savgol_window,
            savgol_poly=savgol_poly,
        )
        if len(tx):
            ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
            ax.legend()
            trend_data = (tx, ty)

    ax.set_xlim(0, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_yticks(range(0, int(y_max) + 1, 2))
    ax.set_xlabel("Среднее число показов в день")
    ax.set_ylabel(y_col)
    ax.set_title(title)
    ax.grid(alpha=0.3)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    # The file is written on both paths, so report it on both.
    print(f"Saved {out_path}")

    if return_fig:
        # The caller takes ownership of the still-open figure.
        return fig, ax, trend_data
    plt.close(fig)
    return None


def plot_raw_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Save ``scatter.png``: every complete point with ``x_col <= x_max``.

    Rows with a missing x or y are dropped first. The trend parameters are
    forwarded to ``plot_density_scatter``, but no trend line is drawn because
    ``with_trend`` is left at its default of ``False``.
    """
    in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
    plot_density_scatter(
        in_range,
        y_col=y_col,
        title=f"Облако: {y_col} vs {x_col} (все клиенты)",
        out_path=out_dir / "scatter.png",
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )


def plot_clean_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Save ``scatter_clean.png``: in-range points after outlier removal.

    Rows with a missing x or y are dropped, the rest limited to
    ``x_col <= x_max`` and passed through ``remove_outliers``. The trend
    parameters are forwarded, but no trend line is drawn because
    ``with_trend`` is left at its default of ``False``.
    """
    in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
    cleaned = remove_outliers(
        in_range,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
        out_path=out_dir / "scatter_clean.png",
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )


def plot_clean_trend_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_TREND_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
    return_components: bool = False,
) -> Optional[Tuple[plt.Figure, plt.Axes, pd.DataFrame, Optional[TrendData]]]:
    """Save ``scatter_trend.png``: cleaned points with a trend line overlaid.

    Applies the same filtering as ``plot_clean_scatter`` and draws with
    ``with_trend=True``.

    Returns:
        ``(fig, ax, cleaned, trend_data)`` when ``return_components`` is true
        (the figure is left open); otherwise ``None``.
    """
    in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
    cleaned = remove_outliers(
        in_range,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    fig_ax = plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
        out_path=out_dir / "scatter_trend.png",
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        with_trend=True,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
        return_fig=return_components,
    )
    if return_components:
        fig, ax, trend_data = fig_ax
        return fig, ax, cleaned, trend_data
    return None


def generate_scatter_set(
    df: pd.DataFrame,
    y_col: str,
    *,
    base_out_dir: Path = BASE_OUT_DIR,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    trend_alpha: float = DEFAULT_TREND_ALPHA,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Generate three scatter plots (all, cleaned, cleaned + trend) for ``y_col``.

    The images are written to ``base_out_dir / <y_col>``, with any ``/`` in
    the column name replaced by ``_``. ``alpha`` is used for the first two
    plots and ``trend_alpha`` for the trend plot.
    """
    out_dir = base_out_dir / str(y_col).replace("/", "_")
    plot_raw_scatter(
        df,
        y_col=y_col,
        out_dir=out_dir,
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    plot_clean_scatter(
        df,
        y_col=y_col,
        out_dir=out_dir,
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    plot_clean_trend_scatter(
        df,
        y_col=y_col,
        out_dir=out_dir,
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=trend_alpha,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )


def main() -> None:
    """Load the client aggregates, print the zero-order share and plot."""
    client = load_client_level(DB_PATH)
    zero_orders = (client["orders_amt_total"] == 0).sum()
    non_zero = len(client) - zero_orders
    if len(client):
        print(f"orders=0: {zero_orders} ({zero_orders / len(client):.2%}); orders>0: {non_zero} ({non_zero / len(client):.2%})")
    generate_scatter_set(client, y_col="orders_amt_total")


if __name__ == "__main__":
    main()