"""Scatter plots of total orders against average daily impressions per client.

Reads the ``communications`` table from the project's SQLite dataset,
aggregates it per client and saves three PNG scatter plots (all clients,
outliers removed, outliers removed plus a LOWESS trend line) into ``OUT_DIR``.
"""

import sqlite3
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from statsmodels.nonparametric.smoothers_lowess import lowess

sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

project_root = Path(__file__).resolve().parent.parent
# eda_utils is imported from a sibling folder, so that folder must be on
# sys.path before the import below.
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda  # noqa: E402

DB_PATH = project_root / "dataset" / "ds.sqlite"
OUT_DIR = project_root / "main_hypot"

X_COL = "avg_imp_per_day"
Y_COL = "orders_amt_total"
X_MAX = 18  # trim the long tail of impressions so the clouds are easier to read
SCATTER_COLOR = "#2c7bb6"


def load_client_level(db_path: Path) -> pd.DataFrame:
    """Collect per-client aggregates without averaging over x.

    Loads every row of the ``communications`` table, sums the column groups
    listed in ``eda_utils`` into per-row totals, then aggregates by ``id``.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A DataFrame with columns ``id``, ``X_COL`` (total impressions divided
        by the number of distinct ``business_dt`` values) and ``Y_COL`` (sum
        of order totals), with rows containing NaN dropped and restricted to
        clients whose ``X_COL`` is at most ``X_MAX``.
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(
            "select * from communications", conn, parse_dates=["business_dt"]
        )
    finally:
        # Close the connection even when the query fails; previously a failed
        # read left the handle open.
        conn.close()

    # Row-wise totals for each column group. The click totals are computed
    # here but are not used by the aggregation below.
    for cols, name in [
        (eda.ACTIVE_IMP_COLS, "active_imp_total"),
        (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
        (eda.ACTIVE_CLICK_COLS, "active_click_total"),
        (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
        (eda.ORDER_COLS, "orders_amt_total"),
    ]:
        df[name] = df[cols].sum(axis=1)

    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    client = (
        df.groupby("id")
        .agg(
            imp_total=("imp_total", "sum"),
            orders_amt_total=("orders_amt_total", "sum"),
            contact_days=("business_dt", "nunique"),
        )
        .reset_index()
    )

    # NOTE(review): safe_divide is defined in eda_utils; presumably it yields
    # NaN for a zero denominator, which the dropna() below would then remove.
    # Confirm against the helper.
    client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
    client[Y_COL] = client["orders_amt_total"]
    client = client[["id", X_COL, Y_COL]].dropna()

    in_range = client[client[X_COL] <= X_MAX].copy()
    print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
    return in_range


def remove_outliers(
    df: pd.DataFrame,
    iqr_k: float = 1.5,
    lower_q: float = 0.05,
    upper_q: float = 0.95,
) -> pd.DataFrame:
    """Drop outliers using quantile-range fences, separately for x and y.

    For each of ``X_COL`` and ``Y_COL`` the fences are
    ``q_low - iqr_k * (q_high - q_low)`` and ``q_high + iqr_k * (q_high - q_low)``,
    where ``q_low`` / ``q_high`` are the ``lower_q`` / ``upper_q`` quantiles.
    The lower fence is clamped at 0. A row is kept only when both its x and y
    values lie within their fences (inclusive).

    Note that with the default 0.05 / 0.95 quantiles this is a 5-95%
    inter-quantile range, which is wider than the classic 25-75% IQR; pass
    ``lower_q=0.25, upper_q=0.75`` for the textbook rule.

    Args:
        df: Frame containing ``X_COL`` and ``Y_COL``.
        iqr_k: Multiplier applied to the quantile range to build the fences.
        lower_q: Lower quantile, in ``[0, 1)``.
        upper_q: Upper quantile, in ``(0, 1]``, greater than ``lower_q``.

    Returns:
        A filtered copy of ``df``.

    Raises:
        ValueError: If the quantiles are not ordered within ``[0, 1]``.
    """
    if not 0 <= lower_q < upper_q <= 1:
        raise ValueError(
            f"Expected 0 <= lower_q < upper_q <= 1, got {lower_q} and {upper_q}"
        )

    def bounds(series: pd.Series) -> tuple[float, float]:
        q_low, q_high = series.quantile([lower_q, upper_q])
        spread = q_high - q_low
        return q_low - iqr_k * spread, q_high + iqr_k * spread

    x_low, x_high = bounds(df[X_COL])
    y_low, y_high = bounds(df[Y_COL])

    # Lower fences are clamped at zero before filtering.
    keep = df[X_COL].between(max(0, x_low), x_high) & df[Y_COL].between(
        max(0, y_low), y_high
    )
    filtered = df[keep].copy()
    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
    return filtered


def plot_density_scatter(
    df: pd.DataFrame,
    title: str,
    out_name: str,
    with_trend: bool = False,
    alpha: float = 0.08,
) -> None:
    """Draw a translucent scatter of ``Y_COL`` against ``X_COL`` and save it.

    Args:
        df: Frame containing ``X_COL`` and ``Y_COL``.
        title: Plot title.
        out_name: File name of the image, created inside ``OUT_DIR``.
        with_trend: When true, overlay a LOWESS trend line (``frac=0.3``) and
            a legend. Skipped when ``df`` is empty.
        alpha: Marker opacity; low values make dense regions visible.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.scatterplot(
            data=df,
            x=X_COL,
            y=Y_COL,
            color=SCATTER_COLOR,
            s=20,
            alpha=alpha,
            linewidth=0,
            ax=ax,
        )

        # There is nothing to smooth on an empty frame, so the trend and its
        # legend are only drawn when data is present.
        if with_trend and not df.empty:
            trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
            ax.plot(
                trend[:, 0],
                trend[:, 1],
                color="red",
                linewidth=2.5,
                label="LOWESS тренд",
            )
            ax.legend()

        ax.set_xlim(0, X_MAX)
        ax.set_ylim(bottom=0)
        ax.set_xlabel("Среднее число показов в день")
        ax.set_ylabel("Число заказов за период (сумма)")
        ax.set_title(title)
        ax.grid(alpha=0.3)

        OUT_DIR.mkdir(parents=True, exist_ok=True)
        out_path = OUT_DIR / out_name
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)
    print(f"Saved {out_path}")


def main() -> None:
    """Load client data and write the three scatter plots."""
    client = load_client_level(DB_PATH)
    plot_density_scatter(
        client,
        title="Облако: заказы vs средние показы в день (все клиенты)",
        out_name="orders_vs_avg_imp_scatter.png",
    )

    cleaned = remove_outliers(client)
    plot_density_scatter(
        cleaned,
        title="Облако без выбросов (IQR) заказы vs средние показы в день",
        out_name="orders_vs_avg_imp_scatter_clean.png",
    )
    plot_density_scatter(
        cleaned,
        title="Облако без выбросов + тренд",
        out_name="orders_vs_avg_imp_scatter_trend.png",
        with_trend=True,
        alpha=0.1,
    )


if __name__ == "__main__":
    main()