"""Scatter plots of per-client orders versus average daily impressions.

Reads the ``communications`` table from the project's SQLite dataset,
aggregates it to one row per client and writes three PNG files next to this
script:

* the raw point cloud,
* the cloud after quantile-fence outlier removal,
* the cleaned cloud with a LOWESS trend line on top.
"""
import sqlite3
from contextlib import closing
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from statsmodels.nonparametric.smoothers_lowess import lowess

sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

project_root = Path(__file__).resolve().parent.parent
# eda_utils lives in a sibling directory that is not a package, so it has to
# be put on sys.path before it can be imported.
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda  # noqa: E402

DB_PATH = project_root / "dataset" / "ds.sqlite"
OUT_DIR = project_root / "main_hypot"
X_COL = "avg_imp_per_day"
Y_COL = "orders_amt_total"
X_MAX = 18  # trim the long tail of impressions so the clouds are easier to read
SCATTER_COLOR = "#2c7bb6"


def load_client_level(db_path: Path) -> pd.DataFrame:
    """Build per-client aggregates (no averaging over x).

    Args:
        db_path: Path to the SQLite file containing a ``communications`` table.

    Returns:
        A frame with the columns ``id``, ``X_COL`` and ``Y_COL``, one row per
        client, restricted to clients whose ``X_COL`` value is at most
        ``X_MAX``. Rows where either value is missing are dropped.
    """
    # sqlite3 connections are not closed by their own context manager, so
    # closing() is used to release the handle even if the query raises.
    with closing(sqlite3.connect(db_path)) as conn:
        df = pd.read_sql_query(
            "select * from communications", conn, parse_dates=["business_dt"]
        )

    # Row-level totals. Only impressions and orders are needed downstream, so
    # the click totals are no longer computed.
    for cols, name in [
        (eda.ACTIVE_IMP_COLS, "active_imp_total"),
        (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
        (eda.ORDER_COLS, "orders_amt_total"),
    ]:
        df[name] = df[cols].sum(axis=1)

    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    client = (
        df.groupby("id")
        .agg(
            imp_total=("imp_total", "sum"),
            orders_amt_total=("orders_amt_total", "sum"),
            # number of distinct dates on which the client has a row
            contact_days=("business_dt", "nunique"),
        )
        .reset_index()
    )

    # NOTE(review): eda.safe_divide is defined outside this file; presumably
    # it yields NaN for a zero denominator, which dropna() below then removes.
    client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
    client[Y_COL] = client["orders_amt_total"]
    client = client[["id", X_COL, Y_COL]].dropna()

    in_range = client[client[X_COL] <= X_MAX].copy()
    print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
    return in_range


def remove_outliers(
    df: pd.DataFrame,
    iqr_k: float = 1.5,
    lower_q: float = 0.05,
    upper_q: float = 0.95,
) -> pd.DataFrame:
    """Drop outliers using quantile fences, independently for x and y.

    For each of ``X_COL`` and ``Y_COL`` the fences are
    ``[q_low - iqr_k * spread, q_high + iqr_k * spread]`` where
    ``spread = q_high - q_low``. With the default ``lower_q``/``upper_q`` the
    spread is the 5th-95th percentile range, which is wider than the classic
    interquartile range (0.25/0.75) despite the ``iqr_k`` parameter name and
    the "IQR" wording in the log line. The lower fence is clamped at 0.

    Args:
        df: Frame containing ``X_COL`` and ``Y_COL``.
        iqr_k: Multiplier applied to the quantile spread.
        lower_q: Lower quantile used to build the fence.
        upper_q: Upper quantile used to build the fence.

    Returns:
        A copy of the rows lying inside both fences.

    Raises:
        ValueError: If the quantiles do not satisfy
            ``0 <= lower_q < upper_q <= 1``.
    """
    if not 0 <= lower_q < upper_q <= 1:
        raise ValueError(
            f"Expected 0 <= lower_q < upper_q <= 1, got {lower_q} and {upper_q}"
        )

    def bounds(series: pd.Series) -> tuple[float, float]:
        q_low, q_high = series.quantile([lower_q, upper_q])
        spread = q_high - q_low
        return q_low - iqr_k * spread, q_high + iqr_k * spread

    x_low, x_high = bounds(df[X_COL])
    y_low, y_high = bounds(df[Y_COL])
    filtered = df[
        df[X_COL].between(max(0, x_low), x_high)
        & df[Y_COL].between(max(0, y_low), y_high)
    ].copy()
    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
    return filtered


def plot_density_scatter(
    df: pd.DataFrame,
    title: str,
    out_name: str,
    with_trend: bool = False,
    alpha: float = 0.08,
) -> None:
    """Draw a translucent scatter of ``Y_COL`` against ``X_COL`` and save it.

    Args:
        df: Frame containing ``X_COL`` and ``Y_COL``.
        title: Title placed above the axes.
        out_name: File name of the image, created inside ``OUT_DIR``.
        with_trend: When true, overlay a LOWESS curve and show a legend.
        alpha: Marker opacity; low values make dense regions appear darker.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.scatterplot(
            data=df,
            x=X_COL,
            y=Y_COL,
            color=SCATTER_COLOR,
            s=20,
            alpha=alpha,
            linewidth=0,
            ax=ax,
        )

        # There is nothing to smooth on an empty frame, so the trend is skipped.
        if with_trend and not df.empty:
            trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
            ax.plot(trend[:, 0], trend[:, 1], color="red", linewidth=2.5, label="LOWESS тренд")
            ax.legend()

        ax.set_xlim(0, X_MAX)
        ax.set_ylim(bottom=0)
        ax.set_xlabel("Среднее число показов в день")
        ax.set_ylabel("Число заказов за период (сумма)")
        ax.set_title(title)
        ax.grid(alpha=0.3)

        OUT_DIR.mkdir(parents=True, exist_ok=True)
        out_path = OUT_DIR / out_name
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
        print(f"Saved {out_path}")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)


def main() -> None:
    """Load the client-level data and write the three scatter plots."""
    client = load_client_level(DB_PATH)

    plot_density_scatter(
        client,
        title="Облако: заказы vs средние показы в день (все клиенты)",
        out_name="orders_vs_avg_imp_scatter.png",
    )

    cleaned = remove_outliers(client)
    plot_density_scatter(
        cleaned,
        title="Облако без выбросов (IQR) заказы vs средние показы в день",
        out_name="orders_vs_avg_imp_scatter_clean.png",
    )

    plot_density_scatter(
        cleaned,
        title="Облако без выбросов + тренд",
        out_name="orders_vs_avg_imp_scatter_trend.png",
        with_trend=True,
        alpha=0.1,
    )


if __name__ == "__main__":
    main()