diff --git a/spam_hypot/best_bins.png b/spam_hypot/best_bins.png index 6090301..73870e1 100644 Binary files a/spam_hypot/best_bins.png and b/spam_hypot/best_bins.png differ diff --git a/spam_hypot/best_model_and_plots.py b/spam_hypot/best_model_and_plots.py index efb5791..2d43014 100644 --- a/spam_hypot/best_model_and_plots.py +++ b/spam_hypot/best_model_and_plots.py @@ -50,16 +50,20 @@ client = ( .merge(contact_days, on="id", how="left") .reset_index() ) -client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"]) -client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"]) +client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"]) # orders / impressions +client["order_rate_pct"] = 100 * client["order_rate"] # чтобы шкала была человеческая client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"]) -client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int) + +# таргет: высокий orders/impressions +client["high_or"] = (client["order_rate"] >= client["order_rate"].quantile(0.75)).astype(int) + X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]] X = X.copy() X["gender_cd"] = eda.normalize_gender(X["gender_cd"]) X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"]) -y = client["high_ctr"] +y = client["high_or"] + num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"] cat_cols = ["gender_cd", "device_platform_cd"] @@ -86,29 +90,74 @@ grid["age"] = base["age"] grid["gender_cd"] = base_gender grid["device_platform_cd"] = base_device proba_grid = model.predict_proba(grid)[:, 1] -plt.figure(figsize=(10, 4)) -plt.plot(grid["avg_imp_per_day"], proba_grid, marker="o") -plt.xlabel("avg_imp_per_day") -plt.ylabel("P(high CTR)") -plt.title("Предсказанная вероятность высокого CTR vs плотность показов") -plt.tight_layout() -plt.savefig(project_root / "spam_hypot" / "best_model_prob.png", dpi=150) -print("Saved best_model_prob.png") -# Dual axis CTR/CR vs fine bins -bins = pd.qcut(client["avg_imp_per_day"], 15, duplicates="drop") -stats_bin = client.groupby(bins, observed=False)[["ctr_all", "cr_click2order"]].median().reset_index().rename(columns={"index": "bin"}) -stats_bin["avg_imp_per_day"] = client.groupby(bins, observed=False)["avg_imp_per_day"].median().values -stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str) -fig, ax1 = plt.subplots(figsize=(12, 5)) -ax2 = ax1.twinx() -ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR") -ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR") -ax1.set_ylabel("CTR") -ax2.set_ylabel("CR click→order") -ax1.set_xlabel("avg_imp_per_day bins") -plt.xticks(rotation=35) -ax1.set_title("CTR и CR по 15 бинам avg_imp_per_day") -fig.tight_layout() -plt.savefig(project_root / "spam_hypot" / "best_bins.png", dpi=150) -print("Saved best_bins.png") +# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending +stats_imp = ( + client.groupby("avg_imp_per_day", as_index=False) + .agg( + orders_mean=("orders_amt_total", "mean"), + n_clients=("id", "count"), + ) + .sort_values("avg_imp_per_day") +) + +K_MULT = 2 # "в разы" -> 5x. Поменяй на 3/10 если хочешь +ABS_DY_MIN = 1 +X_MAX = 16 + +stats_imp = stats_imp.sort_values("avg_imp_per_day").reset_index(drop=True) + +# 1) cut by x +stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True) + +# 2) detect vertical outliers by dy logic +before = len(stats_f) +y = stats_f["orders_mean"] +abs_dy = y.diff().abs() + +prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean() +ratio = abs_dy / (prev3_mean.replace(0, np.nan)) # avoid inf when prev mean == 0 + +is_outlier = (abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT) | (y > 5) +# первые точки не могут нормально иметь "3 предыдущих дельты" +is_outlier = is_outlier.fillna(False) + +stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True) +after = len(stats_f) +cleaned = before - after + +print(f"{before} - {after}, cleaned: {cleaned}") + +# --- smoothing (rolling mean on remaining points) --- +w = max(7, int(len(stats_f) * 0.05)) +if w % 2 == 0: + w += 1 + +stats_f["orders_smooth"] = ( + stats_f["orders_mean"] + .rolling(window=w, center=True, min_periods=1) + .mean() +) + +plt.figure(figsize=(10, 10)) + +# raw (filtered) curve +plt.plot( + stats_f["avg_imp_per_day"], stats_f["orders_mean"], + marker="o", linewidth=1, alpha=0.35, label="Orders (mean)" +) + +# smoothed trend (RED) +plt.plot( + stats_f["avg_imp_per_day"], stats_f["orders_smooth"], + color="red", linewidth=2.5, label=f"Smoothed trend" +) + +plt.xlabel("avg_imp_per_day") +plt.ylabel("Orders (mean)") +plt.title(f"Orders + smoothed trend vs avg_imp_per_day") +plt.legend() +plt.tight_layout() + +plt.savefig(project_root / "spam_hypot" / "orders_vs_avg_imp_per_day_filtered_smoothed.png", dpi=150) +print("Saved orders_vs_avg_imp_per_day_filtered_smoothed.png") diff --git a/spam_hypot/best_model_prob.png b/spam_hypot/best_model_prob.png index fb205ae..0f033f5 100644 Binary files a/spam_hypot/best_model_prob.png and b/spam_hypot/best_model_prob.png differ diff --git a/spam_hypot/orders_vs_avg_imp_per_day.png b/spam_hypot/orders_vs_avg_imp_per_day.png new file mode 100644 index 0000000..add60b4 Binary files /dev/null and b/spam_hypot/orders_vs_avg_imp_per_day.png differ diff --git a/spam_hypot/orders_vs_avg_imp_per_day_filtered_smoothed.png b/spam_hypot/orders_vs_avg_imp_per_day_filtered_smoothed.png new file mode 100644 index 0000000..890539e Binary files /dev/null and b/spam_hypot/orders_vs_avg_imp_per_day_filtered_smoothed.png differ diff --git a/spam_hypot/orders_vs_avg_imp_per_day_smoothed.png b/spam_hypot/orders_vs_avg_imp_per_day_smoothed.png new file mode 100644 index 0000000..e84166f Binary files /dev/null and b/spam_hypot/orders_vs_avg_imp_per_day_smoothed.png differ diff --git a/spam_hypot/orders_vs_avg_imp_per_day_smoothed_clean.png b/spam_hypot/orders_vs_avg_imp_per_day_smoothed_clean.png new file mode 100644 index 0000000..efc3fdf Binary files /dev/null and b/spam_hypot/orders_vs_avg_imp_per_day_smoothed_clean.png differ