some more for spam hypot
|
Before Width: | Height: | Size: 100 KiB After Width: | Height: | Size: 119 KiB |
@@ -50,16 +50,20 @@ client = (
|
|||||||
.merge(contact_days, on="id", how="left")
|
.merge(contact_days, on="id", how="left")
|
||||||
.reset_index()
|
.reset_index()
|
||||||
)
|
)
|
||||||
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
|
client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"]) # orders / impressions
|
||||||
client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"])
|
client["order_rate_pct"] = 100 * client["order_rate"]  # order rate as a percentage, so the scale is easier to read
|
||||||
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
|
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
|
||||||
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)
|
|
||||||
|
# target: high orders-per-impression rate (top quartile of order_rate)
|
||||||
|
client["high_or"] = (client["order_rate"] >= client["order_rate"].quantile(0.75)).astype(int)
|
||||||
|
|
||||||
|
|
||||||
X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
|
X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
|
||||||
X = X.copy()
|
X = X.copy()
|
||||||
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
|
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
|
||||||
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
|
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
|
||||||
y = client["high_ctr"]
|
y = client["high_or"]
|
||||||
|
|
||||||
|
|
||||||
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
|
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
|
||||||
cat_cols = ["gender_cd", "device_platform_cd"]
|
cat_cols = ["gender_cd", "device_platform_cd"]
|
||||||
@@ -86,29 +90,74 @@ grid["age"] = base["age"]
|
|||||||
grid["gender_cd"] = base_gender
|
grid["gender_cd"] = base_gender
|
||||||
grid["device_platform_cd"] = base_device
|
grid["device_platform_cd"] = base_device
|
||||||
proba_grid = model.predict_proba(grid)[:, 1]
|
proba_grid = model.predict_proba(grid)[:, 1]
|
||||||
plt.figure(figsize=(10, 4))
|
|
||||||
plt.plot(grid["avg_imp_per_day"], proba_grid, marker="o")
|
|
||||||
plt.xlabel("avg_imp_per_day")
|
|
||||||
plt.ylabel("P(high CTR)")
|
|
||||||
plt.title("Предсказанная вероятность высокого CTR vs плотность показов")
|
|
||||||
plt.tight_layout()
|
|
||||||
plt.savefig(project_root / "spam_hypot" / "best_model_prob.png", dpi=150)
|
|
||||||
print("Saved best_model_prob.png")
|
|
||||||
|
|
||||||
# Dual axis CTR/CR vs fine bins
|
# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending
|
||||||
bins = pd.qcut(client["avg_imp_per_day"], 15, duplicates="drop")
|
stats_imp = (
|
||||||
stats_bin = client.groupby(bins, observed=False)[["ctr_all", "cr_click2order"]].median().reset_index().rename(columns={"index": "bin"})
|
client.groupby("avg_imp_per_day", as_index=False)
|
||||||
stats_bin["avg_imp_per_day"] = client.groupby(bins, observed=False)["avg_imp_per_day"].median().values
|
.agg(
|
||||||
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)
|
orders_mean=("orders_amt_total", "mean"),
|
||||||
fig, ax1 = plt.subplots(figsize=(12, 5))
|
n_clients=("id", "count"),
|
||||||
ax2 = ax1.twinx()
|
)
|
||||||
ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
|
.sort_values("avg_imp_per_day")
|
||||||
ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
|
)
|
||||||
ax1.set_ylabel("CTR")
|
|
||||||
ax2.set_ylabel("CR click→order")
|
K_MULT = 2  # a step is a spike when it is >= K_MULT times the mean of the 3 previous steps. NOTE(review): the earlier comment said "5x" but the value is 2 -- confirm; try 3 or 10 if needed
|
||||||
ax1.set_xlabel("avg_imp_per_day bins")
|
ABS_DY_MIN = 1
|
||||||
plt.xticks(rotation=35)
|
X_MAX = 16
|
||||||
ax1.set_title("CTR и CR по 15 бинам avg_imp_per_day")
|
|
||||||
fig.tight_layout()
|
stats_imp = stats_imp.sort_values("avg_imp_per_day").reset_index(drop=True)
|
||||||
plt.savefig(project_root / "spam_hypot" / "best_bins.png", dpi=150)
|
|
||||||
print("Saved best_bins.png")
|
# 1) cut by x
|
||||||
|
stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
|
||||||
|
|
||||||
|
# 2) detect vertical outliers by dy logic
|
||||||
|
before = len(stats_f)
|
||||||
|
y = stats_f["orders_mean"]
|
||||||
|
abs_dy = y.diff().abs()
|
||||||
|
|
||||||
|
prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
|
||||||
|
ratio = abs_dy / (prev3_mean.replace(0, np.nan)) # avoid inf when prev mean == 0
|
||||||
|
|
||||||
|
# `&` binds tighter than `|`, so this reads: (absolute step >= ABS_DY_MIN AND step ratio >= K_MULT) OR orders_mean > 5.
# NOTE(review): the hard-coded 5 is an absolute cap on orders_mean that is independent of the step logic -- confirm it is intended.
is_outlier = (abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT) | (y > 5)
|
||||||
|
# the first points cannot have a proper window of "3 previous deltas"
|
||||||
|
is_outlier = is_outlier.fillna(False)
|
||||||
|
|
||||||
|
stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
|
||||||
|
after = len(stats_f)
|
||||||
|
cleaned = before - after
|
||||||
|
|
||||||
|
print(f"{before} - {after}, cleaned: {cleaned}")
|
||||||
|
|
||||||
|
# --- smoothing (rolling mean on remaining points) ---
|
||||||
|
w = max(7, int(len(stats_f) * 0.05))
|
||||||
|
if w % 2 == 0:
|
||||||
|
w += 1
|
||||||
|
|
||||||
|
stats_f["orders_smooth"] = (
|
||||||
|
stats_f["orders_mean"]
|
||||||
|
.rolling(window=w, center=True, min_periods=1)
|
||||||
|
.mean()
|
||||||
|
)
|
||||||
|
|
||||||
|
plt.figure(figsize=(10, 10))
|
||||||
|
|
||||||
|
# raw (filtered) curve
|
||||||
|
plt.plot(
|
||||||
|
stats_f["avg_imp_per_day"], stats_f["orders_mean"],
|
||||||
|
marker="o", linewidth=1, alpha=0.35, label="Orders (mean)"
|
||||||
|
)
|
||||||
|
|
||||||
|
# smoothed trend (RED)
|
||||||
|
plt.plot(
|
||||||
|
stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
|
||||||
|
color="red", linewidth=2.5, label=f"Smoothed trend"
|
||||||
|
)
|
||||||
|
|
||||||
|
plt.xlabel("avg_imp_per_day")
|
||||||
|
plt.ylabel("Orders (mean)")
|
||||||
|
plt.title(f"Orders + smoothed trend vs avg_imp_per_day")
|
||||||
|
plt.legend()
|
||||||
|
plt.tight_layout()
|
||||||
|
|
||||||
|
plt.savefig(project_root / "spam_hypot" / "orders_vs_avg_imp_per_day_filtered_smoothed.png", dpi=150)
|
||||||
|
print("Saved orders_vs_avg_imp_per_day_filtered_smoothed.png")
|
||||||
|
|||||||
|
Before Width: | Height: | Size: 41 KiB After Width: | Height: | Size: 47 KiB |
BIN
spam_hypot/orders_vs_avg_imp_per_day.png
Normal file
|
After Width: | Height: | Size: 91 KiB |
BIN
spam_hypot/orders_vs_avg_imp_per_day_filtered_smoothed.png
Normal file
|
After Width: | Height: | Size: 422 KiB |
BIN
spam_hypot/orders_vs_avg_imp_per_day_smoothed.png
Normal file
|
After Width: | Height: | Size: 177 KiB |
BIN
spam_hypot/orders_vs_avg_imp_per_day_smoothed_clean.png
Normal file
|
After Width: | Height: | Size: 70 KiB |