add full script

2025-12-13 03:10:17 +03:00
parent b274076ee7
commit 76ded21d8f
5 changed files with 37 additions and 56 deletions
--- a/spam_hypot/best_model_and_plots.py
+++ b/spam_hypot/best_model_and_plots.py
@@ -5,13 +5,6 @@ import numpy as np
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.compose import ColumnTransformer
 from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
 from sklearn.ensemble import GradientBoostingClassifier
 from sklearn.metrics import roc_auc_score
 sns.set_theme(style="whitegrid")
 plt.rcParams["figure.figsize"] = (10, 5)
@@ -54,43 +47,6 @@ client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_t
 # Fit a gradient-boosting classifier that predicts whether a client falls in the
 # top quartile of order_rate, report its ROC AUC on a held-out split, and then
 # score a 50-point grid over avg_imp_per_day with all other features held fixed.
 client["order_rate_pct"] = 100 * client["order_rate"]  # percent scale, easier to read
 client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
 # target: high orders/impressions, i.e. order_rate at or above its 75th percentile
 client["high_or"] = (client["order_rate"] >= client["order_rate"].quantile(0.75)).astype(int)
 X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
 # Copy so the normalisation below does not write into a slice of `client`.
 X = X.copy()
 # Categorical columns are passed through project helpers before training
 # (their exact mapping is defined in `eda`, not visible here).
 X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
 X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
 y = client["high_or"]
 num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
 cat_cols = ["gender_cd", "device_platform_cd"]
 # Numeric features: median imputation then standardisation.
 # Categorical features: one-hot encoding; unseen categories are ignored at predict time.
 pre = ColumnTransformer([
    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
 ])
 model = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
 # 80/20 split, stratified on the target, with a fixed seed for reproducibility.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
 model.fit(X_train, y_train)
 # Column 1 of predict_proba is the probability of the positive class (high_or == 1).
 proba = model.predict_proba(X_test)[:, 1]
 auc = roc_auc_score(y_test, proba)
 print("Best model AUC:", auc)
 # Probability vs avg_imp_per_day
 # Sweep avg_imp_per_day over its observed range in 50 evenly spaced steps.
 grid = pd.DataFrame({"avg_imp_per_day": np.linspace(client["avg_imp_per_day"].min(), client["avg_imp_per_day"].max(), 50)})
 # Hold the remaining features at a "typical" client: numeric medians and categorical modes.
 base = client.median(numeric_only=True)
 # NOTE(review): the modes below come from the raw `client` columns, while the model
 # was trained on values passed through eda.normalize_gender / eda.normalize_device.
 # If those helpers change the values, the grid categories would be unknown to the
 # encoder and silently ignored (handle_unknown="ignore") - confirm, or take the
 # modes from X instead.
 base_gender = client["gender_cd"].mode().iat[0]
 base_device = client["device_platform_cd"].mode().iat[0]
 grid["imp_total"] = base["imp_total"]
 grid["click_total"] = base["click_total"]
 grid["age"] = base["age"]
 grid["gender_cd"] = base_gender
 grid["device_platform_cd"] = base_device
 # Predicted probability of high_or == 1 at each grid point.
 proba_grid = model.predict_proba(grid)[:, 1]
 # Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending
 stats_imp = (
    client.groupby("avg_imp_per_day", as_index=False)
@@ -138,26 +94,51 @@ stats_f["orders_smooth"] = (
    .rolling(window=w, center=True, min_periods=1)
    .mean()
 )
 # --- cost line (linear expenses) ---
 # Normalise so the cost scale is comparable with the orders scale: the slope `c`
 # is chosen so the line reaches max(orders_smooth) at max(avg_imp_per_day).
 # NOTE(review): assumes max(avg_imp_per_day) is non-zero - confirm upstream filtering.
 c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
 stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
-plt.figure(figsize=(10, 10))
+# plot
 plt.figure(figsize=(10, 8))
 # raw (filtered) curve
 plt.plot(
-    stats_f["avg_imp_per_day"], stats_f["orders_mean"],
+    stats_f["avg_imp_per_day"],
-    marker="o", linewidth=1, alpha=0.35, label="Orders (mean)"
+    stats_f["orders_mean"],
    marker="o",
    linewidth=1,
    alpha=0.3,
    label="Среднее число заказов"
 )
 # smoothed trend (RED)
 plt.plot(
-    stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
+    stats_f["avg_imp_per_day"],
-    color="red", linewidth=2.5, label=f"Smoothed trend"
+    stats_f["orders_smooth"],
    color="red",
    linewidth=2.5,
    label="Сглаженный тренд заказов"
 )
-plt.xlabel("avg_imp_per_day")
+plt.plot(
-plt.ylabel("Orders (mean)")
+    stats_f["avg_imp_per_day"],
-plt.title(f"Orders + smoothed trend vs avg_imp_per_day")
+    stats_f["cost_line"],
    color="black",
    linestyle="--",
    linewidth=2,
    label="Линейные расходы на показы"
 )
 plt.xlabel("Среднее число показов в день")
 plt.ylabel("Среднее число заказов")
 plt.title("Зависимость заказов от интенсивности коммуникаций")
 plt.legend()
 plt.grid(alpha=0.3)
 plt.tight_layout()
-plt.savefig(project_root / "spam_hypot" / "orders_vs_avg_imp_per_day_filtered_smoothed.png", dpi=150)
+plt.savefig(
-print("Saved orders_vs_avg_imp_per_day_filtered_smoothed.png")
+    project_root / "spam_hypot" / "orders_vs_avg_imp_with_costs.png",
    dpi=150
 )
 print("Saved orders_vs_avg_imp_with_costs.png")
--- a/spam_hypot/orders_vs_avg_imp_with_costs.png
+++ b/spam_hypot/orders_vs_avg_imp_with_costs.png
--- a/spam_hypot/orders_vs_avg_imp_without_costs.png
+++ b/spam_hypot/orders_vs_avg_imp_without_costs.png
--- a/spam_hypot/orders_vs_avg_imp_without_costs_no_filter.png
+++ b/spam_hypot/orders_vs_avg_imp_without_costs_no_filter.png
--- a/spam_hypot/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png
+++ b/spam_hypot/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png