spam hypot

2025-12-12 23:17:56 +03:00
parent 174a96038f
commit ce595182b9
21 changed files with 2845 additions and 362 deletions
--- a/spam_hypot/best_model_and_plots.py
+++ b/spam_hypot/best_model_and_plots.py
@@ -0,0 +1,162 @@
+"""Fit a gradient-boosting "high CTR" classifier and render two diagnostic plots.
+
+The script runs top to bottom:
+
+1. Load the ``communications`` table from ``dataset/ds.sqlite``.
+2. Sum the impression / click / order column groups into per-row totals and
+   aggregate them to one row per client ``id``.
+3. Label clients at or above the 75th percentile of overall CTR (``high_ctr``)
+   and fit a ``GradientBoostingClassifier`` on volume and demographic features.
+4. Save ``best_model_prob.png`` (predicted probability along an
+   ``avg_imp_per_day`` grid) and ``best_bins.png`` (median CTR and CR per
+   quantile bin of ``avg_imp_per_day``).
+"""
+import sqlite3
+from contextlib import closing
+from pathlib import Path
+import sys
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.metrics import roc_auc_score
+
+sns.set_theme(style="whitegrid")
+plt.rcParams["figure.figsize"] = (10, 5)
+
+# eda_utils is imported from the sibling "preanalysis" directory, which is
+# appended to sys.path first.
+project_root = Path(__file__).resolve().parent.parent
+sys.path.append(str(project_root / "preanalysis"))
+import eda_utils as eda  # noqa: E402
+
+
+def _first_mode(values):
+    """Return the first modal value of *values*, or NaN when there is none.
+
+    ``Series.mode()`` ignores NaN by default, so an all-NaN group yields an
+    empty result and an unguarded ``.iat[0]`` would raise ``IndexError``.
+    """
+    modes = values.mode()
+    return modes.iat[0] if not modes.empty else np.nan
+
+
+db_path = project_root / "dataset" / "ds.sqlite"
+# closing() releases the connection even if the query raises; using the
+# connection itself as a context manager would only scope a transaction.
+with closing(sqlite3.connect(db_path)) as conn:
+    df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
+
+# Row-level totals: sum each eda.*_COLS column group across its columns.
+for cols, name in [
+    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
+    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
+    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
+    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
+    (eda.ORDER_COLS, "orders_amt_total"),
+]:
+    df[name] = df[cols].sum(axis=1)
+
+df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
+df["click_total"] = df["active_click_total"] + df["passive_click_total"]
+
+# One row per client id; contact_days counts that client's distinct business_dt values.
+contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
+client = (
+    df.groupby("id")
+    .agg(
+        imp_total=("imp_total", "sum"),
+        click_total=("click_total", "sum"),
+        orders_amt_total=("orders_amt_total", "sum"),
+        age=("age", "median"),
+        gender_cd=("gender_cd", _first_mode),
+        device_platform_cd=("device_platform_cd", _first_mode),
+    )
+    .merge(contact_days, on="id", how="left")
+    .reset_index()
+)
+# NOTE(review): eda.safe_divide is defined outside this file; presumably it
+# guards against a zero denominator - confirm what it returns in that case.
+client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
+client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"])
+client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
+# Target: 1 for clients at or above the 75th percentile of ctr_all, else 0.
+client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)
+
+# NOTE(review): ctr_all (and therefore high_ctr) is built from click_total and
+# imp_total, and both are also model features, so the label may be largely
+# recoverable from the inputs; interpret the AUC below with that in mind.
+X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]].copy()
+X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
+X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
+y = client["high_ctr"]
+
+num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
+cat_cols = ["gender_cd", "device_platform_cd"]
+pre = ColumnTransformer([
+    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
+    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
+])
+
+model = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
+model.fit(X_train, y_train)
+proba = model.predict_proba(X_test)[:, 1]
+auc = roc_auc_score(y_test, proba)
+print("Best model AUC:", auc)
+
+# Probability vs avg_imp_per_day: sweep avg_imp_per_day over its observed range
+# while every other feature is pinned to a typical value.
+grid = pd.DataFrame({"avg_imp_per_day": np.linspace(client["avg_imp_per_day"].min(), client["avg_imp_per_day"].max(), 50)})
+for col in ("imp_total", "click_total", "age"):
+    grid[col] = X[col].median()
+# Baseline categories are taken from X (the normalized values the encoder was
+# fitted on) rather than from the raw client columns: a raw code the encoder
+# never saw would be silently encoded as all zeros by handle_unknown="ignore".
+for col in cat_cols:
+    grid[col] = _first_mode(X[col])
+proba_grid = model.predict_proba(grid[X.columns])[:, 1]
+fig_prob, ax_prob = plt.subplots(figsize=(10, 4))
+ax_prob.plot(grid["avg_imp_per_day"], proba_grid, marker="o")
+ax_prob.set_xlabel("avg_imp_per_day")
+ax_prob.set_ylabel("P(high CTR)")
+ax_prob.set_title("Предсказанная вероятность высокого CTR vs плотность показов")
+fig_prob.tight_layout()
+fig_prob.savefig(project_root / "spam_hypot" / "best_model_prob.png", dpi=150)
+plt.close(fig_prob)
+print("Saved best_model_prob.png")
+
+# Dual axis CTR/CR vs fine bins
+bins = pd.qcut(client["avg_imp_per_day"], 15, duplicates="drop")
+by_bin = client.groupby(bins, observed=False)
+stats_bin = by_bin[["ctr_all", "cr_click2order"]].median().reset_index(drop=True)
+stats_bin["avg_imp_per_day"] = by_bin["avg_imp_per_day"].median().to_numpy()
+stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)
+# Plot against integer positions: matplotlib treats string x-values as
+# categories, so two bins whose medians round to the same label would otherwise
+# collapse onto a single x position.
+positions = np.arange(len(stats_bin))
+fig, ax1 = plt.subplots(figsize=(12, 5))
+ax2 = ax1.twinx()
+(ctr_line,) = ax1.plot(positions, stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
+(cr_line,) = ax2.plot(positions, stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
+ax1.set_ylabel("CTR")
+ax2.set_ylabel("CR click→order")
+ax1.set_xlabel("avg_imp_per_day bins")
+# Rotate the tick labels on ax1 directly: twinx() makes ax2 the current axes and
+# hides its x-axis, so plt.xticks(rotation=...) would not change what is drawn.
+ax1.set_xticks(positions)
+ax1.set_xticklabels(stats_bin["bin_label"].tolist(), rotation=35)
+# The two lines live on different axes, so collect both handles into one legend.
+ax2.legend(handles=[ctr_line, cr_line])
+ax1.set_title("CTR и CR по 15 бинам avg_imp_per_day")
+fig.tight_layout()
+fig.savefig(project_root / "spam_hypot" / "best_bins.png", dpi=150)
+plt.close(fig)
+print("Saved best_bins.png")