Files
dano2025/spam_hypot/best_model_and_plots.py
2025-12-12 23:17:56 +03:00

115 lines
4.9 KiB
Python

import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
# Plot defaults used by every figure in this script. The figure size is set
# after the seaborn theme, in the same order as before.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})

# project_root is the parent of the directory that contains this file. Its
# "preanalysis" subdirectory is appended to sys.path so that the eda_utils
# module can be imported below.
project_root = Path(__file__).resolve().parents[1]
preanalysis_dir = project_root / "preanalysis"
sys.path.append(str(preanalysis_dir))
import eda_utils as eda  # noqa: E402
# Load the whole "communications" table into a DataFrame, parsing business_dt
# as datetimes.
db_path = project_root / "dataset" / "ds.sqlite"

# sqlite3.connect() creates a new, empty database file when the path does not
# exist; the failure would then only show up later as a missing-table error.
# Fail early with a clear message instead, and do not leave a stray file behind.
if not db_path.is_file():
    raise FileNotFoundError(f"SQLite dataset not found: {db_path}")

conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(
        "select * from communications",
        conn,
        parse_dates=["business_dt"],
    )
finally:
    # Always release the connection, including when the query raises.
    conn.close()
# Per-row totals: each new column is the row-wise sum over one group of source
# columns; the column lists themselves come from eda_utils.
total_sources = {
    "active_imp_total": eda.ACTIVE_IMP_COLS,
    "passive_imp_total": eda.PASSIVE_IMP_COLS,
    "active_click_total": eda.ACTIVE_CLICK_COLS,
    "passive_click_total": eda.PASSIVE_CLICK_COLS,
    "orders_amt_total": eda.ORDER_COLS,
}
for total_name, source_cols in total_sources.items():
    df[total_name] = df[source_cols].sum(axis=1)

# Overall impressions and clicks combine the active and passive totals.
combined_totals = {
    "imp_total": ("active_imp_total", "passive_imp_total"),
    "click_total": ("active_click_total", "passive_click_total"),
}
for combined_name, (active_name, passive_name) in combined_totals.items():
    df[combined_name] = df[active_name] + df[passive_name]
def _first_mode(values):
    """Return the most frequent non-null value of *values*, or NaN if none exists.

    ``Series.mode()`` drops nulls by default, so a group whose values are all
    null produces an empty result; calling ``.iat[0]`` on it directly would
    raise ``IndexError``. When several values tie, the first entry of the
    ``mode()`` result is returned.
    """
    modes = values.mode()
    return np.nan if modes.empty else modes.iat[0]


# Number of distinct business_dt values per client id.
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

# One row per client id: summed impression/click/order totals, median age,
# most frequent gender and device platform, plus the contact-day count.
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", _first_mode),
        device_platform_cd=("device_platform_cd", _first_mode),
    )
    # Both sides are indexed by "id" here; reset_index() afterwards turns the
    # id back into a regular column.
    .merge(contact_days, on="id", how="left")
    .reset_index()
)
# Client-level ratios as (new column, numerator column, denominator column).
# The division itself is delegated to eda.safe_divide.
ratio_specs = [
    ("ctr_all", "click_total", "imp_total"),
    ("cr_click2order", "orders_amt_total", "click_total"),
    ("avg_imp_per_day", "imp_total", "contact_days"),
]
for ratio_name, numerator, denominator in ratio_specs:
    client[ratio_name] = eda.safe_divide(client[numerator], client[denominator])

# Binary target: 1 when the client's overall CTR is at or above the 75th
# percentile of ctr_all across all clients, else 0.
# NOTE(review): what safe_divide returns for a zero denominator is not visible
# here; if it is NaN, the comparison is False and such clients get label 0 -
# confirm that is intended.
ctr_cutoff = client["ctr_all"].quantile(0.75)
client["high_ctr"] = client["ctr_all"].ge(ctr_cutoff).astype(int)
# Feature columns, split by how the preprocessing treats them.
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]

# NOTE(review): the target high_ctr is derived from ctr_all, which is computed
# from click_total and imp_total - both of which are also features here. That
# looks like target leakage (the model can rebuild the label from its inputs);
# confirm whether this is intentional before trusting the reported AUC.

# Work on a copy so that the normalization below does not write into `client`.
X = client[num_cols + cat_cols].copy()
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])

# Binary target column.
y = client["high_ctr"]
# Preprocessing: numeric columns are median-imputed and standardized;
# categorical columns are one-hot encoded, with categories unseen during fit
# ignored at prediction time.
numeric_steps = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
pre = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", categorical_encoder, cat_cols),
    ]
)

# Full model: preprocessing followed by a seeded gradient-boosting classifier.
classifier = GradientBoostingClassifier(random_state=42)
model = Pipeline(steps=[("pre", pre), ("clf", classifier)])

# Stratified 80/20 split with a fixed seed, then fit on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
model.fit(X_train, y_train)

# Score the held-out part using the predicted probability of class 1.
proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
print("Best model AUC:", auc)
# Probability vs avg_imp_per_day
# Sweep avg_imp_per_day over its observed range in 50 evenly spaced steps while
# every other feature is held at a typical value, then ask the fitted model
# for P(high CTR) at each step.
grid = pd.DataFrame(
    {
        "avg_imp_per_day": np.linspace(
            client["avg_imp_per_day"].min(),
            client["avg_imp_per_day"].max(),
            50,
        )
    }
)

# Numeric baseline: per-column medians over all clients.
base = client.median(numeric_only=True)

# Categorical baseline: take the modes from X, not from `client`. The model was
# fitted on the gender/device columns after eda.normalize_gender /
# eda.normalize_device were applied to X; the raw `client` values may not match
# any category the encoder learned, and with handle_unknown="ignore" such a
# value would silently be encoded as all zeros instead of the modal category.
base_gender = X["gender_cd"].mode().iat[0]
base_device = X["device_platform_cd"].mode().iat[0]

grid["imp_total"] = base["imp_total"]
grid["click_total"] = base["click_total"]
grid["age"] = base["age"]
grid["gender_cd"] = base_gender
grid["device_platform_cd"] = base_device

# Probability of class 1 for every grid row.
proba_grid = model.predict_proba(grid)[:, 1]
# Line plot of the predicted probability along the avg_imp_per_day sweep,
# written next to this script as best_model_prob.png.
prob_fig, prob_ax = plt.subplots(figsize=(10, 4))
prob_ax.plot(grid["avg_imp_per_day"], proba_grid, marker="o")
prob_ax.set_xlabel("avg_imp_per_day")
prob_ax.set_ylabel("P(high CTR)")
prob_ax.set_title("Предсказанная вероятность высокого CTR vs плотность показов")
prob_fig.tight_layout()

prob_plot_path = project_root / "spam_hypot" / "best_model_prob.png"
prob_fig.savefig(prob_plot_path, dpi=150)
print("Saved best_model_prob.png")
# Dual axis CTR/CR vs fine bins
# Split clients into up to 15 quantile bins of avg_imp_per_day (duplicate bin
# edges are dropped, so there may be fewer) and take per-bin medians.
bins = pd.qcut(client["avg_imp_per_day"], 15, duplicates="drop")
by_bin = client.groupby(bins, observed=False)

# reset_index() exposes the bin intervals as a column named after the grouped
# Series ("avg_imp_per_day"); that column is then overwritten with the per-bin
# median so each bin is represented by a single number.
stats_bin = by_bin[["ctr_all", "cr_click2order"]].median().reset_index()
stats_bin["avg_imp_per_day"] = by_bin["avg_imp_per_day"].median().values
# String labels make matplotlib treat the x axis as categorical (evenly spaced).
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)

# CTR on the left y axis, click->order conversion on the right y axis.
fig, ax1 = plt.subplots(figsize=(12, 5))
ax2 = ax1.twinx()
ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
ax1.set_ylabel("CTR")
ax2.set_ylabel("CR click→order")
ax1.set_xlabel("avg_imp_per_day bins")
# Rotate the tick labels on ax1 explicitly. plt.xticks() acts on the current
# axes, which after twinx() is ax2, and ax2's x axis is hidden - so the visible
# labels would stay unrotated.
ax1.tick_params(axis="x", labelrotation=35)
ax1.set_title("CTR и CR по 15 бинам avg_imp_per_day")
fig.tight_layout()
fig.savefig(project_root / "spam_hypot" / "best_bins.png", dpi=150)
print("Saved best_bins.png")