Files
dano2025/spam_hypot/best_model_and_plots.py
2025-12-13 02:46:54 +03:00

164 lines
5.7 KiB
Python

import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis"))
import eda_utils as eda # noqa: E402
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
)
.merge(contact_days, on="id", how="left")
.reset_index()
)
client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"]) # orders / impressions
client["order_rate_pct"] = 100 * client["order_rate"] # чтобы шкала была человеческая
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# таргет: высокий orders/impressions
client["high_or"] = (client["order_rate"] >= client["order_rate"].quantile(0.75)).astype(int)
X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
X = X.copy()
X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
y = client["high_or"]
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]
pre = ColumnTransformer([
("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
model = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
model.fit(X_train, y_train)
proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, proba)
print("Best model AUC:", auc)
# Probability vs avg_imp_per_day
grid = pd.DataFrame({"avg_imp_per_day": np.linspace(client["avg_imp_per_day"].min(), client["avg_imp_per_day"].max(), 50)})
base = client.median(numeric_only=True)
base_gender = client["gender_cd"].mode().iat[0]
base_device = client["device_platform_cd"].mode().iat[0]
grid["imp_total"] = base["imp_total"]
grid["click_total"] = base["click_total"]
grid["age"] = base["age"]
grid["gender_cd"] = base_gender
grid["device_platform_cd"] = base_device
proba_grid = model.predict_proba(grid)[:, 1]
# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending
stats_imp = (
client.groupby("avg_imp_per_day", as_index=False)
.agg(
orders_mean=("orders_amt_total", "mean"),
n_clients=("id", "count"),
)
.sort_values("avg_imp_per_day")
)
K_MULT = 2 # "в разы" -> 5x. Поменяй на 3/10 если хочешь
ABS_DY_MIN = 1
X_MAX = 16
stats_imp = stats_imp.sort_values("avg_imp_per_day").reset_index(drop=True)
# 1) cut by x
stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
# 2) detect vertical outliers by dy logic
before = len(stats_f)
y = stats_f["orders_mean"]
abs_dy = y.diff().abs()
prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
ratio = abs_dy / (prev3_mean.replace(0, np.nan)) # avoid inf when prev mean == 0
is_outlier = (abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT) | (y > 5)
# первые точки не могут нормально иметь "3 предыдущих дельты"
is_outlier = is_outlier.fillna(False)
stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
after = len(stats_f)
cleaned = before - after
print(f"{before} - {after}, cleaned: {cleaned}")
# --- smoothing (rolling mean on remaining points) ---
w = max(7, int(len(stats_f) * 0.05))
if w % 2 == 0:
w += 1
stats_f["orders_smooth"] = (
stats_f["orders_mean"]
.rolling(window=w, center=True, min_periods=1)
.mean()
)
plt.figure(figsize=(10, 10))
# raw (filtered) curve
plt.plot(
stats_f["avg_imp_per_day"], stats_f["orders_mean"],
marker="o", linewidth=1, alpha=0.35, label="Orders (mean)"
)
# smoothed trend (RED)
plt.plot(
stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
color="red", linewidth=2.5, label=f"Smoothed trend"
)
plt.xlabel("avg_imp_per_day")
plt.ylabel("Orders (mean)")
plt.title(f"Orders + smoothed trend vs avg_imp_per_day")
plt.legend()
plt.tight_layout()
plt.savefig(project_root / "spam_hypot" / "orders_vs_avg_imp_per_day_filtered_smoothed.png", dpi=150)
print("Saved orders_vs_avg_imp_per_day_filtered_smoothed.png")