# dano2025/spam_hypot/model_compare.py

import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# The project root is the parent of the directory that contains this file.
project_root = Path(__file__).resolve().parents[1]

# eda_utils lives in <project_root>/preanalysis, which is not a package on
# the default import path, so the directory is added to sys.path first.
preanalysis_dir = project_root / "preanalysis"
sys.path.append(str(preanalysis_dir))
import eda_utils as eda  # noqa: E402

# Load the whole ``communications`` table into a DataFrame, parsing
# ``business_dt`` as a datetime column.
db_path = project_root / "dataset" / "ds.sqlite"
if not db_path.is_file():
    # sqlite3.connect() would otherwise silently create an empty database
    # file at this path and the query below would fail with "no such table".
    raise FileNotFoundError(f"SQLite database not found: {db_path}")
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(
        "select * from communications", conn, parse_dates=["business_dt"]
    )
finally:
    # Release the connection even when the query raises.
    conn.close()

# Row-wise totals: each new column is the sum of one group of source columns
# whose names are listed in eda_utils.
total_sources = {
    "active_imp_total": eda.ACTIVE_IMP_COLS,
    "passive_imp_total": eda.PASSIVE_IMP_COLS,
    "active_click_total": eda.ACTIVE_CLICK_COLS,
    "passive_click_total": eda.PASSIVE_CLICK_COLS,
    "orders_amt_total": eda.ORDER_COLS,
}
for total_name, source_cols in total_sources.items():
    df[total_name] = df[source_cols].sum(axis=1)

# Combined totals across the active and passive groups.
df["imp_total"] = df[["active_imp_total", "passive_imp_total"]].sum(axis=1, skipna=False)
df["click_total"] = df[["active_click_total", "passive_click_total"]].sum(axis=1, skipna=False)
def _first_mode(values):
    """Return the most frequent non-null value of *values*, or NaN if none.

    ``Series.mode()`` returns an empty Series when every value is null, so
    indexing its first element unconditionally would raise ``IndexError``.
    """
    modes = values.mode()
    return modes.iat[0] if not modes.empty else np.nan


# Number of distinct ``business_dt`` values observed per client id.
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

# One row per client: summed totals, median age and the most frequent
# gender / device platform code.
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", _first_mode),
        device_platform_cd=("device_platform_cd", _first_mode),
    )
    .merge(contact_days, on="id", how="left")
    .reset_index()
)

# Click-through rate and average impressions per contact day.
# NOTE(review): eda.safe_divide presumably guards against zero denominators;
# confirm its behaviour in eda_utils.
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])

# Binary target: 1 when the client's CTR is at or above the 75th percentile
# of all clients' CTR, else 0.
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)

# Feature matrix: a copy of the selected client columns, so the normalisation
# below does not write back into ``client``.
# NOTE(review): the target ``high_ctr`` is derived from ``ctr_all``, which is
# computed from ``click_total`` and ``imp_total``; using both as features
# likely leaks the label into the model — confirm this is intended.
feature_cols = [
    "avg_imp_per_day",
    "imp_total",
    "click_total",
    "age",
    "gender_cd",
    "device_platform_cd",
]
X = client[feature_cols].copy()

# Categorical codes are passed through the eda_utils normalisers before
# encoding.
for column, normalizer in (
    ("gender_cd", eda.normalize_gender),
    ("device_platform_cd", eda.normalize_device),
):
    X[column] = normalizer(X[column])

# Binary target column.
y = client["high_ctr"]

# Column groups consumed by the preprocessing step.
num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]


def _make_preprocessor():
    """Build a fresh, unfitted preprocessing transformer.

    Numeric columns are median-imputed and then standardised; categorical
    columns are one-hot encoded, ignoring categories unseen during fit.
    """
    numeric = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    return ColumnTransformer([
        ("num", numeric, num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ])


# Pipeline does not clone its steps, so each model gets its own preprocessor
# instance; with a shared instance, fitting one pipeline would refit the
# transformer held by the other.
pre = _make_preprocessor()
log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
gb = Pipeline([
    ("pre", _make_preprocessor()),
    ("clf", GradientBoostingClassifier(random_state=42)),
])

# Hold out 20% of the clients for evaluation, keeping the class balance of
# ``y`` in both parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit every candidate on the training part and score it by ROC AUC on the
# held-out part, using the predicted probability of class 1.
candidates = {"log_reg": log_reg, "gb": gb}
results = {}
for label, estimator in candidates.items():
    estimator.fit(X_train, y_train)
    positive_scores = estimator.predict_proba(X_test)[:, 1]
    results[label] = roc_auc_score(y_test, positive_scores)
print("AUC results:", results)

# Report the gradient-boosting feature importances, labelled with the
# feature names produced by the fitted preprocessing step.
gb_classifier = gb.named_steps["clf"]
gb_preprocessor = gb.named_steps["pre"]
imp = gb_classifier.feature_importances_
feat = gb_preprocessor.get_feature_names_out()

imp_df = pd.DataFrame({"feature": feat, "importance": imp})
imp_df = imp_df.sort_values("importance", ascending=False)

# Show the 15 most important features.
print(imp_df.head(15))