gadem
This commit is contained in:
110
old data/model_compare.py
Normal file
110
old data/model_compare.py
Normal file
@@ -0,0 +1,110 @@
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.ensemble import GradientBoostingClassifier
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
# Directory two levels above this file (the parent of the folder holding it).
project_root = Path(__file__).resolve().parent.parent

# eda_utils is imported from the "preanalysis_old_bad" directory, so that
# directory has to be on sys.path before the import below runs.
eda_helpers_dir = project_root / "preanalysis_old_bad"
sys.path.append(str(eda_helpers_dir))
import eda_utils as eda  # noqa: E402
|
||||
|
||||
# Load the whole "communications" table into a DataFrame, parsing
# business_dt as a datetime column.
db_path = project_root / "dataset" / "ds.sqlite"
if not db_path.is_file():
    # sqlite3.connect() would otherwise create an empty database at this path
    # and the query below would fail with a misleading "no such table" error.
    raise FileNotFoundError(f"SQLite dataset not found: {db_path}")

conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(
        "select * from communications",
        conn,
        parse_dates=["business_dt"],
    )
finally:
    # Release the connection even when the query raises.
    conn.close()
|
||||
|
||||
# Row-wise totals: every new column is the sum (across columns) of one group
# of source columns whose names are listed in eda_utils.
row_total_sources = {
    "active_imp_total": eda.ACTIVE_IMP_COLS,
    "passive_imp_total": eda.PASSIVE_IMP_COLS,
    "active_click_total": eda.ACTIVE_CLICK_COLS,
    "passive_click_total": eda.PASSIVE_CLICK_COLS,
    "orders_amt_total": eda.ORDER_COLS,
}
for total_col, source_cols in row_total_sources.items():
    df[total_col] = df[source_cols].sum(axis=1)

# Combined totals: active + passive.
df["imp_total"] = df["active_imp_total"].add(df["passive_imp_total"])
df["click_total"] = df["active_click_total"].add(df["passive_click_total"])
|
||||
def _most_frequent(values):
    """Return the first mode of *values*, or NaN when there is none.

    ``Series.mode()`` drops missing values by default, so a group whose
    values are all missing produces an empty result; indexing that with
    ``.iat[0]`` would raise ``IndexError`` and abort the whole aggregation.
    """
    modes = values.mode()
    return modes.iat[0] if not modes.empty else np.nan


# Number of distinct business_dt values per id.
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

# One row per id: summed totals, median age, most frequent gender/device
# code, plus the contact_days count joined on id.
# NOTE(review): a NaN gender/device code can now reach eda.normalize_gender /
# eda.normalize_device instead of crashing here -- confirm they accept it.
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", _most_frequent),
        device_platform_cd=("device_platform_cd", _most_frequent),
    )
    .merge(contact_days, on="id", how="left")
    .reset_index()
)
|
||||
# Per-client ratio features, each computed with eda.safe_divide(numerator,
# denominator):
#   ctr_all         <- click_total, imp_total
#   avg_imp_per_day <- imp_total, contact_days
# NOTE(review): the result for a zero denominator is decided inside
# eda_utils.safe_divide, which is not visible here.
ratio_features = (
    ("ctr_all", "click_total", "imp_total"),
    ("avg_imp_per_day", "imp_total", "contact_days"),
)
for ratio_col, numerator_col, denominator_col in ratio_features:
    client[ratio_col] = eda.safe_divide(client[numerator_col], client[denominator_col])
|
||||
|
||||
# --- SPLIT FIRST, DEFINE THE TARGET AFTERWARDS ---
# 80/20 split of the client rows with a fixed seed.
train_idx, test_idx = train_test_split(client.index, random_state=42, test_size=0.2)

train = client.loc[train_idx].copy()
test = client.loc[test_idx].copy()

# The threshold is the 0.75 quantile of ctr_all over the training rows only;
# the same value then labels both parts (1 when ctr_all >= threshold, else 0).
thr = train["ctr_all"].quantile(0.75)
for part in (train, test):
    part["high_ctr"] = part["ctr_all"].ge(thr).astype(int)
|
||||
|
||||
# --- FEATURES EXCLUDE click_total (using it would be cheating: the target
# is derived from ctr_all, which is built from click_total) ---
# The impression-based columns (avg_imp_per_day, imp_total, contact_days)
# are deliberately kept.
feature_cols = [
    "avg_imp_per_day", "imp_total", "contact_days",
    "age", "gender_cd", "device_platform_cd",
]
X_train = train[feature_cols].copy()
X_test = test[feature_cols].copy()

# Run the categorical codes through the eda_utils normalisers, train first.
for features in (X_train, X_test):
    features["gender_cd"] = eda.normalize_gender(features["gender_cd"])
    features["device_platform_cd"] = eda.normalize_device(features["device_platform_cd"])

y_train = train["high_ctr"]
y_test = test["high_ctr"]
|
||||
|
||||
num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]


def _make_preprocessor():
    """Build a fresh, unfitted preprocessing step.

    Numeric columns get median imputation followed by standard scaling.
    Categorical columns are one-hot encoded; categories not seen during
    fitting are ignored at transform time.
    """
    return ColumnTransformer([
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ])


# Unfitted template, kept under its original module-level name.
pre = _make_preprocessor()

# Each pipeline owns its preprocessor. Pipeline.fit fits its steps in place,
# so a single shared instance would be refit by whichever model trains last
# and the two pipelines would silently share fitted state.
log_reg = Pipeline([
    ("pre", _make_preprocessor()),
    ("clf", LogisticRegression(max_iter=1000)),
])
gb = Pipeline([
    ("pre", _make_preprocessor()),
    ("clf", GradientBoostingClassifier(random_state=42)),
])
|
||||
|
||||
# Fit every candidate on the training part and score it with ROC AUC on the
# held-out part.
candidates = {"log_reg": log_reg, "gb": gb}
results = {}
for label, estimator in candidates.items():
    estimator.fit(X_train, y_train)
    # Second predict_proba column: probability of the high_ctr == 1 class.
    positive_proba = estimator.predict_proba(X_test)[:, 1]
    results[label] = roc_auc_score(y_test, positive_proba)

print("CTR threshold (train 0.75q):", thr)
print("AUC results:", results)
|
||||
|
||||
# Pair the fitted gradient-boosting importances with the output feature names
# of the fitted preprocessor, then show the 15 largest.
gb_steps = gb.named_steps
imp_df = pd.DataFrame(
    {
        "feature": gb_steps["pre"].get_feature_names_out(),
        "importance": gb_steps["clf"].feature_importances_,
    }
)
imp_df = imp_df.sort_values(by="importance", ascending=False)
print(imp_df.head(15))
|
||||
Reference in New Issue
Block a user