fully working spam hypot

2025-12-12 23:27:23 +03:00
parent ce595182b9
commit c5c10d1fcf
19 changed files with 39 additions and 3882 deletions
--- a/spam_hypot/model_compare.py
+++ b/spam_hypot/model_compare.py
@@ -46,32 +46,62 @@ client = (
    .merge(contact_days, on="id", how="left")
    .reset_index()
 )
+# Unchanged from the previous version up to and including the client["ctr_all"] computation.
+
 client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
 client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
-client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)

-X = client[["avg_imp_per_day", "imp_total", "click_total", "age", "gender_cd", "device_platform_cd"]]
-X = X.copy()
-X["gender_cd"] = eda.normalize_gender(X["gender_cd"])
-X["device_platform_cd"] = eda.normalize_device(X["device_platform_cd"])
-y = client["high_ctr"]
+# --- Split first, derive the target afterwards, so the CTR threshold is not computed from test rows ---
+train_idx, test_idx = train_test_split(
+    client.index, test_size=0.2, random_state=42
+)

-num_cols = ["avg_imp_per_day", "imp_total", "click_total", "age"]
+train = client.loc[train_idx].copy()
+test = client.loc[test_idx].copy()
+
+thr = train["ctr_all"].quantile(0.75)   # threshold taken from the training split only
+train["high_ctr"] = (train["ctr_all"] >= thr).astype(int)
+test["high_ctr"]  = (test["ctr_all"]  >= thr).astype(int)
+
+# --- Features exclude click_total: it is the numerator of ctr_all, so keeping it would leak the target ---
+X_train = train[[
+    "avg_imp_per_day", "imp_total", "contact_days",  # these can stay
+    "age", "gender_cd", "device_platform_cd"
+]].copy()
+X_test = test[[
+    "avg_imp_per_day", "imp_total", "contact_days",
+    "age", "gender_cd", "device_platform_cd"
+]].copy()
+
+X_train["gender_cd"] = eda.normalize_gender(X_train["gender_cd"])
+X_train["device_platform_cd"] = eda.normalize_device(X_train["device_platform_cd"])
+X_test["gender_cd"] = eda.normalize_gender(X_test["gender_cd"])
+X_test["device_platform_cd"] = eda.normalize_device(X_test["device_platform_cd"])
+
+y_train = train["high_ctr"]
+y_test = test["high_ctr"]
+
+num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
 cat_cols = ["gender_cd", "device_platform_cd"]
+
 pre = ColumnTransformer([
-    ("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), num_cols),
+    ("num", Pipeline([
+        ("imputer", SimpleImputer(strategy="median")),
+        ("scaler", StandardScaler())
+    ]), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
 ])

 log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
 gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])

-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
 results = {}
 for name, model in [("log_reg", log_reg), ("gb", gb)]:
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    results[name] = roc_auc_score(y_test, proba)
+
+print("CTR threshold (train 0.75q):", thr)
 print("AUC results:", results)

 imp = gb.named_steps["clf"].feature_importances_