some refactoring

This commit is contained in:
dan
2025-12-14 17:07:57 +03:00
parent 935639c3d6
commit cfee72470c
28 changed files with 7 additions and 1755 deletions

110  main_hypot/model_compare.py  (new file)

@@ -0,0 +1,110 @@
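# Compare two classifiers (logistic regression vs. gradient boosting) at
# predicting "high-CTR" clients, built from per-client aggregates of the raw
# communications table.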
import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
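# eda_utils supplies the channel column groups (ACTIVE_IMP_COLS, PASSIVE_IMP_COLS,
# ACTIVE_CLICK_COLS, PASSIVE_CLICK_COLS, ORDER_COLS) and the safe_divide /
# normalize_gender / normalize_device helpers used below.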
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("SELECT * FROM communications", conn, parse_dates=["business_dt"])
conn.close()
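# Collapse each group of per-channel columns into a single row-level total.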
for cols, name in [
    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
    (eda.ORDER_COLS, "orders_amt_total"),
]:
    df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        # mode() is empty for an all-NaN group, so fall back to NaN instead of raising
        gender_cd=("gender_cd", lambda s: s.mode().iat[0] if not s.mode().empty else np.nan),
        device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0] if not s.mode().empty else np.nan),
    )
    .merge(contact_days, on="id", how="left")
    .reset_index()
)
# ... everything as before, up to and including the client["ctr_all"] calculation
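# safe_divide is assumed to return NaN/0 when the denominator is zero (clients
# with no impressions or no contact days) rather than raising or producing inf.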
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# --- SPLIT FIRST, DERIVE THE TARGET AFTERWARDS ---
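# Splitting before defining the label keeps the test fold's CTR distribution out
# of the threshold choice, so the target definition cannot leak across the split.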
train_idx, test_idx = train_test_split(
    client.index, test_size=0.2, random_state=42
)
train = client.loc[train_idx].copy()
test = client.loc[test_idx].copy()
thr = train["ctr_all"].quantile(0.75)  # threshold computed on train only
train["high_ctr"] = (train["ctr_all"] >= thr).astype(int)
test["high_ctr"] = (test["ctr_all"] >= thr).astype(int)
# --- FEATURES WITHOUT click_total (otherwise we leak the target) ---
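# click_total is the numerator of ctr_all, so feeding it (or ctr_all itself) to
# the model would effectively hand it the target.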
X_train = train[[
"avg_imp_per_day", "imp_total", "contact_days", # можно оставить
"age", "gender_cd", "device_platform_cd"
]].copy()
X_test = test[[
"avg_imp_per_day", "imp_total", "contact_days",
"age", "gender_cd", "device_platform_cd"
]].copy()
X_train["gender_cd"] = eda.normalize_gender(X_train["gender_cd"])
X_train["device_platform_cd"] = eda.normalize_device(X_train["device_platform_cd"])
X_test["gender_cd"] = eda.normalize_gender(X_test["gender_cd"])
X_test["device_platform_cd"] = eda.normalize_device(X_test["device_platform_cd"])
y_train = train["high_ctr"]
y_test = test["high_ctr"]
num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]
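# Preprocessing: median-impute and standardize the numeric features; one-hot
# encode the categoricals, ignoring categories unseen at fit time.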
pre = ColumnTransformer([
("num", Pipeline([
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler())
]), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
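# Note: both pipelines share the same `pre` instance and Pipeline.fit refits it
# in place; that is harmless here (log_reg is fit and scored before gb refits
# the shared preprocessor), but sklearn.base.clone(pre) per pipeline would keep
# the two models fully independent.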
results = {}
for name, model in [("log_reg", log_reg), ("gb", gb)]:
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    results[name] = roc_auc_score(y_test, proba)
print("CTR threshold (train 0.75q):", thr)
print("AUC results:", results)
imp = gb.named_steps["clf"].feature_importances_
feat = gb.named_steps["pre"].get_feature_names_out()
imp_df = pd.DataFrame({"feature": feat, "importance": imp}).sort_values("importance", ascending=False)
print(imp_df.head(15))