import sqlite3
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# Make the project-level EDA helpers importable.
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis"))
import eda_utils as eda  # noqa: E402

# Load the raw communications table.
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query(
    "select * from communications", conn, parse_dates=["business_dt"]
)
conn.close()

# Roll up impression / click / order columns into per-row totals.
for cols, name in [
    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
    (eda.ORDER_COLS, "orders_amt_total"),
]:
    df[name] = df[cols].sum(axis=1)

df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]

# Aggregate to one row per client.
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
        device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
    )
    .merge(contact_days, on="id", how="left")
    .reset_index()
)

# ... everything as in your version up to and including the client["ctr_all"] calculation
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])

# --- Split first, derive the target afterwards ---
train_idx, test_idx = train_test_split(
    client.index, test_size=0.2, random_state=42
)
train = client.loc[train_idx].copy()
test = client.loc[test_idx].copy()

thr = train["ctr_all"].quantile(0.75)  # threshold computed on train only
train["high_ctr"] = (train["ctr_all"] >= thr).astype(int)
test["high_ctr"] = (test["ctr_all"] >= thr).astype(int)

# --- Features without click_total (otherwise the target leaks) ---
feature_cols = [
    "avg_imp_per_day",
    "imp_total",
    "contact_days",  # fine to keep this one
    "age",
    "gender_cd",
    "device_platform_cd",
]
X_train = train[feature_cols].copy()
X_test = test[feature_cols].copy()

X_train["gender_cd"] = eda.normalize_gender(X_train["gender_cd"])
X_train["device_platform_cd"] = eda.normalize_device(X_train["device_platform_cd"])
X_test["gender_cd"] = eda.normalize_gender(X_test["gender_cd"])
X_test["device_platform_cd"] = eda.normalize_device(X_test["device_platform_cd"])

y_train = train["high_ctr"]
y_test = test["high_ctr"]

num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]

pre = ColumnTransformer([
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])

log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])

# Fit both models and compare ROC AUC on the held-out test set.
results = {}
for name, model in [("log_reg", log_reg), ("gb", gb)]:
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    results[name] = roc_auc_score(y_test, proba)

print("CTR threshold (train 0.75q):", thr)
print("AUC results:", results)

# Feature importances from the fitted gradient boosting pipeline.
imp = gb.named_steps["clf"].feature_importances_
feat = gb.named_steps["pre"].get_feature_names_out()
imp_df = (
    pd.DataFrame({"feature": feat, "importance": imp})
    .sort_values("importance", ascending=False)
)
print(imp_df.head(15))