some refactoring

2025-12-14 17:07:57 +03:00
parent 935639c3d6
commit cfee72470c
28 changed files with 7 additions and 1755 deletions
--- a/main_hypot/best_bins.png
+++ b/main_hypot/best_bins.png
--- a/main_hypot/best_model_and_plots.py
+++ b/main_hypot/best_model_and_plots.py
@@ -0,0 +1,144 @@
+import sqlite3
+from pathlib import Path
+import sys
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+from contextlib import closing
+
+# Global plotting defaults for every figure produced by this script.
+sns.set_theme(style="whitegrid")
+plt.rcParams["figure.figsize"] = (10, 5)
+
+# eda_utils lives in a sibling directory, so that directory is appended to
+# sys.path before the import.
+project_root = Path(__file__).resolve().parent.parent
+sys.path.append(str(project_root / "preanalysis_old_bad"))
+import eda_utils as eda  # noqa: E402
+
+# Load the whole communications table. closing() guarantees the connection is
+# released even when the query raises (e.g. missing table or unreadable file).
+db_path = project_root / "dataset" / "ds.sqlite"
+with closing(sqlite3.connect(db_path)) as conn:
+    df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
+
+def _first_mode(values):
+    """Return the most frequent non-null value of a Series, or NaN if there is none."""
+    modes = values.mode()
+    return modes.iat[0] if not modes.empty else np.nan
+
+
+# Row-level totals: sum each group of columns listed in eda_utils.
+for cols, name in [
+    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
+    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
+    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
+    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
+    (eda.ORDER_COLS, "orders_amt_total"),
+]:
+    df[name] = df[cols].sum(axis=1)
+
+df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
+df["click_total"] = df["active_click_total"] + df["passive_click_total"]
+
+# Number of distinct business dates on which each client appears.
+contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
+
+# One row per client. A client whose gender/device is null on every row has an
+# empty mode(); _first_mode maps that to NaN instead of raising IndexError.
+client = (
+    df.groupby("id")
+    .agg(
+        imp_total=("imp_total", "sum"),
+        click_total=("click_total", "sum"),
+        orders_amt_total=("orders_amt_total", "sum"),
+        age=("age", "median"),
+        gender_cd=("gender_cd", _first_mode),
+        device_platform_cd=("device_platform_cd", _first_mode),
+    )
+    .merge(contact_days, on="id", how="left")
+    .reset_index()
+)
+# NOTE(review): eda.safe_divide is defined in eda_utils; presumably it guards
+# against zero denominators - confirm what it returns in that case.
+client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"])  # orders / impressions
+client["order_rate_pct"] = 100 * client["order_rate"]  # percent scale is easier to read
+client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
+
+# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted
+# ascending and re-indexed 0..n-1.
+stats_imp = (
+    client.groupby("avg_imp_per_day", as_index=False)
+    .agg(
+        orders_mean=("orders_amt_total", "mean"),
+        n_clients=("id", "count"),
+    )
+    .sort_values("avg_imp_per_day")
+    .reset_index(drop=True)
+)
+
+# Outlier-filter parameters.
+K_MULT = 2  # a jump is "several times larger" when it is >= K_MULT x the mean of the 3 previous jumps; try 3/5/10
+ABS_DY_MIN = 1  # jumps smaller than this are never flagged by the jump rule
+X_MAX = 16  # keep only points with avg_imp_per_day <= X_MAX
+Y_MAX = 5  # points whose mean orders exceed this are always dropped
+
+# 1) cut by x
+stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
+
+# 2) detect vertical outliers from the point-to-point jump |dy|
+before = len(stats_f)
+y = stats_f["orders_mean"]
+abs_dy = y.diff().abs()
+
+# Mean of the three jumps preceding the current one; NaN until three exist.
+prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
+ratio = abs_dy / (prev3_mean.replace(0, np.nan))  # avoid inf when prev mean == 0
+
+# Explicit parentheses: (large AND sudden jump) OR mean orders above Y_MAX.
+is_outlier = ((abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT)) | (y > Y_MAX)
+# The first points cannot have three previous deltas; treat any missing flag as "not an outlier".
+is_outlier = is_outlier.fillna(False)
+
+stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
+after = len(stats_f)
+cleaned = before - after
+
+print(f"{before} - {after}, cleaned: {cleaned}")
+
+# --- smoothing: centred rolling mean over the points that survived filtering ---
+# Window is 5% of the point count but never below 7; setting the lowest bit
+# bumps an even size to the next odd one.
+w = max(7, int(len(stats_f) * 0.05)) | 1
+
+smoother = stats_f["orders_mean"].rolling(window=w, center=True, min_periods=1)
+stats_f["orders_smooth"] = smoother.mean()
+
+# --- cost line (expenses linear in impressions) ---
+# The slope is scaled so that the line reaches the maximum of the smoothed
+# curve at the largest x, keeping both on a comparable scale.
+c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
+stats_f["cost_line"] = stats_f["avg_imp_per_day"] * c
+
+# plot: raw means, smoothed trend and the linear cost line on one axis
+plt.figure(figsize=(10, 8))
+
+x_values = stats_f["avg_imp_per_day"]
+curves = [
+    ("orders_mean", dict(marker="o", linewidth=1, alpha=0.3, label="Среднее число заказов")),
+    ("orders_smooth", dict(color="red", linewidth=2.5, label="Сглаженный тренд заказов")),
+    ("cost_line", dict(color="black", linestyle="--", linewidth=2, label="Линейные расходы на показы")),
+]
+for column, line_style in curves:
+    plt.plot(x_values, stats_f[column], **line_style)
+
+plt.xlabel("Среднее число показов в день")
+plt.ylabel("Среднее число заказов")
+plt.title("Зависимость заказов от интенсивности коммуникаций")
+
+plt.legend()
+plt.grid(alpha=0.3)
+plt.tight_layout()
+
+# The figure is written next to this script, inside main_hypot/.
+output_path = project_root / "main_hypot" / "orders_vs_avg_imp_with_costs.png"
+plt.savefig(output_path, dpi=150)
+
+print("Saved orders_vs_avg_imp_with_costs.png")
--- a/main_hypot/best_model_prob.png
+++ b/main_hypot/best_model_prob.png
--- a/main_hypot/model_compare.py
+++ b/main_hypot/model_compare.py
@@ -0,0 +1,110 @@
+import sqlite3
+from pathlib import Path
+import sys
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.impute import SimpleImputer
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.metrics import roc_auc_score
+
+from contextlib import closing
+
+# eda_utils lives in a sibling directory, so that directory is appended to
+# sys.path before the import.
+project_root = Path(__file__).resolve().parent.parent
+sys.path.append(str(project_root / "preanalysis_old_bad"))
+import eda_utils as eda  # noqa: E402
+
+# Load the whole communications table. closing() guarantees the connection is
+# released even when the query raises.
+db_path = project_root / "dataset" / "ds.sqlite"
+with closing(sqlite3.connect(db_path)) as conn:
+    df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
+
+def _first_mode(values):
+    """Return the most frequent non-null value of a Series, or NaN if there is none."""
+    modes = values.mode()
+    return modes.iat[0] if not modes.empty else np.nan
+
+
+# Row-level totals: sum each group of columns listed in eda_utils.
+for cols, name in [
+    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
+    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
+    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
+    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
+    (eda.ORDER_COLS, "orders_amt_total"),
+]:
+    df[name] = df[cols].sum(axis=1)
+
+df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
+df["click_total"] = df["active_click_total"] + df["passive_click_total"]
+
+# Number of distinct business dates on which each client appears.
+contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
+
+# One row per client. A client whose gender/device is null on every row has an
+# empty mode(); _first_mode maps that to NaN instead of raising IndexError.
+client = (
+    df.groupby("id")
+    .agg(
+        imp_total=("imp_total", "sum"),
+        click_total=("click_total", "sum"),
+        orders_amt_total=("orders_amt_total", "sum"),
+        age=("age", "median"),
+        gender_cd=("gender_cd", _first_mode),
+        device_platform_cd=("device_platform_cd", _first_mode),
+    )
+    .merge(contact_days, on="id", how="left")
+    .reset_index()
+)
+
+# Clicks per impression and impressions per contact day, per client.
+# NOTE(review): eda.safe_divide is defined in eda_utils; presumably it guards
+# against zero denominators - confirm what it returns in that case.
+client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
+client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
+
+# --- split first, define the target afterwards ---
+train_idx, test_idx = train_test_split(
+    client.index, test_size=0.2, random_state=42
+)
+train, test = client.loc[train_idx].copy(), client.loc[test_idx].copy()
+
+# The threshold is the 0.75 quantile of the training part only, then applied to both parts.
+thr = train["ctr_all"].quantile(0.75)
+for part in (train, test):
+    part["high_ctr"] = (part["ctr_all"] >= thr).astype(int)
+
+# --- features without click_total: ctr_all (and so the target) is built from it ---
+num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
+cat_cols = ["gender_cd", "device_platform_cd"]
+feature_cols = num_cols + cat_cols
+
+X_train = train[feature_cols].copy()
+X_test = test[feature_cols].copy()
+
+# Categorical codes are normalised with the eda_utils helpers.
+for features in (X_train, X_test):
+    features["gender_cd"] = eda.normalize_gender(features["gender_cd"])
+    features["device_platform_cd"] = eda.normalize_device(features["device_platform_cd"])
+
+y_train, y_test = train["high_ctr"], test["high_ctr"]
+
+def make_preprocessor():
+    """Build a fresh, unfitted preprocessor: impute + scale num_cols, one-hot encode cat_cols."""
+    return ColumnTransformer([
+        ("num", Pipeline([
+            ("imputer", SimpleImputer(strategy="median")),
+            ("scaler", StandardScaler())
+        ]), num_cols),
+        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
+    ])
+
+
+# Each pipeline gets its own preprocessor. A Pipeline keeps the step objects it
+# is given, so one shared ColumnTransformer would be refitted by the second
+# model and silently replace the state used by the first.
+log_reg = Pipeline([("pre", make_preprocessor()), ("clf", LogisticRegression(max_iter=1000))])
+gb = Pipeline([("pre", make_preprocessor()), ("clf", GradientBoostingClassifier(random_state=42))])
+
+# Fit both models on the training part and score ROC AUC on the held-out part.
+results = {}
+for name, model in [("log_reg", log_reg), ("gb", gb)]:
+    model.fit(X_train, y_train)
+    proba = model.predict_proba(X_test)[:, 1]  # probability of the positive class
+    results[name] = roc_auc_score(y_test, proba)
+
+print("CTR threshold (train 0.75q):", thr)
+print("AUC results:", results)
+
+# Feature importances of the gradient boosting model, labelled with the
+# preprocessor's output feature names.
+imp = gb.named_steps["clf"].feature_importances_
+feat = gb.named_steps["pre"].get_feature_names_out()
+imp_df = pd.DataFrame({"feature": feat, "importance": imp}).sort_values("importance", ascending=False)
+print(imp_df.head(15))
--- a/main_hypot/orders_vs_avg_imp_per_day.png
+++ b/main_hypot/orders_vs_avg_imp_per_day.png
--- a/main_hypot/orders_vs_avg_imp_per_day_filtered_smoothed.png
+++ b/main_hypot/orders_vs_avg_imp_per_day_filtered_smoothed.png
--- a/main_hypot/orders_vs_avg_imp_per_day_smoothed.png
+++ b/main_hypot/orders_vs_avg_imp_per_day_smoothed.png
--- a/main_hypot/orders_vs_avg_imp_per_day_smoothed_clean.png
+++ b/main_hypot/orders_vs_avg_imp_per_day_smoothed_clean.png
--- a/main_hypot/orders_vs_avg_imp_with_costs.png
+++ b/main_hypot/orders_vs_avg_imp_with_costs.png
--- a/main_hypot/orders_vs_avg_imp_without_costs.png
+++ b/main_hypot/orders_vs_avg_imp_without_costs.png
--- a/main_hypot/orders_vs_avg_imp_without_costs_no_filter.png
+++ b/main_hypot/orders_vs_avg_imp_without_costs_no_filter.png
--- a/main_hypot/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png
+++ b/main_hypot/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png
--- a/main_hypot/quad_regression_with_costs.png
+++ b/main_hypot/quad_regression_with_costs.png
--- a/main_hypot/quadreg.py
+++ b/main_hypot/quadreg.py
@@ -0,0 +1,240 @@
+import sqlite3
+from pathlib import Path
+import sys
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+import statsmodels.api as sm
+
+from contextlib import closing
+
+# Global plotting defaults for every figure produced by this script.
+sns.set_theme(style="whitegrid")
+plt.rcParams["figure.figsize"] = (10, 6)
+
+# -----------------------------
+# Load + feature engineering
+# -----------------------------
+# eda_utils lives in a sibling directory, so that directory is appended to
+# sys.path before the import.
+project_root = Path(__file__).resolve().parent.parent
+sys.path.append(str(project_root / "preanalysis_old_bad"))
+import eda_utils as eda  # noqa: E402
+
+# closing() guarantees the connection is released even when the query raises.
+db_path = project_root / "dataset" / "ds.sqlite"
+with closing(sqlite3.connect(db_path)) as conn:
+    df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
+
+def _first_mode(values):
+    """Return the most frequent non-null value of a Series, or NaN if there is none."""
+    modes = values.mode()
+    return modes.iat[0] if not modes.empty else np.nan
+
+
+# Row-level totals: sum each group of columns listed in eda_utils.
+for cols, name in [
+    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
+    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
+    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
+    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
+    (eda.ORDER_COLS, "orders_amt_total"),
+]:
+    df[name] = df[cols].sum(axis=1)
+
+df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
+df["click_total"] = df["active_click_total"] + df["passive_click_total"]
+
+# Number of distinct business dates on which each client appears.
+contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
+
+# One row per client. A client whose gender/device is null on every row has an
+# empty mode(); _first_mode maps that to NaN instead of raising IndexError.
+client = (
+    df.groupby("id")
+    .agg(
+        imp_total=("imp_total", "sum"),
+        click_total=("click_total", "sum"),
+        orders_amt_total=("orders_amt_total", "sum"),
+        age=("age", "median"),
+        gender_cd=("gender_cd", _first_mode),
+        device_platform_cd=("device_platform_cd", _first_mode),
+    )
+    .merge(contact_days, on="id", how="left")
+    .reset_index()
+)
+
+# NOTE(review): eda.safe_divide is defined in eda_utils; presumably it guards
+# against zero denominators - confirm what it returns in that case.
+client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"])
+client["order_rate_pct"] = 100 * client["order_rate"]
+client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
+
+# -----------------------------
+# Aggregate curve points: one row per exact avg_imp_per_day value
+# -----------------------------
+stats_imp = client.groupby("avg_imp_per_day", as_index=False).agg(
+    orders_mean=("orders_amt_total", "mean"),
+    n_clients=("id", "count"),
+)
+stats_imp = stats_imp.sort_values("avg_imp_per_day").reset_index(drop=True)
+
+# -----------------------------
+# Filtering / outlier logic
+# -----------------------------
+K_MULT = 2
+ABS_DY_MIN = 1
+X_MAX = 16
+
+# Keep only the x range of interest.
+within_x_range = stats_imp["avg_imp_per_day"] <= X_MAX
+stats_f = stats_imp[within_x_range].copy().reset_index(drop=True)
+before = len(stats_f)
+
+# Point-to-point jump and its ratio to the mean of the three previous jumps.
+y = stats_f["orders_mean"]
+abs_dy = y.diff().abs()
+prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
+ratio = abs_dy / prev3_mean.replace(0, np.nan)
+
+# A point is dropped when its jump is both large and sudden, or when its mean exceeds 5.
+sudden_jump = (abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT)
+too_high = y > 5
+is_outlier = (sudden_jump | too_high).fillna(False)
+
+stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
+after = len(stats_f)
+print(f"Фильтрация: было {before}, стало {after}, убрали {before-after} точек")
+
+# -----------------------------
+# Smoothing (kept for the plot only; the regression is fitted on orders_mean)
+# -----------------------------
+# Window is 5% of the point count but never below 7; setting the lowest bit
+# bumps an even size to the next odd one.
+w = max(7, int(len(stats_f) * 0.05)) | 1
+
+smoother = stats_f["orders_mean"].rolling(window=w, center=True, min_periods=1)
+stats_f["orders_smooth"] = smoother.mean()
+
+# -----------------------------
+# Cost line, normalised to "units of orders"
+# -----------------------------
+# The slope makes the line reach the maximum of the smoothed curve at the largest x.
+c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
+stats_f["cost_line"] = stats_f["avg_imp_per_day"] * c
+
+# -----------------------------
+# Quadratic regression: orders_mean ~ 1 + x + x^2
+# WLS with weights = n_clients
+# -----------------------------
+x = stats_f["avg_imp_per_day"].to_numpy()
+y = stats_f["orders_mean"].to_numpy()
+wts = stats_f["n_clients"].to_numpy().astype(float)
+
+X = sm.add_constant(np.column_stack([x, x**2]))  # columns: [1, x, x^2]
+
+model = sm.WLS(y, X, weights=wts)
+res = model.fit(cov_type="HC3")  # heteroscedasticity-robust standard errors
+
+b0, b1, b2 = res.params
+p_b1_two, p_b2_two = res.pvalues[1], res.pvalues[2]
+
+# One-sided p-values for the directional hypotheses b1 > 0 and b2 < 0:
+# half the two-sided value when the estimate has the hypothesised sign.
+p_b1_pos = p_b1_two / 2 if b1 > 0 else 1 - p_b1_two / 2
+p_b2_neg = p_b2_two / 2 if b2 < 0 else 1 - p_b2_two / 2
+
+# Turning point of the parabola, defined only when it is concave.
+if b2 < 0:
+    x_star = -b1 / (2 * b2)
+    y_star = b0 + b1 * x_star + b2 * x_star**2
+else:
+    x_star = y_star = None
+
+# Intersection with the cost line: b0 + b1 x + b2 x^2 = c x  ->  b2 x^2 + (b1 - c) x + b0 = 0
+x_lo = stats_f["avg_imp_per_day"].min()
+x_hi = stats_f["avg_imp_per_day"].max()
+real_roots = [r.real for r in np.roots([b2, (b1 - c), b0]) if abs(r.imag) < 1e-8]
+roots_in_range = [r for r in real_roots if x_lo <= r <= x_hi]
+# When several roots fall inside the observed range, take the right-most one.
+x_cross = max(roots_in_range) if roots_in_range else None
+
+# -----------------------------
+# Print results + plain-language interpretation
+# -----------------------------
+print("\n=== Квадратичная регрессия (WLS, веса = n_clients, SE = HC3) ===")
+print(res.summary())
+
+print("\n=== Проверка гипотезы убывающей отдачи / спада ===")
+print(f"β1 (линейный эффект): {b1:.6f}, двусторонний p={p_b1_two:.4g}, односторонний p(β1>0)={p_b1_pos:.4g}")
+print(f"β2 (кривизна):       {b2:.6f}, двусторонний p={p_b2_two:.4g}, односторонний p(β2<0)={p_b2_neg:.4g}")
+
+# The hypothesis counts as supported only when both signs match and both
+# one-sided tests pass at level alpha.
+alpha = 0.05
+signs_match = (b1 > 0) and (b2 < 0)
+both_significant = (p_b1_pos < alpha) and (p_b2_neg < alpha)
+support = signs_match and both_significant
+
+if not support:
+    print("\nВывод: строгого статистического подтверждения по знакам/значимости может не хватить.")
+    print("Но знак коэффициентов и форма кривой всё равно могут быть согласованы с гипотезой.")
+    print("На защите говори аккуратно: 'наблюдается тенденция/согласуется с гипотезой'.")
+else:
+    print("\nВывод: данные поддерживают гипотезу нелинейности.")
+    print("Есть статистически значимый рост на малых x (β1>0) и насыщение/спад (β2<0).")
+
+# Vertex of the parabola, with a warning when it lies outside the observed x range.
+if x_star is None:
+    print("\nВершина не считается как максимум: β2 >= 0 (нет выпуклости вниз).")
+else:
+    print(f"\nОценка 'порога насыщения' (вершина параболы): x* = {x_star:.3f} показов/день")
+    print(f"Прогноз среднего числа заказов в x*: y(x*) ≈ {y_star:.3f}")
+    observed_min = stats_f["avg_imp_per_day"].min()
+    observed_max = stats_f["avg_imp_per_day"].max()
+    if not (observed_min <= x_star <= observed_max):
+        print("Внимание: x* вне диапазона наблюдений, интерпретация как 'оптимума' сомнительная.")
+
+# Intersection of the fitted curve with the cost line, if one was found.
+if x_cross is None:
+    print("\nПересечение с линией расходов в выбранной нормировке не найдено (или вне диапазона).")
+else:
+    y_cross = b0 + b1 * x_cross + b2 * x_cross**2
+    print(f"\nТочка пересечения с линейными расходами (в нормировке c={c:.4f}): x≈{x_cross:.3f}, y≈{y_cross:.3f}")
+
+# -----------------------------
+# Plot: points + smooth + quadratic fit + cost + markers
+# -----------------------------
+x_lo = stats_f["avg_imp_per_day"].min()
+x_hi = stats_f["avg_imp_per_day"].max()
+
+# Dense grid over the observed x range for the fitted curve and the cost line.
+x_grid = np.linspace(x_lo, x_hi, 300)
+y_hat = b0 + b1 * x_grid + b2 * x_grid**2
+cost_hat = c * x_grid
+
+plt.figure(figsize=(10, 8))
+
+observed_x = stats_f["avg_imp_per_day"]
+curves = [
+    (observed_x, stats_f["orders_mean"],
+     dict(marker="o", linestyle="-", linewidth=1, alpha=0.3, label="Среднее число заказов (по точкам)")),
+    (observed_x, stats_f["orders_smooth"],
+     dict(color="red", linewidth=2.2, label="Сглаженный тренд (rolling mean)")),
+    (x_grid, y_hat,
+     dict(color="blue", linewidth=2.5, label="Квадратичная регрессия (WLS)")),
+    (x_grid, cost_hat,
+     dict(color="black", linestyle="--", linewidth=2, label="Линейные расходы на показы")),
+]
+for xs, ys, line_style in curves:
+    plt.plot(xs, ys, **line_style)
+
+# Mark the parabola vertex only when it falls inside the observed x range.
+vertex_in_range = x_star is not None and (x_lo <= x_star <= x_hi)
+if vertex_in_range:
+    plt.axvline(x_star, color="blue", linestyle=":", linewidth=2)
+    plt.scatter([x_star], [y_star], color="blue", zorder=5)
+    plt.text(x_star, y_star, f"  x*={x_star:.2f}", va="bottom")
+
+# Mark the intersection with the cost line when one was found.
+if x_cross is not None:
+    y_cross = b0 + b1 * x_cross + b2 * x_cross**2
+    plt.axvline(x_cross, color="black", linestyle=":", linewidth=2, alpha=0.8)
+    plt.scatter([x_cross], [y_cross], color="black", zorder=5)
+    plt.text(x_cross, y_cross, f"  пересечение≈{x_cross:.2f}", va="top")
+
+plt.xlabel("Среднее число показов в день")
+plt.ylabel("Среднее число заказов")
+plt.title("Нелинейный эффект интенсивности коммуникаций: квадратичная регрессия")
+plt.legend()
+plt.grid(alpha=0.3)
+plt.tight_layout()
+
+# Create the output directory if needed, then save the figure.
+out_dir = project_root / "main_hypot"
+out_dir.mkdir(parents=True, exist_ok=True)
+out_path = out_dir / "quad_regression_with_costs.png"
+plt.savefig(out_path, dpi=150)
+print(f"\nSaved: {out_path}")
--- a/main_hypot/stat_analysis.py
+++ b/main_hypot/stat_analysis.py
@@ -0,0 +1,87 @@
+import sqlite3
+from pathlib import Path
+import sys
+import numpy as np
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+from scipy import stats
+
+from contextlib import closing
+
+# Global plotting defaults for every figure produced by this script.
+sns.set_theme(style="whitegrid")
+plt.rcParams["figure.figsize"] = (10, 5)
+
+# eda_utils lives in a sibling directory, so that directory is appended to
+# sys.path before the import.
+project_root = Path(__file__).resolve().parent.parent
+sys.path.append(str(project_root / "preanalysis_old_bad"))
+import eda_utils as eda  # noqa: E402
+
+# Load the whole communications table. closing() guarantees the connection is
+# released even when the query raises.
+db_path = project_root / "dataset" / "ds.sqlite"
+with closing(sqlite3.connect(db_path)) as conn:
+    df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
+
+def _first_mode(values):
+    """Return the most frequent non-null value of a Series, or NaN if there is none."""
+    modes = values.mode()
+    return modes.iat[0] if not modes.empty else np.nan
+
+
+# Row-level totals: sum each group of columns listed in eda_utils.
+for cols, name in [
+    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
+    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
+    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
+    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
+    (eda.ORDER_COLS, "orders_amt_total"),
+]:
+    df[name] = df[cols].sum(axis=1)
+
+df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
+df["click_total"] = df["active_click_total"] + df["passive_click_total"]
+
+# Number of distinct business dates on which each client appears.
+contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
+
+# One row per client. A client whose gender/device is null on every row has an
+# empty mode(); _first_mode maps that to NaN instead of raising IndexError.
+client = (
+    df.groupby("id")
+    .agg(
+        imp_total=("imp_total", "sum"),
+        click_total=("click_total", "sum"),
+        orders_amt_total=("orders_amt_total", "sum"),
+        age=("age", "median"),
+        gender_cd=("gender_cd", _first_mode),
+        device_platform_cd=("device_platform_cd", _first_mode),
+    )
+    .merge(contact_days, on="id", how="left")
+    .reset_index()
+)
+
+# Per-client ratios and binary flags.
+# NOTE(review): eda.safe_divide is defined in eda_utils; presumably it guards
+# against zero denominators - confirm what it returns in that case.
+client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
+client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"])
+client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
+# high_ctr: CTR at or above the 0.75 quantile over all clients.
+client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)
+client["has_order"] = (client["orders_amt_total"] > 0).astype(int)
+
+# Summary: descriptive statistics of the main per-client metrics and the
+# ten columns with the highest share of missing values.
+summary = client[["imp_total", "click_total", "orders_amt_total", "contact_days", "avg_imp_per_day", "ctr_all", "cr_click2order"]].describe().T
+print("Summary\n", summary)
+missing = client.isna().mean().sort_values(ascending=False)
+print("Missing\n", missing.head(10))
+
+# Correlations and Mann-Whitney
+# The ratio columns can contain NaN (the CTR samples below already need
+# dropna()). With spearmanr's default nan_policy="propagate" a single NaN makes
+# the whole result NaN, so incomplete pairs are omitted instead.
+corr_ctr = stats.spearmanr(client["avg_imp_per_day"], client["ctr_all"], nan_policy="omit")
+corr_cr = stats.spearmanr(client["avg_imp_per_day"], client["cr_click2order"], nan_policy="omit")
+
+# Compare CTR between the lowest and highest quartile of contact intensity.
+q1 = client["avg_imp_per_day"].quantile(0.25)
+q4 = client["avg_imp_per_day"].quantile(0.75)  # lower edge of the top quartile
+low = client.loc[client["avg_imp_per_day"] <= q1, "ctr_all"].dropna()
+high = client.loc[client["avg_imp_per_day"] >= q4, "ctr_all"].dropna()
+# One-sided test: is CTR in the low-intensity group stochastically greater?
+wu = stats.mannwhitneyu(low, high, alternative="greater")
+print({"spearman_ctr": corr_ctr, "spearman_cr": corr_cr, "mw_low_gt_high": wu})
+
+# Bin stats and dual-axis plot
+# Clients are split into (up to) ten quantile bins of avg_imp_per_day; each
+# bin is summarised by the medians of CTR, CR and avg_imp_per_day itself.
+bins = pd.qcut(client["avg_imp_per_day"], 10, duplicates="drop")
+grouped = client.groupby(bins, observed=False)
+stats_bin = grouped[["ctr_all", "cr_click2order"]].median().reset_index(drop=True)
+stats_bin["avg_imp_per_day"] = grouped["avg_imp_per_day"].median().values
+stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)
+
+fig, ax1 = plt.subplots(figsize=(12, 5))
+ax2 = ax1.twinx()
+(line_ctr,) = ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
+(line_cr,) = ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
+ax1.set_ylabel("CTR")
+ax2.set_ylabel("CR click→order")
+ax1.set_xlabel("avg_imp_per_day bins")
+# Rotate the labels on ax1: after twinx() the current axes is ax2, whose
+# x-axis is hidden, so plt.xticks(rotation=...) would have no visible effect.
+ax1.tick_params(axis="x", labelrotation=35)
+# One legend for both axes, drawn on the top-most axes so no line covers it.
+ax2.legend(handles=[line_ctr, line_cr], loc="best")
+ax1.set_title("CTR и CR по децилям avg_imp_per_day")
+fig.tight_layout()
+fig.savefig(project_root / "main_hypot" / "stat_bins.png", dpi=150)
+print("Saved plot stat_bins.png")
--- a/main_hypot/stat_bins.png
+++ b/main_hypot/stat_bins.png