good quadreg 0.92 r2

2025-12-14 22:53:28 +03:00
parent 4f8f266c3e
commit 3dc05530c0
19 changed files with 1126 additions and 1516 deletions
--- a/main_hypot/best_model_and_plots.py
+++ b/main_hypot/best_model_and_plots.py
@@ -1,43 +1,66 @@
 import sqlite3
 from pathlib import Path
 import sys
 from typing import Tuple
 import matplotlib.pyplot as plt
 from scipy.signal import savgol_filter
 import pandas as pd
 import seaborn as sns
 from statsmodels.nonparametric.smoothers_lowess import lowess
 import numpy as np
 sns.set_theme(style="whitegrid")
-plt.rcParams["figure.figsize"] = (10, 6)
+plt.rcParams["figure.figsize"] = (8, 8)
 project_root = Path(__file__).resolve().parent.parent
 sys.path.append(str(project_root / "preanalysis_old_bad"))
 import eda_utils as eda  # noqa: E402
 DB_PATH = project_root / "dataset" / "ds.sqlite"
-OUT_DIR = project_root / "main_hypot"
+BASE_OUT_DIR = project_root / "main_hypot"
-X_COL = "avg_imp_per_day"
+
-Y_COL = "orders_amt_total"
+# Константы данных
-X_MAX = 18  # обрезаем длинный хвост по показам, чтобы облака было легче читать
+CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
-SCATTER_COLOR = "#2c7bb6"
+ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
 PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
 ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
 # Константы визуализации/очистки
 X_COL = "avg_imp_per_day"  # x всегда фиксирован
 DEFAULT_X_MAX = 18
 DEFAULT_SCATTER_COLOR = "#2c7bb6"
 DEFAULT_POINT_SIZE = 20
 DEFAULT_ALPHA = 0.08
 DEFAULT_TREND_ALPHA = 0.1
 DEFAULT_TREND_FRAC = 0.3
 DEFAULT_TREND_COLOR = "red"
 DEFAULT_TREND_LINEWIDTH = 2.5
 DEFAULT_IQR_K = 1.5
 DEFAULT_Q_LOW = 0.05
 DEFAULT_Q_HIGH = 0.95
 DEFAULT_ALPHA_MIN = 0.04
 DEFAULT_ALPHA_MAX = 0.7
 DEFAULT_BINS_X = 60
 DEFAULT_BINS_Y = 60
 DEFAULT_Y_MIN = -0.5
 DEFAULT_Y_MAX = 10
 DEFAULT_TREND_METHOD = "savgol"  # options: lowess, rolling, savgol
 DEFAULT_ROLLING_WINDOW = 200
 DEFAULT_SAVGOL_WINDOW = 501
 DEFAULT_SAVGOL_POLY = 2
 def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide two Series element-wise, yielding NaN where the denominator is 0.

    Args:
        numerator: Values to divide.
        denominator: Values to divide by; zeros are treated as missing.

    Returns:
        ``numerator / denominator`` with NaN in every position whose
        denominator is zero (or already missing).
    """
    # Mask zeros with NaN via `where` instead of substituting `pd.NA`:
    # an integer denominator then becomes float64 rather than object dtype,
    # so the quotient stays numeric for later comparisons and quantiles.
    nonzero_denominator = denominator.where(denominator != 0)
    return numerator / nonzero_denominator
 def load_client_level(db_path: Path) -> pd.DataFrame:
-    """Собирает агрегаты по клиентам без усреднения по x."""
+    """Собирает агрегаты по клиентам без зависимостей от eda_utils."""
    conn = sqlite3.connect(db_path)
    df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
    conn.close()
-    for cols, name in [
+    df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
-        (eda.ACTIVE_IMP_COLS, "active_imp_total"),
+    df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
        (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
        (eda.ACTIVE_CLICK_COLS, "active_click_total"),
        (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
        (eda.ORDER_COLS, "orders_amt_total"),
    ]:
        df[name] = df[cols].sum(axis=1)
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
    client = (
        df.groupby("id")
@@ -49,94 +72,503 @@ def load_client_level(db_path: Path) -> pd.DataFrame:
        .reset_index()
    )
-    client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
+    client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
-    client[Y_COL] = client["orders_amt_total"]
+    print(f"Loaded {len(client)} clients with {X_COL} computed.")
-    client = client[["id", X_COL, Y_COL]].dropna()
+    return client
    in_range = client[client[X_COL] <= X_MAX].copy()
    print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
    return in_range
-def remove_outliers(df: pd.DataFrame, iqr_k: float = 1.5) -> pd.DataFrame:
+def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
-    """Убирает выбросы по IQR отдельно по x и y."""
+    q1, q3 = series.quantile([q_low, q_high])
    def bounds(series: pd.Series) -> tuple[float, float]:
        q1, q3 = series.quantile([0.05, 0.95])
    iqr = q3 - q1
    return q1 - iqr_k * iqr, q3 + iqr_k * iqr
-    x_low, x_high = bounds(df[X_COL])
+
-    y_low, y_high = bounds(df[Y_COL])
+def remove_outliers(
    df: pd.DataFrame,
    y_col: str,
    x_col: str = X_COL,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
 ) -> pd.DataFrame:
    """Убирает выбросы по IQR отдельно по x и y."""
    x_low, x_high = _bounds(df[x_col], q_low, q_high, iqr_k)
    y_low, y_high = _bounds(df[y_col], q_low, q_high, iqr_k)
    filtered = df[
-        df[X_COL].between(max(0, x_low), x_high)
+        df[x_col].between(max(0, x_low), x_high)
-        & df[Y_COL].between(max(0, y_low), y_high)
+        & df[y_col].between(max(0, y_low), y_high)
    ].copy()
-    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
+    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
    return filtered
 def compute_density_alpha(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    x_max: float,
    *,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    y_min: float = DEFAULT_Y_MIN,
    y_max_limit: float = DEFAULT_Y_MAX,
 ) -> np.ndarray:
    """Compute a per-point opacity from the local point density on a 2D grid.

    The (x, y) plane is split into ``bins_x`` by ``bins_y`` cells. Each point
    gets the count of its cell, normalised by the largest count seen among the
    points and raised to the power ``sqrt(1.5)``; the result is mapped
    linearly onto ``[alpha_min, alpha_max]``.

    Args:
        df: Data holding the two columns.
        x_col: Name of the x column.
        y_col: Name of the y column.
        x_max: Upper edge of the x grid.
        bins_x: Number of cells along x.
        bins_y: Number of cells along y.
        alpha_min: Opacity assigned to the sparsest cells.
        alpha_max: Opacity assigned to the densest cell.
        y_min: Lower edge of the y grid.
        y_max_limit: Cap for the upper edge of the y grid.

    Returns:
        One opacity per row of ``df``; an empty array when ``df`` has no rows.
    """
    x_vals = df[x_col].to_numpy()
    y_vals = df[y_col].to_numpy()
    if len(x_vals) == 0:
        return np.array([])

    # The x grid starts at 0 (or lower if the data goes below zero); the y
    # grid ends at the data maximum, capped by y_max_limit and kept positive.
    x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
    y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
    y_edges = np.linspace(y_min, y_upper, bins_y + 1)

    x_bins = np.digitize(x_vals, x_edges) - 1
    y_bins = np.digitize(y_vals, y_edges) - 1
    valid = (
        (x_bins >= 0) & (x_bins < bins_x) &
        (y_bins >= 0) & (y_bins < bins_y)
    )

    # Only in-grid points contribute to the counts. np.add.at accumulates
    # repeated indices correctly, replacing a Python-level loop over points.
    counts = np.zeros((bins_x, bins_y), dtype=int)
    np.add.at(counts, (x_bins[valid], y_bins[valid]), 1)

    # Points outside the grid (including those on the last edge) borrow the
    # count of the nearest border cell through the clipped indices.
    bin_counts = counts[
        np.clip(x_bins, 0, bins_x - 1),
        np.clip(y_bins, 0, bins_y - 1),
    ]
    max_count = bin_counts.max()
    if max_count == 0:
        weight = np.zeros_like(bin_counts, dtype=float)
    else:
        weight = (bin_counts / max_count) ** np.sqrt(1.5)
    weight = np.clip(weight, 0, 1)
    return alpha_min + (alpha_max - alpha_min) * weight
 def compute_trend(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = X_COL,
    method: str = DEFAULT_TREND_METHOD,
    lowess_frac: float = DEFAULT_TREND_FRAC,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
 ) -> Tuple[np.ndarray, np.ndarray]:
    """Return ``(x_sorted, trend_y)`` smoothed with the chosen method.

    Rows with a missing x or y are dropped and the rest is sorted by x before
    smoothing.

    Args:
        df: Data holding the two columns.
        y_col: Name of the column to smooth.
        x_col: Name of the column to sort by.
        method: ``"lowess"``, ``"rolling"`` or ``"savgol"`` (case-insensitive);
            any other value falls back to LOWESS.
        lowess_frac: ``frac`` passed to LOWESS.
        rolling_window: Window of the centred rolling mean (min 3, made odd).
        savgol_window: Savitzky-Golay window (min 5, made odd, and never
            larger than the number of points).
        savgol_poly: Savitzky-Golay polynomial order (kept below the window).

    Returns:
        Two arrays of equal length; both empty when no complete rows exist.
    """
    d = df[[x_col, y_col]].dropna().sort_values(x_col)
    x_vals = d[x_col].to_numpy()
    y_vals = d[y_col].to_numpy()
    if len(x_vals) == 0:
        return np.array([]), np.array([])

    m = method.lower()

    if m == "rolling":
        w = max(3, rolling_window)
        if w % 2 == 0:
            w += 1
        y_trend = pd.Series(y_vals).rolling(window=w, center=True, min_periods=1).mean().to_numpy()
        return x_vals, y_trend

    if m == "savgol":
        w = max(5, savgol_window)
        if w % 2 == 0:
            w += 1
        # savgol_filter with mode="interp" rejects a window longer than the
        # data, so shrink it to the largest odd size that fits.
        n_points = len(y_vals)
        if w > n_points:
            w = n_points if n_points % 2 == 1 else n_points - 1
        if w < 3:
            # Too few points to fit anything: the data is its own trend.
            return x_vals, y_vals.astype(float)
        poly = min(savgol_poly, w - 1)
        y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
        return x_vals, y_trend

    # "lowess" and any unrecognised method name.
    trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
    return trend[:, 0], trend[:, 1]
 def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
    """Return a copy of the rows whose ``x_col`` value does not exceed ``x_max``.

    Prints how many rows were received and how many were kept.
    """
    keep = df[x_col] <= x_max
    subset = df.loc[keep].copy()
    print(f"{len(df)} points; {len(subset)} within x<={x_max}.")
    return subset
 def plot_density_scatter(
    df: pd.DataFrame,
    y_col: str,
    title: str,
-    out_name: str,
+    out_path: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    with_trend: bool = False,
-    alpha: float = 0.08,
+    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
 ) -> None:
-    fig, ax = plt.subplots(figsize=(10, 6))
+    fig, ax = plt.subplots(figsize=(8, 8))
-    sns.scatterplot(
+    alpha_values = compute_density_alpha(
-        data=df,
+        df,
-        x=X_COL,
+        x_col=x_col,
-        y=Y_COL,
+        y_col=y_col,
-        color=SCATTER_COLOR,
+        x_max=x_max,
-        s=20,
+        bins_x=bins_x,
-        alpha=alpha,
+        bins_y=bins_y,
-        linewidth=0,
+        alpha_min=alpha_min,
-        ax=ax,
+        alpha_max=alpha_max,
        y_min=y_min,
        y_max_limit=y_max,
    )
    ax.scatter(
        df[x_col],
        df[y_col],
        color=scatter_color,
        s=point_size,
        alpha=alpha_values if len(alpha_values) else alpha,
        linewidths=0,
    )
    if with_trend:
-        trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
+        tx, ty = compute_trend(
-        ax.plot(trend[:, 0], trend[:, 1], color="red", linewidth=2.5, label="LOWESS тренд")
+            df,
            y_col=y_col,
            x_col=x_col,
            method=trend_method,
            lowess_frac=trend_frac,
            rolling_window=rolling_window,
            savgol_window=savgol_window,
            savgol_poly=savgol_poly,
        )
        if len(tx):
            ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
            ax.legend()
-    ax.set_xlim(0, X_MAX)
+    ax.set_xlim(0, x_max)
-    ax.set_ylim(bottom=0)
+    ax.set_ylim(y_min, y_max)
    ax.set_yticks(range(0, int(y_max) + 1, 2))
    ax.set_xlabel("Среднее число показов в день")
-    ax.set_ylabel("Число заказов за период (сумма)")
+    ax.set_ylabel(y_col)
    ax.set_title(title)
    ax.grid(alpha=0.3)
-    OUT_DIR.mkdir(parents=True, exist_ok=True)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path = OUT_DIR / out_name
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
    print(f"Saved {out_path}")
 def plot_raw_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
 ) -> None:
    """Plot all points with x <= ``x_max`` and save to ``out_dir / "scatter.png"``.

    Rows with a missing x or y are dropped first; no outlier cleaning is
    applied. All styling arguments are forwarded to ``plot_density_scatter``.
    """
    complete_rows = df[[x_col, y_col]].dropna()
    in_range = filter_x_range(complete_rows, x_col, x_max)

    # Everything below is passed through unchanged to the renderer.
    render_options = {
        "x_col": x_col,
        "x_max": x_max,
        "scatter_color": scatter_color,
        "point_size": point_size,
        "alpha": alpha,
        "alpha_min": alpha_min,
        "alpha_max": alpha_max,
        "bins_x": bins_x,
        "bins_y": bins_y,
        "y_min": y_min,
        "y_max": y_max,
        "trend_method": trend_method,
        "trend_frac": trend_frac,
        "trend_color": trend_color,
        "trend_linewidth": trend_linewidth,
        "rolling_window": rolling_window,
        "savgol_window": savgol_window,
        "savgol_poly": savgol_poly,
    }
    plot_density_scatter(
        in_range,
        y_col=y_col,
        title=f"Облако: {y_col} vs {x_col} (все клиенты)",
        out_path=out_dir / "scatter.png",
        **render_options,
    )
 def plot_clean_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
 ) -> None:
    """Plot the outlier-cleaned cloud and save to ``out_dir / "scatter_clean.png"``.

    Rows with a missing x or y are dropped, points with x > ``x_max`` are
    removed, and ``remove_outliers`` is applied with ``iqr_k``/``q_low``/
    ``q_high``. All styling arguments are forwarded to ``plot_density_scatter``.
    """
    complete_rows = df[[x_col, y_col]].dropna()
    in_range = filter_x_range(complete_rows, x_col, x_max)
    cleaned = remove_outliers(
        in_range, y_col=y_col, x_col=x_col, iqr_k=iqr_k, q_low=q_low, q_high=q_high
    )

    # Everything below is passed through unchanged to the renderer.
    render_options = {
        "x_col": x_col,
        "x_max": x_max,
        "scatter_color": scatter_color,
        "point_size": point_size,
        "alpha": alpha,
        "alpha_min": alpha_min,
        "alpha_max": alpha_max,
        "bins_x": bins_x,
        "bins_y": bins_y,
        "y_min": y_min,
        "y_max": y_max,
        "trend_method": trend_method,
        "trend_frac": trend_frac,
        "trend_color": trend_color,
        "trend_linewidth": trend_linewidth,
        "rolling_window": rolling_window,
        "savgol_window": savgol_window,
        "savgol_poly": savgol_poly,
    }
    plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
        out_path=out_dir / "scatter_clean.png",
        **render_options,
    )
 def plot_clean_trend_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_TREND_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
    return_components: bool = False,
 ) -> "Tuple[None, None, pd.DataFrame] | None":
    """Plot the outlier-cleaned cloud with a trend line.

    Rows with a missing x or y are dropped, points with x > ``x_max`` are
    removed, ``remove_outliers`` is applied, and the result is drawn with
    ``with_trend=True`` into ``out_dir / "scatter_trend.png"``.

    Args:
        df: Client-level data.
        y_col: Column plotted on the y axis.
        out_dir: Directory that receives ``scatter_trend.png``.
        return_components: When true, also hand back the cleaned data.
        (remaining keyword arguments are forwarded to ``remove_outliers`` or
        ``plot_density_scatter`` unchanged)

    Returns:
        ``None`` by default. With ``return_components=True`` a 3-tuple
        ``(None, None, cleaned)``: the first two slots are placeholders for a
        figure and axes, which ``plot_density_scatter`` does not return, so
        only the cleaned DataFrame is available here.
    """
    in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
    cleaned = remove_outliers(
        in_range,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
        out_path=out_dir / "scatter_trend.png",
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        with_trend=True,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    if return_components:
        # No figure/axes objects exist in this scope (referencing them used
        # to raise NameError); keep the 3-tuple arity with placeholders.
        return None, None, cleaned
    return None
 def generate_scatter_set(
    df: pd.DataFrame,
    y_col: str,
    *,
    base_out_dir: Path = BASE_OUT_DIR,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    trend_alpha: float = DEFAULT_TREND_ALPHA,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
 ) -> None:
    """Produce three scatter plots (raw, cleaned, cleaned + trend) for ``y_col``.

    The plots go into a sub-directory of ``base_out_dir`` named after
    ``y_col`` (with ``/`` replaced by ``_``). The raw and cleaned plots use
    ``alpha``; the trend plot uses ``trend_alpha``.
    """
    out_dir = base_out_dir / str(y_col).replace("/", "_")

    # Settings common to all three plots (everything except the opacity).
    shared = {
        "x_col": x_col,
        "x_max": x_max,
        "scatter_color": scatter_color,
        "point_size": point_size,
        "alpha_min": alpha_min,
        "alpha_max": alpha_max,
        "bins_x": bins_x,
        "bins_y": bins_y,
        "y_min": y_min,
        "y_max": y_max,
        "trend_method": trend_method,
        "trend_frac": trend_frac,
        "trend_color": trend_color,
        "trend_linewidth": trend_linewidth,
        "rolling_window": rolling_window,
        "savgol_window": savgol_window,
        "savgol_poly": savgol_poly,
    }
    # Outlier-removal settings, used only by the two cleaned variants.
    cleaning = {"iqr_k": iqr_k, "q_low": q_low, "q_high": q_high}

    plot_raw_scatter(df, y_col=y_col, out_dir=out_dir, alpha=alpha, **shared)
    plot_clean_scatter(
        df, y_col=y_col, out_dir=out_dir, alpha=alpha, **cleaning, **shared
    )
    plot_clean_trend_scatter(
        df, y_col=y_col, out_dir=out_dir, alpha=trend_alpha, **cleaning, **shared
    )
 def main() -> None:
    client = load_client_level(DB_PATH)
-
+    zero_orders = (client["orders_amt_total"] == 0).sum()
-    plot_density_scatter(
+    non_zero = len(client) - zero_orders
-        client,
+    if len(client):
-        title="Облако: заказы vs средние показы в день (все клиенты)",
+        print(f"orders=0: {zero_orders} ({zero_orders / len(client):.2%}); orders>0: {non_zero} ({non_zero / len(client):.2%})")
-        out_name="orders_vs_avg_imp_scatter.png",
+    generate_scatter_set(client, y_col="orders_amt_total")
    )
    cleaned = remove_outliers(client)
    plot_density_scatter(
        cleaned,
        title="Облако без выбросов (IQR) заказы vs средние показы в день",
        out_name="orders_vs_avg_imp_scatter_clean.png",
    )
    plot_density_scatter(
        cleaned,
        title="Облако без выбросов + тренд",
        out_name="orders_vs_avg_imp_scatter_trend.png",
        with_trend=True,
        alpha=0.1,
    )
 if __name__ == "__main__":
--- a/main_hypot/quadreg.py
+++ b/main_hypot/quadreg.py
@@ -1,240 +1,352 @@
 import sqlite3
 from pathlib import Path
 import sys
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 import statsmodels.api as sm
 from pathlib import Path
 from typing import Tuple, Optional
-sns.set_theme(style="whitegrid")
+from sklearn.metrics import r2_score, roc_auc_score
 plt.rcParams["figure.figsize"] = (10, 6)
-# -----------------------------
+import best_model_and_plots as bmp
 # Load + feature engineering (как у тебя)
 # -----------------------------
 project_root = Path(__file__).resolve().parent.parent
 sys.path.append(str(project_root / "preanalysis_old_bad"))
 import eda_utils as eda  # noqa: E402
-db_path = project_root / "dataset" / "ds.sqlite"
+# Наследуем константы/визуальные настройки из scatter-скрипта
-conn = sqlite3.connect(db_path)
+X_COL = bmp.X_COL
-df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
+DEFAULT_X_MAX = bmp.DEFAULT_X_MAX
-conn.close()
+DEFAULT_Y_MIN = bmp.DEFAULT_Y_MIN
 DEFAULT_Y_MAX = bmp.DEFAULT_Y_MAX
 DEFAULT_SCATTER_COLOR = bmp.DEFAULT_SCATTER_COLOR
 DEFAULT_POINT_SIZE = bmp.DEFAULT_POINT_SIZE
 DEFAULT_ALPHA = bmp.DEFAULT_ALPHA
 DEFAULT_ALPHA_MIN = bmp.DEFAULT_ALPHA_MIN
 DEFAULT_ALPHA_MAX = bmp.DEFAULT_ALPHA_MAX
 DEFAULT_BINS_X = bmp.DEFAULT_BINS_X
 DEFAULT_BINS_Y = bmp.DEFAULT_BINS_Y
 DEFAULT_IQR_K = bmp.DEFAULT_IQR_K
 DEFAULT_Q_LOW = bmp.DEFAULT_Q_LOW
 DEFAULT_Q_HIGH = bmp.DEFAULT_Q_HIGH
 DEFAULT_TREND_FRAC = bmp.DEFAULT_TREND_FRAC
 DEFAULT_TREND_COLOR = bmp.DEFAULT_TREND_COLOR
 DEFAULT_TREND_LINEWIDTH = bmp.DEFAULT_TREND_LINEWIDTH
 BASE_OUT_DIR = bmp.BASE_OUT_DIR
 for cols, name in [
    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
    (eda.ORDER_COLS, "orders_amt_total"),
 ]:
    df[name] = df[cols].sum(axis=1)
-df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
+def prepare_clean_data(
-df["click_total"] = df["active_click_total"] + df["passive_click_total"]
+    y_col: str,
-
+    *,
-contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
+    x_col: str = X_COL,
-
+    x_max: float = DEFAULT_X_MAX,
-client = (
+    iqr_k: float = DEFAULT_IQR_K,
-    df.groupby("id")
+    q_low: float = DEFAULT_Q_LOW,
-    .agg(
+    q_high: float = DEFAULT_Q_HIGH,
-        imp_total=("imp_total", "sum"),
+) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
-        click_total=("click_total", "sum"),
+    """Готовит очищенные данные: фильтр по x и IQR, возвращает x, y и DataFrame."""
-        orders_amt_total=("orders_amt_total", "sum"),
+    df = bmp.load_client_level(bmp.DB_PATH)
-        age=("age", "median"),
+    base = df[[x_col, y_col]].dropna()
-        gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
+    in_range = bmp.filter_x_range(base, x_col, x_max)
-        device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
+    cleaned = bmp.remove_outliers(
-    )
+        in_range,
-    .merge(contact_days, on="id", how="left")
+        y_col=y_col,
-    .reset_index()
+        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    x = cleaned[x_col].to_numpy()
    y = cleaned[y_col].to_numpy()
    return x, y, cleaned
 client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"])
 client["order_rate_pct"] = 100 * client["order_rate"]
 client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
-# -----------------------------
+def fit_quadratic(
-# Aggregate curve points (как у тебя)
+    x: np.ndarray,
-# -----------------------------
+    y_target: np.ndarray,
-stats_imp = (
+    weights: Optional[np.ndarray] = None,
-    client.groupby("avg_imp_per_day", as_index=False)
+) -> Tuple[sm.regression.linear_model.RegressionResultsWrapper, np.ndarray]:
-    .agg(
+    """Фитим квадратику по x -> y_target (WLS), предсказываем на тех же x."""
-        orders_mean=("orders_amt_total", "mean"),
+    X_design = np.column_stack([x, x**2])
-        n_clients=("id", "count"),
+    X_design = sm.add_constant(X_design)
-    )
+    if weights is not None:
-    .sort_values("avg_imp_per_day")
+        model = sm.WLS(y_target, X_design, weights=weights).fit(cov_type="HC3")
 ).reset_index(drop=True)
 # -----------------------------
 # Filtering / outlier logic (как у тебя)
 # -----------------------------
 K_MULT = 2
 ABS_DY_MIN = 1
 X_MAX = 16
 stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
 before = len(stats_f)
 y = stats_f["orders_mean"]
 abs_dy = y.diff().abs()
 prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
 ratio = abs_dy / (prev3_mean.replace(0, np.nan))
 is_outlier = ((abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT)) | (y > 5)
 is_outlier = is_outlier.fillna(False)
 stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
 after = len(stats_f)
 print(f"Фильтрация: было {before}, стало {after}, убрали {before-after} точек")
 # -----------------------------
 # Smoothing (оставим для визуалки, но регрессию делаем по orders_mean)
 # -----------------------------
 w = max(7, int(len(stats_f) * 0.05))
 if w % 2 == 0:
    w += 1
 stats_f["orders_smooth"] = (
    stats_f["orders_mean"]
    .rolling(window=w, center=True, min_periods=1)
    .mean()
 )
 # -----------------------------
 # Cost line (как у тебя, нормировка "в единицах заказов")
 # -----------------------------
 c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
 stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
 # -----------------------------
 # Quadratic regression: orders_mean ~ 1 + x + x^2
 # WLS with weights = n_clients
 # -----------------------------
 x = stats_f["avg_imp_per_day"].to_numpy()
 y = stats_f["orders_mean"].to_numpy()
 wts = stats_f["n_clients"].to_numpy().astype(float)
 X = np.column_stack([x, x**2])
 X = sm.add_constant(X)  # [1, x, x^2]
 model = sm.WLS(y, X, weights=wts)
 res = model.fit(cov_type="HC3")  # робастные ошибки
 b0, b1, b2 = res.params
 p_b1_two = res.pvalues[1]
 p_b2_two = res.pvalues[2]
 # one-sided p-values for directional hypotheses
 p_b1_pos = (p_b1_two / 2) if (b1 > 0) else (1 - p_b1_two / 2)
 p_b2_neg = (p_b2_two / 2) if (b2 < 0) else (1 - p_b2_two / 2)
 # turning point (if concave)
 x_star = None
 y_star = None
 if b2 < 0:
    x_star = -b1 / (2 * b2)
    y_star = b0 + b1 * x_star + b2 * x_star**2
 # Intersection with cost line: b0 + b1 x + b2 x^2 = c x  ->  b2 x^2 + (b1 - c) x + b0 = 0
 x_cross = None
 roots = np.roots([b2, (b1 - c), b0])  # may be complex
 roots = [r.real for r in roots if abs(r.imag) < 1e-8]
 roots_in_range = [r for r in roots if (stats_f["avg_imp_per_day"].min() <= r <= stats_f["avg_imp_per_day"].max())]
 if roots_in_range:
    # берём корень ближе к "правой" части (обычно пересечение интереснее там, где начинается невыгодно)
    x_cross = max(roots_in_range)
 # -----------------------------
 # Print results + interpretation (по-человечески)
 # -----------------------------
 print("\n=== Квадратичная регрессия (WLS, веса = n_clients, SE = HC3) ===")
 print(res.summary())
 print("\n=== Проверка гипотезы убывающей отдачи / спада ===")
 print(f"β1 (линейный эффект): {b1:.6f}, двусторонний p={p_b1_two:.4g}, односторонний p(β1>0)={p_b1_pos:.4g}")
 print(f"β2 (кривизна):       {b2:.6f}, двусторонний p={p_b2_two:.4g}, односторонний p(β2<0)={p_b2_neg:.4g}")
 alpha = 0.05
 support = (b1 > 0) and (b2 < 0) and (p_b1_pos < alpha) and (p_b2_neg < alpha)
 if support:
    print("\nВывод: данные поддерживают гипотезу нелинейности.")
    print("Есть статистически значимый рост на малых x (β1>0) и насыщение/спад (β2<0).")
    else:
-    print("\nВывод: строгого статистического подтверждения по знакам/значимости может не хватить.")
+        model = sm.OLS(y_target, X_design).fit(cov_type="HC3")
    print("Но знак коэффициентов и форма кривой всё равно могут быть согласованы с гипотезой.")
    print("На защите говори аккуратно: 'наблюдается тенденция/согласуется с гипотезой'.")
-if x_star is not None:
+    y_hat = model.predict(X_design)
-    print(f"\nОценка 'порога насыщения' (вершина параболы): x* = {x_star:.3f} показов/день")
+    return model, y_hat
    print(f"Прогноз среднего числа заказов в x*: y(x*) ≈ {y_star:.3f}")
    if not (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
        print("Внимание: x* вне диапазона наблюдений, интерпретация как 'оптимума' сомнительная.")
 else:
    print("\nВершина не считается как максимум: β2 >= 0 (нет выпуклости вниз).")
 if x_cross is not None:
    y_cross = b0 + b1 * x_cross + b2 * x_cross**2
    print(f"\nТочка пересечения с линейными расходами (в нормировке c={c:.4f}): x≈{x_cross:.3f}, y≈{y_cross:.3f}")
 else:
    print("\nПересечение с линией расходов в выбранной нормировке не найдено (или вне диапазона).")
-# -----------------------------
+def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[Optional[float], Optional[float]]:
-# Plot: points + smooth + quadratic fit + cost + markers
+    """Возвращает (R2, AUC по метке y>0)."""
-# -----------------------------
+    r2 = r2_score(y_true, y_pred)
-x_grid = np.linspace(stats_f["avg_imp_per_day"].min(), stats_f["avg_imp_per_day"].max(), 300)
+    auc = None
-y_hat = b0 + b1 * x_grid + b2 * x_grid**2
+    try:
-cost_hat = c * x_grid
+        auc = roc_auc_score((y_true > 0).astype(int), y_pred)
    except ValueError:
        auc = None
    return r2, auc
 plt.figure(figsize=(10, 8))
-plt.plot(
+def map_trend_to_points(x_points: np.ndarray, trend_x: np.ndarray, trend_y: np.ndarray) -> np.ndarray:
-    stats_f["avg_imp_per_day"], stats_f["orders_mean"],
+    """Интерполирует значения тренда в точках x_points."""
-    marker="o", linestyle="-", linewidth=1, alpha=0.3,
+    if len(trend_x) == 0:
-    label="Среднее число заказов (по точкам)"
+        return np.zeros_like(x_points)
    # гарантируем отсортированность
    order = np.argsort(trend_x)
    tx = trend_x[order]
    ty = trend_y[order]
    return np.interp(x_points, tx, ty, left=ty[0], right=ty[-1])
 def density_weights(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
 ) -> np.ndarray:
    """Derive per-point weights from the same density scheme used for plot opacity.

    The opacities returned by ``bmp.compute_density_alpha`` are shifted by
    ``alpha_min`` and divided by ``alpha_max - alpha_min`` (floored at 1e-9),
    then negative values are clipped to 0. When no opacities come back, an
    array of ones with one entry per row of ``df`` is returned instead.
    """
    grid_settings = {
        "x_col": x_col,
        "y_col": y_col,
        "x_max": x_max,
        "bins_x": bins_x,
        "bins_y": bins_y,
        "alpha_min": alpha_min,
        "alpha_max": alpha_max,
        "y_min": y_min,
        "y_max_limit": y_max,
    }
    alphas = bmp.compute_density_alpha(df, **grid_settings)
    if not len(alphas):
        return np.ones(len(df))

    # Guard against a zero-width opacity range before normalising.
    span = max(alpha_max - alpha_min, 1e-9)
    return np.clip((alphas - alpha_min) / span, 0, None)
 def plot_quadratic_overlay(
    df: pd.DataFrame,
    model: sm.regression.linear_model.RegressionResultsWrapper,
    y_col: str,
    out_path: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    trend_method: str = bmp.DEFAULT_TREND_METHOD,
    rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
    savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
 ) -> None:
    """Рисует облако + LOWESS-тренд + линию квадр. регрессии."""
    fig, ax = bmp.plt.subplots(figsize=(8, 8))
    alpha_values = bmp.compute_density_alpha(
        df,
        x_col=x_col,
        y_col=y_col,
        x_max=x_max,
        bins_x=bins_x,
        bins_y=bins_y,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        y_min=y_min,
        y_max_limit=y_max,
    )
    ax.scatter(
        df[x_col],
        df[y_col],
        color=scatter_color,
        s=point_size,
        alpha=alpha_values if len(alpha_values) else alpha,
        linewidths=0,
        label="Точки (очищено)",
    )
-plt.plot(
+    # Тренд по выбранному методу
-    stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
+    tx, ty = bmp.compute_trend(
-    color="red", linewidth=2.2,
+        df,
-    label="Сглаженный тренд (rolling mean)"
+        y_col=y_col,
        x_col=x_col,
        method=trend_method,
        lowess_frac=trend_frac,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    if len(tx):
        ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
    # Квадратичная регрессия
    x_grid = np.linspace(0, x_max, 400)
    X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
    y_grid = model.predict(X_grid)
    ax.plot(x_grid, y_grid, color="blue", linewidth=2.3, linestyle="--", label="Квадр. регрессия")
    ax.set_xlim(0, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_yticks(range(0, int(y_max) + 1, 2))
    ax.set_xlabel("Среднее число показов в день")
    ax.set_ylabel(y_col)
    ax.set_title(f"Квадратичная регрессия: {y_col} vs {x_col}")
    ax.grid(alpha=0.3)
    ax.legend()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    bmp.plt.close(fig)
    print(f"Saved {out_path}")
 def report_model(
    model: sm.regression.linear_model.RegressionResultsWrapper,
    r2: Optional[float],
    auc: Optional[float],
    *,
    r2_trend: Optional[float] = None,
 ) -> None:
    """Print the quadratic-regression coefficients, p-values and quality metrics.

    Args:
        model: Fitted model; ``params`` and ``pvalues`` are read at positions
            0, 1 and 2 (constant, x, x^2).
        r2: R2 value to print, or ``None`` to print "n/a".
        auc: AUC value to print, or ``None`` to print the "n/a" line.
        r2_trend: Optional R2 against the trend target; printed only when given.
    """

    def _format_p(p_value):
        # Extremely small p-values are shown as a bound instead of a number.
        if p_value < 1e-300:
            return "<1e-300"
        return f"{p_value:.4g}"

    coefs = model.params
    p_values = model.pvalues

    print("\n=== Квадратичная регрессия (y ~ 1 + x + x^2) ===")
    print(f"const: {coefs[0]:.6f} (p={_format_p(p_values[0])})")
    print(f"beta1 x: {coefs[1]:.6f} (p={_format_p(p_values[1])})")
    print(f"beta2 x^2: {coefs[2]:.6f} (p={_format_p(p_values[2])})")

    if r2 is None:
        print("R2: n/a")
    else:
        print(f"R2: {r2:.4f}")

    if r2_trend is not None:
        print(f"R2 vs trend target: {r2_trend:.4f}")

    if auc is None:
        print("AUC: n/a (один класс)")
    else:
        print(f"AUC (target y>0): {auc:.4f}")
 def generate_quadratic_analysis(
    y_col: str,
    *,
    x_col: str = X_COL,
    base_out_dir: Path = BASE_OUT_DIR,
    config_name: str = "default",
    x_max: float = DEFAULT_X_MAX,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    trend_method: str = bmp.DEFAULT_TREND_METHOD,
    rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
    savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
 ) -> dict:
    x, y, cleaned_df = prepare_clean_data(
        y_col,
        x_col=x_col,
        x_max=x_max,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    w = density_weights(
        cleaned_df,
        y_col=y_col,
        x_col=x_col,
        x_max=x_max,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
    )
    # тренд по выбранному методу
    tx, ty = bmp.compute_trend(
        cleaned_df,
        y_col=y_col,
        x_col=x_col,
        method=trend_method,
        lowess_frac=trend_frac,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
-plt.plot(
+    trend_target = map_trend_to_points(x, tx, ty)
-    x_grid, y_hat,
+    model, y_hat = fit_quadratic(x, trend_target, weights=w)
-    color="blue", linewidth=2.5,
+    r2_actual, auc = compute_metrics(y, y_hat)
-    label="Квадратичная регрессия (WLS)"
+    r2_trend = r2_score(trend_target, y_hat) if len(trend_target) else None
    report_model(model, r2_actual, auc, r2_trend=r2_trend)
    out_dir = base_out_dir / config_name / str(y_col).replace("/", "_")
    plot_quadratic_overlay(
        cleaned_df,
        model,
        y_col=y_col,
        out_path=out_dir / "quad_regression.png",
        x_col=x_col,
        x_max=x_max,
        y_min=y_min,
        y_max=y_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        trend_method=trend_method,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
-plt.plot(
+    return {
-    x_grid, cost_hat,
+        "config": config_name,
-    color="black", linestyle="--", linewidth=2,
+        "y_col": y_col,
-    label="Линейные расходы на показы"
+        "r2": r2_actual,
-)
+        "r2_trend": r2_trend,
        "auc": auc,
        "params": {
            "trend_method": trend_method,
            "trend_frac": trend_frac,
            "rolling_window": rolling_window,
            "savgol_window": savgol_window,
            "savgol_poly": savgol_poly,
            "x_max": x_max,
            "weights_alpha_range": (alpha_min, alpha_max),
        },
        "coeffs": model.params.tolist(),
        "pvalues": model.pvalues.tolist(),
    }
 if x_star is not None and (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
    plt.axvline(x_star, color="blue", linestyle=":", linewidth=2)
    plt.scatter([x_star], [y_star], color="blue", zorder=5)
    plt.text(x_star, y_star, f"  x*={x_star:.2f}", va="bottom")
-if x_cross is not None:
+def main() -> None:
-    y_cross = b0 + b1 * x_cross + b2 * x_cross**2
+    generate_quadratic_analysis("orders_amt_total")
    plt.axvline(x_cross, color="black", linestyle=":", linewidth=2, alpha=0.8)
    plt.scatter([x_cross], [y_cross], color="black", zorder=5)
    plt.text(x_cross, y_cross, f"  пересечение≈{x_cross:.2f}", va="top")
 plt.xlabel("Среднее число показов в день")
 plt.ylabel("Среднее число заказов")
 plt.title("Нелинейный эффект интенсивности коммуникаций: квадратичная регрессия")
 plt.legend()
 plt.grid(alpha=0.3)
 plt.tight_layout()
-out_dir = project_root / "main_hypot"
+if __name__ == "__main__":
-out_dir.mkdir(parents=True, exist_ok=True)
+    main()
 out_path = out_dir / "quad_regression_with_costs.png"
 plt.savefig(out_path, dpi=150)
 print(f"\nSaved: {out_path}")
--- a/old_generated_plots/best_bins.png
+++ b/old_generated_plots/best_bins.png
--- a/old_generated_plots/best_model_prob.png
+++ b/old_generated_plots/best_model_prob.png
--- a/old_generated_plots/orders_vs_avg_imp_per_day.png
+++ b/old_generated_plots/orders_vs_avg_imp_per_day.png
--- a/old_generated_plots/orders_vs_avg_imp_per_day_filtered_smoothed.png
+++ b/old_generated_plots/orders_vs_avg_imp_per_day_filtered_smoothed.png
--- a/old_generated_plots/orders_vs_avg_imp_per_day_smoothed.png
+++ b/old_generated_plots/orders_vs_avg_imp_per_day_smoothed.png
--- a/old_generated_plots/orders_vs_avg_imp_per_day_smoothed_clean.png
+++ b/old_generated_plots/orders_vs_avg_imp_per_day_smoothed_clean.png
--- a/old_generated_plots/orders_vs_avg_imp_scatter.png
+++ b/old_generated_plots/orders_vs_avg_imp_scatter.png
--- a/old_generated_plots/orders_vs_avg_imp_scatter_clean.png
+++ b/old_generated_plots/orders_vs_avg_imp_scatter_clean.png
--- a/old_generated_plots/orders_vs_avg_imp_scatter_trend.png
+++ b/old_generated_plots/orders_vs_avg_imp_scatter_trend.png
--- a/old_generated_plots/orders_vs_avg_imp_with_costs.png
+++ b/old_generated_plots/orders_vs_avg_imp_with_costs.png
--- a/old_generated_plots/orders_vs_avg_imp_without_costs.png
+++ b/old_generated_plots/orders_vs_avg_imp_without_costs.png
--- a/old_generated_plots/orders_vs_avg_imp_without_costs_no_filter.png
+++ b/old_generated_plots/orders_vs_avg_imp_without_costs_no_filter.png
--- a/old_generated_plots/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png
+++ b/old_generated_plots/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png
--- a/old_generated_plots/quad_regression_with_costs.png
+++ b/old_generated_plots/quad_regression_with_costs.png
--- a/old_generated_plots/stat_bins.png
+++ b/old_generated_plots/stat_bins.png
--- a/preanalysis/eda_utils.py
+++ b/preanalysis/eda_utils.py
@@ -0,0 +1,154 @@
 from __future__ import annotations
 from pathlib import Path
 from typing import Dict, Iterable, List
 import numpy as np
 import pandas as pd
 # Paths and column groups
 # Default CSV location used by load_data (a relative path).
 DATA_PATH = Path("dataset/ds.csv")
 # Category suffixes from which the per-category column names below are built.
 CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
 # One column name per category for each metric prefix.
 ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
 PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
 ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
 PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
 ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
 # All per-category metric columns plus "age".
 NUMERIC_COLS = (
    ACTIVE_IMP_COLS
    + PASSIVE_IMP_COLS
    + ACTIVE_CLICK_COLS
    + PASSIVE_CLICK_COLS
    + ORDER_COLS
    + ["age"]
 )
 # Text-valued columns; both are normalised in load_data.
 CAT_COLS = ["gender_cd", "device_platform_cd"]
 def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
    """Divide with protection against zero (works for Series and scalars)."""
    if isinstance(denominator, pd.Series):
        denom = denominator.replace(0, np.nan)
    else:
        denom = np.nan if float(denominator) == 0 else denominator
    return numerator / denom
 def normalize_gender(series: pd.Series) -> pd.Series:
    """Map gender labels to "M", "F" or "UNKNOWN".

    Values are stringified, stripped and upper-cased before lookup; missing
    values and labels outside the lookup table become "UNKNOWN".
    """
    canonical = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
    as_text = series.fillna("UNKNOWN").astype(str)
    tidy = as_text.str.strip().str.upper()
    return tidy.map(canonical).fillna("UNKNOWN")
 def normalize_device(series: pd.Series) -> pd.Series:
    """Canonicalise device platform labels.

    Known platforms (matched case-insensitively, ignoring spaces and
    underscores) map to "Android", "iOS" or "iPadOS". Any other label falls
    back to the stripped original in title case; missing values are treated
    as "unknown" first.
    """
    known_platforms = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
    stripped = series.fillna("unknown").astype(str).str.strip()
    lookup_key = stripped.str.lower()
    lookup_key = lookup_key.str.replace(" ", "")
    lookup_key = lookup_key.str.replace("_", "")
    title_cased = stripped.str.title()
    return lookup_key.map(known_platforms).fillna(title_cased)
 def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``age_group`` column binned from ``age`` and return the same frame.

    Bins are left-closed: [0, 25), [25, 35), [35, 45), [45, 55), [55, inf).
    The input frame is modified in place.
    """
    age_edges = [0, 25, 35, 45, 55, np.inf]
    group_names = ["<25", "25-34", "35-44", "45-54", "55+"]
    binned = pd.cut(df["age"], bins=age_edges, labels=group_names, right=False)
    df["age_group"] = binned
    return df
 def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Add cross-category totals and derived ratio columns; return the same frame.

    Row-wise sums are written first, then the combined click/impression
    totals, then the ratios (computed with ``safe_divide``, so a zero
    denominator yields NaN). The input frame is modified in place.
    """
    summed_groups = (
        ("active_imp_total", ACTIVE_IMP_COLS),
        ("passive_imp_total", PASSIVE_IMP_COLS),
        ("active_click_total", ACTIVE_CLICK_COLS),
        ("passive_click_total", PASSIVE_CLICK_COLS),
        ("orders_amt_total", ORDER_COLS),
    )
    for total_name, source_cols in summed_groups:
        df[total_name] = df[source_cols].sum(axis=1)

    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    # (result column, numerator column, denominator column)
    ratio_specs = (
        ("active_ctr", "active_click_total", "active_imp_total"),
        ("passive_ctr", "passive_click_total", "passive_imp_total"),
        ("ctr_all", "click_total", "imp_total"),
        ("cr_click2order", "orders_amt_total", "click_total"),
        ("cr_imp2order", "orders_amt_total", "imp_total"),
    )
    for ratio_name, top_col, bottom_col in ratio_specs:
        df[ratio_name] = safe_divide(df[top_col], df[bottom_col])
    return df
 def add_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Add 0/1 activity flags and an ordered-category count; return the same frame.

    ``has_active_comm`` / ``has_passive_comm`` are 1 when the row-wise sum of
    the corresponding impression and click columns is positive.
    ``has_any_order`` is 1 when the sum of the order columns is positive.
    ``order_categories_count`` counts order columns with a positive value.
    The input frame is modified in place.
    """
    active_activity = df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1)
    passive_activity = df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1)
    order_values = df[ORDER_COLS]

    df["has_active_comm"] = (active_activity > 0).astype(int)
    df["has_passive_comm"] = (passive_activity > 0).astype(int)
    df["has_any_order"] = (order_values.sum(axis=1) > 0).astype(int)
    df["order_categories_count"] = (order_values > 0).sum(axis=1)
    return df
 def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
    """Read the CSV at ``path`` and return it with normalised and derived columns.

    ``business_dt`` is parsed to datetime, ``gender_cd`` and
    ``device_platform_cd`` are normalised, then the age group, totals and
    flag columns are added in that order.
    """
    frame = pd.read_csv(path)
    frame["business_dt"] = pd.to_datetime(frame["business_dt"])
    frame["gender_cd"] = normalize_gender(frame["gender_cd"])
    frame["device_platform_cd"] = normalize_device(frame["device_platform_cd"])
    for enrich in (add_age_group, add_totals, add_flags):
        frame = enrich(frame)
    return frame
 def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Summarise each column in ``cols`` with basic statistics and its share of zeros.

    Returns one row per column containing count, mean, median, std, min,
    quartiles, max, the fraction of values equal to 0, and the 95th/99th
    percentiles.
    """

    def _summarise(name):
        values = df[name]
        return {
            "col": name,
            "count": values.count(),
            "mean": values.mean(),
            "median": values.median(),
            "std": values.std(),
            "min": values.min(),
            "q25": values.quantile(0.25),
            "q75": values.quantile(0.75),
            "max": values.max(),
            "share_zero": (values == 0).mean(),
            "p95": values.quantile(0.95),
            "p99": values.quantile(0.99),
        }

    return pd.DataFrame([_summarise(name) for name in cols])
 def build_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the metric columns per ``business_dt`` and add totals and weekday.

    The per-category impression, click and order columns are summed per day,
    ``add_totals`` is applied to the result, and ``day_of_week`` holds the
    day name of ``business_dt``.
    """
    summed_cols = [
        *ACTIVE_IMP_COLS,
        *PASSIVE_IMP_COLS,
        *ACTIVE_CLICK_COLS,
        *PASSIVE_CLICK_COLS,
        *ORDER_COLS,
    ]
    per_day = df.groupby("business_dt")[summed_cols].sum().reset_index()
    per_day = add_totals(per_day)
    per_day["day_of_week"] = per_day["business_dt"].dt.day_name()
    return per_day
 def build_client(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate rows per ``id`` into one client-level row with derived features.

    Metric columns are summed; ``age`` takes the median; ``gender_cd``,
    ``age_group`` and ``device_platform_cd`` take the first mode of the group
    (with a fallback when the group has no mode). Totals, flags, the number of
    distinct ``business_dt`` values (``contact_days``), the largest single-row
    impression sum (``max_impressions_per_day``) and the contact density are
    then attached.
    """

    def _mode_or(fallback):
        # Build an aggregator returning the first mode of a group, or `fallback`.
        def _pick(values):
            modes = values.mode()
            return fallback if modes.empty else modes.iat[0]

        return _pick

    imp_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS
    metric_cols = imp_cols + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
    aggregations = {
        **{name: "sum" for name in metric_cols},
        "age": "median",
        "gender_cd": _mode_or("UNKNOWN"),
        "age_group": _mode_or(np.nan),
        "device_platform_cd": _mode_or("Other"),
    }
    per_client = df.groupby("id").agg(aggregations).reset_index()

    days_per_client = df.groupby("id")["business_dt"].nunique().rename("contact_days")

    # Work on a copy so the caller's frame does not gain the helper column.
    row_imps = df.copy()
    row_imps["imp_day_total"] = row_imps[imp_cols].sum(axis=1)
    peak_imps = row_imps.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")

    per_client = add_flags(add_totals(per_client))
    per_client = per_client.merge(days_per_client, on="id", how="left")
    per_client = per_client.merge(peak_imps, on="id", how="left")
    return add_contact_density(per_client)
 def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``avg_impressions_per_contact_day`` when ``contact_days`` is available.

    The new column is ``imp_total`` divided by ``contact_days`` via
    ``safe_divide``, so a zero divisor yields NaN. If the frame has no
    ``contact_days`` column it is returned unchanged. The input frame is
    modified in place and returned.
    """
    # contact_days must already be present; otherwise there is nothing to add.
    if "contact_days" in df.columns:
        df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
    return df
--- a/preanalysis/first_stage.ipynb
+++ b/preanalysis/first_stage.ipynb