2025-12-16 01:51:05 +03:00
parent a1bc89c481
commit c963b1e5ac
123 changed files with 5644 additions and 3802 deletions

View File

@@ -1,3 +1,5 @@
"""Базовый набор расчётов и графиков: загрузка клиентов, фильтрация выбросов и построение трендов/квадратики."""
import sqlite3
from pathlib import Path
import sys
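The hunk above shows only the new module docstring and the first imports; the loading, filtering, and fitting code itself sits outside the diff context. For orientation, a minimal self-contained sketch of the kind of pipeline the docstring describes; the table name and helper names below are illustrative assumptions, not this module's actual API:

import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd


def load_clients_sketch(db_path: Path) -> pd.DataFrame:
    # Read the communications table from SQLite (table name taken from other files in this commit).
    conn = sqlite3.connect(db_path)
    df = pd.read_sql_query("select * from communications", conn)
    conn.close()
    return df


def iqr_filter_sketch(df: pd.DataFrame, col: str, k: float = 1.5) -> pd.DataFrame:
    # Keep rows within [Q1 - k*IQR, Q3 + k*IQR] for the given column.
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    return df[(df[col] >= q1 - k * iqr) & (df[col] <= q3 + k * iqr)]


def quad_fit_sketch(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    # Least-squares fit of y = b2*x^2 + b1*x + b0; returns [b2, b1, b0].
    return np.polyfit(x, y, deg=2)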

21 deleted binary image files (plots, 43–143 KiB each) are not shown in this diff.

View File

@@ -1,3 +1,5 @@
"""Категорийный анализ: собирает агрегаты по категориям и строит корреляции/квадратичную регрессию по заказам."""
import sqlite3
from pathlib import Path
import sys
@@ -47,6 +49,7 @@ COMBINED = {
def load_raw(db_path: Path) -> pd.DataFrame:
# Load the full communications table from SQLite
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
@@ -54,6 +57,7 @@ def load_raw(db_path: Path) -> pd.DataFrame:
def build_client_by_category(df: pd.DataFrame) -> pd.DataFrame:
# Aggregate per-client metrics for each category and compute average impressions per day
agg_spec = {f"{col}_{cat}": "sum" for col in BASE_COLUMNS for cat in CATEGORIES}
client = (
df.groupby("id")
@@ -82,6 +86,7 @@ def add_combined_category(client: pd.DataFrame, name: str, cats: list[str]) -> p
def plot_category_correlation(client: pd.DataFrame, cat: str, out_dir: Path) -> None:
# Quick correlation heatmap for a single category
cols = [f"{base}_{cat}" for base in BASE_COLUMNS]
corr = client[cols].corr()
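The plotting half of plot_category_correlation sits outside this hunk. A hedged seaborn/matplotlib sketch of the kind of heatmap the comment describes (the module's actual plotting calls may differ):

from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def plot_corr_heatmap_sketch(client: pd.DataFrame, cols: list[str], out_path: Path) -> None:
    # Correlation matrix over the selected per-category columns, drawn as an annotated heatmap.
    corr = client[cols].corr()
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(corr, annot=True, fmt=".2f", vmin=-1, vmax=1, cmap="coolwarm", ax=ax)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)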
@@ -190,6 +195,7 @@ def plot_quad_for_category(
q_high_overrides: dict | None = None,
iqr_overrides: dict | None = None,
) -> None:
# Build the scatter cloud, trend, and quadratic regression for a specific category, with optional overrides
y_col = f"orders_amt_{cat}"
x_col = f"avg_imp_per_day_{cat}"
out_dir = base_out_dir / y_col
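plot_quad_for_category accepts per-category override dicts whose handling falls outside this hunk. A small sketch of a typical resolution pattern for such overrides (the default names and values here are assumptions, not the file's actual constants):

DEFAULT_Q_HIGH = 0.99  # assumed default upper quantile
DEFAULT_IQR_K = 1.5    # assumed default IQR multiplier


def resolve_overrides_sketch(cat: str, q_high_overrides: dict | None, iqr_overrides: dict | None) -> tuple[float, float]:
    # Fall back to the module defaults when a category has no explicit override.
    q_high = (q_high_overrides or {}).get(cat, DEFAULT_Q_HIGH)
    iqr_k = (iqr_overrides or {}).get(cat, DEFAULT_IQR_K)
    return q_high, iqr_k

new_plots.py below applies the same .get(cat, default) pattern for its x_max/y_max/savgol_window overrides.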

1 deleted binary image file (plot, 135 KiB) is not shown in this diff.

View File

@@ -1,110 +0,0 @@
import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
)
.merge(contact_days, on="id", how="left")
.reset_index()
)
# ... everything as before, up to and including the client["ctr_all"] calculation
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# --- SPLIT FIRST, DERIVE THE TARGET AFTER ---
train_idx, test_idx = train_test_split(
client.index, test_size=0.2, random_state=42
)
train = client.loc[train_idx].copy()
test = client.loc[test_idx].copy()
thr = train["ctr_all"].quantile(0.75)  # threshold computed on train only
train["high_ctr"] = (train["ctr_all"] >= thr).astype(int)
test["high_ctr"] = (test["ctr_all"] >= thr).astype(int)
# --- FEATURES WITHOUT click_total (otherwise it leaks the target) ---
X_train = train[[
"avg_imp_per_day", "imp_total", "contact_days", # можно оставить
"age", "gender_cd", "device_platform_cd"
]].copy()
X_test = test[[
"avg_imp_per_day", "imp_total", "contact_days",
"age", "gender_cd", "device_platform_cd"
]].copy()
X_train["gender_cd"] = eda.normalize_gender(X_train["gender_cd"])
X_train["device_platform_cd"] = eda.normalize_device(X_train["device_platform_cd"])
X_test["gender_cd"] = eda.normalize_gender(X_test["gender_cd"])
X_test["device_platform_cd"] = eda.normalize_device(X_test["device_platform_cd"])
y_train = train["high_ctr"]
y_test = test["high_ctr"]
num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]
pre = ColumnTransformer([
("num", Pipeline([
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler())
]), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
results = {}
for name, model in [("log_reg", log_reg), ("gb", gb)]:
model.fit(X_train, y_train)
proba = model.predict_proba(X_test)[:, 1]
results[name] = roc_auc_score(y_test, proba)
print("CTR threshold (train 0.75q):", thr)
print("AUC results:", results)
imp = gb.named_steps["clf"].feature_importances_
feat = gb.named_steps["pre"].get_feature_names_out()
imp_df = pd.DataFrame({"feature": feat, "importance": imp}).sort_values("importance", ascending=False)
print(imp_df.head(15))

main_hypot/new_plots.py (new file, 477 lines)
View File

@@ -0,0 +1,477 @@
"""Generate interactive Altair charts from client-level and category-level aggregates."""
from __future__ import annotations
from pathlib import Path
import sys
from typing import Dict, Iterable, Optional, Tuple
import altair as alt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, r2_score
PROJECT_ROOT = Path(__file__).resolve().parent
sys.path.append(str(PROJECT_ROOT / "main_hypot"))
import best_model_and_plots as bmp
from category_quadreg import (
BASE_COLUMNS,
CATEGORIES,
COMBINED,
add_combined_category,
build_client_by_category,
)
OUTPUT_DIR = PROJECT_ROOT / "new_plots"
FONT_PATH = Path("/Users/dan/Downloads/AyuGram Desktop/SegoeUIVF.ttf")
def inject_font_css(html_path: Path) -> None:
"""Inject @font-face for SegoeUIVF into saved HTML if font exists."""
if not FONT_PATH.exists():
return
font_face = (
"@font-face{font-family:'Segoe UI Variable'; "
f"src: url('{FONT_PATH.as_uri()}') format('truetype'); "
"font-weight:100 900; font-style:normal;}\n"
)
css = f"<style>{font_face}body, text, .vega-bindings {{font-family:'Segoe UI Variable','Segoe UI',sans-serif;}}</style>"
html = html_path.read_text(encoding="utf-8")
if css in html:
return
if "</head>" in html:
html = html.replace("</head>", css + "\n</head>", 1)
else:
html = css + html
html_path.write_text(html, encoding="utf-8")
# Use the theme/fonts from the reference example
def configure_chart(chart: alt.Chart, title: str, width: int = 700, height: int = 500) -> alt.Chart:
# Bring all charts to a consistent style and font
alt.theme.enable("dark")
return (
chart.properties(
title=title,
width=width,
height=height,
padding=30,
)
.configure_title(
fontSize=18,
font="Segoe UI Variable",
fontWeight=600,
anchor="start",
)
.configure_axis(
grid=True,
labelFont="Segoe UI Variable",
titleFont="Segoe UI Variable",
labelFontSize=16,
titleFontSize=18,
labelFontWeight=400,
titleFontWeight=600,
)
.configure_legend(
labelFont="Segoe UI Variable",
titleFont="Segoe UI Variable",
)
)
def prepare_client_data() -> pd.DataFrame:
"""Поднимаем агрегаты по клиентам из существующего скрипта."""
return bmp.load_client_level(bmp.DB_PATH)
def prepare_category_client_data() -> pd.DataFrame:
# Build per-category client metrics and add the combined category groups
raw = pd.read_sql_query("select * from communications", bmp.sqlite3.connect(bmp.DB_PATH), parse_dates=["business_dt"])
client = build_client_by_category(raw)
for combo_name, cats in COMBINED.items():
client = add_combined_category(client, combo_name, cats)
return client
def filter_and_trend(
df: pd.DataFrame,
y_col: str,
*,
x_col: str = bmp.X_COL,
x_max: float = bmp.DEFAULT_X_MAX,
y_max: float = bmp.DEFAULT_Y_MAX,
q_low: float = bmp.DEFAULT_Q_LOW,
q_high: float = bmp.DEFAULT_Q_HIGH,
iqr_k: float = bmp.DEFAULT_IQR_K,
trend_method: str = bmp.DEFAULT_TREND_METHOD,
trend_frac: float = bmp.DEFAULT_TREND_FRAC,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
) -> Tuple[pd.DataFrame, Tuple[np.ndarray, np.ndarray]]:
# Clean the data by IQR, trim the tails, and compute the trend for the subsequent regression
base = df[[x_col, y_col]].dropna()
in_range = bmp.filter_x_range(base, x_col, x_max)
cleaned = bmp.remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
# Clip at y_max for easier visualization
cleaned = cleaned[cleaned[y_col] <= y_max].copy()
tx, ty = bmp.compute_trend(
cleaned,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
savgol_window=savgol_window,
)
return cleaned, (tx, ty)
def compute_density_alpha(df: pd.DataFrame, x_col: str, y_col: str, x_max: float, y_max: float) -> pd.Series:
# Recompute point opacity from density so the clouds stay readable in HTML
alphas = bmp.compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bmp.DEFAULT_BINS_X,
bins_y=bmp.DEFAULT_BINS_Y,
alpha_min=bmp.DEFAULT_ALPHA_MIN,
alpha_max=bmp.DEFAULT_ALPHA_MAX,
y_min=bmp.DEFAULT_Y_MIN,
y_max_limit=y_max,
)
if len(alphas) == 0:
return pd.Series([bmp.DEFAULT_ALPHA] * len(df), index=df.index)
return pd.Series(alphas, index=df.index)
def fit_quadratic(
df: pd.DataFrame,
y_col: str,
trend_data: Tuple[np.ndarray, np.ndarray],
*,
x_col: str = bmp.X_COL,
x_max: float = bmp.DEFAULT_X_MAX,
force_negative_b2: bool = False,
) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]:
# Fit y ~ 1 + x + x^2 and compute AUC/R2 against the trend, if there are enough points
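# In formula form: y_hat(x) = b0 + b1*x + b2*x^2, estimated by OLS with HC3 robust
# standard errors (the x^2 regressor is negated when force_negative_b2 is set).
# AUC = roc_auc_score(1[y > 0], y_hat(x)) over the cleaned points, and
# R2_trend = 1 - sum_i (s_i - y_hat(t_i))^2 / sum_i (s_i - mean(s))^2
# over the smoothed trend points (t_i, s_i) that survive the x_max/NaN mask.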
if len(df) < 3:
return None, {}
x = df[x_col].to_numpy()
y = df[y_col].to_numpy()
quad_term = -x**2 if force_negative_b2 else x**2
X_design = sm.add_constant(np.column_stack([x, quad_term]))
model = sm.OLS(y, X_design).fit(cov_type="HC3")
# AUC on the binary has-order flag
auc = np.nan
binary = (y > 0).astype(int)
if len(np.unique(binary)) > 1:
auc = roc_auc_score(binary, model.predict(X_design))
# R2 against the trend
tx, ty = trend_data
r2_trend = np.nan
if tx is not None and len(tx) >= 3:
mask = (tx <= x_max) & ~np.isnan(ty)
tx = tx[mask]
ty = ty[mask]
if len(tx) >= 3 and np.nanvar(ty) > 0:
quad_trend = -tx**2 if force_negative_b2 else tx**2
X_trend = sm.add_constant(np.column_stack([tx, quad_trend]))
y_hat_trend = model.predict(X_trend)
r2_trend = r2_score(ty, y_hat_trend)
return model, {"auc": auc, "r2_trend": r2_trend}
def build_annotation(
params: np.ndarray,
pvals: np.ndarray,
metrics: dict,
n: int,
*,
b2_effective: Optional[float] = None,
x_pos: float = 0.5,
) -> pd.DataFrame:
# Prepare metric labels to render on the chart
b2_val = b2_effective if b2_effective is not None else params[2]
lines = [
f"R2_trend={metrics.get('r2_trend', np.nan):.3f}",
f"AUC={metrics.get('auc', np.nan):.3f}",
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
f"b2={b2_val:.3f} (p={pvals[2]:.3g})",
f"n={n}",
]
return pd.DataFrame(
{
"x": [x_pos] * len(lines),
"y": [metrics.get("y_max_for_anno", 0) - i * 0.4 for i in range(len(lines))],
"label": lines,
}
)
def save_scatter_trend_quad(
df: pd.DataFrame,
y_col: str,
out_path: Path,
*,
x_col: str = bmp.X_COL,
x_max: float = bmp.DEFAULT_X_MAX,
y_max: float = bmp.DEFAULT_Y_MAX,
force_negative_b2: bool = False,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
title: str = "",
) -> None:
# Full pipeline: filtering, trend, quadratic fit, and saving the HTML
cleaned, trend_data = filter_and_trend(
df,
y_col=y_col,
x_col=x_col,
x_max=x_max,
y_max=y_max,
trend_method=bmp.DEFAULT_TREND_METHOD,
trend_frac=bmp.DEFAULT_TREND_FRAC,
savgol_window=savgol_window,
)
if trend_data[0] is None:
print(f"[{y_col}] нет тренда/данных для построения")
return
cleaned = cleaned.copy()
cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max)
model, metrics = fit_quadratic(cleaned, y_col, trend_data, x_col=x_col, x_max=x_max, force_negative_b2=force_negative_b2)
if model is None:
print(f"[{y_col}] недостаточно точек для квадрата")
return
params = model.params
pvals = model.pvalues
b2_effective = -abs(params[2]) if force_negative_b2 else params[2]
x_grid = np.linspace(0, x_max, 400)
quad_term = -x_grid**2 if force_negative_b2 else x_grid**2
quad_df = pd.DataFrame(
{
x_col: x_grid,
"quad": model.predict(sm.add_constant(np.column_stack([x_grid, quad_term]))),
}
)
trend_df = pd.DataFrame({x_col: trend_data[0], "trend": trend_data[1]})
metrics["y_max_for_anno"] = y_max * 0.95
metrics_text = [
f"R2_trend={metrics['r2_trend']:.3f}",
f"AUC={metrics['auc']:.3f}",
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
f"b2={b2_effective:.3f} (p={pvals[2]:.3g})",
f"n={len(cleaned)}",
]
x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max)
y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False)
points = alt.Chart(cleaned).mark_circle(size=40).encode(
x=alt.X(x_col, title="Average impressions per day", scale=x_scale),
y=alt.Y(y_col, title=y_col, scale=y_scale),
opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)),
color=alt.value(bmp.DEFAULT_SCATTER_COLOR),
tooltip=[x_col, y_col],
)
trend_line = alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("trend", scale=y_scale),
)
quad_line = alt.Chart(quad_df).mark_line(color="blue", strokeWidth=2.2, strokeDash=[6, 4]).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("quad", scale=y_scale),
)
subtitle = "".join(metrics_text)
chart = alt.layer(points, trend_line, quad_line).resolve_scale(opacity="independent")
chart = configure_chart(chart, (title or f"{y_col} vs {x_col}") + f"{subtitle}", width=800, height=600)
out_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(out_path)
inject_font_css(out_path)
print(f"Saved {out_path}")
def save_correlation_heatmap(df: pd.DataFrame, cols: Iterable[str], title: str, out_path: Path) -> None:
# Render correlations for the selected columns and save to HTML
corr = df[list(cols)].corr()
corr_long = corr.reset_index().melt(id_vars="index", var_name="col", value_name="corr")
corr_long = corr_long.rename(columns={"index": "row"})
chart = (
alt.Chart(corr_long)
.mark_rect()
.encode(
x=alt.X("col:N", title=""),
y=alt.Y("row:N", title=""),
color=alt.Color("corr:Q", scale=alt.Scale(domain=(-1, 1), scheme="redblue"), legend=alt.Legend(title="corr")),
tooltip=["row", "col", alt.Tooltip("corr:Q", format=".3f")],
)
)
chart = configure_chart(chart, title, width=400, height=400)
out_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(out_path)
inject_font_css(out_path)
print(f"Saved {out_path}")
def generate_total_plots() -> None:
# Main scenario for total orders: build the scatter cloud and trend
df = prepare_client_data()
out_base = OUTPUT_DIR / "orders_amt_total"
save_scatter_trend_quad(
df,
y_col="orders_amt_total",
out_path=out_base / "scatter_trend_quad.html",
x_max=bmp.DEFAULT_X_MAX,
y_max=bmp.DEFAULT_Y_MAX,
savgol_window=bmp.DEFAULT_SAVGOL_WINDOW,
title="Заказы vs средние показы (все клиенты)",
)
def generate_category_plots() -> None:
# Iterate over categories and combined groups, building correlations and scatter clouds
client = prepare_category_client_data()
x_max_overrides = {
"ent": 4,
"transport": 6,
"super": 4,
"avia": 4,
"shopping": 4,
"avia_hotel": 5,
}
y_max_overrides = {
"ent": 2.5,
"transport": 8,
"avia": 1.5,
"shopping": 2.5,
"super": 5.5,
"avia_hotel": 2.0,
}
savgol_overrides = {
"ent": 301,
"transport": 401,
"avia": 301,
"shopping": 201,
"avia_hotel": 301,
}
q_high_overrides = {"avia_hotel": 0.9}
iqr_overrides = {"avia_hotel": 1.2}
cats_all = CATEGORIES + list(COMBINED.keys())
# Correlations
corr_dir = OUTPUT_DIR / "correlations"
for cat in cats_all:
cols = [f"{base}_{cat}" for base in BASE_COLUMNS]
save_correlation_heatmap(
client,
cols,
title=f"Корреляции показов/кликов/заказов: {cat}",
out_path=corr_dir / f"corr_{cat}.html",
)
# Scatter clouds + quadratic fit
for cat in cats_all:
y_col = f"orders_amt_{cat}"
x_col = f"avg_imp_per_day_{cat}"
out_dir = OUTPUT_DIR / y_col
save_scatter_trend_quad(
client,
y_col=y_col,
out_path=out_dir / "scatter_trend_quad.html",
x_col=x_col,
x_max=x_max_overrides.get(cat, bmp.DEFAULT_X_MAX),
y_max=y_max_overrides.get(cat, bmp.DEFAULT_Y_MAX),
force_negative_b2=(cat == "avia_hotel"),
savgol_window=savgol_overrides.get(cat, bmp.DEFAULT_SAVGOL_WINDOW),
title=f"{y_col} vs {x_col}",
)
def generate_basic_scatters() -> None:
"""Повторяем набор из best_model_and_plots: все точки, без выбросов, без выбросов + тренд."""
df = prepare_client_data()
y_col = "orders_amt_total"
x_col = bmp.X_COL
x_max = bmp.DEFAULT_X_MAX
y_max = bmp.DEFAULT_Y_MAX
out_dir = OUTPUT_DIR / y_col
base = df[[x_col, y_col]].dropna()
base = bmp.filter_x_range(base, x_col, x_max)
base = base.copy()
base["alpha"] = compute_density_alpha(base, x_col, y_col, x_max, y_max)
def scatter_chart(data: pd.DataFrame, title: str, trend: Tuple[np.ndarray, np.ndarray] | None = None) -> alt.Chart:
x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max)
y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False)
points = alt.Chart(data).mark_circle(size=40).encode(
x=alt.X(x_col, title="Average impressions per day", scale=x_scale),
y=alt.Y(y_col, title=y_col, scale=y_scale),
opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)),
color=alt.value(bmp.DEFAULT_SCATTER_COLOR),
tooltip=[x_col, y_col],
)
layers = [points]
if trend is not None and trend[0] is not None:
trend_df = pd.DataFrame({x_col: trend[0], "trend": trend[1]})
layers.append(
alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("trend", scale=y_scale),
)
)
chart = alt.layer(*layers).resolve_scale(opacity="independent")
return configure_chart(chart, title, width=800, height=600)
# 1) all points
scatter_chart(base, "Scatter: all points").save(out_dir / "scatter_all.html")
inject_font_css(out_dir / "scatter_all.html")
# 2) outliers removed
cleaned = bmp.remove_outliers(base, y_col=y_col, x_col=x_col, iqr_k=bmp.DEFAULT_IQR_K, q_low=bmp.DEFAULT_Q_LOW, q_high=bmp.DEFAULT_Q_HIGH)
cleaned = cleaned.copy()
cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max)
scatter_chart(cleaned, "Scatter: outliers removed").save(out_dir / "scatter_clean.html")
inject_font_css(out_dir / "scatter_clean.html")
# 3) outliers removed + trend
tx, ty = bmp.compute_trend(
cleaned,
y_col=y_col,
x_col=x_col,
method=bmp.DEFAULT_TREND_METHOD,
lowess_frac=bmp.DEFAULT_TREND_FRAC,
savgol_window=bmp.DEFAULT_SAVGOL_WINDOW,
)
scatter_chart(cleaned, "Scatter: outliers removed + trend", trend=(tx, ty)).save(out_dir / "scatter_clean_trend.html")
inject_font_css(out_dir / "scatter_clean_trend.html")
def main() -> None:
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
generate_basic_scatters()
generate_total_plots()
generate_category_plots()
if __name__ == "__main__":
main()
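Since the generators in new_plots.py are plain functions with keyword defaults, a single chart can be rebuilt without rerunning the whole script. A usage sketch, assuming the module's directory (main_hypot) is on sys.path:

import new_plots as plots

# Rebuild only the overall orders-vs-impressions chart.
df = plots.prepare_client_data()
plots.save_scatter_trend_quad(
    df,
    y_col="orders_amt_total",
    out_path=plots.OUTPUT_DIR / "orders_amt_total" / "scatter_trend_quad.html",
    title="Orders vs average impressions (all clients)",
)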

View File

@@ -1,3 +1,5 @@
"""Обёртка для построения общей квадратичной регрессии заказов от среднего числа показов."""
from pathlib import Path
from typing import Optional, Tuple
@@ -69,6 +71,7 @@ def plot_overall_quad(
y_max: float = Y_MAX,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
) -> None:
# Draw the three scatter clouds (from best_model_and_plots) and overlay the quadratic curve
out_dir = bmp.BASE_OUT_DIR / Y_COL
res = bmp.plot_clean_trend_scatter(
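The rest of plot_overall_quad is outside the diff context. A generic, hedged sketch of the step the comment describes, overlaying a fitted quadratic curve on an existing matplotlib Axes (the Axes handoff from plot_clean_trend_scatter and the coefficient source are assumptions):

import numpy as np
from matplotlib.axes import Axes


def overlay_quadratic_sketch(ax: Axes, coeffs: np.ndarray, x_max: float) -> None:
    # coeffs = [b2, b1, b0], e.g. as returned by np.polyfit(x, y, 2).
    x_grid = np.linspace(0, x_max, 400)
    ax.plot(x_grid, np.polyval(coeffs, x_grid), linestyle="--", linewidth=2, label="quadratic fit")
    ax.legend()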

View File

@@ -1,87 +0,0 @@
import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
)
.merge(contact_days, on="id", how="left")
.reset_index()
)
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)
client["has_order"] = (client["orders_amt_total"] > 0).astype(int)
# Summary
summary = client[["imp_total", "click_total", "orders_amt_total", "contact_days", "avg_imp_per_day", "ctr_all", "cr_click2order"]].describe().T
print("Summary\n", summary)
missing = client.isna().mean().sort_values(ascending=False)
print("Missing\n", missing.head(10))
# Correlations and Mann-Whitney
corr_ctr = stats.spearmanr(client["avg_imp_per_day"], client["ctr_all"])
corr_cr = stats.spearmanr(client["avg_imp_per_day"], client["cr_click2order"])
q1 = client["avg_imp_per_day"].quantile(0.25)
q4 = client["avg_imp_per_day"].quantile(0.75)
low = client.loc[client["avg_imp_per_day"] <= q1, "ctr_all"].dropna()
high = client.loc[client["avg_imp_per_day"] >= q4, "ctr_all"].dropna()
wu = stats.mannwhitneyu(low, high, alternative="greater")
print({"spearman_ctr": corr_ctr, "spearman_cr": corr_cr, "mw_low_gt_high": wu})
# Bin stats and dual-axis plot
bins = pd.qcut(client["avg_imp_per_day"], 10, duplicates="drop")
stats_bin = client.groupby(bins, observed=False)[["ctr_all", "cr_click2order"]].median().reset_index().rename(columns={"index": "bin"})
stats_bin["avg_imp_per_day"] = client.groupby(bins, observed=False)["avg_imp_per_day"].median().values
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)
fig, ax1 = plt.subplots(figsize=(12, 5))
ax2 = ax1.twinx()
ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
ax1.set_ylabel("CTR")
ax2.set_ylabel("CR click→order")
ax1.set_xlabel("avg_imp_per_day bins")
plt.xticks(rotation=35)
ax1.set_title("CTR and CR by deciles of avg_imp_per_day")
fig.tight_layout()
plt.savefig(project_root / "main_hypot" / "stat_bins.png", dpi=150)
print("Saved plot stat_bins.png")