new plots

2025-12-14 17:30:01 +03:00
parent cfee72470c
commit 5cac173b2f
4 changed files with 116 additions and 117 deletions
--- a/main_hypot/best_model_and_plots.py
+++ b/main_hypot/best_model_and_plots.py
@@ -1,144 +1,143 @@
 import sqlite3
 from pathlib import Path
 import sys
-import numpy as np
+
+import matplotlib.pyplot as plt
 import pandas as pd
 import seaborn as sns
-import matplotlib.pyplot as plt
+from statsmodels.nonparametric.smoothers_lowess import lowess

 sns.set_theme(style="whitegrid")
-plt.rcParams["figure.figsize"] = (10, 5)
+plt.rcParams["figure.figsize"] = (10, 6)

+# Directory one level above the folder that contains this file.
 project_root = Path(__file__).resolve().parent.parent
+# eda_utils is imported from the preanalysis_old_bad directory, so that directory is
+# appended to sys.path first; this is why the import below sits after code (E402).
 sys.path.append(str(project_root / "preanalysis_old_bad"))
 import eda_utils as eda  # noqa: E402

-db_path = project_root / "dataset" / "ds.sqlite"
-conn = sqlite3.connect(db_path)
-df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
-conn.close()
+# SQLite database file passed to load_client_level from main().
+DB_PATH = project_root / "dataset" / "ds.sqlite"
+# Directory where plot_density_scatter writes its image files.
+OUT_DIR = project_root / "main_hypot"
+# Column names plotted on the x and y axes of every scatter plot.
+X_COL = "avg_imp_per_day"
+Y_COL = "orders_amt_total"
+X_MAX = 18  # cut off the long tail of impressions so the scatter clouds are easier to read
+# Fill colour of the scatter points.
+SCATTER_COLOR = "#2c7bb6"

-for cols, name in [
-    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
-    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
-    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
-    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
-    (eda.ORDER_COLS, "orders_amt_total"),
-]:
-    df[name] = df[cols].sum(axis=1)

-df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
-df["click_total"] = df["active_click_total"] + df["passive_click_total"]
-contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
-client = (
-    df.groupby("id")
-    .agg(
-        imp_total=("imp_total", "sum"),
-        click_total=("click_total", "sum"),
-        orders_amt_total=("orders_amt_total", "sum"),
-        age=("age", "median"),
-        gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
-        device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
+def load_client_level(db_path: Path) -> pd.DataFrame:
+    """Build one row per client with average impressions per day and total orders.
+
+    Reads the whole ``communications`` table from the SQLite file at ``db_path``,
+    sums the impression, click and order column groups listed in ``eda`` for every
+    row, and aggregates per ``id`` without any further averaging over x.
+
+    Args:
+        db_path: Path to the SQLite database file.
+
+    Returns:
+        A DataFrame with the columns ``id``, ``X_COL`` and ``Y_COL``. Rows that
+        contain NaN are dropped and only rows with ``X_COL <= X_MAX`` are kept.
+    """
+    conn = sqlite3.connect(db_path)
+    try:
+        df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
+    finally:
+        # Close the connection even when the query fails, so the handle is not leaked.
+        conn.close()
+
+    # Collapse each group of source columns into a single per-row total.
+    for cols, name in [
+        (eda.ACTIVE_IMP_COLS, "active_imp_total"),
+        (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
+        (eda.ACTIVE_CLICK_COLS, "active_click_total"),
+        (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
+        (eda.ORDER_COLS, "orders_amt_total"),
+    ]:
+        df[name] = df[cols].sum(axis=1)
+
+    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
+
+    # Per client: total impressions, total orders, number of distinct business_dt values.
+    client = (
+        df.groupby("id")
+        .agg(
+            imp_total=("imp_total", "sum"),
+            orders_amt_total=("orders_amt_total", "sum"),
+            contact_days=("business_dt", "nunique"),
+        )
+        .reset_index()
     )
-    .merge(contact_days, on="id", how="left")
-    .reset_index()
-)
-client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"])  # orders / impressions
-client["order_rate_pct"] = 100 * client["order_rate"]  # чтобы шкала была человеческая
-client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])

-# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending
-stats_imp = (
-    client.groupby("avg_imp_per_day", as_index=False)
-    .agg(
-        orders_mean=("orders_amt_total", "mean"),
-        n_clients=("id", "count"),
+    # Impressions per distinct contact day.
+    # NOTE(review): safe_divide presumably guards zero denominators - confirm in eda_utils.
+    client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
+    # A no-op while Y_COL is "orders_amt_total"; creates the column if Y_COL is renamed.
+    client[Y_COL] = client["orders_amt_total"]
+    client = client[["id", X_COL, Y_COL]].dropna()
+
+    # Keep only clients at or below the x cut-off used by the plots.
+    in_range = client[client[X_COL] <= X_MAX].copy()
+    print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
+    return in_range
+
+
+def remove_outliers(
+    df: pd.DataFrame,
+    iqr_k: float = 1.5,
+    lower_q: float = 0.05,
+    upper_q: float = 0.95,
+) -> pd.DataFrame:
+    """Drop rows whose x or y value lies outside quantile-based fences.
+
+    The fences are computed separately for ``X_COL`` and ``Y_COL`` as
+    ``[q_low - iqr_k * spread, q_high + iqr_k * spread]``, where ``q_low`` and
+    ``q_high`` are the ``lower_q`` and ``upper_q`` quantiles and ``spread`` is
+    their difference. With the defaults this is the 5%-95% range, which is wider
+    than the classical interquartile range (25%-75%) even though the log message
+    and the parameter name say "IQR". The lower fence is clamped at 0.
+
+    Args:
+        df: Frame containing the ``X_COL`` and ``Y_COL`` columns.
+        iqr_k: Multiplier applied to the quantile spread on both sides.
+        lower_q: Lower quantile, in ``[0, 1]``.
+        upper_q: Upper quantile, in ``[0, 1]``; must exceed ``lower_q``.
+
+    Returns:
+        A copy of the rows that fall inside both fences (bounds inclusive).
+
+    Raises:
+        ValueError: If the quantiles are not ordered within ``[0, 1]``.
+    """
+    if not 0 <= lower_q < upper_q <= 1:
+        raise ValueError(
+            f"Expected 0 <= lower_q < upper_q <= 1, got lower_q={lower_q}, upper_q={upper_q}."
+        )
+
+    def bounds(series: pd.Series) -> tuple[float, float]:
+        """Return the (low, high) fence for one column."""
+        q_low, q_high = series.quantile([lower_q, upper_q])
+        spread = q_high - q_low
+        return q_low - iqr_k * spread, q_high + iqr_k * spread
+
+    x_low, x_high = bounds(df[X_COL])
+    y_low, y_high = bounds(df[Y_COL])
+    # Negative lower fences are replaced by 0 before filtering.
+    filtered = df[
+        df[X_COL].between(max(0, x_low), x_high)
+        & df[Y_COL].between(max(0, y_low), y_high)
+    ].copy()
+    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
+    return filtered
+
+
+def plot_density_scatter(
+    df: pd.DataFrame,
+    title: str,
+    out_name: str,
+    with_trend: bool = False,
+    alpha: float = 0.08,
+) -> None:
+    """Draw a translucent scatter of Y_COL against X_COL and save it under OUT_DIR.
+
+    Args:
+        df: Frame containing the X_COL and Y_COL columns.
+        title: Title placed on the axes.
+        out_name: File name of the saved image, relative to OUT_DIR.
+        with_trend: When True, overlay a red LOWESS curve and show a legend.
+        alpha: Opacity of the scatter points.
+    """
+    fig, ax = plt.subplots(figsize=(10, 6))
+    sns.scatterplot(
+        data=df,
+        x=X_COL,
+        y=Y_COL,
+        color=SCATTER_COLOR,
+        s=20,
+        alpha=alpha,
+        linewidth=0,
+        ax=ax,
     )
-    .sort_values("avg_imp_per_day")
-)

-K_MULT = 2  # "в разы" -> 5x. Поменяй на 3/10 если хочешь
-ABS_DY_MIN = 1
-X_MAX = 16
+    if with_trend:
+        # lowess takes y first, then x; column 0 of the result is drawn on the x axis
+        # and column 1 on the y axis.
+        # NOTE(review): presumably (sorted x, fitted y) - confirm against statsmodels docs.
+        trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
+        ax.plot(trend[:, 0], trend[:, 1], color="red", linewidth=2.5, label="LOWESS тренд")
+        ax.legend()

-stats_imp = stats_imp.sort_values("avg_imp_per_day").reset_index(drop=True)
+    # Fix the visible x range to [0, X_MAX] and start the y axis at 0.
+    ax.set_xlim(0, X_MAX)
+    ax.set_ylim(bottom=0)
+    ax.set_xlabel("Среднее число показов в день")
+    ax.set_ylabel("Число заказов за период (сумма)")
+    ax.set_title(title)
+    ax.grid(alpha=0.3)

-# 1) cut by x
-stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
+    # Create the output directory if needed, save the figure, then close it.
+    OUT_DIR.mkdir(parents=True, exist_ok=True)
+    out_path = OUT_DIR / out_name
+    fig.tight_layout()
+    fig.savefig(out_path, dpi=150)
+    plt.close(fig)
+    print(f"Saved {out_path}")

-# 2) detect vertical outliers by dy logic
-before = len(stats_f)
-y = stats_f["orders_mean"]
-abs_dy = y.diff().abs()

-prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
-ratio = abs_dy / (prev3_mean.replace(0, np.nan))  # avoid inf when prev mean == 0
+def main() -> None:
+    """Load client-level data and save three scatter plots: all clients, cleaned, cleaned with trend."""
+    client = load_client_level(DB_PATH)

-is_outlier = (abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT) | (y > 5)
-# первые точки не могут нормально иметь "3 предыдущих дельты"
-is_outlier = is_outlier.fillna(False)
+    # 1) Every client returned by load_client_level.
+    plot_density_scatter(
+        client,
+        title="Облако: заказы vs средние показы в день (все клиенты)",
+        out_name="orders_vs_avg_imp_scatter.png",
+    )

-stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
-after = len(stats_f)
-cleaned = before - after
+    # 2) Same plot after remove_outliers with its default settings.
+    cleaned = remove_outliers(client)
+    plot_density_scatter(
+        cleaned,
+        title="Облако без выбросов (IQR) заказы vs средние показы в день",
+        out_name="orders_vs_avg_imp_scatter_clean.png",
+    )

-print(f"{before} - {after}, cleaned: {cleaned}")
+    # 3) Cleaned data again, slightly more opaque points plus the LOWESS trend line.
+    plot_density_scatter(
+        cleaned,
+        title="Облако без выбросов + тренд",
+        out_name="orders_vs_avg_imp_scatter_trend.png",
+        with_trend=True,
+        alpha=0.1,
+    )

-# --- smoothing (rolling mean on remaining points) ---
-w = max(7, int(len(stats_f) * 0.05))
-if w % 2 == 0:
-    w += 1

-stats_f["orders_smooth"] = (
-    stats_f["orders_mean"]
-    .rolling(window=w, center=True, min_periods=1)
-    .mean()
-)
-# --- cost line (linear expenses) ---
-# нормируем так, чтобы масштаб был сопоставим с заказами
-c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
-stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
-
-# plot
-plt.figure(figsize=(10, 8))
-
-plt.plot(
-    stats_f["avg_imp_per_day"],
-    stats_f["orders_mean"],
-    marker="o",
-    linewidth=1,
-    alpha=0.3,
-    label="Среднее число заказов"
-)
-
-plt.plot(
-    stats_f["avg_imp_per_day"],
-    stats_f["orders_smooth"],
-    color="red",
-    linewidth=2.5,
-    label="Сглаженный тренд заказов"
-)
-
-plt.plot(
-    stats_f["avg_imp_per_day"],
-    stats_f["cost_line"],
-    color="black",
-    linestyle="--",
-    linewidth=2,
-    label="Линейные расходы на показы"
-)
-
-plt.xlabel("Среднее число показов в день")
-plt.ylabel("Среднее число заказов")
-plt.title("Зависимость заказов от интенсивности коммуникаций")
-
-plt.legend()
-plt.grid(alpha=0.3)
-plt.tight_layout()
-
-plt.savefig(
-    project_root / "main_hypot" / "orders_vs_avg_imp_with_costs.png",
-    dpi=150
-)
-
-print("Saved orders_vs_avg_imp_with_costs.png")
+# Run the plotting pipeline only when the file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()