from __future__ import annotations """Генерирует раздельные Altair-облака для активных/пассивных/всех показов по категориям.""" import sqlite3 import sys from pathlib import Path from typing import Optional, Tuple import altair as alt import numpy as np import pandas as pd import statsmodels.api as sm from sklearn.metrics import roc_auc_score, r2_score PROJECT_ROOT = Path(__file__).resolve().parent OUT_DIR = PROJECT_ROOT / "new_plots" / "final_result" sys.path.append(str(PROJECT_ROOT / "main_hypot")) import best_model_and_plots as bmp # noqa: E402 sys.path.append(str(PROJECT_ROOT)) import new_plots as npplots # noqa: E402 CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"] FONT_PATH = Path("/Users/dan/Downloads/AyuGram Desktop/SegoeUIVF.ttf") BASE_WINDOW = bmp.DEFAULT_SAVGOL_WINDOW ACTIVE_WINDOW_FACTOR = 0.5 PASSIVE_WINDOW_FACTOR = 0.95 COMBINED_WINDOW_FACTOR = 1.0 def load_raw() -> pd.DataFrame: # Читаем полные коммуникации из SQLite для дальнейших агрегаций conn = sqlite3.connect(bmp.DB_PATH) df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"]) conn.close() return df def build_client(df: pd.DataFrame) -> pd.DataFrame: # Агрегируем активные/пассивные показы и заказы по клиенту и считаем средние в день agg_spec = { **{f"active_imp_{c}": "sum" for c in CATEGORIES}, **{f"passive_imp_{c}": "sum" for c in CATEGORIES}, **{f"orders_amt_{c}": "sum" for c in CATEGORIES}, "business_dt": "nunique", } client = df.groupby("id").agg(agg_spec).reset_index() client = client.rename(columns={"business_dt": "contact_days"}) client["active_imp_total"] = client[[f"active_imp_{c}" for c in CATEGORIES]].sum(axis=1) client["passive_imp_total"] = client[[f"passive_imp_{c}" for c in CATEGORIES]].sum(axis=1) client["orders_amt_total"] = client[[f"orders_amt_{c}" for c in CATEGORIES]].sum(axis=1) for c in CATEGORIES: client[f"active_imp_per_day_{c}"] = bmp.safe_divide(client[f"active_imp_{c}"], client["contact_days"]) client[f"passive_imp_per_day_{c}"] = bmp.safe_divide(client[f"passive_imp_{c}"], client["contact_days"]) client[f"total_imp_per_day_{c}"] = bmp.safe_divide( client[f"active_imp_{c}"] + client[f"passive_imp_{c}"], client["contact_days"] ) client["active_imp_per_day_total"] = bmp.safe_divide(client["active_imp_total"], client["contact_days"]) client["passive_imp_per_day_total"] = bmp.safe_divide(client["passive_imp_total"], client["contact_days"]) client["total_imp_per_day_total"] = bmp.safe_divide( client["active_imp_total"] + client["passive_imp_total"], client["contact_days"] ) return client def compute_limits(df: pd.DataFrame, x_col: str, y_col: str) -> Tuple[float, float]: # Автоматический подбор разумных лимитов осей по 99 перцентилю x_q = df[x_col].quantile(0.99) y_q = df[y_col].quantile(0.99) x_max = float(max(0.1, x_q + 2.0)) y_max = float(max(0.1, y_q + 3.0)) return x_max, y_max def fit_quadratic( df: pd.DataFrame, trend_data: Tuple[np.ndarray, np.ndarray], *, x_col: str, y_col: str, x_max: float, ) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]: # Строим квадратичную регрессию и считаем AUC/R2 по тренду if len(df) < 3: return None, {} x = df[x_col].to_numpy() y = df[y_col].to_numpy() X = sm.add_constant(np.column_stack([x, x**2])) model = sm.OLS(y, X).fit(cov_type="HC3") auc = np.nan binary = (y > 0).astype(int) if len(np.unique(binary)) > 1: auc = roc_auc_score(binary, model.predict(X)) tx, ty = trend_data r2_trend = np.nan if tx is not None and len(tx) >= 3: mask = (tx <= x_max) & ~np.isnan(ty) txm = tx[mask] tym = ty[mask] if len(txm) >= 3 and np.nanvar(tym) > 0: X_trend = sm.add_constant(np.column_stack([txm, txm**2])) y_hat_trend = model.predict(X_trend) r2_trend = r2_score(tym, y_hat_trend) return model, {"auc": auc, "r2_trend": r2_trend} def scatter_trend_quad( df: pd.DataFrame, x_col: str, y_col: str, out_path: Path, *, title: str, window: int, x_override: float | None = None, y_override: float | None = None, x_scale_factor: float | None = None, ) -> None: # Пайплайн для одной комбинации x/y: фильтр, тренд, регрессия и сохранение HTML # Авто-лимиты x_max, y_max = compute_limits(df, x_col, y_col) if x_override is not None: x_max = x_override + 5.5 if y_override is not None: y_max = y_override + 3.0 if x_scale_factor is not None: x_max = x_max * x_scale_factor # Фильтр base = df[[x_col, y_col]].dropna() base = bmp.filter_x_range(base, x_col, x_max) base = base[base[y_col] <= y_max].copy() base["alpha"] = bmp.compute_density_alpha( base, x_col=x_col, y_col=y_col, x_max=x_max, bins_x=bmp.DEFAULT_BINS_X, bins_y=bmp.DEFAULT_BINS_Y, alpha_min=bmp.DEFAULT_ALPHA_MIN, alpha_max=bmp.DEFAULT_ALPHA_MAX, y_min=bmp.DEFAULT_Y_MIN, y_max_limit=y_max, ) tx, ty = bmp.compute_trend( base, y_col=y_col, x_col=x_col, method=bmp.DEFAULT_TREND_METHOD, lowess_frac=bmp.DEFAULT_TREND_FRAC, savgol_window=window, ) trend_data = (tx, ty) model, metrics = fit_quadratic(base, trend_data, x_col=x_col, y_col=y_col, x_max=x_max) if model is None: print(f"[{y_col}] not enough data") return params = model.params pvals = model.pvalues x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max) y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False) points = alt.Chart(base).mark_circle(size=40).encode( x=alt.X(x_col, title="Показы в день", scale=x_scale), y=alt.Y(y_col, title="Заказы", scale=y_scale), opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True), legend=None), color=alt.value("#FFDD2D"), tooltip=[x_col, y_col], ) layers = [points] if tx is not None and len(tx): trend_df = pd.DataFrame({x_col: tx, "trend": ty}) layers.append( alt.Chart(trend_df).mark_line(color="#EB4146", strokeWidth=2.5).encode( x=alt.X(x_col, scale=x_scale), y=alt.Y("trend", scale=y_scale), ) ) x_grid = np.linspace(0, x_max, 400) quad_df = pd.DataFrame( { x_col: x_grid, "quad": model.predict(sm.add_constant(np.column_stack([x_grid, x_grid**2]))), } ) layers.append( alt.Chart(quad_df).mark_line(color="#198F51", strokeWidth=2.2).encode( x=alt.X(x_col, scale=x_scale), y=alt.Y("quad", scale=y_scale), ) ) metrics_text = [ f"R2_trend={metrics['r2_trend']:.3f}", f"AUC={metrics['auc']:.3f}", f"b1={params[1]:.3f} (p={pvals[1]:.3g})", f"b2={params[2]:.3f} (p={pvals[2]:.3g})", f"n={len(base)}", ] subtitle = " • ".join(metrics_text) chart = alt.layer(*layers).resolve_scale(opacity="independent") chart = npplots.configure_chart(chart, f"{title} — {subtitle}", width=800, height=600) out_path.parent.mkdir(parents=True, exist_ok=True) chart.save(out_path) npplots.inject_font_css(out_path) print(f"Saved {out_path}") def main() -> None: raw = load_raw() client = build_client(raw) targets = [("total", "orders_amt_total")] + [(c, f"orders_amt_{c}") for c in CATEGORIES] for name, y_col in targets: out_dir = OUT_DIR / name # активные x_col_active = f"active_imp_per_day_{name}" if name != "total" else "active_imp_per_day_total" scatter_trend_quad( client, x_col=x_col_active, y_col=y_col, out_path=out_dir / f"{name}_active_scatter.html", title=f"{y_col}: активные показы/день", window=int(max(5, BASE_WINDOW * ACTIVE_WINDOW_FACTOR)), y_override=6.5 if name == "total" else None, x_scale_factor=0.5 if name == "total" else None, ) # пассивные x_col_passive = f"passive_imp_per_day_{name}" if name != "total" else "passive_imp_per_day_total" scatter_trend_quad( client, x_col=x_col_passive, y_col=y_col, out_path=out_dir / f"{name}_passive_scatter.html", title=f"{y_col}: пассивные показы/день", window=int(max(5, BASE_WINDOW * PASSIVE_WINDOW_FACTOR)), y_override=6.5 if name == "total" else None, x_scale_factor=0.95 if name == "total" else None, ) # комбинированные x_col_total = f"total_imp_per_day_{name}" if name != "total" else "total_imp_per_day_total" scatter_trend_quad( client, x_col=x_col_total, y_col=y_col, out_path=out_dir / f"{name}_combined_scatter.html", title=f"{y_col}: все показы/день", window=int(max(5, BASE_WINDOW * COMBINED_WINDOW_FACTOR)), x_override=13 if name == "total" else None, y_override=6.5 if name == "total" else None, ) if __name__ == "__main__": main()