from __future__ import annotations """Генерация интерактивных Altair-графиков на базе клиентских и категорийных агрегатов.""" from pathlib import Path import sys from typing import Dict, Iterable, Optional, Tuple import altair as alt import numpy as np import pandas as pd import statsmodels.api as sm from sklearn.metrics import roc_auc_score, r2_score PROJECT_ROOT = Path(__file__).resolve().parent sys.path.append(str(PROJECT_ROOT / "main_hypot")) import best_model_and_plots as bmp from category_quadreg import ( BASE_COLUMNS, CATEGORIES, COMBINED, add_combined_category, build_client_by_category, ) OUTPUT_DIR = PROJECT_ROOT / "new_plots" FONT_PATH = Path("/Users/dan/Downloads/AyuGram Desktop/SegoeUIVF.ttf") def inject_font_css(html_path: Path) -> None: """Inject @font-face for SegoeUIVF into saved HTML if font exists.""" if not FONT_PATH.exists(): return font_face = ( "@font-face{font-family:'Segoe UI Variable'; " f"src: url('{FONT_PATH.as_uri()}') format('truetype'); " "font-weight:100 900; font-style:normal;}\n" ) css = f"" html = html_path.read_text(encoding="utf-8") if css in html: return if "" in html: html = html.replace("", css + "\n", 1) else: html = css + html html_path.write_text(html, encoding="utf-8") # Используем тематику/шрифты из примера def configure_chart(chart: alt.Chart, title: str, width: int = 700, height: int = 500) -> alt.Chart: # Приводим внешний вид графиков к единому стилю и шрифту alt.theme.enable("dark") return ( chart.properties( title=title, width=width, height=height, padding=30, ) .configure_title( fontSize=18, font="Segoe UI Variable", fontWeight=600, anchor="start", ) .configure_axis( grid=True, labelFont="Segoe UI Variable", titleFont="Segoe UI Variable", labelFontSize=16, titleFontSize=18, labelFontWeight=400, titleFontWeight=600, ) .configure_legend( labelFont="Segoe UI Variable", titleFont="Segoe UI Variable", ) ) def prepare_client_data() -> pd.DataFrame: """Поднимаем агрегаты по клиентам из существующего скрипта.""" return bmp.load_client_level(bmp.DB_PATH) def prepare_category_client_data() -> pd.DataFrame: # Собираем клиентские показатели по категориям и добавляем комбинированные группы raw = pd.read_sql_query("select * from communications", bmp.sqlite3.connect(bmp.DB_PATH), parse_dates=["business_dt"]) client = build_client_by_category(raw) for combo_name, cats in COMBINED.items(): client = add_combined_category(client, combo_name, cats) return client def filter_and_trend( df: pd.DataFrame, y_col: str, *, x_col: str = bmp.X_COL, x_max: float = bmp.DEFAULT_X_MAX, y_max: float = bmp.DEFAULT_Y_MAX, q_low: float = bmp.DEFAULT_Q_LOW, q_high: float = bmp.DEFAULT_Q_HIGH, iqr_k: float = bmp.DEFAULT_IQR_K, trend_method: str = bmp.DEFAULT_TREND_METHOD, trend_frac: float = bmp.DEFAULT_TREND_FRAC, savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW, ) -> Tuple[pd.DataFrame, Tuple[np.ndarray, np.ndarray]]: # Очищаем данные по IQR, обрезаем хвосты и считаем тренд для последующей регрессии base = df[[x_col, y_col]].dropna() in_range = bmp.filter_x_range(base, x_col, x_max) cleaned = bmp.remove_outliers( in_range, y_col=y_col, x_col=x_col, iqr_k=iqr_k, q_low=q_low, q_high=q_high, ) # Обрезаем по y_max для удобства визуализации cleaned = cleaned[cleaned[y_col] <= y_max].copy() tx, ty = bmp.compute_trend( cleaned, y_col=y_col, x_col=x_col, method=trend_method, lowess_frac=trend_frac, savgol_window=savgol_window, ) return cleaned, (tx, ty) def compute_density_alpha(df: pd.DataFrame, x_col: str, y_col: str, x_max: float, y_max: float) -> pd.Series: # Пересчитываем прозрачность точек по плотности, чтобы облака читались в html alphas = bmp.compute_density_alpha( df, x_col=x_col, y_col=y_col, x_max=x_max, bins_x=bmp.DEFAULT_BINS_X, bins_y=bmp.DEFAULT_BINS_Y, alpha_min=bmp.DEFAULT_ALPHA_MIN, alpha_max=bmp.DEFAULT_ALPHA_MAX, y_min=bmp.DEFAULT_Y_MIN, y_max_limit=y_max, ) if len(alphas) == 0: return pd.Series([bmp.DEFAULT_ALPHA] * len(df), index=df.index) return pd.Series(alphas, index=df.index) def fit_quadratic( df: pd.DataFrame, y_col: str, trend_data: Tuple[np.ndarray, np.ndarray], *, x_col: str = bmp.X_COL, x_max: float = bmp.DEFAULT_X_MAX, force_negative_b2: bool = False, ) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]: # Фитим y ~ 1 + x + x^2 и считаем AUC/R2 по тренду, если хватило точек if len(df) < 3: return None, {} x = df[x_col].to_numpy() y = df[y_col].to_numpy() quad_term = -x**2 if force_negative_b2 else x**2 X_design = sm.add_constant(np.column_stack([x, quad_term])) model = sm.OLS(y, X_design).fit(cov_type="HC3") # AUC по бинарному флагу заказа auc = np.nan binary = (y > 0).astype(int) if len(np.unique(binary)) > 1: auc = roc_auc_score(binary, model.predict(X_design)) # R2 по тренду tx, ty = trend_data r2_trend = np.nan if tx is not None and len(tx) >= 3: mask = (tx <= x_max) & ~np.isnan(ty) tx = tx[mask] ty = ty[mask] if len(tx) >= 3 and np.nanvar(ty) > 0: quad_trend = -tx**2 if force_negative_b2 else tx**2 X_trend = sm.add_constant(np.column_stack([tx, quad_trend])) y_hat_trend = model.predict(X_trend) r2_trend = r2_score(ty, y_hat_trend) return model, {"auc": auc, "r2_trend": r2_trend} def build_annotation( params: np.ndarray, pvals: np.ndarray, metrics: dict, n: int, *, b2_effective: Optional[float] = None, x_pos: float = 0.5, ) -> pd.DataFrame: # Готовим подписи с метриками для вывода на график b2_val = b2_effective if b2_effective is not None else params[2] lines = [ f"R2_trend={metrics.get('r2_trend', np.nan):.3f}", f"AUC={metrics.get('auc', np.nan):.3f}", f"b1={params[1]:.3f} (p={pvals[1]:.3g})", f"b2={b2_val:.3f} (p={pvals[2]:.3g})", f"n={n}", ] return pd.DataFrame( { "x": [x_pos] * len(lines), "y": [metrics.get("y_max_for_anno", 0) - i * 0.4 for i in range(len(lines))], "label": lines, } ) def save_scatter_trend_quad( df: pd.DataFrame, y_col: str, out_path: Path, *, x_col: str = bmp.X_COL, x_max: float = bmp.DEFAULT_X_MAX, y_max: float = bmp.DEFAULT_Y_MAX, force_negative_b2: bool = False, savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW, title: str = "", ) -> None: # Полный пайплайн: фильтрация, тренд, квадратика и сохранение HTML cleaned, trend_data = filter_and_trend( df, y_col=y_col, x_col=x_col, x_max=x_max, y_max=y_max, trend_method=bmp.DEFAULT_TREND_METHOD, trend_frac=bmp.DEFAULT_TREND_FRAC, savgol_window=savgol_window, ) if trend_data[0] is None: print(f"[{y_col}] нет тренда/данных для построения") return cleaned = cleaned.copy() cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max) model, metrics = fit_quadratic(cleaned, y_col, trend_data, x_col=x_col, x_max=x_max, force_negative_b2=force_negative_b2) if model is None: print(f"[{y_col}] недостаточно точек для квадрата") return params = model.params pvals = model.pvalues b2_effective = -abs(params[2]) if force_negative_b2 else params[2] x_grid = np.linspace(0, x_max, 400) quad_term = -x_grid**2 if force_negative_b2 else x_grid**2 quad_df = pd.DataFrame( { x_col: x_grid, "quad": model.predict(sm.add_constant(np.column_stack([x_grid, quad_term]))), } ) trend_df = pd.DataFrame({x_col: trend_data[0], "trend": trend_data[1]}) metrics["y_max_for_anno"] = y_max * 0.95 metrics_text = [ f"R2_trend={metrics['r2_trend']:.3f}", f"AUC={metrics['auc']:.3f}", f"b1={params[1]:.3f} (p={pvals[1]:.3g})", f"b2={b2_effective:.3f} (p={pvals[2]:.3g})", f"n={len(cleaned)}", ] x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max) y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False) points = alt.Chart(cleaned).mark_circle(size=40).encode( x=alt.X(x_col, title="Среднее число показов в день", scale=x_scale), y=alt.Y(y_col, title=y_col, scale=y_scale), opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)), color=alt.value(bmp.DEFAULT_SCATTER_COLOR), tooltip=[x_col, y_col], ) trend_line = alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode( x=alt.X(x_col, scale=x_scale), y=alt.Y("trend", scale=y_scale), ) quad_line = alt.Chart(quad_df).mark_line(color="blue", strokeWidth=2.2, strokeDash=[6, 4]).encode( x=alt.X(x_col, scale=x_scale), y=alt.Y("quad", scale=y_scale), ) subtitle = " • ".join(metrics_text) chart = alt.layer(points, trend_line, quad_line).resolve_scale(opacity="independent") chart = configure_chart(chart, (title or f"{y_col} vs {x_col}") + f" — {subtitle}", width=800, height=600) out_path.parent.mkdir(parents=True, exist_ok=True) chart.save(out_path) inject_font_css(out_path) print(f"Saved {out_path}") def save_correlation_heatmap(df: pd.DataFrame, cols: Iterable[str], title: str, out_path: Path) -> None: # Отрисовываем корреляции по выбранным столбцам и сохраняем в HTML corr = df[list(cols)].corr() corr_long = corr.reset_index().melt(id_vars="index", var_name="col", value_name="corr") corr_long = corr_long.rename(columns={"index": "row"}) chart = ( alt.Chart(corr_long) .mark_rect() .encode( x=alt.X("col:N", title=""), y=alt.Y("row:N", title=""), color=alt.Color("corr:Q", scale=alt.Scale(domain=(-1, 1), scheme="redblue"), legend=alt.Legend(title="corr")), tooltip=["row", "col", alt.Tooltip("corr:Q", format=".3f")], ) ) chart = configure_chart(chart, title, width=400, height=400) out_path.parent.mkdir(parents=True, exist_ok=True) chart.save(out_path) inject_font_css(out_path) print(f"Saved {out_path}") def generate_total_plots() -> None: # Главный сценарий для общих заказов: строим облако и тренд df = prepare_client_data() out_base = OUTPUT_DIR / "orders_amt_total" save_scatter_trend_quad( df, y_col="orders_amt_total", out_path=out_base / "scatter_trend_quad.html", x_max=bmp.DEFAULT_X_MAX, y_max=bmp.DEFAULT_Y_MAX, savgol_window=bmp.DEFAULT_SAVGOL_WINDOW, title="Заказы vs средние показы (все клиенты)", ) def generate_category_plots() -> None: # Проходим по категориям и комбинированным группам, строим корреляции и облака client = prepare_category_client_data() x_max_overrides = { "ent": 4, "transport": 6, "super": 4, "avia": 4, "shopping": 4, "avia_hotel": 5, } y_max_overrides = { "ent": 2.5, "transport": 8, "avia": 1.5, "shopping": 2.5, "super": 5.5, "avia_hotel": 2.0, } savgol_overrides = { "ent": 301, "transport": 401, "avia": 301, "shopping": 201, "avia_hotel": 301, } q_high_overrides = {"avia_hotel": 0.9} iqr_overrides = {"avia_hotel": 1.2} cats_all = CATEGORIES + list(COMBINED.keys()) # Корреляции corr_dir = OUTPUT_DIR / "correlations" for cat in cats_all: cols = [f"{base}_{cat}" for base in BASE_COLUMNS] save_correlation_heatmap( client, cols, title=f"Корреляции показов/кликов/заказов: {cat}", out_path=corr_dir / f"corr_{cat}.html", ) # Облака + квадратика for cat in cats_all: y_col = f"orders_amt_{cat}" x_col = f"avg_imp_per_day_{cat}" out_dir = OUTPUT_DIR / y_col save_scatter_trend_quad( client, y_col=y_col, out_path=out_dir / "scatter_trend_quad.html", x_col=x_col, x_max=x_max_overrides.get(cat, bmp.DEFAULT_X_MAX), y_max=y_max_overrides.get(cat, bmp.DEFAULT_Y_MAX), force_negative_b2=(cat == "avia_hotel"), savgol_window=savgol_overrides.get(cat, bmp.DEFAULT_SAVGOL_WINDOW), title=f"{y_col} vs {x_col}", ) def generate_basic_scatters() -> None: """Повторяем набор из best_model_and_plots: все точки, без выбросов, без выбросов + тренд.""" df = prepare_client_data() y_col = "orders_amt_total" x_col = bmp.X_COL x_max = bmp.DEFAULT_X_MAX y_max = bmp.DEFAULT_Y_MAX out_dir = OUTPUT_DIR / y_col base = df[[x_col, y_col]].dropna() base = bmp.filter_x_range(base, x_col, x_max) base = base.copy() base["alpha"] = compute_density_alpha(base, x_col, y_col, x_max, y_max) def scatter_chart(data: pd.DataFrame, title: str, trend: Tuple[np.ndarray, np.ndarray] | None = None) -> alt.Chart: x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max) y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False) points = alt.Chart(data).mark_circle(size=40).encode( x=alt.X(x_col, title="Среднее число показов в день", scale=x_scale), y=alt.Y(y_col, title=y_col, scale=y_scale), opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)), color=alt.value(bmp.DEFAULT_SCATTER_COLOR), tooltip=[x_col, y_col], ) layers = [points] if trend is not None and trend[0] is not None: trend_df = pd.DataFrame({x_col: trend[0], "trend": trend[1]}) layers.append( alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode( x=alt.X(x_col, scale=x_scale), y=alt.Y("trend", scale=y_scale), ) ) chart = alt.layer(*layers).resolve_scale(opacity="independent") return configure_chart(chart, title, width=800, height=600) # 1) все точки scatter_chart(base, "Облако: все точки").save(out_dir / "scatter_all.html") inject_font_css(out_dir / "scatter_all.html") # 2) без выбросов cleaned = bmp.remove_outliers(base, y_col=y_col, x_col=x_col, iqr_k=bmp.DEFAULT_IQR_K, q_low=bmp.DEFAULT_Q_LOW, q_high=bmp.DEFAULT_Q_HIGH) cleaned = cleaned.copy() cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max) scatter_chart(cleaned, "Облако: без выбросов").save(out_dir / "scatter_clean.html") inject_font_css(out_dir / "scatter_clean.html") # 3) без выбросов + тренд tx, ty = bmp.compute_trend( cleaned, y_col=y_col, x_col=x_col, method=bmp.DEFAULT_TREND_METHOD, lowess_frac=bmp.DEFAULT_TREND_FRAC, savgol_window=bmp.DEFAULT_SAVGOL_WINDOW, ) scatter_chart(cleaned, "Облако: без выбросов + тренд", trend=(tx, ty)).save(out_dir / "scatter_clean_trend.html") inject_font_css(out_dir / "scatter_clean_trend.html") def main() -> None: OUTPUT_DIR.mkdir(parents=True, exist_ok=True) generate_basic_scatters() generate_total_plots() generate_category_plots() if __name__ == "__main__": main()