Files
dano2025/new_divided_scatters.py
2025-12-16 01:51:05 +03:00

277 lines
9.7 KiB
Python

from __future__ import annotations
"""Генерирует раздельные Altair-облака для активных/пассивных/всех показов по категориям."""
import sqlite3
import sys
from pathlib import Path
from typing import Optional, Tuple
import altair as alt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, r2_score
PROJECT_ROOT = Path(__file__).resolve().parent
OUT_DIR = PROJECT_ROOT / "new_plots" / "final_result"
sys.path.append(str(PROJECT_ROOT / "main_hypot"))
import best_model_and_plots as bmp # noqa: E402
sys.path.append(str(PROJECT_ROOT))
import new_plots as npplots # noqa: E402
CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
FONT_PATH = Path("/Users/dan/Downloads/AyuGram Desktop/SegoeUIVF.ttf")
BASE_WINDOW = bmp.DEFAULT_SAVGOL_WINDOW
ACTIVE_WINDOW_FACTOR = 0.5
PASSIVE_WINDOW_FACTOR = 0.95
COMBINED_WINDOW_FACTOR = 1.0
def load_raw() -> pd.DataFrame:
# Читаем полные коммуникации из SQLite для дальнейших агрегаций
conn = sqlite3.connect(bmp.DB_PATH)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
return df
def build_client(df: pd.DataFrame) -> pd.DataFrame:
# Агрегируем активные/пассивные показы и заказы по клиенту и считаем средние в день
agg_spec = {
**{f"active_imp_{c}": "sum" for c in CATEGORIES},
**{f"passive_imp_{c}": "sum" for c in CATEGORIES},
**{f"orders_amt_{c}": "sum" for c in CATEGORIES},
"business_dt": "nunique",
}
client = df.groupby("id").agg(agg_spec).reset_index()
client = client.rename(columns={"business_dt": "contact_days"})
client["active_imp_total"] = client[[f"active_imp_{c}" for c in CATEGORIES]].sum(axis=1)
client["passive_imp_total"] = client[[f"passive_imp_{c}" for c in CATEGORIES]].sum(axis=1)
client["orders_amt_total"] = client[[f"orders_amt_{c}" for c in CATEGORIES]].sum(axis=1)
for c in CATEGORIES:
client[f"active_imp_per_day_{c}"] = bmp.safe_divide(client[f"active_imp_{c}"], client["contact_days"])
client[f"passive_imp_per_day_{c}"] = bmp.safe_divide(client[f"passive_imp_{c}"], client["contact_days"])
client[f"total_imp_per_day_{c}"] = bmp.safe_divide(
client[f"active_imp_{c}"] + client[f"passive_imp_{c}"], client["contact_days"]
)
client["active_imp_per_day_total"] = bmp.safe_divide(client["active_imp_total"], client["contact_days"])
client["passive_imp_per_day_total"] = bmp.safe_divide(client["passive_imp_total"], client["contact_days"])
client["total_imp_per_day_total"] = bmp.safe_divide(
client["active_imp_total"] + client["passive_imp_total"], client["contact_days"]
)
return client
def compute_limits(df: pd.DataFrame, x_col: str, y_col: str) -> Tuple[float, float]:
# Автоматический подбор разумных лимитов осей по 99 перцентилю
x_q = df[x_col].quantile(0.99)
y_q = df[y_col].quantile(0.99)
x_max = float(max(0.1, x_q + 2.0))
y_max = float(max(0.1, y_q + 3.0))
return x_max, y_max
def fit_quadratic(
df: pd.DataFrame,
trend_data: Tuple[np.ndarray, np.ndarray],
*,
x_col: str,
y_col: str,
x_max: float,
) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]:
# Строим квадратичную регрессию и считаем AUC/R2 по тренду
if len(df) < 3:
return None, {}
x = df[x_col].to_numpy()
y = df[y_col].to_numpy()
X = sm.add_constant(np.column_stack([x, x**2]))
model = sm.OLS(y, X).fit(cov_type="HC3")
auc = np.nan
binary = (y > 0).astype(int)
if len(np.unique(binary)) > 1:
auc = roc_auc_score(binary, model.predict(X))
tx, ty = trend_data
r2_trend = np.nan
if tx is not None and len(tx) >= 3:
mask = (tx <= x_max) & ~np.isnan(ty)
txm = tx[mask]
tym = ty[mask]
if len(txm) >= 3 and np.nanvar(tym) > 0:
X_trend = sm.add_constant(np.column_stack([txm, txm**2]))
y_hat_trend = model.predict(X_trend)
r2_trend = r2_score(tym, y_hat_trend)
return model, {"auc": auc, "r2_trend": r2_trend}
def scatter_trend_quad(
df: pd.DataFrame,
x_col: str,
y_col: str,
out_path: Path,
*,
title: str,
window: int,
x_override: float | None = None,
y_override: float | None = None,
x_scale_factor: float | None = None,
) -> None:
# Пайплайн для одной комбинации x/y: фильтр, тренд, регрессия и сохранение HTML
# Авто-лимиты
x_max, y_max = compute_limits(df, x_col, y_col)
if x_override is not None:
x_max = x_override + 5.5
if y_override is not None:
y_max = y_override + 3.0
if x_scale_factor is not None:
x_max = x_max * x_scale_factor
# Фильтр
base = df[[x_col, y_col]].dropna()
base = bmp.filter_x_range(base, x_col, x_max)
base = base[base[y_col] <= y_max].copy()
base["alpha"] = bmp.compute_density_alpha(
base,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bmp.DEFAULT_BINS_X,
bins_y=bmp.DEFAULT_BINS_Y,
alpha_min=bmp.DEFAULT_ALPHA_MIN,
alpha_max=bmp.DEFAULT_ALPHA_MAX,
y_min=bmp.DEFAULT_Y_MIN,
y_max_limit=y_max,
)
tx, ty = bmp.compute_trend(
base,
y_col=y_col,
x_col=x_col,
method=bmp.DEFAULT_TREND_METHOD,
lowess_frac=bmp.DEFAULT_TREND_FRAC,
savgol_window=window,
)
trend_data = (tx, ty)
model, metrics = fit_quadratic(base, trend_data, x_col=x_col, y_col=y_col, x_max=x_max)
if model is None:
print(f"[{y_col}] not enough data")
return
params = model.params
pvals = model.pvalues
x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max)
y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False)
points = alt.Chart(base).mark_circle(size=40).encode(
x=alt.X(x_col, title="Показы в день", scale=x_scale),
y=alt.Y(y_col, title="Заказы", scale=y_scale),
opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True), legend=None),
color=alt.value("#FFDD2D"),
tooltip=[x_col, y_col],
)
layers = [points]
if tx is not None and len(tx):
trend_df = pd.DataFrame({x_col: tx, "trend": ty})
layers.append(
alt.Chart(trend_df).mark_line(color="#EB4146", strokeWidth=2.5).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("trend", scale=y_scale),
)
)
x_grid = np.linspace(0, x_max, 400)
quad_df = pd.DataFrame(
{
x_col: x_grid,
"quad": model.predict(sm.add_constant(np.column_stack([x_grid, x_grid**2]))),
}
)
layers.append(
alt.Chart(quad_df).mark_line(color="#198F51", strokeWidth=2.2).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("quad", scale=y_scale),
)
)
metrics_text = [
f"R2_trend={metrics['r2_trend']:.3f}",
f"AUC={metrics['auc']:.3f}",
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
f"b2={params[2]:.3f} (p={pvals[2]:.3g})",
f"n={len(base)}",
]
subtitle = "".join(metrics_text)
chart = alt.layer(*layers).resolve_scale(opacity="independent")
chart = npplots.configure_chart(chart, f"{title}{subtitle}", width=800, height=600)
out_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(out_path)
npplots.inject_font_css(out_path)
print(f"Saved {out_path}")
def main() -> None:
raw = load_raw()
client = build_client(raw)
targets = [("total", "orders_amt_total")] + [(c, f"orders_amt_{c}") for c in CATEGORIES]
for name, y_col in targets:
out_dir = OUT_DIR / name
# активные
x_col_active = f"active_imp_per_day_{name}" if name != "total" else "active_imp_per_day_total"
scatter_trend_quad(
client,
x_col=x_col_active,
y_col=y_col,
out_path=out_dir / f"{name}_active_scatter.html",
title=f"{y_col}: активные показы/день",
window=int(max(5, BASE_WINDOW * ACTIVE_WINDOW_FACTOR)),
y_override=6.5 if name == "total" else None,
x_scale_factor=0.5 if name == "total" else None,
)
# пассивные
x_col_passive = f"passive_imp_per_day_{name}" if name != "total" else "passive_imp_per_day_total"
scatter_trend_quad(
client,
x_col=x_col_passive,
y_col=y_col,
out_path=out_dir / f"{name}_passive_scatter.html",
title=f"{y_col}: пассивные показы/день",
window=int(max(5, BASE_WINDOW * PASSIVE_WINDOW_FACTOR)),
y_override=6.5 if name == "total" else None,
x_scale_factor=0.95 if name == "total" else None,
)
# комбинированные
x_col_total = f"total_imp_per_day_{name}" if name != "total" else "total_imp_per_day_total"
scatter_trend_quad(
client,
x_col=x_col_total,
y_col=y_col,
out_path=out_dir / f"{name}_combined_scatter.html",
title=f"{y_col}: все показы/день",
window=int(max(5, BASE_WINDOW * COMBINED_WINDOW_FACTOR)),
x_override=13 if name == "total" else None,
y_override=6.5 if name == "total" else None,
)
if __name__ == "__main__":
main()