Files
dano2025/main_hypot/best_model_and_plots.py
2025-12-15 18:38:10 +03:00

583 lines
18 KiB
Python

import sqlite3
from pathlib import Path
import sys
from typing import Tuple
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
import pandas as pd
import seaborn as sns
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 8)
project_root = Path(__file__).resolve().parent.parent
DB_PATH = project_root / "dataset" / "ds.sqlite"
BASE_OUT_DIR = project_root / "main_hypot"
# Константы данных
CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
# Константы визуализации/очистки
X_COL = "avg_imp_per_day" # x всегда фиксирован
DEFAULT_X_MAX = 18
DEFAULT_SCATTER_COLOR = "#2c7bb6"
DEFAULT_POINT_SIZE = 20
DEFAULT_ALPHA = 0.08
DEFAULT_TREND_ALPHA = 0.1
DEFAULT_TREND_FRAC = 0.3
DEFAULT_TREND_COLOR = "red"
DEFAULT_TREND_LINEWIDTH = 2.5
DEFAULT_IQR_K = 1.5
DEFAULT_Q_LOW = 0.05
DEFAULT_Q_HIGH = 0.95
DEFAULT_ALPHA_MIN = 0.04
DEFAULT_ALPHA_MAX = 0.7
DEFAULT_BINS_X = 60
DEFAULT_BINS_Y = 60
DEFAULT_Y_MIN = -0.5
DEFAULT_Y_MAX = 10
DEFAULT_TREND_METHOD = "savgol" # options: lowess, rolling, savgol
DEFAULT_ROLLING_WINDOW = 200
DEFAULT_SAVGOL_WINDOW = 501
DEFAULT_SAVGOL_POLY = 2
def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
denom = denominator.replace(0, pd.NA)
return numerator / denom
def load_client_level(db_path: Path) -> pd.DataFrame:
"""Собирает агрегаты по клиентам без зависимостей от eda_utils."""
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
contact_days=("business_dt", "nunique"),
)
.reset_index()
)
client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
print(f"Loaded {len(client)} clients with {X_COL} computed.")
return client
def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
q1, q3 = series.quantile([q_low, q_high])
iqr = q3 - q1
return q1 - iqr_k * iqr, q3 + iqr_k * iqr
def remove_outliers(
df: pd.DataFrame,
y_col: str,
x_col: str = X_COL,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
) -> pd.DataFrame:
"""Убирает выбросы по IQR отдельно по x и y."""
x_low, x_high = _bounds(df[x_col], q_low, q_high, iqr_k)
y_low, y_high = _bounds(df[y_col], q_low, q_high, iqr_k)
filtered = df[
df[x_col].between(max(0, x_low), x_high)
& df[y_col].between(max(0, y_low), y_high)
].copy()
print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
return filtered
def compute_density_alpha(
df: pd.DataFrame,
x_col: str,
y_col: str,
x_max: float,
*,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
y_min: float = DEFAULT_Y_MIN,
y_max_limit: float = DEFAULT_Y_MAX,
) -> np.ndarray:
"""Считает насыщенность цвета как квадратичный скейл по плотности в 2D бинах."""
x_vals = df[x_col].to_numpy()
y_vals = df[y_col].to_numpy()
if len(x_vals) == 0:
return np.array([])
x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
y_edges = np.linspace(y_min, y_upper, bins_y + 1)
x_bins = np.digitize(x_vals, x_edges) - 1
y_bins = np.digitize(y_vals, y_edges) - 1
valid = (
(x_bins >= 0) & (x_bins < bins_x) &
(y_bins >= 0) & (y_bins < bins_y)
)
counts = np.zeros((bins_x, bins_y), dtype=int)
for xb, yb in zip(x_bins[valid], y_bins[valid]):
counts[xb, yb] += 1
bin_counts = counts[
np.clip(x_bins, 0, bins_x - 1),
np.clip(y_bins, 0, bins_y - 1),
]
max_count = bin_counts.max() if len(bin_counts) else 1
if max_count == 0:
weight = np.zeros_like(bin_counts, dtype=float)
else:
weight = (bin_counts / max_count) ** np.sqrt(1.5)
weight = np.clip(weight, 0, 1)
return alpha_min + (alpha_max - alpha_min) * weight
def compute_trend(
df: pd.DataFrame,
y_col: str,
*,
x_col: str = X_COL,
method: str = DEFAULT_TREND_METHOD,
lowess_frac: float = DEFAULT_TREND_FRAC,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> Tuple[np.ndarray, np.ndarray]:
"""Возвращает (x_sorted, trend_y) по выбранному методу."""
d = df[[x_col, y_col]].dropna().sort_values(x_col)
x_vals = d[x_col].to_numpy()
y_vals = d[y_col].to_numpy()
if len(x_vals) == 0:
return np.array([]), np.array([])
m = method.lower()
if m == "lowess":
trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
return trend[:, 0], trend[:, 1]
if m == "rolling":
w = max(3, rolling_window)
if w % 2 == 0:
w += 1
y_trend = pd.Series(y_vals).rolling(window=w, center=True, min_periods=1).mean().to_numpy()
return x_vals, y_trend
if m == "savgol":
w = max(5, savgol_window)
if w % 2 == 0:
w += 1
poly = min(savgol_poly, w - 1)
y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
return x_vals, y_trend
# fallback to lowess
trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
return trend[:, 0], trend[:, 1]
def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
subset = df[df[x_col] <= x_max].copy()
print(f"{len(df)} points; {len(subset)} within x<={x_max}.")
return subset
def plot_density_scatter(
df: pd.DataFrame,
y_col: str,
title: str,
out_path: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
with_trend: bool = False,
trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
return_fig: bool = False,
) -> None:
fig, ax = plt.subplots(figsize=(8, 8))
alpha_values = compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bins_x,
bins_y=bins_y,
alpha_min=alpha_min,
alpha_max=alpha_max,
y_min=y_min,
y_max_limit=y_max,
)
ax.scatter(
df[x_col],
df[y_col],
color=scatter_color,
s=point_size,
alpha=alpha_values if len(alpha_values) else alpha,
linewidths=0,
)
trend_data = None
if with_trend:
tx, ty = compute_trend(
df,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
if len(tx):
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
ax.legend()
trend_data = (tx, ty)
ax.set_xlim(0, x_max)
ax.set_ylim(y_min, y_max)
ax.set_yticks(range(0, int(y_max) + 1, 2))
ax.set_xlabel("Среднее число показов в день")
ax.set_ylabel(y_col)
ax.set_title(title)
ax.grid(alpha=0.3)
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(out_path, dpi=150)
if return_fig:
return fig, ax, trend_data
plt.close(fig)
print(f"Saved {out_path}")
def plot_raw_scatter(
df: pd.DataFrame,
y_col: str,
out_dir: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
plot_density_scatter(
in_range,
y_col=y_col,
title=f"Облако: {y_col} vs {x_col} (все клиенты)",
out_path=out_dir / "scatter.png",
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
def plot_clean_scatter(
df: pd.DataFrame,
y_col: str,
out_dir: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
cleaned = remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
plot_density_scatter(
cleaned,
y_col=y_col,
title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
out_path=out_dir / "scatter_clean.png",
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
def plot_clean_trend_scatter(
df: pd.DataFrame,
y_col: str,
out_dir: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_TREND_ALPHA,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
return_components: bool = False,
) -> None:
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
cleaned = remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
fig_ax = plot_density_scatter(
cleaned,
y_col=y_col,
title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
out_path=out_dir / "scatter_trend.png",
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
with_trend=True,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
return_fig=return_components,
)
if return_components:
fig, ax, trend_data = fig_ax
return fig, ax, cleaned, trend_data
def generate_scatter_set(
df: pd.DataFrame,
y_col: str,
*,
base_out_dir: Path = BASE_OUT_DIR,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
trend_alpha: float = DEFAULT_TREND_ALPHA,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
"""Генерирует три облака (все, без выбросов, без выбросов + тренд) в папку y_col."""
out_dir = base_out_dir / str(y_col).replace("/", "_")
plot_raw_scatter(
df,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
plot_clean_scatter(
df,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
plot_clean_trend_scatter(
df,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=trend_alpha,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
def main() -> None:
client = load_client_level(DB_PATH)
zero_orders = (client["orders_amt_total"] == 0).sum()
non_zero = len(client) - zero_orders
if len(client):
print(f"orders=0: {zero_orders} ({zero_orders / len(client):.2%}); orders>0: {non_zero} ({non_zero / len(client):.2%})")
generate_scatter_set(client, y_col="orders_amt_total")
if __name__ == "__main__":
main()