oh shit im scared, but its alive
@@ -223,6 +223,7 @@ def plot_density_scatter(
|
||||
rolling_window: int = DEFAULT_ROLLING_WINDOW,
|
||||
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
|
||||
savgol_poly: int = DEFAULT_SAVGOL_POLY,
|
||||
return_fig: bool = False,
|
||||
) -> None:
|
||||
fig, ax = plt.subplots(figsize=(8, 8))
|
||||
alpha_values = compute_density_alpha(
|
||||
@@ -246,6 +247,7 @@ def plot_density_scatter(
|
||||
linewidths=0,
|
||||
)
|
||||
|
||||
trend_data = None
|
||||
if with_trend:
|
||||
tx, ty = compute_trend(
|
||||
df,
|
||||
@@ -260,6 +262,7 @@ def plot_density_scatter(
|
||||
if len(tx):
|
||||
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
|
||||
ax.legend()
|
||||
trend_data = (tx, ty)
|
||||
|
||||
ax.set_xlim(0, x_max)
|
||||
ax.set_ylim(y_min, y_max)
|
||||
@@ -272,6 +275,8 @@ def plot_density_scatter(
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
fig.tight_layout()
|
||||
fig.savefig(out_path, dpi=150)
|
||||
if return_fig:
|
||||
return fig, ax, trend_data
|
||||
plt.close(fig)
|
||||
print(f"Saved {out_path}")
|
||||
|
||||
@@ -426,7 +431,7 @@ def plot_clean_trend_scatter(
|
||||
q_low=q_low,
|
||||
q_high=q_high,
|
||||
)
|
||||
plot_density_scatter(
|
||||
fig_ax = plot_density_scatter(
|
||||
cleaned,
|
||||
y_col=y_col,
|
||||
title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
|
||||
@@ -450,9 +455,11 @@ def plot_clean_trend_scatter(
|
||||
rolling_window=rolling_window,
|
||||
savgol_window=savgol_window,
|
||||
savgol_poly=savgol_poly,
|
||||
return_fig=return_components,
|
||||
)
|
||||
if return_components:
|
||||
return fig, ax, cleaned
|
||||
fig, ax, trend_data = fig_ax
|
||||
return fig, ax, cleaned, trend_data
|
||||
|
||||
|
||||
def generate_scatter_set(
|
||||
|
||||
BIN
main_hypot/category_analysis/correlations/corr_avia.png
Normal file
|
After Width: | Height: | Size: 81 KiB |
BIN
main_hypot/category_analysis/correlations/corr_avia_hotel.png
Normal file
|
After Width: | Height: | Size: 88 KiB |
BIN
main_hypot/category_analysis/correlations/corr_ent.png
Normal file
|
After Width: | Height: | Size: 80 KiB |
BIN
main_hypot/category_analysis/correlations/corr_hotel.png
Normal file
|
After Width: | Height: | Size: 80 KiB |
BIN
main_hypot/category_analysis/correlations/corr_shopping.png
Normal file
|
After Width: | Height: | Size: 88 KiB |
BIN
main_hypot/category_analysis/correlations/corr_super.png
Normal file
|
After Width: | Height: | Size: 80 KiB |
BIN
main_hypot/category_analysis/correlations/corr_transport.png
Normal file
|
After Width: | Height: | Size: 88 KiB |
BIN
main_hypot/category_analysis/orders_amt_avia/scatter_trend.png
Normal file
|
After Width: | Height: | Size: 75 KiB |
|
After Width: | Height: | Size: 100 KiB |
|
After Width: | Height: | Size: 83 KiB |
|
After Width: | Height: | Size: 104 KiB |
BIN
main_hypot/category_analysis/orders_amt_ent/scatter_trend.png
Normal file
|
After Width: | Height: | Size: 83 KiB |
|
After Width: | Height: | Size: 111 KiB |
BIN
main_hypot/category_analysis/orders_amt_hotel/scatter_trend.png
Normal file
|
After Width: | Height: | Size: 43 KiB |
|
After Width: | Height: | Size: 56 KiB |
|
After Width: | Height: | Size: 82 KiB |
|
After Width: | Height: | Size: 101 KiB |
BIN
main_hypot/category_analysis/orders_amt_super/scatter_trend.png
Normal file
|
After Width: | Height: | Size: 87 KiB |
|
After Width: | Height: | Size: 106 KiB |
|
After Width: | Height: | Size: 120 KiB |
|
After Width: | Height: | Size: 143 KiB |
353
main_hypot/category_quadreg.py
Normal file
@@ -0,0 +1,353 @@
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import statsmodels.api as sm
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
# Allow importing helper functions from the neighbouring script.
script_dir = Path(__file__).resolve().parent
if not any(entry == str(script_dir) for entry in sys.path):
    sys.path.append(str(script_dir))
|
||||
|
||||
from best_model_and_plots import ( # noqa: E402
|
||||
CATEGORIES,
|
||||
DEFAULT_ALPHA,
|
||||
DEFAULT_ALPHA_MAX,
|
||||
DEFAULT_ALPHA_MIN,
|
||||
DEFAULT_BINS_X,
|
||||
DEFAULT_BINS_Y,
|
||||
DEFAULT_SCATTER_COLOR,
|
||||
DEFAULT_TREND_COLOR,
|
||||
DEFAULT_TREND_FRAC,
|
||||
DEFAULT_TREND_LINEWIDTH,
|
||||
DEFAULT_X_MAX,
|
||||
DEFAULT_Y_MAX,
|
||||
DEFAULT_Y_MIN,
|
||||
DEFAULT_SAVGOL_WINDOW,
|
||||
plot_clean_trend_scatter,
|
||||
safe_divide,
|
||||
)
|
||||
|
||||
# Plot styling applied to every figure created after this module is imported.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 8)

# The project root is the parent of the directory that contains this script.
project_root = Path(__file__).resolve().parents[1]
DB_PATH = project_root.joinpath("dataset", "ds.sqlite")
OUT_DIR = project_root.joinpath("main_hypot", "category_analysis")

# Metric prefixes; the per-category columns are named "<prefix>_<category>".
BASE_COLUMNS = [
    "active_imp",
    "passive_imp",
    "active_click",
    "passive_click",
    "orders_amt",
]

# Combined categories: new name -> base categories whose columns are summed.
COMBINED = {"avia_hotel": ["avia", "hotel"]}
|
||||
|
||||
|
||||
def load_raw(db_path: Path) -> pd.DataFrame:
    """Read the whole ``communications`` table from the SQLite file at *db_path*.

    The ``business_dt`` column is parsed into datetimes by pandas.

    Args:
        db_path: Location of the SQLite database file.

    Returns:
        A DataFrame holding the result of ``select * from communications``.

    Raises:
        Whatever ``sqlite3`` or pandas raise when the database or table cannot
        be read; the connection is closed before the exception propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(
            "select * from communications", conn, parse_dates=["business_dt"]
        )
    finally:
        # Close even when the query fails so the connection is not leaked.
        conn.close()
|
||||
|
||||
|
||||
def build_client_by_category(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse raw rows into one row per ``id`` with per-category metrics.

    Every ``<base>_<category>`` column is summed per client, the number of
    distinct ``business_dt`` values becomes ``contact_days``, and for each
    category ``imp_total_<cat>`` (active + passive impressions) and
    ``avg_imp_per_day_<cat>`` (total impressions over contact days, computed
    with ``safe_divide``) are added.
    """
    aggregations = {}
    for base in BASE_COLUMNS:
        for category in CATEGORIES:
            aggregations[f"{base}_{category}"] = "sum"
    aggregations["business_dt"] = "nunique"

    per_client = df.groupby("id").agg(aggregations)
    client = per_client.reset_index().rename(columns={"business_dt": "contact_days"})

    for category in CATEGORIES:
        total_col = f"imp_total_{category}"
        client[total_col] = client[f"active_imp_{category}"] + client[f"passive_imp_{category}"]
        client[f"avg_imp_per_day_{category}"] = safe_divide(
            client[total_col], client["contact_days"]
        )

    return client
|
||||
|
||||
|
||||
def add_combined_category(client: pd.DataFrame, name: str, cats: list[str]) -> pd.DataFrame:
    """Add summed columns for a combined category.

    For every base metric the columns of the categories in *cats* are summed
    row-wise into ``<base>_<name>``; ``imp_total_<name>`` and
    ``avg_imp_per_day_<name>`` are then derived from them. *client* is
    modified in place and also returned.
    """
    for metric in BASE_COLUMNS:
        member_columns = [f"{metric}_{part}" for part in cats]
        client[f"{metric}_{name}"] = client[member_columns].sum(axis=1)

    total_col = f"imp_total_{name}"
    client[total_col] = client[f"active_imp_{name}"] + client[f"passive_imp_{name}"]

    per_day = safe_divide(client[total_col], client["contact_days"])
    client[f"avg_imp_per_day_{name}"] = per_day
    return client
|
||||
|
||||
|
||||
def plot_category_correlation(client: pd.DataFrame, cat: str, out_dir: Path) -> None:
    """Save a correlation heatmap of the base metrics of one category.

    The correlation matrix of the ``<base>_<cat>`` columns is drawn with
    seaborn and written to ``out_dir / "corr_<cat>.png"``; *out_dir* is
    created when missing.
    """
    metric_columns = [f"{base}_{cat}" for base in BASE_COLUMNS]
    corr = client[metric_columns].corr()

    heatmap_options = {
        "annot": True,
        "fmt": ".2f",
        "cmap": "coolwarm",
        "vmin": -1,
        "vmax": 1,
        "linewidths": 0.5,
    }
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(corr, ax=ax, **heatmap_options)
    ax.set_title(f"Корреляции показов/кликов/заказов: {cat}")
    plt.tight_layout()

    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir / f"corr_{cat}.png"
    fig.savefig(path, dpi=150)
    plt.close(fig)
    print(f"Saved correlation heatmap for {cat}: {path}")
|
||||
|
||||
|
||||
def _quadratic_design(x):
    """Build the ``[1, x, x**2]`` design matrix for the 1-D array *x*."""
    # has_constant="add" always prepends the intercept column. The default
    # ("skip") returns the data unchanged when a column is a non-zero
    # constant, which would leave the matrix one column short.
    return sm.add_constant(np.column_stack([x, x**2]), has_constant="add")


def _usable_trend_points(trend_data, x_max):
    """Return trend ``(x, y)`` arrays with ``x <= x_max`` and non-NaN ``y``.

    Returns ``(None, None)`` when no trend was supplied.
    """
    if trend_data is None or trend_data[0] is None:
        return None, None
    tx, ty = trend_data
    tx = np.asarray(tx)
    ty = np.asarray(ty)
    keep = (tx <= x_max) & ~np.isnan(ty)
    return tx[keep], ty[keep]


def fit_quadratic(
    cleaned: pd.DataFrame,
    x_col: str,
    y_col: str,
    trend_data=None,
    x_max: float = DEFAULT_X_MAX,
):
    """Fit ``y ~ 1 + x + x**2`` by OLS with HC3 covariance and score it.

    The model is fitted on the trend points when at least three of them
    survive the ``x <= x_max`` / non-NaN filter, otherwise on the rows of
    *cleaned* that have both columns present.

    Args:
        cleaned: Data containing *x_col* and *y_col*.
        x_col: Name of the regressor column.
        y_col: Name of the target column.
        trend_data: Optional ``(tx, ty)`` pair describing a smoothed trend.
        x_max: Upper bound on trend x values that are used.

    Returns:
        ``(model, metrics)``, or ``(None, None)`` when fewer than three
        complete rows exist. ``metrics`` has the keys ``params``, ``pvalues``,
        ``r2_points`` (the model's own R2), ``r2_trend`` (R2 of the model
        against the trend points, NaN when not computable),
        ``auc_on_has_orders`` (ROC AUC of the predictions on the cleaned rows
        against the label ``y > 0``, NaN when only one class is present) and
        ``effective_b2`` (the coefficient of ``x**2``).
    """
    cleaned = cleaned[[x_col, y_col]].dropna()
    if len(cleaned) < 3:
        return None, None

    x_all = cleaned[x_col].to_numpy()
    y_true_all = cleaned[y_col].to_numpy()

    tx, ty = _usable_trend_points(trend_data, x_max)
    if tx is not None and len(tx) >= 3:
        x, y = tx, ty
    else:
        x, y = x_all, y_true_all

    model = sm.OLS(y, _quadratic_design(x)).fit(cov_type="HC3")

    # AUC is always evaluated on the cleaned points, even when the model was
    # fitted on the trend.
    auc = float("nan")
    binary = (y_true_all > 0).astype(int)
    if len(np.unique(binary)) > 1:
        auc = roc_auc_score(binary, model.predict(_quadratic_design(x_all)))

    r2_trend = float("nan")
    if tx is not None and len(tx) > 1 and np.nanvar(ty) > 0:
        y_hat_trend = model.predict(_quadratic_design(tx))
        ss_res = np.nansum((ty - y_hat_trend) ** 2)
        ss_tot = np.nansum((ty - np.nanmean(ty)) ** 2)
        r2_trend = 1 - ss_res / ss_tot if ss_tot > 0 else float("nan")

    metrics = {
        "params": model.params,
        "pvalues": model.pvalues,
        "r2_points": model.rsquared,
        "r2_trend": r2_trend,
        "auc_on_has_orders": auc,
        "effective_b2": model.params[2],
    }
    return model, metrics
|
||||
|
||||
|
||||
def plot_quad_for_category(
    client: pd.DataFrame,
    cat: str,
    *,
    base_out_dir: Path = OUT_DIR,
    x_max_overrides: dict | None = None,
    y_max_overrides: dict | None = None,
    savgol_overrides: dict | None = None,
    q_low_overrides: dict | None = None,
    q_high_overrides: dict | None = None,
    iqr_overrides: dict | None = None,
) -> None:
    """Draw the cleaned scatter, trend and quadratic fit for one category.

    Plots ``orders_amt_<cat>`` against ``avg_imp_per_day_<cat>`` via
    ``plot_clean_trend_scatter``, fits a quadratic with ``fit_quadratic``,
    overlays the fitted curve with a summary box and saves the figure as
    ``<base_out_dir>/orders_amt_<cat>/scatter_trend_quad.png``. The fitted
    coefficients are also printed.

    Args:
        client: Client-level data with the per-category columns.
        cat: Category name used to build the column names.
        base_out_dir: Directory under which the per-target folder lives.
        x_max_overrides, y_max_overrides, savgol_overrides, q_low_overrides,
        q_high_overrides, iqr_overrides: Optional ``{category: value}``
            mappings; a missing mapping or category falls back to the default.
    """

    def pick(overrides, default):
        # A missing mapping and a missing category both mean "use the default".
        return (overrides or {}).get(cat, default)

    y_col = f"orders_amt_{cat}"
    x_col = f"avg_imp_per_day_{cat}"
    out_dir = base_out_dir / y_col
    x_max = pick(x_max_overrides, DEFAULT_X_MAX)
    y_max = pick(y_max_overrides, DEFAULT_Y_MAX)
    savgol_window = pick(savgol_overrides, DEFAULT_SAVGOL_WINDOW)
    q_low = pick(q_low_overrides, 0.05)
    q_high = pick(q_high_overrides, 0.95)
    iqr_k = pick(iqr_overrides, 1.5)

    res = plot_clean_trend_scatter(
        client,
        y_col=y_col,
        out_dir=out_dir,
        x_col=x_col,
        x_max=x_max,
        scatter_color=DEFAULT_SCATTER_COLOR,
        point_size=20,
        alpha=DEFAULT_ALPHA,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
        alpha_min=DEFAULT_ALPHA_MIN,
        alpha_max=DEFAULT_ALPHA_MAX,
        bins_x=DEFAULT_BINS_X,
        bins_y=DEFAULT_BINS_Y,
        y_min=DEFAULT_Y_MIN,
        y_max=y_max,
        trend_frac=DEFAULT_TREND_FRAC,
        trend_color=DEFAULT_TREND_COLOR,
        trend_linewidth=DEFAULT_TREND_LINEWIDTH,
        savgol_window=savgol_window,
        return_components=True,
    )

    if res is None:
        print(f"[{cat}] Нет данных для построения тренда/регрессии")
        return

    fig, ax, cleaned, trend_data = res
    tx, ty = trend_data if trend_data is not None else (None, None)
    model, metrics = fit_quadratic(
        cleaned,
        x_col,
        y_col,
        trend_data=(tx, ty),
        x_max=x_max,
    )

    if model is None:
        print(f"[{cat}] Недостаточно точек для квадр. регрессии")
        fig.savefig(out_dir / "scatter_trend.png", dpi=150)
        plt.close(fig)
        return

    x_grid = np.linspace(cleaned[x_col].min(), min(cleaned[x_col].max(), x_max), 400)
    # Force the intercept column: with the default has_constant="skip" a grid
    # that degenerates to one repeated non-zero value would lose it and make
    # predict() fail on a shape mismatch.
    X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]), has_constant="add")
    y_hat = model.predict(X_grid)

    ax.plot(x_grid, y_hat, color="#1f77b4", linewidth=2.2, label="Квадр. регрессия")
    ax.legend()

    params = metrics["params"]
    pvals = metrics["pvalues"]
    # Show the coefficient exactly as fitted so the annotation agrees with the
    # drawn curve, the p-value next to it and the console summary.
    b2_effective = metrics.get("effective_b2", params[2])
    summary_lines = [
        f"R2_trend={metrics['r2_trend']:.3f}",
        f"AUC={metrics['auc_on_has_orders']:.3f}",
        f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
        f"b2={b2_effective:.3f} (p={pvals[2]:.3g})",
        f"n={len(cleaned)}",
    ]
    ax.text(
        0.02,
        0.95,
        "\n".join(summary_lines),
        transform=ax.transAxes,
        ha="left",
        va="top",
        fontsize=9,
        bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.65, edgecolor="gray"),
    )

    quad_path = out_dir / "scatter_trend_quad.png"
    fig.tight_layout()
    fig.savefig(quad_path, dpi=150)
    plt.close(fig)
    print(f"[{cat}] Saved quad reg plot: {quad_path}")

    print(
        f"[{cat}] b0={params[0]:.4f}, b1={params[1]:.4f} (p={pvals[1]:.4g}), "
        f"b2={params[2]:.4f} (p={pvals[2]:.4g}), "
        f"R2_trend={metrics['r2_trend']:.4f}, AUC(has_order)={metrics['auc_on_has_orders']:.4f}"
    )
|
||||
|
||||
|
||||
def main() -> None:
    """Build the client-level table and render all per-category plots.

    Loads the raw data, aggregates it per client, adds the combined
    categories, then saves a correlation heatmap and a quadratic-regression
    scatter for every base and combined category.
    """
    client = build_client_by_category(load_raw(DB_PATH))
    for combo_name, combo_cats in COMBINED.items():
        client = add_combined_category(client, combo_name, combo_cats)

    # Example overrides: x_max, y_max, savgol_window
    overrides = dict(
        x_max_overrides=dict(ent=4, transport=4, avia=4, shopping=6, avia_hotel=5, super=4),
        y_max_overrides=dict(
            ent=2.5, transport=6, avia=1.5, shopping=2.5, avia_hotel=2.0, super=5
        ),
        savgol_overrides=dict(ent=301, transport=401, avia=301, shopping=201, avia_hotel=301),
        q_low_overrides=dict(avia_hotel=0.05),
        q_high_overrides=dict(avia_hotel=0.9),
        iqr_overrides=dict(avia_hotel=1.2),
    )

    corr_dir = OUT_DIR / "correlations"
    cats_all = CATEGORIES + list(COMBINED)

    # All heatmaps are produced before the first regression plot.
    for cat in cats_all:
        plot_category_correlation(client, cat, corr_dir)

    for cat in cats_all:
        plot_quad_for_category(client, cat, **overrides)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
Before Width: | Height: | Size: 121 KiB After Width: | Height: | Size: 118 KiB |
BIN
main_hypot/orders_amt_total/scatter_trend_quad.png
Normal file
|
After Width: | Height: | Size: 142 KiB |
@@ -1,351 +1,151 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import statsmodels.api as sm
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Optional
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import statsmodels.api as sm
|
||||
from sklearn.metrics import r2_score, roc_auc_score
|
||||
|
||||
import best_model_and_plots as bmp
|
||||
|
||||
# Наследуем константы/визуальные настройки из scatter-скрипта
|
||||
# Константы из scatter-скрипта
|
||||
X_COL = bmp.X_COL
|
||||
DEFAULT_X_MAX = bmp.DEFAULT_X_MAX
|
||||
DEFAULT_Y_MIN = bmp.DEFAULT_Y_MIN
|
||||
DEFAULT_Y_MAX = bmp.DEFAULT_Y_MAX
|
||||
DEFAULT_SCATTER_COLOR = bmp.DEFAULT_SCATTER_COLOR
|
||||
DEFAULT_POINT_SIZE = bmp.DEFAULT_POINT_SIZE
|
||||
DEFAULT_ALPHA = bmp.DEFAULT_ALPHA
|
||||
DEFAULT_ALPHA_MIN = bmp.DEFAULT_ALPHA_MIN
|
||||
DEFAULT_ALPHA_MAX = bmp.DEFAULT_ALPHA_MAX
|
||||
DEFAULT_BINS_X = bmp.DEFAULT_BINS_X
|
||||
DEFAULT_BINS_Y = bmp.DEFAULT_BINS_Y
|
||||
DEFAULT_IQR_K = bmp.DEFAULT_IQR_K
|
||||
DEFAULT_Q_LOW = bmp.DEFAULT_Q_LOW
|
||||
DEFAULT_Q_HIGH = bmp.DEFAULT_Q_HIGH
|
||||
DEFAULT_TREND_FRAC = bmp.DEFAULT_TREND_FRAC
|
||||
DEFAULT_TREND_COLOR = bmp.DEFAULT_TREND_COLOR
|
||||
DEFAULT_TREND_LINEWIDTH = bmp.DEFAULT_TREND_LINEWIDTH
|
||||
BASE_OUT_DIR = bmp.BASE_OUT_DIR
|
||||
|
||||
|
||||
def prepare_clean_data(
    y_col: str,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
    """Prepare cleaned data: x-range filter plus outlier removal.

    Loads the client-level table, keeps the rows where both columns are
    present, applies ``bmp.filter_x_range`` and ``bmp.remove_outliers`` and
    returns the x array, the y array and the cleaned DataFrame.
    """
    pair = bmp.load_client_level(bmp.DB_PATH)[[x_col, y_col]].dropna()
    outlier_options = {
        "y_col": y_col,
        "x_col": x_col,
        "iqr_k": iqr_k,
        "q_low": q_low,
        "q_high": q_high,
    }
    cleaned = bmp.remove_outliers(bmp.filter_x_range(pair, x_col, x_max), **outlier_options)
    return cleaned[x_col].to_numpy(), cleaned[y_col].to_numpy(), cleaned
|
||||
Y_COL = "orders_amt_total"
|
||||
X_MAX = bmp.DEFAULT_X_MAX
|
||||
Y_MIN = bmp.DEFAULT_Y_MIN
|
||||
Y_MAX = bmp.DEFAULT_Y_MAX
|
||||
|
||||
|
||||
def fit_quadratic(
|
||||
x: np.ndarray,
|
||||
y_target: np.ndarray,
|
||||
weights: Optional[np.ndarray] = None,
|
||||
) -> Tuple[sm.regression.linear_model.RegressionResultsWrapper, np.ndarray]:
|
||||
"""Фитим квадратику по x -> y_target (WLS), предсказываем на тех же x."""
|
||||
X_design = np.column_stack([x, x**2])
|
||||
X_design = sm.add_constant(X_design)
|
||||
if weights is not None:
|
||||
model = sm.WLS(y_target, X_design, weights=weights).fit(cov_type="HC3")
|
||||
cleaned: bmp.pd.DataFrame,
|
||||
trend_data: Optional[Tuple[np.ndarray, np.ndarray]],
|
||||
*,
|
||||
x_col: str = X_COL,
|
||||
y_col: str = Y_COL,
|
||||
x_max: float = X_MAX,
|
||||
) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]:
|
||||
"""Фитит y ~ 1 + x + x^2. Если есть тренд, использует его как целевое для r2_trend."""
|
||||
df = cleaned[[x_col, y_col]].dropna()
|
||||
if len(df) < 3:
|
||||
return None, {}
|
||||
|
||||
if trend_data is not None and trend_data[0] is not None:
|
||||
tx, ty = trend_data
|
||||
tx = np.asarray(tx)
|
||||
ty = np.asarray(ty)
|
||||
mask = (tx <= x_max) & ~np.isnan(ty)
|
||||
tx = tx[mask]
|
||||
ty = ty[mask]
|
||||
else:
|
||||
model = sm.OLS(y_target, X_design).fit(cov_type="HC3")
|
||||
tx = ty = None
|
||||
|
||||
y_hat = model.predict(X_design)
|
||||
return model, y_hat
|
||||
x = df[x_col].to_numpy()
|
||||
y = df[y_col].to_numpy()
|
||||
|
||||
X_design = sm.add_constant(np.column_stack([x, x**2]))
|
||||
model = sm.OLS(y, X_design).fit(cov_type="HC3")
|
||||
|
||||
auc = np.nan
|
||||
binary = (y > 0).astype(int)
|
||||
if len(np.unique(binary)) > 1:
|
||||
auc = roc_auc_score(binary, model.predict(X_design))
|
||||
|
||||
r2_trend = np.nan
|
||||
if tx is not None and len(tx) >= 3:
|
||||
X_trend = sm.add_constant(np.column_stack([tx, tx**2]))
|
||||
y_hat_trend = model.predict(X_trend)
|
||||
if np.nanvar(ty) > 0:
|
||||
r2_trend = r2_score(ty, y_hat_trend)
|
||||
|
||||
metrics = {
|
||||
"auc": auc,
|
||||
"r2_trend": r2_trend,
|
||||
}
|
||||
return model, metrics
|
||||
|
||||
|
||||
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[Optional[float], Optional[float]]:
    """Return ``(R2, AUC)``, with AUC computed against the label ``y_true > 0``.

    AUC is ``None`` when computing it raises ``ValueError``.
    """
    r2 = r2_score(y_true, y_pred)
    try:
        has_order = (y_true > 0).astype(int)
        return r2, roc_auc_score(has_order, y_pred)
    except ValueError:
        return r2, None
|
||||
|
||||
|
||||
def map_trend_to_points(x_points: np.ndarray, trend_x: np.ndarray, trend_y: np.ndarray) -> np.ndarray:
    """Interpolate the trend at *x_points*.

    Args:
        x_points: Positions at which the trend is evaluated (any array-like).
        trend_x: Trend x coordinates, in any order (any array-like).
        trend_y: Trend values matching *trend_x*.

    Returns:
        A float array shaped like *x_points*. Points outside the trend range
        take the nearest end value; an empty trend yields zeros.
    """
    # Coerce first so plain lists work with the fancy indexing below.
    x_points = np.asarray(x_points)
    trend_x = np.asarray(trend_x)
    trend_y = np.asarray(trend_y)

    if trend_x.size == 0:
        # Always float, matching the dtype np.interp returns.
        return np.zeros(x_points.shape, dtype=float)

    # np.interp requires increasing x coordinates.
    order = np.argsort(trend_x)
    # Its default left/right fill values are already the first/last trend value.
    return np.interp(x_points, trend_x[order], trend_y[order])
|
||||
|
||||
|
||||
def density_weights(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
) -> np.ndarray:
    """Build weights from point density (same scheme as the plot alphas).

    The alphas returned by ``bmp.compute_density_alpha`` are rescaled by
    ``(alpha - alpha_min) / (alpha_max - alpha_min)`` and clipped at zero from
    below. When no alphas come back, every row gets weight 1.
    """
    density_options = {
        "x_col": x_col,
        "y_col": y_col,
        "x_max": x_max,
        "bins_x": bins_x,
        "bins_y": bins_y,
        "alpha_min": alpha_min,
        "alpha_max": alpha_max,
        "y_min": y_min,
        "y_max_limit": y_max,
    }
    alphas = bmp.compute_density_alpha(df, **density_options)
    if len(alphas) == 0:
        return np.ones(len(df))

    # Guard against a zero-width alpha range.
    span = max(alpha_max - alpha_min, 1e-9)
    return np.clip((alphas - alpha_min) / span, 0, None)
|
||||
|
||||
|
||||
def plot_quadratic_overlay(
|
||||
df: pd.DataFrame,
|
||||
model: sm.regression.linear_model.RegressionResultsWrapper,
|
||||
y_col: str,
|
||||
out_path: Path,
|
||||
*,
|
||||
x_col: str = X_COL,
|
||||
x_max: float = DEFAULT_X_MAX,
|
||||
y_min: float = DEFAULT_Y_MIN,
|
||||
y_max: float = DEFAULT_Y_MAX,
|
||||
scatter_color: str = DEFAULT_SCATTER_COLOR,
|
||||
point_size: int = DEFAULT_POINT_SIZE,
|
||||
alpha: float = DEFAULT_ALPHA,
|
||||
alpha_min: float = DEFAULT_ALPHA_MIN,
|
||||
alpha_max: float = DEFAULT_ALPHA_MAX,
|
||||
bins_x: int = DEFAULT_BINS_X,
|
||||
bins_y: int = DEFAULT_BINS_Y,
|
||||
trend_frac: float = DEFAULT_TREND_FRAC,
|
||||
trend_color: str = DEFAULT_TREND_COLOR,
|
||||
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
|
||||
trend_method: str = bmp.DEFAULT_TREND_METHOD,
|
||||
rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
|
||||
def plot_overall_quad(
|
||||
x_max: float = X_MAX,
|
||||
y_min: float = Y_MIN,
|
||||
y_max: float = Y_MAX,
|
||||
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
|
||||
savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
|
||||
) -> None:
|
||||
"""Рисует облако + LOWESS-тренд + линию квадр. регрессии."""
|
||||
fig, ax = bmp.plt.subplots(figsize=(8, 8))
|
||||
alpha_values = bmp.compute_density_alpha(
|
||||
df,
|
||||
x_col=x_col,
|
||||
y_col=y_col,
|
||||
out_dir = bmp.BASE_OUT_DIR / Y_COL
|
||||
|
||||
res = bmp.plot_clean_trend_scatter(
|
||||
bmp.load_client_level(bmp.DB_PATH),
|
||||
y_col=Y_COL,
|
||||
out_dir=out_dir,
|
||||
x_col=X_COL,
|
||||
x_max=x_max,
|
||||
bins_x=bins_x,
|
||||
bins_y=bins_y,
|
||||
alpha_min=alpha_min,
|
||||
alpha_max=alpha_max,
|
||||
scatter_color=bmp.DEFAULT_SCATTER_COLOR,
|
||||
point_size=bmp.DEFAULT_POINT_SIZE,
|
||||
alpha=bmp.DEFAULT_TREND_ALPHA,
|
||||
iqr_k=bmp.DEFAULT_IQR_K,
|
||||
q_low=bmp.DEFAULT_Q_LOW,
|
||||
q_high=bmp.DEFAULT_Q_HIGH,
|
||||
alpha_min=bmp.DEFAULT_ALPHA_MIN,
|
||||
alpha_max=bmp.DEFAULT_ALPHA_MAX,
|
||||
bins_x=bmp.DEFAULT_BINS_X,
|
||||
bins_y=bmp.DEFAULT_BINS_Y,
|
||||
y_min=y_min,
|
||||
y_max_limit=y_max,
|
||||
)
|
||||
ax.scatter(
|
||||
df[x_col],
|
||||
df[y_col],
|
||||
color=scatter_color,
|
||||
s=point_size,
|
||||
alpha=alpha_values if len(alpha_values) else alpha,
|
||||
linewidths=0,
|
||||
label="Точки (очищено)",
|
||||
)
|
||||
|
||||
# Тренд по выбранному методу
|
||||
tx, ty = bmp.compute_trend(
|
||||
df,
|
||||
y_col=y_col,
|
||||
x_col=x_col,
|
||||
method=trend_method,
|
||||
lowess_frac=trend_frac,
|
||||
rolling_window=rolling_window,
|
||||
y_max=y_max,
|
||||
trend_frac=bmp.DEFAULT_TREND_FRAC,
|
||||
trend_color=bmp.DEFAULT_TREND_COLOR,
|
||||
trend_linewidth=bmp.DEFAULT_TREND_LINEWIDTH,
|
||||
trend_method=bmp.DEFAULT_TREND_METHOD,
|
||||
savgol_window=savgol_window,
|
||||
savgol_poly=savgol_poly,
|
||||
return_components=True,
|
||||
)
|
||||
if len(tx):
|
||||
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
|
||||
|
||||
# Квадратичная регрессия
|
||||
if res is None:
|
||||
print("Нет данных для построения графика")
|
||||
return
|
||||
|
||||
fig, ax, cleaned, trend_data = res
|
||||
model, metrics = fit_quadratic(cleaned, trend_data, x_col=X_COL, y_col=Y_COL, x_max=x_max)
|
||||
|
||||
if model is None:
|
||||
print("Недостаточно точек для квадратичной регрессии")
|
||||
fig.savefig(out_dir / "scatter_trend.png", dpi=150)
|
||||
bmp.plt.close(fig)
|
||||
return
|
||||
|
||||
# Квадратичная линия поверх существующего тренда
|
||||
x_grid = np.linspace(0, x_max, 400)
|
||||
X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
|
||||
y_grid = model.predict(X_grid)
|
||||
ax.plot(x_grid, y_grid, color="blue", linewidth=2.3, linestyle="--", label="Квадр. регрессия")
|
||||
|
||||
ax.set_xlim(0, x_max)
|
||||
ax.set_ylim(y_min, y_max)
|
||||
ax.set_yticks(range(0, int(y_max) + 1, 2))
|
||||
ax.set_xlabel("Среднее число показов в день")
|
||||
ax.set_ylabel(y_col)
|
||||
ax.set_title(f"Квадратичная регрессия: {y_col} vs {x_col}")
|
||||
ax.grid(alpha=0.3)
|
||||
ax.plot(x_grid, y_grid, color="blue", linewidth=2.2, linestyle="--", label="Квадр. регрессия")
|
||||
ax.legend()
|
||||
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
fig.tight_layout()
|
||||
fig.savefig(out_path, dpi=150)
|
||||
bmp.plt.close(fig)
|
||||
print(f"Saved {out_path}")
|
||||
|
||||
|
||||
def report_model(
    model: sm.regression.linear_model.RegressionResultsWrapper,
    r2: Optional[float],
    auc: Optional[float],
    *,
    r2_trend: Optional[float] = None,
) -> None:
    """Print the coefficients, p-values and quality metrics of a quadratic fit.

    ``r2`` and ``auc`` are reported as "n/a" when ``None``; the trend R2 line
    is printed only when ``r2_trend`` is given.
    """

    def fmt_p(p):
        # Very small p-values are shown as a bound instead of a number.
        return "<1e-300" if p < 1e-300 else f"{p:.4g}"

    params = model.params
    pvals = model.pvalues

    print("\n=== Квадратичная регрессия (y ~ 1 + x + x^2) ===")
    for idx, label in enumerate(("const", "beta1 x", "beta2 x^2")):
        print(f"{label}: {params[idx]:.6f} (p={fmt_p(pvals[idx])})")

    if r2 is None:
        print("R2: n/a")
    else:
        print(f"R2: {r2:.4f}")

    if r2_trend is not None:
        print(f"R2 vs trend target: {r2_trend:.4f}")

    if auc is None:
        print("AUC: n/a (один класс)")
    else:
        print(f"AUC (target y>0): {auc:.4f}")
|
||||
|
||||
|
||||
def generate_quadratic_analysis(
|
||||
y_col: str,
|
||||
*,
|
||||
x_col: str = X_COL,
|
||||
base_out_dir: Path = BASE_OUT_DIR,
|
||||
config_name: str = "default",
|
||||
x_max: float = DEFAULT_X_MAX,
|
||||
y_min: float = DEFAULT_Y_MIN,
|
||||
y_max: float = DEFAULT_Y_MAX,
|
||||
scatter_color: str = DEFAULT_SCATTER_COLOR,
|
||||
point_size: int = DEFAULT_POINT_SIZE,
|
||||
alpha: float = DEFAULT_ALPHA,
|
||||
alpha_min: float = DEFAULT_ALPHA_MIN,
|
||||
alpha_max: float = DEFAULT_ALPHA_MAX,
|
||||
bins_x: int = DEFAULT_BINS_X,
|
||||
bins_y: int = DEFAULT_BINS_Y,
|
||||
trend_frac: float = DEFAULT_TREND_FRAC,
|
||||
trend_color: str = DEFAULT_TREND_COLOR,
|
||||
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
|
||||
iqr_k: float = DEFAULT_IQR_K,
|
||||
q_low: float = DEFAULT_Q_LOW,
|
||||
q_high: float = DEFAULT_Q_HIGH,
|
||||
trend_method: str = bmp.DEFAULT_TREND_METHOD,
|
||||
rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
|
||||
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
|
||||
savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
|
||||
) -> dict:
|
||||
x, y, cleaned_df = prepare_clean_data(
|
||||
y_col,
|
||||
x_col=x_col,
|
||||
x_max=x_max,
|
||||
iqr_k=iqr_k,
|
||||
q_low=q_low,
|
||||
q_high=q_high,
|
||||
)
|
||||
w = density_weights(
|
||||
cleaned_df,
|
||||
y_col=y_col,
|
||||
x_col=x_col,
|
||||
x_max=x_max,
|
||||
alpha_min=alpha_min,
|
||||
alpha_max=alpha_max,
|
||||
bins_x=bins_x,
|
||||
bins_y=bins_y,
|
||||
y_min=y_min,
|
||||
y_max=y_max,
|
||||
)
|
||||
# тренд по выбранному методу
|
||||
tx, ty = bmp.compute_trend(
|
||||
cleaned_df,
|
||||
y_col=y_col,
|
||||
x_col=x_col,
|
||||
method=trend_method,
|
||||
lowess_frac=trend_frac,
|
||||
rolling_window=rolling_window,
|
||||
savgol_window=savgol_window,
|
||||
savgol_poly=savgol_poly,
|
||||
summary_lines = [
|
||||
f"R2_trend={metrics['r2_trend']:.3f}",
|
||||
f"AUC={metrics['auc']:.3f}",
|
||||
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
|
||||
f"b2={params[2]:.3f} (p={pvals[2]:.3g})",
|
||||
f"n={len(cleaned)}",
|
||||
]
|
||||
ax.text(
|
||||
0.02,
|
||||
0.95,
|
||||
"\n".join(summary_lines),
|
||||
transform=ax.transAxes,
|
||||
ha="left",
|
||||
va="top",
|
||||
fontsize=9,
|
||||
bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.65, edgecolor="gray"),
|
||||
)
|
||||
|
||||
trend_target = map_trend_to_points(x, tx, ty)
|
||||
model, y_hat = fit_quadratic(x, trend_target, weights=w)
|
||||
r2_actual, auc = compute_metrics(y, y_hat)
|
||||
r2_trend = r2_score(trend_target, y_hat) if len(trend_target) else None
|
||||
report_model(model, r2_actual, auc, r2_trend=r2_trend)
|
||||
|
||||
out_dir = base_out_dir / config_name / str(y_col).replace("/", "_")
|
||||
plot_quadratic_overlay(
|
||||
cleaned_df,
|
||||
model,
|
||||
y_col=y_col,
|
||||
out_path=out_dir / "quad_regression.png",
|
||||
x_col=x_col,
|
||||
x_max=x_max,
|
||||
y_min=y_min,
|
||||
y_max=y_max,
|
||||
scatter_color=scatter_color,
|
||||
point_size=point_size,
|
||||
alpha=alpha,
|
||||
alpha_min=alpha_min,
|
||||
alpha_max=alpha_max,
|
||||
bins_x=bins_x,
|
||||
bins_y=bins_y,
|
||||
trend_frac=trend_frac,
|
||||
trend_color=trend_color,
|
||||
trend_linewidth=trend_linewidth,
|
||||
trend_method=trend_method,
|
||||
rolling_window=rolling_window,
|
||||
savgol_window=savgol_window,
|
||||
savgol_poly=savgol_poly,
|
||||
)
|
||||
|
||||
return {
|
||||
"config": config_name,
|
||||
"y_col": y_col,
|
||||
"r2": r2_actual,
|
||||
"r2_trend": r2_trend,
|
||||
"auc": auc,
|
||||
"params": {
|
||||
"trend_method": trend_method,
|
||||
"trend_frac": trend_frac,
|
||||
"rolling_window": rolling_window,
|
||||
"savgol_window": savgol_window,
|
||||
"savgol_poly": savgol_poly,
|
||||
"x_max": x_max,
|
||||
"weights_alpha_range": (alpha_min, alpha_max),
|
||||
},
|
||||
"coeffs": model.params.tolist(),
|
||||
"pvalues": model.pvalues.tolist(),
|
||||
}
|
||||
quad_path = out_dir / "scatter_trend_quad.png"
|
||||
fig.tight_layout()
|
||||
fig.savefig(quad_path, dpi=150)
|
||||
bmp.plt.close(fig)
|
||||
print(f"Saved {quad_path}")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
generate_quadratic_analysis("orders_amt_total")
|
||||
plot_overall_quad()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||