good quadreg 0.92 r2

This commit is contained in:
dan
2025-12-14 22:53:28 +03:00
parent 4f8f266c3e
commit 3dc05530c0
19 changed files with 1126 additions and 1516 deletions

View File

@@ -1,240 +1,352 @@
import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from pathlib import Path
from typing import Tuple, Optional
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)
from sklearn.metrics import r2_score, roc_auc_score
# -----------------------------
# Load + feature engineering (как у тебя)
# -----------------------------
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
import best_model_and_plots as bmp
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
# Наследуем константы/визуальные настройки из scatter-скрипта
X_COL = bmp.X_COL
DEFAULT_X_MAX = bmp.DEFAULT_X_MAX
DEFAULT_Y_MIN = bmp.DEFAULT_Y_MIN
DEFAULT_Y_MAX = bmp.DEFAULT_Y_MAX
DEFAULT_SCATTER_COLOR = bmp.DEFAULT_SCATTER_COLOR
DEFAULT_POINT_SIZE = bmp.DEFAULT_POINT_SIZE
DEFAULT_ALPHA = bmp.DEFAULT_ALPHA
DEFAULT_ALPHA_MIN = bmp.DEFAULT_ALPHA_MIN
DEFAULT_ALPHA_MAX = bmp.DEFAULT_ALPHA_MAX
DEFAULT_BINS_X = bmp.DEFAULT_BINS_X
DEFAULT_BINS_Y = bmp.DEFAULT_BINS_Y
DEFAULT_IQR_K = bmp.DEFAULT_IQR_K
DEFAULT_Q_LOW = bmp.DEFAULT_Q_LOW
DEFAULT_Q_HIGH = bmp.DEFAULT_Q_HIGH
DEFAULT_TREND_FRAC = bmp.DEFAULT_TREND_FRAC
DEFAULT_TREND_COLOR = bmp.DEFAULT_TREND_COLOR
DEFAULT_TREND_LINEWIDTH = bmp.DEFAULT_TREND_LINEWIDTH
BASE_OUT_DIR = bmp.BASE_OUT_DIR
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
def prepare_clean_data(
y_col: str,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
"""Готовит очищенные данные: фильтр по x и IQR, возвращает x, y и DataFrame."""
df = bmp.load_client_level(bmp.DB_PATH)
base = df[[x_col, y_col]].dropna()
in_range = bmp.filter_x_range(base, x_col, x_max)
cleaned = bmp.remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
.merge(contact_days, on="id", how="left")
.reset_index()
)
x = cleaned[x_col].to_numpy()
y = cleaned[y_col].to_numpy()
return x, y, cleaned
client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"])
client["order_rate_pct"] = 100 * client["order_rate"]
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# -----------------------------
# Aggregate curve points (как у тебя)
# -----------------------------
stats_imp = (
client.groupby("avg_imp_per_day", as_index=False)
.agg(
orders_mean=("orders_amt_total", "mean"),
n_clients=("id", "count"),
def fit_quadratic(
x: np.ndarray,
y_target: np.ndarray,
weights: Optional[np.ndarray] = None,
) -> Tuple[sm.regression.linear_model.RegressionResultsWrapper, np.ndarray]:
"""Фитим квадратику по x -> y_target (WLS), предсказываем на тех же x."""
X_design = np.column_stack([x, x**2])
X_design = sm.add_constant(X_design)
if weights is not None:
model = sm.WLS(y_target, X_design, weights=weights).fit(cov_type="HC3")
else:
model = sm.OLS(y_target, X_design).fit(cov_type="HC3")
y_hat = model.predict(X_design)
return model, y_hat
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[Optional[float], Optional[float]]:
"""Возвращает (R2, AUC по метке y>0)."""
r2 = r2_score(y_true, y_pred)
auc = None
try:
auc = roc_auc_score((y_true > 0).astype(int), y_pred)
except ValueError:
auc = None
return r2, auc
def map_trend_to_points(x_points: np.ndarray, trend_x: np.ndarray, trend_y: np.ndarray) -> np.ndarray:
"""Интерполирует значения тренда в точках x_points."""
if len(trend_x) == 0:
return np.zeros_like(x_points)
# гарантируем отсортированность
order = np.argsort(trend_x)
tx = trend_x[order]
ty = trend_y[order]
return np.interp(x_points, tx, ty, left=ty[0], right=ty[-1])
def density_weights(
df: pd.DataFrame,
y_col: str,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
) -> np.ndarray:
"""Строит веса из плотности (та же схема, что и альфы на графике)."""
alphas = bmp.compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bins_x,
bins_y=bins_y,
alpha_min=alpha_min,
alpha_max=alpha_max,
y_min=y_min,
y_max_limit=y_max,
)
.sort_values("avg_imp_per_day")
).reset_index(drop=True)
if len(alphas) == 0:
return np.ones(len(df))
denom = max(alpha_max - alpha_min, 1e-9)
weights = (alphas - alpha_min) / denom
weights = np.clip(weights, 0, None)
return weights
# -----------------------------
# Filtering / outlier logic (как у тебя)
# -----------------------------
K_MULT = 2
ABS_DY_MIN = 1
X_MAX = 16
stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
def plot_quadratic_overlay(
df: pd.DataFrame,
model: sm.regression.linear_model.RegressionResultsWrapper,
y_col: str,
out_path: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
trend_method: str = bmp.DEFAULT_TREND_METHOD,
rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
) -> None:
"""Рисует облако + LOWESS-тренд + линию квадр. регрессии."""
fig, ax = bmp.plt.subplots(figsize=(8, 8))
alpha_values = bmp.compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bins_x,
bins_y=bins_y,
alpha_min=alpha_min,
alpha_max=alpha_max,
y_min=y_min,
y_max_limit=y_max,
)
ax.scatter(
df[x_col],
df[y_col],
color=scatter_color,
s=point_size,
alpha=alpha_values if len(alpha_values) else alpha,
linewidths=0,
label="Точки (очищено)",
)
before = len(stats_f)
y = stats_f["orders_mean"]
abs_dy = y.diff().abs()
# Тренд по выбранному методу
tx, ty = bmp.compute_trend(
df,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
if len(tx):
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
ratio = abs_dy / (prev3_mean.replace(0, np.nan))
# Квадратичная регрессия
x_grid = np.linspace(0, x_max, 400)
X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
y_grid = model.predict(X_grid)
ax.plot(x_grid, y_grid, color="blue", linewidth=2.3, linestyle="--", label="Квадр. регрессия")
is_outlier = ((abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT)) | (y > 5)
is_outlier = is_outlier.fillna(False)
ax.set_xlim(0, x_max)
ax.set_ylim(y_min, y_max)
ax.set_yticks(range(0, int(y_max) + 1, 2))
ax.set_xlabel("Среднее число показов в день")
ax.set_ylabel(y_col)
ax.set_title(f"Квадратичная регрессия: {y_col} vs {x_col}")
ax.grid(alpha=0.3)
ax.legend()
stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
after = len(stats_f)
print(f"Фильтрация: было {before}, стало {after}, убрали {before-after} точек")
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(out_path, dpi=150)
bmp.plt.close(fig)
print(f"Saved {out_path}")
# -----------------------------
# Smoothing (оставим для визуалки, но регрессию делаем по orders_mean)
# -----------------------------
w = max(7, int(len(stats_f) * 0.05))
if w % 2 == 0:
w += 1
stats_f["orders_smooth"] = (
stats_f["orders_mean"]
.rolling(window=w, center=True, min_periods=1)
.mean()
)
def report_model(
model: sm.regression.linear_model.RegressionResultsWrapper,
r2: Optional[float],
auc: Optional[float],
*,
r2_trend: Optional[float] = None,
) -> None:
params = model.params
pvals = model.pvalues
fmt_p = lambda p: f"<1e-300" if p < 1e-300 else f"{p:.4g}"
print("\n=== Квадратичная регрессия (y ~ 1 + x + x^2) ===")
print(f"const: {params[0]:.6f} (p={fmt_p(pvals[0])})")
print(f"beta1 x: {params[1]:.6f} (p={fmt_p(pvals[1])})")
print(f"beta2 x^2: {params[2]:.6f} (p={fmt_p(pvals[2])})")
print(f"R2: {r2:.4f}" if r2 is not None else "R2: n/a")
if r2_trend is not None:
print(f"R2 vs trend target: {r2_trend:.4f}")
print(f"AUC (target y>0): {auc:.4f}" if auc is not None else "AUC: n/a (один класс)")
# -----------------------------
# Cost line (как у тебя, нормировка "в единицах заказов")
# -----------------------------
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
# -----------------------------
# Quadratic regression: orders_mean ~ 1 + x + x^2
# WLS with weights = n_clients
# -----------------------------
x = stats_f["avg_imp_per_day"].to_numpy()
y = stats_f["orders_mean"].to_numpy()
wts = stats_f["n_clients"].to_numpy().astype(float)
def generate_quadratic_analysis(
y_col: str,
*,
x_col: str = X_COL,
base_out_dir: Path = BASE_OUT_DIR,
config_name: str = "default",
x_max: float = DEFAULT_X_MAX,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
trend_method: str = bmp.DEFAULT_TREND_METHOD,
rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
) -> dict:
x, y, cleaned_df = prepare_clean_data(
y_col,
x_col=x_col,
x_max=x_max,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
w = density_weights(
cleaned_df,
y_col=y_col,
x_col=x_col,
x_max=x_max,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
)
# тренд по выбранному методу
tx, ty = bmp.compute_trend(
cleaned_df,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
X = np.column_stack([x, x**2])
X = sm.add_constant(X) # [1, x, x^2]
trend_target = map_trend_to_points(x, tx, ty)
model, y_hat = fit_quadratic(x, trend_target, weights=w)
r2_actual, auc = compute_metrics(y, y_hat)
r2_trend = r2_score(trend_target, y_hat) if len(trend_target) else None
report_model(model, r2_actual, auc, r2_trend=r2_trend)
model = sm.WLS(y, X, weights=wts)
res = model.fit(cov_type="HC3") # робастные ошибки
out_dir = base_out_dir / config_name / str(y_col).replace("/", "_")
plot_quadratic_overlay(
cleaned_df,
model,
y_col=y_col,
out_path=out_dir / "quad_regression.png",
x_col=x_col,
x_max=x_max,
y_min=y_min,
y_max=y_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
trend_method=trend_method,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
b0, b1, b2 = res.params
p_b1_two = res.pvalues[1]
p_b2_two = res.pvalues[2]
return {
"config": config_name,
"y_col": y_col,
"r2": r2_actual,
"r2_trend": r2_trend,
"auc": auc,
"params": {
"trend_method": trend_method,
"trend_frac": trend_frac,
"rolling_window": rolling_window,
"savgol_window": savgol_window,
"savgol_poly": savgol_poly,
"x_max": x_max,
"weights_alpha_range": (alpha_min, alpha_max),
},
"coeffs": model.params.tolist(),
"pvalues": model.pvalues.tolist(),
}
# one-sided p-values for directional hypotheses
p_b1_pos = (p_b1_two / 2) if (b1 > 0) else (1 - p_b1_two / 2)
p_b2_neg = (p_b2_two / 2) if (b2 < 0) else (1 - p_b2_two / 2)
# turning point (if concave)
x_star = None
y_star = None
if b2 < 0:
x_star = -b1 / (2 * b2)
y_star = b0 + b1 * x_star + b2 * x_star**2
def main() -> None:
generate_quadratic_analysis("orders_amt_total")
# Intersection with cost line: b0 + b1 x + b2 x^2 = c x -> b2 x^2 + (b1 - c) x + b0 = 0
x_cross = None
roots = np.roots([b2, (b1 - c), b0]) # may be complex
roots = [r.real for r in roots if abs(r.imag) < 1e-8]
roots_in_range = [r for r in roots if (stats_f["avg_imp_per_day"].min() <= r <= stats_f["avg_imp_per_day"].max())]
if roots_in_range:
# берём корень ближе к "правой" части (обычно пересечение интереснее там, где начинается невыгодно)
x_cross = max(roots_in_range)
# -----------------------------
# Print results + interpretation (по-человечески)
# -----------------------------
print("\n=== Квадратичная регрессия (WLS, веса = n_clients, SE = HC3) ===")
print(res.summary())
print("\n=== Проверка гипотезы убывающей отдачи / спада ===")
print(f"β1 (линейный эффект): {b1:.6f}, двусторонний p={p_b1_two:.4g}, односторонний p(β1>0)={p_b1_pos:.4g}")
print(f"β2 (кривизна): {b2:.6f}, двусторонний p={p_b2_two:.4g}, односторонний p(β2<0)={p_b2_neg:.4g}")
alpha = 0.05
support = (b1 > 0) and (b2 < 0) and (p_b1_pos < alpha) and (p_b2_neg < alpha)
if support:
print("\nВывод: данные поддерживают гипотезу нелинейности.")
print("Есть статистически значимый рост на малых x (β1>0) и насыщение/спад (β2<0).")
else:
print("\nВывод: строгого статистического подтверждения по знакам/значимости может не хватить.")
print("Но знак коэффициентов и форма кривой всё равно могут быть согласованы с гипотезой.")
print("На защите говори аккуратно: 'наблюдается тенденция/согласуется с гипотезой'.")
if x_star is not None:
print(f"\nОценка 'порога насыщения' (вершина параболы): x* = {x_star:.3f} показов/день")
print(f"Прогноз среднего числа заказов в x*: y(x*) ≈ {y_star:.3f}")
if not (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
print("Внимание: x* вне диапазона наблюдений, интерпретация как 'оптимума' сомнительная.")
else:
print("\nВершина не считается как максимум: β2 >= 0 (нет выпуклости вниз).")
if x_cross is not None:
y_cross = b0 + b1 * x_cross + b2 * x_cross**2
print(f"\nТочка пересечения с линейными расходами (в нормировке c={c:.4f}): x≈{x_cross:.3f}, y≈{y_cross:.3f}")
else:
print("\nПересечение с линией расходов в выбранной нормировке не найдено (или вне диапазона).")
# -----------------------------
# Plot: points + smooth + quadratic fit + cost + markers
# -----------------------------
x_grid = np.linspace(stats_f["avg_imp_per_day"].min(), stats_f["avg_imp_per_day"].max(), 300)
y_hat = b0 + b1 * x_grid + b2 * x_grid**2
cost_hat = c * x_grid
plt.figure(figsize=(10, 8))
plt.plot(
stats_f["avg_imp_per_day"], stats_f["orders_mean"],
marker="o", linestyle="-", linewidth=1, alpha=0.3,
label="Среднее число заказов (по точкам)"
)
plt.plot(
stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
color="red", linewidth=2.2,
label="Сглаженный тренд (rolling mean)"
)
plt.plot(
x_grid, y_hat,
color="blue", linewidth=2.5,
label="Квадратичная регрессия (WLS)"
)
plt.plot(
x_grid, cost_hat,
color="black", linestyle="--", linewidth=2,
label="Линейные расходы на показы"
)
if x_star is not None and (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
plt.axvline(x_star, color="blue", linestyle=":", linewidth=2)
plt.scatter([x_star], [y_star], color="blue", zorder=5)
plt.text(x_star, y_star, f" x*={x_star:.2f}", va="bottom")
if x_cross is not None:
y_cross = b0 + b1 * x_cross + b2 * x_cross**2
plt.axvline(x_cross, color="black", linestyle=":", linewidth=2, alpha=0.8)
plt.scatter([x_cross], [y_cross], color="black", zorder=5)
plt.text(x_cross, y_cross, f" пересечение≈{x_cross:.2f}", va="top")
plt.xlabel("Среднее число показов в день")
plt.ylabel("Среднее число заказов")
plt.title("Нелинейный эффект интенсивности коммуникаций: квадратичная регрессия")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
out_dir = project_root / "main_hypot"
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "quad_regression_with_costs.png"
plt.savefig(out_path, dpi=150)
print(f"\nSaved: {out_path}")
if __name__ == "__main__":
main()