good quadreg 0.92 r2
@@ -1,43 +1,66 @@
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
from scipy.signal import savgol_filter
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
from statsmodels.nonparametric.smoothers_lowess import lowess
|
from statsmodels.nonparametric.smoothers_lowess import lowess
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
sns.set_theme(style="whitegrid")
|
sns.set_theme(style="whitegrid")
|
||||||
plt.rcParams["figure.figsize"] = (10, 6)
|
plt.rcParams["figure.figsize"] = (8, 8)
|
||||||
|
|
||||||
project_root = Path(__file__).resolve().parent.parent
|
project_root = Path(__file__).resolve().parent.parent
|
||||||
sys.path.append(str(project_root / "preanalysis_old_bad"))
|
|
||||||
import eda_utils as eda # noqa: E402
|
|
||||||
|
|
||||||
DB_PATH = project_root / "dataset" / "ds.sqlite"
|
DB_PATH = project_root / "dataset" / "ds.sqlite"
|
||||||
OUT_DIR = project_root / "main_hypot"
|
BASE_OUT_DIR = project_root / "main_hypot"
|
||||||
X_COL = "avg_imp_per_day"
|
|
||||||
Y_COL = "orders_amt_total"
|
# Data constants
|
||||||
X_MAX = 18 # обрезаем длинный хвост по показам, чтобы облака было легче читать
|
CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
|
||||||
SCATTER_COLOR = "#2c7bb6"
|
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
|
||||||
|
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
|
||||||
|
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
|
||||||
|
|
||||||
|
# Visualization / cleaning constants
|
||||||
|
X_COL = "avg_imp_per_day" # x всегда фиксирован
|
||||||
|
DEFAULT_X_MAX = 18
|
||||||
|
DEFAULT_SCATTER_COLOR = "#2c7bb6"
|
||||||
|
DEFAULT_POINT_SIZE = 20
|
||||||
|
DEFAULT_ALPHA = 0.08
|
||||||
|
DEFAULT_TREND_ALPHA = 0.1
|
||||||
|
DEFAULT_TREND_FRAC = 0.3
|
||||||
|
DEFAULT_TREND_COLOR = "red"
|
||||||
|
DEFAULT_TREND_LINEWIDTH = 2.5
|
||||||
|
DEFAULT_IQR_K = 1.5
|
||||||
|
DEFAULT_Q_LOW = 0.05
|
||||||
|
DEFAULT_Q_HIGH = 0.95
|
||||||
|
DEFAULT_ALPHA_MIN = 0.04
|
||||||
|
DEFAULT_ALPHA_MAX = 0.7
|
||||||
|
DEFAULT_BINS_X = 60
|
||||||
|
DEFAULT_BINS_Y = 60
|
||||||
|
DEFAULT_Y_MIN = -0.5
|
||||||
|
DEFAULT_Y_MAX = 10
|
||||||
|
DEFAULT_TREND_METHOD = "savgol" # options: lowess, rolling, savgol
|
||||||
|
DEFAULT_ROLLING_WINDOW = 200
|
||||||
|
DEFAULT_SAVGOL_WINDOW = 501
|
||||||
|
DEFAULT_SAVGOL_POLY = 2
|
||||||
|
|
||||||
|
|
||||||
|
def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide two Series element-wise, yielding NaN where the denominator is 0.

    Zero denominators are masked with NaN rather than ``pd.NA``: substituting
    ``pd.NA`` into a NumPy-backed Series degrades it to ``object`` dtype, and
    the quotient would then be ``object`` too, which breaks later numeric
    processing of the column.

    Args:
        numerator: Values to divide.
        denominator: Values to divide by; zeros are treated as missing.

    Returns:
        ``numerator / denominator`` with NaN at the positions where the
        denominator equals zero.
    """
    # ``where`` keeps entries that satisfy the condition and fills the rest with NaN.
    denom = denominator.where(denominator != 0)
    return numerator / denom
|
||||||
|
|
||||||
|
|
||||||
def load_client_level(db_path: Path) -> pd.DataFrame:
|
def load_client_level(db_path: Path) -> pd.DataFrame:
|
||||||
"""Собирает агрегаты по клиентам без усреднения по x."""
|
"""Собирает агрегаты по клиентам без зависимостей от eda_utils."""
|
||||||
conn = sqlite3.connect(db_path)
|
conn = sqlite3.connect(db_path)
|
||||||
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
|
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
for cols, name in [
|
df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
|
||||||
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
|
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
|
||||||
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
|
|
||||||
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
|
|
||||||
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
|
|
||||||
(eda.ORDER_COLS, "orders_amt_total"),
|
|
||||||
]:
|
|
||||||
df[name] = df[cols].sum(axis=1)
|
|
||||||
|
|
||||||
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
|
|
||||||
|
|
||||||
client = (
|
client = (
|
||||||
df.groupby("id")
|
df.groupby("id")
|
||||||
@@ -49,94 +72,503 @@ def load_client_level(db_path: Path) -> pd.DataFrame:
|
|||||||
.reset_index()
|
.reset_index()
|
||||||
)
|
)
|
||||||
|
|
||||||
client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
|
client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
|
||||||
client[Y_COL] = client["orders_amt_total"]
|
print(f"Loaded {len(client)} clients with {X_COL} computed.")
|
||||||
client = client[["id", X_COL, Y_COL]].dropna()
|
return client
|
||||||
|
|
||||||
in_range = client[client[X_COL] <= X_MAX].copy()
|
|
||||||
print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
|
|
||||||
return in_range
|
|
||||||
|
|
||||||
|
|
||||||
def remove_outliers(df: pd.DataFrame, iqr_k: float = 1.5) -> pd.DataFrame:
|
def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
|
||||||
"""Убирает выбросы по IQR отдельно по x и y."""
|
q1, q3 = series.quantile([q_low, q_high])
|
||||||
def bounds(series: pd.Series) -> tuple[float, float]:
|
|
||||||
q1, q3 = series.quantile([0.05, 0.95])
|
|
||||||
iqr = q3 - q1
|
iqr = q3 - q1
|
||||||
return q1 - iqr_k * iqr, q3 + iqr_k * iqr
|
return q1 - iqr_k * iqr, q3 + iqr_k * iqr
|
||||||
|
|
||||||
x_low, x_high = bounds(df[X_COL])
|
|
||||||
y_low, y_high = bounds(df[Y_COL])
|
def remove_outliers(
    df: pd.DataFrame,
    y_col: str,
    x_col: str = X_COL,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
) -> pd.DataFrame:
    """Drop rows that are outliers in x or in y, judged independently per axis.

    For each axis the fences come from ``_bounds`` (quantile spread scaled by
    ``iqr_k``); the lower fence is additionally floored at 0. A row is kept
    only when both its x and its y value lie within their fences (inclusive).

    Args:
        df: Input points.
        y_col: Name of the y column.
        x_col: Name of the x column.
        iqr_k: Multiplier applied to the quantile spread.
        q_low: Lower quantile used for the spread.
        q_high: Upper quantile used for the spread.

    Returns:
        A copy of the rows that passed both filters. The row counts before and
        after are printed.
    """
    x_lower, x_upper = _bounds(df[x_col], q_low, q_high, iqr_k)
    y_lower, y_upper = _bounds(df[y_col], q_low, q_high, iqr_k)

    x_ok = df[x_col].between(max(0, x_lower), x_upper)
    y_ok = df[y_col].between(max(0, y_lower), y_upper)
    filtered = df[x_ok & y_ok].copy()

    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
    return filtered
|
||||||
|
|
||||||
|
|
||||||
|
def compute_density_alpha(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    x_max: float,
    *,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    y_min: float = DEFAULT_Y_MIN,
    y_max_limit: float = DEFAULT_Y_MAX,
) -> np.ndarray:
    """Compute a per-point opacity from the local density on a 2D grid.

    Points are binned on a ``bins_x`` x ``bins_y`` grid. Each point's weight is
    ``(count_in_its_bin / max_count) ** sqrt(1.5)``, clipped to [0, 1], and the
    returned alpha is ``alpha_min + (alpha_max - alpha_min) * weight``.

    Args:
        df: Points to evaluate.
        x_col: Name of the x column.
        y_col: Name of the y column.
        x_max: Upper edge of the x grid.
        bins_x: Number of bins along x.
        bins_y: Number of bins along y.
        alpha_min: Alpha assigned to the sparsest points.
        alpha_max: Alpha assigned to points in the densest bin.
        y_min: Lower edge of the y grid.
        y_max_limit: Cap on the upper edge of the y grid.

    Returns:
        One alpha per row of ``df`` (row order preserved); an empty array when
        ``df`` has no rows.
    """
    x_vals = df[x_col].to_numpy()
    y_vals = df[y_col].to_numpy()

    if len(x_vals) == 0:
        return np.array([])

    # The x grid starts at 0, or lower if the data contains negative values.
    x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
    # The y grid ends at the data maximum, capped by y_max_limit and kept positive.
    y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
    y_edges = np.linspace(y_min, y_upper, bins_y + 1)

    x_bins = np.digitize(x_vals, x_edges) - 1
    y_bins = np.digitize(y_vals, y_edges) - 1

    # Only points that fall strictly inside the grid contribute to the counts.
    inside = (
        (x_bins >= 0) & (x_bins < bins_x) &
        (y_bins >= 0) & (y_bins < bins_y)
    )
    counts = np.zeros((bins_x, bins_y), dtype=int)
    # Unbuffered add: repeated (x, y) index pairs each increment their cell.
    np.add.at(counts, (x_bins[inside], y_bins[inside]), 1)

    # Points outside the grid are not counted themselves, but they borrow the
    # count of the nearest edge bin so that every row still receives an alpha.
    bin_counts = counts[
        np.clip(x_bins, 0, bins_x - 1),
        np.clip(y_bins, 0, bins_y - 1),
    ]
    max_count = bin_counts.max()
    if max_count == 0:
        weight = np.zeros_like(bin_counts, dtype=float)
    else:
        weight = (bin_counts / max_count) ** np.sqrt(1.5)
    weight = np.clip(weight, 0, 1)
    return alpha_min + (alpha_max - alpha_min) * weight
|
||||||
|
|
||||||
|
|
||||||
|
def compute_trend(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = X_COL,
    method: str = DEFAULT_TREND_METHOD,
    lowess_frac: float = DEFAULT_TREND_FRAC,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> Tuple[np.ndarray, np.ndarray]:
    """Return ``(x_sorted, trend_y)`` computed with the selected smoother.

    Rows with a missing x or y are dropped and the rest are sorted by x before
    smoothing.

    Args:
        df: Points to smooth.
        y_col: Name of the y column.
        x_col: Name of the x column.
        method: ``"rolling"``, ``"savgol"`` or ``"lowess"`` (case-insensitive);
            any other value falls back to LOWESS.
        lowess_frac: ``frac`` passed to ``lowess``.
        rolling_window: Window of the centered rolling mean (at least 3, made odd).
        savgol_window: Savitzky-Golay window (at least 5, made odd, and never
            longer than the number of points).
        savgol_poly: Savitzky-Golay polynomial order (kept below the window).

    Returns:
        Two arrays of equal length: sorted x values and the trend at those
        values. Both are empty when no valid rows remain.
    """
    d = df[[x_col, y_col]].dropna().sort_values(x_col)
    x_vals = d[x_col].to_numpy()
    y_vals = d[y_col].to_numpy()

    n_points = len(x_vals)
    if n_points == 0:
        return np.array([]), np.array([])

    m = method.lower()

    if m == "rolling":
        w = max(3, rolling_window)
        if w % 2 == 0:
            w += 1
        y_trend = pd.Series(y_vals).rolling(window=w, center=True, min_periods=1).mean().to_numpy()
        return x_vals, y_trend

    if m == "savgol":
        w = max(5, savgol_window)
        if w % 2 == 0:
            w += 1
        if w > n_points:
            # savgol_filter(mode="interp") rejects a window longer than the
            # data; shrink it to the largest odd length that still fits.
            w = n_points if n_points % 2 else n_points - 1
        poly = min(savgol_poly, w - 1)
        y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
        return x_vals, y_trend

    # "lowess", and the fallback for any unrecognized method name.
    trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
    return trend[:, 0], trend[:, 1]
|
||||||
|
|
||||||
|
|
||||||
|
def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
    """Return a copy of the rows whose ``x_col`` value is at most ``x_max``.

    The total and retained row counts are printed.
    """
    within_limit = df[x_col] <= x_max
    kept = df[within_limit].copy()
    print(f"{len(df)} points; {len(kept)} within x<={x_max}.")
    return kept
|
||||||
|
|
||||||
|
|
||||||
def plot_density_scatter(
|
def plot_density_scatter(
    df: pd.DataFrame,
    y_col: str,
    title: str,
    out_path: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    with_trend: bool = False,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Draw a density-shaded scatter of ``y_col`` against ``x_col`` and save it.

    Per-point opacity comes from ``compute_density_alpha``; the scalar
    ``alpha`` is used only when that returns nothing (empty ``df``). With
    ``with_trend`` a trend line from ``compute_trend`` is overlaid.

    Args:
        df: Points to plot.
        y_col: Name of the y column (also used as the y-axis label).
        title: Figure title.
        out_path: Destination image file; parent directories are created.
        x_col, x_max: x column and the right limit of the x axis.
        scatter_color, point_size, alpha: Marker appearance.
        alpha_min, alpha_max, bins_x, bins_y: Density-shading settings.
        y_min, y_max: Limits of the y axis.
        with_trend: Whether to overlay the trend line.
        trend_method, trend_frac, rolling_window, savgol_window, savgol_poly:
            Settings forwarded to ``compute_trend``.
        trend_color, trend_linewidth: Trend line appearance.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        alpha_values = compute_density_alpha(
            df,
            x_col=x_col,
            y_col=y_col,
            x_max=x_max,
            bins_x=bins_x,
            bins_y=bins_y,
            alpha_min=alpha_min,
            alpha_max=alpha_max,
            y_min=y_min,
            y_max_limit=y_max,
        )
        ax.scatter(
            df[x_col],
            df[y_col],
            color=scatter_color,
            s=point_size,
            alpha=alpha_values if len(alpha_values) else alpha,
            linewidths=0,
        )

        if with_trend:
            tx, ty = compute_trend(
                df,
                y_col=y_col,
                x_col=x_col,
                method=trend_method,
                lowess_frac=trend_frac,
                rolling_window=rolling_window,
                savgol_window=savgol_window,
                savgol_poly=savgol_poly,
            )
            if len(tx):
                ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
                # The trend line is the only labelled artist, so a legend is
                # meaningful only when the line was actually drawn.
                ax.legend()

        ax.set_xlim(0, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_yticks(range(0, int(y_max) + 1, 2))
        ax.set_xlabel("Среднее число показов в день")
        ax.set_ylabel(y_col)
        ax.set_title(title)
        ax.grid(alpha=0.3)

        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    print(f"Saved {out_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def plot_raw_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Plot all points with ``x_col <= x_max`` and save ``scatter.png`` in ``out_dir``.

    Rows with a missing x or y are dropped first. No outlier removal is done.
    The trend settings are forwarded to ``plot_density_scatter``, but
    ``with_trend`` is not set here, so that function's default applies.
    """
    points = df[[x_col, y_col]].dropna()
    points = filter_x_range(points, x_col, x_max)

    render_options = dict(
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    plot_density_scatter(
        points,
        y_col=y_col,
        title=f"Облако: {y_col} vs {x_col} (все клиенты)",
        out_path=out_dir / "scatter.png",
        **render_options,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def plot_clean_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Plot the outlier-free points and save ``scatter_clean.png`` in ``out_dir``.

    Pipeline: drop rows with a missing x or y, keep ``x_col <= x_max``, remove
    outliers with ``remove_outliers``, then draw via ``plot_density_scatter``.
    ``with_trend`` is not set here, so that function's default applies.
    """
    points = df[[x_col, y_col]].dropna()
    points = filter_x_range(points, x_col, x_max)
    cleaned = remove_outliers(
        points,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )

    render_options = dict(
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
        out_path=out_dir / "scatter_clean.png",
        **render_options,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def plot_clean_trend_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_TREND_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
    return_components: bool = False,
) -> "Tuple[None, None, pd.DataFrame] | None":
    """Plot outlier-free points with a trend line; save ``scatter_trend.png``.

    Pipeline: drop rows with a missing x or y, keep ``x_col <= x_max``, remove
    outliers with ``remove_outliers``, then draw via ``plot_density_scatter``
    with ``with_trend=True``.

    Args:
        df: Input points.
        y_col: Name of the y column.
        out_dir: Directory that receives ``scatter_trend.png``.
        return_components: When true, return a 3-tuple instead of ``None``.
        Remaining keyword arguments are forwarded to ``remove_outliers`` and
        ``plot_density_scatter``.

    Returns:
        ``None`` by default. With ``return_components=True``, the tuple
        ``(None, None, cleaned)``: the figure and axes slots are ``None``
        because ``plot_density_scatter`` saves and closes its figure without
        returning it; ``cleaned`` is the DataFrame that was plotted.
    """
    in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
    cleaned = remove_outliers(
        in_range,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
        out_path=out_dir / "scatter_trend.png",
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        with_trend=True,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    if return_components:
        # No figure/axes exist in this scope (referencing them raised
        # NameError); keep the 3-tuple shape and hand back the cleaned data.
        return None, None, cleaned
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def generate_scatter_set(
    df: pd.DataFrame,
    y_col: str,
    *,
    base_out_dir: Path = BASE_OUT_DIR,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    trend_alpha: float = DEFAULT_TREND_ALPHA,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Generate three scatter plots (all points, cleaned, cleaned + trend).

    The images are written to a sub-directory of ``base_out_dir`` named after
    ``y_col`` (with ``/`` replaced by ``_``). The first two plots use
    ``alpha``; the trend plot uses ``trend_alpha``.
    """
    out_dir = base_out_dir / str(y_col).replace("/", "_")

    # Settings every one of the three plotting helpers receives unchanged.
    shared = dict(
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    # Outlier-removal settings, used only by the two "clean" plots.
    outlier_options = dict(iqr_k=iqr_k, q_low=q_low, q_high=q_high)

    plot_raw_scatter(df, y_col=y_col, out_dir=out_dir, alpha=alpha, **shared)
    plot_clean_scatter(
        df,
        y_col=y_col,
        out_dir=out_dir,
        alpha=alpha,
        **outlier_options,
        **shared,
    )
    plot_clean_trend_scatter(
        df,
        y_col=y_col,
        out_dir=out_dir,
        alpha=trend_alpha,
        **outlier_options,
        **shared,
    )
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
    """Load client-level data, report the zero-order share, and plot scatters.

    NOTE(review): the source lost its indentation; the plotting call is
    assumed to sit outside the ``if len(client)`` guard, which appears to
    exist only to avoid dividing by zero in the report line - confirm.
    """
    client = load_client_level(DB_PATH)

    total = len(client)
    n_zero = (client["orders_amt_total"] == 0).sum()
    n_positive = total - n_zero
    if total:
        print(f"orders=0: {n_zero} ({n_zero / total:.2%}); orders>0: {n_positive} ({n_positive / total:.2%})")

    generate_scatter_set(client, y_col="orders_amt_total")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -1,240 +1,352 @@
|
|||||||
import sqlite3
|
|
||||||
from pathlib import Path
|
|
||||||
import sys
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import seaborn as sns
|
|
||||||
|
|
||||||
import statsmodels.api as sm
|
import statsmodels.api as sm
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple, Optional
|
||||||
|
|
||||||
sns.set_theme(style="whitegrid")
|
from sklearn.metrics import r2_score, roc_auc_score
|
||||||
plt.rcParams["figure.figsize"] = (10, 6)
|
|
||||||
|
|
||||||
# -----------------------------
|
import best_model_and_plots as bmp
|
||||||
# Load + feature engineering (как у тебя)
|
|
||||||
# -----------------------------
|
|
||||||
project_root = Path(__file__).resolve().parent.parent
|
|
||||||
sys.path.append(str(project_root / "preanalysis_old_bad"))
|
|
||||||
import eda_utils as eda # noqa: E402
|
|
||||||
|
|
||||||
db_path = project_root / "dataset" / "ds.sqlite"
|
# Inherit constants / visual settings from the scatter script
|
||||||
conn = sqlite3.connect(db_path)
|
X_COL = bmp.X_COL
|
||||||
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
|
DEFAULT_X_MAX = bmp.DEFAULT_X_MAX
|
||||||
conn.close()
|
DEFAULT_Y_MIN = bmp.DEFAULT_Y_MIN
|
||||||
|
DEFAULT_Y_MAX = bmp.DEFAULT_Y_MAX
|
||||||
|
DEFAULT_SCATTER_COLOR = bmp.DEFAULT_SCATTER_COLOR
|
||||||
|
DEFAULT_POINT_SIZE = bmp.DEFAULT_POINT_SIZE
|
||||||
|
DEFAULT_ALPHA = bmp.DEFAULT_ALPHA
|
||||||
|
DEFAULT_ALPHA_MIN = bmp.DEFAULT_ALPHA_MIN
|
||||||
|
DEFAULT_ALPHA_MAX = bmp.DEFAULT_ALPHA_MAX
|
||||||
|
DEFAULT_BINS_X = bmp.DEFAULT_BINS_X
|
||||||
|
DEFAULT_BINS_Y = bmp.DEFAULT_BINS_Y
|
||||||
|
DEFAULT_IQR_K = bmp.DEFAULT_IQR_K
|
||||||
|
DEFAULT_Q_LOW = bmp.DEFAULT_Q_LOW
|
||||||
|
DEFAULT_Q_HIGH = bmp.DEFAULT_Q_HIGH
|
||||||
|
DEFAULT_TREND_FRAC = bmp.DEFAULT_TREND_FRAC
|
||||||
|
DEFAULT_TREND_COLOR = bmp.DEFAULT_TREND_COLOR
|
||||||
|
DEFAULT_TREND_LINEWIDTH = bmp.DEFAULT_TREND_LINEWIDTH
|
||||||
|
BASE_OUT_DIR = bmp.BASE_OUT_DIR
|
||||||
|
|
||||||
for cols, name in [
|
|
||||||
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
|
|
||||||
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
|
|
||||||
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
|
|
||||||
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
|
|
||||||
(eda.ORDER_COLS, "orders_amt_total"),
|
|
||||||
]:
|
|
||||||
df[name] = df[cols].sum(axis=1)
|
|
||||||
|
|
||||||
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
|
def prepare_clean_data(
    y_col: str,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
    """Load client-level data and return the cleaned x, y arrays and DataFrame.

    Data is loaded via ``bmp.load_client_level(bmp.DB_PATH)``; rows with a
    missing x or y are dropped, x is limited to ``x_max`` and outliers are
    removed with ``bmp.remove_outliers``.

    Returns:
        ``(x, y, cleaned)`` where ``x`` and ``y`` are the NumPy values of the
        ``x_col`` and ``y_col`` columns of ``cleaned``.
    """
    clients = bmp.load_client_level(bmp.DB_PATH)
    pairs = clients[[x_col, y_col]].dropna()
    limited = bmp.filter_x_range(pairs, x_col, x_max)
    cleaned = bmp.remove_outliers(
        limited,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    return cleaned[x_col].to_numpy(), cleaned[y_col].to_numpy(), cleaned
|
||||||
|
|
||||||
client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"])
|
|
||||||
client["order_rate_pct"] = 100 * client["order_rate"]
|
|
||||||
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
|
|
||||||
|
|
||||||
# -----------------------------
|
def fit_quadratic(
    x: np.ndarray,
    y_target: np.ndarray,
    weights: Optional[np.ndarray] = None,
) -> Tuple[sm.regression.linear_model.RegressionResultsWrapper, np.ndarray]:
    """Fit ``y_target ~ const + x + x**2`` and predict on the same ``x``.

    Weighted least squares is used when ``weights`` is given, ordinary least
    squares otherwise; both fits use the HC3 covariance estimator.

    Returns:
        ``(model, y_hat)``: the fitted results object and its predictions for
        the training design matrix.
    """
    design = sm.add_constant(np.column_stack([x, x**2]))

    if weights is None:
        estimator = sm.OLS(y_target, design)
    else:
        estimator = sm.WLS(y_target, design, weights=weights)
    fitted = estimator.fit(cov_type="HC3")

    return fitted, fitted.predict(design)
|
||||||
|
|
||||||
|
|
||||||
|
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[Optional[float], Optional[float]]:
    """Return ``(R2, AUC)``, where AUC scores ``y_pred`` against the label ``y_true > 0``.

    AUC is ``None`` when ``roc_auc_score`` raises ``ValueError``.
    """
    r_squared = r2_score(y_true, y_pred)
    is_positive = (y_true > 0).astype(int)
    try:
        auc_score = roc_auc_score(is_positive, y_pred)
    except ValueError:
        auc_score = None
    return r_squared, auc_score
|
||||||
|
|
||||||
|
|
||||||
|
def map_trend_to_points(x_points: np.ndarray, trend_x: np.ndarray, trend_y: np.ndarray) -> np.ndarray:
    """Linearly interpolate the trend curve at ``x_points``.

    The trend is sorted by x first. Points left or right of the trend's range
    take the first or last trend value. An empty trend yields zeros shaped
    like ``x_points``.
    """
    if len(trend_x) == 0:
        return np.zeros_like(x_points)

    # np.interp needs increasing x coordinates, so sort the curve first.
    ascending = np.argsort(trend_x)
    curve_x, curve_y = trend_x[ascending], trend_y[ascending]
    first_value, last_value = curve_y[0], curve_y[-1]
    return np.interp(x_points, curve_x, curve_y, left=first_value, right=last_value)
|
||||||
|
|
||||||
|
|
||||||
|
def density_weights(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
) -> np.ndarray:
    """Build per-point weights from density, using the same scheme as the plot alphas.

    The alphas from ``bmp.compute_density_alpha`` are rescaled by
    ``(alpha - alpha_min) / (alpha_max - alpha_min)`` and floored at 0. When no
    alphas are returned, every row gets weight 1.
    """
    alphas = bmp.compute_density_alpha(
        df,
        x_col=x_col,
        y_col=y_col,
        x_max=x_max,
        bins_x=bins_x,
        bins_y=bins_y,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        y_min=y_min,
        y_max_limit=y_max,
    )
    if len(alphas) == 0:
        return np.ones(len(df))

    # Guard against a zero-width alpha range.
    alpha_span = max(alpha_max - alpha_min, 1e-9)
    return np.clip((alphas - alpha_min) / alpha_span, 0, None)
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Filtering / outlier logic (как у тебя)
|
|
||||||
# -----------------------------
|
|
||||||
K_MULT = 2
|
|
||||||
ABS_DY_MIN = 1
|
|
||||||
X_MAX = 16
|
|
||||||
|
|
||||||
stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
|
def plot_quadratic_overlay(
|
||||||
|
df: pd.DataFrame,
|
||||||
|
model: sm.regression.linear_model.RegressionResultsWrapper,
|
||||||
|
y_col: str,
|
||||||
|
out_path: Path,
|
||||||
|
*,
|
||||||
|
x_col: str = X_COL,
|
||||||
|
x_max: float = DEFAULT_X_MAX,
|
||||||
|
y_min: float = DEFAULT_Y_MIN,
|
||||||
|
y_max: float = DEFAULT_Y_MAX,
|
||||||
|
scatter_color: str = DEFAULT_SCATTER_COLOR,
|
||||||
|
point_size: int = DEFAULT_POINT_SIZE,
|
||||||
|
alpha: float = DEFAULT_ALPHA,
|
||||||
|
alpha_min: float = DEFAULT_ALPHA_MIN,
|
||||||
|
alpha_max: float = DEFAULT_ALPHA_MAX,
|
||||||
|
bins_x: int = DEFAULT_BINS_X,
|
||||||
|
bins_y: int = DEFAULT_BINS_Y,
|
||||||
|
trend_frac: float = DEFAULT_TREND_FRAC,
|
||||||
|
trend_color: str = DEFAULT_TREND_COLOR,
|
||||||
|
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
|
||||||
|
trend_method: str = bmp.DEFAULT_TREND_METHOD,
|
||||||
|
rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
|
||||||
|
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
|
||||||
|
savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
|
||||||
|
) -> None:
|
||||||
|
"""Рисует облако + LOWESS-тренд + линию квадр. регрессии."""
|
||||||
|
fig, ax = bmp.plt.subplots(figsize=(8, 8))
|
||||||
|
alpha_values = bmp.compute_density_alpha(
|
||||||
|
df,
|
||||||
|
x_col=x_col,
|
||||||
|
y_col=y_col,
|
||||||
|
x_max=x_max,
|
||||||
|
bins_x=bins_x,
|
||||||
|
bins_y=bins_y,
|
||||||
|
alpha_min=alpha_min,
|
||||||
|
alpha_max=alpha_max,
|
||||||
|
y_min=y_min,
|
||||||
|
y_max_limit=y_max,
|
||||||
|
)
|
||||||
|
ax.scatter(
|
||||||
|
df[x_col],
|
||||||
|
df[y_col],
|
||||||
|
color=scatter_color,
|
||||||
|
s=point_size,
|
||||||
|
alpha=alpha_values if len(alpha_values) else alpha,
|
||||||
|
linewidths=0,
|
||||||
|
label="Точки (очищено)",
|
||||||
|
)
|
||||||
|
|
||||||
before = len(stats_f)
|
# Тренд по выбранному методу
|
||||||
y = stats_f["orders_mean"]
|
tx, ty = bmp.compute_trend(
|
||||||
abs_dy = y.diff().abs()
|
df,
|
||||||
|
y_col=y_col,
|
||||||
|
x_col=x_col,
|
||||||
|
method=trend_method,
|
||||||
|
lowess_frac=trend_frac,
|
||||||
|
rolling_window=rolling_window,
|
||||||
|
savgol_window=savgol_window,
|
||||||
|
savgol_poly=savgol_poly,
|
||||||
|
)
|
||||||
|
if len(tx):
|
||||||
|
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
|
||||||
|
|
||||||
prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
|
# Квадратичная регрессия
|
||||||
ratio = abs_dy / (prev3_mean.replace(0, np.nan))
|
x_grid = np.linspace(0, x_max, 400)
|
||||||
|
X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
|
||||||
|
y_grid = model.predict(X_grid)
|
||||||
|
ax.plot(x_grid, y_grid, color="blue", linewidth=2.3, linestyle="--", label="Квадр. регрессия")
|
||||||
|
|
||||||
is_outlier = ((abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT)) | (y > 5)
|
ax.set_xlim(0, x_max)
|
||||||
is_outlier = is_outlier.fillna(False)
|
ax.set_ylim(y_min, y_max)
|
||||||
|
ax.set_yticks(range(0, int(y_max) + 1, 2))
|
||||||
|
ax.set_xlabel("Среднее число показов в день")
|
||||||
|
ax.set_ylabel(y_col)
|
||||||
|
ax.set_title(f"Квадратичная регрессия: {y_col} vs {x_col}")
|
||||||
|
ax.grid(alpha=0.3)
|
||||||
|
ax.legend()
|
||||||
|
|
||||||
stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
|
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
after = len(stats_f)
|
fig.tight_layout()
|
||||||
print(f"Фильтрация: было {before}, стало {after}, убрали {before-after} точек")
|
fig.savefig(out_path, dpi=150)
|
||||||
|
bmp.plt.close(fig)
|
||||||
|
print(f"Saved {out_path}")
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Smoothing (оставим для визуалки, но регрессию делаем по orders_mean)
|
|
||||||
# -----------------------------
|
|
||||||
w = max(7, int(len(stats_f) * 0.05))
|
|
||||||
if w % 2 == 0:
|
|
||||||
w += 1
|
|
||||||
|
|
||||||
stats_f["orders_smooth"] = (
|
def report_model(
|
||||||
stats_f["orders_mean"]
|
model: sm.regression.linear_model.RegressionResultsWrapper,
|
||||||
.rolling(window=w, center=True, min_periods=1)
|
r2: Optional[float],
|
||||||
.mean()
|
auc: Optional[float],
|
||||||
)
|
*,
|
||||||
|
r2_trend: Optional[float] = None,
|
||||||
|
) -> None:
|
||||||
|
params = model.params
|
||||||
|
pvals = model.pvalues
|
||||||
|
fmt_p = lambda p: f"<1e-300" if p < 1e-300 else f"{p:.4g}"
|
||||||
|
print("\n=== Квадратичная регрессия (y ~ 1 + x + x^2) ===")
|
||||||
|
print(f"const: {params[0]:.6f} (p={fmt_p(pvals[0])})")
|
||||||
|
print(f"beta1 x: {params[1]:.6f} (p={fmt_p(pvals[1])})")
|
||||||
|
print(f"beta2 x^2: {params[2]:.6f} (p={fmt_p(pvals[2])})")
|
||||||
|
print(f"R2: {r2:.4f}" if r2 is not None else "R2: n/a")
|
||||||
|
if r2_trend is not None:
|
||||||
|
print(f"R2 vs trend target: {r2_trend:.4f}")
|
||||||
|
print(f"AUC (target y>0): {auc:.4f}" if auc is not None else "AUC: n/a (один класс)")
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Cost line (как у тебя, нормировка "в единицах заказов")
|
|
||||||
# -----------------------------
|
|
||||||
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
|
|
||||||
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
|
|
||||||
|
|
||||||
# -----------------------------
|
def generate_quadratic_analysis(
|
||||||
# Quadratic regression: orders_mean ~ 1 + x + x^2
|
y_col: str,
|
||||||
# WLS with weights = n_clients
|
*,
|
||||||
# -----------------------------
|
x_col: str = X_COL,
|
||||||
x = stats_f["avg_imp_per_day"].to_numpy()
|
base_out_dir: Path = BASE_OUT_DIR,
|
||||||
y = stats_f["orders_mean"].to_numpy()
|
config_name: str = "default",
|
||||||
wts = stats_f["n_clients"].to_numpy().astype(float)
|
x_max: float = DEFAULT_X_MAX,
|
||||||
|
y_min: float = DEFAULT_Y_MIN,
|
||||||
|
y_max: float = DEFAULT_Y_MAX,
|
||||||
|
scatter_color: str = DEFAULT_SCATTER_COLOR,
|
||||||
|
point_size: int = DEFAULT_POINT_SIZE,
|
||||||
|
alpha: float = DEFAULT_ALPHA,
|
||||||
|
alpha_min: float = DEFAULT_ALPHA_MIN,
|
||||||
|
alpha_max: float = DEFAULT_ALPHA_MAX,
|
||||||
|
bins_x: int = DEFAULT_BINS_X,
|
||||||
|
bins_y: int = DEFAULT_BINS_Y,
|
||||||
|
trend_frac: float = DEFAULT_TREND_FRAC,
|
||||||
|
trend_color: str = DEFAULT_TREND_COLOR,
|
||||||
|
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
|
||||||
|
iqr_k: float = DEFAULT_IQR_K,
|
||||||
|
q_low: float = DEFAULT_Q_LOW,
|
||||||
|
q_high: float = DEFAULT_Q_HIGH,
|
||||||
|
trend_method: str = bmp.DEFAULT_TREND_METHOD,
|
||||||
|
rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
|
||||||
|
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
|
||||||
|
savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
|
||||||
|
) -> dict:
|
||||||
|
x, y, cleaned_df = prepare_clean_data(
|
||||||
|
y_col,
|
||||||
|
x_col=x_col,
|
||||||
|
x_max=x_max,
|
||||||
|
iqr_k=iqr_k,
|
||||||
|
q_low=q_low,
|
||||||
|
q_high=q_high,
|
||||||
|
)
|
||||||
|
w = density_weights(
|
||||||
|
cleaned_df,
|
||||||
|
y_col=y_col,
|
||||||
|
x_col=x_col,
|
||||||
|
x_max=x_max,
|
||||||
|
alpha_min=alpha_min,
|
||||||
|
alpha_max=alpha_max,
|
||||||
|
bins_x=bins_x,
|
||||||
|
bins_y=bins_y,
|
||||||
|
y_min=y_min,
|
||||||
|
y_max=y_max,
|
||||||
|
)
|
||||||
|
# тренд по выбранному методу
|
||||||
|
tx, ty = bmp.compute_trend(
|
||||||
|
cleaned_df,
|
||||||
|
y_col=y_col,
|
||||||
|
x_col=x_col,
|
||||||
|
method=trend_method,
|
||||||
|
lowess_frac=trend_frac,
|
||||||
|
rolling_window=rolling_window,
|
||||||
|
savgol_window=savgol_window,
|
||||||
|
savgol_poly=savgol_poly,
|
||||||
|
)
|
||||||
|
|
||||||
X = np.column_stack([x, x**2])
|
trend_target = map_trend_to_points(x, tx, ty)
|
||||||
X = sm.add_constant(X) # [1, x, x^2]
|
model, y_hat = fit_quadratic(x, trend_target, weights=w)
|
||||||
|
r2_actual, auc = compute_metrics(y, y_hat)
|
||||||
|
r2_trend = r2_score(trend_target, y_hat) if len(trend_target) else None
|
||||||
|
report_model(model, r2_actual, auc, r2_trend=r2_trend)
|
||||||
|
|
||||||
model = sm.WLS(y, X, weights=wts)
|
out_dir = base_out_dir / config_name / str(y_col).replace("/", "_")
|
||||||
res = model.fit(cov_type="HC3") # робастные ошибки
|
plot_quadratic_overlay(
|
||||||
|
cleaned_df,
|
||||||
|
model,
|
||||||
|
y_col=y_col,
|
||||||
|
out_path=out_dir / "quad_regression.png",
|
||||||
|
x_col=x_col,
|
||||||
|
x_max=x_max,
|
||||||
|
y_min=y_min,
|
||||||
|
y_max=y_max,
|
||||||
|
scatter_color=scatter_color,
|
||||||
|
point_size=point_size,
|
||||||
|
alpha=alpha,
|
||||||
|
alpha_min=alpha_min,
|
||||||
|
alpha_max=alpha_max,
|
||||||
|
bins_x=bins_x,
|
||||||
|
bins_y=bins_y,
|
||||||
|
trend_frac=trend_frac,
|
||||||
|
trend_color=trend_color,
|
||||||
|
trend_linewidth=trend_linewidth,
|
||||||
|
trend_method=trend_method,
|
||||||
|
rolling_window=rolling_window,
|
||||||
|
savgol_window=savgol_window,
|
||||||
|
savgol_poly=savgol_poly,
|
||||||
|
)
|
||||||
|
|
||||||
b0, b1, b2 = res.params
|
return {
|
||||||
p_b1_two = res.pvalues[1]
|
"config": config_name,
|
||||||
p_b2_two = res.pvalues[2]
|
"y_col": y_col,
|
||||||
|
"r2": r2_actual,
|
||||||
|
"r2_trend": r2_trend,
|
||||||
|
"auc": auc,
|
||||||
|
"params": {
|
||||||
|
"trend_method": trend_method,
|
||||||
|
"trend_frac": trend_frac,
|
||||||
|
"rolling_window": rolling_window,
|
||||||
|
"savgol_window": savgol_window,
|
||||||
|
"savgol_poly": savgol_poly,
|
||||||
|
"x_max": x_max,
|
||||||
|
"weights_alpha_range": (alpha_min, alpha_max),
|
||||||
|
},
|
||||||
|
"coeffs": model.params.tolist(),
|
||||||
|
"pvalues": model.pvalues.tolist(),
|
||||||
|
}
|
||||||
|
|
||||||
# one-sided p-values for directional hypotheses
|
|
||||||
p_b1_pos = (p_b1_two / 2) if (b1 > 0) else (1 - p_b1_two / 2)
|
|
||||||
p_b2_neg = (p_b2_two / 2) if (b2 < 0) else (1 - p_b2_two / 2)
|
|
||||||
|
|
||||||
# turning point (if concave)
|
def main() -> None:
|
||||||
x_star = None
|
generate_quadratic_analysis("orders_amt_total")
|
||||||
y_star = None
|
|
||||||
if b2 < 0:
|
|
||||||
x_star = -b1 / (2 * b2)
|
|
||||||
y_star = b0 + b1 * x_star + b2 * x_star**2
|
|
||||||
|
|
||||||
# Intersection with cost line: b0 + b1 x + b2 x^2 = c x -> b2 x^2 + (b1 - c) x + b0 = 0
|
|
||||||
x_cross = None
|
|
||||||
roots = np.roots([b2, (b1 - c), b0]) # may be complex
|
|
||||||
roots = [r.real for r in roots if abs(r.imag) < 1e-8]
|
|
||||||
roots_in_range = [r for r in roots if (stats_f["avg_imp_per_day"].min() <= r <= stats_f["avg_imp_per_day"].max())]
|
|
||||||
if roots_in_range:
|
|
||||||
# берём корень ближе к "правой" части (обычно пересечение интереснее там, где начинается невыгодно)
|
|
||||||
x_cross = max(roots_in_range)
|
|
||||||
|
|
||||||
# -----------------------------
|
if __name__ == "__main__":
|
||||||
# Print results + interpretation (по-человечески)
|
main()
|
||||||
# -----------------------------
|
|
||||||
print("\n=== Квадратичная регрессия (WLS, веса = n_clients, SE = HC3) ===")
|
|
||||||
print(res.summary())
|
|
||||||
|
|
||||||
print("\n=== Проверка гипотезы убывающей отдачи / спада ===")
|
|
||||||
print(f"β1 (линейный эффект): {b1:.6f}, двусторонний p={p_b1_two:.4g}, односторонний p(β1>0)={p_b1_pos:.4g}")
|
|
||||||
print(f"β2 (кривизна): {b2:.6f}, двусторонний p={p_b2_two:.4g}, односторонний p(β2<0)={p_b2_neg:.4g}")
|
|
||||||
|
|
||||||
alpha = 0.05
|
|
||||||
support = (b1 > 0) and (b2 < 0) and (p_b1_pos < alpha) and (p_b2_neg < alpha)
|
|
||||||
|
|
||||||
if support:
|
|
||||||
print("\nВывод: данные поддерживают гипотезу нелинейности.")
|
|
||||||
print("Есть статистически значимый рост на малых x (β1>0) и насыщение/спад (β2<0).")
|
|
||||||
else:
|
|
||||||
print("\nВывод: строгого статистического подтверждения по знакам/значимости может не хватить.")
|
|
||||||
print("Но знак коэффициентов и форма кривой всё равно могут быть согласованы с гипотезой.")
|
|
||||||
print("На защите говори аккуратно: 'наблюдается тенденция/согласуется с гипотезой'.")
|
|
||||||
|
|
||||||
if x_star is not None:
|
|
||||||
print(f"\nОценка 'порога насыщения' (вершина параболы): x* = {x_star:.3f} показов/день")
|
|
||||||
print(f"Прогноз среднего числа заказов в x*: y(x*) ≈ {y_star:.3f}")
|
|
||||||
if not (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
|
|
||||||
print("Внимание: x* вне диапазона наблюдений, интерпретация как 'оптимума' сомнительная.")
|
|
||||||
else:
|
|
||||||
print("\nВершина не считается как максимум: β2 >= 0 (нет выпуклости вниз).")
|
|
||||||
|
|
||||||
if x_cross is not None:
|
|
||||||
y_cross = b0 + b1 * x_cross + b2 * x_cross**2
|
|
||||||
print(f"\nТочка пересечения с линейными расходами (в нормировке c={c:.4f}): x≈{x_cross:.3f}, y≈{y_cross:.3f}")
|
|
||||||
else:
|
|
||||||
print("\nПересечение с линией расходов в выбранной нормировке не найдено (или вне диапазона).")
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Plot: points + smooth + quadratic fit + cost + markers
|
|
||||||
# -----------------------------
|
|
||||||
x_grid = np.linspace(stats_f["avg_imp_per_day"].min(), stats_f["avg_imp_per_day"].max(), 300)
|
|
||||||
y_hat = b0 + b1 * x_grid + b2 * x_grid**2
|
|
||||||
cost_hat = c * x_grid
|
|
||||||
|
|
||||||
plt.figure(figsize=(10, 8))
|
|
||||||
|
|
||||||
plt.plot(
|
|
||||||
stats_f["avg_imp_per_day"], stats_f["orders_mean"],
|
|
||||||
marker="o", linestyle="-", linewidth=1, alpha=0.3,
|
|
||||||
label="Среднее число заказов (по точкам)"
|
|
||||||
)
|
|
||||||
|
|
||||||
plt.plot(
|
|
||||||
stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
|
|
||||||
color="red", linewidth=2.2,
|
|
||||||
label="Сглаженный тренд (rolling mean)"
|
|
||||||
)
|
|
||||||
|
|
||||||
plt.plot(
|
|
||||||
x_grid, y_hat,
|
|
||||||
color="blue", linewidth=2.5,
|
|
||||||
label="Квадратичная регрессия (WLS)"
|
|
||||||
)
|
|
||||||
|
|
||||||
plt.plot(
|
|
||||||
x_grid, cost_hat,
|
|
||||||
color="black", linestyle="--", linewidth=2,
|
|
||||||
label="Линейные расходы на показы"
|
|
||||||
)
|
|
||||||
|
|
||||||
if x_star is not None and (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
|
|
||||||
plt.axvline(x_star, color="blue", linestyle=":", linewidth=2)
|
|
||||||
plt.scatter([x_star], [y_star], color="blue", zorder=5)
|
|
||||||
plt.text(x_star, y_star, f" x*={x_star:.2f}", va="bottom")
|
|
||||||
|
|
||||||
if x_cross is not None:
|
|
||||||
y_cross = b0 + b1 * x_cross + b2 * x_cross**2
|
|
||||||
plt.axvline(x_cross, color="black", linestyle=":", linewidth=2, alpha=0.8)
|
|
||||||
plt.scatter([x_cross], [y_cross], color="black", zorder=5)
|
|
||||||
plt.text(x_cross, y_cross, f" пересечение≈{x_cross:.2f}", va="top")
|
|
||||||
|
|
||||||
plt.xlabel("Среднее число показов в день")
|
|
||||||
plt.ylabel("Среднее число заказов")
|
|
||||||
plt.title("Нелинейный эффект интенсивности коммуникаций: квадратичная регрессия")
|
|
||||||
plt.legend()
|
|
||||||
plt.grid(alpha=0.3)
|
|
||||||
plt.tight_layout()
|
|
||||||
|
|
||||||
out_dir = project_root / "main_hypot"
|
|
||||||
out_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
out_path = out_dir / "quad_regression_with_costs.png"
|
|
||||||
plt.savefig(out_path, dpi=150)
|
|
||||||
print(f"\nSaved: {out_path}")
|
|
||||||
|
|||||||
|
Before Width: | Height: | Size: 119 KiB After Width: | Height: | Size: 119 KiB |
|
Before Width: | Height: | Size: 47 KiB After Width: | Height: | Size: 47 KiB |
|
Before Width: | Height: | Size: 91 KiB After Width: | Height: | Size: 91 KiB |
|
Before Width: | Height: | Size: 422 KiB After Width: | Height: | Size: 422 KiB |
|
Before Width: | Height: | Size: 177 KiB After Width: | Height: | Size: 177 KiB |
|
Before Width: | Height: | Size: 70 KiB After Width: | Height: | Size: 70 KiB |
|
Before Width: | Height: | Size: 122 KiB After Width: | Height: | Size: 122 KiB |
|
Before Width: | Height: | Size: 124 KiB After Width: | Height: | Size: 124 KiB |
|
Before Width: | Height: | Size: 130 KiB After Width: | Height: | Size: 130 KiB |
|
Before Width: | Height: | Size: 405 KiB After Width: | Height: | Size: 405 KiB |
|
Before Width: | Height: | Size: 387 KiB After Width: | Height: | Size: 387 KiB |
|
Before Width: | Height: | Size: 360 KiB After Width: | Height: | Size: 360 KiB |
|
Before Width: | Height: | Size: 256 KiB After Width: | Height: | Size: 256 KiB |
|
Before Width: | Height: | Size: 440 KiB After Width: | Height: | Size: 440 KiB |
|
Before Width: | Height: | Size: 87 KiB After Width: | Height: | Size: 87 KiB |
154
preanalysis/eda_utils.py
Normal file
@@ -0,0 +1,154 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Dataset location (relative path) and the column families built from CATEGORIES
DATA_PATH = Path("dataset/ds.csv")
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]

# One column per category for every metric family
ACTIVE_IMP_COLS = ["active_imp_" + cat for cat in CATEGORIES]
PASSIVE_IMP_COLS = ["passive_imp_" + cat for cat in CATEGORIES]
ACTIVE_CLICK_COLS = ["active_click_" + cat for cat in CATEGORIES]
PASSIVE_CLICK_COLS = ["passive_click_" + cat for cat in CATEGORIES]
ORDER_COLS = ["orders_amt_" + cat for cat in CATEGORIES]

# All numeric columns: the five metric families followed by "age"
NUMERIC_COLS = [
    *ACTIVE_IMP_COLS,
    *PASSIVE_IMP_COLS,
    *ACTIVE_CLICK_COLS,
    *PASSIVE_CLICK_COLS,
    *ORDER_COLS,
    "age",
]
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
||||||
|
|
||||||
|
|
||||||
|
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
|
||||||
|
"""Divide with protection against zero (works for Series and scalars)."""
|
||||||
|
if isinstance(denominator, pd.Series):
|
||||||
|
denom = denominator.replace(0, np.nan)
|
||||||
|
else:
|
||||||
|
denom = np.nan if float(denominator) == 0 else denominator
|
||||||
|
return numerator / denom
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_gender(series: pd.Series) -> pd.Series:
    """Collapse raw gender codes to ``"M"``, ``"F"`` or ``"UNKNOWN"``.

    Values are stringified, stripped and upper-cased before lookup. Missing
    values and any spelling outside the known set become ``"UNKNOWN"``.
    """
    known = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
    as_text = series.fillna("UNKNOWN").astype(str)
    canonical = as_text.str.strip().str.upper()
    return canonical.map(known).fillna("UNKNOWN")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_device(series: pd.Series) -> pd.Series:
    """Map raw device platform strings onto canonical platform names.

    Known platforms (matched case-insensitively, ignoring spaces and
    underscores) become ``"Android"``, ``"iOS"`` or ``"iPadOS"``. Anything
    else is returned stripped and title-cased; missing values become
    ``"Unknown"``.
    """
    canonical = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
    stripped = series.fillna("unknown").astype(str).str.strip()
    # Lookup key: lower-case with spaces and underscores removed
    lookup_key = stripped.str.lower().str.replace(" ", "").str.replace("_", "")
    recognised = lookup_key.map(canonical)
    return recognised.fillna(stripped.str.title())
|
||||||
|
|
||||||
|
|
||||||
|
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Add a categorical ``age_group`` column derived from ``age``.

    Intervals are closed on the left (``right=False``), so an age of 25
    falls into ``"25-34"``. The frame is modified in place and returned.
    """
    edges = [0, 25, 35, 45, 55, np.inf]
    names = ["<25", "25-34", "35-44", "45-54", "55+"]
    age_group = pd.cut(df["age"], bins=edges, labels=names, right=False)
    df["age_group"] = age_group
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-row totals and conversion ratios across all categories.

    Adds summed impression/click/order totals, combined click and impression
    totals, and the CTR / conversion-rate ratios. Ratios use ``safe_divide``,
    so a zero denominator gives NaN. The frame is modified in place and
    returned.
    """
    # Row-wise sums over each column family
    family_totals = {
        "active_imp_total": ACTIVE_IMP_COLS,
        "passive_imp_total": PASSIVE_IMP_COLS,
        "active_click_total": ACTIVE_CLICK_COLS,
        "passive_click_total": PASSIVE_CLICK_COLS,
        "orders_amt_total": ORDER_COLS,
    }
    for target, parts in family_totals.items():
        df[target] = df[parts].sum(axis=1)

    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    # (result column, numerator column, denominator column)
    ratios = (
        ("active_ctr", "active_click_total", "active_imp_total"),
        ("passive_ctr", "passive_click_total", "passive_imp_total"),
        ("ctr_all", "click_total", "imp_total"),
        ("cr_click2order", "orders_amt_total", "click_total"),
        ("cr_imp2order", "orders_amt_total", "imp_total"),
    )
    for target, num_col, den_col in ratios:
        df[target] = safe_divide(df[num_col], df[den_col])
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Add 0/1 activity flags and the count of categories with orders.

    The frame is modified in place and returned.
    """
    active_volume = df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1)
    passive_volume = df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1)
    order_volume = df[ORDER_COLS].sum(axis=1)

    df["has_active_comm"] = (active_volume > 0).astype(int)
    df["has_passive_comm"] = (passive_volume > 0).astype(int)
    df["has_any_order"] = (order_volume > 0).astype(int)
    # Number of categories in which the row has a positive order value
    df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
    return df
|
||||||
|
|
||||||
|
|
||||||
|
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
    """Read the CSV dataset and return it with cleaned and derived columns.

    ``business_dt`` is parsed to datetime, gender and device codes are
    normalised, then age groups, totals/ratios and flags are added.
    """
    raw = pd.read_csv(path)
    raw["business_dt"] = pd.to_datetime(raw["business_dt"])
    raw["gender_cd"] = normalize_gender(raw["gender_cd"])
    raw["device_platform_cd"] = normalize_device(raw["device_platform_cd"])
    # Derived columns, applied in the same order as before
    return add_flags(add_totals(add_age_group(raw)))
|
||||||
|
|
||||||
|
|
||||||
|
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Summarise each requested column, including its share of zero values.

    Returns one row per column with count, mean, median, std, min, quartiles,
    max, the fraction of values equal to zero, and the 95th/99th percentiles.
    """

    def _summarise(col: str) -> dict:
        values = df[col]
        return {
            "col": col,
            "count": values.count(),
            "mean": values.mean(),
            "median": values.median(),
            "std": values.std(),
            "min": values.min(),
            "q25": values.quantile(0.25),
            "q75": values.quantile(0.75),
            "max": values.max(),
            "share_zero": (values == 0).mean(),
            "p95": values.quantile(0.95),
            "p99": values.quantile(0.99),
        }

    return pd.DataFrame([_summarise(col) for col in cols])
|
||||||
|
|
||||||
|
|
||||||
|
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the metric columns per ``business_dt``.

    Sums every impression, click and order column per date, recomputes the
    totals and ratios on the aggregated frame, and adds the weekday name.
    """
    metric_cols = [
        *ACTIVE_IMP_COLS,
        *PASSIVE_IMP_COLS,
        *ACTIVE_CLICK_COLS,
        *PASSIVE_CLICK_COLS,
        *ORDER_COLS,
    ]
    per_day = df.groupby("business_dt")[metric_cols].sum().reset_index()
    per_day = add_totals(per_day)
    per_day["day_of_week"] = per_day["business_dt"].dt.day_name()
    return per_day
|
||||||
|
|
||||||
|
|
||||||
|
def build_client(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the row-level data to one row per client ``id``.

    Metric columns are summed; ``age`` takes the median; ``gender_cd``,
    ``age_group`` and ``device_platform_cd`` take the first mode, with a
    fallback when a client has no non-null value. Totals, flags, the number
    of distinct ``business_dt`` values (``contact_days``), the largest
    single-row impression sum (``max_impressions_per_day``) and the contact
    density are then added.
    """

    def _first_mode_or(default):
        """Build an aggregator returning the first mode, or ``default`` if none exists."""

        def _pick(values: pd.Series):
            modes = values.mode()  # computed once per group
            return default if modes.empty else modes.iat[0]

        return _pick

    metric_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
    agg_spec: Dict[str, object] = {col: "sum" for col in metric_cols}
    agg_spec.update(
        {
            "age": "median",
            "gender_cd": _first_mode_or("UNKNOWN"),
            "age_group": _first_mode_or(np.nan),
            "device_platform_cd": _first_mode_or("Other"),
        }
    )
    client = df.groupby("id").agg(agg_spec).reset_index()

    contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

    # Per-row impression sum, grouped by id directly: avoids copying the whole
    # frame just to hold one temporary column.
    imp_per_row = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
    max_imp_day = imp_per_row.groupby(df["id"]).max().rename("max_impressions_per_day")

    client = add_totals(client)
    client = add_flags(client)
    client = client.merge(contact_days, on="id", how="left")
    client = client.merge(max_imp_day, on="id", how="left")
    # Needs contact_days, which the merge above has just added
    client = add_contact_density(client)
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``avg_impressions_per_contact_day`` when ``contact_days`` is present.

    The value is ``imp_total`` divided by ``contact_days`` via ``safe_divide``.
    Without a ``contact_days`` column the frame is returned unchanged.
    Otherwise it is modified in place and returned.
    """
    if "contact_days" not in df.columns:
        return df
    df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
    return df
|
||||||