good quadreg 0.92 r2

This commit is contained in:
dan
2025-12-14 22:53:28 +03:00
parent 4f8f266c3e
commit 3dc05530c0
19 changed files with 1126 additions and 1516 deletions

View File

@@ -1,43 +1,66 @@
import sqlite3
from pathlib import Path
import sys
from typing import Tuple
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
import pandas as pd
import seaborn as sns
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams["figure.figsize"] = (8, 8)
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
DB_PATH = project_root / "dataset" / "ds.sqlite"
OUT_DIR = project_root / "main_hypot"
X_COL = "avg_imp_per_day"
Y_COL = "orders_amt_total"
X_MAX = 18 # обрезаем длинный хвост по показам, чтобы облака было легче читать
SCATTER_COLOR = "#2c7bb6"
BASE_OUT_DIR = project_root / "main_hypot"
# Константы данных
CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
# Константы визуализации/очистки
X_COL = "avg_imp_per_day" # x всегда фиксирован
DEFAULT_X_MAX = 18
DEFAULT_SCATTER_COLOR = "#2c7bb6"
DEFAULT_POINT_SIZE = 20
DEFAULT_ALPHA = 0.08
DEFAULT_TREND_ALPHA = 0.1
DEFAULT_TREND_FRAC = 0.3
DEFAULT_TREND_COLOR = "red"
DEFAULT_TREND_LINEWIDTH = 2.5
DEFAULT_IQR_K = 1.5
DEFAULT_Q_LOW = 0.05
DEFAULT_Q_HIGH = 0.95
DEFAULT_ALPHA_MIN = 0.04
DEFAULT_ALPHA_MAX = 0.7
DEFAULT_BINS_X = 60
DEFAULT_BINS_Y = 60
DEFAULT_Y_MIN = -0.5
DEFAULT_Y_MAX = 10
DEFAULT_TREND_METHOD = "savgol" # options: lowess, rolling, savgol
DEFAULT_ROLLING_WINDOW = 200
DEFAULT_SAVGOL_WINDOW = 501
DEFAULT_SAVGOL_POLY = 2
def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
denom = denominator.replace(0, pd.NA)
return numerator / denom
def load_client_level(db_path: Path) -> pd.DataFrame:
"""Собирает агрегаты по клиентам без усреднения по x."""
"""Собирает агрегаты по клиентам без зависимостей от eda_utils."""
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
client = (
df.groupby("id")
@@ -49,94 +72,503 @@ def load_client_level(db_path: Path) -> pd.DataFrame:
.reset_index()
)
client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
client[Y_COL] = client["orders_amt_total"]
client = client[["id", X_COL, Y_COL]].dropna()
in_range = client[client[X_COL] <= X_MAX].copy()
print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
return in_range
client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
print(f"Loaded {len(client)} clients with {X_COL} computed.")
return client
def remove_outliers(df: pd.DataFrame, iqr_k: float = 1.5) -> pd.DataFrame:
"""Убирает выбросы по IQR отдельно по x и y."""
def bounds(series: pd.Series) -> tuple[float, float]:
q1, q3 = series.quantile([0.05, 0.95])
def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
q1, q3 = series.quantile([q_low, q_high])
iqr = q3 - q1
return q1 - iqr_k * iqr, q3 + iqr_k * iqr
x_low, x_high = bounds(df[X_COL])
y_low, y_high = bounds(df[Y_COL])
def remove_outliers(
df: pd.DataFrame,
y_col: str,
x_col: str = X_COL,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
) -> pd.DataFrame:
"""Убирает выбросы по IQR отдельно по x и y."""
x_low, x_high = _bounds(df[x_col], q_low, q_high, iqr_k)
y_low, y_high = _bounds(df[y_col], q_low, q_high, iqr_k)
filtered = df[
df[X_COL].between(max(0, x_low), x_high)
& df[Y_COL].between(max(0, y_low), y_high)
df[x_col].between(max(0, x_low), x_high)
& df[y_col].between(max(0, y_low), y_high)
].copy()
print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
return filtered
def compute_density_alpha(
df: pd.DataFrame,
x_col: str,
y_col: str,
x_max: float,
*,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
y_min: float = DEFAULT_Y_MIN,
y_max_limit: float = DEFAULT_Y_MAX,
) -> np.ndarray:
"""Считает насыщенность цвета как квадратичный скейл по плотности в 2D бинах."""
x_vals = df[x_col].to_numpy()
y_vals = df[y_col].to_numpy()
if len(x_vals) == 0:
return np.array([])
x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
y_edges = np.linspace(y_min, y_upper, bins_y + 1)
x_bins = np.digitize(x_vals, x_edges) - 1
y_bins = np.digitize(y_vals, y_edges) - 1
valid = (
(x_bins >= 0) & (x_bins < bins_x) &
(y_bins >= 0) & (y_bins < bins_y)
)
counts = np.zeros((bins_x, bins_y), dtype=int)
for xb, yb in zip(x_bins[valid], y_bins[valid]):
counts[xb, yb] += 1
bin_counts = counts[
np.clip(x_bins, 0, bins_x - 1),
np.clip(y_bins, 0, bins_y - 1),
]
max_count = bin_counts.max() if len(bin_counts) else 1
if max_count == 0:
weight = np.zeros_like(bin_counts, dtype=float)
else:
weight = (bin_counts / max_count) ** np.sqrt(1.5)
weight = np.clip(weight, 0, 1)
return alpha_min + (alpha_max - alpha_min) * weight
def compute_trend(
df: pd.DataFrame,
y_col: str,
*,
x_col: str = X_COL,
method: str = DEFAULT_TREND_METHOD,
lowess_frac: float = DEFAULT_TREND_FRAC,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> Tuple[np.ndarray, np.ndarray]:
"""Возвращает (x_sorted, trend_y) по выбранному методу."""
d = df[[x_col, y_col]].dropna().sort_values(x_col)
x_vals = d[x_col].to_numpy()
y_vals = d[y_col].to_numpy()
if len(x_vals) == 0:
return np.array([]), np.array([])
m = method.lower()
if m == "lowess":
trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
return trend[:, 0], trend[:, 1]
if m == "rolling":
w = max(3, rolling_window)
if w % 2 == 0:
w += 1
y_trend = pd.Series(y_vals).rolling(window=w, center=True, min_periods=1).mean().to_numpy()
return x_vals, y_trend
if m == "savgol":
w = max(5, savgol_window)
if w % 2 == 0:
w += 1
poly = min(savgol_poly, w - 1)
y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
return x_vals, y_trend
# fallback to lowess
trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
return trend[:, 0], trend[:, 1]
def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
subset = df[df[x_col] <= x_max].copy()
print(f"{len(df)} points; {len(subset)} within x<={x_max}.")
return subset
def plot_density_scatter(
df: pd.DataFrame,
y_col: str,
title: str,
out_name: str,
out_path: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
with_trend: bool = False,
alpha: float = 0.08,
trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
data=df,
x=X_COL,
y=Y_COL,
color=SCATTER_COLOR,
s=20,
alpha=alpha,
linewidth=0,
ax=ax,
fig, ax = plt.subplots(figsize=(8, 8))
alpha_values = compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bins_x,
bins_y=bins_y,
alpha_min=alpha_min,
alpha_max=alpha_max,
y_min=y_min,
y_max_limit=y_max,
)
ax.scatter(
df[x_col],
df[y_col],
color=scatter_color,
s=point_size,
alpha=alpha_values if len(alpha_values) else alpha,
linewidths=0,
)
if with_trend:
trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
ax.plot(trend[:, 0], trend[:, 1], color="red", linewidth=2.5, label="LOWESS тренд")
tx, ty = compute_trend(
df,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
if len(tx):
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
ax.legend()
ax.set_xlim(0, X_MAX)
ax.set_ylim(bottom=0)
ax.set_xlim(0, x_max)
ax.set_ylim(y_min, y_max)
ax.set_yticks(range(0, int(y_max) + 1, 2))
ax.set_xlabel("Среднее число показов в день")
ax.set_ylabel("Число заказов за период (сумма)")
ax.set_ylabel(y_col)
ax.set_title(title)
ax.grid(alpha=0.3)
OUT_DIR.mkdir(parents=True, exist_ok=True)
out_path = OUT_DIR / out_name
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(out_path, dpi=150)
plt.close(fig)
print(f"Saved {out_path}")
def plot_raw_scatter(
df: pd.DataFrame,
y_col: str,
out_dir: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
plot_density_scatter(
in_range,
y_col=y_col,
title=f"Облако: {y_col} vs {x_col} (все клиенты)",
out_path=out_dir / "scatter.png",
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
def plot_clean_scatter(
df: pd.DataFrame,
y_col: str,
out_dir: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
cleaned = remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
plot_density_scatter(
cleaned,
y_col=y_col,
title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
out_path=out_dir / "scatter_clean.png",
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
def plot_clean_trend_scatter(
df: pd.DataFrame,
y_col: str,
out_dir: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_TREND_ALPHA,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
return_components: bool = False,
) -> None:
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
cleaned = remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
plot_density_scatter(
cleaned,
y_col=y_col,
title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
out_path=out_dir / "scatter_trend.png",
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
with_trend=True,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
if return_components:
return fig, ax, cleaned
def generate_scatter_set(
df: pd.DataFrame,
y_col: str,
*,
base_out_dir: Path = BASE_OUT_DIR,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
trend_alpha: float = DEFAULT_TREND_ALPHA,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
"""Генерирует три облака (все, без выбросов, без выбросов + тренд) в папку y_col."""
out_dir = base_out_dir / str(y_col).replace("/", "_")
plot_raw_scatter(
df,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
plot_clean_scatter(
df,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
plot_clean_trend_scatter(
df,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=trend_alpha,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
def main() -> None:
client = load_client_level(DB_PATH)
plot_density_scatter(
client,
title="Облако: заказы vs средние показы в день (все клиенты)",
out_name="orders_vs_avg_imp_scatter.png",
)
cleaned = remove_outliers(client)
plot_density_scatter(
cleaned,
title="Облако без выбросов (IQR) заказы vs средние показы в день",
out_name="orders_vs_avg_imp_scatter_clean.png",
)
plot_density_scatter(
cleaned,
title="Облако без выбросов + тренд",
out_name="orders_vs_avg_imp_scatter_trend.png",
with_trend=True,
alpha=0.1,
)
zero_orders = (client["orders_amt_total"] == 0).sum()
non_zero = len(client) - zero_orders
if len(client):
print(f"orders=0: {zero_orders} ({zero_orders / len(client):.2%}); orders>0: {non_zero} ({non_zero / len(client):.2%})")
generate_scatter_set(client, y_col="orders_amt_total")
if __name__ == "__main__":

View File

@@ -1,240 +1,352 @@
import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from pathlib import Path
from typing import Tuple, Optional
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)
from sklearn.metrics import r2_score, roc_auc_score
# -----------------------------
# Load + feature engineering (как у тебя)
# -----------------------------
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
import best_model_and_plots as bmp
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
# Наследуем константы/визуальные настройки из scatter-скрипта
X_COL = bmp.X_COL
DEFAULT_X_MAX = bmp.DEFAULT_X_MAX
DEFAULT_Y_MIN = bmp.DEFAULT_Y_MIN
DEFAULT_Y_MAX = bmp.DEFAULT_Y_MAX
DEFAULT_SCATTER_COLOR = bmp.DEFAULT_SCATTER_COLOR
DEFAULT_POINT_SIZE = bmp.DEFAULT_POINT_SIZE
DEFAULT_ALPHA = bmp.DEFAULT_ALPHA
DEFAULT_ALPHA_MIN = bmp.DEFAULT_ALPHA_MIN
DEFAULT_ALPHA_MAX = bmp.DEFAULT_ALPHA_MAX
DEFAULT_BINS_X = bmp.DEFAULT_BINS_X
DEFAULT_BINS_Y = bmp.DEFAULT_BINS_Y
DEFAULT_IQR_K = bmp.DEFAULT_IQR_K
DEFAULT_Q_LOW = bmp.DEFAULT_Q_LOW
DEFAULT_Q_HIGH = bmp.DEFAULT_Q_HIGH
DEFAULT_TREND_FRAC = bmp.DEFAULT_TREND_FRAC
DEFAULT_TREND_COLOR = bmp.DEFAULT_TREND_COLOR
DEFAULT_TREND_LINEWIDTH = bmp.DEFAULT_TREND_LINEWIDTH
BASE_OUT_DIR = bmp.BASE_OUT_DIR
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
)
.merge(contact_days, on="id", how="left")
.reset_index()
def prepare_clean_data(
y_col: str,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
"""Готовит очищенные данные: фильтр по x и IQR, возвращает x, y и DataFrame."""
df = bmp.load_client_level(bmp.DB_PATH)
base = df[[x_col, y_col]].dropna()
in_range = bmp.filter_x_range(base, x_col, x_max)
cleaned = bmp.remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
x = cleaned[x_col].to_numpy()
y = cleaned[y_col].to_numpy()
return x, y, cleaned
client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"])
client["order_rate_pct"] = 100 * client["order_rate"]
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# -----------------------------
# Aggregate curve points (как у тебя)
# -----------------------------
stats_imp = (
client.groupby("avg_imp_per_day", as_index=False)
.agg(
orders_mean=("orders_amt_total", "mean"),
n_clients=("id", "count"),
)
.sort_values("avg_imp_per_day")
).reset_index(drop=True)
# -----------------------------
# Filtering / outlier logic (как у тебя)
# -----------------------------
K_MULT = 2
ABS_DY_MIN = 1
X_MAX = 16
stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
before = len(stats_f)
y = stats_f["orders_mean"]
abs_dy = y.diff().abs()
prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
ratio = abs_dy / (prev3_mean.replace(0, np.nan))
is_outlier = ((abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT)) | (y > 5)
is_outlier = is_outlier.fillna(False)
stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
after = len(stats_f)
print(f"Фильтрация: было {before}, стало {after}, убрали {before-after} точек")
# -----------------------------
# Smoothing (оставим для визуалки, но регрессию делаем по orders_mean)
# -----------------------------
w = max(7, int(len(stats_f) * 0.05))
if w % 2 == 0:
w += 1
stats_f["orders_smooth"] = (
stats_f["orders_mean"]
.rolling(window=w, center=True, min_periods=1)
.mean()
)
# -----------------------------
# Cost line (как у тебя, нормировка "в единицах заказов")
# -----------------------------
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
# -----------------------------
# Quadratic regression: orders_mean ~ 1 + x + x^2
# WLS with weights = n_clients
# -----------------------------
x = stats_f["avg_imp_per_day"].to_numpy()
y = stats_f["orders_mean"].to_numpy()
wts = stats_f["n_clients"].to_numpy().astype(float)
X = np.column_stack([x, x**2])
X = sm.add_constant(X) # [1, x, x^2]
model = sm.WLS(y, X, weights=wts)
res = model.fit(cov_type="HC3") # робастные ошибки
b0, b1, b2 = res.params
p_b1_two = res.pvalues[1]
p_b2_two = res.pvalues[2]
# one-sided p-values for directional hypotheses
p_b1_pos = (p_b1_two / 2) if (b1 > 0) else (1 - p_b1_two / 2)
p_b2_neg = (p_b2_two / 2) if (b2 < 0) else (1 - p_b2_two / 2)
# turning point (if concave)
x_star = None
y_star = None
if b2 < 0:
x_star = -b1 / (2 * b2)
y_star = b0 + b1 * x_star + b2 * x_star**2
# Intersection with cost line: b0 + b1 x + b2 x^2 = c x -> b2 x^2 + (b1 - c) x + b0 = 0
x_cross = None
roots = np.roots([b2, (b1 - c), b0]) # may be complex
roots = [r.real for r in roots if abs(r.imag) < 1e-8]
roots_in_range = [r for r in roots if (stats_f["avg_imp_per_day"].min() <= r <= stats_f["avg_imp_per_day"].max())]
if roots_in_range:
# берём корень ближе к "правой" части (обычно пересечение интереснее там, где начинается невыгодно)
x_cross = max(roots_in_range)
# -----------------------------
# Print results + interpretation (по-человечески)
# -----------------------------
print("\n=== Квадратичная регрессия (WLS, веса = n_clients, SE = HC3) ===")
print(res.summary())
print("\n=== Проверка гипотезы убывающей отдачи / спада ===")
print(f"β1 (линейный эффект): {b1:.6f}, двусторонний p={p_b1_two:.4g}, односторонний p(β1>0)={p_b1_pos:.4g}")
print(f"β2 (кривизна): {b2:.6f}, двусторонний p={p_b2_two:.4g}, односторонний p(β2<0)={p_b2_neg:.4g}")
alpha = 0.05
support = (b1 > 0) and (b2 < 0) and (p_b1_pos < alpha) and (p_b2_neg < alpha)
if support:
print("\nВывод: данные поддерживают гипотезу нелинейности.")
print("Есть статистически значимый рост на малых x (β1>0) и насыщение/спад (β2<0).")
def fit_quadratic(
x: np.ndarray,
y_target: np.ndarray,
weights: Optional[np.ndarray] = None,
) -> Tuple[sm.regression.linear_model.RegressionResultsWrapper, np.ndarray]:
"""Фитим квадратику по x -> y_target (WLS), предсказываем на тех же x."""
X_design = np.column_stack([x, x**2])
X_design = sm.add_constant(X_design)
if weights is not None:
model = sm.WLS(y_target, X_design, weights=weights).fit(cov_type="HC3")
else:
print("\nВывод: строгого статистического подтверждения по знакам/значимости может не хватить.")
print("Но знак коэффициентов и форма кривой всё равно могут быть согласованы с гипотезой.")
print("На защите говори аккуратно: 'наблюдается тенденция/согласуется с гипотезой'.")
model = sm.OLS(y_target, X_design).fit(cov_type="HC3")
if x_star is not None:
print(f"\nОценка 'порога насыщения' (вершина параболы): x* = {x_star:.3f} показов/день")
print(f"Прогноз среднего числа заказов в x*: y(x*) ≈ {y_star:.3f}")
if not (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
print("Внимание: x* вне диапазона наблюдений, интерпретация как 'оптимума' сомнительная.")
else:
print("\nВершина не считается как максимум: β2 >= 0 (нет выпуклости вниз).")
y_hat = model.predict(X_design)
return model, y_hat
if x_cross is not None:
y_cross = b0 + b1 * x_cross + b2 * x_cross**2
print(f"\nТочка пересечения с линейными расходами (в нормировке c={c:.4f}): x≈{x_cross:.3f}, y≈{y_cross:.3f}")
else:
print("\nПересечение с линией расходов в выбранной нормировке не найдено (или вне диапазона).")
# -----------------------------
# Plot: points + smooth + quadratic fit + cost + markers
# -----------------------------
x_grid = np.linspace(stats_f["avg_imp_per_day"].min(), stats_f["avg_imp_per_day"].max(), 300)
y_hat = b0 + b1 * x_grid + b2 * x_grid**2
cost_hat = c * x_grid
def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[Optional[float], Optional[float]]:
"""Возвращает (R2, AUC по метке y>0)."""
r2 = r2_score(y_true, y_pred)
auc = None
try:
auc = roc_auc_score((y_true > 0).astype(int), y_pred)
except ValueError:
auc = None
return r2, auc
plt.figure(figsize=(10, 8))
plt.plot(
stats_f["avg_imp_per_day"], stats_f["orders_mean"],
marker="o", linestyle="-", linewidth=1, alpha=0.3,
label="Среднее число заказов (по точкам)"
def map_trend_to_points(x_points: np.ndarray, trend_x: np.ndarray, trend_y: np.ndarray) -> np.ndarray:
"""Интерполирует значения тренда в точках x_points."""
if len(trend_x) == 0:
return np.zeros_like(x_points)
# гарантируем отсортированность
order = np.argsort(trend_x)
tx = trend_x[order]
ty = trend_y[order]
return np.interp(x_points, tx, ty, left=ty[0], right=ty[-1])
def density_weights(
df: pd.DataFrame,
y_col: str,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
) -> np.ndarray:
"""Строит веса из плотности (та же схема, что и альфы на графике)."""
alphas = bmp.compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bins_x,
bins_y=bins_y,
alpha_min=alpha_min,
alpha_max=alpha_max,
y_min=y_min,
y_max_limit=y_max,
)
if len(alphas) == 0:
return np.ones(len(df))
denom = max(alpha_max - alpha_min, 1e-9)
weights = (alphas - alpha_min) / denom
weights = np.clip(weights, 0, None)
return weights
def plot_quadratic_overlay(
df: pd.DataFrame,
model: sm.regression.linear_model.RegressionResultsWrapper,
y_col: str,
out_path: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
trend_method: str = bmp.DEFAULT_TREND_METHOD,
rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
) -> None:
"""Рисует облако + LOWESS-тренд + линию квадр. регрессии."""
fig, ax = bmp.plt.subplots(figsize=(8, 8))
alpha_values = bmp.compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bins_x,
bins_y=bins_y,
alpha_min=alpha_min,
alpha_max=alpha_max,
y_min=y_min,
y_max_limit=y_max,
)
ax.scatter(
df[x_col],
df[y_col],
color=scatter_color,
s=point_size,
alpha=alpha_values if len(alpha_values) else alpha,
linewidths=0,
label="Точки (очищено)",
)
plt.plot(
stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
color="red", linewidth=2.2,
label="Сглаженный тренд (rolling mean)"
# Тренд по выбранному методу
tx, ty = bmp.compute_trend(
df,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
if len(tx):
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
# Квадратичная регрессия
x_grid = np.linspace(0, x_max, 400)
X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
y_grid = model.predict(X_grid)
ax.plot(x_grid, y_grid, color="blue", linewidth=2.3, linestyle="--", label="Квадр. регрессия")
ax.set_xlim(0, x_max)
ax.set_ylim(y_min, y_max)
ax.set_yticks(range(0, int(y_max) + 1, 2))
ax.set_xlabel("Среднее число показов в день")
ax.set_ylabel(y_col)
ax.set_title(f"Квадратичная регрессия: {y_col} vs {x_col}")
ax.grid(alpha=0.3)
ax.legend()
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(out_path, dpi=150)
bmp.plt.close(fig)
print(f"Saved {out_path}")
def report_model(
model: sm.regression.linear_model.RegressionResultsWrapper,
r2: Optional[float],
auc: Optional[float],
*,
r2_trend: Optional[float] = None,
) -> None:
params = model.params
pvals = model.pvalues
fmt_p = lambda p: f"<1e-300" if p < 1e-300 else f"{p:.4g}"
print("\n=== Квадратичная регрессия (y ~ 1 + x + x^2) ===")
print(f"const: {params[0]:.6f} (p={fmt_p(pvals[0])})")
print(f"beta1 x: {params[1]:.6f} (p={fmt_p(pvals[1])})")
print(f"beta2 x^2: {params[2]:.6f} (p={fmt_p(pvals[2])})")
print(f"R2: {r2:.4f}" if r2 is not None else "R2: n/a")
if r2_trend is not None:
print(f"R2 vs trend target: {r2_trend:.4f}")
print(f"AUC (target y>0): {auc:.4f}" if auc is not None else "AUC: n/a (один класс)")
def generate_quadratic_analysis(
y_col: str,
*,
x_col: str = X_COL,
base_out_dir: Path = BASE_OUT_DIR,
config_name: str = "default",
x_max: float = DEFAULT_X_MAX,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
trend_method: str = bmp.DEFAULT_TREND_METHOD,
rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
) -> dict:
x, y, cleaned_df = prepare_clean_data(
y_col,
x_col=x_col,
x_max=x_max,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
w = density_weights(
cleaned_df,
y_col=y_col,
x_col=x_col,
x_max=x_max,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
)
# тренд по выбранному методу
tx, ty = bmp.compute_trend(
cleaned_df,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
plt.plot(
x_grid, y_hat,
color="blue", linewidth=2.5,
label="Квадратичная регрессия (WLS)"
trend_target = map_trend_to_points(x, tx, ty)
model, y_hat = fit_quadratic(x, trend_target, weights=w)
r2_actual, auc = compute_metrics(y, y_hat)
r2_trend = r2_score(trend_target, y_hat) if len(trend_target) else None
report_model(model, r2_actual, auc, r2_trend=r2_trend)
out_dir = base_out_dir / config_name / str(y_col).replace("/", "_")
plot_quadratic_overlay(
cleaned_df,
model,
y_col=y_col,
out_path=out_dir / "quad_regression.png",
x_col=x_col,
x_max=x_max,
y_min=y_min,
y_max=y_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
trend_method=trend_method,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
plt.plot(
x_grid, cost_hat,
color="black", linestyle="--", linewidth=2,
label="Линейные расходы на показы"
)
return {
"config": config_name,
"y_col": y_col,
"r2": r2_actual,
"r2_trend": r2_trend,
"auc": auc,
"params": {
"trend_method": trend_method,
"trend_frac": trend_frac,
"rolling_window": rolling_window,
"savgol_window": savgol_window,
"savgol_poly": savgol_poly,
"x_max": x_max,
"weights_alpha_range": (alpha_min, alpha_max),
},
"coeffs": model.params.tolist(),
"pvalues": model.pvalues.tolist(),
}
if x_star is not None and (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
plt.axvline(x_star, color="blue", linestyle=":", linewidth=2)
plt.scatter([x_star], [y_star], color="blue", zorder=5)
plt.text(x_star, y_star, f" x*={x_star:.2f}", va="bottom")
if x_cross is not None:
y_cross = b0 + b1 * x_cross + b2 * x_cross**2
plt.axvline(x_cross, color="black", linestyle=":", linewidth=2, alpha=0.8)
plt.scatter([x_cross], [y_cross], color="black", zorder=5)
plt.text(x_cross, y_cross, f" пересечение≈{x_cross:.2f}", va="top")
def main() -> None:
generate_quadratic_analysis("orders_amt_total")
plt.xlabel("Среднее число показов в день")
plt.ylabel("Среднее число заказов")
plt.title("Нелинейный эффект интенсивности коммуникаций: квадратичная регрессия")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
out_dir = project_root / "main_hypot"
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "quad_regression_with_costs.png"
plt.savefig(out_path, dpi=150)
print(f"\nSaved: {out_path}")
if __name__ == "__main__":
main()

View File

Before

Width:  |  Height:  |  Size: 119 KiB

After

Width:  |  Height:  |  Size: 119 KiB

View File

Before

Width:  |  Height:  |  Size: 47 KiB

After

Width:  |  Height:  |  Size: 47 KiB

View File

Before

Width:  |  Height:  |  Size: 91 KiB

After

Width:  |  Height:  |  Size: 91 KiB

View File

Before

Width:  |  Height:  |  Size: 422 KiB

After

Width:  |  Height:  |  Size: 422 KiB

View File

Before

Width:  |  Height:  |  Size: 177 KiB

After

Width:  |  Height:  |  Size: 177 KiB

View File

Before

Width:  |  Height:  |  Size: 70 KiB

After

Width:  |  Height:  |  Size: 70 KiB

View File

Before

Width:  |  Height:  |  Size: 122 KiB

After

Width:  |  Height:  |  Size: 122 KiB

View File

Before

Width:  |  Height:  |  Size: 124 KiB

After

Width:  |  Height:  |  Size: 124 KiB

View File

Before

Width:  |  Height:  |  Size: 130 KiB

After

Width:  |  Height:  |  Size: 130 KiB

View File

Before

Width:  |  Height:  |  Size: 405 KiB

After

Width:  |  Height:  |  Size: 405 KiB

View File

Before

Width:  |  Height:  |  Size: 387 KiB

After

Width:  |  Height:  |  Size: 387 KiB

View File

Before

Width:  |  Height:  |  Size: 360 KiB

After

Width:  |  Height:  |  Size: 360 KiB

View File

Before

Width:  |  Height:  |  Size: 440 KiB

After

Width:  |  Height:  |  Size: 440 KiB

View File

Before

Width:  |  Height:  |  Size: 87 KiB

After

Width:  |  Height:  |  Size: 87 KiB

154
preanalysis/eda_utils.py Normal file
View File

@@ -0,0 +1,154 @@
from __future__ import annotations
from pathlib import Path
from typing import Dict, Iterable, List
import numpy as np
import pandas as pd
# Paths and column groups
DATA_PATH = Path("dataset/ds.csv")
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
NUMERIC_COLS = (
ACTIVE_IMP_COLS
+ PASSIVE_IMP_COLS
+ ACTIVE_CLICK_COLS
+ PASSIVE_CLICK_COLS
+ ORDER_COLS
+ ["age"]
)
CAT_COLS = ["gender_cd", "device_platform_cd"]
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
"""Divide with protection against zero (works for Series and scalars)."""
if isinstance(denominator, pd.Series):
denom = denominator.replace(0, np.nan)
else:
denom = np.nan if float(denominator) == 0 else denominator
return numerator / denom
def normalize_gender(series: pd.Series) -> pd.Series:
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
return cleaned.map(mapping).fillna("UNKNOWN")
def normalize_device(series: pd.Series) -> pd.Series:
cleaned = series.fillna("unknown").astype(str).str.strip()
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
mapped = lowered.map(mapping)
fallback = cleaned.str.title()
return mapped.fillna(fallback)
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
bins = [0, 25, 35, 45, 55, np.inf]
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
return df
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1)
df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1)
df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1)
df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1)
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
return df
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
return df
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
df = pd.read_csv(path)
df["business_dt"] = pd.to_datetime(df["business_dt"])
df["gender_cd"] = normalize_gender(df["gender_cd"])
df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
df = add_age_group(df)
df = add_totals(df)
df = add_flags(df)
return df
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
stats = []
for col in cols:
series = df[col]
stats.append(
{
"col": col,
"count": series.count(),
"mean": series.mean(),
"median": series.median(),
"std": series.std(),
"min": series.min(),
"q25": series.quantile(0.25),
"q75": series.quantile(0.75),
"max": series.max(),
"share_zero": (series == 0).mean(),
"p95": series.quantile(0.95),
"p99": series.quantile(0.99),
}
)
return pd.DataFrame(stats)
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
daily = df.groupby("business_dt")[agg_cols].sum().reset_index()
daily = add_totals(daily)
daily["day_of_week"] = daily["business_dt"].dt.day_name()
return daily
def build_client(df: pd.DataFrame) -> pd.DataFrame:
agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
meta_spec: Dict[str, str | callable] = {
"age": "median",
"gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
"age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
"device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
}
agg_spec.update(meta_spec)
client = df.groupby("id").agg(agg_spec).reset_index()
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
imp_day = df.copy()
imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
client = add_totals(client)
client = add_flags(client)
client = client.merge(contact_days, on="id", how="left")
client = client.merge(max_imp_day, on="id", how="left")
client = add_contact_density(client)
return client
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
# contact_days must already be present
if "contact_days" in df.columns:
df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
return df
return df

File diff suppressed because one or more lines are too long