good quadreg 0.92 r2

This commit is contained in:
dan
2025-12-14 22:53:28 +03:00
parent 4f8f266c3e
commit 3dc05530c0
19 changed files with 1126 additions and 1516 deletions

View File

@@ -1,43 +1,66 @@
import sqlite3 import sqlite3
from pathlib import Path from pathlib import Path
import sys import sys
from typing import Tuple
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
from statsmodels.nonparametric.smoothers_lowess import lowess from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np
sns.set_theme(style="whitegrid") sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6) plt.rcParams["figure.figsize"] = (8, 8)
project_root = Path(__file__).resolve().parent.parent project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
DB_PATH = project_root / "dataset" / "ds.sqlite" DB_PATH = project_root / "dataset" / "ds.sqlite"
OUT_DIR = project_root / "main_hypot" BASE_OUT_DIR = project_root / "main_hypot"
X_COL = "avg_imp_per_day"
Y_COL = "orders_amt_total" # Константы данных
X_MAX = 18 # обрезаем длинный хвост по показам, чтобы облака было легче читать CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
SCATTER_COLOR = "#2c7bb6" ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
# Per-category column names, derived from CATEGORIES.
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
# Visualization / cleaning constants
X_COL = "avg_imp_per_day"  # x is always fixed
# Upper bound on x: used both to filter points and as the right x-axis limit.
DEFAULT_X_MAX = 18
# Scatter point appearance; DEFAULT_ALPHA is the flat fallback used when no
# per-point density alphas are available.
DEFAULT_SCATTER_COLOR = "#2c7bb6"
DEFAULT_POINT_SIZE = 20
DEFAULT_ALPHA = 0.08
# Defaults for the trend variant of the plot and for the trend line itself.
DEFAULT_TREND_ALPHA = 0.1
DEFAULT_TREND_FRAC = 0.3  # passed to lowess as `frac`
DEFAULT_TREND_COLOR = "red"
DEFAULT_TREND_LINEWIDTH = 2.5
# Outlier removal: bounds are [q_low - k * spread, q_high + k * spread], where
# spread is the distance between the q_low and q_high quantiles.
DEFAULT_IQR_K = 1.5
DEFAULT_Q_LOW = 0.05
DEFAULT_Q_HIGH = 0.95
# Density-driven alpha: per-point alpha is mapped into [ALPHA_MIN, ALPHA_MAX]
# from counts in a BINS_X x BINS_Y grid.
DEFAULT_ALPHA_MIN = 0.04
DEFAULT_ALPHA_MAX = 0.7
DEFAULT_BINS_X = 60
DEFAULT_BINS_Y = 60
# y-axis limits; also bound the y range of the density grid.
DEFAULT_Y_MIN = -0.5
DEFAULT_Y_MAX = 10
# Trend smoothing method and its per-method window parameters.
DEFAULT_TREND_METHOD = "savgol"  # options: lowess, rolling, savgol
DEFAULT_ROLLING_WINDOW = 200
DEFAULT_SAVGOL_WINDOW = 501
DEFAULT_SAVGOL_POLY = 2
def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide two Series element-wise, yielding NaN where the denominator is 0.

    Zeros are replaced with ``np.nan`` rather than ``pd.NA``: substituting
    ``pd.NA`` into an integer Series can coerce it to ``object`` dtype, which
    makes the quotient non-numeric and breaks later quantile/plotting code.
    With ``np.nan`` the denominator (and therefore the result) stays float.

    Args:
        numerator: Values to divide.
        denominator: Values to divide by; zeros are treated as missing.

    Returns:
        ``numerator / denominator`` with NaN at positions where the
        denominator was zero (or already missing).
    """
    denom = denominator.replace(0, np.nan)
    return numerator / denom
def load_client_level(db_path: Path) -> pd.DataFrame: def load_client_level(db_path: Path) -> pd.DataFrame:
"""Собирает агрегаты по клиентам без усреднения по x.""" """Собирает агрегаты по клиентам без зависимостей от eda_utils."""
conn = sqlite3.connect(db_path) conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"]) df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close() conn.close()
for cols, name in [ df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
(eda.ACTIVE_IMP_COLS, "active_imp_total"), df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
client = ( client = (
df.groupby("id") df.groupby("id")
@@ -49,94 +72,503 @@ def load_client_level(db_path: Path) -> pd.DataFrame:
.reset_index() .reset_index()
) )
client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"]) client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
client[Y_COL] = client["orders_amt_total"] print(f"Loaded {len(client)} clients with {X_COL} computed.")
client = client[["id", X_COL, Y_COL]].dropna() return client
in_range = client[client[X_COL] <= X_MAX].copy()
print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
return in_range
def remove_outliers(df: pd.DataFrame, iqr_k: float = 1.5) -> pd.DataFrame: def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
"""Убирает выбросы по IQR отдельно по x и y.""" q1, q3 = series.quantile([q_low, q_high])
def bounds(series: pd.Series) -> tuple[float, float]:
q1, q3 = series.quantile([0.05, 0.95])
iqr = q3 - q1 iqr = q3 - q1
return q1 - iqr_k * iqr, q3 + iqr_k * iqr return q1 - iqr_k * iqr, q3 + iqr_k * iqr
x_low, x_high = bounds(df[X_COL])
y_low, y_high = bounds(df[Y_COL]) def remove_outliers(
df: pd.DataFrame,
y_col: str,
x_col: str = X_COL,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
) -> pd.DataFrame:
"""Убирает выбросы по IQR отдельно по x и y."""
x_low, x_high = _bounds(df[x_col], q_low, q_high, iqr_k)
y_low, y_high = _bounds(df[y_col], q_low, q_high, iqr_k)
filtered = df[ filtered = df[
df[X_COL].between(max(0, x_low), x_high) df[x_col].between(max(0, x_low), x_high)
& df[Y_COL].between(max(0, y_low), y_high) & df[y_col].between(max(0, y_low), y_high)
].copy() ].copy()
print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).") print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
return filtered return filtered
def compute_density_alpha(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    x_max: float,
    *,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    y_min: float = DEFAULT_Y_MIN,
    y_max_limit: float = DEFAULT_Y_MAX,
) -> np.ndarray:
    """Return one alpha per row, scaled by local point density in a 2D grid.

    The (x, y) plane is split into ``bins_x * bins_y`` cells. Each point gets
    the count of its cell, normalised by the largest count seen among the
    points, raised to the power ``sqrt(1.5)`` (about 1.22), and mapped
    linearly into ``[alpha_min, alpha_max]``.

    Points that fall outside the grid (including points exactly on the upper
    x or y edge) are not counted, but still receive the count of the nearest
    edge cell.

    Args:
        df: Data holding ``x_col`` and ``y_col``.
        x_col: Name of the x column.
        y_col: Name of the y column.
        x_max: Upper x edge of the grid.
        bins_x: Number of cells along x.
        bins_y: Number of cells along y.
        alpha_min: Alpha assigned to the sparsest cells.
        alpha_max: Alpha assigned to the densest cell.
        y_min: Lower y edge of the grid.
        y_max_limit: Cap on the upper y edge of the grid.

    Returns:
        Array of alphas aligned with the rows of ``df``; empty if ``df`` is
        empty.
    """
    x_vals = df[x_col].to_numpy()
    y_vals = df[y_col].to_numpy()
    if len(x_vals) == 0:
        return np.array([])

    x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
    # The upper y edge follows the data but never exceeds y_max_limit; the
    # 1e-9 floor keeps it positive when all y values are <= 0.
    y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
    y_edges = np.linspace(y_min, y_upper, bins_y + 1)

    # digitize is 1-based for in-range values; shift to 0-based cell indices.
    x_bins = np.digitize(x_vals, x_edges) - 1
    y_bins = np.digitize(y_vals, y_edges) - 1

    valid = (
        (x_bins >= 0) & (x_bins < bins_x)
        & (y_bins >= 0) & (y_bins < bins_y)
    )

    # Unbuffered accumulation: repeated (x, y) index pairs each add 1, which a
    # plain fancy-indexed "+=" would not do.
    counts = np.zeros((bins_x, bins_y), dtype=int)
    np.add.at(counts, (x_bins[valid], y_bins[valid]), 1)

    # Out-of-grid points borrow the count of the closest edge cell.
    bin_counts = counts[
        np.clip(x_bins, 0, bins_x - 1),
        np.clip(y_bins, 0, bins_y - 1),
    ]

    max_count = bin_counts.max()
    if max_count == 0:
        weight = np.zeros_like(bin_counts, dtype=float)
    else:
        weight = (bin_counts / max_count) ** np.sqrt(1.5)
    weight = np.clip(weight, 0, 1)
    return alpha_min + (alpha_max - alpha_min) * weight
def compute_trend(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = X_COL,
    method: str = DEFAULT_TREND_METHOD,
    lowess_frac: float = DEFAULT_TREND_FRAC,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> Tuple[np.ndarray, np.ndarray]:
    """Return ``(x_sorted, trend_y)`` for the selected smoothing method.

    Rows with a missing x or y are dropped and the rest are sorted by x
    before smoothing.

    Args:
        df: Data holding ``x_col`` and ``y_col``.
        y_col: Name of the y column.
        x_col: Name of the x column.
        method: ``"lowess"``, ``"rolling"`` or ``"savgol"`` (case-insensitive).
            Any other value falls back to LOWESS.
        lowess_frac: ``frac`` passed to ``lowess``.
        rolling_window: Window of the centred rolling mean (at least 3, made
            odd).
        savgol_window: Savitzky-Golay window (at least 5, made odd). It is
            shrunk to the largest odd value not exceeding the number of
            points, because ``mode="interp"`` rejects a window longer than
            the data.
        savgol_poly: Savitzky-Golay polynomial order, capped below the window.

    Returns:
        Two arrays of equal length: sorted x values and the smoothed y values.
        Both are empty when no usable rows remain.
    """
    d = df[[x_col, y_col]].dropna().sort_values(x_col)
    x_vals = d[x_col].to_numpy()
    y_vals = d[y_col].to_numpy()
    if len(x_vals) == 0:
        return np.array([]), np.array([])

    m = method.lower()

    if m == "rolling":
        w = max(3, rolling_window)
        if w % 2 == 0:
            w += 1
        y_trend = pd.Series(y_vals).rolling(window=w, center=True, min_periods=1).mean().to_numpy()
        return x_vals, y_trend

    if m == "savgol":
        n = len(y_vals)
        if n < 3:
            # Too few points to fit a local polynomial; return the data as is.
            return x_vals, y_vals.astype(float)
        w = max(5, savgol_window)
        if w % 2 == 0:
            w += 1
        if w > n:
            # Largest odd window that still fits inside the data.
            w = n if n % 2 == 1 else n - 1
        poly = min(savgol_poly, w - 1)
        y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
        return x_vals, y_trend

    # "lowess" and any unrecognised method both use LOWESS.
    trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
    return trend[:, 0], trend[:, 1]
def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
    """Return a copy of the rows whose ``x_col`` value is at most ``x_max``.

    Prints how many rows were given and how many were kept.
    """
    keep = df[x_col] <= x_max
    subset = df.loc[keep].copy()
    print(f"{len(df)} points; {len(subset)} within x<={x_max}.")
    return subset
def plot_density_scatter( def plot_density_scatter(
df: pd.DataFrame, df: pd.DataFrame,
y_col: str,
title: str, title: str,
out_name: str, out_path: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
with_trend: bool = False, with_trend: bool = False,
alpha: float = 0.08, trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None: ) -> None:
fig, ax = plt.subplots(figsize=(10, 6)) fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot( alpha_values = compute_density_alpha(
data=df, df,
x=X_COL, x_col=x_col,
y=Y_COL, y_col=y_col,
color=SCATTER_COLOR, x_max=x_max,
s=20, bins_x=bins_x,
alpha=alpha, bins_y=bins_y,
linewidth=0, alpha_min=alpha_min,
ax=ax, alpha_max=alpha_max,
y_min=y_min,
y_max_limit=y_max,
)
ax.scatter(
df[x_col],
df[y_col],
color=scatter_color,
s=point_size,
alpha=alpha_values if len(alpha_values) else alpha,
linewidths=0,
) )
if with_trend: if with_trend:
trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True) tx, ty = compute_trend(
ax.plot(trend[:, 0], trend[:, 1], color="red", linewidth=2.5, label="LOWESS тренд") df,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
if len(tx):
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
ax.legend() ax.legend()
ax.set_xlim(0, X_MAX) ax.set_xlim(0, x_max)
ax.set_ylim(bottom=0) ax.set_ylim(y_min, y_max)
ax.set_yticks(range(0, int(y_max) + 1, 2))
ax.set_xlabel("Среднее число показов в день") ax.set_xlabel("Среднее число показов в день")
ax.set_ylabel("Число заказов за период (сумма)") ax.set_ylabel(y_col)
ax.set_title(title) ax.set_title(title)
ax.grid(alpha=0.3) ax.grid(alpha=0.3)
OUT_DIR.mkdir(parents=True, exist_ok=True) out_path.parent.mkdir(parents=True, exist_ok=True)
out_path = OUT_DIR / out_name
fig.tight_layout() fig.tight_layout()
fig.savefig(out_path, dpi=150) fig.savefig(out_path, dpi=150)
plt.close(fig) plt.close(fig)
print(f"Saved {out_path}") print(f"Saved {out_path}")
def plot_raw_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Plot the cloud of all clients with x <= ``x_max`` to ``scatter.png``.

    Rows with a missing x or y are dropped; no outlier removal is applied.
    All styling and trend options are forwarded to ``plot_density_scatter``
    unchanged (``with_trend`` is not passed, so its default applies).
    """
    points = df[[x_col, y_col]].dropna()
    in_range = filter_x_range(points, x_col, x_max)

    # Everything below is forwarded verbatim to the plotting routine.
    forwarded = {
        "x_col": x_col,
        "x_max": x_max,
        "scatter_color": scatter_color,
        "point_size": point_size,
        "alpha": alpha,
        "alpha_min": alpha_min,
        "alpha_max": alpha_max,
        "bins_x": bins_x,
        "bins_y": bins_y,
        "y_min": y_min,
        "y_max": y_max,
        "trend_method": trend_method,
        "trend_frac": trend_frac,
        "trend_color": trend_color,
        "trend_linewidth": trend_linewidth,
        "rolling_window": rolling_window,
        "savgol_window": savgol_window,
        "savgol_poly": savgol_poly,
    }
    plot_density_scatter(
        in_range,
        y_col=y_col,
        title=f"Облако: {y_col} vs {x_col} (все клиенты)",
        out_path=out_dir / "scatter.png",
        **forwarded,
    )
def plot_clean_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Plot the outlier-free cloud to ``scatter_clean.png``.

    Rows with a missing x or y are dropped, x is limited to ``x_max``, and
    ``remove_outliers`` is applied with ``iqr_k``/``q_low``/``q_high`` before
    the remaining points are handed to ``plot_density_scatter``.
    """
    points = df[[x_col, y_col]].dropna()
    in_range = filter_x_range(points, x_col, x_max)
    cleaned = remove_outliers(
        in_range,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )

    # Everything below is forwarded verbatim to the plotting routine.
    forwarded = {
        "x_col": x_col,
        "x_max": x_max,
        "scatter_color": scatter_color,
        "point_size": point_size,
        "alpha": alpha,
        "alpha_min": alpha_min,
        "alpha_max": alpha_max,
        "bins_x": bins_x,
        "bins_y": bins_y,
        "y_min": y_min,
        "y_max": y_max,
        "trend_method": trend_method,
        "trend_frac": trend_frac,
        "trend_color": trend_color,
        "trend_linewidth": trend_linewidth,
        "rolling_window": rolling_window,
        "savgol_window": savgol_window,
        "savgol_poly": savgol_poly,
    }
    plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
        out_path=out_dir / "scatter_clean.png",
        **forwarded,
    )
def plot_clean_trend_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_TREND_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
    return_components: bool = False,
) -> "Tuple[None, None, pd.DataFrame] | None":
    """Plot the outlier-free cloud with a trend line to ``scatter_trend.png``.

    Rows with a missing x or y are dropped, x is limited to ``x_max``, and
    ``remove_outliers`` is applied before plotting with ``with_trend=True``.

    Args:
        df: Client-level data holding ``x_col`` and ``y_col``.
        y_col: Name of the y column.
        out_dir: Directory that receives ``scatter_trend.png``.
        return_components: When true, return a ``(fig, ax, cleaned)`` triple.
            ``plot_density_scatter`` returns nothing, so no figure or axes are
            available here: ``fig`` and ``ax`` are always ``None`` and only
            ``cleaned`` (the filtered DataFrame) carries data.
        Remaining keyword arguments are forwarded to ``remove_outliers`` and
        ``plot_density_scatter``.

    Returns:
        ``None`` by default; ``(None, None, cleaned)`` when
        ``return_components`` is true.
    """
    in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
    cleaned = remove_outliers(
        in_range,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
        out_path=out_dir / "scatter_trend.png",
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        with_trend=True,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    if return_components:
        # Previously this returned undefined names `fig` and `ax` (NameError).
        # Keep the three-element shape so tuple unpacking by callers works.
        return None, None, cleaned
    return None
def generate_scatter_set(
    df: pd.DataFrame,
    y_col: str,
    *,
    base_out_dir: Path = BASE_OUT_DIR,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    trend_alpha: float = DEFAULT_TREND_ALPHA,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Generate three clouds (all, outlier-free, outlier-free + trend).

    The plots are written into a sub-folder of ``base_out_dir`` named after
    ``y_col`` (with ``/`` replaced by ``_``). ``alpha`` is used for the first
    two plots and ``trend_alpha`` for the trend plot.
    """
    out_dir = base_out_dir / str(y_col).replace("/", "_")

    # Options accepted by all three plotting functions.
    shared = {
        "x_col": x_col,
        "x_max": x_max,
        "scatter_color": scatter_color,
        "point_size": point_size,
        "alpha_min": alpha_min,
        "alpha_max": alpha_max,
        "bins_x": bins_x,
        "bins_y": bins_y,
        "y_min": y_min,
        "y_max": y_max,
        "trend_method": trend_method,
        "trend_frac": trend_frac,
        "trend_color": trend_color,
        "trend_linewidth": trend_linewidth,
        "rolling_window": rolling_window,
        "savgol_window": savgol_window,
        "savgol_poly": savgol_poly,
    }
    # Outlier-removal options, only meaningful for the two "clean" plots.
    cleaning = {"iqr_k": iqr_k, "q_low": q_low, "q_high": q_high}

    plot_raw_scatter(df, y_col=y_col, out_dir=out_dir, alpha=alpha, **shared)
    plot_clean_scatter(
        df, y_col=y_col, out_dir=out_dir, alpha=alpha, **cleaning, **shared
    )
    plot_clean_trend_scatter(
        df, y_col=y_col, out_dir=out_dir, alpha=trend_alpha, **cleaning, **shared
    )
def main() -> None: def main() -> None:
client = load_client_level(DB_PATH) client = load_client_level(DB_PATH)
zero_orders = (client["orders_amt_total"] == 0).sum()
plot_density_scatter( non_zero = len(client) - zero_orders
client, if len(client):
title="Облако: заказы vs средние показы в день (все клиенты)", print(f"orders=0: {zero_orders} ({zero_orders / len(client):.2%}); orders>0: {non_zero} ({non_zero / len(client):.2%})")
out_name="orders_vs_avg_imp_scatter.png", generate_scatter_set(client, y_col="orders_amt_total")
)
cleaned = remove_outliers(client)
plot_density_scatter(
cleaned,
title="Облако без выбросов (IQR) заказы vs средние показы в день",
out_name="orders_vs_avg_imp_scatter_clean.png",
)
plot_density_scatter(
cleaned,
title="Облако без выбросов + тренд",
out_name="orders_vs_avg_imp_scatter_trend.png",
with_trend=True,
alpha=0.1,
)
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -1,240 +1,352 @@
import sqlite3
from pathlib import Path
import sys
import numpy as np import numpy as np
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm import statsmodels.api as sm
from pathlib import Path
from typing import Tuple, Optional
sns.set_theme(style="whitegrid") from sklearn.metrics import r2_score, roc_auc_score
plt.rcParams["figure.figsize"] = (10, 6)
# ----------------------------- import best_model_and_plots as bmp
# Load + feature engineering (как у тебя)
# -----------------------------
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
db_path = project_root / "dataset" / "ds.sqlite" # Наследуем константы/визуальные настройки из scatter-скрипта
conn = sqlite3.connect(db_path) X_COL = bmp.X_COL
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"]) DEFAULT_X_MAX = bmp.DEFAULT_X_MAX
conn.close() DEFAULT_Y_MIN = bmp.DEFAULT_Y_MIN
DEFAULT_Y_MAX = bmp.DEFAULT_Y_MAX
DEFAULT_SCATTER_COLOR = bmp.DEFAULT_SCATTER_COLOR
DEFAULT_POINT_SIZE = bmp.DEFAULT_POINT_SIZE
DEFAULT_ALPHA = bmp.DEFAULT_ALPHA
DEFAULT_ALPHA_MIN = bmp.DEFAULT_ALPHA_MIN
DEFAULT_ALPHA_MAX = bmp.DEFAULT_ALPHA_MAX
DEFAULT_BINS_X = bmp.DEFAULT_BINS_X
DEFAULT_BINS_Y = bmp.DEFAULT_BINS_Y
DEFAULT_IQR_K = bmp.DEFAULT_IQR_K
DEFAULT_Q_LOW = bmp.DEFAULT_Q_LOW
DEFAULT_Q_HIGH = bmp.DEFAULT_Q_HIGH
DEFAULT_TREND_FRAC = bmp.DEFAULT_TREND_FRAC
DEFAULT_TREND_COLOR = bmp.DEFAULT_TREND_COLOR
DEFAULT_TREND_LINEWIDTH = bmp.DEFAULT_TREND_LINEWIDTH
BASE_OUT_DIR = bmp.BASE_OUT_DIR
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"] def prepare_clean_data(
df["click_total"] = df["active_click_total"] + df["passive_click_total"] y_col: str,
*,
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days") x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
client = ( iqr_k: float = DEFAULT_IQR_K,
df.groupby("id") q_low: float = DEFAULT_Q_LOW,
.agg( q_high: float = DEFAULT_Q_HIGH,
imp_total=("imp_total", "sum"), ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
click_total=("click_total", "sum"), """Готовит очищенные данные: фильтр по x и IQR, возвращает x, y и DataFrame."""
orders_amt_total=("orders_amt_total", "sum"), df = bmp.load_client_level(bmp.DB_PATH)
age=("age", "median"), base = df[[x_col, y_col]].dropna()
gender_cd=("gender_cd", lambda s: s.mode().iat[0]), in_range = bmp.filter_x_range(base, x_col, x_max)
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]), cleaned = bmp.remove_outliers(
) in_range,
.merge(contact_days, on="id", how="left") y_col=y_col,
.reset_index() x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
) )
x = cleaned[x_col].to_numpy()
y = cleaned[y_col].to_numpy()
return x, y, cleaned
client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"])
client["order_rate_pct"] = 100 * client["order_rate"]
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# ----------------------------- def fit_quadratic(
# Aggregate curve points (как у тебя) x: np.ndarray,
# ----------------------------- y_target: np.ndarray,
stats_imp = ( weights: Optional[np.ndarray] = None,
client.groupby("avg_imp_per_day", as_index=False) ) -> Tuple[sm.regression.linear_model.RegressionResultsWrapper, np.ndarray]:
.agg( """Фитим квадратику по x -> y_target (WLS), предсказываем на тех же x."""
orders_mean=("orders_amt_total", "mean"), X_design = np.column_stack([x, x**2])
n_clients=("id", "count"), X_design = sm.add_constant(X_design)
) if weights is not None:
.sort_values("avg_imp_per_day") model = sm.WLS(y_target, X_design, weights=weights).fit(cov_type="HC3")
).reset_index(drop=True)
# -----------------------------
# Filtering / outlier logic (как у тебя)
# -----------------------------
K_MULT = 2
ABS_DY_MIN = 1
X_MAX = 16
stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
before = len(stats_f)
y = stats_f["orders_mean"]
abs_dy = y.diff().abs()
prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
ratio = abs_dy / (prev3_mean.replace(0, np.nan))
is_outlier = ((abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT)) | (y > 5)
is_outlier = is_outlier.fillna(False)
stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
after = len(stats_f)
print(f"Фильтрация: было {before}, стало {after}, убрали {before-after} точек")
# -----------------------------
# Smoothing (оставим для визуалки, но регрессию делаем по orders_mean)
# -----------------------------
w = max(7, int(len(stats_f) * 0.05))
if w % 2 == 0:
w += 1
stats_f["orders_smooth"] = (
stats_f["orders_mean"]
.rolling(window=w, center=True, min_periods=1)
.mean()
)
# -----------------------------
# Cost line (как у тебя, нормировка "в единицах заказов")
# -----------------------------
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
# -----------------------------
# Quadratic regression: orders_mean ~ 1 + x + x^2
# WLS with weights = n_clients
# -----------------------------
x = stats_f["avg_imp_per_day"].to_numpy()
y = stats_f["orders_mean"].to_numpy()
wts = stats_f["n_clients"].to_numpy().astype(float)
X = np.column_stack([x, x**2])
X = sm.add_constant(X) # [1, x, x^2]
model = sm.WLS(y, X, weights=wts)
res = model.fit(cov_type="HC3") # робастные ошибки
b0, b1, b2 = res.params
p_b1_two = res.pvalues[1]
p_b2_two = res.pvalues[2]
# one-sided p-values for directional hypotheses
p_b1_pos = (p_b1_two / 2) if (b1 > 0) else (1 - p_b1_two / 2)
p_b2_neg = (p_b2_two / 2) if (b2 < 0) else (1 - p_b2_two / 2)
# turning point (if concave)
x_star = None
y_star = None
if b2 < 0:
x_star = -b1 / (2 * b2)
y_star = b0 + b1 * x_star + b2 * x_star**2
# Intersection with cost line: b0 + b1 x + b2 x^2 = c x -> b2 x^2 + (b1 - c) x + b0 = 0
x_cross = None
roots = np.roots([b2, (b1 - c), b0]) # may be complex
roots = [r.real for r in roots if abs(r.imag) < 1e-8]
roots_in_range = [r for r in roots if (stats_f["avg_imp_per_day"].min() <= r <= stats_f["avg_imp_per_day"].max())]
if roots_in_range:
# берём корень ближе к "правой" части (обычно пересечение интереснее там, где начинается невыгодно)
x_cross = max(roots_in_range)
# -----------------------------
# Print results + interpretation (по-человечески)
# -----------------------------
print("\n=== Квадратичная регрессия (WLS, веса = n_clients, SE = HC3) ===")
print(res.summary())
print("\n=== Проверка гипотезы убывающей отдачи / спада ===")
print(f"β1 (линейный эффект): {b1:.6f}, двусторонний p={p_b1_two:.4g}, односторонний p(β1>0)={p_b1_pos:.4g}")
print(f"β2 (кривизна): {b2:.6f}, двусторонний p={p_b2_two:.4g}, односторонний p(β2<0)={p_b2_neg:.4g}")
alpha = 0.05
support = (b1 > 0) and (b2 < 0) and (p_b1_pos < alpha) and (p_b2_neg < alpha)
if support:
print("\nВывод: данные поддерживают гипотезу нелинейности.")
print("Есть статистически значимый рост на малых x (β1>0) и насыщение/спад (β2<0).")
else: else:
print("\nВывод: строгого статистического подтверждения по знакам/значимости может не хватить.") model = sm.OLS(y_target, X_design).fit(cov_type="HC3")
print("Но знак коэффициентов и форма кривой всё равно могут быть согласованы с гипотезой.")
print("На защите говори аккуратно: 'наблюдается тенденция/согласуется с гипотезой'.")
if x_star is not None: y_hat = model.predict(X_design)
print(f"\nОценка 'порога насыщения' (вершина параболы): x* = {x_star:.3f} показов/день") return model, y_hat
print(f"Прогноз среднего числа заказов в x*: y(x*) ≈ {y_star:.3f}")
if not (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
print("Внимание: x* вне диапазона наблюдений, интерпретация как 'оптимума' сомнительная.")
else:
print("\nВершина не считается как максимум: β2 >= 0 (нет выпуклости вниз).")
if x_cross is not None:
y_cross = b0 + b1 * x_cross + b2 * x_cross**2
print(f"\nТочка пересечения с линейными расходами (в нормировке c={c:.4f}): x≈{x_cross:.3f}, y≈{y_cross:.3f}")
else:
print("\nПересечение с линией расходов в выбранной нормировке не найдено (или вне диапазона).")
# ----------------------------- def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[Optional[float], Optional[float]]:
# Plot: points + smooth + quadratic fit + cost + markers """Возвращает (R2, AUC по метке y>0)."""
# ----------------------------- r2 = r2_score(y_true, y_pred)
x_grid = np.linspace(stats_f["avg_imp_per_day"].min(), stats_f["avg_imp_per_day"].max(), 300) auc = None
y_hat = b0 + b1 * x_grid + b2 * x_grid**2 try:
cost_hat = c * x_grid auc = roc_auc_score((y_true > 0).astype(int), y_pred)
except ValueError:
auc = None
return r2, auc
plt.figure(figsize=(10, 8))
plt.plot( def map_trend_to_points(x_points: np.ndarray, trend_x: np.ndarray, trend_y: np.ndarray) -> np.ndarray:
stats_f["avg_imp_per_day"], stats_f["orders_mean"], """Интерполирует значения тренда в точках x_points."""
marker="o", linestyle="-", linewidth=1, alpha=0.3, if len(trend_x) == 0:
label="Среднее число заказов (по точкам)" return np.zeros_like(x_points)
# гарантируем отсортированность
order = np.argsort(trend_x)
tx = trend_x[order]
ty = trend_y[order]
return np.interp(x_points, tx, ty, left=ty[0], right=ty[-1])
def density_weights(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
) -> np.ndarray:
    """Build per-row weights from density (same scheme as the plot alphas).

    The alphas from ``bmp.compute_density_alpha`` are shifted by ``alpha_min``
    and divided by ``alpha_max - alpha_min`` (floored at 1e-9), then negative
    values are raised to zero. If no alphas come back, every row gets
    weight 1.
    """
    density_alpha = bmp.compute_density_alpha(
        df,
        x_col=x_col,
        y_col=y_col,
        x_max=x_max,
        bins_x=bins_x,
        bins_y=bins_y,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        y_min=y_min,
        y_max_limit=y_max,
    )
    if len(density_alpha) == 0:
        return np.ones(len(df))

    span = max(alpha_max - alpha_min, 1e-9)
    scaled = (density_alpha - alpha_min) / span
    return np.clip(scaled, 0, None)
def plot_quadratic_overlay(
df: pd.DataFrame,
model: sm.regression.linear_model.RegressionResultsWrapper,
y_col: str,
out_path: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
trend_method: str = bmp.DEFAULT_TREND_METHOD,
rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
) -> None:
"""Рисует облако + LOWESS-тренд + линию квадр. регрессии."""
fig, ax = bmp.plt.subplots(figsize=(8, 8))
alpha_values = bmp.compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bins_x,
bins_y=bins_y,
alpha_min=alpha_min,
alpha_max=alpha_max,
y_min=y_min,
y_max_limit=y_max,
)
ax.scatter(
df[x_col],
df[y_col],
color=scatter_color,
s=point_size,
alpha=alpha_values if len(alpha_values) else alpha,
linewidths=0,
label="Точки (очищено)",
) )
plt.plot( # Тренд по выбранному методу
stats_f["avg_imp_per_day"], stats_f["orders_smooth"], tx, ty = bmp.compute_trend(
color="red", linewidth=2.2, df,
label="Сглаженный тренд (rolling mean)" y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
if len(tx):
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
# Квадратичная регрессия
x_grid = np.linspace(0, x_max, 400)
X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
y_grid = model.predict(X_grid)
ax.plot(x_grid, y_grid, color="blue", linewidth=2.3, linestyle="--", label="Квадр. регрессия")
ax.set_xlim(0, x_max)
ax.set_ylim(y_min, y_max)
ax.set_yticks(range(0, int(y_max) + 1, 2))
ax.set_xlabel("Среднее число показов в день")
ax.set_ylabel(y_col)
ax.set_title(f"Квадратичная регрессия: {y_col} vs {x_col}")
ax.grid(alpha=0.3)
ax.legend()
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(out_path, dpi=150)
bmp.plt.close(fig)
print(f"Saved {out_path}")
def report_model(
    model: sm.regression.linear_model.RegressionResultsWrapper,
    r2: Optional[float],
    auc: Optional[float],
    *,
    r2_trend: Optional[float] = None,
) -> None:
    """Print coefficients, p-values and fit metrics of the quadratic model.

    Reads the first three entries of ``model.params`` and ``model.pvalues``
    (constant, x, x^2). P-values below 1e-300 are shown as ``<1e-300``.
    ``r2`` and ``auc`` are printed as "n/a" when ``None``; the trend R2 line
    is printed only when ``r2_trend`` is given.
    """

    def fmt_p(p):
        if p < 1e-300:
            return "<1e-300"
        return f"{p:.4g}"

    coefs = model.params
    p_values = model.pvalues

    print("\n=== Квадратичная регрессия (y ~ 1 + x + x^2) ===")
    for label, idx in (("const", 0), ("beta1 x", 1), ("beta2 x^2", 2)):
        print(f"{label}: {coefs[idx]:.6f} (p={fmt_p(p_values[idx])})")

    if r2 is not None:
        print(f"R2: {r2:.4f}")
    else:
        print("R2: n/a")

    if r2_trend is not None:
        print(f"R2 vs trend target: {r2_trend:.4f}")

    if auc is not None:
        print(f"AUC (target y>0): {auc:.4f}")
    else:
        print("AUC: n/a (один класс)")
def generate_quadratic_analysis(
    y_col: str,
    *,
    x_col: str = X_COL,
    base_out_dir: Path = BASE_OUT_DIR,
    config_name: str = "default",
    x_max: float = DEFAULT_X_MAX,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    trend_method: str = bmp.DEFAULT_TREND_METHOD,
    rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
    savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
) -> dict:
    """Run the quadratic-regression pipeline for one target column.

    Steps, in order: clean the data, compute per-point weights, compute a
    smoothed trend, fit the quadratic model to the trend mapped onto the
    points (not to the raw ``y``), print a report, and save the overlay
    plot to ``base_out_dir / config_name / <y_col> / quad_regression.png``
    (``/`` in ``y_col`` is replaced by ``_``).

    Args:
        y_col: Name of the target column.
        x_col: Name of the predictor column.
        base_out_dir: Root directory for generated plots.
        config_name: Sub-directory name and label stored in the result.
        x_max, y_min, y_max: Axis / cleaning limits forwarded to helpers.
        scatter_color, point_size, alpha: Scatter styling for the plot.
        alpha_min, alpha_max, bins_x, bins_y: Forwarded to
            ``density_weights`` and to the plot.
        trend_frac, trend_method, rolling_window, savgol_window,
            savgol_poly: Trend-smoothing settings forwarded to
            ``bmp.compute_trend``.
        trend_color, trend_linewidth: Trend line styling.
        iqr_k, q_low, q_high: Cleaning settings forwarded to
            ``prepare_clean_data``.

    Returns:
        A dict with the config name, target column, ``r2`` (against the
        actual ``y``), ``r2_trend`` (against the trend target, or
        ``None`` when the target is empty), ``auc``, the parameters used,
        and the model coefficients and p-values as lists.
    """
    x, y, cleaned_df = prepare_clean_data(
        y_col,
        x_col=x_col,
        x_max=x_max,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    w = density_weights(
        cleaned_df,
        y_col=y_col,
        x_col=x_col,
        x_max=x_max,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
    )

    # Trend computed with the selected smoothing method.
    tx, ty = bmp.compute_trend(
        cleaned_df,
        y_col=y_col,
        x_col=x_col,
        method=trend_method,
        lowess_frac=trend_frac,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    # The regression target is the trend evaluated at each point's x.
    trend_target = map_trend_to_points(x, tx, ty)
    model, y_hat = fit_quadratic(x, trend_target, weights=w)

    # Quality is reported both against the raw y and against the trend.
    r2_actual, auc = compute_metrics(y, y_hat)
    r2_trend = r2_score(trend_target, y_hat) if len(trend_target) else None
    report_model(model, r2_actual, auc, r2_trend=r2_trend)

    out_dir = base_out_dir / config_name / str(y_col).replace("/", "_")
    plot_quadratic_overlay(
        cleaned_df,
        model,
        y_col=y_col,
        out_path=out_dir / "quad_regression.png",
        x_col=x_col,
        x_max=x_max,
        y_min=y_min,
        y_max=y_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        trend_method=trend_method,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )

    return {
        "config": config_name,
        "y_col": y_col,
        "r2": r2_actual,
        "r2_trend": r2_trend,
        "auc": auc,
        "params": {
            "trend_method": trend_method,
            "trend_frac": trend_frac,
            "rolling_window": rolling_window,
            "savgol_window": savgol_window,
            "savgol_poly": savgol_poly,
            "x_max": x_max,
            "weights_alpha_range": (alpha_min, alpha_max),
        },
        "coeffs": model.params.tolist(),
        "pvalues": model.pvalues.tolist(),
    }
if x_star is not None and (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
plt.axvline(x_star, color="blue", linestyle=":", linewidth=2)
plt.scatter([x_star], [y_star], color="blue", zorder=5)
plt.text(x_star, y_star, f" x*={x_star:.2f}", va="bottom")
def main() -> None:
    """Script entry point: run the analysis for ``orders_amt_total``."""
    generate_quadratic_analysis("orders_amt_total")
plt.axvline(x_cross, color="black", linestyle=":", linewidth=2, alpha=0.8)
plt.scatter([x_cross], [y_cross], color="black", zorder=5)
plt.text(x_cross, y_cross, f" пересечение≈{x_cross:.2f}", va="top")
plt.xlabel("Среднее число показов в день")
plt.ylabel("Среднее число заказов")
plt.title("Нелинейный эффект интенсивности коммуникаций: квадратичная регрессия")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
out_dir = project_root / "main_hypot" if __name__ == "__main__":
out_dir.mkdir(parents=True, exist_ok=True) main()
out_path = out_dir / "quad_regression_with_costs.png"
plt.savefig(out_path, dpi=150)
print(f"\nSaved: {out_path}")

View File

Before

Width:  |  Height:  |  Size: 119 KiB

After

Width:  |  Height:  |  Size: 119 KiB

View File

Before

Width:  |  Height:  |  Size: 47 KiB

After

Width:  |  Height:  |  Size: 47 KiB

View File

Before

Width:  |  Height:  |  Size: 91 KiB

After

Width:  |  Height:  |  Size: 91 KiB

View File

Before

Width:  |  Height:  |  Size: 422 KiB

After

Width:  |  Height:  |  Size: 422 KiB

View File

Before

Width:  |  Height:  |  Size: 177 KiB

After

Width:  |  Height:  |  Size: 177 KiB

View File

Before

Width:  |  Height:  |  Size: 70 KiB

After

Width:  |  Height:  |  Size: 70 KiB

View File

Before

Width:  |  Height:  |  Size: 122 KiB

After

Width:  |  Height:  |  Size: 122 KiB

View File

Before

Width:  |  Height:  |  Size: 124 KiB

After

Width:  |  Height:  |  Size: 124 KiB

View File

Before

Width:  |  Height:  |  Size: 130 KiB

After

Width:  |  Height:  |  Size: 130 KiB

View File

Before

Width:  |  Height:  |  Size: 405 KiB

After

Width:  |  Height:  |  Size: 405 KiB

View File

Before

Width:  |  Height:  |  Size: 387 KiB

After

Width:  |  Height:  |  Size: 387 KiB

View File

Before

Width:  |  Height:  |  Size: 360 KiB

After

Width:  |  Height:  |  Size: 360 KiB

View File

Before

Width:  |  Height:  |  Size: 440 KiB

After

Width:  |  Height:  |  Size: 440 KiB

View File

Before

Width:  |  Height:  |  Size: 87 KiB

After

Width:  |  Height:  |  Size: 87 KiB

154
preanalysis/eda_utils.py Normal file
View File

@@ -0,0 +1,154 @@
from __future__ import annotations
from pathlib import Path
from typing import Dict, Iterable, List
import numpy as np
import pandas as pd
# Paths and column groups
# Default CSV location used by load_data(); the path is relative, so it is
# resolved against the current working directory.
DATA_PATH = Path("dataset/ds.csv")
# Category suffixes shared by every per-category column family below.
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
# Per-category column names, built as "<prefix>_<category>".
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
# All per-category columns plus "age".
NUMERIC_COLS = (
ACTIVE_IMP_COLS
+ PASSIVE_IMP_COLS
+ ACTIVE_CLICK_COLS
+ PASSIVE_CLICK_COLS
+ ORDER_COLS
+ ["age"]
)
# Columns that load_data() passes through normalize_gender / normalize_device.
CAT_COLS = ["gender_cd", "device_platform_cd"]
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
"""Divide with protection against zero (works for Series and scalars)."""
if isinstance(denominator, pd.Series):
denom = denominator.replace(0, np.nan)
else:
denom = np.nan if float(denominator) == 0 else denominator
return numerator / denom
def normalize_gender(series: pd.Series) -> pd.Series:
    """Collapse gender labels to ``"M"``, ``"F"`` or ``"UNKNOWN"``.

    Values are stringified, stripped and upper-cased before lookup;
    missing values and anything outside the lookup table become
    ``"UNKNOWN"``.
    """
    canonical = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
    tokens = series.fillna("UNKNOWN").astype(str)
    tokens = tokens.str.strip().str.upper()
    return tokens.map(canonical).fillna("UNKNOWN")
def normalize_device(series: pd.Series) -> pd.Series:
    """Map device platform labels onto canonical spellings.

    Lookup is done on a lower-cased key with spaces and underscores
    removed. Labels without a canonical form fall back to the stripped
    original in title case; missing values therefore become ``"Unknown"``.
    """
    known = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
    raw = series.fillna("unknown").astype(str).str.strip()
    lookup_key = raw.str.lower().str.replace(" ", "").str.replace("_", "")
    canonical = lookup_key.map(known)
    return canonical.fillna(raw.str.title())
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``age_group`` column bucketing ``age`` into fixed brackets.

    Brackets are left-closed (``right=False``), so 25 falls into
    ``"25-34"``. The frame is modified in place and also returned.
    """
    # (lower bound, label) per bracket; the last bracket is open-ended.
    brackets = ((0, "<25"), (25, "25-34"), (35, "35-44"), (45, "45-54"), (55, "55+"))
    edges = [lower for lower, _ in brackets] + [np.inf]
    names = [label for _, label in brackets]
    df["age_group"] = pd.cut(df["age"], bins=edges, labels=names, right=False)
    return df
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Add cross-category totals and derived rate columns.

    Adds row-wise sums for each column family, combined click and
    impression totals, and CTR / conversion ratios computed with
    ``safe_divide`` (NaN where the denominator is zero). The frame is
    modified in place and also returned.
    """
    # Row-wise sums per column family; order fixes the resulting column order.
    family_totals = (
        ("active_imp_total", ACTIVE_IMP_COLS),
        ("passive_imp_total", PASSIVE_IMP_COLS),
        ("active_click_total", ACTIVE_CLICK_COLS),
        ("passive_click_total", PASSIVE_CLICK_COLS),
        ("orders_amt_total", ORDER_COLS),
    )
    for total_name, members in family_totals:
        df[total_name] = df[members].sum(axis=1)

    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    # (result column, numerator column, denominator column)
    ratios = (
        ("active_ctr", "active_click_total", "active_imp_total"),
        ("passive_ctr", "passive_click_total", "passive_imp_total"),
        ("ctr_all", "click_total", "imp_total"),
        ("cr_click2order", "orders_amt_total", "click_total"),
        ("cr_imp2order", "orders_amt_total", "imp_total"),
    )
    for ratio_name, top, bottom in ratios:
        df[ratio_name] = safe_divide(df[top], df[bottom])
    return df
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Add 0/1 activity flags and the count of categories with orders.

    A flag is 1 when the row-wise sum over the relevant columns is
    positive. The frame is modified in place and also returned.
    """

    def _sum_is_positive(columns: List[str]) -> pd.Series:
        return (df[columns].sum(axis=1) > 0).astype(int)

    df["has_active_comm"] = _sum_is_positive(ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS)
    df["has_passive_comm"] = _sum_is_positive(PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS)
    df["has_any_order"] = _sum_is_positive(ORDER_COLS)
    # Number of order categories with a positive amount in the row.
    df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
    return df
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
    """Read the dataset CSV and return it with all derived columns.

    ``business_dt`` is parsed to datetime, gender and device labels are
    normalised, then age groups, totals and flags are added.
    """
    frame = pd.read_csv(path)
    frame["business_dt"] = pd.to_datetime(frame["business_dt"])
    cleaners = (
        ("gender_cd", normalize_gender),
        ("device_platform_cd", normalize_device),
    )
    for column, cleaner in cleaners:
        frame[column] = cleaner(frame[column])
    return add_flags(add_totals(add_age_group(frame)))
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Summarise each column in ``cols`` with basic stats and its zero share.

    Returns one row per column with count, mean, median, std, min, the
    25th/75th/95th/99th percentiles, max, and the fraction of values
    equal to zero.
    """

    def _profile(name: str) -> dict:
        values = df[name]
        quantile = values.quantile
        return {
            "col": name,
            "count": values.count(),
            "mean": values.mean(),
            "median": values.median(),
            "std": values.std(),
            "min": values.min(),
            "q25": quantile(0.25),
            "q75": quantile(0.75),
            "max": values.max(),
            "share_zero": (values == 0).mean(),
            "p95": quantile(0.95),
            "p99": quantile(0.99),
        }

    return pd.DataFrame([_profile(name) for name in cols])
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate impression, click and order columns per ``business_dt``.

    Returns one row per date with summed per-category columns, the
    totals/ratios added by ``add_totals``, and a ``day_of_week`` name.
    """
    summed_cols = [
        *ACTIVE_IMP_COLS,
        *PASSIVE_IMP_COLS,
        *ACTIVE_CLICK_COLS,
        *PASSIVE_CLICK_COLS,
        *ORDER_COLS,
    ]
    per_day = df.groupby("business_dt")[summed_cols].sum()
    per_day = add_totals(per_day.reset_index())
    per_day["day_of_week"] = per_day["business_dt"].dt.day_name()
    return per_day
def build_client(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the row-level frame to one row per client ``id``.

    Per-category impression/click/order columns are summed; ``age`` is
    taken as the median and the categorical attributes as their most
    frequent value (with a fallback when a group has no mode). Totals,
    flags, ``contact_days`` (distinct ``business_dt`` values per client),
    ``max_impressions_per_day`` (largest single-row impression sum per
    client) and the contact-density column are then attached.

    NOTE(review): "per day" in ``max_impressions_per_day`` presumes one
    input row per client per day -- confirm against the dataset.
    """

    def _mode_or(default):
        """Build an aggregator returning the first mode, or ``default`` if none."""

        def _pick(values: pd.Series):
            # Compute the mode once per group instead of twice.
            modes = values.mode()
            return default if modes.empty else modes.iat[0]

        return _pick

    metric_cols = (
        ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
    )
    agg_spec: Dict[str, object] = {col: "sum" for col in metric_cols}
    agg_spec.update(
        {
            "age": "median",
            "gender_cd": _mode_or("UNKNOWN"),
            "age_group": _mode_or(np.nan),
            "device_platform_cd": _mode_or("Other"),
        }
    )
    client = df.groupby("id").agg(agg_spec).reset_index()

    contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

    # Row-level impression sum grouped by client id; grouping the derived
    # Series directly avoids copying the whole frame for one temp column.
    row_impressions = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
    max_imp_day = row_impressions.groupby(df["id"]).max().rename("max_impressions_per_day")

    client = add_totals(client)
    client = add_flags(client)
    client = client.merge(contact_days, on="id", how="left")
    client = client.merge(max_imp_day, on="id", how="left")
    client = add_contact_density(client)
    return client
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``avg_impressions_per_contact_day`` when ``contact_days`` exists.

    The column is ``imp_total / contact_days`` with NaN where
    ``contact_days`` is zero. If ``contact_days`` is absent the frame is
    returned unchanged. The frame is modified in place and also returned.
    """
    # contact_days must already be present; otherwise this is a no-op.
    if "contact_days" in df.columns:
        df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
    return df

File diff suppressed because one or more lines are too long