diff --git a/main_hypot/best_model_and_plots.py b/main_hypot/best_model_and_plots.py
index 7114769..1be2f16 100644
--- a/main_hypot/best_model_and_plots.py
+++ b/main_hypot/best_model_and_plots.py
@@ -1,43 +1,66 @@
import sqlite3
from pathlib import Path
import sys
+from typing import Tuple
import matplotlib.pyplot as plt
+from scipy.signal import savgol_filter
import pandas as pd
import seaborn as sns
from statsmodels.nonparametric.smoothers_lowess import lowess
+import numpy as np
sns.set_theme(style="whitegrid")
-plt.rcParams["figure.figsize"] = (10, 6)
+# Default figure size is square, same as the explicit figsize=(8, 8) used by plot_density_scatter.
+plt.rcParams["figure.figsize"] = (8, 8)
project_root = Path(__file__).resolve().parent.parent
-sys.path.append(str(project_root / "preanalysis_old_bad"))
-import eda_utils as eda # noqa: E402
-
DB_PATH = project_root / "dataset" / "ds.sqlite"
-OUT_DIR = project_root / "main_hypot"
-X_COL = "avg_imp_per_day"
-Y_COL = "orders_amt_total"
-X_MAX = 18 # обрезаем длинный хвост по показам, чтобы облака было легче читать
-SCATTER_COLOR = "#2c7bb6"
+# Root folder for generated plots; generate_scatter_set creates one subfolder per y column.
+BASE_OUT_DIR = project_root / "main_hypot"
+
+# Data constants: per-category column names that load_client_level sums per row.
+CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
+ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
+PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
+ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
+
+# Visualization / cleaning constants
+X_COL = "avg_imp_per_day" # x is always fixed
+DEFAULT_X_MAX = 18
+DEFAULT_SCATTER_COLOR = "#2c7bb6"
+DEFAULT_POINT_SIZE = 20
+# Flat opacity, used only as a fallback when no per-point alpha array is available.
+DEFAULT_ALPHA = 0.08
+DEFAULT_TREND_ALPHA = 0.1
+DEFAULT_TREND_FRAC = 0.3
+DEFAULT_TREND_COLOR = "red"
+DEFAULT_TREND_LINEWIDTH = 2.5
+# Outlier fences: the two quantiles below are widened by DEFAULT_IQR_K times their spread (see _bounds).
+DEFAULT_IQR_K = 1.5
+DEFAULT_Q_LOW = 0.05
+DEFAULT_Q_HIGH = 0.95
+# Range of the per-point opacity returned by compute_density_alpha.
+DEFAULT_ALPHA_MIN = 0.04
+DEFAULT_ALPHA_MAX = 0.7
+# Number of 2D bins along each axis used for the density estimate.
+DEFAULT_BINS_X = 60
+DEFAULT_BINS_Y = 60
+# Default y-axis limits of the plots.
+DEFAULT_Y_MIN = -0.5
+DEFAULT_Y_MAX = 10
+DEFAULT_TREND_METHOD = "savgol" # options: lowess, rolling, savgol
+# Window sizes are counted in points (rows sorted by x), not in x units.
+DEFAULT_ROLLING_WINDOW = 200
+DEFAULT_SAVGOL_WINDOW = 501
+DEFAULT_SAVGOL_POLY = 2
+
+
+def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
+    """Divide two Series element-wise, yielding NaN where the denominator is 0.
+
+    Zero denominators are masked with ``Series.where`` rather than replaced by
+    ``pd.NA``: substituting ``pd.NA`` into a plain numeric Series can turn it
+    into object dtype, and the object-dtype quotient then breaks downstream
+    numeric code (quantiles, ``to_numpy``). Masking keeps the result a float
+    Series whose missing entries are ordinary NaN, so ``dropna()`` still works.
+
+    Args:
+        numerator: values to divide.
+        denominator: divisors; entries equal to 0 produce NaN in the result.
+
+    Returns:
+        The element-wise quotient, aligned on the index as usual for pandas.
+    """
+    # where() keeps entries satisfying the condition and sets the rest to NaN.
+    denom = denominator.where(denominator != 0)
+    return numerator / denom
def load_client_level(db_path: Path) -> pd.DataFrame:
- """Собирает агрегаты по клиентам без усреднения по x."""
+ """Собирает агрегаты по клиентам без зависимостей от eda_utils."""
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
- for cols, name in [
- (eda.ACTIVE_IMP_COLS, "active_imp_total"),
- (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
- (eda.ACTIVE_CLICK_COLS, "active_click_total"),
- (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
- (eda.ORDER_COLS, "orders_amt_total"),
- ]:
- df[name] = df[cols].sum(axis=1)
-
- df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
+ df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
+ df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
client = (
df.groupby("id")
@@ -49,94 +72,503 @@ def load_client_level(db_path: Path) -> pd.DataFrame:
.reset_index()
)
- client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
- client[Y_COL] = client["orders_amt_total"]
- client = client[["id", X_COL, Y_COL]].dropna()
-
- in_range = client[client[X_COL] <= X_MAX].copy()
- print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
- return in_range
+ client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
+ print(f"Loaded {len(client)} clients with {X_COL} computed.")
+ return client
-def remove_outliers(df: pd.DataFrame, iqr_k: float = 1.5) -> pd.DataFrame:
+def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
+    """Return (lower, upper) fences for ``series``.
+
+    The fences are the ``q_low`` and ``q_high`` quantiles pushed outwards by
+    ``iqr_k`` times the distance between those two quantiles.
+    """
+    quantiles = series.quantile([q_low, q_high])
+    lower_q = quantiles.iloc[0]
+    upper_q = quantiles.iloc[1]
+    spread = upper_q - lower_q
+    return lower_q - iqr_k * spread, upper_q + iqr_k * spread
+
+
+def remove_outliers(
+ df: pd.DataFrame,
+ y_col: str,
+ x_col: str = X_COL,
+ iqr_k: float = DEFAULT_IQR_K,
+ q_low: float = DEFAULT_Q_LOW,
+ q_high: float = DEFAULT_Q_HIGH,
+) -> pd.DataFrame:
- """Убирает выбросы по IQR отдельно по x и y."""
+ """Drop outliers using IQR-style fences computed separately for x and y.
+
+ The fences come from _bounds(): the q_low / q_high quantiles of each column
+ widened by iqr_k times their spread. A row is kept only when both its x and
+ y values lie inside their fences. Returns a filtered copy; df is not modified.
+ """
- def bounds(series: pd.Series) -> tuple[float, float]:
- q1, q3 = series.quantile([0.05, 0.95])
- iqr = q3 - q1
- return q1 - iqr_k * iqr, q3 + iqr_k * iqr
-
- x_low, x_high = bounds(df[X_COL])
- y_low, y_high = bounds(df[Y_COL])
+ x_low, x_high = _bounds(df[x_col], q_low, q_high, iqr_k)
+ y_low, y_high = _bounds(df[y_col], q_low, q_high, iqr_k)
+ # The lower fence is floored at 0, so negative values are always dropped.
filtered = df[
- df[X_COL].between(max(0, x_low), x_high)
- & df[Y_COL].between(max(0, y_low), y_high)
+ df[x_col].between(max(0, x_low), x_high)
+ & df[y_col].between(max(0, y_low), y_high)
].copy()
- print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
+ print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
return filtered
+def compute_density_alpha(
+    df: pd.DataFrame,
+    x_col: str,
+    y_col: str,
+    x_max: float,
+    *,
+    bins_x: int = DEFAULT_BINS_X,
+    bins_y: int = DEFAULT_BINS_Y,
+    alpha_min: float = DEFAULT_ALPHA_MIN,
+    alpha_max: float = DEFAULT_ALPHA_MAX,
+    y_min: float = DEFAULT_Y_MIN,
+    y_max_limit: float = DEFAULT_Y_MAX,
+) -> np.ndarray:
+    """Compute a per-point opacity from the local point density on a 2D grid.
+
+    Points are binned on a ``bins_x`` x ``bins_y`` grid. Each point gets the
+    count of its bin, normalised by the largest count seen and raised to the
+    power ``sqrt(1.5)`` (about 1.22, i.e. slightly steeper than linear), then
+    mapped linearly onto ``[alpha_min, alpha_max]``.
+
+    Args:
+        df: data holding ``x_col`` and ``y_col``.
+        x_col: name of the x column.
+        y_col: name of the y column.
+        x_max: right edge of the grid along x.
+        bins_x: number of bins along x.
+        bins_y: number of bins along y.
+        alpha_min: opacity assigned to the sparsest bins.
+        alpha_max: opacity assigned to the densest bin.
+        y_min: lower edge of the grid along y.
+        y_max_limit: cap for the upper edge of the grid along y.
+
+    Returns:
+        Array of opacities, one per row of ``df`` (empty for an empty ``df``).
+    """
+    x_vals = df[x_col].to_numpy()
+    y_vals = df[y_col].to_numpy()
+
+    if len(x_vals) == 0:
+        return np.array([])
+
+    # Grid edges: x starts at 0 (or lower if the data goes negative); the y
+    # upper edge follows the data but is capped and kept strictly positive.
+    x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
+    y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
+    y_edges = np.linspace(y_min, y_upper, bins_y + 1)
+
+    x_bins = np.digitize(x_vals, x_edges) - 1
+    y_bins = np.digitize(y_vals, y_edges) - 1
+
+    # Only points that fall inside the grid contribute to the counts.
+    valid = (
+        (x_bins >= 0) & (x_bins < bins_x) &
+        (y_bins >= 0) & (y_bins < bins_y)
+    )
+    counts = np.zeros((bins_x, bins_y), dtype=int)
+    # Unbuffered in-place add: repeated (x, y) index pairs accumulate, which is
+    # exactly the per-point "+= 1" loop without the Python-level iteration.
+    np.add.at(counts, (x_bins[valid], y_bins[valid]), 1)
+
+    # Points outside the grid borrow the count of the nearest edge bin.
+    bin_counts = counts[
+        np.clip(x_bins, 0, bins_x - 1),
+        np.clip(y_bins, 0, bins_y - 1),
+    ]
+    max_count = bin_counts.max()
+    if max_count == 0:
+        weight = np.zeros_like(bin_counts, dtype=float)
+    else:
+        weight = (bin_counts / max_count) ** np.sqrt(1.5)
+    weight = np.clip(weight, 0, 1)
+    return alpha_min + (alpha_max - alpha_min) * weight
+
+
+def compute_trend(
+    df: pd.DataFrame,
+    y_col: str,
+    *,
+    x_col: str = X_COL,
+    method: str = DEFAULT_TREND_METHOD,
+    lowess_frac: float = DEFAULT_TREND_FRAC,
+    rolling_window: int = DEFAULT_ROLLING_WINDOW,
+    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
+    savgol_poly: int = DEFAULT_SAVGOL_POLY,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Return ``(x_sorted, trend_y)`` smoothed with the selected method.
+
+    Rows with a missing x or y are dropped and the rest are sorted by x before
+    smoothing.
+
+    Args:
+        df: data holding ``x_col`` and ``y_col``.
+        y_col: name of the column to smooth.
+        x_col: name of the column to sort by.
+        method: ``"lowess"``, ``"rolling"`` or ``"savgol"`` (case-insensitive);
+            any other value falls back to LOWESS.
+        lowess_frac: ``frac`` argument for LOWESS.
+        rolling_window: window (in points) of the centered rolling mean;
+            values below 3 are raised to 3 and even values are bumped to odd.
+        savgol_window: window (in points) of the Savitzky-Golay filter; values
+            below 5 are raised to 5, even values are bumped to odd, and the
+            window is shrunk to fit when there are fewer points than that.
+        savgol_poly: polynomial order of the Savitzky-Golay filter, capped so
+            it stays below the window length.
+
+    Returns:
+        Two arrays of equal length: sorted x values and the trend at those x.
+        Both are empty when no complete rows are available.
+    """
+    d = df[[x_col, y_col]].dropna().sort_values(x_col)
+    x_vals = d[x_col].to_numpy()
+    y_vals = d[y_col].to_numpy()
+
+    if len(x_vals) == 0:
+        return np.array([]), np.array([])
+
+    m = method.lower()
+    if m == "rolling":
+        w = max(3, rolling_window)
+        if w % 2 == 0:
+            w += 1
+        y_trend = pd.Series(y_vals).rolling(window=w, center=True, min_periods=1).mean().to_numpy()
+        return x_vals, y_trend
+    if m == "savgol":
+        n_points = len(y_vals)
+        w = max(5, savgol_window)
+        if w % 2 == 0:
+            w += 1
+        if w > n_points:
+            # savgol_filter with mode="interp" raises ValueError when the
+            # window is longer than the data; use the largest odd window
+            # that still fits instead of crashing on small samples.
+            w = n_points if n_points % 2 else n_points - 1
+        if w < 3:
+            # One or two points: nothing to smooth.
+            return x_vals, y_vals.astype(float)
+        poly = min(savgol_poly, w - 1)
+        y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
+        return x_vals, y_trend
+
+    # "lowess" and any unrecognised method name both use LOWESS.
+    trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
+    return trend[:, 0], trend[:, 1]
+
+
+def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
+    """Return a copy of the rows whose ``x_col`` value is at most ``x_max``.
+
+    Also prints how many rows were given and how many were kept.
+    """
+    within_limit = df[x_col] <= x_max
+    subset = df.loc[within_limit].copy()
+    print(f"{len(df)} points; {len(subset)} within x<={x_max}.")
+    return subset
+
+
def plot_density_scatter(
df: pd.DataFrame,
+ y_col: str,
title: str,
- out_name: str,
+ out_path: Path,
+ *,
+ x_col: str = X_COL,
+ x_max: float = DEFAULT_X_MAX,
+ scatter_color: str = DEFAULT_SCATTER_COLOR,
+ point_size: int = DEFAULT_POINT_SIZE,
+ alpha: float = DEFAULT_ALPHA,
+ alpha_min: float = DEFAULT_ALPHA_MIN,
+ alpha_max: float = DEFAULT_ALPHA_MAX,
+ bins_x: int = DEFAULT_BINS_X,
+ bins_y: int = DEFAULT_BINS_Y,
+ y_min: float = DEFAULT_Y_MIN,
+ y_max: float = DEFAULT_Y_MAX,
with_trend: bool = False,
- alpha: float = 0.08,
+ trend_method: str = DEFAULT_TREND_METHOD,
+ trend_frac: float = DEFAULT_TREND_FRAC,
+ trend_color: str = DEFAULT_TREND_COLOR,
+ trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
+ rolling_window: int = DEFAULT_ROLLING_WINDOW,
+ savgol_window: int = DEFAULT_SAVGOL_WINDOW,
+ savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
+ """Draw a density-shaded scatter of y_col against x_col and save it to out_path.
+
+ Each point's opacity comes from compute_density_alpha (alpha_min..alpha_max over a
+ bins_x by bins_y grid); the flat alpha value is used only when that array is empty.
+ When with_trend is true, a trend line from compute_trend is drawn on top; the
+ trend_* / rolling_window / savgol_* arguments are not used otherwise.
+ The parent folder of out_path is created if needed and the figure is closed after saving.
+ """
- fig, ax = plt.subplots(figsize=(10, 6))
- sns.scatterplot(
- data=df,
- x=X_COL,
- y=Y_COL,
- color=SCATTER_COLOR,
- s=20,
- alpha=alpha,
- linewidth=0,
- ax=ax,
+ fig, ax = plt.subplots(figsize=(8, 8))
+ alpha_values = compute_density_alpha(
+ df,
+ x_col=x_col,
+ y_col=y_col,
+ x_max=x_max,
+ bins_x=bins_x,
+ bins_y=bins_y,
+ alpha_min=alpha_min,
+ alpha_max=alpha_max,
+ y_min=y_min,
+ y_max_limit=y_max,
+ )
+ # An array is passed as alpha so every point gets its own opacity.
+ ax.scatter(
+ df[x_col],
+ df[y_col],
+ color=scatter_color,
+ s=point_size,
+ alpha=alpha_values if len(alpha_values) else alpha,
+ linewidths=0,
)
if with_trend:
- trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
- ax.plot(trend[:, 0], trend[:, 1], color="red", linewidth=2.5, label="LOWESS тренд")
- ax.legend()
+ tx, ty = compute_trend(
+ df,
+ y_col=y_col,
+ x_col=x_col,
+ method=trend_method,
+ lowess_frac=trend_frac,
+ rolling_window=rolling_window,
+ savgol_window=savgol_window,
+ savgol_poly=savgol_poly,
+ )
+ if len(tx):
+ ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
+ ax.legend()
- ax.set_xlim(0, X_MAX)
- ax.set_ylim(bottom=0)
+ ax.set_xlim(0, x_max)
+ ax.set_ylim(y_min, y_max)
+ # Ticks start at 0 with a step of 2, regardless of y_min.
+ ax.set_yticks(range(0, int(y_max) + 1, 2))
ax.set_xlabel("Среднее число показов в день")
- ax.set_ylabel("Число заказов за период (сумма)")
+ ax.set_ylabel(y_col)
ax.set_title(title)
ax.grid(alpha=0.3)
- OUT_DIR.mkdir(parents=True, exist_ok=True)
- out_path = OUT_DIR / out_name
+ out_path.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(out_path, dpi=150)
plt.close(fig)
print(f"Saved {out_path}")
+def plot_raw_scatter(
+    df: pd.DataFrame,
+    y_col: str,
+    out_dir: Path,
+    *,
+    x_col: str = X_COL,
+    x_max: float = DEFAULT_X_MAX,
+    scatter_color: str = DEFAULT_SCATTER_COLOR,
+    point_size: int = DEFAULT_POINT_SIZE,
+    alpha: float = DEFAULT_ALPHA,
+    alpha_min: float = DEFAULT_ALPHA_MIN,
+    alpha_max: float = DEFAULT_ALPHA_MAX,
+    bins_x: int = DEFAULT_BINS_X,
+    bins_y: int = DEFAULT_BINS_Y,
+    y_min: float = DEFAULT_Y_MIN,
+    y_max: float = DEFAULT_Y_MAX,
+    trend_method: str = DEFAULT_TREND_METHOD,
+    trend_frac: float = DEFAULT_TREND_FRAC,
+    trend_color: str = DEFAULT_TREND_COLOR,
+    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
+    rolling_window: int = DEFAULT_ROLLING_WINDOW,
+    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
+    savgol_poly: int = DEFAULT_SAVGOL_POLY,
+) -> None:
+    """Plot every client with x <= x_max (no outlier removal) into ``out_dir / "scatter.png"``.
+
+    Rows with a missing x or y are dropped first. All remaining keyword
+    arguments are forwarded unchanged to plot_density_scatter.
+    """
+    complete_rows = df[[x_col, y_col]].dropna()
+    in_range = filter_x_range(complete_rows, x_col, x_max)
+
+    forwarded = dict(
+        x_col=x_col,
+        x_max=x_max,
+        scatter_color=scatter_color,
+        point_size=point_size,
+        alpha=alpha,
+        alpha_min=alpha_min,
+        alpha_max=alpha_max,
+        bins_x=bins_x,
+        bins_y=bins_y,
+        y_min=y_min,
+        y_max=y_max,
+        trend_method=trend_method,
+        trend_frac=trend_frac,
+        trend_color=trend_color,
+        trend_linewidth=trend_linewidth,
+        rolling_window=rolling_window,
+        savgol_window=savgol_window,
+        savgol_poly=savgol_poly,
+    )
+    plot_density_scatter(
+        in_range,
+        y_col=y_col,
+        title=f"Облако: {y_col} vs {x_col} (все клиенты)",
+        out_path=out_dir / "scatter.png",
+        **forwarded,
+    )
+
+
+def plot_clean_scatter(
+    df: pd.DataFrame,
+    y_col: str,
+    out_dir: Path,
+    *,
+    x_col: str = X_COL,
+    x_max: float = DEFAULT_X_MAX,
+    scatter_color: str = DEFAULT_SCATTER_COLOR,
+    point_size: int = DEFAULT_POINT_SIZE,
+    alpha: float = DEFAULT_ALPHA,
+    iqr_k: float = DEFAULT_IQR_K,
+    q_low: float = DEFAULT_Q_LOW,
+    q_high: float = DEFAULT_Q_HIGH,
+    alpha_min: float = DEFAULT_ALPHA_MIN,
+    alpha_max: float = DEFAULT_ALPHA_MAX,
+    bins_x: int = DEFAULT_BINS_X,
+    bins_y: int = DEFAULT_BINS_Y,
+    y_min: float = DEFAULT_Y_MIN,
+    y_max: float = DEFAULT_Y_MAX,
+    trend_method: str = DEFAULT_TREND_METHOD,
+    trend_frac: float = DEFAULT_TREND_FRAC,
+    trend_color: str = DEFAULT_TREND_COLOR,
+    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
+    rolling_window: int = DEFAULT_ROLLING_WINDOW,
+    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
+    savgol_poly: int = DEFAULT_SAVGOL_POLY,
+) -> None:
+    """Plot the outlier-free cloud into ``out_dir / "scatter_clean.png"``.
+
+    Rows with a missing x or y are dropped, x is limited to ``x_max``, then
+    remove_outliers is applied with ``iqr_k`` / ``q_low`` / ``q_high``. The
+    remaining keyword arguments are forwarded to plot_density_scatter.
+    """
+    complete_rows = df[[x_col, y_col]].dropna()
+    in_range = filter_x_range(complete_rows, x_col, x_max)
+    cleaned = remove_outliers(
+        in_range, y_col=y_col, x_col=x_col, iqr_k=iqr_k, q_low=q_low, q_high=q_high
+    )
+
+    forwarded = dict(
+        x_col=x_col,
+        x_max=x_max,
+        scatter_color=scatter_color,
+        point_size=point_size,
+        alpha=alpha,
+        alpha_min=alpha_min,
+        alpha_max=alpha_max,
+        bins_x=bins_x,
+        bins_y=bins_y,
+        y_min=y_min,
+        y_max=y_max,
+        trend_method=trend_method,
+        trend_frac=trend_frac,
+        trend_color=trend_color,
+        trend_linewidth=trend_linewidth,
+        rolling_window=rolling_window,
+        savgol_window=savgol_window,
+        savgol_poly=savgol_poly,
+    )
+    plot_density_scatter(
+        cleaned,
+        y_col=y_col,
+        title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
+        out_path=out_dir / "scatter_clean.png",
+        **forwarded,
+    )
+
+
+def plot_clean_trend_scatter(
+    df: pd.DataFrame,
+    y_col: str,
+    out_dir: Path,
+    *,
+    x_col: str = X_COL,
+    x_max: float = DEFAULT_X_MAX,
+    scatter_color: str = DEFAULT_SCATTER_COLOR,
+    point_size: int = DEFAULT_POINT_SIZE,
+    alpha: float = DEFAULT_TREND_ALPHA,
+    iqr_k: float = DEFAULT_IQR_K,
+    q_low: float = DEFAULT_Q_LOW,
+    q_high: float = DEFAULT_Q_HIGH,
+    trend_frac: float = DEFAULT_TREND_FRAC,
+    trend_color: str = DEFAULT_TREND_COLOR,
+    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
+    alpha_min: float = DEFAULT_ALPHA_MIN,
+    alpha_max: float = DEFAULT_ALPHA_MAX,
+    bins_x: int = DEFAULT_BINS_X,
+    bins_y: int = DEFAULT_BINS_Y,
+    y_min: float = DEFAULT_Y_MIN,
+    y_max: float = DEFAULT_Y_MAX,
+    trend_method: str = DEFAULT_TREND_METHOD,
+    rolling_window: int = DEFAULT_ROLLING_WINDOW,
+    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
+    savgol_poly: int = DEFAULT_SAVGOL_POLY,
+    return_components: bool = False,
+) -> "pd.DataFrame | None":
+    """Plot the outlier-free cloud with a trend line into ``out_dir / "scatter_trend.png"``.
+
+    Rows with a missing x or y are dropped, x is limited to ``x_max``, then
+    remove_outliers is applied. The plot is drawn by plot_density_scatter with
+    ``with_trend=True``.
+
+    Args:
+        df: client-level data holding ``x_col`` and ``y_col``.
+        y_col: name of the y column.
+        out_dir: folder that receives ``scatter_trend.png``.
+        return_components: when true, return the cleaned DataFrame that was
+            plotted. The figure and axes cannot be returned because
+            plot_density_scatter saves and closes the figure itself.
+        Other keyword arguments are forwarded to remove_outliers and
+        plot_density_scatter.
+
+    Returns:
+        The cleaned DataFrame when ``return_components`` is true, else None.
+    """
+    in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
+    cleaned = remove_outliers(
+        in_range,
+        y_col=y_col,
+        x_col=x_col,
+        iqr_k=iqr_k,
+        q_low=q_low,
+        q_high=q_high,
+    )
+    plot_density_scatter(
+        cleaned,
+        y_col=y_col,
+        title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
+        out_path=out_dir / "scatter_trend.png",
+        x_col=x_col,
+        x_max=x_max,
+        scatter_color=scatter_color,
+        point_size=point_size,
+        alpha=alpha,
+        with_trend=True,
+        trend_frac=trend_frac,
+        trend_color=trend_color,
+        trend_linewidth=trend_linewidth,
+        alpha_min=alpha_min,
+        alpha_max=alpha_max,
+        bins_x=bins_x,
+        bins_y=bins_y,
+        y_min=y_min,
+        y_max=y_max,
+        trend_method=trend_method,
+        rolling_window=rolling_window,
+        savgol_window=savgol_window,
+        savgol_poly=savgol_poly,
+    )
+    if return_components:
+        # Previously this returned the undefined names `fig` and `ax`, which
+        # raised NameError; only the cleaned data exists in this scope.
+        return cleaned
+    return None
+
+
+def generate_scatter_set(
+    df: pd.DataFrame,
+    y_col: str,
+    *,
+    base_out_dir: Path = BASE_OUT_DIR,
+    x_col: str = X_COL,
+    x_max: float = DEFAULT_X_MAX,
+    scatter_color: str = DEFAULT_SCATTER_COLOR,
+    point_size: int = DEFAULT_POINT_SIZE,
+    alpha: float = DEFAULT_ALPHA,
+    trend_alpha: float = DEFAULT_TREND_ALPHA,
+    trend_frac: float = DEFAULT_TREND_FRAC,
+    trend_color: str = DEFAULT_TREND_COLOR,
+    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
+    iqr_k: float = DEFAULT_IQR_K,
+    q_low: float = DEFAULT_Q_LOW,
+    q_high: float = DEFAULT_Q_HIGH,
+    alpha_min: float = DEFAULT_ALPHA_MIN,
+    alpha_max: float = DEFAULT_ALPHA_MAX,
+    bins_x: int = DEFAULT_BINS_X,
+    bins_y: int = DEFAULT_BINS_Y,
+    y_min: float = DEFAULT_Y_MIN,
+    y_max: float = DEFAULT_Y_MAX,
+    trend_method: str = DEFAULT_TREND_METHOD,
+    rolling_window: int = DEFAULT_ROLLING_WINDOW,
+    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
+    savgol_poly: int = DEFAULT_SAVGOL_POLY,
+) -> None:
+    """Render three clouds (all points, cleaned, cleaned + trend) into a folder named after y_col.
+
+    The output folder is ``base_out_dir / y_col`` with any "/" in the column
+    name replaced by "_". The first two plots use ``alpha``; the trend plot
+    uses ``trend_alpha``.
+    """
+    out_dir = base_out_dir / str(y_col).replace("/", "_")
+
+    # Options accepted by all three plotting helpers.
+    shared = dict(
+        x_col=x_col,
+        x_max=x_max,
+        scatter_color=scatter_color,
+        point_size=point_size,
+        alpha_min=alpha_min,
+        alpha_max=alpha_max,
+        bins_x=bins_x,
+        bins_y=bins_y,
+        y_min=y_min,
+        y_max=y_max,
+        trend_method=trend_method,
+        trend_frac=trend_frac,
+        trend_color=trend_color,
+        trend_linewidth=trend_linewidth,
+        rolling_window=rolling_window,
+        savgol_window=savgol_window,
+        savgol_poly=savgol_poly,
+    )
+    # Outlier-removal options, used only by the two "clean" variants.
+    cleaning = dict(iqr_k=iqr_k, q_low=q_low, q_high=q_high)
+
+    plot_raw_scatter(df, y_col=y_col, out_dir=out_dir, alpha=alpha, **shared)
+    plot_clean_scatter(df, y_col=y_col, out_dir=out_dir, alpha=alpha, **cleaning, **shared)
+    plot_clean_trend_scatter(df, y_col=y_col, out_dir=out_dir, alpha=trend_alpha, **cleaning, **shared)
+
+
def main() -> None:
+ """Load client-level data, print the split of clients with zero / non-zero orders, and render the scatter set for orders_amt_total."""
client = load_client_level(DB_PATH)
-
- plot_density_scatter(
- client,
- title="Облако: заказы vs средние показы в день (все клиенты)",
- out_name="orders_vs_avg_imp_scatter.png",
- )
-
- cleaned = remove_outliers(client)
- plot_density_scatter(
- cleaned,
- title="Облако без выбросов (IQR) заказы vs средние показы в день",
- out_name="orders_vs_avg_imp_scatter_clean.png",
- )
-
- plot_density_scatter(
- cleaned,
- title="Облако без выбросов + тренд",
- out_name="orders_vs_avg_imp_scatter_trend.png",
- with_trend=True,
- alpha=0.1,
- )
+ zero_orders = (client["orders_amt_total"] == 0).sum()
+ non_zero = len(client) - zero_orders
+ # Shares are printed only for a non-empty table to avoid dividing by zero.
+ if len(client):
+ print(f"orders=0: {zero_orders} ({zero_orders / len(client):.2%}); orders>0: {non_zero} ({non_zero / len(client):.2%})")
+ generate_scatter_set(client, y_col="orders_amt_total")
if __name__ == "__main__":
diff --git a/main_hypot/quadreg.py b/main_hypot/quadreg.py
index 7863537..1f164b5 100644
--- a/main_hypot/quadreg.py
+++ b/main_hypot/quadreg.py
@@ -1,240 +1,352 @@
-import sqlite3
-from pathlib import Path
-import sys
import numpy as np
import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-
import statsmodels.api as sm
+from pathlib import Path
+from typing import Tuple, Optional
-sns.set_theme(style="whitegrid")
-plt.rcParams["figure.figsize"] = (10, 6)
+from sklearn.metrics import r2_score, roc_auc_score
-# -----------------------------
-# Load + feature engineering (как у тебя)
-# -----------------------------
-project_root = Path(__file__).resolve().parent.parent
-sys.path.append(str(project_root / "preanalysis_old_bad"))
-import eda_utils as eda # noqa: E402
+import best_model_and_plots as bmp
-db_path = project_root / "dataset" / "ds.sqlite"
-conn = sqlite3.connect(db_path)
-df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
-conn.close()
+# Inherit constants / visual settings from the scatter script (best_model_and_plots)
+X_COL = bmp.X_COL
+DEFAULT_X_MAX = bmp.DEFAULT_X_MAX
+DEFAULT_Y_MIN = bmp.DEFAULT_Y_MIN
+DEFAULT_Y_MAX = bmp.DEFAULT_Y_MAX
+DEFAULT_SCATTER_COLOR = bmp.DEFAULT_SCATTER_COLOR
+DEFAULT_POINT_SIZE = bmp.DEFAULT_POINT_SIZE
+DEFAULT_ALPHA = bmp.DEFAULT_ALPHA
+DEFAULT_ALPHA_MIN = bmp.DEFAULT_ALPHA_MIN
+DEFAULT_ALPHA_MAX = bmp.DEFAULT_ALPHA_MAX
+DEFAULT_BINS_X = bmp.DEFAULT_BINS_X
+DEFAULT_BINS_Y = bmp.DEFAULT_BINS_Y
+DEFAULT_IQR_K = bmp.DEFAULT_IQR_K
+DEFAULT_Q_LOW = bmp.DEFAULT_Q_LOW
+DEFAULT_Q_HIGH = bmp.DEFAULT_Q_HIGH
+DEFAULT_TREND_FRAC = bmp.DEFAULT_TREND_FRAC
+DEFAULT_TREND_COLOR = bmp.DEFAULT_TREND_COLOR
+DEFAULT_TREND_LINEWIDTH = bmp.DEFAULT_TREND_LINEWIDTH
+BASE_OUT_DIR = bmp.BASE_OUT_DIR
-for cols, name in [
- (eda.ACTIVE_IMP_COLS, "active_imp_total"),
- (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
- (eda.ACTIVE_CLICK_COLS, "active_click_total"),
- (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
- (eda.ORDER_COLS, "orders_amt_total"),
-]:
- df[name] = df[cols].sum(axis=1)
-df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
-df["click_total"] = df["active_click_total"] + df["passive_click_total"]
-
-contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
-
-client = (
- df.groupby("id")
- .agg(
- imp_total=("imp_total", "sum"),
- click_total=("click_total", "sum"),
- orders_amt_total=("orders_amt_total", "sum"),
- age=("age", "median"),
- gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
- device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
+def prepare_clean_data(
+ y_col: str,
+ *,
+ x_col: str = X_COL,
+ x_max: float = DEFAULT_X_MAX,
+ iqr_k: float = DEFAULT_IQR_K,
+ q_low: float = DEFAULT_Q_LOW,
+ q_high: float = DEFAULT_Q_HIGH,
+) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]:
+ """Prepare cleaned data: x-range filter plus IQR-style outlier removal.
+
+ Loads the client-level table from bmp.DB_PATH on every call, drops rows with a
+ missing x or y, keeps x <= x_max and applies bmp.remove_outliers.
+ Returns (x, y, cleaned): the two columns as numpy arrays and the cleaned DataFrame.
+ """
+ df = bmp.load_client_level(bmp.DB_PATH)
+ base = df[[x_col, y_col]].dropna()
+ in_range = bmp.filter_x_range(base, x_col, x_max)
+ cleaned = bmp.remove_outliers(
+ in_range,
+ y_col=y_col,
+ x_col=x_col,
+ iqr_k=iqr_k,
+ q_low=q_low,
+ q_high=q_high,
)
- .merge(contact_days, on="id", how="left")
- .reset_index()
-)
+ x = cleaned[x_col].to_numpy()
+ y = cleaned[y_col].to_numpy()
+ return x, y, cleaned
-client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"])
-client["order_rate_pct"] = 100 * client["order_rate"]
-client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
-# -----------------------------
-# Aggregate curve points (как у тебя)
-# -----------------------------
-stats_imp = (
- client.groupby("avg_imp_per_day", as_index=False)
- .agg(
- orders_mean=("orders_amt_total", "mean"),
- n_clients=("id", "count"),
+def fit_quadratic(
+    x: np.ndarray,
+    y_target: np.ndarray,
+    weights: Optional[np.ndarray] = None,
+) -> Tuple[sm.regression.linear_model.RegressionResultsWrapper, np.ndarray]:
+    """Fit ``y_target ~ 1 + x + x^2`` and predict on the same x.
+
+    Uses WLS when ``weights`` is given and OLS otherwise; both are fitted with
+    HC3 robust covariance. Returns ``(fitted_model, predictions)``.
+    """
+    design = sm.add_constant(np.column_stack([x, x**2]))
+    if weights is None:
+        estimator = sm.OLS(y_target, design)
+    else:
+        estimator = sm.WLS(y_target, design, weights=weights)
+    fitted = estimator.fit(cov_type="HC3")
+    return fitted, fitted.predict(design)
+
+
+def compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[Optional[float], Optional[float]]:
+    """Return ``(R2, AUC)`` where AUC scores ``y_pred`` against the label ``y_true > 0``.
+
+    AUC is None when roc_auc_score raises ValueError.
+    """
+    r2 = r2_score(y_true, y_pred)
+    try:
+        positive_label = (y_true > 0).astype(int)
+        auc = roc_auc_score(positive_label, y_pred)
+    except ValueError:
+        auc = None
+    return r2, auc
+
+
+def map_trend_to_points(x_points: np.ndarray, trend_x: np.ndarray, trend_y: np.ndarray) -> np.ndarray:
+    """Linearly interpolate the trend at ``x_points``.
+
+    Points outside the trend's x range take the trend value at the nearest
+    end. An empty trend yields zeros shaped like ``x_points``.
+    """
+    if len(trend_x) == 0:
+        return np.zeros_like(x_points)
+    # np.interp needs increasing x, so sort the trend first.
+    ranking = np.argsort(trend_x)
+    xs_sorted = trend_x[ranking]
+    ys_sorted = trend_y[ranking]
+    first_value, last_value = ys_sorted[0], ys_sorted[-1]
+    return np.interp(x_points, xs_sorted, ys_sorted, left=first_value, right=last_value)
+
+
+def density_weights(
+ df: pd.DataFrame,
+ y_col: str,
+ *,
+ x_col: str = X_COL,
+ x_max: float = DEFAULT_X_MAX,
+ alpha_min: float = DEFAULT_ALPHA_MIN,
+ alpha_max: float = DEFAULT_ALPHA_MAX,
+ bins_x: int = DEFAULT_BINS_X,
+ bins_y: int = DEFAULT_BINS_Y,
+ y_min: float = DEFAULT_Y_MIN,
+ y_max: float = DEFAULT_Y_MAX,
+) -> np.ndarray:
+ """Build regression weights from point density (same scheme as the plot alphas).
+
+ The alphas from bmp.compute_density_alpha are rescaled from
+ [alpha_min, alpha_max] back to [0, 1]. Returns one weight per row of df;
+ all ones when no alphas are produced.
+ """
+ alphas = bmp.compute_density_alpha(
+ df,
+ x_col=x_col,
+ y_col=y_col,
+ x_max=x_max,
+ bins_x=bins_x,
+ bins_y=bins_y,
+ alpha_min=alpha_min,
+ alpha_max=alpha_max,
+ y_min=y_min,
+ y_max_limit=y_max,
)
- .sort_values("avg_imp_per_day")
-).reset_index(drop=True)
+ if len(alphas) == 0:
+ return np.ones(len(df))
+ # Guard against a zero-width alpha range.
+ denom = max(alpha_max - alpha_min, 1e-9)
+ weights = (alphas - alpha_min) / denom
+ weights = np.clip(weights, 0, None)
+ return weights
-# -----------------------------
-# Filtering / outlier logic (как у тебя)
-# -----------------------------
-K_MULT = 2
-ABS_DY_MIN = 1
-X_MAX = 16
-stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
+def plot_quadratic_overlay(
+ df: pd.DataFrame,
+ model: sm.regression.linear_model.RegressionResultsWrapper,
+ y_col: str,
+ out_path: Path,
+ *,
+ x_col: str = X_COL,
+ x_max: float = DEFAULT_X_MAX,
+ y_min: float = DEFAULT_Y_MIN,
+ y_max: float = DEFAULT_Y_MAX,
+ scatter_color: str = DEFAULT_SCATTER_COLOR,
+ point_size: int = DEFAULT_POINT_SIZE,
+ alpha: float = DEFAULT_ALPHA,
+ alpha_min: float = DEFAULT_ALPHA_MIN,
+ alpha_max: float = DEFAULT_ALPHA_MAX,
+ bins_x: int = DEFAULT_BINS_X,
+ bins_y: int = DEFAULT_BINS_Y,
+ trend_frac: float = DEFAULT_TREND_FRAC,
+ trend_color: str = DEFAULT_TREND_COLOR,
+ trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
+ trend_method: str = bmp.DEFAULT_TREND_METHOD,
+ rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
+ savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
+ savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
+) -> None:
+ """Draw the cloud, the trend from the selected method, and the quadratic regression line.
+
+ The trend comes from bmp.compute_trend(method=trend_method); it is LOWESS only when
+ that method is chosen. The figure is saved to out_path (parent folder created if needed).
+ """
+ fig, ax = bmp.plt.subplots(figsize=(8, 8))
+ alpha_values = bmp.compute_density_alpha(
+ df,
+ x_col=x_col,
+ y_col=y_col,
+ x_max=x_max,
+ bins_x=bins_x,
+ bins_y=bins_y,
+ alpha_min=alpha_min,
+ alpha_max=alpha_max,
+ y_min=y_min,
+ y_max_limit=y_max,
+ )
+ ax.scatter(
+ df[x_col],
+ df[y_col],
+ color=scatter_color,
+ s=point_size,
+ alpha=alpha_values if len(alpha_values) else alpha,
+ linewidths=0,
+ label="Точки (очищено)",
+ )
-before = len(stats_f)
-y = stats_f["orders_mean"]
-abs_dy = y.diff().abs()
+ # Trend computed with the selected method
+ tx, ty = bmp.compute_trend(
+ df,
+ y_col=y_col,
+ x_col=x_col,
+ method=trend_method,
+ lowess_frac=trend_frac,
+ rolling_window=rolling_window,
+ savgol_window=savgol_window,
+ savgol_poly=savgol_poly,
+ )
+ if len(tx):
+ ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
-prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean()
-ratio = abs_dy / (prev3_mean.replace(0, np.nan))
+ # Quadratic regression, evaluated on a 400-point grid over [0, x_max]
+ x_grid = np.linspace(0, x_max, 400)
+ X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
+ y_grid = model.predict(X_grid)
+ ax.plot(x_grid, y_grid, color="blue", linewidth=2.3, linestyle="--", label="Квадр. регрессия")
-is_outlier = ((abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT)) | (y > 5)
-is_outlier = is_outlier.fillna(False)
+ ax.set_xlim(0, x_max)
+ ax.set_ylim(y_min, y_max)
+ ax.set_yticks(range(0, int(y_max) + 1, 2))
+ ax.set_xlabel("Среднее число показов в день")
+ ax.set_ylabel(y_col)
+ ax.set_title(f"Квадратичная регрессия: {y_col} vs {x_col}")
+ ax.grid(alpha=0.3)
+ ax.legend()
-stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
-after = len(stats_f)
-print(f"Фильтрация: было {before}, стало {after}, убрали {before-after} точек")
+ out_path.parent.mkdir(parents=True, exist_ok=True)
+ fig.tight_layout()
+ fig.savefig(out_path, dpi=150)
+ bmp.plt.close(fig)
+ print(f"Saved {out_path}")
-# -----------------------------
-# Smoothing (оставим для визуалки, но регрессию делаем по orders_mean)
-# -----------------------------
-w = max(7, int(len(stats_f) * 0.05))
-if w % 2 == 0:
- w += 1
-stats_f["orders_smooth"] = (
- stats_f["orders_mean"]
- .rolling(window=w, center=True, min_periods=1)
- .mean()
-)
+def report_model(
+    model: sm.regression.linear_model.RegressionResultsWrapper,
+    r2: Optional[float],
+    auc: Optional[float],
+    *,
+    r2_trend: Optional[float] = None,
+) -> None:
+    """Print the three coefficients with p-values, then R2, optional trend R2, and AUC."""
+
+    def _format_p(p_value: float) -> str:
+        # Extremely small p-values are shown as a bound.
+        if p_value < 1e-300:
+            return "<1e-300"
+        return f"{p_value:.4g}"
+
+    coefficients = model.params
+    p_values = model.pvalues
+
+    print("\n=== Квадратичная регрессия (y ~ 1 + x + x^2) ===")
+    for position, label in enumerate(("const", "beta1 x", "beta2 x^2")):
+        print(f"{label}: {coefficients[position]:.6f} (p={_format_p(p_values[position])})")
+
+    if r2 is not None:
+        print(f"R2: {r2:.4f}")
+    else:
+        print("R2: n/a")
+
+    if r2_trend is not None:
+        print(f"R2 vs trend target: {r2_trend:.4f}")
+
+    if auc is not None:
+        print(f"AUC (target y>0): {auc:.4f}")
+    else:
+        print("AUC: n/a (один класс)")
-# -----------------------------
-# Cost line (как у тебя, нормировка "в единицах заказов")
-# -----------------------------
-c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
-stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
-# -----------------------------
-# Quadratic regression: orders_mean ~ 1 + x + x^2
-# WLS with weights = n_clients
-# -----------------------------
-x = stats_f["avg_imp_per_day"].to_numpy()
-y = stats_f["orders_mean"].to_numpy()
-wts = stats_f["n_clients"].to_numpy().astype(float)
+def generate_quadratic_analysis(
+ y_col: str,
+ *,
+ x_col: str = X_COL,
+ base_out_dir: Path = BASE_OUT_DIR,
+ config_name: str = "default",
+ x_max: float = DEFAULT_X_MAX,
+ y_min: float = DEFAULT_Y_MIN,
+ y_max: float = DEFAULT_Y_MAX,
+ scatter_color: str = DEFAULT_SCATTER_COLOR,
+ point_size: int = DEFAULT_POINT_SIZE,
+ alpha: float = DEFAULT_ALPHA,
+ alpha_min: float = DEFAULT_ALPHA_MIN,
+ alpha_max: float = DEFAULT_ALPHA_MAX,
+ bins_x: int = DEFAULT_BINS_X,
+ bins_y: int = DEFAULT_BINS_Y,
+ trend_frac: float = DEFAULT_TREND_FRAC,
+ trend_color: str = DEFAULT_TREND_COLOR,
+ trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
+ iqr_k: float = DEFAULT_IQR_K,
+ q_low: float = DEFAULT_Q_LOW,
+ q_high: float = DEFAULT_Q_HIGH,
+ trend_method: str = bmp.DEFAULT_TREND_METHOD,
+ rolling_window: int = bmp.DEFAULT_ROLLING_WINDOW,
+ savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
+ savgol_poly: int = bmp.DEFAULT_SAVGOL_POLY,
+) -> dict:
+ """Run the full quadratic-regression analysis for y_col and return a summary dict.
+
+ Steps: load and clean the data, build density weights, compute the trend,
+ fit a weighted quadratic to the trend values, print the report, and save the
+ overlay plot to base_out_dir / config_name / y_col / "quad_regression.png".
+ The returned dict holds the config name, y_col, R2 against raw y, R2 against
+ the trend target, AUC, the main parameters, and the model coefficients and p-values.
+ """
+ x, y, cleaned_df = prepare_clean_data(
+ y_col,
+ x_col=x_col,
+ x_max=x_max,
+ iqr_k=iqr_k,
+ q_low=q_low,
+ q_high=q_high,
+ )
+ w = density_weights(
+ cleaned_df,
+ y_col=y_col,
+ x_col=x_col,
+ x_max=x_max,
+ alpha_min=alpha_min,
+ alpha_max=alpha_max,
+ bins_x=bins_x,
+ bins_y=bins_y,
+ y_min=y_min,
+ y_max=y_max,
+ )
+ # trend computed with the selected method
+ tx, ty = bmp.compute_trend(
+ cleaned_df,
+ y_col=y_col,
+ x_col=x_col,
+ method=trend_method,
+ lowess_frac=trend_frac,
+ rolling_window=rolling_window,
+ savgol_window=savgol_window,
+ savgol_poly=savgol_poly,
+ )
-X = np.column_stack([x, x**2])
-X = sm.add_constant(X) # [1, x, x^2]
+ # NOTE(review): the quadratic is fitted to the smoothed trend values, not to raw y,
+ # so the reported coefficients / p-values describe the fit to the smoothed curve.
+ trend_target = map_trend_to_points(x, tx, ty)
+ model, y_hat = fit_quadratic(x, trend_target, weights=w)
+ # r2_actual and auc compare the predictions against the raw y values.
+ r2_actual, auc = compute_metrics(y, y_hat)
+ r2_trend = r2_score(trend_target, y_hat) if len(trend_target) else None
+ report_model(model, r2_actual, auc, r2_trend=r2_trend)
-model = sm.WLS(y, X, weights=wts)
-res = model.fit(cov_type="HC3") # робастные ошибки
+ out_dir = base_out_dir / config_name / str(y_col).replace("/", "_")
+ plot_quadratic_overlay(
+ cleaned_df,
+ model,
+ y_col=y_col,
+ out_path=out_dir / "quad_regression.png",
+ x_col=x_col,
+ x_max=x_max,
+ y_min=y_min,
+ y_max=y_max,
+ scatter_color=scatter_color,
+ point_size=point_size,
+ alpha=alpha,
+ alpha_min=alpha_min,
+ alpha_max=alpha_max,
+ bins_x=bins_x,
+ bins_y=bins_y,
+ trend_frac=trend_frac,
+ trend_color=trend_color,
+ trend_linewidth=trend_linewidth,
+ trend_method=trend_method,
+ rolling_window=rolling_window,
+ savgol_window=savgol_window,
+ savgol_poly=savgol_poly,
+ )
-b0, b1, b2 = res.params
-p_b1_two = res.pvalues[1]
-p_b2_two = res.pvalues[2]
+ return {
+ "config": config_name,
+ "y_col": y_col,
+ "r2": r2_actual,
+ "r2_trend": r2_trend,
+ "auc": auc,
+ "params": {
+ "trend_method": trend_method,
+ "trend_frac": trend_frac,
+ "rolling_window": rolling_window,
+ "savgol_window": savgol_window,
+ "savgol_poly": savgol_poly,
+ "x_max": x_max,
+ "weights_alpha_range": (alpha_min, alpha_max),
+ },
+ "coeffs": model.params.tolist(),
+ "pvalues": model.pvalues.tolist(),
+ }
-# one-sided p-values for directional hypotheses
-p_b1_pos = (p_b1_two / 2) if (b1 > 0) else (1 - p_b1_two / 2)
-p_b2_neg = (p_b2_two / 2) if (b2 < 0) else (1 - p_b2_two / 2)
-# turning point (if concave)
-x_star = None
-y_star = None
-if b2 < 0:
- x_star = -b1 / (2 * b2)
- y_star = b0 + b1 * x_star + b2 * x_star**2
+def main() -> None:
+    """Script entry point: run the quadratic analysis for ``orders_amt_total``.
+
+    The target name is the only argument passed, so every other parameter of
+    ``generate_quadratic_analysis`` keeps its default value.
+    """
+    target = "orders_amt_total"
+    generate_quadratic_analysis(target)
-# Intersection with cost line: b0 + b1 x + b2 x^2 = c x -> b2 x^2 + (b1 - c) x + b0 = 0
-x_cross = None
-roots = np.roots([b2, (b1 - c), b0]) # may be complex
-roots = [r.real for r in roots if abs(r.imag) < 1e-8]
-roots_in_range = [r for r in roots if (stats_f["avg_imp_per_day"].min() <= r <= stats_f["avg_imp_per_day"].max())]
-if roots_in_range:
- # берём корень ближе к "правой" части (обычно пересечение интереснее там, где начинается невыгодно)
- x_cross = max(roots_in_range)
-# -----------------------------
-# Print results + interpretation (по-человечески)
-# -----------------------------
-print("\n=== Квадратичная регрессия (WLS, веса = n_clients, SE = HC3) ===")
-print(res.summary())
-
-print("\n=== Проверка гипотезы убывающей отдачи / спада ===")
-print(f"β1 (линейный эффект): {b1:.6f}, двусторонний p={p_b1_two:.4g}, односторонний p(β1>0)={p_b1_pos:.4g}")
-print(f"β2 (кривизна): {b2:.6f}, двусторонний p={p_b2_two:.4g}, односторонний p(β2<0)={p_b2_neg:.4g}")
-
-alpha = 0.05
-support = (b1 > 0) and (b2 < 0) and (p_b1_pos < alpha) and (p_b2_neg < alpha)
-
-if support:
- print("\nВывод: данные поддерживают гипотезу нелинейности.")
- print("Есть статистически значимый рост на малых x (β1>0) и насыщение/спад (β2<0).")
-else:
- print("\nВывод: строгого статистического подтверждения по знакам/значимости может не хватить.")
- print("Но знак коэффициентов и форма кривой всё равно могут быть согласованы с гипотезой.")
- print("На защите говори аккуратно: 'наблюдается тенденция/согласуется с гипотезой'.")
-
-if x_star is not None:
- print(f"\nОценка 'порога насыщения' (вершина параболы): x* = {x_star:.3f} показов/день")
- print(f"Прогноз среднего числа заказов в x*: y(x*) ≈ {y_star:.3f}")
- if not (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
- print("Внимание: x* вне диапазона наблюдений, интерпретация как 'оптимума' сомнительная.")
-else:
- print("\nВершина не считается как максимум: β2 >= 0 (нет выпуклости вниз).")
-
-if x_cross is not None:
- y_cross = b0 + b1 * x_cross + b2 * x_cross**2
- print(f"\nТочка пересечения с линейными расходами (в нормировке c={c:.4f}): x≈{x_cross:.3f}, y≈{y_cross:.3f}")
-else:
- print("\nПересечение с линией расходов в выбранной нормировке не найдено (или вне диапазона).")
-
-# -----------------------------
-# Plot: points + smooth + quadratic fit + cost + markers
-# -----------------------------
-x_grid = np.linspace(stats_f["avg_imp_per_day"].min(), stats_f["avg_imp_per_day"].max(), 300)
-y_hat = b0 + b1 * x_grid + b2 * x_grid**2
-cost_hat = c * x_grid
-
-plt.figure(figsize=(10, 8))
-
-plt.plot(
- stats_f["avg_imp_per_day"], stats_f["orders_mean"],
- marker="o", linestyle="-", linewidth=1, alpha=0.3,
- label="Среднее число заказов (по точкам)"
-)
-
-plt.plot(
- stats_f["avg_imp_per_day"], stats_f["orders_smooth"],
- color="red", linewidth=2.2,
- label="Сглаженный тренд (rolling mean)"
-)
-
-plt.plot(
- x_grid, y_hat,
- color="blue", linewidth=2.5,
- label="Квадратичная регрессия (WLS)"
-)
-
-plt.plot(
- x_grid, cost_hat,
- color="black", linestyle="--", linewidth=2,
- label="Линейные расходы на показы"
-)
-
-if x_star is not None and (stats_f["avg_imp_per_day"].min() <= x_star <= stats_f["avg_imp_per_day"].max()):
- plt.axvline(x_star, color="blue", linestyle=":", linewidth=2)
- plt.scatter([x_star], [y_star], color="blue", zorder=5)
- plt.text(x_star, y_star, f" x*={x_star:.2f}", va="bottom")
-
-if x_cross is not None:
- y_cross = b0 + b1 * x_cross + b2 * x_cross**2
- plt.axvline(x_cross, color="black", linestyle=":", linewidth=2, alpha=0.8)
- plt.scatter([x_cross], [y_cross], color="black", zorder=5)
- plt.text(x_cross, y_cross, f" пересечение≈{x_cross:.2f}", va="top")
-
-plt.xlabel("Среднее число показов в день")
-plt.ylabel("Среднее число заказов")
-plt.title("Нелинейный эффект интенсивности коммуникаций: квадратичная регрессия")
-plt.legend()
-plt.grid(alpha=0.3)
-plt.tight_layout()
-
-out_dir = project_root / "main_hypot"
-out_dir.mkdir(parents=True, exist_ok=True)
-out_path = out_dir / "quad_regression_with_costs.png"
-plt.savefig(out_path, dpi=150)
-print(f"\nSaved: {out_path}")
+# Run the analysis only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+ main()
diff --git a/main_hypot/best_bins.png b/old_generated_plots/best_bins.png
similarity index 100%
rename from main_hypot/best_bins.png
rename to old_generated_plots/best_bins.png
diff --git a/main_hypot/best_model_prob.png b/old_generated_plots/best_model_prob.png
similarity index 100%
rename from main_hypot/best_model_prob.png
rename to old_generated_plots/best_model_prob.png
diff --git a/main_hypot/orders_vs_avg_imp_per_day.png b/old_generated_plots/orders_vs_avg_imp_per_day.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_per_day.png
rename to old_generated_plots/orders_vs_avg_imp_per_day.png
diff --git a/main_hypot/orders_vs_avg_imp_per_day_filtered_smoothed.png b/old_generated_plots/orders_vs_avg_imp_per_day_filtered_smoothed.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_per_day_filtered_smoothed.png
rename to old_generated_plots/orders_vs_avg_imp_per_day_filtered_smoothed.png
diff --git a/main_hypot/orders_vs_avg_imp_per_day_smoothed.png b/old_generated_plots/orders_vs_avg_imp_per_day_smoothed.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_per_day_smoothed.png
rename to old_generated_plots/orders_vs_avg_imp_per_day_smoothed.png
diff --git a/main_hypot/orders_vs_avg_imp_per_day_smoothed_clean.png b/old_generated_plots/orders_vs_avg_imp_per_day_smoothed_clean.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_per_day_smoothed_clean.png
rename to old_generated_plots/orders_vs_avg_imp_per_day_smoothed_clean.png
diff --git a/main_hypot/orders_vs_avg_imp_scatter.png b/old_generated_plots/orders_vs_avg_imp_scatter.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_scatter.png
rename to old_generated_plots/orders_vs_avg_imp_scatter.png
diff --git a/main_hypot/orders_vs_avg_imp_scatter_clean.png b/old_generated_plots/orders_vs_avg_imp_scatter_clean.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_scatter_clean.png
rename to old_generated_plots/orders_vs_avg_imp_scatter_clean.png
diff --git a/main_hypot/orders_vs_avg_imp_scatter_trend.png b/old_generated_plots/orders_vs_avg_imp_scatter_trend.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_scatter_trend.png
rename to old_generated_plots/orders_vs_avg_imp_scatter_trend.png
diff --git a/main_hypot/orders_vs_avg_imp_with_costs.png b/old_generated_plots/orders_vs_avg_imp_with_costs.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_with_costs.png
rename to old_generated_plots/orders_vs_avg_imp_with_costs.png
diff --git a/main_hypot/orders_vs_avg_imp_without_costs.png b/old_generated_plots/orders_vs_avg_imp_without_costs.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_without_costs.png
rename to old_generated_plots/orders_vs_avg_imp_without_costs.png
diff --git a/main_hypot/orders_vs_avg_imp_without_costs_no_filter.png b/old_generated_plots/orders_vs_avg_imp_without_costs_no_filter.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_without_costs_no_filter.png
rename to old_generated_plots/orders_vs_avg_imp_without_costs_no_filter.png
diff --git a/main_hypot/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png b/old_generated_plots/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png
similarity index 100%
rename from main_hypot/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png
rename to old_generated_plots/orders_vs_avg_imp_without_costs_no_filter_no_dropouts.png
diff --git a/main_hypot/quad_regression_with_costs.png b/old_generated_plots/quad_regression_with_costs.png
similarity index 100%
rename from main_hypot/quad_regression_with_costs.png
rename to old_generated_plots/quad_regression_with_costs.png
diff --git a/main_hypot/stat_bins.png b/old_generated_plots/stat_bins.png
similarity index 100%
rename from main_hypot/stat_bins.png
rename to old_generated_plots/stat_bins.png
diff --git a/preanalysis/eda_utils.py b/preanalysis/eda_utils.py
new file mode 100644
index 0000000..802a6d8
--- /dev/null
+++ b/preanalysis/eda_utils.py
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Dict, Iterable, List
+
+import numpy as np
+import pandas as pd
+
+# Paths and column groups
+# NOTE: DATA_PATH is relative, so it resolves against the current working directory.
+DATA_PATH = Path("dataset/ds.csv")
+# Category suffixes from which every per-category column name below is built.
+CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
+
+# One list of column names per metric family: "<prefix>_<category>" for each category.
+ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
+PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
+ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
+PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
+ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
+
+# Every per-category metric column plus "age".
+NUMERIC_COLS = (
+ ACTIVE_IMP_COLS
+ + PASSIVE_IMP_COLS
+ + ACTIVE_CLICK_COLS
+ + PASSIVE_CLICK_COLS
+ + ORDER_COLS
+ + ["age"]
+)
+# Columns treated as categorical; both are normalized in load_data.
+CAT_COLS = ["gender_cd", "device_platform_cd"]
+
+
+def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
+    """Divide ``numerator`` by ``denominator``, yielding NaN instead of dividing by zero.
+
+    Args:
+        numerator: Series or scalar to be divided.
+        denominator: Series or scalar divisor. Zeros inside a Series are
+            replaced by NaN; a scalar equal to zero is replaced by NaN.
+
+    Returns:
+        The quotient. NOTE(review): the annotation says ``pd.Series``, but two
+        scalar inputs produce a scalar result.
+    """
+    if not isinstance(denominator, pd.Series):
+        # Scalar divisor: swap an exact zero for NaN so the quotient is NaN.
+        divisor = np.nan if float(denominator) == 0 else denominator
+        return numerator / divisor
+    return numerator / denominator.replace(0, np.nan)
+
+
+def normalize_gender(series: pd.Series) -> pd.Series:
+    """Collapse raw gender labels to ``"M"``, ``"F"`` or ``"UNKNOWN"``.
+
+    Values are stringified, stripped and upper-cased before lookup, so the
+    match is case-insensitive. Missing and unrecognised values both become
+    ``"UNKNOWN"``.
+    """
+    canonical = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
+    labels = series.fillna("UNKNOWN").astype(str)
+    labels = labels.str.strip().str.upper()
+    return labels.map(canonical).fillna("UNKNOWN")
+
+
+def normalize_device(series: pd.Series) -> pd.Series:
+    """Unify device platform labels to a canonical spelling.
+
+    The lookup key is the stripped value, lower-cased with spaces and
+    underscores removed; ``"ipad"`` is treated as ``"iPadOS"``. Values with no
+    mapping fall back to the stripped original in title case, so missing
+    values end up as ``"Unknown"``.
+    """
+    known = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
+    raw = series.fillna("unknown").astype(str).str.strip()
+    key = raw.str.lower().str.replace(" ", "").str.replace("_", "")
+    return key.map(known).fillna(raw.str.title())
+
+
+def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
+    """Add an ``age_group`` column that buckets ``age`` into five labelled bands.
+
+    Intervals are closed on the left (``right=False``), so age 25 belongs to
+    ``"25-34"``. The frame is modified in place and also returned.
+    """
+    edges = [0, 25, 35, 45, 55, np.inf]
+    names = ["<25", "25-34", "35-44", "45-54", "55+"]
+    age_band = pd.cut(df["age"], bins=edges, labels=names, right=False)
+    df["age_group"] = age_band
+    return df
+
+
+def add_totals(df: pd.DataFrame) -> pd.DataFrame:
+    """Add cross-category totals and the ratios derived from them.
+
+    Adds, in this order: row sums per metric family, combined click and
+    impression totals, then click-through and conversion ratios computed with
+    ``safe_divide`` (NaN where the denominator is zero). The frame is modified
+    in place and also returned.
+    """
+    family_totals = (
+        ("active_imp_total", ACTIVE_IMP_COLS),
+        ("passive_imp_total", PASSIVE_IMP_COLS),
+        ("active_click_total", ACTIVE_CLICK_COLS),
+        ("passive_click_total", PASSIVE_CLICK_COLS),
+        ("orders_amt_total", ORDER_COLS),
+    )
+    for target, parts in family_totals:
+        df[target] = df[parts].sum(axis=1)
+
+    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
+    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
+
+    # (new column, numerator column, denominator column)
+    ratios = (
+        ("active_ctr", "active_click_total", "active_imp_total"),
+        ("passive_ctr", "passive_click_total", "passive_imp_total"),
+        ("ctr_all", "click_total", "imp_total"),
+        ("cr_click2order", "orders_amt_total", "click_total"),
+        ("cr_imp2order", "orders_amt_total", "imp_total"),
+    )
+    for target, top, bottom in ratios:
+        df[target] = safe_divide(df[top], df[bottom])
+    return df
+
+
+def add_flags(df: pd.DataFrame) -> pd.DataFrame:
+    """Add 0/1 activity flags and a count of categories with orders.
+
+    ``has_active_comm`` / ``has_passive_comm`` are 1 when the row's active /
+    passive impressions plus clicks sum to more than zero; ``has_any_order``
+    is 1 when the order columns sum to more than zero;
+    ``order_categories_count`` counts order columns with a positive value.
+    The frame is modified in place and also returned.
+    """
+    active_activity = df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1)
+    passive_activity = df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1)
+    orders = df[ORDER_COLS]
+
+    df["has_active_comm"] = (active_activity > 0).astype(int)
+    df["has_passive_comm"] = (passive_activity > 0).astype(int)
+    df["has_any_order"] = (orders.sum(axis=1) > 0).astype(int)
+    df["order_categories_count"] = (orders > 0).sum(axis=1)
+    return df
+
+
+def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
+    """Read the CSV at ``path`` and return it cleaned and enriched.
+
+    Steps: parse ``business_dt`` as datetime, normalize ``gender_cd`` and
+    ``device_platform_cd``, then add the age group, totals/ratios and flag
+    columns.
+    """
+    frame = pd.read_csv(path)
+    frame["business_dt"] = pd.to_datetime(frame["business_dt"])
+
+    normalizers = (
+        ("gender_cd", normalize_gender),
+        ("device_platform_cd", normalize_device),
+    )
+    for column, normalizer in normalizers:
+        frame[column] = normalizer(frame[column])
+
+    for enrich in (add_age_group, add_totals, add_flags):
+        frame = enrich(frame)
+    return frame
+
+
+def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
+    """Summarize each column in ``cols`` with descriptive statistics.
+
+    Despite the name, the result holds more than the zero share: one row per
+    column with count, mean, median, std, min, quartiles, max, the share of
+    values equal to zero, and the 95th/99th percentiles.
+    """
+
+    def summarize(name: str) -> dict:
+        values = df[name]
+        return {
+            "col": name,
+            "count": values.count(),
+            "mean": values.mean(),
+            "median": values.median(),
+            "std": values.std(),
+            "min": values.min(),
+            "q25": values.quantile(0.25),
+            "q75": values.quantile(0.75),
+            "max": values.max(),
+            "share_zero": (values == 0).mean(),
+            "p95": values.quantile(0.95),
+            "p99": values.quantile(0.99),
+        }
+
+    return pd.DataFrame([summarize(name) for name in cols])
+
+
+def build_daily(df: pd.DataFrame) -> pd.DataFrame:
+    """Aggregate the per-category metrics to one row per ``business_dt``.
+
+    Sums every impression, click and order column per date, adds the totals
+    and ratios via ``add_totals``, and appends the weekday name. The ``.dt``
+    accessor is used, so ``business_dt`` must be datetime-like.
+    """
+    metric_cols = [
+        *ACTIVE_IMP_COLS,
+        *PASSIVE_IMP_COLS,
+        *ACTIVE_CLICK_COLS,
+        *PASSIVE_CLICK_COLS,
+        *ORDER_COLS,
+    ]
+    per_day = df.groupby("business_dt")[metric_cols].sum()
+    per_day = add_totals(per_day.reset_index())
+    per_day["day_of_week"] = per_day["business_dt"].dt.day_name()
+    return per_day
+
+
+def _first_mode(fallback: object):
+    """Build an aggregator that returns a group's first mode, or ``fallback`` if it has none."""
+
+    def pick(values: pd.Series) -> object:
+        # Compute the mode once per group instead of once for the check and once for the value.
+        modes = values.mode()
+        return fallback if modes.empty else modes.iat[0]
+
+    return pick
+
+
+def build_client(df: pd.DataFrame) -> pd.DataFrame:
+    """Aggregate row-level data to one row per client ``id``.
+
+    Per client: sums every impression, click and order column, takes the
+    median ``age`` and the most frequent ``gender_cd``, ``age_group`` and
+    ``device_platform_cd`` (falling back to ``"UNKNOWN"``, NaN and ``"Other"``
+    when a group has no mode). Then adds totals, ratios and flags, plus:
+
+    * ``contact_days``: number of distinct ``business_dt`` values;
+    * ``max_impressions_per_day``: the largest single-row impression total
+      (a row is presumably one client-day; confirm against the data);
+    * ``avg_impressions_per_contact_day``: via ``add_contact_density``.
+
+    Args:
+        df: Row-level frame as produced by ``load_data``.
+
+    Returns:
+        A new frame with one row per ``id``; ``df`` itself is not modified.
+    """
+    imp_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS
+    metric_cols = imp_cols + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
+
+    # Values are either pandas aggregation names or callables, hence ``object``.
+    agg_spec: Dict[str, object] = {col: "sum" for col in metric_cols}
+    agg_spec.update(
+        {
+            "age": "median",
+            "gender_cd": _first_mode("UNKNOWN"),
+            "age_group": _first_mode(np.nan),
+            "device_platform_cd": _first_mode("Other"),
+        }
+    )
+
+    by_id = df.groupby("id")
+    client = by_id.agg(agg_spec).reset_index()
+    contact_days = by_id["business_dt"].nunique().rename("contact_days")
+
+    # Row-level impression total computed on its own, so the full frame is
+    # not copied just to hold one temporary column.
+    row_imp = df[imp_cols].sum(axis=1)
+    max_imp_day = row_imp.groupby(df["id"]).max().rename("max_impressions_per_day")
+
+    client = add_totals(client)
+    client = add_flags(client)
+    client = client.merge(contact_days, on="id", how="left")
+    client = client.merge(max_imp_day, on="id", how="left")
+    return add_contact_density(client)
+
+
+def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
+    """Add ``avg_impressions_per_contact_day`` = ``imp_total`` / ``contact_days``.
+
+    If ``contact_days`` is absent the frame is returned unchanged. Otherwise
+    the column is added in place (NaN where ``contact_days`` is zero) and the
+    same frame is returned.
+    """
+    # contact_days must already be present
+    if "contact_days" not in df.columns:
+        return df
+    df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
+    return df
diff --git a/preanalysis/first_stage.ipynb b/preanalysis/first_stage.ipynb
index 47013c5..609c77f 100644
--- a/preanalysis/first_stage.ipynb
+++ b/preanalysis/first_stage.ipynb
@@ -42,19 +42,33 @@
"Requirement already satisfied: tzdata>=2022.7 in /opt/homebrew/Cellar/jupyterlab/4.4.3_2/libexec/lib/python3.13/site-packages (from pandas>=1.5.0->fastparquet) (2025.2)\n",
"Requirement already satisfied: six>=1.5 in /opt/homebrew/Cellar/jupyterlab/4.4.3_2/libexec/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas>=1.5.0->fastparquet) (1.17.0)\n",
"Downloading fastparquet-2024.11.0-cp313-cp313-macosx_11_0_arm64.whl (683 kB)\n",
- "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m683.8/683.8 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hDownloading cramjam-2.11.0-cp313-cp313-macosx_11_0_arm64.whl (1.7 MB)\n",
- "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m:01\u001b[0m\n",
- "\u001b[?25hDownloading fsspec-2025.12.0-py3-none-any.whl (201 kB)\n",
+ "\u001B[2K \u001B[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m683.8/683.8 kB\u001B[0m \u001B[31m5.0 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
+ "\u001B[?25hDownloading cramjam-2.11.0-cp313-cp313-macosx_11_0_arm64.whl (1.7 MB)\n",
+ "\u001B[2K \u001B[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m1.7/1.7 MB\u001B[0m \u001B[31m2.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m MB/s\u001B[0m eta \u001B[36m0:00:01\u001B[0m:01\u001B[0m\n",
+ "\u001B[?25hDownloading fsspec-2025.12.0-py3-none-any.whl (201 kB)\n",
"Installing collected packages: fsspec, cramjam, fastparquet\n",
- "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3/3\u001b[0m [fastparquet]\n",
- "\u001b[1A\u001b[2KSuccessfully installed cramjam-2.11.0 fastparquet-2024.11.0 fsspec-2025.12.0\n",
+ "\u001B[2K \u001B[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m3/3\u001B[0m [fastparquet]\n",
+ "\u001B[1A\u001B[2KSuccessfully installed cramjam-2.11.0 fastparquet-2024.11.0 fsspec-2025.12.0\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
- "pip install fastparquet"
+ "\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import seaborn as sns\n",
+ "import math\n",
+ "import matplotlib.pyplot as plt\n",
+ "from pathlib import Path\n",
+ "from eda_utils import (\n",
+ " load_data, DATA_PATH, CATEGORIES, ACTIVE_IMP_COLS, PASSIVE_IMP_COLS,\n",
+ " ACTIVE_CLICK_COLS, PASSIVE_CLICK_COLS, ORDER_COLS, NUMERIC_COLS, CAT_COLS,\n",
+ " describe_zero_share, safe_divide, build_daily, build_client, add_contact_density\n",
+ ")\n",
+ "pd.set_option(\"display.max_columns\", None)\n",
+ "pd.options.display.float_format = '{:,.3f}'.format\n",
+ "sns.set_theme(style=\"ticks\", palette=\"deep\")\n"
]
},
{
@@ -69,17 +83,8 @@
},
"outputs": [],
"source": [
- "\n",
- "import pandas as pd\n",
- "import numpy as np\n",
- "import seaborn as sns\n",
- "import math\n",
- "import matplotlib.pyplot as plt\n",
- "from pathlib import Path\n",
- "\n",
- "pd.set_option(\"display.max_columns\", None)\n",
- "pd.options.display.float_format = '{:,.3f}'.format\n",
- "sns.set_theme(style=\"ticks\", palette=\"deep\")\n"
+ "df = pd.read_csv(\"../dataset/ds.csv\")\n",
+ "print(f'Raw shape: {df.shape}')"
]
},
{
@@ -102,8 +107,10 @@
}
],
"source": [
- "df = pd.read_csv(\"../dataset/ds.csv\")\n",
- "print(f'Raw shape: {df.shape}')"
+ "import io\n",
+ "buf = io.StringIO()\n",
+ "df.info(buf=buf)\n",
+ "print('Raw info:\\n', buf.getvalue())"
]
},
{
@@ -168,16 +175,10 @@
]
}
],
- "source": [
- "import io\n",
- "buf = io.StringIO()\n",
- "df.info(buf=buf)\n",
- "print('Raw info:\\n', buf.getvalue())"
- ]
+ "source": "df.head(5)"
},
{
- "cell_type": "code",
- "execution_count": 5,
+ "cell_type": "markdown",
"id": "0d18c485",
"metadata": {
"ExecuteTime": {
@@ -185,352 +186,23 @@
"start_time": "2025-12-05T18:56:35.440402Z"
}
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " business_dt | \n",
- " active_imp_ent | \n",
- " active_click_ent | \n",
- " active_imp_super | \n",
- " active_click_super | \n",
- " active_imp_transport | \n",
- " active_click_transport | \n",
- " active_imp_shopping | \n",
- " active_click_shopping | \n",
- " active_imp_hotel | \n",
- " active_click_hotel | \n",
- " active_imp_avia | \n",
- " active_click_avia | \n",
- " passive_imp_ent | \n",
- " passive_click_ent | \n",
- " passive_imp_super | \n",
- " passive_click_super | \n",
- " passive_imp_transport | \n",
- " passive_click_transport | \n",
- " passive_imp_shopping | \n",
- " passive_click_shopping | \n",
- " passive_imp_hotel | \n",
- " passive_click_hotel | \n",
- " passive_imp_avia | \n",
- " passive_click_avia | \n",
- " orders_amt_ent | \n",
- " orders_amt_super | \n",
- " orders_amt_transport | \n",
- " orders_amt_shopping | \n",
- " orders_amt_hotel | \n",
- " orders_amt_avia | \n",
- " gender_cd | \n",
- " age | \n",
- " device_platform_cd | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 7119 | \n",
- " 2025-04-02 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 3.000 | \n",
- " 1.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 2 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " F | \n",
- " 40 | \n",
- " iOS | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 1797 | \n",
- " 2025-08-27 | \n",
- " 1.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0 | \n",
- " 0 | \n",
- " 3 | \n",
- " 0 | \n",
- " 2.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 2.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 0 | \n",
- " 0 | \n",
- " 5 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " M | \n",
- " 38 | \n",
- " IOS | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 8010 | \n",
- " 2025-07-10 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " M | \n",
- " 51 | \n",
- " Android | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 2360 | \n",
- " 2025-08-10 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " M | \n",
- " 37 | \n",
- " IOS | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 3457 | \n",
- " 2025-05-23 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 3.000 | \n",
- " 1.000 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 2 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " F | \n",
- " 27 | \n",
- " iOS | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id business_dt active_imp_ent active_click_ent active_imp_super \\\n",
- "0 7119 2025-04-02 0.000 0.000 3.000 \n",
- "1 1797 2025-08-27 1.000 1.000 0.000 \n",
- "2 8010 2025-07-10 0.000 0.000 1.000 \n",
- "3 2360 2025-08-10 0.000 0.000 0.000 \n",
- "4 3457 2025-05-23 0.000 0.000 1.000 \n",
- "\n",
- " active_click_super active_imp_transport active_click_transport \\\n",
- "0 1.000 1.000 0.000 \n",
- "1 0.000 0.000 0.000 \n",
- "2 1.000 0.000 0.000 \n",
- "3 0.000 0.000 1.000 \n",
- "4 0.000 0.000 0.000 \n",
- "\n",
- " active_imp_shopping active_click_shopping active_imp_hotel \\\n",
- "0 1.000 0.000 0 \n",
- "1 0.000 0.000 0 \n",
- "2 0.000 0.000 0 \n",
- "3 0.000 0.000 0 \n",
- "4 3.000 1.000 0 \n",
- "\n",
- " active_click_hotel active_imp_avia active_click_avia passive_imp_ent \\\n",
- "0 0 0 0 0.000 \n",
- "1 0 3 0 2.000 \n",
- "2 0 0 0 1.000 \n",
- "3 0 0 0 0.000 \n",
- "4 0 0 0 0.000 \n",
- "\n",
- " passive_click_ent passive_imp_super passive_click_super \\\n",
- "0 0.000 0.000 0.000 \n",
- "1 0.000 1.000 0.000 \n",
- "2 0.000 1.000 0.000 \n",
- "3 0.000 0.000 0.000 \n",
- "4 0.000 0.000 0.000 \n",
- "\n",
- " passive_imp_transport passive_click_transport passive_imp_shopping \\\n",
- "0 0.000 0.000 0.000 \n",
- "1 2.000 0.000 1.000 \n",
- "2 1.000 0.000 1.000 \n",
- "3 1.000 0.000 0.000 \n",
- "4 0.000 0.000 0.000 \n",
- "\n",
- " passive_click_shopping passive_imp_hotel passive_click_hotel \\\n",
- "0 0.000 2 0 \n",
- "1 0.000 0 0 \n",
- "2 0.000 0 0 \n",
- "3 0.000 1 0 \n",
- "4 0.000 2 0 \n",
- "\n",
- " passive_imp_avia passive_click_avia orders_amt_ent orders_amt_super \\\n",
- "0 0 0 0 0 \n",
- "1 5 0 0 0 \n",
- "2 1 0 0 0 \n",
- "3 1 0 0 0 \n",
- "4 0 0 0 0 \n",
- "\n",
- " orders_amt_transport orders_amt_shopping orders_amt_hotel \\\n",
- "0 0 0 0 \n",
- "1 0 0 0 \n",
- "2 0 0 0 \n",
- "3 0 0 0 \n",
- "4 0 0 0 \n",
- "\n",
- " orders_amt_avia gender_cd age device_platform_cd \n",
- "0 0 F 40 iOS \n",
- "1 0 M 38 IOS \n",
- "2 0 M 51 Android \n",
- "3 0 M 37 IOS \n",
- "4 0 F 27 iOS "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "b35ad277-a07b-44a2-8c7b-da5e47ef5435",
- "metadata": {},
- "source": [
- "# Анализ"
- ]
+ "source": "# Анализ"
},
{
"cell_type": "code",
- "execution_count": 6,
+ "id": "b35ad277-a07b-44a2-8c7b-da5e47ef5435",
+ "metadata": {},
+ "source": [
+ "n_rows, n_cols = df.shape\n",
+ "n_unique_clients = df['id'].nunique()\n",
+ "min_dt, max_dt = df['business_dt'].min(), df['business_dt'].max()\n",
+ "print({'rows': n_rows, 'cols': n_cols, 'unique_clients': n_unique_clients, 'min_dt': min_dt, 'max_dt': max_dt})"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
"id": "78a7f3d2",
"metadata": {
"ExecuteTime": {
@@ -538,26 +210,6 @@
"start_time": "2025-12-05T18:56:35.556685Z"
}
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'rows': 118189, 'cols': 35, 'unique_clients': 8339, 'min_dt': '2025-01-09', 'max_dt': '2025-11-04'}\n"
- ]
- }
- ],
- "source": [
- "n_rows, n_cols = df.shape\n",
- "n_unique_clients = df['id'].nunique()\n",
- "min_dt, max_dt = df['business_dt'].min(), df['business_dt'].max()\n",
- "print({'rows': n_rows, 'cols': n_cols, 'unique_clients': n_unique_clients, 'min_dt': min_dt, 'max_dt': max_dt})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "f691bec5-e203-4bcc-8645-333178708a66",
- "metadata": {},
"source": [
"Всего в датасете 118189 записей и 35 полей данных\n",
"8339 уникальных клиентов\n",
@@ -566,7 +218,18 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "id": "f691bec5-e203-4bcc-8645-333178708a66",
+ "metadata": {},
+ "source": [
+ "dup_table = df.groupby(['id', 'business_dt']).size().value_counts().reset_index()\n",
+ "dup_table.columns = ['rows_per_key', 'n_pairs']\n",
+ "dup_table.head()"
+ ],
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
"id": "a40091f6",
"metadata": {
"ExecuteTime": {
@@ -574,69 +237,18 @@
"start_time": "2025-12-05T18:56:35.602181Z"
}
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " rows_per_key | \n",
- " n_pairs | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 1 | \n",
- " 118189 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " rows_per_key n_pairs\n",
- "0 1 118189"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "dup_table = df.groupby(['id', 'business_dt']).size().value_counts().reset_index()\n",
- "dup_table.columns = ['rows_per_key', 'n_pairs']\n",
- "dup_table.head()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c1c33a2f-6ad9-476a-9b62-ba5511189527",
- "metadata": {},
- "source": [
- "Датасет не содержит дублирования по паре id + дата"
- ]
+ "source": "Датасет не содержит дублирования по паре id + дата"
},
{
"cell_type": "code",
- "execution_count": 8,
+ "id": "c1c33a2f-6ad9-476a-9b62-ba5511189527",
+ "metadata": {},
+ "source": "df.groupby('id').size().describe()",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
"id": "43cbdc8a",
"metadata": {
"ExecuteTime": {
@@ -644,41 +256,18 @@
"start_time": "2025-12-05T18:56:35.680252Z"
}
},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "count 8,339.000\n",
- "mean 14.173\n",
- "std 4.762\n",
- "min 4.000\n",
- "25% 11.000\n",
- "50% 13.000\n",
- "75% 16.000\n",
- "max 52.000\n",
- "dtype: float64"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.groupby('id').size().describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4a0eba3c-fe71-456e-84f6-8e672c7110b5",
- "metadata": {},
- "source": [
- "В среднем каждый клиент содержит по 14 записей, всего кол-во записей распределено на промежутке от 4 до 52."
- ]
+ "source": "В среднем каждый клиент содержит по 14 записей, всего кол-во записей распределено на промежутке от 4 до 52."
},
{
"cell_type": "code",
- "execution_count": 9,
+ "id": "4a0eba3c-fe71-456e-84f6-8e672c7110b5",
+ "metadata": {},
+ "source": "df.isna().sum().to_frame('missing')",
+ "outputs": [],
+ "execution_count": null
+ },
+ {
+ "cell_type": "markdown",
"id": "84b726d3",
"metadata": {
"ExecuteTime": {
@@ -686,730 +275,26 @@
"start_time": "2025-12-05T18:56:35.783710Z"
}
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " missing | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | id | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | business_dt | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_imp_ent | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_click_ent | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_imp_super | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_click_super | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_imp_transport | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_click_transport | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_imp_shopping | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_click_shopping | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_imp_hotel | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_click_hotel | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_imp_avia | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | active_click_avia | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_imp_ent | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_click_ent | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_imp_super | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_click_super | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_imp_transport | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_click_transport | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_imp_shopping | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_click_shopping | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_imp_hotel | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_click_hotel | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_imp_avia | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | passive_click_avia | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | orders_amt_ent | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | orders_amt_super | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | orders_amt_transport | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | orders_amt_shopping | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | orders_amt_hotel | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | orders_amt_avia | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | gender_cd | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | age | \n",
- " 0 | \n",
- "
\n",
- " \n",
- " | device_platform_cd | \n",
- " 0 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " missing\n",
- "id 0\n",
- "business_dt 0\n",
- "active_imp_ent 0\n",
- "active_click_ent 0\n",
- "active_imp_super 0\n",
- "active_click_super 0\n",
- "active_imp_transport 0\n",
- "active_click_transport 0\n",
- "active_imp_shopping 0\n",
- "active_click_shopping 0\n",
- "active_imp_hotel 0\n",
- "active_click_hotel 0\n",
- "active_imp_avia 0\n",
- "active_click_avia 0\n",
- "passive_imp_ent 0\n",
- "passive_click_ent 0\n",
- "passive_imp_super 0\n",
- "passive_click_super 0\n",
- "passive_imp_transport 0\n",
- "passive_click_transport 0\n",
- "passive_imp_shopping 0\n",
- "passive_click_shopping 0\n",
- "passive_imp_hotel 0\n",
- "passive_click_hotel 0\n",
- "passive_imp_avia 0\n",
- "passive_click_avia 0\n",
- "orders_amt_ent 0\n",
- "orders_amt_super 0\n",
- "orders_amt_transport 0\n",
- "orders_amt_shopping 0\n",
- "orders_amt_hotel 0\n",
- "orders_amt_avia 0\n",
- "gender_cd 0\n",
- "age 0\n",
- "device_platform_cd 0"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.isna().sum().to_frame('missing')"
- ]
+ "source": "В датасете отсутствуют пропущенные значения"
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
"id": "16acc953-c151-43b8-b923-aab2c3a25ffb",
"metadata": {},
- "source": [
- "В датасете отсуствуют пропущенные значения"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "4a5160f3-f243-478e-b793-db857eefb053",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " count | \n",
- " mean | \n",
- " std | \n",
- " min | \n",
- " 25% | \n",
- " 50% | \n",
- " 75% | \n",
- " max | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | id | \n",
- " 118,189.000 | \n",
- " 4,131.899 | \n",
- " 2,408.258 | \n",
- " 1.000 | \n",
- " 2,038.000 | \n",
- " 4,121.000 | \n",
- " 6,219.000 | \n",
- " 8,339.000 | \n",
- "
\n",
- " \n",
- " | active_imp_ent | \n",
- " 118,189.000 | \n",
- " 0.314 | \n",
- " 0.614 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 9.000 | \n",
- "
\n",
- " \n",
- " | active_click_ent | \n",
- " 118,189.000 | \n",
- " 0.240 | \n",
- " 0.483 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 6.000 | \n",
- "
\n",
- " \n",
- " | active_imp_super | \n",
- " 118,189.000 | \n",
- " 0.380 | \n",
- " 0.809 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 11.000 | \n",
- "
\n",
- " \n",
- " | active_click_super | \n",
- " 118,189.000 | \n",
- " 0.276 | \n",
- " 0.542 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 9.000 | \n",
- "
\n",
- " \n",
- " | active_imp_transport | \n",
- " 118,189.000 | \n",
- " 0.574 | \n",
- " 0.944 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 24.000 | \n",
- "
\n",
- " \n",
- " | active_click_transport | \n",
- " 118,189.000 | \n",
- " 0.443 | \n",
- " 0.645 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 11.000 | \n",
- "
\n",
- " \n",
- " | active_imp_shopping | \n",
- " 118,189.000 | \n",
- " 0.255 | \n",
- " 0.565 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 6.000 | \n",
- "
\n",
- " \n",
- " | active_click_shopping | \n",
- " 118,189.000 | \n",
- " 0.199 | \n",
- " 0.450 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 5.000 | \n",
- "
\n",
- " \n",
- " | active_imp_hotel | \n",
- " 118,189.000 | \n",
- " 0.141 | \n",
- " 0.483 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 7.000 | \n",
- "
\n",
- " \n",
- " | active_click_hotel | \n",
- " 118,189.000 | \n",
- " 0.035 | \n",
- " 0.185 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 2.000 | \n",
- "
\n",
- " \n",
- " | active_imp_avia | \n",
- " 118,189.000 | \n",
- " 0.193 | \n",
- " 0.523 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 6.000 | \n",
- "
\n",
- " \n",
- " | active_click_avia | \n",
- " 118,189.000 | \n",
- " 0.054 | \n",
- " 0.227 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 2.000 | \n",
- "
\n",
- " \n",
- " | passive_imp_ent | \n",
- " 118,189.000 | \n",
- " 0.552 | \n",
- " 1.256 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 42.000 | \n",
- "
\n",
- " \n",
- " | passive_click_ent | \n",
- " 118,189.000 | \n",
- " 0.027 | \n",
- " 0.190 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 11.000 | \n",
- "
\n",
- " \n",
- " | passive_imp_super | \n",
- " 118,189.000 | \n",
- " 0.280 | \n",
- " 0.859 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 26.000 | \n",
- "
\n",
- " \n",
- " | passive_click_super | \n",
- " 118,189.000 | \n",
- " 0.009 | \n",
- " 0.118 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 5.000 | \n",
- "
\n",
- " \n",
- " | passive_imp_transport | \n",
- " 118,189.000 | \n",
- " 0.794 | \n",
- " 1.472 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 43.000 | \n",
- "
\n",
- " \n",
- " | passive_click_transport | \n",
- " 118,189.000 | \n",
- " 0.020 | \n",
- " 0.155 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 7.000 | \n",
- "
\n",
- " \n",
- " | passive_imp_shopping | \n",
- " 118,189.000 | \n",
- " 0.689 | \n",
- " 1.768 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 83.000 | \n",
- "
\n",
- " \n",
- " | passive_click_shopping | \n",
- " 118,189.000 | \n",
- " 0.011 | \n",
- " 0.128 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 7.000 | \n",
- "
\n",
- " \n",
- " | passive_imp_hotel | \n",
- " 118,189.000 | \n",
- " 0.987 | \n",
- " 1.811 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 44.000 | \n",
- "
\n",
- " \n",
- " | passive_click_hotel | \n",
- " 118,189.000 | \n",
- " 0.058 | \n",
- " 0.242 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 8.000 | \n",
- "
\n",
- " \n",
- " | passive_imp_avia | \n",
- " 118,189.000 | \n",
- " 0.702 | \n",
- " 1.400 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 1.000 | \n",
- " 52.000 | \n",
- "
\n",
- " \n",
- " | passive_click_avia | \n",
- " 118,189.000 | \n",
- " 0.028 | \n",
- " 0.182 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 8.000 | \n",
- "
\n",
- " \n",
- " | orders_amt_ent | \n",
- " 118,189.000 | \n",
- " 0.010 | \n",
- " 0.115 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 11.000 | \n",
- "
\n",
- " \n",
- " | orders_amt_super | \n",
- " 118,189.000 | \n",
- " 0.022 | \n",
- " 0.155 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 4.000 | \n",
- "
\n",
- " \n",
- " | orders_amt_transport | \n",
- " 118,189.000 | \n",
- " 0.053 | \n",
- " 0.242 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 5.000 | \n",
- "
\n",
- " \n",
- " | orders_amt_shopping | \n",
- " 118,189.000 | \n",
- " 0.008 | \n",
- " 0.114 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 11.000 | \n",
- "
\n",
- " \n",
- " | orders_amt_hotel | \n",
- " 118,189.000 | \n",
- " 0.004 | \n",
- " 0.067 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 3.000 | \n",
- "
\n",
- " \n",
- " | orders_amt_avia | \n",
- " 118,189.000 | \n",
- " 0.009 | \n",
- " 0.109 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 0.000 | \n",
- " 6.000 | \n",
- "
\n",
- " \n",
- " | age | \n",
- " 118,189.000 | \n",
- " 42.360 | \n",
- " 9.930 | \n",
- " 15.000 | \n",
- " 36.000 | \n",
- " 41.000 | \n",
- " 48.000 | \n",
- " 80.000 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " count mean std min 25% \\\n",
- "id 118,189.000 4,131.899 2,408.258 1.000 2,038.000 \n",
- "active_imp_ent 118,189.000 0.314 0.614 0.000 0.000 \n",
- "active_click_ent 118,189.000 0.240 0.483 0.000 0.000 \n",
- "active_imp_super 118,189.000 0.380 0.809 0.000 0.000 \n",
- "active_click_super 118,189.000 0.276 0.542 0.000 0.000 \n",
- "active_imp_transport 118,189.000 0.574 0.944 0.000 0.000 \n",
- "active_click_transport 118,189.000 0.443 0.645 0.000 0.000 \n",
- "active_imp_shopping 118,189.000 0.255 0.565 0.000 0.000 \n",
- "active_click_shopping 118,189.000 0.199 0.450 0.000 0.000 \n",
- "active_imp_hotel 118,189.000 0.141 0.483 0.000 0.000 \n",
- "active_click_hotel 118,189.000 0.035 0.185 0.000 0.000 \n",
- "active_imp_avia 118,189.000 0.193 0.523 0.000 0.000 \n",
- "active_click_avia 118,189.000 0.054 0.227 0.000 0.000 \n",
- "passive_imp_ent 118,189.000 0.552 1.256 0.000 0.000 \n",
- "passive_click_ent 118,189.000 0.027 0.190 0.000 0.000 \n",
- "passive_imp_super 118,189.000 0.280 0.859 0.000 0.000 \n",
- "passive_click_super 118,189.000 0.009 0.118 0.000 0.000 \n",
- "passive_imp_transport 118,189.000 0.794 1.472 0.000 0.000 \n",
- "passive_click_transport 118,189.000 0.020 0.155 0.000 0.000 \n",
- "passive_imp_shopping 118,189.000 0.689 1.768 0.000 0.000 \n",
- "passive_click_shopping 118,189.000 0.011 0.128 0.000 0.000 \n",
- "passive_imp_hotel 118,189.000 0.987 1.811 0.000 0.000 \n",
- "passive_click_hotel 118,189.000 0.058 0.242 0.000 0.000 \n",
- "passive_imp_avia 118,189.000 0.702 1.400 0.000 0.000 \n",
- "passive_click_avia 118,189.000 0.028 0.182 0.000 0.000 \n",
- "orders_amt_ent 118,189.000 0.010 0.115 0.000 0.000 \n",
- "orders_amt_super 118,189.000 0.022 0.155 0.000 0.000 \n",
- "orders_amt_transport 118,189.000 0.053 0.242 0.000 0.000 \n",
- "orders_amt_shopping 118,189.000 0.008 0.114 0.000 0.000 \n",
- "orders_amt_hotel 118,189.000 0.004 0.067 0.000 0.000 \n",
- "orders_amt_avia 118,189.000 0.009 0.109 0.000 0.000 \n",
- "age 118,189.000 42.360 9.930 15.000 36.000 \n",
- "\n",
- " 50% 75% max \n",
- "id 4,121.000 6,219.000 8,339.000 \n",
- "active_imp_ent 0.000 0.000 9.000 \n",
- "active_click_ent 0.000 0.000 6.000 \n",
- "active_imp_super 0.000 0.000 11.000 \n",
- "active_click_super 0.000 0.000 9.000 \n",
- "active_imp_transport 0.000 1.000 24.000 \n",
- "active_click_transport 0.000 1.000 11.000 \n",
- "active_imp_shopping 0.000 0.000 6.000 \n",
- "active_click_shopping 0.000 0.000 5.000 \n",
- "active_imp_hotel 0.000 0.000 7.000 \n",
- "active_click_hotel 0.000 0.000 2.000 \n",
- "active_imp_avia 0.000 0.000 6.000 \n",
- "active_click_avia 0.000 0.000 2.000 \n",
- "passive_imp_ent 0.000 1.000 42.000 \n",
- "passive_click_ent 0.000 0.000 11.000 \n",
- "passive_imp_super 0.000 0.000 26.000 \n",
- "passive_click_super 0.000 0.000 5.000 \n",
- "passive_imp_transport 0.000 1.000 43.000 \n",
- "passive_click_transport 0.000 0.000 7.000 \n",
- "passive_imp_shopping 0.000 1.000 83.000 \n",
- "passive_click_shopping 0.000 0.000 7.000 \n",
- "passive_imp_hotel 0.000 1.000 44.000 \n",
- "passive_click_hotel 0.000 0.000 8.000 \n",
- "passive_imp_avia 0.000 1.000 52.000 \n",
- "passive_click_avia 0.000 0.000 8.000 \n",
- "orders_amt_ent 0.000 0.000 11.000 \n",
- "orders_amt_super 0.000 0.000 4.000 \n",
- "orders_amt_transport 0.000 0.000 5.000 \n",
- "orders_amt_shopping 0.000 0.000 11.000 \n",
- "orders_amt_hotel 0.000 0.000 3.000 \n",
- "orders_amt_avia 0.000 0.000 6.000 \n",
- "age 41.000 48.000 80.000 "
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.describe().T"
- ]
+ "source": "describe_zero_share(df, ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS)",
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
- "id": "4a2e8f9f-e9af-4bf9-bc69-cf12979ef359",
+ "id": "4a5160f3-f243-478e-b793-db857eefb053",
"metadata": {},
- "source": [
- "Достаточно странные минимальные/максимальные значения некоторых полей, относительно средних значений, построим boxplots"
- ]
+ "source": "Минимальные/максимальные значения некоторых полей выглядят достаточно странно относительно средних значений — построим boxplots"
},
{
"cell_type": "code",
- "execution_count": 13,
- "id": "7c2b1102-0114-48de-a88b-f442812b70d6",
+ "id": "4a2e8f9f-e9af-4bf9-bc69-cf12979ef359",
"metadata": {},
- "outputs": [
- {
- "data": {
- "image/png": "",
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
"source": [
"# Создание subplot в несколько строк\n",
"numeric_cols = df.select_dtypes(include=['number']).columns\n",
@@ -1438,15 +323,23 @@
"plt.tight_layout()\n",
"plt.subplots_adjust(top=0.95) # Добавляем место для заголовка\n",
"plt.show()"
- ]
+ ],
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "markdown",
+ "id": "7c2b1102-0114-48de-a88b-f442812b70d6",
+ "metadata": {},
+ "source": "Смотря на эти графики, хочется плакать, но ничего: попробуем посчитать, сколько конкретно у нас ненулевых значений (т.е. заказов)"
+ },
+ {
+ "cell_type": "code",
"id": "33d8e762-c154-4ece-9558-e63a99b8dafa",
"metadata": {},
- "source": [
- "смотря на эти графики хочется плакать, но ничего, попробуем посчитать сколько конкретно у нас ненулевых значений (т.е. заказов)"
- ]
+ "source": "fix, axes = plt.subplot",
+ "outputs": [],
+ "execution_count": null
},
{
"cell_type": "code",
@@ -1543,12 +436,7 @@
"output_type": "display_data"
}
],
- "source": [
- "for field in fieldsToCount:\n",
- " df.loc[df[field] > 0, field] = 1\n",
- "\n",
- "countsOfOrdersByCategories()"
- ]
+ "source": ""
},
{
"cell_type": "code",
@@ -1581,10 +469,10 @@
}
],
"source": [
- "age_check = df['age'].describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99])\n",
- "age_outliers = df[(df['age'] < 14) | (df['age'] > 100)]\n",
- "print(age_check)\n",
- "print('Outlier share:', len(age_outliers) / len(df))"
+ "for field in fieldsToCount:\n",
+ " df.loc[df[field] > 0, field] = 1\n",
+ "\n",
+ "countsOfOrdersByCategories()"
]
},
{
@@ -1610,13 +498,10 @@
}
],
"source": [
- "cnt_by_date = df.groupby('business_dt').size().reset_index(name='n_records')\n",
- "fig, ax = plt.subplots(figsize=(12, 4))\n",
- "sns.lineplot(data=cnt_by_date, x='business_dt', y='n_records', ax=ax)\n",
- "ax.set_title('Количество записей по датам')\n",
- "ax.set_ylabel('N')\n",
- "plt.xticks(rotation=45)\n",
- "plt.tight_layout()"
+ "age_check = df['age'].describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99])\n",
+ "age_outliers = df[(df['age'] < 14) | (df['age'] > 100)]\n",
+ "print(age_check)\n",
+ "print('Outlier share:', len(age_outliers) / len(df))"
]
},
{
@@ -1642,10 +527,8 @@
}
],
"source": [
- "fig, axes = plt.subplots(1, 1, figsize=(10, 4))\n",
- "sns.boxplot(data=df, y='age')\n",
- "axes.set_title('Возраст (boxplot)')\n",
- "plt.tight_layout()"
+ "categoricals = {col: df[col].value_counts(dropna=False) for col in CAT_COLS}\n",
+ "categoricals"
]
},
{
@@ -1668,10 +551,13 @@
}
],
"source": [
- "SAVE_CLEANED = True\n",
- "if SAVE_CLEANED:\n",
- " df.to_parquet('dataset/ds_clean.parquet', engine=\"fastparquet\", index=False)\n",
- " print('Saved dataset/ds_clean.parquet')"
+ "cnt_by_date = df.groupby('business_dt').size().reset_index(name='n_records')\n",
+ "fig, ax = plt.subplots(figsize=(12, 4))\n",
+ "sns.lineplot(data=cnt_by_date, x='business_dt', y='n_records', ax=ax)\n",
+ "ax.set_title('Количество записей по датам')\n",
+ "ax.set_ylabel('N')\n",
+ "plt.xticks(rotation=45)\n",
+ "plt.tight_layout()"
]
},
{
@@ -1680,7 +566,33 @@
"id": "5b9e3d6a-9624-4a11-9984-de5b1a44b04d",
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "fig, axes = plt.subplots(1, 1, figsize=(10, 4))\n",
+ "sns.boxplot(data=df, y='age')\n",
+ "axes.set_title('Возраст (boxplot)')\n",
+ "plt.tight_layout()"
+ ]
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": [
+ "SAVE_CLEANED = True\n",
+ "if SAVE_CLEANED:\n",
+ " df.to_parquet('dataset/ds_clean.parquet', index=False)\n",
+ " print('Saved dataset/ds_clean.parquet')"
+ ],
+ "id": "89d49461c63a71bf"
+ },
+ {
+ "metadata": {},
+ "cell_type": "code",
+ "outputs": [],
+ "execution_count": null,
+ "source": "",
+ "id": "283fc5c684b10f2d"
}
],
"metadata": {