gadem
582
old data/best_model_and_plots.py
Normal file
@@ -0,0 +1,582 @@
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import Tuple
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
from scipy.signal import savgol_filter
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
from statsmodels.nonparametric.smoothers_lowess import lowess
|
||||
import numpy as np
|
||||
|
||||
# Global plotting style: seaborn "whitegrid" theme and square 8x8 figures.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 8)

# Paths are resolved relative to the parent of the directory holding this script.
project_root = Path(__file__).resolve().parent.parent
DB_PATH = project_root / "dataset" / "ds.sqlite"
BASE_OUT_DIR = project_root / "main_hypot"

# Data constants: category suffixes and the column names derived from them.
CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
ACTIVE_IMP_COLS = ["active_imp_" + category for category in CATEGORIES]
PASSIVE_IMP_COLS = ["passive_imp_" + category for category in CATEGORIES]
ORDER_COLS = ["orders_amt_" + category for category in CATEGORIES]

# Visualization / cleaning constants.
X_COL = "avg_imp_per_day"  # the x axis is always this column by default
DEFAULT_X_MAX = 18

# Scatter appearance.
DEFAULT_SCATTER_COLOR = "#2c7bb6"
DEFAULT_POINT_SIZE = 20
DEFAULT_ALPHA = 0.08
DEFAULT_TREND_ALPHA = 0.1

# Trend line appearance and LOWESS fraction.
DEFAULT_TREND_FRAC = 0.3
DEFAULT_TREND_COLOR = "red"
DEFAULT_TREND_LINEWIDTH = 2.5

# Outlier fences: quantile pair and IQR multiplier.
DEFAULT_IQR_K = 1.5
DEFAULT_Q_LOW = 0.05
DEFAULT_Q_HIGH = 0.95

# Density-driven alpha range and 2D binning resolution.
DEFAULT_ALPHA_MIN = 0.04
DEFAULT_ALPHA_MAX = 0.7
DEFAULT_BINS_X = 60
DEFAULT_BINS_Y = 60

# Y axis limits.
DEFAULT_Y_MIN = -0.5
DEFAULT_Y_MAX = 10

# Trend smoothing defaults.
DEFAULT_TREND_METHOD = "savgol"  # options: lowess, rolling, savgol
DEFAULT_ROLLING_WINDOW = 200
DEFAULT_SAVGOL_WINDOW = 501
DEFAULT_SAVGOL_POLY = 2
|
||||
|
||||
|
||||
def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide two series element-wise, yielding NaN where the denominator is zero.

    Args:
        numerator: Values to divide.
        denominator: Divisors; zeros are treated as missing.

    Returns:
        ``numerator / denominator`` with NaN in every position whose
        denominator equals zero.
    """
    # ``where`` masks zeros with NaN and keeps a numeric dtype. Replacing them
    # with ``pd.NA`` can turn an integer series into ``object`` dtype, which
    # then leaks into quantile/NumPy computations downstream.
    denom = denominator.where(denominator != 0)
    return numerator / denom
|
||||
|
||||
|
||||
def load_client_level(db_path: Path) -> pd.DataFrame:
    """Build per-client aggregates without depending on eda_utils.

    Reads the whole ``communications`` table, sums impressions and order
    amounts across all categories per row, then aggregates by ``id``.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        One row per ``id`` with ``imp_total``, ``orders_amt_total``,
        ``contact_days`` (number of distinct ``business_dt`` values) and the
        ``X_COL`` column (``imp_total`` divided by ``contact_days``).
    """
    from contextlib import closing

    # ``closing`` guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(db_path)) as conn:
        df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])

    df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
    df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)

    client = (
        df.groupby("id")
        .agg(
            imp_total=("imp_total", "sum"),
            orders_amt_total=("orders_amt_total", "sum"),
            contact_days=("business_dt", "nunique"),
        )
        .reset_index()
    )

    client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
    print(f"Loaded {len(client)} clients with {X_COL} computed.")
    return client
|
||||
|
||||
|
||||
def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
|
||||
q1, q3 = series.quantile([q_low, q_high])
|
||||
iqr = q3 - q1
|
||||
return q1 - iqr_k * iqr, q3 + iqr_k * iqr
|
||||
|
||||
|
||||
def remove_outliers(
    df: pd.DataFrame,
    y_col: str,
    x_col: str = X_COL,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
) -> pd.DataFrame:
    """Drop rows whose x or y value falls outside IQR-style fences.

    Fences are computed independently for ``x_col`` and ``y_col`` via
    ``_bounds``; the lower fence of each axis is clamped at zero. A copy of
    the surviving rows is returned and the before/after counts are printed.
    """
    x_floor, x_ceiling = _bounds(df[x_col], q_low, q_high, iqr_k)
    y_floor, y_ceiling = _bounds(df[y_col], q_low, q_high, iqr_k)

    x_ok = df[x_col].between(max(0, x_floor), x_ceiling)
    y_ok = df[y_col].between(max(0, y_floor), y_ceiling)
    filtered = df.loc[x_ok & y_ok].copy()

    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
    return filtered
|
||||
|
||||
|
||||
def compute_density_alpha(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    x_max: float,
    *,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    y_min: float = DEFAULT_Y_MIN,
    y_max_limit: float = DEFAULT_Y_MAX,
) -> np.ndarray:
    """Return a per-point alpha that grows with the local 2D point density.

    Points are binned on a ``bins_x`` x ``bins_y`` grid spanning
    ``[min(x.min(), 0), x_max]`` by ``[y_min, min(y.max(), y_max_limit)]``.
    Each point's weight is ``(bin_count / max_bin_count) ** sqrt(1.5)`` and
    the returned alpha is ``alpha_min + (alpha_max - alpha_min) * weight``.

    Args:
        df: Data containing ``x_col`` and ``y_col``.
        x_col: Column used for the horizontal axis.
        y_col: Column used for the vertical axis.
        x_max: Right edge of the binning grid.
        bins_x: Number of bins along x.
        bins_y: Number of bins along y.
        alpha_min: Alpha assigned to the sparsest points.
        alpha_max: Alpha assigned to points in the densest bin.
        y_min: Lower edge of the binning grid along y.
        y_max_limit: Cap for the upper edge of the grid along y.

    Returns:
        Array of alphas aligned with the rows of *df*; empty if *df* is empty.
    """
    x_vals = df[x_col].to_numpy()
    y_vals = df[y_col].to_numpy()

    if len(x_vals) == 0:
        return np.array([])

    x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
    y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
    y_edges = np.linspace(y_min, y_upper, bins_y + 1)

    x_bins = np.digitize(x_vals, x_edges) - 1
    y_bins = np.digitize(y_vals, y_edges) - 1
    # np.digitize uses half-open bins, so a value sitting exactly on the last
    # edge lands one past the final bin and would never be counted. Fold it
    # back into the last bin (the convention np.histogram follows).
    x_bins[x_vals == x_edges[-1]] = bins_x - 1
    y_bins[y_vals == y_edges[-1]] = bins_y - 1

    valid = (
        (x_bins >= 0) & (x_bins < bins_x) &
        (y_bins >= 0) & (y_bins < bins_y)
    )
    counts = np.zeros((bins_x, bins_y), dtype=int)
    # Unbuffered accumulation: repeated (x, y) index pairs each add one.
    np.add.at(counts, (x_bins[valid], y_bins[valid]), 1)

    # Points outside the grid borrow the count of the nearest edge bin.
    bin_counts = counts[
        np.clip(x_bins, 0, bins_x - 1),
        np.clip(y_bins, 0, bins_y - 1),
    ]
    max_count = bin_counts.max() if len(bin_counts) else 1
    if max_count == 0:
        weight = np.zeros_like(bin_counts, dtype=float)
    else:
        weight = (bin_counts / max_count) ** np.sqrt(1.5)
    weight = np.clip(weight, 0, 1)
    return alpha_min + (alpha_max - alpha_min) * weight
|
||||
|
||||
|
||||
def compute_trend(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = X_COL,
    method: str = DEFAULT_TREND_METHOD,
    lowess_frac: float = DEFAULT_TREND_FRAC,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> Tuple[np.ndarray, np.ndarray]:
    """Return ``(x_sorted, trend_y)`` computed with the chosen smoothing method.

    Rows with a missing x or y are dropped and the rest are sorted by x.

    Args:
        df: Data containing ``x_col`` and ``y_col``.
        y_col: Column to smooth.
        x_col: Column to sort by.
        method: ``"lowess"``, ``"rolling"`` or ``"savgol"`` (case-insensitive);
            any other value falls back to LOWESS.
        lowess_frac: ``frac`` passed to LOWESS.
        rolling_window: Centered rolling-mean window (at least 3, made odd).
        savgol_window: Savitzky-Golay window (at least 5, made odd, and
            shrunk to fit the number of points).
        savgol_poly: Savitzky-Golay polynomial order (capped at window - 1).

    Returns:
        Two arrays of equal length; both are empty when no rows remain.
    """
    d = df[[x_col, y_col]].dropna().sort_values(x_col)
    x_vals = d[x_col].to_numpy()
    y_vals = d[y_col].to_numpy()
    n_points = len(x_vals)

    if n_points == 0:
        return np.array([]), np.array([])

    m = method.lower()
    if m == "rolling":
        w = max(3, rolling_window)
        if w % 2 == 0:
            w += 1
        y_trend = pd.Series(y_vals).rolling(window=w, center=True, min_periods=1).mean().to_numpy()
        return x_vals, y_trend
    if m == "savgol":
        w = max(5, savgol_window)
        if w % 2 == 0:
            w += 1
        # savgol_filter with mode="interp" rejects a window longer than the
        # data, so shrink it to the largest odd size that fits.
        if w > n_points:
            w = n_points if n_points % 2 else n_points - 1
        if w < 3:
            # Too few points to smooth; return them unchanged.
            return x_vals, y_vals.astype(float)
        poly = min(savgol_poly, w - 1)
        y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
        return x_vals, y_trend

    # "lowess" and any unrecognized method share the LOWESS path.
    trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
    return trend[:, 0], trend[:, 1]
|
||||
|
||||
|
||||
def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
    """Return a copy of the rows whose ``x_col`` value is at most ``x_max``.

    The total and retained row counts are printed.
    """
    within = df[x_col].le(x_max)
    subset = df.loc[within].copy()
    print(f"{len(df)} points; {len(subset)} within x<={x_max}.")
    return subset
|
||||
|
||||
|
||||
def plot_density_scatter(
    df: pd.DataFrame,
    y_col: str,
    title: str,
    out_path: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    with_trend: bool = False,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
    return_fig: bool = False,
) -> "Tuple[plt.Figure, plt.Axes, Tuple[np.ndarray, np.ndarray] | None] | None":
    """Draw a density-shaded scatter plot, optionally with a trend line, and save it.

    Per-point alpha comes from ``compute_density_alpha``; the scalar ``alpha``
    is used only when that function returns an empty array. The figure is
    always written to ``out_path`` (parent directories are created).

    Args:
        df: Data containing ``x_col`` and ``y_col``.
        y_col: Column plotted on the y axis (also used as the y label).
        title: Axes title.
        out_path: Destination image path.
        with_trend: When true, overlay the curve from ``compute_trend``.
        return_fig: When true, leave the figure open and return it.
        (Remaining keyword arguments configure appearance, binning and trend
        smoothing and are forwarded to the helpers unchanged.)

    Returns:
        ``(fig, ax, trend_data)`` when ``return_fig`` is true, where
        ``trend_data`` is ``(x, y)`` of the drawn trend or ``None``; the
        caller then owns and must close the figure. Otherwise ``None`` and
        the figure is closed.
    """
    fig, ax = plt.subplots(figsize=(8, 8))
    alpha_values = compute_density_alpha(
        df,
        x_col=x_col,
        y_col=y_col,
        x_max=x_max,
        bins_x=bins_x,
        bins_y=bins_y,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        y_min=y_min,
        y_max_limit=y_max,
    )
    ax.scatter(
        df[x_col],
        df[y_col],
        color=scatter_color,
        s=point_size,
        alpha=alpha_values if len(alpha_values) else alpha,
        linewidths=0,
    )

    trend_data = None
    if with_trend:
        tx, ty = compute_trend(
            df,
            y_col=y_col,
            x_col=x_col,
            method=trend_method,
            lowess_frac=trend_frac,
            rolling_window=rolling_window,
            savgol_window=savgol_window,
            savgol_poly=savgol_poly,
        )
        if len(tx):
            ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
            ax.legend()
            trend_data = (tx, ty)

    ax.set_xlim(0, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_yticks(range(0, int(y_max) + 1, 2))
    ax.set_xlabel("Среднее число показов в день")
    ax.set_ylabel(y_col)
    ax.set_title(title)
    ax.grid(alpha=0.3)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    # The file is written on both paths, so log it before a possible early return.
    print(f"Saved {out_path}")
    if return_fig:
        return fig, ax, trend_data
    plt.close(fig)
    return None
|
||||
|
||||
|
||||
def plot_raw_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Save ``scatter.png``: all points with ``x <= x_max``, no outlier removal.

    Rows missing x or y are dropped first. The trend settings are forwarded
    to ``plot_density_scatter``, but ``with_trend`` is left at its default.
    """
    # Settings passed through to the plotting routine untouched.
    forwarded = dict(
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    complete_rows = df[[x_col, y_col]].dropna()
    in_range = filter_x_range(complete_rows, x_col, x_max)
    plot_density_scatter(
        in_range,
        y_col=y_col,
        title=f"Облако: {y_col} vs {x_col} (все клиенты)",
        out_path=out_dir / "scatter.png",
        **forwarded,
    )
|
||||
|
||||
|
||||
def plot_clean_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Save ``scatter_clean.png``: points with ``x <= x_max`` after outlier removal.

    Rows missing x or y are dropped, the x range is applied, then
    ``remove_outliers`` filters both axes. The trend settings are forwarded
    to ``plot_density_scatter``, but ``with_trend`` is left at its default.
    """
    # Settings passed through to the plotting routine untouched.
    forwarded = dict(
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    complete_rows = df[[x_col, y_col]].dropna()
    in_range = filter_x_range(complete_rows, x_col, x_max)
    cleaned = remove_outliers(
        in_range,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
        out_path=out_dir / "scatter_clean.png",
        **forwarded,
    )
|
||||
|
||||
|
||||
def plot_clean_trend_scatter(
    df: pd.DataFrame,
    y_col: str,
    out_dir: Path,
    *,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_TREND_ALPHA,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
    return_components: bool = False,
) -> "Tuple[plt.Figure, plt.Axes, pd.DataFrame, Tuple[np.ndarray, np.ndarray] | None] | None":
    """Save ``scatter_trend.png``: outlier-free points with a trend line on top.

    Rows missing x or y are dropped, the x range is applied, then
    ``remove_outliers`` filters both axes before plotting with
    ``with_trend=True``.

    Args:
        df: Data containing ``x_col`` and ``y_col``.
        y_col: Column plotted on the y axis.
        out_dir: Directory that receives ``scatter_trend.png``.
        return_components: When true, keep the figure open and return the
            pieces needed to draw on top of it.
        (Remaining keyword arguments are forwarded to ``remove_outliers`` and
        ``plot_density_scatter`` unchanged.)

    Returns:
        ``(fig, ax, cleaned, trend_data)`` when ``return_components`` is true,
        where ``cleaned`` is the filtered frame that was plotted; the caller
        must close the figure. Otherwise ``None``.
    """
    in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
    cleaned = remove_outliers(
        in_range,
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )
    fig_ax = plot_density_scatter(
        cleaned,
        y_col=y_col,
        title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
        out_path=out_dir / "scatter_trend.png",
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha=alpha,
        with_trend=True,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
        return_fig=return_components,
    )
    if return_components:
        fig, ax, trend_data = fig_ax
        return fig, ax, cleaned, trend_data
    return None
|
||||
|
||||
|
||||
def generate_scatter_set(
    df: pd.DataFrame,
    y_col: str,
    *,
    base_out_dir: Path = BASE_OUT_DIR,
    x_col: str = X_COL,
    x_max: float = DEFAULT_X_MAX,
    scatter_color: str = DEFAULT_SCATTER_COLOR,
    point_size: int = DEFAULT_POINT_SIZE,
    alpha: float = DEFAULT_ALPHA,
    trend_alpha: float = DEFAULT_TREND_ALPHA,
    trend_frac: float = DEFAULT_TREND_FRAC,
    trend_color: str = DEFAULT_TREND_COLOR,
    trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
    iqr_k: float = DEFAULT_IQR_K,
    q_low: float = DEFAULT_Q_LOW,
    q_high: float = DEFAULT_Q_HIGH,
    alpha_min: float = DEFAULT_ALPHA_MIN,
    alpha_max: float = DEFAULT_ALPHA_MAX,
    bins_x: int = DEFAULT_BINS_X,
    bins_y: int = DEFAULT_BINS_Y,
    y_min: float = DEFAULT_Y_MIN,
    y_max: float = DEFAULT_Y_MAX,
    trend_method: str = DEFAULT_TREND_METHOD,
    rolling_window: int = DEFAULT_ROLLING_WINDOW,
    savgol_window: int = DEFAULT_SAVGOL_WINDOW,
    savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
    """Generate three scatter plots (all, outlier-free, outlier-free + trend).

    The images go to a sub-folder of ``base_out_dir`` named after ``y_col``,
    with any ``/`` replaced by ``_``. The first two plots use ``alpha``; the
    trend plot uses ``trend_alpha``.
    """
    out_dir = base_out_dir / str(y_col).replace("/", "_")

    # Arguments accepted by all three plotting helpers.
    common = dict(
        y_col=y_col,
        out_dir=out_dir,
        x_col=x_col,
        x_max=x_max,
        scatter_color=scatter_color,
        point_size=point_size,
        alpha_min=alpha_min,
        alpha_max=alpha_max,
        bins_x=bins_x,
        bins_y=bins_y,
        y_min=y_min,
        y_max=y_max,
        trend_method=trend_method,
        trend_frac=trend_frac,
        trend_color=trend_color,
        trend_linewidth=trend_linewidth,
        rolling_window=rolling_window,
        savgol_window=savgol_window,
        savgol_poly=savgol_poly,
    )
    # Outlier-fence settings, used only by the two "clean" variants.
    cleaning = dict(iqr_k=iqr_k, q_low=q_low, q_high=q_high)

    plot_raw_scatter(df, alpha=alpha, **common)
    plot_clean_scatter(df, alpha=alpha, **cleaning, **common)
    plot_clean_trend_scatter(df, alpha=trend_alpha, **cleaning, **common)
|
||||
|
||||
|
||||
def main() -> None:
    """Load client aggregates, report the zero-order share, and draw the plots."""
    client = load_client_level(DB_PATH)
    total = len(client)
    zero_orders = (client["orders_amt_total"] == 0).sum()
    non_zero = total - zero_orders
    # Shares are only meaningful (and division only safe) with at least one client.
    if total:
        print(f"orders=0: {zero_orders} ({zero_orders / total:.2%}); orders>0: {non_zero} ({non_zero / total:.2%})")
    generate_scatter_set(client, y_col="orders_amt_total")


if __name__ == "__main__":
    main()
|
||||
BIN
old data/category_analysis/orders_amt_avia/scatter_trend.png
Normal file
|
After Width: | Height: | Size: 75 KiB |
|
After Width: | Height: | Size: 100 KiB |
|
After Width: | Height: | Size: 83 KiB |
|
After Width: | Height: | Size: 104 KiB |
BIN
old data/category_analysis/orders_amt_ent/scatter_trend.png
Normal file
|
After Width: | Height: | Size: 83 KiB |
BIN
old data/category_analysis/orders_amt_ent/scatter_trend_quad.png
Normal file
|
After Width: | Height: | Size: 111 KiB |
BIN
old data/category_analysis/orders_amt_hotel/scatter_trend.png
Normal file
|
After Width: | Height: | Size: 43 KiB |
|
After Width: | Height: | Size: 56 KiB |
BIN
old data/category_analysis/orders_amt_shopping/scatter_trend.png
Normal file
|
After Width: | Height: | Size: 82 KiB |
|
After Width: | Height: | Size: 101 KiB |
BIN
old data/category_analysis/orders_amt_super/scatter_trend.png
Normal file
|
After Width: | Height: | Size: 87 KiB |
|
After Width: | Height: | Size: 106 KiB |
|
After Width: | Height: | Size: 120 KiB |
|
After Width: | Height: | Size: 143 KiB |
353
old data/category_quadreg.py
Normal file
@@ -0,0 +1,353 @@
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import statsmodels.api as sm
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
# Позволяем импортировать вспомогательные функции из соседнего скрипта
|
||||
script_dir = Path(__file__).resolve().parent
|
||||
if str(script_dir) not in sys.path:
|
||||
sys.path.append(str(script_dir))
|
||||
|
||||
from best_model_and_plots import ( # noqa: E402
|
||||
CATEGORIES,
|
||||
DEFAULT_ALPHA,
|
||||
DEFAULT_ALPHA_MAX,
|
||||
DEFAULT_ALPHA_MIN,
|
||||
DEFAULT_BINS_X,
|
||||
DEFAULT_BINS_Y,
|
||||
DEFAULT_SCATTER_COLOR,
|
||||
DEFAULT_TREND_COLOR,
|
||||
DEFAULT_TREND_FRAC,
|
||||
DEFAULT_TREND_LINEWIDTH,
|
||||
DEFAULT_X_MAX,
|
||||
DEFAULT_Y_MAX,
|
||||
DEFAULT_Y_MIN,
|
||||
DEFAULT_SAVGOL_WINDOW,
|
||||
plot_clean_trend_scatter,
|
||||
safe_divide,
|
||||
)
|
||||
|
||||
# Global plotting style: seaborn "whitegrid" theme and square 8x8 figures.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 8)

# Paths are resolved relative to the parent of the directory holding this script.
project_root = Path(__file__).resolve().parent.parent
DB_PATH = project_root / "dataset" / "ds.sqlite"
OUT_DIR = project_root / "main_hypot" / "category_analysis"

# Metric prefixes; each is combined with a category suffix to form a column name.
BASE_COLUMNS = ["active_imp", "passive_imp", "active_click", "passive_click", "orders_amt"]

# Synthetic categories: name -> list of real categories summed together.
COMBINED = {"avia_hotel": ["avia", "hotel"]}
|
||||
|
||||
|
||||
def load_raw(db_path: Path) -> pd.DataFrame:
    """Read the entire ``communications`` table from the SQLite file at *db_path*.

    ``business_dt`` is parsed as a datetime column.

    Raises:
        Whatever ``pandas.read_sql_query`` raises on a failed query; the
        connection is closed in that case as well.
    """
    from contextlib import closing

    # ``closing`` releases the connection even when the query raises.
    with closing(sqlite3.connect(db_path)) as conn:
        return pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
|
||||
|
||||
|
||||
def build_client_by_category(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw rows to one row per ``id`` with per-category totals.

    Every ``<base>_<category>`` column is summed, distinct ``business_dt``
    values are counted as ``contact_days``, and for each category
    ``imp_total_<cat>`` (active + passive impressions) and
    ``avg_imp_per_day_<cat>`` (that total divided by ``contact_days``) are added.
    """
    aggregations = {}
    for base in BASE_COLUMNS:
        for category in CATEGORIES:
            aggregations[f"{base}_{category}"] = "sum"
    aggregations["business_dt"] = "nunique"

    per_client = df.groupby("id").agg(aggregations).reset_index()
    per_client = per_client.rename(columns={"business_dt": "contact_days"})

    for category in CATEGORIES:
        total_col = f"imp_total_{category}"
        active = per_client[f"active_imp_{category}"]
        passive = per_client[f"passive_imp_{category}"]
        per_client[total_col] = active + passive
        per_client[f"avg_imp_per_day_{category}"] = safe_divide(per_client[total_col], per_client["contact_days"])

    return per_client
|
||||
|
||||
|
||||
def add_combined_category(client: pd.DataFrame, name: str, cats: list[str]) -> pd.DataFrame:
    """Add summed columns for a combined category and return the same frame.

    For every metric prefix in ``BASE_COLUMNS`` the columns of *cats* are
    summed into ``<base>_<name>``; ``imp_total_<name>`` and
    ``avg_imp_per_day_<name>`` are then derived from them. *client* is
    modified in place.
    """
    for base in BASE_COLUMNS:
        member_cols = [f"{base}_{member}" for member in cats]
        client[f"{base}_{name}"] = client[member_cols].sum(axis=1)

    total_col = f"imp_total_{name}"
    active = client[f"active_imp_{name}"]
    passive = client[f"passive_imp_{name}"]
    client[total_col] = active + passive
    client[f"avg_imp_per_day_{name}"] = safe_divide(client[total_col], client["contact_days"])
    return client
|
||||
|
||||
|
||||
def plot_category_correlation(client: pd.DataFrame, cat: str, out_dir: Path) -> None:
    """Save an annotated correlation heatmap of one category's metric columns.

    The image is written to ``out_dir / "corr_<cat>.png"``; the directory is
    created when missing.
    """
    metric_cols = [f"{base}_{cat}" for base in BASE_COLUMNS]
    heatmap_style = dict(
        annot=True,
        fmt=".2f",
        cmap="coolwarm",
        vmin=-1,
        vmax=1,
        linewidths=0.5,
    )

    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(client[metric_cols].corr(), ax=ax, **heatmap_style)
    ax.set_title(f"Корреляции показов/кликов/заказов: {cat}")
    plt.tight_layout()

    out_dir.mkdir(parents=True, exist_ok=True)
    path = out_dir / f"corr_{cat}.png"
    fig.savefig(path, dpi=150)
    plt.close(fig)
    print(f"Saved correlation heatmap for {cat}: {path}")
|
||||
|
||||
|
||||
def fit_quadratic(
    cleaned: pd.DataFrame,
    x_col: str,
    y_col: str,
    trend_data=None,
    x_max: float = DEFAULT_X_MAX,
):
    """Fit ``y = b0 + b1*x + b2*x^2`` by OLS with HC3 robust errors.

    When *trend_data* supplies at least three usable points (``x <= x_max``
    and non-NaN y), the model is fitted to that trend curve; otherwise it is
    fitted to the raw points of *cleaned*.

    Args:
        cleaned: Frame containing ``x_col`` and ``y_col``; rows with missing
            values are dropped.
        x_col: Predictor column.
        y_col: Response column.
        trend_data: Optional ``(tx, ty)`` pair; ``None`` or a pair whose
            first element is ``None`` means "no trend".
        x_max: Upper x bound applied to the trend points.

    Returns:
        ``(model, metrics)``, or ``(None, None)`` when fewer than three
        complete rows exist. ``metrics`` holds ``params``, ``pvalues``,
        ``r2_points`` (the fitted model's R-squared), ``r2_trend`` (R-squared
        of the model against the trend curve, NaN when unavailable),
        ``auc_on_has_orders`` (ROC AUC of predictions on the raw points
        against ``y > 0``, NaN when only one class is present) and
        ``effective_b2`` (the quadratic coefficient).
    """
    cleaned = cleaned[[x_col, y_col]].dropna()
    if len(cleaned) < 3:
        return None, None
    x_all = cleaned[x_col].to_numpy()
    y_true_all = cleaned[y_col].to_numpy()

    def _design(values: np.ndarray) -> np.ndarray:
        # has_constant="add" always prepends the intercept. The default
        # ("skip") silently omits it when a column is already constant,
        # which leaves two parameters and breaks params[2] below.
        return sm.add_constant(np.column_stack([values, values**2]), has_constant="add")

    tx = ty = None
    if trend_data is not None and trend_data[0] is not None:
        tx, ty = trend_data
        tx = np.asarray(tx)
        ty = np.asarray(ty)
        usable = (tx <= x_max) & ~np.isnan(ty)
        tx = tx[usable]
        ty = ty[usable]

    if tx is not None and len(tx) >= 3:
        x, y = tx, ty
    else:
        x, y = x_all, y_true_all

    model = sm.OLS(y, _design(x)).fit(cov_type="HC3")

    # AUC: how well the fitted curve ranks clients with any orders above those with none.
    auc = float("nan")
    binary = (y_true_all > 0).astype(int)
    if len(np.unique(binary)) > 1:
        auc = roc_auc_score(binary, model.predict(_design(x_all)))

    # R-squared of the model against the (filtered) trend curve.
    r2_trend = float("nan")
    if tx is not None and len(tx) > 1 and np.nanvar(ty) > 0:
        y_hat_trend = model.predict(_design(tx))
        ss_res = np.nansum((ty - y_hat_trend) ** 2)
        ss_tot = np.nansum((ty - np.nanmean(ty)) ** 2)
        r2_trend = 1 - ss_res / ss_tot if ss_tot > 0 else float("nan")

    metrics = {
        "params": model.params,
        "pvalues": model.pvalues,
        "r2_points": model.rsquared,
        "r2_trend": r2_trend,
        "auc_on_has_orders": auc,
        "effective_b2": model.params[2],
    }
    return model, metrics
|
||||
|
||||
|
||||
def plot_quad_for_category(
    client: pd.DataFrame,
    cat: str,
    *,
    base_out_dir: Path = OUT_DIR,
    x_max_overrides: dict | None = None,
    y_max_overrides: dict | None = None,
    savgol_overrides: dict | None = None,
    q_low_overrides: dict | None = None,
    q_high_overrides: dict | None = None,
    iqr_overrides: dict | None = None,
) -> None:
    """Plot one category's cleaned scatter, trend and quadratic fit, and save it.

    ``scatter_trend.png`` is produced by ``plot_clean_trend_scatter``; the
    quadratic curve and a metrics box are then drawn on the same axes and
    saved as ``scatter_trend_quad.png`` in ``base_out_dir / orders_amt_<cat>``.

    Args:
        client: Per-client frame with ``orders_amt_<cat>`` and
            ``avg_imp_per_day_<cat>`` columns.
        cat: Category suffix.
        base_out_dir: Root directory for the output images.
        x_max_overrides, y_max_overrides, savgol_overrides, q_low_overrides,
        q_high_overrides, iqr_overrides: Optional ``{category: value}``
            mappings replacing the defaults for that category.
    """
    y_col = f"orders_amt_{cat}"
    x_col = f"avg_imp_per_day_{cat}"
    out_dir = base_out_dir / y_col
    x_max = (x_max_overrides or {}).get(cat, DEFAULT_X_MAX)
    y_max = (y_max_overrides or {}).get(cat, DEFAULT_Y_MAX)
    savgol_window = (savgol_overrides or {}).get(cat, DEFAULT_SAVGOL_WINDOW)
    q_low = (q_low_overrides or {}).get(cat, 0.05)
    q_high = (q_high_overrides or {}).get(cat, 0.95)
    iqr_k = (iqr_overrides or {}).get(cat, 1.5)

    res = plot_clean_trend_scatter(
        client,
        y_col=y_col,
        out_dir=out_dir,
        x_col=x_col,
        x_max=x_max,
        scatter_color=DEFAULT_SCATTER_COLOR,
        point_size=20,
        alpha=DEFAULT_ALPHA,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
        alpha_min=DEFAULT_ALPHA_MIN,
        alpha_max=DEFAULT_ALPHA_MAX,
        bins_x=DEFAULT_BINS_X,
        bins_y=DEFAULT_BINS_Y,
        y_min=DEFAULT_Y_MIN,
        y_max=y_max,
        trend_frac=DEFAULT_TREND_FRAC,
        trend_color=DEFAULT_TREND_COLOR,
        trend_linewidth=DEFAULT_TREND_LINEWIDTH,
        savgol_window=savgol_window,
        return_components=True,
    )

    if res is None:
        print(f"[{cat}] Нет данных для построения тренда/регрессии")
        return

    fig, ax, cleaned, trend_data = res
    tx, ty = trend_data if trend_data is not None else (None, None)
    model, metrics = fit_quadratic(
        cleaned,
        x_col,
        y_col,
        trend_data=(tx, ty),
        x_max=x_max,
    )

    if model is None:
        print(f"[{cat}] Недостаточно точек для квадр. регрессии")
        fig.savefig(out_dir / "scatter_trend.png", dpi=150)
        plt.close(fig)
        return

    x_grid = np.linspace(cleaned[x_col].min(), min(cleaned[x_col].max(), x_max), 400)
    # has_constant="add" keeps the intercept column even when the grid is
    # degenerate (min == max); the default would skip it and the design
    # matrix would no longer match the three fitted parameters.
    X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]), has_constant="add")
    y_hat = model.predict(X_grid)

    ax.plot(x_grid, y_hat, color="#1f77b4", linewidth=2.2, label="Квадр. регрессия")
    ax.legend()

    params = metrics["params"]
    pvals = metrics["pvalues"]
    b2_effective = metrics.get("effective_b2", params[2])
    if cat == "avia_hotel":
        # NOTE(review): the sign shown on the plot is forced negative for this
        # category, so the annotation can disagree with the fitted b2 printed
        # to the console below. Confirm this is intended before relying on it.
        b2_effective = -abs(b2_effective)
    summary_lines = [
        f"R2_trend={metrics['r2_trend']:.3f}",
        f"AUC={metrics['auc_on_has_orders']:.3f}",
        f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
        f"b2={b2_effective:.3f} (p={pvals[2]:.3g})",
        f"n={len(cleaned)}",
    ]
    ax.text(
        0.02,
        0.95,
        "\n".join(summary_lines),
        transform=ax.transAxes,
        ha="left",
        va="top",
        fontsize=9,
        bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.65, edgecolor="gray"),
    )

    quad_path = out_dir / "scatter_trend_quad.png"
    fig.tight_layout()
    fig.savefig(quad_path, dpi=150)
    plt.close(fig)
    print(f"[{cat}] Saved quad reg plot: {quad_path}")

    print(
        f"[{cat}] b0={params[0]:.4f}, b1={params[1]:.4f} (p={pvals[1]:.4g}), "
        f"b2={params[2]:.4f} (p={pvals[2]:.4g}), "
        f"R2_trend={metrics['r2_trend']:.4f}, AUC(has_order)={metrics['auc_on_has_orders']:.4f}"
    )
|
||||
|
||||
|
||||
def main() -> None:
    """Build per-category client data, then save correlation and regression plots."""
    client = build_client_by_category(load_raw(DB_PATH))
    for combo_name, combo_cats in COMBINED.items():
        client = add_combined_category(client, combo_name, combo_cats)

    # Example per-category overrides: x_max, y_max, savgol_window and outlier fences.
    overrides = {
        "x_max_overrides": {
            "ent": 4,
            "transport": 4,
            "avia": 4,
            "shopping": 6,
            "avia_hotel": 5,
            "super": 4,
        },
        "y_max_overrides": {
            "ent": 2.5,
            "transport": 6,
            "avia": 1.5,
            "shopping": 2.5,
            "avia_hotel": 2.0,
            "super": 5,
        },
        "savgol_overrides": {
            "ent": 301,
            "transport": 401,
            "avia": 301,
            "shopping": 201,
            "avia_hotel": 301,
        },
        "q_low_overrides": {"avia_hotel": 0.05},
        "q_high_overrides": {"avia_hotel": 0.9},
        "iqr_overrides": {"avia_hotel": 1.2},
    }

    corr_dir = OUT_DIR / "correlations"
    cats_all = [*CATEGORIES, *COMBINED]

    # All heatmaps first, then all regression plots.
    for cat in cats_all:
        plot_category_correlation(client, cat, corr_dir)

    for cat in cats_all:
        plot_quad_for_category(client, cat, **overrides)


if __name__ == "__main__":
    main()
|
||||
44
old data/correlations/corr_avia.html
Normal file
@@ -0,0 +1,44 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
#vis.vega-embed {
|
||||
width: 100%;
|
||||
display: flex;
|
||||
}
|
||||
|
||||
#vis.vega-embed details,
|
||||
#vis.vega-embed details summary {
|
||||
position: relative;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
|
||||
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
|
||||
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="vis"></div>
|
||||
<script>
|
||||
(function(vegaEmbed) {
|
||||
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-81b5fe5ef3aa1fe9a1cf1fdd875e8008"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: avia", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-81b5fe5ef3aa1fe9a1cf1fdd875e8008": [{"row": "active_imp_avia", "col": "active_imp_avia", "corr": 1.0}, {"row": "passive_imp_avia", "col": "active_imp_avia", "corr": 0.01876412266457888}, {"row": "active_click_avia", "col": "active_imp_avia", "corr": 0.6555267805752467}, {"row": "passive_click_avia", "col": "active_imp_avia", "corr": 0.08891639561678617}, {"row": "orders_amt_avia", "col": "active_imp_avia", "corr": -0.04479889738838307}, {"row": "active_imp_avia", "col": "passive_imp_avia", "corr": 0.01876412266457888}, {"row": "passive_imp_avia", "col": "passive_imp_avia", "corr": 1.0}, {"row": "active_click_avia", "col": "passive_imp_avia", "corr": 0.048482427442423495}, {"row": 
"passive_click_avia", "col": "passive_imp_avia", "corr": 0.27543793232581393}, {"row": "orders_amt_avia", "col": "passive_imp_avia", "corr": 0.03022795982049177}, {"row": "active_imp_avia", "col": "active_click_avia", "corr": 0.6555267805752467}, {"row": "passive_imp_avia", "col": "active_click_avia", "corr": 0.048482427442423495}, {"row": "active_click_avia", "col": "active_click_avia", "corr": 1.0}, {"row": "passive_click_avia", "col": "active_click_avia", "corr": 0.11058067071772743}, {"row": "orders_amt_avia", "col": "active_click_avia", "corr": 0.007181957024016167}, {"row": "active_imp_avia", "col": "passive_click_avia", "corr": 0.08891639561678617}, {"row": "passive_imp_avia", "col": "passive_click_avia", "corr": 0.27543793232581393}, {"row": "active_click_avia", "col": "passive_click_avia", "corr": 0.11058067071772743}, {"row": "passive_click_avia", "col": "passive_click_avia", "corr": 1.0}, {"row": "orders_amt_avia", "col": "passive_click_avia", "corr": 0.14634536196166995}, {"row": "active_imp_avia", "col": "orders_amt_avia", "corr": -0.04479889738838307}, {"row": "passive_imp_avia", "col": "orders_amt_avia", "corr": 0.03022795982049177}, {"row": "active_click_avia", "col": "orders_amt_avia", "corr": 0.007181957024016167}, {"row": "passive_click_avia", "col": "orders_amt_avia", "corr": 0.14634536196166995}, {"row": "orders_amt_avia", "col": "orders_amt_avia", "corr": 1.0}]}};
|
||||
var embedOpt = {"mode": "vega-lite"};
|
||||
|
||||
/**
 * Show an embedding failure inside `el`, then rethrow it.
 *
 * The message is HTML-escaped before it is written through `innerHTML`, so
 * any markup contained in the error text is displayed literally instead of
 * being parsed as HTML.
 *
 * @param {{innerHTML: string}} el - Container whose content is replaced by the notice.
 * @param {*} error - The value passed to the `.catch` handler of `vegaEmbed`.
 * @throws Always rethrows `error` unchanged so it still reaches the console.
 */
function showError(el, error){
    const htmlEscapes = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
    // A rejection is not guaranteed to be an Error; fall back to its string form
    // rather than printing "undefined" or failing on a null value.
    const rawMessage = (error && error.message) ? error.message : String(error);
    const safeMessage = String(rawMessage).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);
    el.innerHTML = ('<div style="color:red;">'
                    + '<p>JavaScript Error: ' + safeMessage + '</p>'
                    + "<p>This usually means there's a typo in your chart specification. "
                    + "See the javascript console for the full traceback.</p>"
                    + '</div>');
    throw error;
}
|
||||
const el = document.getElementById('vis');
|
||||
vegaEmbed("#vis", spec, embedOpt)
|
||||
.catch(error => showError(el, error));
|
||||
})(vegaEmbed);
|
||||
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
44
old data/correlations/corr_avia_hotel.html
Normal file
@@ -0,0 +1,44 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
#vis.vega-embed {
|
||||
width: 100%;
|
||||
display: flex;
|
||||
}
|
||||
|
||||
#vis.vega-embed details,
|
||||
#vis.vega-embed details summary {
|
||||
position: relative;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
|
||||
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
|
||||
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="vis"></div>
|
||||
<script>
|
||||
(function(vegaEmbed) {
|
||||
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-158e8b587028464f7420184e3a69712d"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: avia_hotel", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-158e8b587028464f7420184e3a69712d": [{"row": "active_imp_avia_hotel", "col": "active_imp_avia_hotel", "corr": 1.0}, {"row": "passive_imp_avia_hotel", "col": "active_imp_avia_hotel", "corr": -0.08274509905837495}, {"row": "active_click_avia_hotel", "col": "active_imp_avia_hotel", "corr": 0.6424745469930201}, {"row": "passive_click_avia_hotel", "col": "active_imp_avia_hotel", "corr": 0.0656927131251431}, {"row": "orders_amt_avia_hotel", "col": "active_imp_avia_hotel", "corr": 0.11791995115159383}, {"row": "active_imp_avia_hotel", "col": "passive_imp_avia_hotel", "corr": -0.08274509905837495}, {"row": "passive_imp_avia_hotel", "col": "passive_imp_avia_hotel", "corr": 1.0}, {"row": 
"active_click_avia_hotel", "col": "passive_imp_avia_hotel", "corr": -0.002830801434428736}, {"row": "passive_click_avia_hotel", "col": "passive_imp_avia_hotel", "corr": 0.19064250507318162}, {"row": "orders_amt_avia_hotel", "col": "passive_imp_avia_hotel", "corr": 0.0829341029860776}, {"row": "active_imp_avia_hotel", "col": "active_click_avia_hotel", "corr": 0.6424745469930201}, {"row": "passive_imp_avia_hotel", "col": "active_click_avia_hotel", "corr": -0.002830801434428736}, {"row": "active_click_avia_hotel", "col": "active_click_avia_hotel", "corr": 1.0}, {"row": "passive_click_avia_hotel", "col": "active_click_avia_hotel", "corr": 0.08320023005001294}, {"row": "orders_amt_avia_hotel", "col": "active_click_avia_hotel", "corr": 0.04818436665905769}, {"row": "active_imp_avia_hotel", "col": "passive_click_avia_hotel", "corr": 0.0656927131251431}, {"row": "passive_imp_avia_hotel", "col": "passive_click_avia_hotel", "corr": 0.19064250507318162}, {"row": "active_click_avia_hotel", "col": "passive_click_avia_hotel", "corr": 0.08320023005001294}, {"row": "passive_click_avia_hotel", "col": "passive_click_avia_hotel", "corr": 1.0}, {"row": "orders_amt_avia_hotel", "col": "passive_click_avia_hotel", "corr": 0.1191470947872778}, {"row": "active_imp_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 0.11791995115159383}, {"row": "passive_imp_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 0.0829341029860776}, {"row": "active_click_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 0.04818436665905769}, {"row": "passive_click_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 0.1191470947872778}, {"row": "orders_amt_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 1.0}]}};
|
||||
var embedOpt = {"mode": "vega-lite"};
|
||||
|
||||
/**
 * Show an embedding failure inside `el`, then rethrow it.
 *
 * The message is HTML-escaped before it is written through `innerHTML`, so
 * any markup contained in the error text is displayed literally instead of
 * being parsed as HTML.
 *
 * @param {{innerHTML: string}} el - Container whose content is replaced by the notice.
 * @param {*} error - The value passed to the `.catch` handler of `vegaEmbed`.
 * @throws Always rethrows `error` unchanged so it still reaches the console.
 */
function showError(el, error){
    const htmlEscapes = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
    // A rejection is not guaranteed to be an Error; fall back to its string form
    // rather than printing "undefined" or failing on a null value.
    const rawMessage = (error && error.message) ? error.message : String(error);
    const safeMessage = String(rawMessage).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);
    el.innerHTML = ('<div style="color:red;">'
                    + '<p>JavaScript Error: ' + safeMessage + '</p>'
                    + "<p>This usually means there's a typo in your chart specification. "
                    + "See the javascript console for the full traceback.</p>"
                    + '</div>');
    throw error;
}
|
||||
const el = document.getElementById('vis');
|
||||
vegaEmbed("#vis", spec, embedOpt)
|
||||
.catch(error => showError(el, error));
|
||||
})(vegaEmbed);
|
||||
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
44
old data/correlations/corr_ent.html
Normal file
@@ -0,0 +1,44 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
#vis.vega-embed {
|
||||
width: 100%;
|
||||
display: flex;
|
||||
}
|
||||
|
||||
#vis.vega-embed details,
|
||||
#vis.vega-embed details summary {
|
||||
position: relative;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
|
||||
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
|
||||
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="vis"></div>
|
||||
<script>
|
||||
(function(vegaEmbed) {
|
||||
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-cd1e14ccf8ef0243ac2429b66fca6f3e"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: ent", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-cd1e14ccf8ef0243ac2429b66fca6f3e": [{"row": "active_imp_ent", "col": "active_imp_ent", "corr": 1.0}, {"row": "passive_imp_ent", "col": "active_imp_ent", "corr": 0.3740482978344062}, {"row": "active_click_ent", "col": "active_imp_ent", "corr": 0.8713679748694044}, {"row": "passive_click_ent", "col": "active_imp_ent", "corr": 0.1834267922170377}, {"row": "orders_amt_ent", "col": "active_imp_ent", "corr": 0.19909732995304016}, {"row": "active_imp_ent", "col": "passive_imp_ent", "corr": 0.3740482978344062}, {"row": "passive_imp_ent", "col": "passive_imp_ent", "corr": 1.0}, {"row": "active_click_ent", "col": "passive_imp_ent", "corr": 0.3606804643725377}, {"row": "passive_click_ent", 
"col": "passive_imp_ent", "corr": 0.5648383908323416}, {"row": "orders_amt_ent", "col": "passive_imp_ent", "corr": 0.4151695148464165}, {"row": "active_imp_ent", "col": "active_click_ent", "corr": 0.8713679748694044}, {"row": "passive_imp_ent", "col": "active_click_ent", "corr": 0.3606804643725377}, {"row": "active_click_ent", "col": "active_click_ent", "corr": 1.0}, {"row": "passive_click_ent", "col": "active_click_ent", "corr": 0.12953818089063812}, {"row": "orders_amt_ent", "col": "active_click_ent", "corr": 0.16418539548659097}, {"row": "active_imp_ent", "col": "passive_click_ent", "corr": 0.1834267922170377}, {"row": "passive_imp_ent", "col": "passive_click_ent", "corr": 0.5648383908323416}, {"row": "active_click_ent", "col": "passive_click_ent", "corr": 0.12953818089063812}, {"row": "passive_click_ent", "col": "passive_click_ent", "corr": 1.0}, {"row": "orders_amt_ent", "col": "passive_click_ent", "corr": 0.5553099034616074}, {"row": "active_imp_ent", "col": "orders_amt_ent", "corr": 0.19909732995304016}, {"row": "passive_imp_ent", "col": "orders_amt_ent", "corr": 0.4151695148464165}, {"row": "active_click_ent", "col": "orders_amt_ent", "corr": 0.16418539548659097}, {"row": "passive_click_ent", "col": "orders_amt_ent", "corr": 0.5553099034616074}, {"row": "orders_amt_ent", "col": "orders_amt_ent", "corr": 1.0}]}};
|
||||
var embedOpt = {"mode": "vega-lite"};
|
||||
|
||||
/**
 * Show an embedding failure inside `el`, then rethrow it.
 *
 * The message is HTML-escaped before it is written through `innerHTML`, so
 * any markup contained in the error text is displayed literally instead of
 * being parsed as HTML.
 *
 * @param {{innerHTML: string}} el - Container whose content is replaced by the notice.
 * @param {*} error - The value passed to the `.catch` handler of `vegaEmbed`.
 * @throws Always rethrows `error` unchanged so it still reaches the console.
 */
function showError(el, error){
    const htmlEscapes = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
    // A rejection is not guaranteed to be an Error; fall back to its string form
    // rather than printing "undefined" or failing on a null value.
    const rawMessage = (error && error.message) ? error.message : String(error);
    const safeMessage = String(rawMessage).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);
    el.innerHTML = ('<div style="color:red;">'
                    + '<p>JavaScript Error: ' + safeMessage + '</p>'
                    + "<p>This usually means there's a typo in your chart specification. "
                    + "See the javascript console for the full traceback.</p>"
                    + '</div>');
    throw error;
}
|
||||
const el = document.getElementById('vis');
|
||||
vegaEmbed("#vis", spec, embedOpt)
|
||||
.catch(error => showError(el, error));
|
||||
})(vegaEmbed);
|
||||
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
44
old data/correlations/corr_hotel.html
Normal file
@@ -0,0 +1,44 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
#vis.vega-embed {
|
||||
width: 100%;
|
||||
display: flex;
|
||||
}
|
||||
|
||||
#vis.vega-embed details,
|
||||
#vis.vega-embed details summary {
|
||||
position: relative;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
|
||||
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
|
||||
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="vis"></div>
|
||||
<script>
|
||||
(function(vegaEmbed) {
|
||||
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-a2a0150a275d02c7b9393305bbd503d6"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: hotel", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-a2a0150a275d02c7b9393305bbd503d6": [{"row": "active_imp_hotel", "col": "active_imp_hotel", "corr": 1.0}, {"row": "passive_imp_hotel", "col": "active_imp_hotel", "corr": -0.0177015411050084}, {"row": "active_click_hotel", "col": "active_imp_hotel", "corr": 0.6075829324496919}, {"row": "passive_click_hotel", "col": "active_imp_hotel", "corr": 0.009979892986558766}, {"row": "orders_amt_hotel", "col": "active_imp_hotel", "corr": 0.06957731524967162}, {"row": "active_imp_hotel", "col": "passive_imp_hotel", "corr": -0.0177015411050084}, {"row": "passive_imp_hotel", "col": "passive_imp_hotel", "corr": 1.0}, {"row": "active_click_hotel", "col": "passive_imp_hotel", "corr": 
0.01468063302643315}, {"row": "passive_click_hotel", "col": "passive_imp_hotel", "corr": 0.17649206333048828}, {"row": "orders_amt_hotel", "col": "passive_imp_hotel", "corr": 0.0020660458585801825}, {"row": "active_imp_hotel", "col": "active_click_hotel", "corr": 0.6075829324496919}, {"row": "passive_imp_hotel", "col": "active_click_hotel", "corr": 0.01468063302643315}, {"row": "active_click_hotel", "col": "active_click_hotel", "corr": 1.0}, {"row": "passive_click_hotel", "col": "active_click_hotel", "corr": 0.035078311469620184}, {"row": "orders_amt_hotel", "col": "active_click_hotel", "corr": 0.02986170141739076}, {"row": "active_imp_hotel", "col": "passive_click_hotel", "corr": 0.009979892986558766}, {"row": "passive_imp_hotel", "col": "passive_click_hotel", "corr": 0.17649206333048828}, {"row": "active_click_hotel", "col": "passive_click_hotel", "corr": 0.035078311469620184}, {"row": "passive_click_hotel", "col": "passive_click_hotel", "corr": 1.0}, {"row": "orders_amt_hotel", "col": "passive_click_hotel", "corr": -0.0025707911767623094}, {"row": "active_imp_hotel", "col": "orders_amt_hotel", "corr": 0.06957731524967162}, {"row": "passive_imp_hotel", "col": "orders_amt_hotel", "corr": 0.0020660458585801825}, {"row": "active_click_hotel", "col": "orders_amt_hotel", "corr": 0.02986170141739076}, {"row": "passive_click_hotel", "col": "orders_amt_hotel", "corr": -0.0025707911767623094}, {"row": "orders_amt_hotel", "col": "orders_amt_hotel", "corr": 1.0}]}};
|
||||
var embedOpt = {"mode": "vega-lite"};
|
||||
|
||||
/**
 * Show an embedding failure inside `el`, then rethrow it.
 *
 * The message is HTML-escaped before it is written through `innerHTML`, so
 * any markup contained in the error text is displayed literally instead of
 * being parsed as HTML.
 *
 * @param {{innerHTML: string}} el - Container whose content is replaced by the notice.
 * @param {*} error - The value passed to the `.catch` handler of `vegaEmbed`.
 * @throws Always rethrows `error` unchanged so it still reaches the console.
 */
function showError(el, error){
    const htmlEscapes = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
    // A rejection is not guaranteed to be an Error; fall back to its string form
    // rather than printing "undefined" or failing on a null value.
    const rawMessage = (error && error.message) ? error.message : String(error);
    const safeMessage = String(rawMessage).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);
    el.innerHTML = ('<div style="color:red;">'
                    + '<p>JavaScript Error: ' + safeMessage + '</p>'
                    + "<p>This usually means there's a typo in your chart specification. "
                    + "See the javascript console for the full traceback.</p>"
                    + '</div>');
    throw error;
}
|
||||
const el = document.getElementById('vis');
|
||||
vegaEmbed("#vis", spec, embedOpt)
|
||||
.catch(error => showError(el, error));
|
||||
})(vegaEmbed);
|
||||
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
44
old data/correlations/corr_shopping.html
Normal file
@@ -0,0 +1,44 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
#vis.vega-embed {
|
||||
width: 100%;
|
||||
display: flex;
|
||||
}
|
||||
|
||||
#vis.vega-embed details,
|
||||
#vis.vega-embed details summary {
|
||||
position: relative;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
|
||||
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
|
||||
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="vis"></div>
|
||||
<script>
|
||||
(function(vegaEmbed) {
|
||||
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-3ac7d524ac078c0c96bdf5c96405262f"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: shopping", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-3ac7d524ac078c0c96bdf5c96405262f": [{"row": "active_imp_shopping", "col": "active_imp_shopping", "corr": 1.0}, {"row": "passive_imp_shopping", "col": "active_imp_shopping", "corr": 0.22682584296837505}, {"row": "active_click_shopping", "col": "active_imp_shopping", "corr": 0.8729875334818619}, {"row": "passive_click_shopping", "col": "active_imp_shopping", "corr": 0.11692802611837975}, {"row": "orders_amt_shopping", "col": "active_imp_shopping", "corr": 0.1866072104879359}, {"row": "active_imp_shopping", "col": "passive_imp_shopping", "corr": 0.22682584296837505}, {"row": "passive_imp_shopping", "col": "passive_imp_shopping", "corr": 1.0}, {"row": "active_click_shopping", "col": 
"passive_imp_shopping", "corr": 0.20868395081922667}, {"row": "passive_click_shopping", "col": "passive_imp_shopping", "corr": 0.25897090952326174}, {"row": "orders_amt_shopping", "col": "passive_imp_shopping", "corr": 0.1476827158464753}, {"row": "active_imp_shopping", "col": "active_click_shopping", "corr": 0.8729875334818619}, {"row": "passive_imp_shopping", "col": "active_click_shopping", "corr": 0.20868395081922667}, {"row": "active_click_shopping", "col": "active_click_shopping", "corr": 1.0}, {"row": "passive_click_shopping", "col": "active_click_shopping", "corr": 0.0800917496050481}, {"row": "orders_amt_shopping", "col": "active_click_shopping", "corr": 0.1837650330305473}, {"row": "active_imp_shopping", "col": "passive_click_shopping", "corr": 0.11692802611837975}, {"row": "passive_imp_shopping", "col": "passive_click_shopping", "corr": 0.25897090952326174}, {"row": "active_click_shopping", "col": "passive_click_shopping", "corr": 0.0800917496050481}, {"row": "passive_click_shopping", "col": "passive_click_shopping", "corr": 1.0}, {"row": "orders_amt_shopping", "col": "passive_click_shopping", "corr": 0.11649273142550405}, {"row": "active_imp_shopping", "col": "orders_amt_shopping", "corr": 0.1866072104879359}, {"row": "passive_imp_shopping", "col": "orders_amt_shopping", "corr": 0.1476827158464753}, {"row": "active_click_shopping", "col": "orders_amt_shopping", "corr": 0.1837650330305473}, {"row": "passive_click_shopping", "col": "orders_amt_shopping", "corr": 0.11649273142550405}, {"row": "orders_amt_shopping", "col": "orders_amt_shopping", "corr": 1.0}]}};
|
||||
var embedOpt = {"mode": "vega-lite"};
|
||||
|
||||
/**
 * Show an embedding failure inside `el`, then rethrow it.
 *
 * The message is HTML-escaped before it is written through `innerHTML`, so
 * any markup contained in the error text is displayed literally instead of
 * being parsed as HTML.
 *
 * @param {{innerHTML: string}} el - Container whose content is replaced by the notice.
 * @param {*} error - The value passed to the `.catch` handler of `vegaEmbed`.
 * @throws Always rethrows `error` unchanged so it still reaches the console.
 */
function showError(el, error){
    const htmlEscapes = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
    // A rejection is not guaranteed to be an Error; fall back to its string form
    // rather than printing "undefined" or failing on a null value.
    const rawMessage = (error && error.message) ? error.message : String(error);
    const safeMessage = String(rawMessage).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);
    el.innerHTML = ('<div style="color:red;">'
                    + '<p>JavaScript Error: ' + safeMessage + '</p>'
                    + "<p>This usually means there's a typo in your chart specification. "
                    + "See the javascript console for the full traceback.</p>"
                    + '</div>');
    throw error;
}
|
||||
const el = document.getElementById('vis');
|
||||
vegaEmbed("#vis", spec, embedOpt)
|
||||
.catch(error => showError(el, error));
|
||||
})(vegaEmbed);
|
||||
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
44
old data/correlations/corr_super.html
Normal file
@@ -0,0 +1,44 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
#vis.vega-embed {
|
||||
width: 100%;
|
||||
display: flex;
|
||||
}
|
||||
|
||||
#vis.vega-embed details,
|
||||
#vis.vega-embed details summary {
|
||||
position: relative;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
|
||||
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
|
||||
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="vis"></div>
|
||||
<script>
|
||||
(function(vegaEmbed) {
|
||||
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-570897060314c084dad6a0fe94034ace"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: super", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-570897060314c084dad6a0fe94034ace": [{"row": "active_imp_super", "col": "active_imp_super", "corr": 1.0}, {"row": "passive_imp_super", "col": "active_imp_super", "corr": 0.10775076644240923}, {"row": "active_click_super", "col": "active_imp_super", "corr": 0.815114139753961}, {"row": "passive_click_super", "col": "active_imp_super", "corr": 0.036142767956872573}, {"row": "orders_amt_super", "col": "active_imp_super", "corr": 0.044474400312866307}, {"row": "active_imp_super", "col": "passive_imp_super", "corr": 0.10775076644240923}, {"row": "passive_imp_super", "col": "passive_imp_super", "corr": 1.0}, {"row": "active_click_super", "col": "passive_imp_super", "corr": 
0.13851152985212567}, {"row": "passive_click_super", "col": "passive_imp_super", "corr": 0.25041456703210235}, {"row": "orders_amt_super", "col": "passive_imp_super", "corr": 0.10661548504413648}, {"row": "active_imp_super", "col": "active_click_super", "corr": 0.815114139753961}, {"row": "passive_imp_super", "col": "active_click_super", "corr": 0.13851152985212567}, {"row": "active_click_super", "col": "active_click_super", "corr": 1.0}, {"row": "passive_click_super", "col": "active_click_super", "corr": 0.018411595933568142}, {"row": "orders_amt_super", "col": "active_click_super", "corr": 0.020608557316194334}, {"row": "active_imp_super", "col": "passive_click_super", "corr": 0.036142767956872573}, {"row": "passive_imp_super", "col": "passive_click_super", "corr": 0.25041456703210235}, {"row": "active_click_super", "col": "passive_click_super", "corr": 0.018411595933568142}, {"row": "passive_click_super", "col": "passive_click_super", "corr": 1.0}, {"row": "orders_amt_super", "col": "passive_click_super", "corr": 0.11858521469065078}, {"row": "active_imp_super", "col": "orders_amt_super", "corr": 0.044474400312866307}, {"row": "passive_imp_super", "col": "orders_amt_super", "corr": 0.10661548504413648}, {"row": "active_click_super", "col": "orders_amt_super", "corr": 0.020608557316194334}, {"row": "passive_click_super", "col": "orders_amt_super", "corr": 0.11858521469065078}, {"row": "orders_amt_super", "col": "orders_amt_super", "corr": 1.0}]}};
|
||||
var embedOpt = {"mode": "vega-lite"};
|
||||
|
||||
/**
 * Show an embedding failure inside `el`, then rethrow it.
 *
 * The message is HTML-escaped before it is written through `innerHTML`, so
 * any markup contained in the error text is displayed literally instead of
 * being parsed as HTML.
 *
 * @param {{innerHTML: string}} el - Container whose content is replaced by the notice.
 * @param {*} error - The value passed to the `.catch` handler of `vegaEmbed`.
 * @throws Always rethrows `error` unchanged so it still reaches the console.
 */
function showError(el, error){
    const htmlEscapes = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;'};
    // A rejection is not guaranteed to be an Error; fall back to its string form
    // rather than printing "undefined" or failing on a null value.
    const rawMessage = (error && error.message) ? error.message : String(error);
    const safeMessage = String(rawMessage).replace(/[&<>"']/g, (ch) => htmlEscapes[ch]);
    el.innerHTML = ('<div style="color:red;">'
                    + '<p>JavaScript Error: ' + safeMessage + '</p>'
                    + "<p>This usually means there's a typo in your chart specification. "
                    + "See the javascript console for the full traceback.</p>"
                    + '</div>');
    throw error;
}
|
||||
const el = document.getElementById('vis');
|
||||
vegaEmbed("#vis", spec, embedOpt)
|
||||
.catch(error => showError(el, error));
|
||||
})(vegaEmbed);
|
||||
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
44
old data/correlations/corr_transport.html
Normal file
@@ -0,0 +1,44 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<style>
|
||||
#vis.vega-embed {
|
||||
width: 100%;
|
||||
display: flex;
|
||||
}
|
||||
|
||||
#vis.vega-embed details,
|
||||
#vis.vega-embed details summary {
|
||||
position: relative;
|
||||
}
|
||||
</style>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
|
||||
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
|
||||
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
|
||||
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="vis"></div>
|
||||
<script>
|
||||
(function(vegaEmbed) {
|
||||
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-5ac874a21fd43fc95ef8060b3a83793c"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: transport", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-5ac874a21fd43fc95ef8060b3a83793c": [{"row": "active_imp_transport", "col": "active_imp_transport", "corr": 1.0}, {"row": "passive_imp_transport", "col": "active_imp_transport", "corr": 0.40168978254566456}, {"row": "active_click_transport", "col": "active_imp_transport", "corr": 0.8428763034279261}, {"row": "passive_click_transport", "col": "active_imp_transport", "corr": 0.11832571530873176}, {"row": "orders_amt_transport", "col": "active_imp_transport", "corr": 0.17781437332297736}, {"row": "active_imp_transport", "col": "passive_imp_transport", "corr": 0.40168978254566456}, {"row": "passive_imp_transport", "col": "passive_imp_transport", "corr": 1.0}, {"row": 
"active_click_transport", "col": "passive_imp_transport", "corr": 0.4678363557472336}, {"row": "passive_click_transport", "col": "passive_imp_transport", "corr": 0.25797171201314045}, {"row": "orders_amt_transport", "col": "passive_imp_transport", "corr": 0.19235638990080245}, {"row": "active_imp_transport", "col": "active_click_transport", "corr": 0.8428763034279261}, {"row": "passive_imp_transport", "col": "active_click_transport", "corr": 0.4678363557472336}, {"row": "active_click_transport", "col": "active_click_transport", "corr": 1.0}, {"row": "passive_click_transport", "col": "active_click_transport", "corr": 0.09033265638665873}, {"row": "orders_amt_transport", "col": "active_click_transport", "corr": 0.16848280412867794}, {"row": "active_imp_transport", "col": "passive_click_transport", "corr": 0.11832571530873176}, {"row": "passive_imp_transport", "col": "passive_click_transport", "corr": 0.25797171201314045}, {"row": "active_click_transport", "col": "passive_click_transport", "corr": 0.09033265638665873}, {"row": "passive_click_transport", "col": "passive_click_transport", "corr": 1.0}, {"row": "orders_amt_transport", "col": "passive_click_transport", "corr": 0.24259813553198464}, {"row": "active_imp_transport", "col": "orders_amt_transport", "corr": 0.17781437332297736}, {"row": "passive_imp_transport", "col": "orders_amt_transport", "corr": 0.19235638990080245}, {"row": "active_click_transport", "col": "orders_amt_transport", "corr": 0.16848280412867794}, {"row": "passive_click_transport", "col": "orders_amt_transport", "corr": 0.24259813553198464}, {"row": "orders_amt_transport", "col": "orders_amt_transport", "corr": 1.0}]}};
|
||||
var embedOpt = {"mode": "vega-lite"};
|
||||
|
||||
function showError(el, error){
|
||||
el.innerHTML = ('<div style="color:red;">'
|
||||
+ '<p>JavaScript Error: ' + error.message + '</p>'
|
||||
+ "<p>This usually means there's a typo in your chart specification. "
|
||||
+ "See the javascript console for the full traceback.</p>"
|
||||
+ '</div>');
|
||||
throw error;
|
||||
}
|
||||
const el = document.getElementById('vis');
|
||||
vegaEmbed("#vis", spec, embedOpt)
|
||||
.catch(error => showError(el, error));
|
||||
})(vegaEmbed);
|
||||
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
BIN
old data/correlations2/corr_avia.png
Normal file
|
After Width: | Height: | Size: 81 KiB |
BIN
old data/correlations2/corr_avia_hotel.png
Normal file
|
After Width: | Height: | Size: 88 KiB |
BIN
old data/correlations2/corr_ent.png
Normal file
|
After Width: | Height: | Size: 80 KiB |
BIN
old data/correlations2/corr_hotel.png
Normal file
|
After Width: | Height: | Size: 80 KiB |
BIN
old data/correlations2/corr_shopping.png
Normal file
|
After Width: | Height: | Size: 88 KiB |
BIN
old data/correlations2/corr_super.png
Normal file
|
After Width: | Height: | Size: 80 KiB |
BIN
old data/correlations2/corr_transport.png
Normal file
|
After Width: | Height: | Size: 88 KiB |
BIN
old data/default/orders_amt_total/quad_regression.png
Normal file
|
After Width: | Height: | Size: 135 KiB |
44
old data/divided/avia/active_scatter.html
Normal file
44
old data/divided/avia/passive_scatter.html
Normal file
44
old data/divided/ent/active_scatter.html
Normal file
44
old data/divided/ent/passive_scatter.html
Normal file
44
old data/divided/hotel/active_scatter.html
Normal file
44
old data/divided/hotel/passive_scatter.html
Normal file
44
old data/divided/shopping/active_scatter.html
Normal file
44
old data/divided/shopping/passive_scatter.html
Normal file
44
old data/divided/super/active_scatter.html
Normal file
44
old data/divided/super/passive_scatter.html
Normal file
44
old data/divided/total/active_scatter.html
Normal file
44
old data/divided/total/passive_scatter.html
Normal file
44
old data/divided/transport/active_scatter.html
Normal file
44
old data/divided/transport/passive_scatter.html
Normal file
110
old data/model_compare.py
Normal file
@@ -0,0 +1,110 @@
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.compose import ColumnTransformer
|
||||
from sklearn.pipeline import Pipeline
|
||||
from sklearn.impute import SimpleImputer
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.ensemble import GradientBoostingClassifier
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
# Exploratory comparison of two classifiers predicting whether a client ends
# up in the top CTR quartile.  Reads the SQLite dataset, aggregates it to one
# row per client, fits both models and prints ROC-AUC plus the boosting
# feature importances.

# The legacy EDA helpers live in a plain directory next to this script's
# parent, so that directory is put on sys.path before importing them.
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda  # noqa: E402

db_path = project_root / "dataset" / "ds.sqlite"

# try/finally releases the connection even when the query itself raises.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
finally:
    conn.close()

# Row-level totals summed over the per-category columns.
for cols, name in [
    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
    (eda.ORDER_COLS, "orders_amt_total"),
]:
    df[name] = df[cols].sum(axis=1)

df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]

# Number of distinct dates on which each client appears.
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

# One row per client: summed activity, median age, most frequent
# gender/platform value.
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
        device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
    )
    .merge(contact_days, on="id", how="left")
    .reset_index()
)

client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])

# --- Split first, derive the target afterwards ---
train_idx, test_idx = train_test_split(
    client.index, test_size=0.2, random_state=42
)

train = client.loc[train_idx].copy()
test = client.loc[test_idx].copy()

thr = train["ctr_all"].quantile(0.75)  # threshold is estimated on train only
train["high_ctr"] = (train["ctr_all"] >= thr).astype(int)
test["high_ctr"] = (test["ctr_all"] >= thr).astype(int)

# --- Features exclude click_total: it is the numerator of the target ---
num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]
feature_cols = num_cols + cat_cols

X_train = train[feature_cols].copy()
X_test = test[feature_cols].copy()

for features in (X_train, X_test):
    features["gender_cd"] = eda.normalize_gender(features["gender_cd"])
    features["device_platform_cd"] = eda.normalize_device(features["device_platform_cd"])

y_train = train["high_ctr"]
y_test = test["high_ctr"]


def make_preprocessor() -> ColumnTransformer:
    """Build a fresh, unfitted preprocessing step.

    Each pipeline gets its own instance: a shared transformer would be
    refitted in place by whichever pipeline is fitted last.
    """
    return ColumnTransformer([
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ])


log_reg = Pipeline([("pre", make_preprocessor()), ("clf", LogisticRegression(max_iter=1000))])
gb = Pipeline([("pre", make_preprocessor()), ("clf", GradientBoostingClassifier(random_state=42))])

results = {}
for name, model in [("log_reg", log_reg), ("gb", gb)]:
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_test)[:, 1]
    results[name] = roc_auc_score(y_test, proba)

print("CTR threshold (train 0.75q):", thr)
print("AUC results:", results)

# Importances of the boosting model, labelled with the names emitted by its
# own fitted preprocessor.
imp = gb.named_steps["clf"].feature_importances_
feat = gb.named_steps["pre"].get_feature_names_out()
imp_df = pd.DataFrame({"feature": feat, "importance": imp}).sort_values("importance", ascending=False)
print(imp_df.head(15))
|
||||
465
old data/new_plots.py
Normal file
@@ -0,0 +1,465 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
import sys
|
||||
from typing import Dict, Iterable, Optional, Tuple
|
||||
|
||||
import altair as alt
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import statsmodels.api as sm
|
||||
from sklearn.metrics import roc_auc_score, r2_score
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent
|
||||
sys.path.append(str(PROJECT_ROOT / "main_hypot"))
|
||||
|
||||
import best_model_and_plots as bmp
|
||||
from category_quadreg import (
|
||||
BASE_COLUMNS,
|
||||
CATEGORIES,
|
||||
COMBINED,
|
||||
add_combined_category,
|
||||
build_client_by_category,
|
||||
)
|
||||
|
||||
OUTPUT_DIR = PROJECT_ROOT / "new_plots"
|
||||
FONT_PATH = Path("/Users/dan/Downloads/AyuGram Desktop/SegoeUIVF.ttf")
|
||||
|
||||
def inject_font_css(html_path: Path) -> None:
    """Embed an ``@font-face`` rule for the Segoe UI variable font into a saved HTML file.

    Nothing happens when the font file at ``FONT_PATH`` does not exist or when
    the exact ``<style>`` snippet is already present.  Otherwise the snippet is
    placed right before the first ``</head>``; a document without ``</head>``
    gets the snippet prepended.  The file is rewritten in place as UTF-8.
    """
    if not FONT_PATH.exists():
        return

    font_url = FONT_PATH.as_uri()
    style_tag = (
        "<style>"
        "@font-face{font-family:'Segoe UI Variable'; "
        f"src: url('{font_url}') format('truetype'); "
        "font-weight:100 900; font-style:normal;}\n"
        "body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}"
        "</style>"
    )

    document = html_path.read_text(encoding="utf-8")
    if style_tag in document:
        # Already injected on an earlier run; keep the call idempotent.
        return

    head_close = "</head>"
    if head_close in document:
        patched = document.replace(head_close, style_tag + "\n" + head_close, 1)
    else:
        patched = style_tag + document
    html_path.write_text(patched, encoding="utf-8")
|
||||
|
||||
|
||||
# Theme and fonts follow the reference example.
def configure_chart(chart: alt.Chart, title: str, width: int = 700, height: int = 500) -> alt.Chart:
    """Apply the shared look (dark theme, size, padding, Segoe UI fonts) to a chart.

    Enables the Altair "dark" theme as a side effect, then returns the chart
    with title, axis and legend configuration applied.
    """
    font_family = "Segoe UI Variable"

    alt.theme.enable("dark")

    sized = chart.properties(title=title, width=width, height=height, padding=30)
    titled = sized.configure_title(
        fontSize=18,
        font=font_family,
        fontWeight=600,
        anchor="start",
    )
    with_axes = titled.configure_axis(
        grid=True,
        labelFont=font_family,
        titleFont=font_family,
        labelFontSize=16,
        titleFontSize=18,
        labelFontWeight=400,
        titleFontWeight=600,
    )
    return with_axes.configure_legend(labelFont=font_family, titleFont=font_family)
|
||||
|
||||
|
||||
def prepare_client_data() -> pd.DataFrame:
    """Return the per-client aggregates built by ``best_model_and_plots.load_client_level``."""
    client_level = bmp.load_client_level(bmp.DB_PATH)
    return client_level
|
||||
|
||||
|
||||
def prepare_category_client_data() -> pd.DataFrame:
    """Load the raw ``communications`` table and build per-category client aggregates.

    The raw rows are passed to ``build_client_by_category``; every entry of
    ``COMBINED`` is then appended through ``add_combined_category``.

    Returns:
        The client-level frame returned by the last aggregation step.
    """
    conn = bmp.sqlite3.connect(bmp.DB_PATH)
    try:
        raw = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
    finally:
        # The connection used to be created inline and never closed.
        conn.close()

    client = build_client_by_category(raw)
    for combo_name, cats in COMBINED.items():
        client = add_combined_category(client, combo_name, cats)
    return client
|
||||
|
||||
|
||||
def filter_and_trend(
    df: pd.DataFrame,
    y_col: str,
    *,
    x_col: str = bmp.X_COL,
    x_max: float = bmp.DEFAULT_X_MAX,
    y_max: float = bmp.DEFAULT_Y_MAX,
    q_low: float = bmp.DEFAULT_Q_LOW,
    q_high: float = bmp.DEFAULT_Q_HIGH,
    iqr_k: float = bmp.DEFAULT_IQR_K,
    trend_method: str = bmp.DEFAULT_TREND_METHOD,
    trend_frac: float = bmp.DEFAULT_TREND_FRAC,
    savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
) -> Tuple[pd.DataFrame, Tuple[np.ndarray, np.ndarray]]:
    """Clean the (x, y) cloud and compute a smoothed trend over it.

    Steps: drop rows with missing x or y, apply ``bmp.filter_x_range`` with
    ``x_max``, apply ``bmp.remove_outliers``, keep rows with ``y <= y_max``,
    then call ``bmp.compute_trend`` on what is left.

    Returns:
        ``(cleaned, (trend_x, trend_y))``: the filtered frame and the trend
        pair exactly as produced by ``bmp.compute_trend``.
    """
    xy = df[[x_col, y_col]].dropna()
    without_outliers = bmp.remove_outliers(
        bmp.filter_x_range(xy, x_col, x_max),
        y_col=y_col,
        x_col=x_col,
        iqr_k=iqr_k,
        q_low=q_low,
        q_high=q_high,
    )

    # Cap y as well so the plotted range stays readable.
    within_y = without_outliers[y_col] <= y_max
    cleaned = without_outliers[within_y].copy()

    trend_x, trend_y = bmp.compute_trend(
        cleaned,
        y_col=y_col,
        x_col=x_col,
        method=trend_method,
        lowess_frac=trend_frac,
        savgol_window=savgol_window,
    )
    return cleaned, (trend_x, trend_y)
|
||||
|
||||
|
||||
def compute_density_alpha(df: pd.DataFrame, x_col: str, y_col: str, x_max: float, y_max: float) -> pd.Series:
    """Return one opacity value per row of ``df``, aligned to ``df.index``.

    Delegates to ``bmp.compute_density_alpha`` using the module-wide bin and
    alpha defaults.  When that helper returns nothing, every row gets
    ``bmp.DEFAULT_ALPHA`` instead.
    """
    density_alphas = bmp.compute_density_alpha(
        df,
        x_col=x_col,
        y_col=y_col,
        x_max=x_max,
        bins_x=bmp.DEFAULT_BINS_X,
        bins_y=bmp.DEFAULT_BINS_Y,
        alpha_min=bmp.DEFAULT_ALPHA_MIN,
        alpha_max=bmp.DEFAULT_ALPHA_MAX,
        y_min=bmp.DEFAULT_Y_MIN,
        y_max_limit=y_max,
    )
    values = density_alphas if len(density_alphas) != 0 else [bmp.DEFAULT_ALPHA] * len(df)
    return pd.Series(values, index=df.index)
|
||||
|
||||
|
||||
def fit_quadratic(
    df: pd.DataFrame,
    y_col: str,
    trend_data: Tuple[np.ndarray, np.ndarray],
    *,
    x_col: str = bmp.X_COL,
    x_max: float = bmp.DEFAULT_X_MAX,
    force_negative_b2: bool = False,
) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]:
    """Fit ``y ~ const + x + x**2`` by OLS with HC3 covariance and score the fit.

    Args:
        df: Frame holding ``x_col`` and ``y_col``.
        y_col: Response column.
        trend_data: ``(tx, ty)`` smoothed trend; ``tx`` may be ``None``.
        x_col: Predictor column.
        x_max: Trend points with ``tx > x_max`` are ignored for ``r2_trend``.
        force_negative_b2: When true the quadratic regressor is ``-x**2``
            instead of ``x**2``, so the coefficient on ``x**2`` itself is the
            negated third parameter of the returned model.

    Returns:
        ``(model, metrics)``.  ``metrics`` has ``"auc"`` (in-sample ROC-AUC of
        the fitted values against the flag ``y > 0``; NaN when only one class
        is present) and ``"r2_trend"`` (R² of the fitted parabola against the
        trend values; NaN when it cannot be computed).  ``(None, {})`` is
        returned for fewer than three rows.
    """
    if len(df) < 3:
        return None, {}

    def design(values: np.ndarray) -> np.ndarray:
        """Build the [const, x, ±x²] design matrix for ``values``."""
        quad = -values**2 if force_negative_b2 else values**2
        # has_constant="add": the default ("skip") silently omits the
        # intercept when a column is already a non-zero constant, which
        # leaves only two parameters and breaks every params[2] lookup.
        return sm.add_constant(np.column_stack([values, quad]), has_constant="add")

    x = df[x_col].to_numpy()
    y = df[y_col].to_numpy()
    X_design = design(x)
    model = sm.OLS(y, X_design).fit(cov_type="HC3")

    # AUC against the binary "has any order" flag.
    auc = np.nan
    binary = (y > 0).astype(int)
    if len(np.unique(binary)) > 1:
        auc = roc_auc_score(binary, model.predict(X_design))

    # R² of the parabola measured against the smoothed trend.
    tx, ty = trend_data
    r2_trend = np.nan
    if tx is not None and len(tx) >= 3:
        mask = (tx <= x_max) & ~np.isnan(ty)
        tx = tx[mask]
        ty = ty[mask]
        if len(tx) >= 3 and np.nanvar(ty) > 0:
            y_hat_trend = model.predict(design(tx))
            r2_trend = r2_score(ty, y_hat_trend)

    return model, {"auc": auc, "r2_trend": r2_trend}
|
||||
|
||||
|
||||
def build_annotation(
    params: np.ndarray,
    pvals: np.ndarray,
    metrics: dict,
    n: int,
    *,
    b2_effective: Optional[float] = None,
    x_pos: float = 0.5,
    line_step: float = 0.4,
) -> pd.DataFrame:
    """Build a frame of stacked text labels summarising a quadratic fit.

    Args:
        params: Fitted coefficients; indices 1 and 2 are the linear and
            quadratic terms.
        pvals: p-values aligned with ``params``.
        metrics: May contain ``"r2_trend"``, ``"auc"`` (both default to NaN)
            and ``"y_max_for_anno"`` (y of the first label, default 0).
        n: Sample size shown in the last label.
        b2_effective: Value printed for b2 instead of ``params[2]`` when given.
        x_pos: x coordinate shared by all labels.
        line_step: Vertical distance between consecutive labels in data
            units.  The default keeps the previous fixed spacing of 0.4;
            charts with a short y range need a smaller step.

    Returns:
        DataFrame with columns ``x``, ``y`` and ``label``, one row per line.
    """
    b2_val = b2_effective if b2_effective is not None else params[2]
    lines = [
        f"R2_trend={metrics.get('r2_trend', np.nan):.3f}",
        f"AUC={metrics.get('auc', np.nan):.3f}",
        f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
        f"b2={b2_val:.3f} (p={pvals[2]:.3g})",
        f"n={n}",
    ]
    top = metrics.get("y_max_for_anno", 0)
    return pd.DataFrame(
        {
            "x": [x_pos] * len(lines),
            "y": [top - i * line_step for i in range(len(lines))],
            "label": lines,
        }
    )
|
||||
|
||||
|
||||
def save_scatter_trend_quad(
    df: pd.DataFrame,
    y_col: str,
    out_path: Path,
    *,
    x_col: str = bmp.X_COL,
    x_max: float = bmp.DEFAULT_X_MAX,
    y_max: float = bmp.DEFAULT_Y_MAX,
    force_negative_b2: bool = False,
    savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
    title: str = "",
    q_low: float = bmp.DEFAULT_Q_LOW,
    q_high: float = bmp.DEFAULT_Q_HIGH,
    iqr_k: float = bmp.DEFAULT_IQR_K,
) -> None:
    """Render scatter + smoothed trend + quadratic fit and save it as HTML.

    Args:
        df: Client-level frame containing ``x_col`` and ``y_col``.
        y_col: Response column (also used as the y-axis title).
        out_path: Target HTML file; parent directories are created.
        x_col: Predictor column.
        x_max: Upper x limit for filtering and for the x axis.
        y_max: Upper y limit for filtering and for the y axis.
        force_negative_b2: Forwarded to ``fit_quadratic`` (regress on ``-x**2``).
        savgol_window: Window forwarded to the trend computation.
        title: Chart title prefix; defaults to ``"<y_col> vs <x_col>"``.
        q_low, q_high, iqr_k: Outlier-removal settings forwarded to
            ``filter_and_trend``; the defaults reproduce the former behaviour.

    Prints a message and returns without writing anything when no trend
    could be computed or there are too few points for the quadratic fit.
    """
    cleaned, trend_data = filter_and_trend(
        df,
        y_col=y_col,
        x_col=x_col,
        x_max=x_max,
        y_max=y_max,
        q_low=q_low,
        q_high=q_high,
        iqr_k=iqr_k,
        trend_method=bmp.DEFAULT_TREND_METHOD,
        trend_frac=bmp.DEFAULT_TREND_FRAC,
        savgol_window=savgol_window,
    )
    if trend_data[0] is None:
        print(f"[{y_col}] нет тренда/данных для построения")
        return

    cleaned = cleaned.copy()
    cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max)

    model, metrics = fit_quadratic(cleaned, y_col, trend_data, x_col=x_col, x_max=x_max, force_negative_b2=force_negative_b2)
    if model is None:
        print(f"[{y_col}] недостаточно точек для квадрата")
        return

    params = model.params
    pvals = model.pvalues
    # With force_negative_b2 the regressor is -x**2, so the coefficient on
    # x**2 is the negated parameter.  The former -abs(...) printed a negative
    # curvature even when the plotted parabola was convex.
    b2_effective = -params[2] if force_negative_b2 else params[2]

    # Dense grid for drawing the fitted parabola.
    x_grid = np.linspace(0, x_max, 400)
    quad_term = -x_grid**2 if force_negative_b2 else x_grid**2
    quad_df = pd.DataFrame(
        {
            x_col: x_grid,
            "quad": model.predict(sm.add_constant(np.column_stack([x_grid, quad_term]))),
        }
    )

    trend_df = pd.DataFrame({x_col: trend_data[0], "trend": trend_data[1]})
    metrics_text = [
        f"R2_trend={metrics['r2_trend']:.3f}",
        f"AUC={metrics['auc']:.3f}",
        f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
        f"b2={b2_effective:.3f} (p={pvals[2]:.3g})",
        f"n={len(cleaned)}",
    ]

    x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max)
    y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False)

    points = alt.Chart(cleaned).mark_circle(size=40).encode(
        x=alt.X(x_col, title="Среднее число показов в день", scale=x_scale),
        y=alt.Y(y_col, title=y_col, scale=y_scale),
        opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)),
        color=alt.value(bmp.DEFAULT_SCATTER_COLOR),
        tooltip=[x_col, y_col],
    )

    trend_line = alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode(
        x=alt.X(x_col, scale=x_scale),
        y=alt.Y("trend", scale=y_scale),
    )
    quad_line = alt.Chart(quad_df).mark_line(color="blue", strokeWidth=2.2, strokeDash=[6, 4]).encode(
        x=alt.X(x_col, scale=x_scale),
        y=alt.Y("quad", scale=y_scale),
    )

    # The fit summary is appended to the chart title.
    subtitle = " • ".join(metrics_text)

    chart = alt.layer(points, trend_line, quad_line).resolve_scale(opacity="independent")
    chart = configure_chart(chart, (title or f"{y_col} vs {x_col}") + f" — {subtitle}", width=800, height=600)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    chart.save(out_path)
    inject_font_css(out_path)
    print(f"Saved {out_path}")
|
||||
|
||||
|
||||
def save_correlation_heatmap(df: pd.DataFrame, cols: Iterable[str], title: str, out_path: Path) -> None:
    """Save a heatmap of the pairwise correlations between ``cols`` as HTML.

    The correlation matrix from ``DataFrame.corr`` is reshaped to long form
    (``row``, ``col``, ``corr``) and drawn as coloured rectangles on a fixed
    ``[-1, 1]`` red-blue scale.  Parent directories of ``out_path`` are
    created, and the font CSS is injected after saving.
    """
    matrix = df[list(cols)].corr()
    long_form = (
        matrix.reset_index()
        .melt(id_vars="index", var_name="col", value_name="corr")
        .rename(columns={"index": "row"})
    )

    color_encoding = alt.Color(
        "corr:Q",
        scale=alt.Scale(domain=(-1, 1), scheme="redblue"),
        legend=alt.Legend(title="corr"),
    )
    heatmap = alt.Chart(long_form).mark_rect().encode(
        x=alt.X("col:N", title=""),
        y=alt.Y("row:N", title=""),
        color=color_encoding,
        tooltip=["row", "col", alt.Tooltip("corr:Q", format=".3f")],
    )
    styled = configure_chart(heatmap, title, width=400, height=400)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    styled.save(out_path)
    inject_font_css(out_path)
    print(f"Saved {out_path}")
|
||||
|
||||
|
||||
def generate_total_plots() -> None:
    """Render the scatter/trend/quadratic chart for total orders over all clients."""
    clients = prepare_client_data()
    target_dir = OUTPUT_DIR / "orders_amt_total"
    plot_options = {
        "x_max": bmp.DEFAULT_X_MAX,
        "y_max": bmp.DEFAULT_Y_MAX,
        "savgol_window": bmp.DEFAULT_SAVGOL_WINDOW,
        "title": "Заказы vs средние показы (все клиенты)",
    }
    save_scatter_trend_quad(
        clients,
        y_col="orders_amt_total",
        out_path=target_dir / "scatter_trend_quad.html",
        **plot_options,
    )
|
||||
|
||||
|
||||
def generate_category_plots() -> None:
    """Render correlation heatmaps and scatter/trend/quadratic charts per category.

    Covers every entry of ``CATEGORIES`` plus every key of ``COMBINED``.  All
    heatmaps are written first, then all scatter charts.  Axis limits and the
    smoothing window can be overridden per category; anything not listed
    falls back to the ``bmp`` defaults.
    """
    client = prepare_category_client_data()

    # Per-category axis limits.
    x_limits = {
        "ent": 4,
        "transport": 6,
        "super": 4,
        "avia": 4,
        "shopping": 4,
        "avia_hotel": 5,
    }
    y_limits = {
        "ent": 2.5,
        "transport": 8,
        "avia": 1.5,
        "shopping": 2.5,
        "super": 5.5,
        "avia_hotel": 2.0,
    }
    # Per-category smoothing windows.
    smoothing_windows = {
        "ent": 301,
        "transport": 401,
        "avia": 301,
        "shopping": 201,
        "avia_hotel": 301,
    }
    # NOTE(review): the two mappings below are defined but never read in this
    # function, so they currently have no effect on the output - confirm
    # whether they were meant to be forwarded to the outlier removal.
    q_high_overrides = {"avia_hotel": 0.9}
    iqr_overrides = {"avia_hotel": 1.2}

    all_categories = CATEGORIES + list(COMBINED.keys())

    # Correlations
    heatmap_dir = OUTPUT_DIR / "correlations"
    for cat in all_categories:
        save_correlation_heatmap(
            client,
            [f"{base}_{cat}" for base in BASE_COLUMNS],
            title=f"Корреляции показов/кликов/заказов: {cat}",
            out_path=heatmap_dir / f"corr_{cat}.html",
        )

    # Scatter clouds + quadratic fit
    for cat in all_categories:
        orders_col = f"orders_amt_{cat}"
        impressions_col = f"avg_imp_per_day_{cat}"
        save_scatter_trend_quad(
            client,
            y_col=orders_col,
            out_path=OUTPUT_DIR / orders_col / "scatter_trend_quad.html",
            x_col=impressions_col,
            x_max=x_limits.get(cat, bmp.DEFAULT_X_MAX),
            y_max=y_limits.get(cat, bmp.DEFAULT_Y_MAX),
            force_negative_b2=(cat == "avia_hotel"),
            savgol_window=smoothing_windows.get(cat, bmp.DEFAULT_SAVGOL_WINDOW),
            title=f"{orders_col} vs {impressions_col}",
        )
|
||||
|
||||
|
||||
def generate_basic_scatters() -> None:
    """Recreate the basic scatter set from ``best_model_and_plots``.

    Writes three HTML charts for ``orders_amt_total`` against ``bmp.X_COL``
    into ``OUTPUT_DIR / "orders_amt_total"``: all points, points with
    outliers removed, and the cleaned points with the smoothed trend line.
    """
    df = prepare_client_data()
    y_col = "orders_amt_total"
    x_col = bmp.X_COL
    x_max = bmp.DEFAULT_X_MAX
    y_max = bmp.DEFAULT_Y_MAX
    out_dir = OUTPUT_DIR / y_col
    # This is the first step run by main(), so the per-target directory may
    # not exist yet; without this the first save fails on a fresh checkout.
    out_dir.mkdir(parents=True, exist_ok=True)

    base = df[[x_col, y_col]].dropna()
    base = bmp.filter_x_range(base, x_col, x_max)
    base = base.copy()
    base["alpha"] = compute_density_alpha(base, x_col, y_col, x_max, y_max)

    def scatter_chart(data: pd.DataFrame, title: str, trend: Optional[Tuple[np.ndarray, np.ndarray]] = None) -> alt.Chart:
        """Build the point cloud, optionally overlaid with a trend line."""
        x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max)
        y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False)
        points = alt.Chart(data).mark_circle(size=40).encode(
            x=alt.X(x_col, title="Среднее число показов в день", scale=x_scale),
            y=alt.Y(y_col, title=y_col, scale=y_scale),
            opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)),
            color=alt.value(bmp.DEFAULT_SCATTER_COLOR),
            tooltip=[x_col, y_col],
        )
        layers = [points]
        if trend is not None and trend[0] is not None:
            trend_df = pd.DataFrame({x_col: trend[0], "trend": trend[1]})
            layers.append(
                alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode(
                    x=alt.X(x_col, scale=x_scale),
                    y=alt.Y("trend", scale=y_scale),
                )
            )
        chart = alt.layer(*layers).resolve_scale(opacity="independent")
        return configure_chart(chart, title, width=800, height=600)

    def save_chart(chart: alt.Chart, file_name: str) -> None:
        """Save ``chart`` into ``out_dir`` and inject the font CSS."""
        target = out_dir / file_name
        chart.save(target)
        inject_font_css(target)

    # 1) all points
    save_chart(scatter_chart(base, "Облако: все точки"), "scatter_all.html")

    # 2) outliers removed
    cleaned = bmp.remove_outliers(base, y_col=y_col, x_col=x_col, iqr_k=bmp.DEFAULT_IQR_K, q_low=bmp.DEFAULT_Q_LOW, q_high=bmp.DEFAULT_Q_HIGH)
    cleaned = cleaned.copy()
    # Opacity is recomputed because the point set changed.
    cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max)
    save_chart(scatter_chart(cleaned, "Облако: без выбросов"), "scatter_clean.html")

    # 3) outliers removed + trend
    tx, ty = bmp.compute_trend(
        cleaned,
        y_col=y_col,
        x_col=x_col,
        method=bmp.DEFAULT_TREND_METHOD,
        lowess_frac=bmp.DEFAULT_TREND_FRAC,
        savgol_window=bmp.DEFAULT_SAVGOL_WINDOW,
    )
    save_chart(scatter_chart(cleaned, "Облако: без выбросов + тренд", trend=(tx, ty)), "scatter_clean_trend.html")
|
||||
|
||||
|
||||
def main() -> None:
    """Create the output directory, then render every plot family in order."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    render_steps = (
        generate_basic_scatters,
        generate_total_plots,
        generate_category_plots,
    )
    for render in render_steps:
        render()


if __name__ == "__main__":
    main()
|
||||
BIN
old data/old_generated_plots/best_bins.png
Normal file
|
After Width: | Height: | Size: 119 KiB |
BIN
old data/old_generated_plots/best_model_prob.png
Normal file
|
After Width: | Height: | Size: 47 KiB |
BIN
old data/old_generated_plots/orders_vs_avg_imp_per_day.png
Normal file
|
After Width: | Height: | Size: 91 KiB |
|
After Width: | Height: | Size: 422 KiB |
|
After Width: | Height: | Size: 177 KiB |
|
After Width: | Height: | Size: 70 KiB |
BIN
old data/old_generated_plots/orders_vs_avg_imp_scatter.png
Normal file
|
After Width: | Height: | Size: 122 KiB |
BIN
old data/old_generated_plots/orders_vs_avg_imp_scatter_clean.png
Normal file
|
After Width: | Height: | Size: 124 KiB |
BIN
old data/old_generated_plots/orders_vs_avg_imp_scatter_trend.png
Normal file
|
After Width: | Height: | Size: 130 KiB |
BIN
old data/old_generated_plots/orders_vs_avg_imp_with_costs.png
Normal file
|
After Width: | Height: | Size: 405 KiB |
BIN
old data/old_generated_plots/orders_vs_avg_imp_without_costs.png
Normal file
|
After Width: | Height: | Size: 387 KiB |
|
After Width: | Height: | Size: 360 KiB |
|
After Width: | Height: | Size: 256 KiB |
BIN
old data/old_generated_plots/quad_regression_with_costs.png
Normal file
|
After Width: | Height: | Size: 440 KiB |
BIN
old data/old_generated_plots/stat_bins.png
Normal file
|
After Width: | Height: | Size: 87 KiB |
44
old data/orders_amt_avia/scatter_trend_quad.html
Normal file
44
old data/orders_amt_avia_hotel/scatter_trend_quad.html
Normal file
44
old data/orders_amt_ent/scatter_trend_quad.html
Normal file
44
old data/orders_amt_hotel/scatter_trend_quad.html
Normal file
44
old data/orders_amt_shopping/scatter_trend_quad.html
Normal file
44
old data/orders_amt_super/scatter_trend_quad.html
Normal file
44
old data/orders_amt_total/scatter_all.html
Normal file
44
old data/orders_amt_total/scatter_clean.html
Normal file
44
old data/orders_amt_total/scatter_clean_trend.html
Normal file
44
old data/orders_amt_total/scatter_trend_quad.html
Normal file
44
old data/orders_amt_transport/scatter_trend_quad.html
Normal file
|
After Width: | Height: | Size: 133 KiB |
|
After Width: | Height: | Size: 133 KiB |
|
After Width: | Height: | Size: 133 KiB |
|
After Width: | Height: | Size: 141 KiB |
|
After Width: | Height: | Size: 139 KiB |
|
After Width: | Height: | Size: 147 KiB |
|
After Width: | Height: | Size: 144 KiB |
|
After Width: | Height: | Size: 146 KiB |
1821
old data/preanalysis_old_bad/01_load_and_clean.ipynb
Normal file
1508
old data/preanalysis_old_bad/02_univariate_bivariate.ipynb
Normal file
1296
old data/preanalysis_old_bad/03_time_and_lags.ipynb
Normal file
1987
old data/preanalysis_old_bad/04_clients_segmentation.ipynb
Normal file
449
old data/preanalysis_old_bad/05_exploratory_models.ipynb
Normal file
55
old data/preanalysis_old_bad/eda_report.ipynb
Normal file
@@ -0,0 +1,55 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3d3a9c98",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# EDA «Коммуникации в Городе»\n",
|
||||
"\n",
|
||||
"## Кратко о данных\n",
|
||||
"- 118 189 строк, 8 339 клиентов, 35 исходных столбцов + инженерные (totals, CTR/CR, флаги).\n",
|
||||
"- Диапазон дат: 2025‑01‑09 — 2025‑11‑04 (284 дня).\n",
|
||||
"- Категории сервисов: ent, super, transport, shopping, hotel, avia; активные и пассивные показы/клики, заказы по категориям.\n",
|
||||
"- Дубликаты по ключу (id, business_dt): нет.\n",
|
||||
"\n",
|
||||
"## Качество данных\n",
|
||||
"- Пропуски: несущественные (NaN почти нет), отрицательных значений не обнаружено.\n",
|
||||
"- Возраст: 15–80 лет, p1/p99 = 22/68, мусора (<14 или >100) нет.\n",
|
||||
"- Гендер: 68.5% M, 31.5% F. Платформа после нормализации: ~52.5% iOS, ~46.7% Android, 1.1% iPadOS.\n",
|
||||
"- Признаки «заспамленности» и агрегаты на клиента добавлены: imp/click/order totals, CTR/CR, contact_days, avg_impressions_per_contact_day, order_categories_count.\n",
|
||||
"\n",
|
||||
"## Каналы и эффективность (агрегировано по всем строкам)\n",
|
||||
"- Active impressions ≈ 219.5k, passive impressions ≈ 473.1k.\n",
|
||||
"- Active clicks ≈ 147.3k (CTR_active ≈ 0.67), passive clicks ≈ 18.1k (CTR_passive ≈ 0.038).\n",
|
||||
"- Заказы всего: 12 439; CR click→order ≈ 7.5%, CR imp→order ≈ 1.8%.\n",
|
||||
"- Дневных точек: 284; daily агрегаты подготовлены (CTR/CR, day_of_week).\n",
|
||||
"\n",
|
||||
"## Демография и устройство vs эффективность (по клиентским агрегатам)\n",
|
||||
"- Таблицы по полу/возрастным группам/платформам готовы в `04_clients_segmentation.ipynb` (средние impressions/clicks/orders и CTR/CR).\n",
|
||||
"- Гипотезы: в 05-м ноутбуке добавлены примеры Mann–Whitney по CTR active vs passive и по полу; можно расширять на платформы и возраст.\n",
|
||||
"\n",
|
||||
"## Лаги и сезонность\n",
|
||||
"- Дневные ряды и метрики CTR/CR по времени и по дням недели — см. `03_time_and_lags.ipynb`.\n",
|
||||
"- Лаги: реализованы кросс-корреляции orders vs impressions/clicks (hotel, avia) для lag 0–7; по клиентам — first_imp/click/order и распределения дней до заказа.\n",
|
||||
"\n",
|
||||
"## Сегменты и «усталость»\n",
|
||||
"- Сегменты каналов: only_active / only_passive / both + метрики; бины по числу категорий заказов.\n",
|
||||
"- «Заспамленность»: bin по avg_impressions_per_contact_day с CTR/CR; stacked доли категорий заказов по возрасту.\n",
|
||||
"\n",
|
||||
"## Модели как часть EDA\n",
|
||||
"- На клиентском уровне собран датасет для задачи `has_any_order`; pipeline с OHE + StandardScaler + LogisticRegression и RandomForest (ROC-AUC и важности).\n",
|
||||
"- Выводы по коэффициентам/важности доступны в `05_exploratory_models.ipynb`.\n",
|
||||
"\n",
|
||||
"## Что делать дальше\n",
|
||||
"- Прогнать все ноутбуки end-to-end (данные готовы, зависимости в `.venv`): `jupyter lab` или `jupyter nbconvert --execute`.\n",
|
||||
"- Уточнить нормализацию категорий (при необходимости) и при желании сохранить `dataset/ds_clean.parquet` (флаг в `01_load_and_clean.ipynb`).\n",
|
||||
"- Добавить/актуализировать бизнес-гипотезы (категории, платформы, возраст) и зафиксировать p-value в таблице гипотез.\n",
|
||||
"- При необходимости усилить визуализацию: календарные heatmap по CTR, ECDF лагов по каждой категории, PDP для топ-фичей модели.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
41
old data/preanalysis_old_bad/eda_report.md
Normal file
@@ -0,0 +1,41 @@
|
||||
# EDA «Коммуникации в Городе»
|
||||
|
||||
## Кратко о данных
|
||||
- 118189 строк, 8339 клиентов, 35 исходных столбцов + инженерные (totals, CTR/CR, флаги).
|
||||
- Диапазон дат: 2025‑01‑09 — 2025‑11‑04 (284 дня с данными).
|
||||
- Категории сервисов: ent, super, transport, shopping, hotel, avia; активные и пассивные показы/клики, заказы по категориям.
|
||||
- Дубликаты по ключу (id, business_dt): нет.
|
||||
|
||||
## Качество данных
|
||||
- Пропуски: несущественные (NaN почти нет), отрицательных значений не обнаружено.
|
||||
- Возраст: 15–80 лет, p1/p99 = 22/68, мусора (<14 или >100) нет.
|
||||
- Гендер: 68.5% M, 31.5% F. Платформа после нормализации: ~52.5% iOS, ~46.7% Android, 1.1% iPadOS.
|
||||
- Признаки «заспамленности» и агрегаты на клиента добавлены: imp/click/order totals, CTR/CR, contact_days, avg_impressions_per_contact_day, order_categories_count.
|
||||
|
||||
## Каналы и эффективность (агрегировано по всем строкам)
|
||||
- Active impressions ≈ 219.5k, passive impressions ≈ 473.1k.
|
||||
- Active clicks ≈ 147.3k (CTR_active ≈ 0.67), passive clicks ≈ 18.1k (CTR_passive ≈ 0.038).
|
||||
- Заказы всего: 12439; CR click→order ≈ 7.5%, CR imp→order ≈ 1.8%.
|
||||
- Дневных точек: 284; daily агрегаты подготовлены (CTR/CR, day_of_week).
|
||||
|
||||
## Демография и устройство vs эффективность (по клиентским агрегатам)
|
||||
- Таблицы по полу/возрастным группам/платформам готовы в `04_clients_segmentation.ipynb` (средние impressions/clicks/orders и CTR/CR).
|
||||
- Гипотезы: в 05-м ноутбуке добавлены примеры Mann–Whitney по CTR active vs passive и по полу; можно расширять на платформы и возраст.
|
||||
|
||||
## Лаги и сезонность
|
||||
- Дневные ряды и метрики CTR/CR по времени и по дням недели — см. `03_time_and_lags.ipynb`.
|
||||
- Лаги: реализованы кросс-корреляции orders vs impressions/clicks (hotel, avia) для lag 0–7; по клиентам — first_imp/click/order и распределения дней до заказа.
|
||||
|
||||
## Сегменты и «усталость»
|
||||
- Сегменты каналов: only_active / only_passive / both + метрики; бины по числу категорий заказов.
|
||||
- «Заспамленность»: bin по avg_impressions_per_contact_day с CTR/CR; stacked доли категорий заказов по возрасту.
|
||||
|
||||
## Модели как часть EDA
|
||||
- На клиентском уровне собран датасет для задачи `has_any_order`; pipeline с OHE + StandardScaler + LogisticRegression и RandomForest (ROC-AUC и важности).
|
||||
- Выводы по коэффициентам/важности доступны в `05_exploratory_models.ipynb`.
|
||||
|
||||
## Что делать дальше
|
||||
- Прогнать все ноутбуки end-to-end (данные готовы, зависимости в `.venv`): `jupyter lab` или `jupyter nbconvert --execute`.
|
||||
- Уточнить нормализацию категорий (при необходимости) и при желании сохранить `dataset/ds_clean.parquet` (флаг в `01_load_and_clean.ipynb`).
|
||||
- Добавить/актуализировать бизнес-гипотезы (категории, платформы, возраст) и зафиксировать p-value в таблице гипотез.
|
||||
- При необходимости усилить визуализацию: календарные heatmap по CTR, ECDF лагов по каждой категории, PDP для топ-фичей модели.
|
||||
154
old data/preanalysis_old_bad/eda_utils.py
Normal file
@@ -0,0 +1,154 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
# Default input file and the column families derived from the service categories.
DATA_PATH = Path("dataset/ds.csv")
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]

# One column per category for every (channel, event) combination.
ACTIVE_IMP_COLS = ["active_imp_" + name for name in CATEGORIES]
PASSIVE_IMP_COLS = ["passive_imp_" + name for name in CATEGORIES]
ACTIVE_CLICK_COLS = ["active_click_" + name for name in CATEGORIES]
PASSIVE_CLICK_COLS = ["passive_click_" + name for name in CATEGORIES]
ORDER_COLS = ["orders_amt_" + name for name in CATEGORIES]

# All numeric inputs: every event counter plus the client's age.
NUMERIC_COLS = [
    *ACTIVE_IMP_COLS,
    *PASSIVE_IMP_COLS,
    *ACTIVE_CLICK_COLS,
    *PASSIVE_CLICK_COLS,
    *ORDER_COLS,
    "age",
]
CAT_COLS = ["gender_cd", "device_platform_cd"]
|
||||
|
||||
|
||||
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
    """Divide ``numerator`` by ``denominator``, giving NaN where the denominator is zero.

    A Series denominator has its zeros replaced by NaN element-wise. Any other
    denominator is treated as a scalar and is swapped for NaN when it equals zero.
    """
    if not isinstance(denominator, pd.Series):
        guarded = denominator if float(denominator) != 0 else np.nan
        return numerator / guarded
    return numerator / denominator.replace(0, np.nan)
|
||||
|
||||
|
||||
def normalize_gender(series: pd.Series) -> pd.Series:
    """Collapse raw gender codes to ``"M"``, ``"F"`` or ``"UNKNOWN"``.

    Values are stringified, stripped and upper-cased before lookup; missing
    values and anything outside the known spellings become ``"UNKNOWN"``.
    """
    canonical = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
    tokens = series.fillna("UNKNOWN").astype(str)
    tokens = tokens.str.strip().str.upper()
    recognised = tokens.map(canonical)
    return recognised.fillna("UNKNOWN")
|
||||
|
||||
|
||||
def normalize_device(series: pd.Series) -> pd.Series:
    """Map raw platform strings onto canonical names.

    The lookup key is the stripped, lower-cased value with spaces and
    underscores removed. Values without a known key fall back to the stripped
    original in title case; missing values are first replaced by ``"unknown"``.
    """
    known_platforms = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
    stripped = series.fillna("unknown").astype(str).str.strip()
    keys = stripped.str.lower()
    for separator in (" ", "_"):
        keys = keys.str.replace(separator, "")
    canonical = keys.map(known_platforms)
    return canonical.fillna(stripped.str.title())
|
||||
|
||||
|
||||
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``age_group`` column by binning ``age`` into left-closed brackets.

    The frame is modified in place and also returned.
    """
    # Lower bound of each bracket -> its label; the last bracket is open-ended.
    brackets = {0: "<25", 25: "25-34", 35: "35-44", 45: "45-54", 55: "55+"}
    edges = [*brackets, np.inf]
    df["age_group"] = pd.cut(df["age"], bins=edges, labels=list(brackets.values()), right=False)
    return df
|
||||
|
||||
|
||||
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
    """Add per-row totals across categories plus CTR/CR ratios.

    Totals are row sums over each column family; ratios go through
    ``safe_divide`` so a zero denominator yields NaN. The frame is modified in
    place and also returned.
    """
    family_totals = {
        "active_imp_total": ACTIVE_IMP_COLS,
        "passive_imp_total": PASSIVE_IMP_COLS,
        "active_click_total": ACTIVE_CLICK_COLS,
        "passive_click_total": PASSIVE_CLICK_COLS,
        "orders_amt_total": ORDER_COLS,
    }
    for total_name, members in family_totals.items():
        df[total_name] = df[members].sum(axis=1)

    # Channel-agnostic totals combine the active and passive counters.
    df["click_total"] = df["active_click_total"] + df["passive_click_total"]
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    # ratio column -> (numerator column, denominator column)
    ratios = {
        "active_ctr": ("active_click_total", "active_imp_total"),
        "passive_ctr": ("passive_click_total", "passive_imp_total"),
        "ctr_all": ("click_total", "imp_total"),
        "cr_click2order": ("orders_amt_total", "click_total"),
        "cr_imp2order": ("orders_amt_total", "imp_total"),
    }
    for ratio_name, (top, bottom) in ratios.items():
        df[ratio_name] = safe_divide(df[top], df[bottom])
    return df
|
||||
|
||||
|
||||
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
    """Add 0/1 activity flags and the number of categories with an order.

    The frame is modified in place and also returned.
    """
    active_events = df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1)
    passive_events = df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1)
    orders = df[ORDER_COLS]

    df["has_active_comm"] = active_events.gt(0).astype(int)
    df["has_passive_comm"] = passive_events.gt(0).astype(int)
    df["has_any_order"] = orders.sum(axis=1).gt(0).astype(int)
    # Count of order columns that are strictly positive in the row.
    df["order_categories_count"] = orders.gt(0).sum(axis=1)
    return df
|
||||
|
||||
|
||||
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
    """Read the CSV at ``path`` and return it with the derived EDA columns.

    ``business_dt`` is parsed to datetime, gender and platform codes are
    normalised, then age groups, totals/ratios and flags are added in that order.
    """
    frame = pd.read_csv(path)
    frame["business_dt"] = pd.to_datetime(frame["business_dt"])
    frame["gender_cd"] = normalize_gender(frame["gender_cd"])
    frame["device_platform_cd"] = normalize_device(frame["device_platform_cd"])
    for enrich in (add_age_group, add_totals, add_flags):
        frame = enrich(frame)
    return frame
|
||||
|
||||
|
||||
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
    """Build one summary row per column in ``cols``.

    Each row holds count, mean, median, std, min, quartiles, max, the share of
    values equal to zero and the 95th/99th percentiles.
    """

    def _profile(name: str) -> dict:
        values = df[name]
        return {
            "col": name,
            "count": values.count(),
            "mean": values.mean(),
            "median": values.median(),
            "std": values.std(),
            "min": values.min(),
            "q25": values.quantile(0.25),
            "q75": values.quantile(0.75),
            "max": values.max(),
            "share_zero": values.eq(0).mean(),
            "p95": values.quantile(0.95),
            "p99": values.quantile(0.99),
        }

    return pd.DataFrame([_profile(name) for name in cols])
|
||||
|
||||
|
||||
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
    """Sum every event counter per ``business_dt`` and derive daily metrics.

    The result has one row per date, with the totals/ratios from ``add_totals``
    and a ``day_of_week`` column holding the weekday name.
    """
    counters = [
        *ACTIVE_IMP_COLS,
        *PASSIVE_IMP_COLS,
        *ACTIVE_CLICK_COLS,
        *PASSIVE_CLICK_COLS,
        *ORDER_COLS,
    ]
    per_day = df.groupby("business_dt")[counters].sum()
    per_day = add_totals(per_day.reset_index())
    per_day["day_of_week"] = per_day["business_dt"].dt.day_name()
    return per_day
|
||||
|
||||
|
||||
def build_client(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate the input rows into one record per ``id``.

    Event counters are summed; ``age`` takes the median; gender, age group and
    platform take the first mode within the group (with a fallback when the
    group has no mode). Totals, ratios and flags are then recomputed on the
    aggregated counters, and ``contact_days`` (distinct ``business_dt`` values),
    ``max_impressions_per_day`` and the contact-density column are attached.

    Args:
        df: Row-level frame containing ``id``, ``business_dt``, the event
            counter columns and the demographic columns.

    Returns:
        A new frame with one row per ``id``.
    """

    def _first_mode(values: pd.Series, default):
        # mode() is computed once per group; an empty result means every value
        # in the group was missing.
        modes = values.mode()
        return default if modes.empty else modes.iat[0]

    impression_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS
    counter_cols = impression_cols + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS

    agg_spec: Dict[str, object] = {col: "sum" for col in counter_cols}
    agg_spec.update(
        {
            "age": "median",
            "gender_cd": lambda s: _first_mode(s, "UNKNOWN"),
            "age_group": lambda s: _first_mode(s, np.nan),
            "device_platform_cd": lambda s: _first_mode(s, "Other"),
        }
    )
    client = df.groupby("id").agg(agg_spec).reset_index()

    contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

    # Largest single-row impression total per client. The row sums are grouped
    # directly by the id column, so the input frame does not have to be copied.
    row_impressions = df[impression_cols].sum(axis=1)
    max_imp_day = row_impressions.groupby(df["id"]).max().rename("max_impressions_per_day")

    client = add_totals(client)
    client = add_flags(client)
    client = client.merge(contact_days, on="id", how="left")
    client = client.merge(max_imp_day, on="id", how="left")
    return add_contact_density(client)
|
||||
|
||||
|
||||
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``avg_impressions_per_contact_day`` = ``imp_total`` / ``contact_days``.

    When the frame has no ``contact_days`` column it is returned unchanged.
    Otherwise the frame is modified in place and returned.
    """
    if "contact_days" not in df.columns:
        return df
    df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
    return df
|
||||
368
old data/preanalysis_old_bad/task.md
Normal file
@@ -0,0 +1,368 @@
|
||||
# План полноценного преданализа датасета «Коммуникации в Городе»
|
||||
|
||||
Основано на описании датасета: есть ежедневные коммуникации с клиентами в экосистеме «Город Т-Банка», активные/пассивные каналы, показы/клики и заказы по категориям (ent, super, transport, shopping, hotel, avia), а также демография и устройство.
|
||||
|
||||
Обозначения:
|
||||
|
||||
- `*_imp_*` — показы (impressions) активных/пассивных каналов по категориям (`ent`, `super`, `transport`, `shopping`, `hotel`, `avia`).
|
||||
- `*_click_*` — клики/касания по тем же категориям.
|
||||
- `orders_amt_*` — число заказов по категориям.
|
||||
- `gender_cd`, `age`, `device_platform_cd` — демография и устройство.
|
||||
|
||||
---
|
||||
|
||||
## 0. Технический скелет проекта
|
||||
|
||||
Файлы/ноутбуки:
|
||||
|
||||
1. `01_load_and_clean.ipynb` — загрузка, чистка, базовые описания.
|
||||
2. `02_univariate_bivariate.ipynb` — распределения и связи признаков.
|
||||
3. `03_time_and_lags.ipynb` — время, лаги, сезонность.
|
||||
4. `04_clients_segmentation.ipynb` — агрегаты по клиенту, сегменты.
|
||||
5. `05_exploratory_models.ipynb` — простые модели как часть EDA.
|
||||
6. `eda_report.md` / `eda_report.ipynb` — итоговый отчёт.
|
||||
|
||||
---
|
||||
|
||||
## 1. Загрузка и структура данных
|
||||
|
||||
### Таблицы/выводы
|
||||
|
||||
1. `df.info()` — список столбцов, типы, количество непустых (non-null) значений.
|
||||
2. `df.head(5)` — первые строки для визуальной проверки.
|
||||
3. Размерность:
|
||||
- `n_rows`, `n_cols`
|
||||
- `n_unique_clients = df['id'].nunique()`
|
||||
- диапазон дат: `min(business_dt)`, `max(business_dt)`
|
||||
4. Проверка ключа:
|
||||
- таблица: `df.groupby(['id', 'business_dt']).size().value_counts().head()`
|
||||
(показывает, есть ли дубликаты по ключу)
|
||||
5. Среднее число дней на клиента:
|
||||
- `df.groupby('id').size().describe()`
|
||||
|
||||
### Графики
|
||||
|
||||
1. Количество записей по датам:
|
||||
- `bar/line`: X = `business_dt`, Y = `count(*)`
|
||||
- цель: увидеть провалы/пики выгрузки
|
||||
|
||||
---
|
||||
|
||||
## 2. Качество данных и аномалии
|
||||
|
||||
### Таблицы/метрики
|
||||
|
||||
1. Пропуски:
|
||||
- таблица: колонка → количество пропусков → доля пропусков
|
||||
2. Базовый `describe` по числовым:
|
||||
- `df[num_cols].describe().T`
|
||||
3. Доля нулей:
|
||||
- таблица: колонка → доля нулей → min/max → 95-й, 99-й перцентили
|
||||
4. Логические проверки:
|
||||
- все `*_imp_*`, `*_click_*`, `orders_amt_*` должны быть `>= 0`
|
||||
- поиск отрицательных/странных значений
|
||||
5. Возраст:
|
||||
- мин/макс, перцентили (1-й, 99-й), доля мусора (например, `<14` или `>100`)
|
||||
6. Категориальные:
|
||||
- уникальные значения `gender_cd`, `device_platform_cd`
|
||||
- приведение к единому формату (trim, upper, `unknown`)
|
||||
|
||||
### Графики
|
||||
|
||||
1. Boxplot возраста:
|
||||
- Y = `age`
|
||||
- цель: выбросы и мусор
|
||||
2. Barplot пропусков:
|
||||
- X = столбец, Y = доля NaN (только где NaN > 0)
|
||||
|
||||
---
|
||||
|
||||
## 3. Одномерный анализ (univariate)
|
||||
|
||||
### 3.1. Числовые признаки (показы/клики/заказы)
|
||||
|
||||
#### Таблицы
|
||||
|
||||
1. Для каждой группы (`active_imp_*`, `passive_imp_*`, `active_click_*`, `passive_click_*`, `orders_amt_*`):
|
||||
- `count, mean, median, std, min, q25, q75, max, share_zero, p95, p99`
|
||||
2. Агрегаты по всем категориям:
|
||||
- создать `active_imp_total`, `passive_imp_total`, `active_click_total`, `passive_click_total`, `orders_amt_total`
|
||||
- таблица `describe()` для них
|
||||
|
||||
#### Графики
|
||||
|
||||
1. Гистограммы (лог-масштаб или `log1p`) для каждой категории и типа:
|
||||
- `active_imp_ent`, `active_click_ent`, `passive_imp_ent`, `orders_amt_ent`, …
|
||||
2. Boxplot для агрегатов:
|
||||
- `active_imp_total`, `passive_imp_total`, `active_click_total`, `passive_click_total`, `orders_amt_total`
|
||||
|
||||
### 3.2. Категориальные признаки
|
||||
|
||||
#### Таблицы
|
||||
|
||||
1. Распределение `gender_cd`: counts, доли, `unknown`
|
||||
2. Распределение `device_platform_cd`: counts, доли
|
||||
3. Возрастные группы:
|
||||
- `<25`, `25–34`, `35–44`, `45–54`, `55+`
|
||||
- таблица: группа → число клиентов → доля
|
||||
|
||||
#### Графики
|
||||
|
||||
1. Barplot пола: X = `M/F/Unknown`, Y = доля
|
||||
2. Barplot платформ: X = platform, Y = доля
|
||||
3. Гистограмма возраста
|
||||
|
||||
---
|
||||
|
||||
## 4. Время и сезонность
|
||||
|
||||
Создать дневные агрегаты:
|
||||
|
||||
- сумма показов/кликов/заказов по дням
|
||||
- метрики:
|
||||
- `CTR_active = active_click_total / active_imp_total`
|
||||
- `CTR_passive = passive_click_total / passive_imp_total`
|
||||
- `CR_click2order = orders_amt_total / (active_click_total + passive_click_total)`
|
||||
- `CR_imp2order = orders_amt_total / (active_imp_total + passive_imp_total)`
|
||||
- день недели: `day_of_week`
|
||||
|
||||
### Таблицы
|
||||
|
||||
1. `daily.describe()` по дневным агрегатам
|
||||
2. Таблица по дням недели:
|
||||
- `day_of_week` → среднее `impressions, clicks, orders, CTR, CR`
|
||||
|
||||
### Графики
|
||||
|
||||
1. Линейные временные ряды:
|
||||
- `business_dt` vs total impressions
|
||||
- `business_dt` vs total clicks
|
||||
- `business_dt` vs total orders
|
||||
2. Линии CTR/CR во времени (rolling mean 7 дней по желанию):
|
||||
- `active_ctr`, `passive_ctr`, `cr_click2order`
|
||||
3. Сезонность по дням недели:
|
||||
- barplot для `active_ctr`, `passive_ctr`, `cr_click2order`
|
||||
4. (Опционально) календарная heatmap заказов/CTR
|
||||
|
||||
---
|
||||
|
||||
## 5. Парные связи (bivariate)
|
||||
|
||||
### Таблицы
|
||||
|
||||
1. Корреляции Спирмена (на уровне клиента/дня):
|
||||
- между всеми числовыми признаками + `age`
|
||||
2. Для каждой категории:
|
||||
- биннинг показов по квантилям → средний `imp, click, CTR, orders, CR`
|
||||
|
||||
### Графики
|
||||
|
||||
1. Scatter/hexbin «показы → клики»:
|
||||
- `active_imp_*` vs `active_click_*`
|
||||
- `passive_imp_*` vs `passive_click_*`
|
||||
2. Scatter «клики → заказы»:
|
||||
- `*_click_*` vs `orders_amt_*`
|
||||
3. CTR по бинам показов (линия/бар)
|
||||
4. CR по бинам кликов (линия/бар)
|
||||
5. Heatmap корреляций
|
||||
|
||||
---
|
||||
|
||||
## 6. Демография и устройство vs эффективность
|
||||
|
||||
Агрегировать по клиенту:
|
||||
|
||||
- суммы показов/кликов/заказов
|
||||
- CTR/CR на уровне клиента
|
||||
- добавить `gender_cd`, `age_group`, `device_platform_cd`
|
||||
|
||||
### Таблицы
|
||||
|
||||
1. По полу:
|
||||
- средние `impressions, clicks, orders, CTR, CR`
|
||||
2. По возрастным группам:
|
||||
- те же метрики
|
||||
3. По платформам:
|
||||
- те же метрики
|
||||
4. Тесты гипотез (Mann–Whitney / t-test):
|
||||
- разница CTR/CR между группами
|
||||
|
||||
### Графики
|
||||
|
||||
1. Barplot CTR/CR по полу
|
||||
2. Barplot CTR/CR по возрастным группам
|
||||
3. Barplot CTR/CR по платформам
|
||||
4. Boxplot заказов по возрастным группам
|
||||
5. Stacked bar: возраст → доли категорий заказов (наполнение корзины сервисами)
|
||||
|
||||
---
|
||||
|
||||
## 7. Поведение по клиенту и сегментация
|
||||
|
||||
### 7.1. Простые сегменты
|
||||
|
||||
Флаги на уровне клиента:
|
||||
|
||||
- `has_active_comm`, `has_passive_comm`
|
||||
- `has_any_order`
|
||||
- `order_categories_count` (в скольких категориях есть заказ)
|
||||
|
||||
#### Таблицы
|
||||
|
||||
1. Сегменты каналов:
|
||||
- `only_active`, `only_passive`, `both`
|
||||
- доля клиентов, средние заказы, CTR/CR
|
||||
2. Сегменты мультикатегорийности:
|
||||
- `1`, `2`, `3+` категорий заказов
|
||||
- средние коммуникации/заказы, демография
|
||||
|
||||
#### Графики
|
||||
|
||||
1. Barplot по сегментам каналов:
|
||||
- средние заказы, CTR/CR
|
||||
2. Barplot по числу категорий заказов
|
||||
3. Stacked bar: сегменты → пол/возраст (по желанию)
|
||||
|
||||
### 7.2. Кластеризация (расширенный EDA)
|
||||
|
||||
1. Вектор фичей:
|
||||
- суммы по категориям + CTR/CR + доли заказов
|
||||
2. Нормализация
|
||||
3. KMeans / GMM, 3–7 кластеров
|
||||
|
||||
#### Таблицы
|
||||
|
||||
- кластер → размер → средние фичи → краткая интерпретация
|
||||
|
||||
#### Графики
|
||||
|
||||
1. Профили кластеров (bar/radar)
|
||||
2. Scatter PCA/UMAP: цвет = кластер
|
||||
|
||||
---
|
||||
|
||||
## 8. Воронка: показы → клики → заказы
|
||||
|
||||
### Таблицы
|
||||
|
||||
1. Общая воронка:
|
||||
- `channel_type`, `category`, `impressions`, `clicks`, `orders`, `CTR`, `CR_click2order`, `CR_imp2order`
|
||||
2. Воронка по сегментам:
|
||||
- пол/возраст/платформа → те же метрики
|
||||
|
||||
### Графики
|
||||
|
||||
1. Funnel chart active vs passive (общий)
|
||||
2. Barplot CTR по категориям + сравнение active/passive
|
||||
3. Barplot CR по категориям + сравнение active/passive
|
||||
4. Funnel/Bar по возрастным группам
|
||||
|
||||
---
|
||||
|
||||
## 9. Временные лаги между коммуникациями и заказами
|
||||
|
||||
С учётом «поздних покупок» (особенно travel).
|
||||
|
||||
### 9.1. Лаги на дневном уровне
|
||||
|
||||
#### Таблицы
|
||||
|
||||
1. Лаговые признаки `lag1..lag7` для показов/кликов
|
||||
2. Кросс-корреляция:
|
||||
- lag → `corr(orders_t, impressions_{t-lag})`
|
||||
|
||||
#### Графики
|
||||
|
||||
1. Линия «lag vs корреляция» по:
|
||||
- `hotel`, `avia` (и др. при желании)
|
||||
- active vs passive
|
||||
|
||||
### 9.2. Лаги на клиентском уровне
|
||||
|
||||
#### Таблицы
|
||||
|
||||
1. `first_imp_date`, `first_click_date`, `first_order_date`
|
||||
2. `days_to_order`
|
||||
3. Квантили `days_to_order` по категориям
|
||||
|
||||
#### Графики
|
||||
|
||||
1. Гистограмма/ECDF `days_to_order` по категориям
|
||||
|
||||
---
|
||||
|
||||
## 10. Мультиканальность и «заспамленность»
|
||||
|
||||
### Таблицы
|
||||
|
||||
1. `contact_days`, `avg_impressions_per_contact_day`, `max_impressions_per_day`
|
||||
2. Бины по `avg_impressions_per_contact_day` → средний CTR/CR
|
||||
|
||||
### Графики
|
||||
|
||||
1. Гистограмма `avg_impressions_per_contact_day`
|
||||
2. Линия/бар: CTR/CR vs уровень спама
|
||||
|
||||
---
|
||||
|
||||
## 11. Простые модели как часть EDA
|
||||
|
||||
### 11.1. Бинарная модель «есть заказ / нет заказа»
|
||||
|
||||
Target:
|
||||
|
||||
- `has_any_order`
|
||||
|
||||
Features:
|
||||
|
||||
- суммы показов/кликов по типам и категориям
|
||||
- CTR/CR
|
||||
- демография и платформа
|
||||
|
||||
#### Таблицы
|
||||
|
||||
1. Логистическая регрессия:
|
||||
- коэффициенты, p-value, odds ratio
|
||||
2. Feature importance из дерева/лесов
|
||||
|
||||
#### Графики
|
||||
|
||||
1. Barplot важностей
|
||||
2. (Опционально) partial dependence для 2–3 ключевых фичей
|
||||
|
||||
---
|
||||
|
||||
## 12. Гипотезы и статтесты
|
||||
|
||||
### Примеры гипотез
|
||||
|
||||
1. `CTR_active > CTR_passive`
|
||||
2. CR различается между категориями сервисов
|
||||
3. CTR/CR различаются по полу/возрасту/платформе
|
||||
4. «заспамленность» снижает CTR/CR после порога
|
||||
|
||||
### Таблица гипотез
|
||||
|
||||
- гипотеза, H0/H1, тест, p-value, вывод, бизнес-интерпретация
|
||||
|
||||
Графики для поддержки — использовать из разделов 4–10 (барплоты/боксплоты).
|
||||
|
||||
---
|
||||
|
||||
## 13. Итоговая документация
|
||||
|
||||
1. Резюме выводов:
|
||||
- качество данных
|
||||
- эффективность каналов/категорий
|
||||
- сегменты, где коммуникации лучше/хуже работают
|
||||
- лаги (как быстро покупают после контактов)
|
||||
- признаки «усталости» от коммуникаций
|
||||
2. Список проблем данных и принятых решений по чистке
|
||||
3. Список инсайтов для бизнеса
|
||||
4. Список фичей для будущих моделей
|
||||
5. Следующие шаги:
|
||||
- подготовка ML-пайплайна
|
||||
- список A/B-гипотез
|
||||
- какие данные добрать (если нужно)
|
||||
|
||||
---
|
||||
152
old data/quadreg.py
Normal file
@@ -0,0 +1,152 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
import statsmodels.api as sm
|
||||
from sklearn.metrics import r2_score, roc_auc_score
|
||||
|
||||
import best_model_and_plots as bmp
|
||||
|
||||
# Settings shared with the scatter-plot script
X_COL = bmp.X_COL
Y_COL = "orders_amt_total"
X_MAX, Y_MIN, Y_MAX = bmp.DEFAULT_X_MAX, bmp.DEFAULT_Y_MIN, bmp.DEFAULT_Y_MAX
|
||||
|
||||
|
||||
def fit_quadratic(
    cleaned: bmp.pd.DataFrame,
    trend_data: Optional[Tuple[np.ndarray, np.ndarray]],
    *,
    x_col: str = X_COL,
    y_col: str = Y_COL,
    x_max: float = X_MAX,
) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]:
    """Fit the OLS model ``y ~ 1 + x + x^2`` and score it.

    Args:
        cleaned: Frame holding at least ``x_col`` and ``y_col``; rows with a
            missing value in either column are dropped before fitting.
        trend_data: Optional ``(x, y)`` pair describing a trend line. When both
            arrays are present, the fitted curve is compared against the trend
            (``r2_trend``).
        x_col: Name of the regressor column.
        y_col: Name of the response column.
        x_max: Trend points whose ``x`` exceeds this bound are ignored.

    Returns:
        ``(model, metrics)``. ``model`` is the fitted results object (HC3
        covariance). ``metrics`` has the keys ``"auc"`` (ROC-AUC of the fitted
        values against the indicator ``y > 0``) and ``"r2_trend"``; each is NaN
        when it cannot be computed. With fewer than three complete rows the
        result is ``(None, {})``.
    """

    def _design(values: np.ndarray) -> np.ndarray:
        # has_constant="add" forces the intercept column. The default ("skip")
        # leaves it out whenever a regressor column is itself constant, which
        # would silently change the number of fitted parameters.
        return sm.add_constant(np.column_stack([values, values**2]), has_constant="add")

    df = cleaned[[x_col, y_col]].dropna()
    if len(df) < 3:
        return None, {}

    tx = ty = None
    if trend_data is not None and trend_data[0] is not None and trend_data[1] is not None:
        tx = np.asarray(trend_data[0], dtype=float)
        ty = np.asarray(trend_data[1], dtype=float)
        keep = (tx <= x_max) & ~np.isnan(ty)
        tx, ty = tx[keep], ty[keep]

    # Missing values were dropped above, so a float conversion is safe even if
    # the columns arrive with object dtype.
    x = df[x_col].to_numpy(dtype=float)
    y = df[y_col].to_numpy(dtype=float)

    X_design = _design(x)
    model = sm.OLS(y, X_design).fit(cov_type="HC3")

    # AUC needs both classes present in the binarised response.
    auc = np.nan
    binary = (y > 0).astype(int)
    if len(np.unique(binary)) > 1:
        auc = roc_auc_score(binary, model.predict(X_design))

    # R^2 of the fitted curve against the trend; skipped for a flat trend.
    r2_trend = np.nan
    if tx is not None and len(tx) >= 3 and np.nanvar(ty) > 0:
        r2_trend = r2_score(ty, model.predict(_design(tx)))

    return model, {"auc": auc, "r2_trend": r2_trend}
|
||||
|
||||
|
||||
def plot_overall_quad(
    x_max: float = X_MAX,
    y_min: float = Y_MIN,
    y_max: float = Y_MAX,
    savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
) -> None:
    """Draw the trend scatter for ``Y_COL`` and overlay a quadratic regression.

    The scatter comes from ``bmp.plot_clean_trend_scatter``. When the regression
    can be fitted, its curve and a metrics box are added and the figure is saved
    as ``scatter_trend_quad.png``; otherwise the plain scatter is saved as
    ``scatter_trend.png``. Nothing is saved when the scatter call returns None.
    """
    out_dir = bmp.BASE_OUT_DIR / Y_COL

    scatter_options = dict(
        y_col=Y_COL,
        out_dir=out_dir,
        x_col=X_COL,
        x_max=x_max,
        scatter_color=bmp.DEFAULT_SCATTER_COLOR,
        point_size=bmp.DEFAULT_POINT_SIZE,
        alpha=bmp.DEFAULT_TREND_ALPHA,
        iqr_k=bmp.DEFAULT_IQR_K,
        q_low=bmp.DEFAULT_Q_LOW,
        q_high=bmp.DEFAULT_Q_HIGH,
        alpha_min=bmp.DEFAULT_ALPHA_MIN,
        alpha_max=bmp.DEFAULT_ALPHA_MAX,
        bins_x=bmp.DEFAULT_BINS_X,
        bins_y=bmp.DEFAULT_BINS_Y,
        y_min=y_min,
        y_max=y_max,
        trend_frac=bmp.DEFAULT_TREND_FRAC,
        trend_color=bmp.DEFAULT_TREND_COLOR,
        trend_linewidth=bmp.DEFAULT_TREND_LINEWIDTH,
        trend_method=bmp.DEFAULT_TREND_METHOD,
        savgol_window=savgol_window,
        return_components=True,
    )
    res = bmp.plot_clean_trend_scatter(bmp.load_client_level(bmp.DB_PATH), **scatter_options)
    if res is None:
        print("Нет данных для построения графика")
        return

    fig, ax, cleaned, trend_data = res
    model, metrics = fit_quadratic(cleaned, trend_data, x_col=X_COL, y_col=Y_COL, x_max=x_max)
    if model is None:
        print("Недостаточно точек для квадратичной регрессии")
        fig.savefig(out_dir / "scatter_trend.png", dpi=150)
        bmp.plt.close(fig)
        return

    # Quadratic curve drawn over the existing trend line
    x_grid = np.linspace(0, x_max, 400)
    grid_design = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
    ax.plot(
        x_grid,
        model.predict(grid_design),
        color="blue",
        linewidth=2.2,
        linestyle="--",
        label="Квадр. регрессия",
    )
    ax.legend()

    # Metrics box: fit quality first, then the linear and quadratic coefficients.
    params, pvals = model.params, model.pvalues
    summary_lines = [f"R2_trend={metrics['r2_trend']:.3f}", f"AUC={metrics['auc']:.3f}"]
    for k in (1, 2):
        summary_lines.append(f"b{k}={params[k]:.3f} (p={pvals[k]:.3g})")
    summary_lines.append(f"n={len(cleaned)}")

    box_style = dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.65, edgecolor="gray")
    ax.text(
        0.02,
        0.95,
        "\n".join(summary_lines),
        transform=ax.transAxes,
        ha="left",
        va="top",
        fontsize=9,
        bbox=box_style,
    )

    quad_path = out_dir / "scatter_trend_quad.png"
    fig.tight_layout()
    fig.savefig(quad_path, dpi=150)
    bmp.plt.close(fig)
    print(f"Saved {quad_path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Script entry point: build and save the quadratic-regression plot with default settings."""
    plot_overall_quad()


# Run the plot only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
87
old data/stat_analysis.py
Normal file
@@ -0,0 +1,87 @@
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
import sys
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
from scipy import stats
|
||||
|
||||
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)

# Make eda_utils importable: its directory is appended to sys.path first.
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda  # noqa: E402

db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
finally:
    # Release the database handle even when the query fails.
    conn.close()

# Row-level totals across categories for each channel/event family.
for cols, name in [
    (eda.ACTIVE_IMP_COLS, "active_imp_total"),
    (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
    (eda.ACTIVE_CLICK_COLS, "active_click_total"),
    (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
    (eda.ORDER_COLS, "orders_amt_total"),
]:
    df[name] = df[cols].sum(axis=1)

df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]

# Client-level aggregates. The mode lambdas fall back to a placeholder when a
# client has no non-missing value, instead of raising IndexError on .iat[0].
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN"),
        device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0] if not s.mode().empty else "Other"),
    )
    .merge(contact_days, on="id", how="left")
    .reset_index()
)

client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)
client["has_order"] = (client["orders_amt_total"] > 0).astype(int)

# Summary
summary = client[["imp_total", "click_total", "orders_amt_total", "contact_days", "avg_imp_per_day", "ctr_all", "cr_click2order"]].describe().T
print("Summary\n", summary)
missing = client.isna().mean().sort_values(ascending=False)
print("Missing\n", missing.head(10))

# Correlations and Mann-Whitney
# safe_divide leaves NaN where the denominator is zero, and spearmanr propagates
# NaN by default, so each correlation uses only the rows where both values exist.
ctr_pairs = client[["avg_imp_per_day", "ctr_all"]].dropna()
cr_pairs = client[["avg_imp_per_day", "cr_click2order"]].dropna()
corr_ctr = stats.spearmanr(ctr_pairs["avg_imp_per_day"], ctr_pairs["ctr_all"])
corr_cr = stats.spearmanr(cr_pairs["avg_imp_per_day"], cr_pairs["cr_click2order"])
q1 = client["avg_imp_per_day"].quantile(0.25)
q4 = client["avg_imp_per_day"].quantile(0.75)
low = client.loc[client["avg_imp_per_day"] <= q1, "ctr_all"].dropna()
high = client.loc[client["avg_imp_per_day"] >= q4, "ctr_all"].dropna()
wu = stats.mannwhitneyu(low, high, alternative="greater")
print({"spearman_ctr": corr_ctr, "spearman_cr": corr_cr, "mw_low_gt_high": wu})

# Bin stats and dual-axis plot
bins = pd.qcut(client["avg_imp_per_day"], 10, duplicates="drop")
# reset_index() exposes the bin intervals under the name "avg_imp_per_day";
# the next line replaces them with the per-bin median of that variable.
stats_bin = client.groupby(bins, observed=False)[["ctr_all", "cr_click2order"]].median().reset_index()
stats_bin["avg_imp_per_day"] = client.groupby(bins, observed=False)["avg_imp_per_day"].median().values
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)

fig, ax1 = plt.subplots(figsize=(12, 5))
ax2 = ax1.twinx()
ctr_line = ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
cr_line = ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
ax1.set_ylabel("CTR")
ax2.set_ylabel("CR click→order")
ax1.set_xlabel("avg_imp_per_day bins")
# Rotate the tick labels on ax1: after twinx() the current axes is ax2, whose
# x-axis is hidden, so plt.xticks() would not affect the visible labels.
ax1.tick_params(axis="x", labelrotation=35)
# One legend for both y-axes so the line labels are actually displayed.
legend_handles = ctr_line + cr_line
ax1.legend(legend_handles, [handle.get_label() for handle in legend_handles], loc="best")
ax1.set_title("CTR и CR по децилям avg_imp_per_day")
fig.tight_layout()
plot_dir = project_root / "main_hypot"
plot_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(plot_dir / "stat_bins.png", dpi=150)
print("Saved plot stat_bins.png")
|
||||