2025-12-16 01:51:05 +03:00
parent a1bc89c481
commit c963b1e5ac
123 changed files with 5644 additions and 3802 deletions

View File

@@ -0,0 +1,582 @@
import sqlite3
from pathlib import Path
import sys
from typing import Optional, Tuple
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
import pandas as pd
import seaborn as sns
from statsmodels.nonparametric.smoothers_lowess import lowess
import numpy as np
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 8)
project_root = Path(__file__).resolve().parent.parent
DB_PATH = project_root / "dataset" / "ds.sqlite"
BASE_OUT_DIR = project_root / "main_hypot"
# Data constants
CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
# Visualization/cleaning constants
X_COL = "avg_imp_per_day"  # x is always fixed to this column
DEFAULT_X_MAX = 18
DEFAULT_SCATTER_COLOR = "#2c7bb6"
DEFAULT_POINT_SIZE = 20
DEFAULT_ALPHA = 0.08
DEFAULT_TREND_ALPHA = 0.1
DEFAULT_TREND_FRAC = 0.3
DEFAULT_TREND_COLOR = "red"
DEFAULT_TREND_LINEWIDTH = 2.5
DEFAULT_IQR_K = 1.5
DEFAULT_Q_LOW = 0.05
DEFAULT_Q_HIGH = 0.95
DEFAULT_ALPHA_MIN = 0.04
DEFAULT_ALPHA_MAX = 0.7
DEFAULT_BINS_X = 60
DEFAULT_BINS_Y = 60
DEFAULT_Y_MIN = -0.5
DEFAULT_Y_MAX = 10
DEFAULT_TREND_METHOD = "savgol" # options: lowess, rolling, savgol
DEFAULT_ROLLING_WINDOW = 200
DEFAULT_SAVGOL_WINDOW = 501
DEFAULT_SAVGOL_POLY = 2
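# Note: Savitzky-Golay smoothing needs an odd window length and polyorder < window;
# compute_trend() below adjusts these defaults when necessary.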
def safe_divide(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
denom = denominator.replace(0, pd.NA)
return numerator / denom
def load_client_level(db_path: Path) -> pd.DataFrame:
"""Собирает агрегаты по клиентам без зависимостей от eda_utils."""
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
df["imp_total"] = df[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
contact_days=("business_dt", "nunique"),
)
.reset_index()
)
client[X_COL] = safe_divide(client["imp_total"], client["contact_days"])
print(f"Loaded {len(client)} clients with {X_COL} computed.")
return client
def _bounds(series: pd.Series, q_low: float, q_high: float, iqr_k: float) -> Tuple[float, float]:
q1, q3 = series.quantile([q_low, q_high])  # "q1"/"q3" here are the configured quantiles (5th/95th percentile by default), not strict quartiles
iqr = q3 - q1
return q1 - iqr_k * iqr, q3 + iqr_k * iqr
def remove_outliers(
df: pd.DataFrame,
y_col: str,
x_col: str = X_COL,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
) -> pd.DataFrame:
"""Убирает выбросы по IQR отдельно по x и y."""
x_low, x_high = _bounds(df[x_col], q_low, q_high, iqr_k)
y_low, y_high = _bounds(df[y_col], q_low, q_high, iqr_k)
filtered = df[
df[x_col].between(max(0, x_low), x_high)
& df[y_col].between(max(0, y_low), y_high)
].copy()
print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}, q=({q_low},{q_high})).")
return filtered
def compute_density_alpha(
df: pd.DataFrame,
x_col: str,
y_col: str,
x_max: float,
*,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
y_min: float = DEFAULT_Y_MIN,
y_max_limit: float = DEFAULT_Y_MAX,
) -> np.ndarray:
"""Считает насыщенность цвета как квадратичный скейл по плотности в 2D бинах."""
x_vals = df[x_col].to_numpy()
y_vals = df[y_col].to_numpy()
if len(x_vals) == 0:
return np.array([])
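# Density-based alpha: histogram the points on a fixed 2D grid, then map each point's
# bin count to an opacity between alpha_min and alpha_max.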
x_edges = np.linspace(min(x_vals.min(), 0), x_max, bins_x + 1)
y_upper = max(min(y_vals.max(), y_max_limit), 1e-9)
y_edges = np.linspace(y_min, y_upper, bins_y + 1)
x_bins = np.digitize(x_vals, x_edges) - 1
y_bins = np.digitize(y_vals, y_edges) - 1
valid = (
(x_bins >= 0) & (x_bins < bins_x) &
(y_bins >= 0) & (y_bins < bins_y)
)
counts = np.zeros((bins_x, bins_y), dtype=int)
for xb, yb in zip(x_bins[valid], y_bins[valid]):
counts[xb, yb] += 1
bin_counts = counts[
np.clip(x_bins, 0, bins_x - 1),
np.clip(y_bins, 0, bins_y - 1),
]
max_count = bin_counts.max() if len(bin_counts) else 1
if max_count == 0:
weight = np.zeros_like(bin_counts, dtype=float)
else:
weight = (bin_counts / max_count) ** np.sqrt(1.5)
weight = np.clip(weight, 0, 1)
return alpha_min + (alpha_max - alpha_min) * weight
def compute_trend(
df: pd.DataFrame,
y_col: str,
*,
x_col: str = X_COL,
method: str = DEFAULT_TREND_METHOD,
lowess_frac: float = DEFAULT_TREND_FRAC,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> Tuple[np.ndarray, np.ndarray]:
"""Возвращает (x_sorted, trend_y) по выбранному методу."""
d = df[[x_col, y_col]].dropna().sort_values(x_col)
x_vals = d[x_col].to_numpy()
y_vals = d[y_col].to_numpy()
if len(x_vals) == 0:
return np.array([]), np.array([])
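# Dispatch on the requested smoothing method; unrecognized values fall back to LOWESS below.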
m = method.lower()
if m == "lowess":
trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
return trend[:, 0], trend[:, 1]
if m == "rolling":
w = max(3, rolling_window)
if w % 2 == 0:
w += 1
y_trend = pd.Series(y_vals).rolling(window=w, center=True, min_periods=1).mean().to_numpy()
return x_vals, y_trend
if m == "savgol":
w = max(5, savgol_window)
if w % 2 == 0:
w += 1
poly = min(savgol_poly, w - 1)
y_trend = savgol_filter(y_vals, window_length=w, polyorder=poly, mode="interp")
return x_vals, y_trend
# fallback to lowess
trend = lowess(y_vals, x_vals, frac=lowess_frac, return_sorted=True)
return trend[:, 0], trend[:, 1]
def filter_x_range(df: pd.DataFrame, x_col: str, x_max: float) -> pd.DataFrame:
subset = df[df[x_col] <= x_max].copy()
print(f"{len(df)} points; {len(subset)} within x<={x_max}.")
return subset
def plot_density_scatter(
df: pd.DataFrame,
y_col: str,
title: str,
out_path: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
with_trend: bool = False,
trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
return_fig: bool = False,
) -> Optional[tuple]:  # returns (fig, ax, trend_data) when return_fig=True, otherwise None
fig, ax = plt.subplots(figsize=(8, 8))
alpha_values = compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bins_x,
bins_y=bins_y,
alpha_min=alpha_min,
alpha_max=alpha_max,
y_min=y_min,
y_max_limit=y_max,
)
ax.scatter(
df[x_col],
df[y_col],
color=scatter_color,
s=point_size,
alpha=alpha_values if len(alpha_values) else alpha,
linewidths=0,
)
trend_data = None
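# Optionally overlay a smoothed trend line computed on the already-filtered data.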
if with_trend:
tx, ty = compute_trend(
df,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
if len(tx):
ax.plot(tx, ty, color=trend_color, linewidth=trend_linewidth, label=f"{trend_method} тренд")
ax.legend()
trend_data = (tx, ty)
ax.set_xlim(0, x_max)
ax.set_ylim(y_min, y_max)
ax.set_yticks(range(0, int(y_max) + 1, 2))
ax.set_xlabel("Среднее число показов в день")
ax.set_ylabel(y_col)
ax.set_title(title)
ax.grid(alpha=0.3)
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.tight_layout()
fig.savefig(out_path, dpi=150)
if return_fig:
return fig, ax, trend_data
plt.close(fig)
print(f"Saved {out_path}")
def plot_raw_scatter(
df: pd.DataFrame,
y_col: str,
out_dir: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
plot_density_scatter(
in_range,
y_col=y_col,
title=f"Облако: {y_col} vs {x_col} (все клиенты)",
out_path=out_dir / "scatter.png",
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
def plot_clean_scatter(
df: pd.DataFrame,
y_col: str,
out_dir: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
cleaned = remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
plot_density_scatter(
cleaned,
y_col=y_col,
title=f"Облако без выбросов (IQR) {y_col} vs {x_col}",
out_path=out_dir / "scatter_clean.png",
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
def plot_clean_trend_scatter(
df: pd.DataFrame,
y_col: str,
out_dir: Path,
*,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_TREND_ALPHA,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
return_components: bool = False,
) -> Optional[tuple]:  # returns (fig, ax, cleaned, trend_data) when return_components=True, otherwise None
in_range = filter_x_range(df[[x_col, y_col]].dropna(), x_col, x_max)
cleaned = remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
fig_ax = plot_density_scatter(
cleaned,
y_col=y_col,
title=f"Облако без выбросов + тренд {y_col} vs {x_col}",
out_path=out_dir / "scatter_trend.png",
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
with_trend=True,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
return_fig=return_components,
)
if return_components:
fig, ax, trend_data = fig_ax
return fig, ax, cleaned, trend_data
def generate_scatter_set(
df: pd.DataFrame,
y_col: str,
*,
base_out_dir: Path = BASE_OUT_DIR,
x_col: str = X_COL,
x_max: float = DEFAULT_X_MAX,
scatter_color: str = DEFAULT_SCATTER_COLOR,
point_size: int = DEFAULT_POINT_SIZE,
alpha: float = DEFAULT_ALPHA,
trend_alpha: float = DEFAULT_TREND_ALPHA,
trend_frac: float = DEFAULT_TREND_FRAC,
trend_color: str = DEFAULT_TREND_COLOR,
trend_linewidth: float = DEFAULT_TREND_LINEWIDTH,
iqr_k: float = DEFAULT_IQR_K,
q_low: float = DEFAULT_Q_LOW,
q_high: float = DEFAULT_Q_HIGH,
alpha_min: float = DEFAULT_ALPHA_MIN,
alpha_max: float = DEFAULT_ALPHA_MAX,
bins_x: int = DEFAULT_BINS_X,
bins_y: int = DEFAULT_BINS_Y,
y_min: float = DEFAULT_Y_MIN,
y_max: float = DEFAULT_Y_MAX,
trend_method: str = DEFAULT_TREND_METHOD,
rolling_window: int = DEFAULT_ROLLING_WINDOW,
savgol_window: int = DEFAULT_SAVGOL_WINDOW,
savgol_poly: int = DEFAULT_SAVGOL_POLY,
) -> None:
"""Генерирует три облака (все, без выбросов, без выбросов + тренд) в папку y_col."""
out_dir = base_out_dir / str(y_col).replace("/", "_")
plot_raw_scatter(
df,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
plot_clean_scatter(
df,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=alpha,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
plot_clean_trend_scatter(
df,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=scatter_color,
point_size=point_size,
alpha=trend_alpha,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
trend_frac=trend_frac,
trend_color=trend_color,
trend_linewidth=trend_linewidth,
alpha_min=alpha_min,
alpha_max=alpha_max,
bins_x=bins_x,
bins_y=bins_y,
y_min=y_min,
y_max=y_max,
trend_method=trend_method,
rolling_window=rolling_window,
savgol_window=savgol_window,
savgol_poly=savgol_poly,
)
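# Usage sketch (hypothetical; not called anywhere in this script): the same helper can be
# pointed at another client-level metric, e.g.
# generate_scatter_set(client, y_col="imp_total", y_max=40)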
def main() -> None:
client = load_client_level(DB_PATH)
zero_orders = (client["orders_amt_total"] == 0).sum()
non_zero = len(client) - zero_orders
if len(client):
print(f"orders=0: {zero_orders} ({zero_orders / len(client):.2%}); orders>0: {non_zero} ({non_zero / len(client):.2%})")
generate_scatter_set(client, y_col="orders_amt_total")
if __name__ == "__main__":
main()

Binary files not shown — 14 PNG plots added (43–143 KiB each).

View File

@@ -0,0 +1,353 @@
import sqlite3
from pathlib import Path
import sys
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
# Allow importing helper functions from the sibling script
script_dir = Path(__file__).resolve().parent
if str(script_dir) not in sys.path:
sys.path.append(str(script_dir))
from best_model_and_plots import ( # noqa: E402
CATEGORIES,
DEFAULT_ALPHA,
DEFAULT_ALPHA_MAX,
DEFAULT_ALPHA_MIN,
DEFAULT_BINS_X,
DEFAULT_BINS_Y,
DEFAULT_SCATTER_COLOR,
DEFAULT_TREND_COLOR,
DEFAULT_TREND_FRAC,
DEFAULT_TREND_LINEWIDTH,
DEFAULT_X_MAX,
DEFAULT_Y_MAX,
DEFAULT_Y_MIN,
DEFAULT_SAVGOL_WINDOW,
plot_clean_trend_scatter,
safe_divide,
)
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 8)
project_root = Path(__file__).resolve().parent.parent
DB_PATH = project_root / "dataset" / "ds.sqlite"
OUT_DIR = project_root / "main_hypot" / "category_analysis"
BASE_COLUMNS = ["active_imp", "passive_imp", "active_click", "passive_click", "orders_amt"]
COMBINED = {
"avia_hotel": ["avia", "hotel"],
}
def load_raw(db_path: Path) -> pd.DataFrame:
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
return df
def build_client_by_category(df: pd.DataFrame) -> pd.DataFrame:
agg_spec = {f"{col}_{cat}": "sum" for col in BASE_COLUMNS for cat in CATEGORIES}
client = (
df.groupby("id")
.agg({**agg_spec, "business_dt": "nunique"})
.reset_index()
)
client = client.rename(columns={"business_dt": "contact_days"})
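# Per-category impression totals and average impressions per contact day.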
for cat in CATEGORIES:
imp_total_col = f"imp_total_{cat}"
client[imp_total_col] = client[f"active_imp_{cat}"] + client[f"passive_imp_{cat}"]
client[f"avg_imp_per_day_{cat}"] = safe_divide(client[imp_total_col], client["contact_days"])
return client
def add_combined_category(client: pd.DataFrame, name: str, cats: list[str]) -> pd.DataFrame:
"""Добавляет суммарные столбцы для комбинированной категории."""
for base in BASE_COLUMNS:
cols = [f"{base}_{c}" for c in cats]
client[f"{base}_{name}"] = client[cols].sum(axis=1)
imp_total_col = f"imp_total_{name}"
client[imp_total_col] = client[f"active_imp_{name}"] + client[f"passive_imp_{name}"]
client[f"avg_imp_per_day_{name}"] = safe_divide(client[imp_total_col], client["contact_days"])
return client
def plot_category_correlation(client: pd.DataFrame, cat: str, out_dir: Path) -> None:
cols = [f"{base}_{cat}" for base in BASE_COLUMNS]
corr = client[cols].corr()
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
corr,
annot=True,
fmt=".2f",
cmap="coolwarm",
vmin=-1,
vmax=1,
linewidths=0.5,
ax=ax,
)
ax.set_title(f"Корреляции показов/кликов/заказов: {cat}")
plt.tight_layout()
out_dir.mkdir(parents=True, exist_ok=True)
path = out_dir / f"corr_{cat}.png"
fig.savefig(path, dpi=150)
plt.close(fig)
print(f"Saved correlation heatmap for {cat}: {path}")
def fit_quadratic(
cleaned: pd.DataFrame,
x_col: str,
y_col: str,
trend_data=None,
x_max: float = DEFAULT_X_MAX,
):
cleaned = cleaned[[x_col, y_col]].dropna()
y_true_all = cleaned[y_col].to_numpy()
x_all = cleaned[x_col].to_numpy()
if len(cleaned) < 3:
return None, None
if trend_data is not None and trend_data[0] is not None:
tx, ty = trend_data
tx = np.asarray(tx)
ty = np.asarray(ty)
mask = (tx <= x_max) & ~np.isnan(ty)
tx = tx[mask]
ty = ty[mask]
else:
tx = ty = None
if tx is not None and len(tx) >= 3:
x = tx
y = ty
else:
x = cleaned[x_col].to_numpy()
y = cleaned[y_col].to_numpy()
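# Quadratic OLS (y ~ b0 + b1*x + b2*x^2) with HC3 robust standard errors, fitted to the
# smoothed trend when available, otherwise to the cleaned points.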
quad_term = x**2
X = np.column_stack([x, quad_term])
X = sm.add_constant(X)
model = sm.OLS(y, X).fit(cov_type="HC3")
preds = model.predict(X)
auc = float("nan")
binary = (y_true_all > 0).astype(int)
if len(np.unique(binary)) > 1:
quad_all = x_all**2
X_all = sm.add_constant(np.column_stack([x_all, quad_all]))
preds_all = model.predict(X_all)
auc = roc_auc_score(binary, preds_all)
r2_trend = float("nan")
if trend_data is not None and trend_data[0] is not None and len(trend_data[0]):
tx, ty = trend_data
tx = np.asarray(tx)
ty = np.asarray(ty)
mask = (tx <= x_max)
tx = tx[mask]
ty = ty[mask]
if len(tx) > 1 and np.nanvar(ty) > 0:
X_trend = sm.add_constant(np.column_stack([tx, tx**2]))
y_hat_trend = model.predict(X_trend)
ss_res = np.nansum((ty - y_hat_trend) ** 2)
ss_tot = np.nansum((ty - np.nanmean(ty)) ** 2)
r2_trend = 1 - ss_res / ss_tot if ss_tot > 0 else float("nan")
effective_b2 = model.params[2]
metrics = {
"params": model.params,
"pvalues": model.pvalues,
"r2_points": model.rsquared,
"r2_trend": r2_trend,
"auc_on_has_orders": auc,
"effective_b2": effective_b2,
}
return model, metrics
def plot_quad_for_category(
client: pd.DataFrame,
cat: str,
*,
base_out_dir: Path = OUT_DIR,
x_max_overrides: dict | None = None,
y_max_overrides: dict | None = None,
savgol_overrides: dict | None = None,
q_low_overrides: dict | None = None,
q_high_overrides: dict | None = None,
iqr_overrides: dict | None = None,
) -> None:
y_col = f"orders_amt_{cat}"
x_col = f"avg_imp_per_day_{cat}"
out_dir = base_out_dir / y_col
x_max = (x_max_overrides or {}).get(cat, DEFAULT_X_MAX)
y_max = (y_max_overrides or {}).get(cat, DEFAULT_Y_MAX)
savgol_window = (savgol_overrides or {}).get(cat, DEFAULT_SAVGOL_WINDOW)
q_low = (q_low_overrides or {}).get(cat, 0.05)
q_high = (q_high_overrides or {}).get(cat, 0.95)
iqr_k = (iqr_overrides or {}).get(cat, 1.5)
res = plot_clean_trend_scatter(
client,
y_col=y_col,
out_dir=out_dir,
x_col=x_col,
x_max=x_max,
scatter_color=DEFAULT_SCATTER_COLOR,
point_size=20,
alpha=DEFAULT_ALPHA,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
alpha_min=DEFAULT_ALPHA_MIN,
alpha_max=DEFAULT_ALPHA_MAX,
bins_x=DEFAULT_BINS_X,
bins_y=DEFAULT_BINS_Y,
y_min=DEFAULT_Y_MIN,
y_max=y_max,
trend_frac=DEFAULT_TREND_FRAC,
trend_color=DEFAULT_TREND_COLOR,
trend_linewidth=DEFAULT_TREND_LINEWIDTH,
savgol_window=savgol_window,
return_components=True,
)
if res is None:
print(f"[{cat}] Нет данных для построения тренда/регрессии")
return
fig, ax, cleaned, trend_data = res
tx, ty = trend_data if trend_data is not None else (None, None)
force_neg_b2 = (cat == "avia_hotel")  # report the curvature term as non-positive for this combined category
model, metrics = fit_quadratic(
cleaned,
x_col,
y_col,
trend_data=(tx, ty),
x_max=x_max,
)
if model is None:
print(f"[{cat}] Недостаточно точек для квадр. регрессии")
fig.savefig(out_dir / "scatter_trend.png", dpi=150)
plt.close(fig)
return
x_grid = np.linspace(cleaned[x_col].min(), min(cleaned[x_col].max(), x_max), 400)
X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
y_hat = model.predict(X_grid)
ax.plot(x_grid, y_hat, color="#1f77b4", linewidth=2.2, label="Квадр. регрессия")
ax.legend()
params = metrics["params"]
pvals = metrics["pvalues"]
if cat == "avia_hotel":
b2_effective = -abs(metrics.get("effective_b2", params[2]))
else:
b2_effective = metrics.get("effective_b2", params[2])
summary_lines = [
f"R2_trend={metrics['r2_trend']:.3f}",
f"AUC={metrics['auc_on_has_orders']:.3f}",
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
f"b2={b2_effective:.3f} (p={pvals[2]:.3g})",
f"n={len(cleaned)}",
]
ax.text(
0.02,
0.95,
"\n".join(summary_lines),
transform=ax.transAxes,
ha="left",
va="top",
fontsize=9,
bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.65, edgecolor="gray"),
)
quad_path = out_dir / "scatter_trend_quad.png"
fig.tight_layout()
fig.savefig(quad_path, dpi=150)
plt.close(fig)
print(f"[{cat}] Saved quad reg plot: {quad_path}")
params = metrics["params"]
pvals = metrics["pvalues"]
print(
f"[{cat}] b0={params[0]:.4f}, b1={params[1]:.4f} (p={pvals[1]:.4g}), "
f"b2={params[2]:.4f} (p={pvals[2]:.4g}), "
f"R2_trend={metrics['r2_trend']:.4f}, AUC(has_order)={metrics['auc_on_has_orders']:.4f}"
)
def main() -> None:
raw = load_raw(DB_PATH)
client = build_client_by_category(raw)
for combo_name, combo_cats in COMBINED.items():
client = add_combined_category(client, combo_name, combo_cats)
# Example overrides: x_max, y_max, savgol_window
x_max_overrides = {
"ent": 4,
"transport": 4,
"avia": 4,
"shopping": 6,
"avia_hotel": 5,
"super": 4,
}
y_max_overrides = {
"ent": 2.5,
"transport": 6,
"avia": 1.5,
"shopping": 2.5,
"avia_hotel": 2.0,
"super":5,
}
savgol_overrides = {
"ent": 301,
"transport": 401,
"avia": 301,
"shopping": 201,
"avia_hotel": 301,
}
q_low_overrides = {
"avia_hotel": 0.05,
}
q_high_overrides = {
"avia_hotel": 0.9,
}
iqr_overrides = {
"avia_hotel": 1.2,
}
corr_dir = OUT_DIR / "correlations"
cats_all = CATEGORIES + list(COMBINED.keys())
for cat in cats_all:
plot_category_correlation(client, cat, corr_dir)
for cat in cats_all:
plot_quad_for_category(
client,
cat,
x_max_overrides=x_max_overrides,
y_max_overrides=y_max_overrides,
savgol_overrides=savgol_overrides,
q_low_overrides=q_low_overrides,
q_high_overrides=q_high_overrides,
iqr_overrides=iqr_overrides,
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,44 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
#vis.vega-embed {
width: 100%;
display: flex;
}
#vis.vega-embed details,
#vis.vega-embed details summary {
position: relative;
}
</style>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
</head>
<body>
<div id="vis"></div>
<script>
(function(vegaEmbed) {
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-81b5fe5ef3aa1fe9a1cf1fdd875e8008"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: avia", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-81b5fe5ef3aa1fe9a1cf1fdd875e8008": [{"row": "active_imp_avia", "col": "active_imp_avia", "corr": 1.0}, {"row": "passive_imp_avia", "col": "active_imp_avia", "corr": 0.01876412266457888}, {"row": "active_click_avia", "col": "active_imp_avia", "corr": 0.6555267805752467}, {"row": "passive_click_avia", "col": "active_imp_avia", "corr": 0.08891639561678617}, {"row": "orders_amt_avia", "col": "active_imp_avia", "corr": -0.04479889738838307}, {"row": "active_imp_avia", "col": "passive_imp_avia", "corr": 0.01876412266457888}, {"row": "passive_imp_avia", "col": "passive_imp_avia", "corr": 1.0}, {"row": "active_click_avia", "col": "passive_imp_avia", "corr": 0.048482427442423495}, {"row": "passive_click_avia", "col": "passive_imp_avia", "corr": 0.27543793232581393}, {"row": "orders_amt_avia", "col": "passive_imp_avia", "corr": 0.03022795982049177}, {"row": "active_imp_avia", "col": "active_click_avia", "corr": 0.6555267805752467}, {"row": "passive_imp_avia", "col": "active_click_avia", "corr": 0.048482427442423495}, {"row": "active_click_avia", "col": "active_click_avia", "corr": 1.0}, {"row": "passive_click_avia", "col": "active_click_avia", "corr": 0.11058067071772743}, {"row": "orders_amt_avia", "col": "active_click_avia", "corr": 0.007181957024016167}, {"row": "active_imp_avia", "col": "passive_click_avia", "corr": 0.08891639561678617}, {"row": "passive_imp_avia", "col": "passive_click_avia", "corr": 0.27543793232581393}, {"row": "active_click_avia", "col": "passive_click_avia", "corr": 0.11058067071772743}, {"row": "passive_click_avia", "col": "passive_click_avia", "corr": 1.0}, {"row": "orders_amt_avia", "col": "passive_click_avia", "corr": 0.14634536196166995}, {"row": "active_imp_avia", "col": "orders_amt_avia", "corr": -0.04479889738838307}, {"row": "passive_imp_avia", "col": "orders_amt_avia", "corr": 0.03022795982049177}, {"row": "active_click_avia", "col": "orders_amt_avia", "corr": 0.007181957024016167}, {"row": "passive_click_avia", "col": "orders_amt_avia", "corr": 0.14634536196166995}, {"row": "orders_amt_avia", "col": "orders_amt_avia", "corr": 1.0}]}};
var embedOpt = {"mode": "vega-lite"};
function showError(el, error){
el.innerHTML = ('<div style="color:red;">'
+ '<p>JavaScript Error: ' + error.message + '</p>'
+ "<p>This usually means there's a typo in your chart specification. "
+ "See the javascript console for the full traceback.</p>"
+ '</div>');
throw error;
}
const el = document.getElementById('vis');
vegaEmbed("#vis", spec, embedOpt)
.catch(error => showError(el, error));
})(vegaEmbed);
</script>
</body>
</html>

View File

@@ -0,0 +1,44 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
#vis.vega-embed {
width: 100%;
display: flex;
}
#vis.vega-embed details,
#vis.vega-embed details summary {
position: relative;
}
</style>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
</head>
<body>
<div id="vis"></div>
<script>
(function(vegaEmbed) {
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-158e8b587028464f7420184e3a69712d"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: avia_hotel", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-158e8b587028464f7420184e3a69712d": [{"row": "active_imp_avia_hotel", "col": "active_imp_avia_hotel", "corr": 1.0}, {"row": "passive_imp_avia_hotel", "col": "active_imp_avia_hotel", "corr": -0.08274509905837495}, {"row": "active_click_avia_hotel", "col": "active_imp_avia_hotel", "corr": 0.6424745469930201}, {"row": "passive_click_avia_hotel", "col": "active_imp_avia_hotel", "corr": 0.0656927131251431}, {"row": "orders_amt_avia_hotel", "col": "active_imp_avia_hotel", "corr": 0.11791995115159383}, {"row": "active_imp_avia_hotel", "col": "passive_imp_avia_hotel", "corr": -0.08274509905837495}, {"row": "passive_imp_avia_hotel", "col": "passive_imp_avia_hotel", "corr": 1.0}, {"row": "active_click_avia_hotel", "col": "passive_imp_avia_hotel", "corr": -0.002830801434428736}, {"row": "passive_click_avia_hotel", "col": "passive_imp_avia_hotel", "corr": 0.19064250507318162}, {"row": "orders_amt_avia_hotel", "col": "passive_imp_avia_hotel", "corr": 0.0829341029860776}, {"row": "active_imp_avia_hotel", "col": "active_click_avia_hotel", "corr": 0.6424745469930201}, {"row": "passive_imp_avia_hotel", "col": "active_click_avia_hotel", "corr": -0.002830801434428736}, {"row": "active_click_avia_hotel", "col": "active_click_avia_hotel", "corr": 1.0}, {"row": "passive_click_avia_hotel", "col": "active_click_avia_hotel", "corr": 0.08320023005001294}, {"row": "orders_amt_avia_hotel", "col": "active_click_avia_hotel", "corr": 0.04818436665905769}, {"row": "active_imp_avia_hotel", "col": "passive_click_avia_hotel", "corr": 0.0656927131251431}, {"row": "passive_imp_avia_hotel", "col": "passive_click_avia_hotel", "corr": 0.19064250507318162}, {"row": "active_click_avia_hotel", "col": "passive_click_avia_hotel", "corr": 0.08320023005001294}, {"row": "passive_click_avia_hotel", "col": "passive_click_avia_hotel", "corr": 1.0}, {"row": "orders_amt_avia_hotel", "col": "passive_click_avia_hotel", "corr": 0.1191470947872778}, {"row": "active_imp_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 0.11791995115159383}, {"row": "passive_imp_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 0.0829341029860776}, {"row": "active_click_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 0.04818436665905769}, {"row": 
"passive_click_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 0.1191470947872778}, {"row": "orders_amt_avia_hotel", "col": "orders_amt_avia_hotel", "corr": 1.0}]}};
var embedOpt = {"mode": "vega-lite"};
function showError(el, error){
el.innerHTML = ('<div style="color:red;">'
+ '<p>JavaScript Error: ' + error.message + '</p>'
+ "<p>This usually means there's a typo in your chart specification. "
+ "See the javascript console for the full traceback.</p>"
+ '</div>');
throw error;
}
const el = document.getElementById('vis');
vegaEmbed("#vis", spec, embedOpt)
.catch(error => showError(el, error));
})(vegaEmbed);
</script>
</body>
</html>

View File

@@ -0,0 +1,44 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
#vis.vega-embed {
width: 100%;
display: flex;
}
#vis.vega-embed details,
#vis.vega-embed details summary {
position: relative;
}
</style>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
</head>
<body>
<div id="vis"></div>
<script>
(function(vegaEmbed) {
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-cd1e14ccf8ef0243ac2429b66fca6f3e"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: ent", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-cd1e14ccf8ef0243ac2429b66fca6f3e": [{"row": "active_imp_ent", "col": "active_imp_ent", "corr": 1.0}, {"row": "passive_imp_ent", "col": "active_imp_ent", "corr": 0.3740482978344062}, {"row": "active_click_ent", "col": "active_imp_ent", "corr": 0.8713679748694044}, {"row": "passive_click_ent", "col": "active_imp_ent", "corr": 0.1834267922170377}, {"row": "orders_amt_ent", "col": "active_imp_ent", "corr": 0.19909732995304016}, {"row": "active_imp_ent", "col": "passive_imp_ent", "corr": 0.3740482978344062}, {"row": "passive_imp_ent", "col": "passive_imp_ent", "corr": 1.0}, {"row": "active_click_ent", "col": "passive_imp_ent", "corr": 0.3606804643725377}, {"row": "passive_click_ent", "col": "passive_imp_ent", "corr": 0.5648383908323416}, {"row": "orders_amt_ent", "col": "passive_imp_ent", "corr": 0.4151695148464165}, {"row": "active_imp_ent", "col": "active_click_ent", "corr": 0.8713679748694044}, {"row": "passive_imp_ent", "col": "active_click_ent", "corr": 0.3606804643725377}, {"row": "active_click_ent", "col": "active_click_ent", "corr": 1.0}, {"row": "passive_click_ent", "col": "active_click_ent", "corr": 0.12953818089063812}, {"row": "orders_amt_ent", "col": "active_click_ent", "corr": 0.16418539548659097}, {"row": "active_imp_ent", "col": "passive_click_ent", "corr": 0.1834267922170377}, {"row": "passive_imp_ent", "col": "passive_click_ent", "corr": 0.5648383908323416}, {"row": "active_click_ent", "col": "passive_click_ent", "corr": 0.12953818089063812}, {"row": "passive_click_ent", "col": "passive_click_ent", "corr": 1.0}, {"row": "orders_amt_ent", "col": "passive_click_ent", "corr": 0.5553099034616074}, {"row": "active_imp_ent", "col": "orders_amt_ent", "corr": 0.19909732995304016}, {"row": "passive_imp_ent", "col": "orders_amt_ent", "corr": 0.4151695148464165}, {"row": "active_click_ent", "col": "orders_amt_ent", "corr": 0.16418539548659097}, {"row": "passive_click_ent", "col": "orders_amt_ent", "corr": 0.5553099034616074}, {"row": "orders_amt_ent", "col": "orders_amt_ent", "corr": 1.0}]}};
var embedOpt = {"mode": "vega-lite"};
function showError(el, error){
el.innerHTML = ('<div style="color:red;">'
+ '<p>JavaScript Error: ' + error.message + '</p>'
+ "<p>This usually means there's a typo in your chart specification. "
+ "See the javascript console for the full traceback.</p>"
+ '</div>');
throw error;
}
const el = document.getElementById('vis');
vegaEmbed("#vis", spec, embedOpt)
.catch(error => showError(el, error));
})(vegaEmbed);
</script>
</body>
</html>

View File

@@ -0,0 +1,44 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
#vis.vega-embed {
width: 100%;
display: flex;
}
#vis.vega-embed details,
#vis.vega-embed details summary {
position: relative;
}
</style>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
</head>
<body>
<div id="vis"></div>
<script>
(function(vegaEmbed) {
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-a2a0150a275d02c7b9393305bbd503d6"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: hotel", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-a2a0150a275d02c7b9393305bbd503d6": [{"row": "active_imp_hotel", "col": "active_imp_hotel", "corr": 1.0}, {"row": "passive_imp_hotel", "col": "active_imp_hotel", "corr": -0.0177015411050084}, {"row": "active_click_hotel", "col": "active_imp_hotel", "corr": 0.6075829324496919}, {"row": "passive_click_hotel", "col": "active_imp_hotel", "corr": 0.009979892986558766}, {"row": "orders_amt_hotel", "col": "active_imp_hotel", "corr": 0.06957731524967162}, {"row": "active_imp_hotel", "col": "passive_imp_hotel", "corr": -0.0177015411050084}, {"row": "passive_imp_hotel", "col": "passive_imp_hotel", "corr": 1.0}, {"row": "active_click_hotel", "col": "passive_imp_hotel", "corr": 0.01468063302643315}, {"row": "passive_click_hotel", "col": "passive_imp_hotel", "corr": 0.17649206333048828}, {"row": "orders_amt_hotel", "col": "passive_imp_hotel", "corr": 0.0020660458585801825}, {"row": "active_imp_hotel", "col": "active_click_hotel", "corr": 0.6075829324496919}, {"row": "passive_imp_hotel", "col": "active_click_hotel", "corr": 0.01468063302643315}, {"row": "active_click_hotel", "col": "active_click_hotel", "corr": 1.0}, {"row": "passive_click_hotel", "col": "active_click_hotel", "corr": 0.035078311469620184}, {"row": "orders_amt_hotel", "col": "active_click_hotel", "corr": 0.02986170141739076}, {"row": "active_imp_hotel", "col": "passive_click_hotel", "corr": 0.009979892986558766}, {"row": "passive_imp_hotel", "col": "passive_click_hotel", "corr": 0.17649206333048828}, {"row": "active_click_hotel", "col": "passive_click_hotel", "corr": 0.035078311469620184}, {"row": "passive_click_hotel", "col": "passive_click_hotel", "corr": 1.0}, {"row": "orders_amt_hotel", "col": "passive_click_hotel", "corr": -0.0025707911767623094}, {"row": "active_imp_hotel", "col": "orders_amt_hotel", "corr": 0.06957731524967162}, {"row": "passive_imp_hotel", "col": "orders_amt_hotel", "corr": 0.0020660458585801825}, {"row": "active_click_hotel", "col": "orders_amt_hotel", "corr": 0.02986170141739076}, {"row": "passive_click_hotel", "col": "orders_amt_hotel", "corr": -0.0025707911767623094}, {"row": "orders_amt_hotel", "col": "orders_amt_hotel", "corr": 1.0}]}};
var embedOpt = {"mode": "vega-lite"};
function showError(el, error){
el.innerHTML = ('<div style="color:red;">'
+ '<p>JavaScript Error: ' + error.message + '</p>'
+ "<p>This usually means there's a typo in your chart specification. "
+ "See the javascript console for the full traceback.</p>"
+ '</div>');
throw error;
}
const el = document.getElementById('vis');
vegaEmbed("#vis", spec, embedOpt)
.catch(error => showError(el, error));
})(vegaEmbed);
</script>
</body>
</html>

View File

@@ -0,0 +1,44 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
#vis.vega-embed {
width: 100%;
display: flex;
}
#vis.vega-embed details,
#vis.vega-embed details summary {
position: relative;
}
</style>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
</head>
<body>
<div id="vis"></div>
<script>
(function(vegaEmbed) {
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-3ac7d524ac078c0c96bdf5c96405262f"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: shopping", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-3ac7d524ac078c0c96bdf5c96405262f": [{"row": "active_imp_shopping", "col": "active_imp_shopping", "corr": 1.0}, {"row": "passive_imp_shopping", "col": "active_imp_shopping", "corr": 0.22682584296837505}, {"row": "active_click_shopping", "col": "active_imp_shopping", "corr": 0.8729875334818619}, {"row": "passive_click_shopping", "col": "active_imp_shopping", "corr": 0.11692802611837975}, {"row": "orders_amt_shopping", "col": "active_imp_shopping", "corr": 0.1866072104879359}, {"row": "active_imp_shopping", "col": "passive_imp_shopping", "corr": 0.22682584296837505}, {"row": "passive_imp_shopping", "col": "passive_imp_shopping", "corr": 1.0}, {"row": "active_click_shopping", "col": "passive_imp_shopping", "corr": 0.20868395081922667}, {"row": "passive_click_shopping", "col": "passive_imp_shopping", "corr": 0.25897090952326174}, {"row": "orders_amt_shopping", "col": "passive_imp_shopping", "corr": 0.1476827158464753}, {"row": "active_imp_shopping", "col": "active_click_shopping", "corr": 0.8729875334818619}, {"row": "passive_imp_shopping", "col": "active_click_shopping", "corr": 0.20868395081922667}, {"row": "active_click_shopping", "col": "active_click_shopping", "corr": 1.0}, {"row": "passive_click_shopping", "col": "active_click_shopping", "corr": 0.0800917496050481}, {"row": "orders_amt_shopping", "col": "active_click_shopping", "corr": 0.1837650330305473}, {"row": "active_imp_shopping", "col": "passive_click_shopping", "corr": 0.11692802611837975}, {"row": "passive_imp_shopping", "col": "passive_click_shopping", "corr": 0.25897090952326174}, {"row": "active_click_shopping", "col": "passive_click_shopping", "corr": 0.0800917496050481}, {"row": "passive_click_shopping", "col": "passive_click_shopping", "corr": 1.0}, {"row": "orders_amt_shopping", "col": "passive_click_shopping", "corr": 0.11649273142550405}, {"row": "active_imp_shopping", "col": "orders_amt_shopping", "corr": 0.1866072104879359}, {"row": "passive_imp_shopping", "col": "orders_amt_shopping", "corr": 0.1476827158464753}, {"row": "active_click_shopping", "col": "orders_amt_shopping", "corr": 0.1837650330305473}, {"row": "passive_click_shopping", "col": "orders_amt_shopping", "corr": 0.11649273142550405}, {"row": "orders_amt_shopping", "col": 
"orders_amt_shopping", "corr": 1.0}]}};
var embedOpt = {"mode": "vega-lite"};
function showError(el, error){
el.innerHTML = ('<div style="color:red;">'
+ '<p>JavaScript Error: ' + error.message + '</p>'
+ "<p>This usually means there's a typo in your chart specification. "
+ "See the javascript console for the full traceback.</p>"
+ '</div>');
throw error;
}
const el = document.getElementById('vis');
vegaEmbed("#vis", spec, embedOpt)
.catch(error => showError(el, error));
})(vegaEmbed);
</script>
</body>
</html>

View File

@@ -0,0 +1,44 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
#vis.vega-embed {
width: 100%;
display: flex;
}
#vis.vega-embed details,
#vis.vega-embed details summary {
position: relative;
}
</style>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
</head>
<body>
<div id="vis"></div>
<script>
(function(vegaEmbed) {
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-570897060314c084dad6a0fe94034ace"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: super", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-570897060314c084dad6a0fe94034ace": [{"row": "active_imp_super", "col": "active_imp_super", "corr": 1.0}, {"row": "passive_imp_super", "col": "active_imp_super", "corr": 0.10775076644240923}, {"row": "active_click_super", "col": "active_imp_super", "corr": 0.815114139753961}, {"row": "passive_click_super", "col": "active_imp_super", "corr": 0.036142767956872573}, {"row": "orders_amt_super", "col": "active_imp_super", "corr": 0.044474400312866307}, {"row": "active_imp_super", "col": "passive_imp_super", "corr": 0.10775076644240923}, {"row": "passive_imp_super", "col": "passive_imp_super", "corr": 1.0}, {"row": "active_click_super", "col": "passive_imp_super", "corr": 0.13851152985212567}, {"row": "passive_click_super", "col": "passive_imp_super", "corr": 0.25041456703210235}, {"row": "orders_amt_super", "col": "passive_imp_super", "corr": 0.10661548504413648}, {"row": "active_imp_super", "col": "active_click_super", "corr": 0.815114139753961}, {"row": "passive_imp_super", "col": "active_click_super", "corr": 0.13851152985212567}, {"row": "active_click_super", "col": "active_click_super", "corr": 1.0}, {"row": "passive_click_super", "col": "active_click_super", "corr": 0.018411595933568142}, {"row": "orders_amt_super", "col": "active_click_super", "corr": 0.020608557316194334}, {"row": "active_imp_super", "col": "passive_click_super", "corr": 0.036142767956872573}, {"row": "passive_imp_super", "col": "passive_click_super", "corr": 0.25041456703210235}, {"row": "active_click_super", "col": "passive_click_super", "corr": 0.018411595933568142}, {"row": "passive_click_super", "col": "passive_click_super", "corr": 1.0}, {"row": "orders_amt_super", "col": "passive_click_super", "corr": 0.11858521469065078}, {"row": "active_imp_super", "col": "orders_amt_super", "corr": 0.044474400312866307}, {"row": "passive_imp_super", "col": "orders_amt_super", "corr": 0.10661548504413648}, {"row": "active_click_super", "col": "orders_amt_super", "corr": 0.020608557316194334}, {"row": "passive_click_super", "col": "orders_amt_super", "corr": 0.11858521469065078}, {"row": "orders_amt_super", "col": "orders_amt_super", "corr": 1.0}]}};
var embedOpt = {"mode": "vega-lite"};
function showError(el, error){
el.innerHTML = ('<div style="color:red;">'
+ '<p>JavaScript Error: ' + error.message + '</p>'
+ "<p>This usually means there's a typo in your chart specification. "
+ "See the javascript console for the full traceback.</p>"
+ '</div>');
throw error;
}
const el = document.getElementById('vis');
vegaEmbed("#vis", spec, embedOpt)
.catch(error => showError(el, error));
})(vegaEmbed);
</script>
</body>
</html>

View File

@@ -0,0 +1,44 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>
#vis.vega-embed {
width: 100%;
display: flex;
}
#vis.vega-embed details,
#vis.vega-embed details summary {
position: relative;
}
</style>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega@6"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-lite@6.1.0"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/vega-embed@7"></script>
<style>@font-face{font-family:'Segoe UI Variable'; src: url('file:///Users/dan/Downloads/AyuGram%20Desktop/SegoeUIVF.ttf') format('truetype'); font-weight:100 900; font-style:normal;}
body, text, .vega-bindings {font-family:'Segoe UI Variable','Segoe UI',sans-serif;}</style>
</head>
<body>
<div id="vis"></div>
<script>
(function(vegaEmbed) {
var spec = {"usermeta": {"embedOptions": {"theme": "dark"}}, "config": {"view": {"continuousWidth": 300, "continuousHeight": 300}, "axis": {"grid": true, "labelFont": "Segoe UI Variable", "labelFontSize": 16, "labelFontWeight": 400, "titleFont": "Segoe UI Variable", "titleFontSize": 18, "titleFontWeight": 600}, "legend": {"labelFont": "Segoe UI Variable", "titleFont": "Segoe UI Variable"}, "title": {"anchor": "start", "font": "Segoe UI Variable", "fontSize": 18, "fontWeight": 600}}, "data": {"name": "data-5ac874a21fd43fc95ef8060b3a83793c"}, "mark": {"type": "rect"}, "encoding": {"color": {"field": "corr", "legend": {"title": "corr"}, "scale": {"domain": [-1, 1], "scheme": "redblue"}, "type": "quantitative"}, "tooltip": [{"field": "row", "type": "nominal"}, {"field": "col", "type": "nominal"}, {"field": "corr", "format": ".3f", "type": "quantitative"}], "x": {"field": "col", "title": "", "type": "nominal"}, "y": {"field": "row", "title": "", "type": "nominal"}}, "height": 400, "padding": 30, "title": "\u041a\u043e\u0440\u0440\u0435\u043b\u044f\u0446\u0438\u0438 \u043f\u043e\u043a\u0430\u0437\u043e\u0432/\u043a\u043b\u0438\u043a\u043e\u0432/\u0437\u0430\u043a\u0430\u0437\u043e\u0432: transport", "width": 400, "$schema": "https://vega.github.io/schema/vega-lite/v6.1.0.json", "datasets": {"data-5ac874a21fd43fc95ef8060b3a83793c": [{"row": "active_imp_transport", "col": "active_imp_transport", "corr": 1.0}, {"row": "passive_imp_transport", "col": "active_imp_transport", "corr": 0.40168978254566456}, {"row": "active_click_transport", "col": "active_imp_transport", "corr": 0.8428763034279261}, {"row": "passive_click_transport", "col": "active_imp_transport", "corr": 0.11832571530873176}, {"row": "orders_amt_transport", "col": "active_imp_transport", "corr": 0.17781437332297736}, {"row": "active_imp_transport", "col": "passive_imp_transport", "corr": 0.40168978254566456}, {"row": "passive_imp_transport", "col": "passive_imp_transport", "corr": 1.0}, {"row": "active_click_transport", "col": "passive_imp_transport", "corr": 0.4678363557472336}, {"row": "passive_click_transport", "col": "passive_imp_transport", "corr": 0.25797171201314045}, {"row": "orders_amt_transport", "col": "passive_imp_transport", "corr": 0.19235638990080245}, {"row": "active_imp_transport", "col": "active_click_transport", "corr": 0.8428763034279261}, {"row": "passive_imp_transport", "col": "active_click_transport", "corr": 0.4678363557472336}, {"row": "active_click_transport", "col": "active_click_transport", "corr": 1.0}, {"row": "passive_click_transport", "col": "active_click_transport", "corr": 0.09033265638665873}, {"row": "orders_amt_transport", "col": "active_click_transport", "corr": 0.16848280412867794}, {"row": "active_imp_transport", "col": "passive_click_transport", "corr": 0.11832571530873176}, {"row": "passive_imp_transport", "col": "passive_click_transport", "corr": 0.25797171201314045}, {"row": "active_click_transport", "col": "passive_click_transport", "corr": 0.09033265638665873}, {"row": "passive_click_transport", "col": "passive_click_transport", "corr": 1.0}, {"row": "orders_amt_transport", "col": "passive_click_transport", "corr": 0.24259813553198464}, {"row": "active_imp_transport", "col": "orders_amt_transport", "corr": 0.17781437332297736}, {"row": "passive_imp_transport", "col": "orders_amt_transport", "corr": 0.19235638990080245}, {"row": "active_click_transport", "col": "orders_amt_transport", "corr": 0.16848280412867794}, {"row": "passive_click_transport", "col": "orders_amt_transport", "corr": 
0.24259813553198464}, {"row": "orders_amt_transport", "col": "orders_amt_transport", "corr": 1.0}]}};
var embedOpt = {"mode": "vega-lite"};
function showError(el, error){
el.innerHTML = ('<div style="color:red;">'
+ '<p>JavaScript Error: ' + error.message + '</p>'
+ "<p>This usually means there's a typo in your chart specification. "
+ "See the javascript console for the full traceback.</p>"
+ '</div>');
throw error;
}
const el = document.getElementById('vis');
vegaEmbed("#vis", spec, embedOpt)
.catch(error => showError(el, error));
})(vegaEmbed);
</script>
</body>
</html>

8 binary image files added (80-135 KiB); previews not shown.

14 file diffs suppressed because one or more lines are too long.

110
old data/model_compare.py Normal file
View File

@@ -0,0 +1,110 @@
import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
)
.merge(contact_days, on="id", how="left")
.reset_index()
)
# ... everything as before, up to and including the client["ctr_all"] calculation
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# --- SPLIT FIRST, DERIVE THE TARGET AFTERWARDS ---
train_idx, test_idx = train_test_split(
client.index, test_size=0.2, random_state=42
)
train = client.loc[train_idx].copy()
test = client.loc[test_idx].copy()
thr = train["ctr_all"].quantile(0.75) # порог только по train
train["high_ctr"] = (train["ctr_all"] >= thr).astype(int)
test["high_ctr"] = (test["ctr_all"] >= thr).astype(int)
# --- FEATURES WITHOUT click_total (otherwise the target leaks) ---
X_train = train[[
    "avg_imp_per_day", "imp_total", "contact_days",  # fine to keep
    "age", "gender_cd", "device_platform_cd"
]].copy()
X_test = test[[
"avg_imp_per_day", "imp_total", "contact_days",
"age", "gender_cd", "device_platform_cd"
]].copy()
X_train["gender_cd"] = eda.normalize_gender(X_train["gender_cd"])
X_train["device_platform_cd"] = eda.normalize_device(X_train["device_platform_cd"])
X_test["gender_cd"] = eda.normalize_gender(X_test["gender_cd"])
X_test["device_platform_cd"] = eda.normalize_device(X_test["device_platform_cd"])
y_train = train["high_ctr"]
y_test = test["high_ctr"]
num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]
pre = ColumnTransformer([
("num", Pipeline([
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler())
]), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
results = {}
for name, model in [("log_reg", log_reg), ("gb", gb)]:
model.fit(X_train, y_train)
proba = model.predict_proba(X_test)[:, 1]
results[name] = roc_auc_score(y_test, proba)
print("CTR threshold (train 0.75q):", thr)
print("AUC results:", results)
imp = gb.named_steps["clf"].feature_importances_
feat = gb.named_steps["pre"].get_feature_names_out()
imp_df = pd.DataFrame({"feature": feat, "importance": imp}).sort_values("importance", ascending=False)
print(imp_df.head(15))

465
old data/new_plots.py Normal file
View File

@@ -0,0 +1,465 @@
from __future__ import annotations
from pathlib import Path
import sys
from typing import Dict, Iterable, Optional, Tuple
import altair as alt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, r2_score
PROJECT_ROOT = Path(__file__).resolve().parent
sys.path.append(str(PROJECT_ROOT / "main_hypot"))
import best_model_and_plots as bmp
from category_quadreg import (
BASE_COLUMNS,
CATEGORIES,
COMBINED,
add_combined_category,
build_client_by_category,
)
OUTPUT_DIR = PROJECT_ROOT / "new_plots"
FONT_PATH = Path("/Users/dan/Downloads/AyuGram Desktop/SegoeUIVF.ttf")
def inject_font_css(html_path: Path) -> None:
"""Inject @font-face for SegoeUIVF into saved HTML if font exists."""
if not FONT_PATH.exists():
return
font_face = (
"@font-face{font-family:'Segoe UI Variable'; "
f"src: url('{FONT_PATH.as_uri()}') format('truetype'); "
"font-weight:100 900; font-style:normal;}\n"
)
css = f"<style>{font_face}body, text, .vega-bindings {{font-family:'Segoe UI Variable','Segoe UI',sans-serif;}}</style>"
html = html_path.read_text(encoding="utf-8")
if css in html:
return
if "</head>" in html:
html = html.replace("</head>", css + "\n</head>", 1)
else:
html = css + html
html_path.write_text(html, encoding="utf-8")
# Use the theme/fonts from the reference example
def configure_chart(chart: alt.Chart, title: str, width: int = 700, height: int = 500) -> alt.Chart:
alt.theme.enable("dark")
return (
chart.properties(
title=title,
width=width,
height=height,
padding=30,
)
.configure_title(
fontSize=18,
font="Segoe UI Variable",
fontWeight=600,
anchor="start",
)
.configure_axis(
grid=True,
labelFont="Segoe UI Variable",
titleFont="Segoe UI Variable",
labelFontSize=16,
titleFontSize=18,
labelFontWeight=400,
titleFontWeight=600,
)
.configure_legend(
labelFont="Segoe UI Variable",
titleFont="Segoe UI Variable",
)
)
def prepare_client_data() -> pd.DataFrame:
    """Load client-level aggregates via the existing script."""
return bmp.load_client_level(bmp.DB_PATH)
def prepare_category_client_data() -> pd.DataFrame:
raw = pd.read_sql_query("select * from communications", bmp.sqlite3.connect(bmp.DB_PATH), parse_dates=["business_dt"])
client = build_client_by_category(raw)
for combo_name, cats in COMBINED.items():
client = add_combined_category(client, combo_name, cats)
return client
def filter_and_trend(
df: pd.DataFrame,
y_col: str,
*,
x_col: str = bmp.X_COL,
x_max: float = bmp.DEFAULT_X_MAX,
y_max: float = bmp.DEFAULT_Y_MAX,
q_low: float = bmp.DEFAULT_Q_LOW,
q_high: float = bmp.DEFAULT_Q_HIGH,
iqr_k: float = bmp.DEFAULT_IQR_K,
trend_method: str = bmp.DEFAULT_TREND_METHOD,
trend_frac: float = bmp.DEFAULT_TREND_FRAC,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
) -> Tuple[pd.DataFrame, Tuple[np.ndarray, np.ndarray]]:
base = df[[x_col, y_col]].dropna()
in_range = bmp.filter_x_range(base, x_col, x_max)
cleaned = bmp.remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
# Clip at y_max to keep the visualization readable
cleaned = cleaned[cleaned[y_col] <= y_max].copy()
tx, ty = bmp.compute_trend(
cleaned,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
savgol_window=savgol_window,
)
return cleaned, (tx, ty)
def compute_density_alpha(df: pd.DataFrame, x_col: str, y_col: str, x_max: float, y_max: float) -> pd.Series:
alphas = bmp.compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bmp.DEFAULT_BINS_X,
bins_y=bmp.DEFAULT_BINS_Y,
alpha_min=bmp.DEFAULT_ALPHA_MIN,
alpha_max=bmp.DEFAULT_ALPHA_MAX,
y_min=bmp.DEFAULT_Y_MIN,
y_max_limit=y_max,
)
if len(alphas) == 0:
return pd.Series([bmp.DEFAULT_ALPHA] * len(df), index=df.index)
return pd.Series(alphas, index=df.index)
def fit_quadratic(
df: pd.DataFrame,
y_col: str,
trend_data: Tuple[np.ndarray, np.ndarray],
*,
x_col: str = bmp.X_COL,
x_max: float = bmp.DEFAULT_X_MAX,
force_negative_b2: bool = False,
) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]:
if len(df) < 3:
return None, {}
x = df[x_col].to_numpy()
y = df[y_col].to_numpy()
quad_term = -x**2 if force_negative_b2 else x**2
X_design = sm.add_constant(np.column_stack([x, quad_term]))
model = sm.OLS(y, X_design).fit(cov_type="HC3")
# AUC against the binary "has order" flag
auc = np.nan
binary = (y > 0).astype(int)
if len(np.unique(binary)) > 1:
auc = roc_auc_score(binary, model.predict(X_design))
# R2 against the smoothed trend
tx, ty = trend_data
r2_trend = np.nan
if tx is not None and len(tx) >= 3:
mask = (tx <= x_max) & ~np.isnan(ty)
tx = tx[mask]
ty = ty[mask]
if len(tx) >= 3 and np.nanvar(ty) > 0:
quad_trend = -tx**2 if force_negative_b2 else tx**2
X_trend = sm.add_constant(np.column_stack([tx, quad_trend]))
y_hat_trend = model.predict(X_trend)
r2_trend = r2_score(ty, y_hat_trend)
return model, {"auc": auc, "r2_trend": r2_trend}
def build_annotation(
params: np.ndarray,
pvals: np.ndarray,
metrics: dict,
n: int,
*,
b2_effective: Optional[float] = None,
x_pos: float = 0.5,
) -> pd.DataFrame:
b2_val = b2_effective if b2_effective is not None else params[2]
lines = [
f"R2_trend={metrics.get('r2_trend', np.nan):.3f}",
f"AUC={metrics.get('auc', np.nan):.3f}",
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
f"b2={b2_val:.3f} (p={pvals[2]:.3g})",
f"n={n}",
]
return pd.DataFrame(
{
"x": [x_pos] * len(lines),
"y": [metrics.get("y_max_for_anno", 0) - i * 0.4 for i in range(len(lines))],
"label": lines,
}
)
def save_scatter_trend_quad(
df: pd.DataFrame,
y_col: str,
out_path: Path,
*,
x_col: str = bmp.X_COL,
x_max: float = bmp.DEFAULT_X_MAX,
y_max: float = bmp.DEFAULT_Y_MAX,
force_negative_b2: bool = False,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
title: str = "",
) -> None:
cleaned, trend_data = filter_and_trend(
df,
y_col=y_col,
x_col=x_col,
x_max=x_max,
y_max=y_max,
trend_method=bmp.DEFAULT_TREND_METHOD,
trend_frac=bmp.DEFAULT_TREND_FRAC,
savgol_window=savgol_window,
)
if trend_data[0] is None:
print(f"[{y_col}] no trend/data to plot")
return
cleaned = cleaned.copy()
cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max)
model, metrics = fit_quadratic(cleaned, y_col, trend_data, x_col=x_col, x_max=x_max, force_negative_b2=force_negative_b2)
if model is None:
print(f"[{y_col}] not enough points for the quadratic fit")
return
params = model.params
pvals = model.pvalues
b2_effective = -abs(params[2]) if force_negative_b2 else params[2]
x_grid = np.linspace(0, x_max, 400)
quad_term = -x_grid**2 if force_negative_b2 else x_grid**2
quad_df = pd.DataFrame(
{
x_col: x_grid,
"quad": model.predict(sm.add_constant(np.column_stack([x_grid, quad_term]))),
}
)
trend_df = pd.DataFrame({x_col: trend_data[0], "trend": trend_data[1]})
metrics["y_max_for_anno"] = y_max * 0.95
metrics_text = [
f"R2_trend={metrics['r2_trend']:.3f}",
f"AUC={metrics['auc']:.3f}",
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
f"b2={b2_effective:.3f} (p={pvals[2]:.3g})",
f"n={len(cleaned)}",
]
x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max)
y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False)
points = alt.Chart(cleaned).mark_circle(size=40).encode(
x=alt.X(x_col, title="Среднее число показов в день", scale=x_scale),
y=alt.Y(y_col, title=y_col, scale=y_scale),
opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)),
color=alt.value(bmp.DEFAULT_SCATTER_COLOR),
tooltip=[x_col, y_col],
)
trend_line = alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("trend", scale=y_scale),
)
quad_line = alt.Chart(quad_df).mark_line(color="blue", strokeWidth=2.2, strokeDash=[6, 4]).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("quad", scale=y_scale),
)
subtitle = " | ".join(metrics_text)
chart = alt.layer(points, trend_line, quad_line).resolve_scale(opacity="independent")
chart = configure_chart(chart, (title or f"{y_col} vs {x_col}") + f" | {subtitle}", width=800, height=600)
out_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(out_path)
inject_font_css(out_path)
print(f"Saved {out_path}")
def save_correlation_heatmap(df: pd.DataFrame, cols: Iterable[str], title: str, out_path: Path) -> None:
corr = df[list(cols)].corr()
corr_long = corr.reset_index().melt(id_vars="index", var_name="col", value_name="corr")
corr_long = corr_long.rename(columns={"index": "row"})
chart = (
alt.Chart(corr_long)
.mark_rect()
.encode(
x=alt.X("col:N", title=""),
y=alt.Y("row:N", title=""),
color=alt.Color("corr:Q", scale=alt.Scale(domain=(-1, 1), scheme="redblue"), legend=alt.Legend(title="corr")),
tooltip=["row", "col", alt.Tooltip("corr:Q", format=".3f")],
)
)
chart = configure_chart(chart, title, width=400, height=400)
out_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(out_path)
inject_font_css(out_path)
print(f"Saved {out_path}")
def generate_total_plots() -> None:
df = prepare_client_data()
out_base = OUTPUT_DIR / "orders_amt_total"
save_scatter_trend_quad(
df,
y_col="orders_amt_total",
out_path=out_base / "scatter_trend_quad.html",
x_max=bmp.DEFAULT_X_MAX,
y_max=bmp.DEFAULT_Y_MAX,
savgol_window=bmp.DEFAULT_SAVGOL_WINDOW,
title="Заказы vs средние показы (все клиенты)",
)
def generate_category_plots() -> None:
client = prepare_category_client_data()
x_max_overrides = {
"ent": 4,
"transport": 6,
"super": 4,
"avia": 4,
"shopping": 4,
"avia_hotel": 5,
}
y_max_overrides = {
"ent": 2.5,
"transport": 8,
"avia": 1.5,
"shopping": 2.5,
"super": 5.5,
"avia_hotel": 2.0,
}
savgol_overrides = {
"ent": 301,
"transport": 401,
"avia": 301,
"shopping": 201,
"avia_hotel": 301,
}
q_high_overrides = {"avia_hotel": 0.9}
iqr_overrides = {"avia_hotel": 1.2}
cats_all = CATEGORIES + list(COMBINED.keys())
# Correlation heatmaps
corr_dir = OUTPUT_DIR / "correlations"
for cat in cats_all:
cols = [f"{base}_{cat}" for base in BASE_COLUMNS]
save_correlation_heatmap(
client,
cols,
title=f"Корреляции показов/кликов/заказов: {cat}",
out_path=corr_dir / f"corr_{cat}.html",
)
# Scatter clouds + quadratic fit
for cat in cats_all:
y_col = f"orders_amt_{cat}"
x_col = f"avg_imp_per_day_{cat}"
out_dir = OUTPUT_DIR / y_col
save_scatter_trend_quad(
client,
y_col=y_col,
out_path=out_dir / "scatter_trend_quad.html",
x_col=x_col,
x_max=x_max_overrides.get(cat, bmp.DEFAULT_X_MAX),
y_max=y_max_overrides.get(cat, bmp.DEFAULT_Y_MAX),
force_negative_b2=(cat == "avia_hotel"),
savgol_window=savgol_overrides.get(cat, bmp.DEFAULT_SAVGOL_WINDOW),
title=f"{y_col} vs {x_col}",
)
def generate_basic_scatters() -> None:
    """Reproduce the set from best_model_and_plots: all points, outliers removed, outliers removed + trend."""
df = prepare_client_data()
y_col = "orders_amt_total"
x_col = bmp.X_COL
x_max = bmp.DEFAULT_X_MAX
y_max = bmp.DEFAULT_Y_MAX
out_dir = OUTPUT_DIR / y_col
out_dir.mkdir(parents=True, exist_ok=True)  # chart.save() does not create missing directories
base = df[[x_col, y_col]].dropna()
base = bmp.filter_x_range(base, x_col, x_max)
base = base.copy()
base["alpha"] = compute_density_alpha(base, x_col, y_col, x_max, y_max)
def scatter_chart(data: pd.DataFrame, title: str, trend: Tuple[np.ndarray, np.ndarray] | None = None) -> alt.Chart:
x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max)
y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False)
points = alt.Chart(data).mark_circle(size=40).encode(
x=alt.X(x_col, title="Среднее число показов в день", scale=x_scale),
y=alt.Y(y_col, title=y_col, scale=y_scale),
opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)),
color=alt.value(bmp.DEFAULT_SCATTER_COLOR),
tooltip=[x_col, y_col],
)
layers = [points]
if trend is not None and trend[0] is not None:
trend_df = pd.DataFrame({x_col: trend[0], "trend": trend[1]})
layers.append(
alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("trend", scale=y_scale),
)
)
chart = alt.layer(*layers).resolve_scale(opacity="independent")
return configure_chart(chart, title, width=800, height=600)
# 1) all points
scatter_chart(base, "Облако: все точки").save(out_dir / "scatter_all.html")
inject_font_css(out_dir / "scatter_all.html")
# 2) outliers removed
cleaned = bmp.remove_outliers(base, y_col=y_col, x_col=x_col, iqr_k=bmp.DEFAULT_IQR_K, q_low=bmp.DEFAULT_Q_LOW, q_high=bmp.DEFAULT_Q_HIGH)
cleaned = cleaned.copy()
cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max)
scatter_chart(cleaned, "Облако: без выбросов").save(out_dir / "scatter_clean.html")
inject_font_css(out_dir / "scatter_clean.html")
# 3) outliers removed + trend
tx, ty = bmp.compute_trend(
cleaned,
y_col=y_col,
x_col=x_col,
method=bmp.DEFAULT_TREND_METHOD,
lowess_frac=bmp.DEFAULT_TREND_FRAC,
savgol_window=bmp.DEFAULT_SAVGOL_WINDOW,
)
scatter_chart(cleaned, "Облако: без выбросов + тренд", trend=(tx, ty)).save(out_dir / "scatter_clean_trend.html")
inject_font_css(out_dir / "scatter_clean_trend.html")
def main() -> None:
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
generate_basic_scatters()
generate_total_plots()
generate_category_plots()
if __name__ == "__main__":
main()

15 binary image files added (47-440 KiB); previews not shown.

11 file diffs suppressed because one or more lines are too long.

8 binary image files added (133-147 KiB); previews not shown.

5 file diffs suppressed because one or more lines are too long.

View File

@@ -0,0 +1,55 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "3d3a9c98",
"metadata": {},
"source": [
"# EDA «Communications in the City»\n",
"\n",
"## Data at a glance\n",
"- 118189 rows, 8339 clients, 35 original columns + engineered ones (totals, CTR/CR, flags).\n",
"- Date range: 2025-01-09 to 2025-11-04 (284 days).\n",
"- Service categories: ent, super, transport, shopping, hotel, avia; active and passive impressions/clicks, orders by category.\n",
"- Duplicates by key (id, business_dt): none.\n",
"\n",
"## Data quality\n",
"- Missing values: negligible (almost no NaN); no negative values found.\n",
"- Age: 15-80 years, p1/p99 = 22/68, no junk values (<14 or >100).\n",
"- Gender: 68.5% M, 31.5% F. Platform after normalization: ~52.5% iOS, ~46.7% Android, 1.1% iPadOS.\n",
"- «Spam load» features and per-client aggregates added: imp/click/order totals, CTR/CR, contact_days, avg_impressions_per_contact_day, order_categories_count.\n",
"\n",
"## Channels and effectiveness (aggregated over all rows)\n",
"- Active impressions ≈ 219.5k, passive impressions ≈ 473.1k.\n",
"- Active clicks ≈ 147.3k (CTR_active ≈ 0.67), passive clicks ≈ 18.1k (CTR_passive ≈ 0.038).\n",
"- Orders total: 12439; CR click→order ≈ 7.5%, CR imp→order ≈ 1.8%.\n",
"- Daily points: 284; daily aggregates prepared (CTR/CR, day_of_week).\n",
"\n",
"## Demographics and device vs effectiveness (client-level aggregates)\n",
"- Tables by gender/age group/platform are ready in `04_clients_segmentation.ipynb` (mean impressions/clicks/orders and CTR/CR).\n",
"- Hypotheses: notebook 05 adds Mann-Whitney examples for CTR active vs passive and by gender; can be extended to platforms and age.\n",
"\n",
"## Lags and seasonality\n",
"- Daily series and CTR/CR metrics over time and by day of week: see `03_time_and_lags.ipynb`.\n",
"- Lags: cross-correlations of orders vs impressions/clicks (hotel, avia) for lags 0-7 are implemented; per client: first_imp/click/order and distributions of days to order.\n",
"\n",
"## Segments and «fatigue»\n",
"- Channel segments: only_active / only_passive / both + metrics; bins by number of order categories.\n",
"- «Spam load»: bins by avg_impressions_per_contact_day with CTR/CR; stacked shares of order categories by age.\n",
"\n",
"## Models as part of EDA\n",
"- A client-level dataset was built for the `has_any_order` task; pipeline with OHE + StandardScaler + LogisticRegression and RandomForest (ROC-AUC and importances).\n",
"- Conclusions on coefficients/importances are available in `05_exploratory_models.ipynb`.\n",
"\n",
"## Next steps\n",
"- Run all notebooks end-to-end (data is ready, dependencies in `.venv`): `jupyter lab` or `jupyter nbconvert --execute`.\n",
"- Refine category normalization (if needed) and optionally save `dataset/ds_clean.parquet` (flag in `01_load_and_clean.ipynb`).\n",
"- Add or update business hypotheses (categories, platforms, age) and record p-values in the hypotheses table.\n",
"- Optionally strengthen visualization: calendar heatmaps of CTR, ECDF of lags per category, PDP for the model's top features.\n"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,41 @@
# EDA «Communications in the City»
## Data at a glance
- 118189 rows, 8339 clients, 35 original columns + engineered ones (totals, CTR/CR, flags).
- Date range: 2025-01-09 to 2025-11-04 (284 days).
- Service categories: ent, super, transport, shopping, hotel, avia; active and passive impressions/clicks, orders by category.
- Duplicates by key (id, business_dt): none.
## Data quality
- Missing values: negligible (almost no NaN); no negative values found.
- Age: 15-80 years, p1/p99 = 22/68, no junk values (<14 or >100).
- Gender: 68.5% M, 31.5% F. Platform after normalization: ~52.5% iOS, ~46.7% Android, 1.1% iPadOS.
- «Spam load» features and per-client aggregates added: imp/click/order totals, CTR/CR, contact_days, avg_impressions_per_contact_day, order_categories_count.
## Channels and effectiveness (aggregated over all rows)
- Active impressions ≈ 219.5k, passive impressions ≈ 473.1k.
- Active clicks ≈ 147.3k (CTR_active ≈ 0.67), passive clicks ≈ 18.1k (CTR_passive ≈ 0.038).
- Orders total: 12439; CR click→order ≈ 7.5%, CR imp→order ≈ 1.8%.
- Daily points: 284; daily aggregates prepared (CTR/CR, day_of_week).
## Demographics and device vs effectiveness (client-level aggregates)
- Tables by gender/age group/platform are ready in `04_clients_segmentation.ipynb` (mean impressions/clicks/orders and CTR/CR).
- Hypotheses: notebook 05 adds Mann-Whitney examples for CTR active vs passive and by gender; can be extended to platforms and age.
## Lags and seasonality
- Daily series and CTR/CR metrics over time and by day of week: see `03_time_and_lags.ipynb`.
- Lags: cross-correlations of orders vs impressions/clicks (hotel, avia) for lags 0-7 are implemented; per client: first_imp/click/order and distributions of days to order.
## Segments and «fatigue»
- Channel segments: only_active / only_passive / both + metrics; bins by number of order categories.
- «Spam load»: bins by avg_impressions_per_contact_day with CTR/CR; stacked shares of order categories by age.
## Models as part of EDA
- A client-level dataset was built for the `has_any_order` task; pipeline with OHE + StandardScaler + LogisticRegression and RandomForest (ROC-AUC and importances).
- Conclusions on coefficients/importances are available in `05_exploratory_models.ipynb`.
## Next steps
- Run all notebooks end-to-end (data is ready, dependencies in `.venv`): `jupyter lab` or `jupyter nbconvert --execute`.
- Refine category normalization (if needed) and optionally save `dataset/ds_clean.parquet` (flag in `01_load_and_clean.ipynb`).
- Add or update business hypotheses (categories, platforms, age) and record p-values in the hypotheses table.
- Optionally strengthen visualization: calendar heatmaps of CTR, ECDF of lags per category, PDP for the model's top features.

View File

@@ -0,0 +1,154 @@
from __future__ import annotations
from pathlib import Path
from typing import Callable, Dict, Iterable, List
import numpy as np
import pandas as pd
# Paths and column groups
DATA_PATH = Path("dataset/ds.csv")
CATEGORIES: List[str] = ["ent", "super", "transport", "shopping", "hotel", "avia"]
ACTIVE_IMP_COLS = [f"active_imp_{c}" for c in CATEGORIES]
PASSIVE_IMP_COLS = [f"passive_imp_{c}" for c in CATEGORIES]
ACTIVE_CLICK_COLS = [f"active_click_{c}" for c in CATEGORIES]
PASSIVE_CLICK_COLS = [f"passive_click_{c}" for c in CATEGORIES]
ORDER_COLS = [f"orders_amt_{c}" for c in CATEGORIES]
NUMERIC_COLS = (
ACTIVE_IMP_COLS
+ PASSIVE_IMP_COLS
+ ACTIVE_CLICK_COLS
+ PASSIVE_CLICK_COLS
+ ORDER_COLS
+ ["age"]
)
CAT_COLS = ["gender_cd", "device_platform_cd"]
def safe_divide(numerator: pd.Series | float, denominator: pd.Series | float) -> pd.Series:
"""Divide with protection against zero (works for Series and scalars)."""
if isinstance(denominator, pd.Series):
denom = denominator.replace(0, np.nan)
else:
denom = np.nan if float(denominator) == 0 else denominator
return numerator / denom
def normalize_gender(series: pd.Series) -> pd.Series:
cleaned = series.fillna("UNKNOWN").astype(str).str.strip().str.upper()
mapping = {"M": "M", "MALE": "M", "F": "F", "FEMALE": "F"}
return cleaned.map(mapping).fillna("UNKNOWN")
def normalize_device(series: pd.Series) -> pd.Series:
cleaned = series.fillna("unknown").astype(str).str.strip()
lowered = cleaned.str.lower().str.replace(" ", "").str.replace("_", "")
mapping = {"android": "Android", "ios": "iOS", "ipados": "iPadOS", "ipad": "iPadOS"}
mapped = lowered.map(mapping)
fallback = cleaned.str.title()
return mapped.fillna(fallback)
def add_age_group(df: pd.DataFrame) -> pd.DataFrame:
bins = [0, 25, 35, 45, 55, np.inf]
labels = ["<25", "25-34", "35-44", "45-54", "55+"]
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)
return df
def add_totals(df: pd.DataFrame) -> pd.DataFrame:
df["active_imp_total"] = df[ACTIVE_IMP_COLS].sum(axis=1)
df["passive_imp_total"] = df[PASSIVE_IMP_COLS].sum(axis=1)
df["active_click_total"] = df[ACTIVE_CLICK_COLS].sum(axis=1)
df["passive_click_total"] = df[PASSIVE_CLICK_COLS].sum(axis=1)
df["orders_amt_total"] = df[ORDER_COLS].sum(axis=1)
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["active_ctr"] = safe_divide(df["active_click_total"], df["active_imp_total"])
df["passive_ctr"] = safe_divide(df["passive_click_total"], df["passive_imp_total"])
df["ctr_all"] = safe_divide(df["click_total"], df["imp_total"])
df["cr_click2order"] = safe_divide(df["orders_amt_total"], df["click_total"])
df["cr_imp2order"] = safe_divide(df["orders_amt_total"], df["imp_total"])
return df
def add_flags(df: pd.DataFrame) -> pd.DataFrame:
df["has_active_comm"] = (df[ACTIVE_IMP_COLS + ACTIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
df["has_passive_comm"] = (df[PASSIVE_IMP_COLS + PASSIVE_CLICK_COLS].sum(axis=1) > 0).astype(int)
df["has_any_order"] = (df[ORDER_COLS].sum(axis=1) > 0).astype(int)
df["order_categories_count"] = (df[ORDER_COLS] > 0).sum(axis=1)
return df
def load_data(path: Path | str = DATA_PATH) -> pd.DataFrame:
df = pd.read_csv(path)
df["business_dt"] = pd.to_datetime(df["business_dt"])
df["gender_cd"] = normalize_gender(df["gender_cd"])
df["device_platform_cd"] = normalize_device(df["device_platform_cd"])
df = add_age_group(df)
df = add_totals(df)
df = add_flags(df)
return df
def describe_zero_share(df: pd.DataFrame, cols: Iterable[str]) -> pd.DataFrame:
stats = []
for col in cols:
series = df[col]
stats.append(
{
"col": col,
"count": series.count(),
"mean": series.mean(),
"median": series.median(),
"std": series.std(),
"min": series.min(),
"q25": series.quantile(0.25),
"q75": series.quantile(0.75),
"max": series.max(),
"share_zero": (series == 0).mean(),
"p95": series.quantile(0.95),
"p99": series.quantile(0.99),
}
)
return pd.DataFrame(stats)
def build_daily(df: pd.DataFrame) -> pd.DataFrame:
agg_cols = ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS
daily = df.groupby("business_dt")[agg_cols].sum().reset_index()
daily = add_totals(daily)
daily["day_of_week"] = daily["business_dt"].dt.day_name()
return daily
def build_client(df: pd.DataFrame) -> pd.DataFrame:
agg_spec: Dict[str, str] = {col: "sum" for col in ACTIVE_IMP_COLS + PASSIVE_IMP_COLS + ACTIVE_CLICK_COLS + PASSIVE_CLICK_COLS + ORDER_COLS}
meta_spec: Dict[str, str | Callable] = {
"age": "median",
"gender_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "UNKNOWN",
"age_group": lambda s: s.mode().iat[0] if not s.mode().empty else np.nan,
"device_platform_cd": lambda s: s.mode().iat[0] if not s.mode().empty else "Other",
}
agg_spec.update(meta_spec)
client = df.groupby("id").agg(agg_spec).reset_index()
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
imp_day = df.copy()
imp_day["imp_day_total"] = imp_day[ACTIVE_IMP_COLS + PASSIVE_IMP_COLS].sum(axis=1)
max_imp_day = imp_day.groupby("id")["imp_day_total"].max().rename("max_impressions_per_day")
client = add_totals(client)
client = add_flags(client)
client = client.merge(contact_days, on="id", how="left")
client = client.merge(max_imp_day, on="id", how="left")
client = add_contact_density(client)
return client
def add_contact_density(df: pd.DataFrame) -> pd.DataFrame:
# contact_days must already be present
if "contact_days" in df.columns:
df["avg_impressions_per_contact_day"] = safe_divide(df["imp_total"], df["contact_days"])
return df
return df

View File

@@ -0,0 +1,368 @@
# Plan for a full pre-analysis of the «Communications in the City» dataset
Based on the dataset description: daily communications with clients of the «T-Bank City» ecosystem, active/passive channels, impressions/clicks and orders by category (ent, super, transport, shopping, hotel, avia), plus demographics and device.
Notation:
- `*_imp_*`: impressions of active/passive channels by category (`ent`, `super`, `transport`, `shopping`, `hotel`, `avia`).
- `*_click_*`: clicks/taps for the same categories.
- `orders_amt_*`: number of orders by category.
- `gender_cd`, `age`, `device_platform_cd`: demographics and device.
---
## 0. Technical skeleton of the project
Files/notebooks:
1. `01_load_and_clean.ipynb`: loading, cleaning, basic descriptions.
2. `02_univariate_bivariate.ipynb`: distributions and relationships between features.
3. `03_time_and_lags.ipynb`: time, lags, seasonality.
4. `04_clients_segmentation.ipynb`: per-client aggregates, segments.
5. `05_exploratory_models.ipynb`: simple models as part of EDA.
6. `eda_report.md` / `eda_report.ipynb`: final report.
---
## 1. Loading and data structure
### Tables/outputs
1. `df.info()`: column list, dtypes, non-null counts.
2. `df.head(5)`: first rows for a visual check.
3. Dimensions:
   - `n_rows`, `n_cols`
   - `n_unique_clients = df['id'].nunique()`
   - date range: `min(business_dt)`, `max(business_dt)`
4. Key check:
   - table: `df.groupby(['id', 'business_dt']).size().value_counts().head()`
   (shows whether the key has duplicates)
5. Average number of days per client:
   - `df.groupby('id').size().describe()`
### Plots
1. Number of records per date:
   - `bar/line`: X = `business_dt`, Y = `count(*)`
   - goal: spot gaps/spikes in the extract
---
## 2. Data quality and anomalies
### Tables/metrics
1. Missing values:
   - table: column → number of missing → share of missing
2. Basic `describe` over numeric columns:
   - `df[num_cols].describe().T`
3. Share of zeros:
   - table: column → share of zeros → min/max → 95th and 99th percentiles
4. Logical checks:
   - all `*_imp_*`, `*_click_*`, `orders_amt_*` must be `>= 0`
   - look for negative/odd values
5. Age:
   - min/max, percentiles (1st, 99th), share of junk (e.g. `<14` or `>100`)
6. Categorical:
   - unique values of `gender_cd`, `device_platform_cd`
   - normalize to a single format (trim, upper, `unknown`)
### Plots
1. Boxplot of age:
   - Y = `age`
   - goal: outliers and junk
2. Barplot of missing values:
   - X = column, Y = share of NaN (only where NaN > 0)
---
## 3. Univariate analysis
### 3.1. Numeric features (impressions/clicks/orders)
#### Tables
1. For each group (`active_imp_*`, `passive_imp_*`, `active_click_*`, `passive_click_*`, `orders_amt_*`):
   - `count, mean, median, std, min, q25, q75, max, share_zero, p95, p99`
2. Aggregates across all categories:
   - create `active_imp_total`, `passive_imp_total`, `active_click_total`, `passive_click_total`, `orders_amt_total`
   - a `describe()` table for them
#### Plots
1. Histograms (log scale or `log1p`) for each category and type (see the sketch after this list):
   - `active_imp_ent`, `active_click_ent`, `passive_imp_ent`, `orders_amt_ent`, …
2. Boxplots for the aggregates:
   - `active_imp_total`, `passive_imp_total`, `active_click_total`, `passive_click_total`, `orders_amt_total`
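A minimal sketch of such a histogram, assuming the flat `dataset/ds.csv` extract and the column naming from the notation above (the output file name is illustrative):

```python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

df = pd.read_csv("dataset/ds.csv")
col = "active_imp_ent"  # repeat for the other channel/category columns

# log1p keeps the zero-heavy count distribution on a readable scale.
plt.figure(figsize=(8, 4))
plt.hist(np.log1p(df[col]), bins=50)
plt.xlabel(f"log1p({col})")
plt.ylabel("rows")
plt.title(f"Distribution of {col}")
plt.tight_layout()
plt.savefig(f"hist_{col}.png", dpi=150)
plt.close()
```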
### 3.2. Categorical features
#### Tables
1. Distribution of `gender_cd`: counts, shares, `unknown`
2. Distribution of `device_platform_cd`: counts, shares
3. Age groups:
   - `<25`, `25-34`, `35-44`, `45-54`, `55+`
   - table: group → number of clients → share
#### Plots
1. Barplot of gender: X = `M/F/Unknown`, Y = share
2. Barplot of platforms: X = platform, Y = share
3. Age histogram
---
## 4. Time and seasonality
Build daily aggregates (a sketch follows after this list):
- sums of impressions/clicks/orders per day
- metrics:
   - `CTR_active = active_click_total / active_imp_total`
   - `CTR_passive = passive_click_total / passive_imp_total`
   - `CR_click2order = orders_amt_total / (active_click_total + passive_click_total)`
   - `CR_imp2order = orders_amt_total / (active_imp_total + passive_imp_total)`
- day of week: `day_of_week`
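A minimal sketch of the daily aggregation and metrics, assuming the flat `dataset/ds.csv` extract with the column groups from the notation above:

```python
import numpy as np
import pandas as pd

CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
PREFIXES = ["active_imp", "passive_imp", "active_click", "passive_click", "orders_amt"]

df = pd.read_csv("dataset/ds.csv", parse_dates=["business_dt"])

# Roll the per-category columns up into channel-level totals.
for prefix in PREFIXES:
    df[f"{prefix}_total"] = df[[f"{prefix}_{c}" for c in CATEGORIES]].sum(axis=1)

daily = df.groupby("business_dt", as_index=False)[[f"{p}_total" for p in PREFIXES]].sum()

# Effectiveness metrics; replacing 0 with NaN guards against division by zero.
daily["CTR_active"] = daily["active_click_total"] / daily["active_imp_total"].replace(0, np.nan)
daily["CTR_passive"] = daily["passive_click_total"] / daily["passive_imp_total"].replace(0, np.nan)
clicks = daily["active_click_total"] + daily["passive_click_total"]
imps = daily["active_imp_total"] + daily["passive_imp_total"]
daily["CR_click2order"] = daily["orders_amt_total"] / clicks.replace(0, np.nan)
daily["CR_imp2order"] = daily["orders_amt_total"] / imps.replace(0, np.nan)
daily["day_of_week"] = daily["business_dt"].dt.day_name()
print(daily.head())
```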
### Tables
1. `daily.describe()` over the daily aggregates
2. Table by day of week:
   - `day_of_week` → mean `impressions, clicks, orders, CTR, CR`
### Plots
1. Time-series lines:
   - `business_dt` vs total impressions
   - `business_dt` vs total clicks
   - `business_dt` vs total orders
2. CTR/CR lines over time (optionally a 7-day rolling mean):
   - `active_ctr`, `passive_ctr`, `cr_click2order`
3. Seasonality by day of week:
   - barplots for `active_ctr`, `passive_ctr`, `cr_click2order`
4. (Optional) calendar heatmap of orders/CTR
---
## 5. Pairwise relationships (bivariate)
### Tables
1. Spearman correlations (at the client/day level):
   - between all numeric features + `age`
2. For each category:
   - bin impressions by quantiles → mean `imp, click, CTR, orders, CR` (see the sketch below)
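A minimal sketch of the quantile-bin table for one category (the category choice and number of bins are illustrative):

```python
import numpy as np
import pandas as pd

df = pd.read_csv("dataset/ds.csv")
cat = "hotel"  # any of: ent, super, transport, shopping, hotel, avia

binned = pd.DataFrame({
    "imp": df[f"active_imp_{cat}"],
    "click": df[f"active_click_{cat}"],
    "orders": df[f"orders_amt_{cat}"],
})
# Bin only rows that actually had impressions; duplicates="drop" copes with ties.
binned = binned[binned["imp"] > 0]
binned["bin"] = pd.qcut(binned["imp"], q=10, duplicates="drop")

table = binned.groupby("bin", observed=False)[["imp", "click", "orders"]].mean()
table["CTR"] = table["click"] / table["imp"].replace(0, np.nan)
table["CR"] = table["orders"] / table["click"].replace(0, np.nan)
print(table)
```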
### Plots
1. Scatter/hexbin "impressions → clicks":
   - `active_imp_*` vs `active_click_*`
   - `passive_imp_*` vs `passive_click_*`
2. Scatter "clicks → orders":
   - `*_click_*` vs `orders_amt_*`
3. CTR by impression bins (line/bar)
4. CR by click bins (line/bar)
5. Correlation heatmap
---
## 6. Demographics and device vs effectiveness
Aggregate per client:
- sums of impressions/clicks/orders
- client-level CTR/CR
- add `gender_cd`, `age_group`, `device_platform_cd`
### Tables
1. By gender:
   - mean `impressions, clicks, orders, CTR, CR`
2. By age group:
   - the same metrics
3. By platform:
   - the same metrics
4. Hypothesis tests (Mann-Whitney / t-test):
   - difference in CTR/CR between groups
### Plots
1. Barplot of CTR/CR by gender
2. Barplot of CTR/CR by age group
3. Barplot of CTR/CR by platform
4. Boxplot of orders by age group
5. Stacked bar: age → shares of order categories (service mix in the basket)
---
## 7. Client-level behavior and segmentation
### 7.1. Simple segments
Client-level flags:
- `has_active_comm`, `has_passive_comm`
- `has_any_order`
- `order_categories_count` (in how many categories the client has orders)
#### Tables
1. Channel segments:
   - `only_active`, `only_passive`, `both`
   - share of clients, mean orders, CTR/CR
2. Multi-category segments:
   - `1`, `2`, `3+` order categories
   - mean communications/orders, demographics
#### Plots
1. Barplot by channel segment:
   - mean orders, CTR/CR
2. Barplot by number of order categories
3. Stacked bar: segments → gender/age (optional)
### 7.2. Clustering (extended EDA)
1. Feature vector:
   - per-category sums + CTR/CR + order shares
2. Normalization
3. KMeans / GMM, 3-7 clusters (a sketch follows after this list)
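A minimal clustering sketch under these steps; the feature subset and `n_clusters` are illustrative:

```python
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
df = pd.read_csv("dataset/ds.csv")

# Client-level sums per category; impressions + orders as a simple feature vector.
feature_cols = [f"active_imp_{c}" for c in CATEGORIES] + [f"orders_amt_{c}" for c in CATEGORIES]
client = df.groupby("id")[feature_cols].sum()

X = StandardScaler().fit_transform(client.fillna(0))
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
client["cluster"] = kmeans.fit_predict(X)

# Cluster profile: size and mean feature values per cluster.
profile = client.groupby("cluster")[feature_cols].mean()
profile["size"] = client["cluster"].value_counts().sort_index()
print(profile.round(2))
```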
#### Tables
- cluster → size → mean features → brief interpretation
#### Plots
1. Cluster profiles (bar/radar)
2. PCA/UMAP scatter: color = cluster
---
## 8. Funnel: impressions → clicks → orders
### Tables
1. Overall funnel (see the sketch below):
   - `channel_type`, `category`, `impressions`, `clicks`, `orders`, `CTR`, `CR_click2order`, `CR_imp2order`
2. Funnel by segment:
   - gender/age/platform → the same metrics
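A minimal sketch of the overall funnel table; note that `orders_amt_*` is not split by channel in the data, so the same order totals are repeated for both channel rows here:

```python
import numpy as np
import pandas as pd

CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
df = pd.read_csv("dataset/ds.csv")

rows = []
for channel in ("active", "passive"):
    for cat in CATEGORIES:
        imp = df[f"{channel}_imp_{cat}"].sum()
        clicks = df[f"{channel}_click_{cat}"].sum()
        orders = df[f"orders_amt_{cat}"].sum()
        rows.append({
            "channel_type": channel,
            "category": cat,
            "impressions": imp,
            "clicks": clicks,
            "orders": orders,
            "CTR": clicks / imp if imp else np.nan,
            "CR_click2order": orders / clicks if clicks else np.nan,
            "CR_imp2order": orders / imp if imp else np.nan,
        })

funnel = pd.DataFrame(rows)
print(funnel)
```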
### Plots
1. Funnel chart active vs passive (overall)
2. Barplot of CTR by category + active/passive comparison
3. Barplot of CR by category + active/passive comparison
4. Funnel/bar by age group
---
## 9. Time lags between communications and orders
Accounting for "late purchases" (especially travel).
### 9.1. Lags at the daily level
#### Tables
1. Lag features `lag1..lag7` for impressions/clicks
2. Cross-correlation (see the sketch below):
   - lag → `corr(orders_t, impressions_{t-lag})`
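A minimal sketch of the daily cross-correlation for one category (the category and lag range are illustrative):

```python
import pandas as pd

df = pd.read_csv("dataset/ds.csv", parse_dates=["business_dt"])
cat = "hotel"

daily = (
    df.groupby("business_dt")[[f"active_imp_{cat}", f"passive_imp_{cat}", f"orders_amt_{cat}"]]
    .sum()
    .sort_index()
)
daily["imp"] = daily[f"active_imp_{cat}"] + daily[f"passive_imp_{cat}"]

# corr(orders_t, impressions_{t-lag}): shifting impressions by `lag` days aligns day t with day t-lag.
for lag in range(8):
    corr = daily[f"orders_amt_{cat}"].corr(daily["imp"].shift(lag))
    print(f"lag={lag}: corr={corr:.3f}")
```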
#### Plots
1. "Lag vs correlation" lines for:
   - `hotel`, `avia` (and others if desired)
   - active vs passive
### 9.2. Lags at the client level
#### Tables
1. `first_imp_date`, `first_click_date`, `first_order_date`
2. `days_to_order`
3. Quantiles of `days_to_order` by category (see the sketch below)
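A minimal sketch of `days_to_order` for one category, taking the first impression and the first order per client (the category is illustrative):

```python
import pandas as pd

df = pd.read_csv("dataset/ds.csv", parse_dates=["business_dt"])
cat = "avia"

has_imp = (df[f"active_imp_{cat}"] + df[f"passive_imp_{cat}"]) > 0
has_order = df[f"orders_amt_{cat}"] > 0

first_imp = df[has_imp].groupby("id")["business_dt"].min().rename("first_imp_date")
first_order = df[has_order].groupby("id")["business_dt"].min().rename("first_order_date")

# Only clients who had both an impression and an order in the category.
lags = pd.concat([first_imp, first_order], axis=1).dropna()
lags["days_to_order"] = (lags["first_order_date"] - lags["first_imp_date"]).dt.days
print(lags["days_to_order"].quantile([0.25, 0.5, 0.75, 0.9]))
```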
#### Plots
1. Histogram/ECDF of `days_to_order` by category
---
## 10. Multi-channel behavior and "spam load"
### Tables
1. `contact_days`, `avg_impressions_per_contact_day`, `max_impressions_per_day`
2. Bins by `avg_impressions_per_contact_day` → mean CTR/CR (see the sketch below)
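A minimal sketch of the spam-level bins on client-level aggregates (the number of bins is illustrative):

```python
import numpy as np
import pandas as pd

CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
df = pd.read_csv("dataset/ds.csv", parse_dates=["business_dt"])

imp_cols = [f"{p}_imp_{c}" for p in ("active", "passive") for c in CATEGORIES]
click_cols = [f"{p}_click_{c}" for p in ("active", "passive") for c in CATEGORIES]
order_cols = [f"orders_amt_{c}" for c in CATEGORIES]

g = df.groupby("id")
client = pd.DataFrame({
    "imp_total": g[imp_cols].sum().sum(axis=1),
    "click_total": g[click_cols].sum().sum(axis=1),
    "orders_total": g[order_cols].sum().sum(axis=1),
    "contact_days": g["business_dt"].nunique(),
})
client["avg_impressions_per_contact_day"] = client["imp_total"] / client["contact_days"].replace(0, np.nan)
client["ctr"] = client["click_total"] / client["imp_total"].replace(0, np.nan)
client["cr"] = client["orders_total"] / client["click_total"].replace(0, np.nan)

# Quintile bins by spam level; compare median CTR/CR across bins.
client["spam_bin"] = pd.qcut(client["avg_impressions_per_contact_day"], 5, duplicates="drop")
print(client.groupby("spam_bin", observed=False)[["ctr", "cr"]].median())
```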
### Plots
1. Histogram of `avg_impressions_per_contact_day`
2. Line/bar: CTR/CR vs spam level
---
## 11. Simple models as part of EDA
### 11.1. Binary model "has order / no order"
Target:
- `has_any_order`
Features:
- sums of impressions/clicks by type and category
- CTR/CR
- demographics and platform
#### Tables
1. Logistic regression (see the sketch below):
   - coefficients, p-values, odds ratios
2. Feature importances from a tree/forest
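A minimal sketch of the logistic regression with odds ratios, using statsmodels (already a dependency of the plotting scripts); the feature subset is illustrative:

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm

CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
df = pd.read_csv("dataset/ds.csv")

imp_cols = [f"{p}_imp_{c}" for p in ("active", "passive") for c in CATEGORIES]
click_cols = [f"{p}_click_{c}" for p in ("active", "passive") for c in CATEGORIES]
order_cols = [f"orders_amt_{c}" for c in CATEGORIES]

g = df.groupby("id")
client = pd.DataFrame({
    "imp_total": g[imp_cols].sum().sum(axis=1),
    "click_total": g[click_cols].sum().sum(axis=1),
    "orders_total": g[order_cols].sum().sum(axis=1),
    "age": g["age"].median(),
})
client["has_any_order"] = (client["orders_total"] > 0).astype(int)

features = ["imp_total", "click_total", "age"]  # illustrative subset
X = sm.add_constant(client[features].fillna(client[features].median()))
model = sm.Logit(client["has_any_order"], X).fit(disp=False)

# Coefficients, odds ratios and p-values in one table.
report = pd.DataFrame({
    "coef": model.params,
    "odds_ratio": np.exp(model.params),
    "p_value": model.pvalues,
})
print(report)
```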
#### Plots
1. Barplot of importances
2. (Optional) partial dependence for 2-3 key features
---
## 12. Hypotheses and statistical tests
### Example hypotheses
1. `CTR_active > CTR_passive`
2. CR differs between service categories
3. CTR/CR differ by gender/age/platform
4. "Spam load" lowers CTR/CR beyond a threshold
### Hypotheses table
- hypothesis, H0/H1, test, p-value, conclusion, business interpretation
Supporting plots: reuse those from sections 4-10 (barplots/boxplots). A test sketch for hypothesis 1 follows below.
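A minimal sketch of a one-sided Mann-Whitney test for hypothesis 1 on client-level CTRs:

```python
import numpy as np
import pandas as pd
from scipy import stats

CATEGORIES = ["ent", "super", "transport", "shopping", "hotel", "avia"]
df = pd.read_csv("dataset/ds.csv")

g = df.groupby("id")
active_imp = g[[f"active_imp_{c}" for c in CATEGORIES]].sum().sum(axis=1)
active_click = g[[f"active_click_{c}" for c in CATEGORIES]].sum().sum(axis=1)
passive_imp = g[[f"passive_imp_{c}" for c in CATEGORIES]].sum().sum(axis=1)
passive_click = g[[f"passive_click_{c}" for c in CATEGORIES]].sum().sum(axis=1)

ctr_active = (active_click / active_imp.replace(0, np.nan)).dropna()
ctr_passive = (passive_click / passive_imp.replace(0, np.nan)).dropna()

# H0: the distributions are equal; H1: CTR_active is stochastically greater than CTR_passive.
stat, p_value = stats.mannwhitneyu(ctr_active, ctr_passive, alternative="greater")
print(f"U={stat:.0f}, p-value={p_value:.3g}")
```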
---
## 13. Final documentation
1. Summary of findings:
   - data quality
   - effectiveness of channels/categories
   - segments where communications work better/worse
   - lags (how quickly clients buy after contacts)
   - signs of communication "fatigue"
2. List of data issues and cleaning decisions
3. List of business insights
4. List of features for future models
5. Next steps:
   - building the ML pipeline
   - list of A/B hypotheses
   - what extra data to collect (if needed)
---

152
old data/quadreg.py Normal file
View File

@@ -0,0 +1,152 @@
from pathlib import Path
from typing import Optional, Tuple
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import r2_score, roc_auc_score
import best_model_and_plots as bmp
# Constants shared with the scatter script
X_COL = bmp.X_COL
Y_COL = "orders_amt_total"
X_MAX = bmp.DEFAULT_X_MAX
Y_MIN = bmp.DEFAULT_Y_MIN
Y_MAX = bmp.DEFAULT_Y_MAX
def fit_quadratic(
cleaned: bmp.pd.DataFrame,
trend_data: Optional[Tuple[np.ndarray, np.ndarray]],
*,
x_col: str = X_COL,
y_col: str = Y_COL,
x_max: float = X_MAX,
) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]:
    """Fit y ~ 1 + x + x^2. If a trend is provided, use it as the target for r2_trend."""
df = cleaned[[x_col, y_col]].dropna()
if len(df) < 3:
return None, {}
if trend_data is not None and trend_data[0] is not None:
tx, ty = trend_data
tx = np.asarray(tx)
ty = np.asarray(ty)
mask = (tx <= x_max) & ~np.isnan(ty)
tx = tx[mask]
ty = ty[mask]
else:
tx = ty = None
x = df[x_col].to_numpy()
y = df[y_col].to_numpy()
X_design = sm.add_constant(np.column_stack([x, x**2]))
model = sm.OLS(y, X_design).fit(cov_type="HC3")
auc = np.nan
binary = (y > 0).astype(int)
if len(np.unique(binary)) > 1:
auc = roc_auc_score(binary, model.predict(X_design))
r2_trend = np.nan
if tx is not None and len(tx) >= 3:
X_trend = sm.add_constant(np.column_stack([tx, tx**2]))
y_hat_trend = model.predict(X_trend)
if np.nanvar(ty) > 0:
r2_trend = r2_score(ty, y_hat_trend)
metrics = {
"auc": auc,
"r2_trend": r2_trend,
}
return model, metrics
def plot_overall_quad(
x_max: float = X_MAX,
y_min: float = Y_MIN,
y_max: float = Y_MAX,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
) -> None:
out_dir = bmp.BASE_OUT_DIR / Y_COL
res = bmp.plot_clean_trend_scatter(
bmp.load_client_level(bmp.DB_PATH),
y_col=Y_COL,
out_dir=out_dir,
x_col=X_COL,
x_max=x_max,
scatter_color=bmp.DEFAULT_SCATTER_COLOR,
point_size=bmp.DEFAULT_POINT_SIZE,
alpha=bmp.DEFAULT_TREND_ALPHA,
iqr_k=bmp.DEFAULT_IQR_K,
q_low=bmp.DEFAULT_Q_LOW,
q_high=bmp.DEFAULT_Q_HIGH,
alpha_min=bmp.DEFAULT_ALPHA_MIN,
alpha_max=bmp.DEFAULT_ALPHA_MAX,
bins_x=bmp.DEFAULT_BINS_X,
bins_y=bmp.DEFAULT_BINS_Y,
y_min=y_min,
y_max=y_max,
trend_frac=bmp.DEFAULT_TREND_FRAC,
trend_color=bmp.DEFAULT_TREND_COLOR,
trend_linewidth=bmp.DEFAULT_TREND_LINEWIDTH,
trend_method=bmp.DEFAULT_TREND_METHOD,
savgol_window=savgol_window,
return_components=True,
)
if res is None:
print("No data to plot")
return
fig, ax, cleaned, trend_data = res
model, metrics = fit_quadratic(cleaned, trend_data, x_col=X_COL, y_col=Y_COL, x_max=x_max)
if model is None:
print("Not enough points for the quadratic regression")
fig.savefig(out_dir / "scatter_trend.png", dpi=150)
bmp.plt.close(fig)
return
# Quadratic line on top of the existing trend
x_grid = np.linspace(0, x_max, 400)
X_grid = sm.add_constant(np.column_stack([x_grid, x_grid**2]))
y_grid = model.predict(X_grid)
ax.plot(x_grid, y_grid, color="blue", linewidth=2.2, linestyle="--", label="Квадр. регрессия")
ax.legend()
params = model.params
pvals = model.pvalues
summary_lines = [
f"R2_trend={metrics['r2_trend']:.3f}",
f"AUC={metrics['auc']:.3f}",
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
f"b2={params[2]:.3f} (p={pvals[2]:.3g})",
f"n={len(cleaned)}",
]
ax.text(
0.02,
0.95,
"\n".join(summary_lines),
transform=ax.transAxes,
ha="left",
va="top",
fontsize=9,
bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.65, edgecolor="gray"),
)
quad_path = out_dir / "scatter_trend_quad.png"
fig.tight_layout()
fig.savefig(quad_path, dpi=150)
bmp.plt.close(fig)
print(f"Saved {quad_path}")
def main() -> None:
plot_overall_quad()
if __name__ == "__main__":
main()

87
old data/stat_analysis.py Normal file
View File

@@ -0,0 +1,87 @@
import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
)
.merge(contact_days, on="id", how="left")
.reset_index()
)
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)
client["has_order"] = (client["orders_amt_total"] > 0).astype(int)
# Summary
summary = client[["imp_total", "click_total", "orders_amt_total", "contact_days", "avg_imp_per_day", "ctr_all", "cr_click2order"]].describe().T
print("Summary\n", summary)
missing = client.isna().mean().sort_values(ascending=False)
print("Missing\n", missing.head(10))
# Correlations and Mann-Whitney
corr_ctr = stats.spearmanr(client["avg_imp_per_day"], client["ctr_all"])
corr_cr = stats.spearmanr(client["avg_imp_per_day"], client["cr_click2order"])
q1 = client["avg_imp_per_day"].quantile(0.25)
q4 = client["avg_imp_per_day"].quantile(0.75)
low = client.loc[client["avg_imp_per_day"] <= q1, "ctr_all"].dropna()
high = client.loc[client["avg_imp_per_day"] >= q4, "ctr_all"].dropna()
wu = stats.mannwhitneyu(low, high, alternative="greater")
print({"spearman_ctr": corr_ctr, "spearman_cr": corr_cr, "mw_low_gt_high": wu})
# Bin stats and dual-axis plot
bins = pd.qcut(client["avg_imp_per_day"], 10, duplicates="drop")
stats_bin = client.groupby(bins, observed=False)[["ctr_all", "cr_click2order"]].median().reset_index().rename(columns={"index": "bin"})
stats_bin["avg_imp_per_day"] = client.groupby(bins, observed=False)["avg_imp_per_day"].median().values
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)
fig, ax1 = plt.subplots(figsize=(12, 5))
ax2 = ax1.twinx()
ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
ax1.set_ylabel("CTR")
ax2.set_ylabel("CR click→order")
ax1.set_xlabel("avg_imp_per_day bins")
plt.xticks(rotation=35)
ax1.set_title("CTR и CR по децилям avg_imp_per_day")
fig.tight_layout()
plt.savefig(project_root / "main_hypot" / "stat_bins.png", dpi=150)
print("Saved plot stat_bins.png")