2025-12-16 01:51:05 +03:00
parent a1bc89c481
commit c963b1e5ac
123 changed files with 5644 additions and 3802 deletions

View File

@@ -1,3 +1,5 @@
"""Базовый набор расчётов и графиков: загрузка клиентов, фильтрация выбросов и построение трендов/квадратики."""
import sqlite3
from pathlib import Path
import sys
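The hunk above shows only the new module docstring and the first imports; the loading, filtering, and fitting code itself sits outside the diff context. For orientation, a minimal self-contained sketch of the kind of pipeline the docstring describes; the table name and helper names below are illustrative assumptions, not this module's actual API:

import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd


def load_clients_sketch(db_path: Path) -> pd.DataFrame:
    # Read the communications table from SQLite (table name taken from other files in this commit).
    conn = sqlite3.connect(db_path)
    df = pd.read_sql_query("select * from communications", conn)
    conn.close()
    return df


def iqr_filter_sketch(df: pd.DataFrame, col: str, k: float = 1.5) -> pd.DataFrame:
    # Keep rows within [Q1 - k*IQR, Q3 + k*IQR] for the given column.
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    return df[(df[col] >= q1 - k * iqr) & (df[col] <= q3 + k * iqr)]


def quad_fit_sketch(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    # Least-squares fit of y = b2*x^2 + b1*x + b0; returns [b2, b1, b0].
    return np.polyfit(x, y, deg=2)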

21 deleted binary image files (plots, 43–143 KiB each) are not shown in this diff.

View File

@@ -1,3 +1,5 @@
"""Категорийный анализ: собирает агрегаты по категориям и строит корреляции/квадратичную регрессию по заказам."""
import sqlite3
from pathlib import Path
import sys
@@ -47,6 +49,7 @@ COMBINED = {
def load_raw(db_path: Path) -> pd.DataFrame:
# Load the full communications table from SQLite
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
@@ -54,6 +57,7 @@ def load_raw(db_path: Path) -> pd.DataFrame:
def build_client_by_category(df: pd.DataFrame) -> pd.DataFrame:
# Aggregate per-client metrics for each category and compute average impressions per day
agg_spec = {f"{col}_{cat}": "sum" for col in BASE_COLUMNS for cat in CATEGORIES}
client = (
df.groupby("id")
@@ -82,6 +86,7 @@ def add_combined_category(client: pd.DataFrame, name: str, cats: list[str]) -> p
def plot_category_correlation(client: pd.DataFrame, cat: str, out_dir: Path) -> None:
# Quick correlation heatmap for a single category
cols = [f"{base}_{cat}" for base in BASE_COLUMNS]
corr = client[cols].corr()
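The plotting half of plot_category_correlation sits outside this hunk. A hedged seaborn/matplotlib sketch of the kind of heatmap the comment describes (the module's actual plotting calls may differ):

from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def plot_corr_heatmap_sketch(client: pd.DataFrame, cols: list[str], out_path: Path) -> None:
    # Correlation matrix over the selected per-category columns, drawn as an annotated heatmap.
    corr = client[cols].corr()
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(corr, annot=True, fmt=".2f", vmin=-1, vmax=1, cmap="coolwarm", ax=ax)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)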
@@ -190,6 +195,7 @@ def plot_quad_for_category(
q_high_overrides: dict | None = None,
iqr_overrides: dict | None = None,
) -> None:
# Build the scatter cloud, trend, and quadratic regression for a specific category, with optional overrides
y_col = f"orders_amt_{cat}"
x_col = f"avg_imp_per_day_{cat}"
out_dir = base_out_dir / y_col
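plot_quad_for_category accepts per-category override dicts whose handling falls outside this hunk. A small sketch of a typical resolution pattern for such overrides (the default names and values here are assumptions, not the file's actual constants):

DEFAULT_Q_HIGH = 0.99  # assumed default upper quantile
DEFAULT_IQR_K = 1.5    # assumed default IQR multiplier


def resolve_overrides_sketch(cat: str, q_high_overrides: dict | None, iqr_overrides: dict | None) -> tuple[float, float]:
    # Fall back to the module defaults when a category has no explicit override.
    q_high = (q_high_overrides or {}).get(cat, DEFAULT_Q_HIGH)
    iqr_k = (iqr_overrides or {}).get(cat, DEFAULT_IQR_K)
    return q_high, iqr_k

new_plots.py below applies the same .get(cat, default) pattern for its x_max/y_max/savgol_window overrides.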

1 deleted binary image file (plot, 135 KiB) is not shown in this diff.

View File

@@ -1,110 +0,0 @@
import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
)
.merge(contact_days, on="id", how="left")
.reset_index()
)
# ... everything as before, up to and including the client["ctr_all"] calculation
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# --- SPLIT FIRST, DERIVE THE TARGET AFTER ---
train_idx, test_idx = train_test_split(
client.index, test_size=0.2, random_state=42
)
train = client.loc[train_idx].copy()
test = client.loc[test_idx].copy()
thr = train["ctr_all"].quantile(0.75)  # threshold computed on train only
train["high_ctr"] = (train["ctr_all"] >= thr).astype(int)
test["high_ctr"] = (test["ctr_all"] >= thr).astype(int)
# --- FEATURES WITHOUT click_total (otherwise it leaks the target) ---
X_train = train[[
"avg_imp_per_day", "imp_total", "contact_days", # можно оставить
"age", "gender_cd", "device_platform_cd"
]].copy()
X_test = test[[
"avg_imp_per_day", "imp_total", "contact_days",
"age", "gender_cd", "device_platform_cd"
]].copy()
X_train["gender_cd"] = eda.normalize_gender(X_train["gender_cd"])
X_train["device_platform_cd"] = eda.normalize_device(X_train["device_platform_cd"])
X_test["gender_cd"] = eda.normalize_gender(X_test["gender_cd"])
X_test["device_platform_cd"] = eda.normalize_device(X_test["device_platform_cd"])
y_train = train["high_ctr"]
y_test = test["high_ctr"]
num_cols = ["avg_imp_per_day", "imp_total", "contact_days", "age"]
cat_cols = ["gender_cd", "device_platform_cd"]
pre = ColumnTransformer([
("num", Pipeline([
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler())
]), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
])
log_reg = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000))])
gb = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])
results = {}
for name, model in [("log_reg", log_reg), ("gb", gb)]:
model.fit(X_train, y_train)
proba = model.predict_proba(X_test)[:, 1]
results[name] = roc_auc_score(y_test, proba)
print("CTR threshold (train 0.75q):", thr)
print("AUC results:", results)
imp = gb.named_steps["clf"].feature_importances_
feat = gb.named_steps["pre"].get_feature_names_out()
imp_df = pd.DataFrame({"feature": feat, "importance": imp}).sort_values("importance", ascending=False)
print(imp_df.head(15))

main_hypot/new_plots.py (new file, 477 lines)
View File

@@ -0,0 +1,477 @@
"""Generate interactive Altair charts from client-level and category-level aggregates."""
from __future__ import annotations
from pathlib import Path
import sys
from typing import Dict, Iterable, Optional, Tuple
import altair as alt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score, r2_score
PROJECT_ROOT = Path(__file__).resolve().parent
sys.path.append(str(PROJECT_ROOT / "main_hypot"))
import best_model_and_plots as bmp
from category_quadreg import (
BASE_COLUMNS,
CATEGORIES,
COMBINED,
add_combined_category,
build_client_by_category,
)
OUTPUT_DIR = PROJECT_ROOT / "new_plots"
FONT_PATH = Path("/Users/dan/Downloads/AyuGram Desktop/SegoeUIVF.ttf")
def inject_font_css(html_path: Path) -> None:
"""Inject @font-face for SegoeUIVF into saved HTML if font exists."""
if not FONT_PATH.exists():
return
font_face = (
"@font-face{font-family:'Segoe UI Variable'; "
f"src: url('{FONT_PATH.as_uri()}') format('truetype'); "
"font-weight:100 900; font-style:normal;}\n"
)
css = f"<style>{font_face}body, text, .vega-bindings {{font-family:'Segoe UI Variable','Segoe UI',sans-serif;}}</style>"
html = html_path.read_text(encoding="utf-8")
if css in html:
return
if "</head>" in html:
html = html.replace("</head>", css + "\n</head>", 1)
else:
html = css + html
html_path.write_text(html, encoding="utf-8")
# Use the theme/fonts from the reference example
def configure_chart(chart: alt.Chart, title: str, width: int = 700, height: int = 500) -> alt.Chart:
# Bring all charts to a consistent style and font
alt.theme.enable("dark")
return (
chart.properties(
title=title,
width=width,
height=height,
padding=30,
)
.configure_title(
fontSize=18,
font="Segoe UI Variable",
fontWeight=600,
anchor="start",
)
.configure_axis(
grid=True,
labelFont="Segoe UI Variable",
titleFont="Segoe UI Variable",
labelFontSize=16,
titleFontSize=18,
labelFontWeight=400,
titleFontWeight=600,
)
.configure_legend(
labelFont="Segoe UI Variable",
titleFont="Segoe UI Variable",
)
)
def prepare_client_data() -> pd.DataFrame:
"""Поднимаем агрегаты по клиентам из существующего скрипта."""
return bmp.load_client_level(bmp.DB_PATH)
def prepare_category_client_data() -> pd.DataFrame:
# Build per-category client metrics and add the combined category groups
raw = pd.read_sql_query("select * from communications", bmp.sqlite3.connect(bmp.DB_PATH), parse_dates=["business_dt"])
client = build_client_by_category(raw)
for combo_name, cats in COMBINED.items():
client = add_combined_category(client, combo_name, cats)
return client
def filter_and_trend(
df: pd.DataFrame,
y_col: str,
*,
x_col: str = bmp.X_COL,
x_max: float = bmp.DEFAULT_X_MAX,
y_max: float = bmp.DEFAULT_Y_MAX,
q_low: float = bmp.DEFAULT_Q_LOW,
q_high: float = bmp.DEFAULT_Q_HIGH,
iqr_k: float = bmp.DEFAULT_IQR_K,
trend_method: str = bmp.DEFAULT_TREND_METHOD,
trend_frac: float = bmp.DEFAULT_TREND_FRAC,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
) -> Tuple[pd.DataFrame, Tuple[np.ndarray, np.ndarray]]:
# Clean the data by IQR, trim the tails, and compute the trend for the subsequent regression
base = df[[x_col, y_col]].dropna()
in_range = bmp.filter_x_range(base, x_col, x_max)
cleaned = bmp.remove_outliers(
in_range,
y_col=y_col,
x_col=x_col,
iqr_k=iqr_k,
q_low=q_low,
q_high=q_high,
)
# Clip at y_max for easier visualization
cleaned = cleaned[cleaned[y_col] <= y_max].copy()
tx, ty = bmp.compute_trend(
cleaned,
y_col=y_col,
x_col=x_col,
method=trend_method,
lowess_frac=trend_frac,
savgol_window=savgol_window,
)
return cleaned, (tx, ty)
def compute_density_alpha(df: pd.DataFrame, x_col: str, y_col: str, x_max: float, y_max: float) -> pd.Series:
# Recompute point opacity from density so the clouds stay readable in HTML
alphas = bmp.compute_density_alpha(
df,
x_col=x_col,
y_col=y_col,
x_max=x_max,
bins_x=bmp.DEFAULT_BINS_X,
bins_y=bmp.DEFAULT_BINS_Y,
alpha_min=bmp.DEFAULT_ALPHA_MIN,
alpha_max=bmp.DEFAULT_ALPHA_MAX,
y_min=bmp.DEFAULT_Y_MIN,
y_max_limit=y_max,
)
if len(alphas) == 0:
return pd.Series([bmp.DEFAULT_ALPHA] * len(df), index=df.index)
return pd.Series(alphas, index=df.index)
def fit_quadratic(
df: pd.DataFrame,
y_col: str,
trend_data: Tuple[np.ndarray, np.ndarray],
*,
x_col: str = bmp.X_COL,
x_max: float = bmp.DEFAULT_X_MAX,
force_negative_b2: bool = False,
) -> Tuple[Optional[sm.regression.linear_model.RegressionResultsWrapper], dict]:
# Fit y ~ 1 + x + x^2 and compute AUC/R2 against the trend, if there are enough points
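# In formula form: y_hat(x) = b0 + b1*x + b2*x^2, estimated by OLS with HC3 robust
# standard errors (the x^2 regressor is negated when force_negative_b2 is set).
# AUC = roc_auc_score(1[y > 0], y_hat(x)) over the cleaned points, and
# R2_trend = 1 - sum_i (s_i - y_hat(t_i))^2 / sum_i (s_i - mean(s))^2
# over the smoothed trend points (t_i, s_i) that survive the x_max/NaN mask.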
if len(df) < 3:
return None, {}
x = df[x_col].to_numpy()
y = df[y_col].to_numpy()
quad_term = -x**2 if force_negative_b2 else x**2
X_design = sm.add_constant(np.column_stack([x, quad_term]))
model = sm.OLS(y, X_design).fit(cov_type="HC3")
# AUC on the binary has-order flag
auc = np.nan
binary = (y > 0).astype(int)
if len(np.unique(binary)) > 1:
auc = roc_auc_score(binary, model.predict(X_design))
# R2 against the trend
tx, ty = trend_data
r2_trend = np.nan
if tx is not None and len(tx) >= 3:
mask = (tx <= x_max) & ~np.isnan(ty)
tx = tx[mask]
ty = ty[mask]
if len(tx) >= 3 and np.nanvar(ty) > 0:
quad_trend = -tx**2 if force_negative_b2 else tx**2
X_trend = sm.add_constant(np.column_stack([tx, quad_trend]))
y_hat_trend = model.predict(X_trend)
r2_trend = r2_score(ty, y_hat_trend)
return model, {"auc": auc, "r2_trend": r2_trend}
def build_annotation(
params: np.ndarray,
pvals: np.ndarray,
metrics: dict,
n: int,
*,
b2_effective: Optional[float] = None,
x_pos: float = 0.5,
) -> pd.DataFrame:
# Prepare metric labels to render on the chart
b2_val = b2_effective if b2_effective is not None else params[2]
lines = [
f"R2_trend={metrics.get('r2_trend', np.nan):.3f}",
f"AUC={metrics.get('auc', np.nan):.3f}",
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
f"b2={b2_val:.3f} (p={pvals[2]:.3g})",
f"n={n}",
]
return pd.DataFrame(
{
"x": [x_pos] * len(lines),
"y": [metrics.get("y_max_for_anno", 0) - i * 0.4 for i in range(len(lines))],
"label": lines,
}
)
def save_scatter_trend_quad(
df: pd.DataFrame,
y_col: str,
out_path: Path,
*,
x_col: str = bmp.X_COL,
x_max: float = bmp.DEFAULT_X_MAX,
y_max: float = bmp.DEFAULT_Y_MAX,
force_negative_b2: bool = False,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
title: str = "",
) -> None:
# Full pipeline: filtering, trend, quadratic fit, and saving the HTML
cleaned, trend_data = filter_and_trend(
df,
y_col=y_col,
x_col=x_col,
x_max=x_max,
y_max=y_max,
trend_method=bmp.DEFAULT_TREND_METHOD,
trend_frac=bmp.DEFAULT_TREND_FRAC,
savgol_window=savgol_window,
)
if trend_data[0] is None:
print(f"[{y_col}] нет тренда/данных для построения")
return
cleaned = cleaned.copy()
cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max)
model, metrics = fit_quadratic(cleaned, y_col, trend_data, x_col=x_col, x_max=x_max, force_negative_b2=force_negative_b2)
if model is None:
print(f"[{y_col}] недостаточно точек для квадрата")
return
params = model.params
pvals = model.pvalues
b2_effective = -abs(params[2]) if force_negative_b2 else params[2]
x_grid = np.linspace(0, x_max, 400)
quad_term = -x_grid**2 if force_negative_b2 else x_grid**2
quad_df = pd.DataFrame(
{
x_col: x_grid,
"quad": model.predict(sm.add_constant(np.column_stack([x_grid, quad_term]))),
}
)
trend_df = pd.DataFrame({x_col: trend_data[0], "trend": trend_data[1]})
metrics["y_max_for_anno"] = y_max * 0.95
metrics_text = [
f"R2_trend={metrics['r2_trend']:.3f}",
f"AUC={metrics['auc']:.3f}",
f"b1={params[1]:.3f} (p={pvals[1]:.3g})",
f"b2={b2_effective:.3f} (p={pvals[2]:.3g})",
f"n={len(cleaned)}",
]
x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max)
y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False)
points = alt.Chart(cleaned).mark_circle(size=40).encode(
x=alt.X(x_col, title="Average impressions per day", scale=x_scale),
y=alt.Y(y_col, title=y_col, scale=y_scale),
opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)),
color=alt.value(bmp.DEFAULT_SCATTER_COLOR),
tooltip=[x_col, y_col],
)
trend_line = alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("trend", scale=y_scale),
)
quad_line = alt.Chart(quad_df).mark_line(color="blue", strokeWidth=2.2, strokeDash=[6, 4]).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("quad", scale=y_scale),
)
subtitle = "".join(metrics_text)
chart = alt.layer(points, trend_line, quad_line).resolve_scale(opacity="independent")
chart = configure_chart(chart, (title or f"{y_col} vs {x_col}") + f"{subtitle}", width=800, height=600)
out_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(out_path)
inject_font_css(out_path)
print(f"Saved {out_path}")
def save_correlation_heatmap(df: pd.DataFrame, cols: Iterable[str], title: str, out_path: Path) -> None:
# Render correlations for the selected columns and save to HTML
corr = df[list(cols)].corr()
corr_long = corr.reset_index().melt(id_vars="index", var_name="col", value_name="corr")
corr_long = corr_long.rename(columns={"index": "row"})
chart = (
alt.Chart(corr_long)
.mark_rect()
.encode(
x=alt.X("col:N", title=""),
y=alt.Y("row:N", title=""),
color=alt.Color("corr:Q", scale=alt.Scale(domain=(-1, 1), scheme="redblue"), legend=alt.Legend(title="corr")),
tooltip=["row", "col", alt.Tooltip("corr:Q", format=".3f")],
)
)
chart = configure_chart(chart, title, width=400, height=400)
out_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(out_path)
inject_font_css(out_path)
print(f"Saved {out_path}")
def generate_total_plots() -> None:
# Main scenario for total orders: build the scatter cloud and trend
df = prepare_client_data()
out_base = OUTPUT_DIR / "orders_amt_total"
save_scatter_trend_quad(
df,
y_col="orders_amt_total",
out_path=out_base / "scatter_trend_quad.html",
x_max=bmp.DEFAULT_X_MAX,
y_max=bmp.DEFAULT_Y_MAX,
savgol_window=bmp.DEFAULT_SAVGOL_WINDOW,
title="Заказы vs средние показы (все клиенты)",
)
def generate_category_plots() -> None:
# Iterate over categories and combined groups, building correlations and scatter clouds
client = prepare_category_client_data()
x_max_overrides = {
"ent": 4,
"transport": 6,
"super": 4,
"avia": 4,
"shopping": 4,
"avia_hotel": 5,
}
y_max_overrides = {
"ent": 2.5,
"transport": 8,
"avia": 1.5,
"shopping": 2.5,
"super": 5.5,
"avia_hotel": 2.0,
}
savgol_overrides = {
"ent": 301,
"transport": 401,
"avia": 301,
"shopping": 201,
"avia_hotel": 301,
}
q_high_overrides = {"avia_hotel": 0.9}
iqr_overrides = {"avia_hotel": 1.2}
cats_all = CATEGORIES + list(COMBINED.keys())
# Correlations
corr_dir = OUTPUT_DIR / "correlations"
for cat in cats_all:
cols = [f"{base}_{cat}" for base in BASE_COLUMNS]
save_correlation_heatmap(
client,
cols,
title=f"Корреляции показов/кликов/заказов: {cat}",
out_path=corr_dir / f"corr_{cat}.html",
)
# Scatter clouds + quadratic fit
for cat in cats_all:
y_col = f"orders_amt_{cat}"
x_col = f"avg_imp_per_day_{cat}"
out_dir = OUTPUT_DIR / y_col
save_scatter_trend_quad(
client,
y_col=y_col,
out_path=out_dir / "scatter_trend_quad.html",
x_col=x_col,
x_max=x_max_overrides.get(cat, bmp.DEFAULT_X_MAX),
y_max=y_max_overrides.get(cat, bmp.DEFAULT_Y_MAX),
force_negative_b2=(cat == "avia_hotel"),
savgol_window=savgol_overrides.get(cat, bmp.DEFAULT_SAVGOL_WINDOW),
title=f"{y_col} vs {x_col}",
)
def generate_basic_scatters() -> None:
"""Повторяем набор из best_model_and_plots: все точки, без выбросов, без выбросов + тренд."""
df = prepare_client_data()
y_col = "orders_amt_total"
x_col = bmp.X_COL
x_max = bmp.DEFAULT_X_MAX
y_max = bmp.DEFAULT_Y_MAX
out_dir = OUTPUT_DIR / y_col
base = df[[x_col, y_col]].dropna()
base = bmp.filter_x_range(base, x_col, x_max)
base = base.copy()
base["alpha"] = compute_density_alpha(base, x_col, y_col, x_max, y_max)
def scatter_chart(data: pd.DataFrame, title: str, trend: Tuple[np.ndarray, np.ndarray] | None = None) -> alt.Chart:
x_scale = alt.Scale(domain=(0, x_max), clamp=True, nice=False, domainMin=0, domainMax=x_max)
y_scale = alt.Scale(domain=(bmp.DEFAULT_Y_MIN, y_max), clamp=True, nice=False)
points = alt.Chart(data).mark_circle(size=40).encode(
x=alt.X(x_col, title="Average impressions per day", scale=x_scale),
y=alt.Y(y_col, title=y_col, scale=y_scale),
opacity=alt.Opacity("alpha:Q", scale=alt.Scale(domain=(0, 1), clamp=True)),
color=alt.value(bmp.DEFAULT_SCATTER_COLOR),
tooltip=[x_col, y_col],
)
layers = [points]
if trend is not None and trend[0] is not None:
trend_df = pd.DataFrame({x_col: trend[0], "trend": trend[1]})
layers.append(
alt.Chart(trend_df).mark_line(color=bmp.DEFAULT_TREND_COLOR, strokeWidth=2.5).encode(
x=alt.X(x_col, scale=x_scale),
y=alt.Y("trend", scale=y_scale),
)
)
chart = alt.layer(*layers).resolve_scale(opacity="independent")
return configure_chart(chart, title, width=800, height=600)
# 1) all points
scatter_chart(base, "Scatter: all points").save(out_dir / "scatter_all.html")
inject_font_css(out_dir / "scatter_all.html")
# 2) outliers removed
cleaned = bmp.remove_outliers(base, y_col=y_col, x_col=x_col, iqr_k=bmp.DEFAULT_IQR_K, q_low=bmp.DEFAULT_Q_LOW, q_high=bmp.DEFAULT_Q_HIGH)
cleaned = cleaned.copy()
cleaned["alpha"] = compute_density_alpha(cleaned, x_col, y_col, x_max, y_max)
scatter_chart(cleaned, "Scatter: outliers removed").save(out_dir / "scatter_clean.html")
inject_font_css(out_dir / "scatter_clean.html")
# 3) outliers removed + trend
tx, ty = bmp.compute_trend(
cleaned,
y_col=y_col,
x_col=x_col,
method=bmp.DEFAULT_TREND_METHOD,
lowess_frac=bmp.DEFAULT_TREND_FRAC,
savgol_window=bmp.DEFAULT_SAVGOL_WINDOW,
)
scatter_chart(cleaned, "Scatter: outliers removed + trend", trend=(tx, ty)).save(out_dir / "scatter_clean_trend.html")
inject_font_css(out_dir / "scatter_clean_trend.html")
def main() -> None:
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
generate_basic_scatters()
generate_total_plots()
generate_category_plots()
if __name__ == "__main__":
main()
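Since the generators in new_plots.py are plain functions with keyword defaults, a single chart can be rebuilt without rerunning the whole script. A usage sketch, assuming the module's directory (main_hypot) is on sys.path:

import new_plots as plots

# Rebuild only the overall orders-vs-impressions chart.
df = plots.prepare_client_data()
plots.save_scatter_trend_quad(
    df,
    y_col="orders_amt_total",
    out_path=plots.OUTPUT_DIR / "orders_amt_total" / "scatter_trend_quad.html",
    title="Orders vs average impressions (all clients)",
)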

View File

@@ -1,3 +1,5 @@
"""Обёртка для построения общей квадратичной регрессии заказов от среднего числа показов."""
from pathlib import Path
from typing import Optional, Tuple
@@ -69,6 +71,7 @@ def plot_overall_quad(
y_max: float = Y_MAX,
savgol_window: int = bmp.DEFAULT_SAVGOL_WINDOW,
) -> None:
# Draw the three scatter clouds (from best_model_and_plots) and overlay the quadratic curve
out_dir = bmp.BASE_OUT_DIR / Y_COL
res = bmp.plot_clean_trend_scatter(
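The rest of plot_overall_quad is outside the diff context. A generic, hedged sketch of the step the comment describes, overlaying a fitted quadratic curve on an existing matplotlib Axes (the Axes handoff from plot_clean_trend_scatter and the coefficient source are assumptions):

import numpy as np
from matplotlib.axes import Axes


def overlay_quadratic_sketch(ax: Axes, coeffs: np.ndarray, x_max: float) -> None:
    # coeffs = [b2, b1, b0], e.g. as returned by np.polyfit(x, y, 2).
    x_grid = np.linspace(0, x_max, 400)
    ax.plot(x_grid, np.polyval(coeffs, x_grid), linestyle="--", linewidth=2, label="quadratic fit")
    ax.legend()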

View File

@@ -1,87 +0,0 @@
import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)
project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
db_path = project_root / "dataset" / "ds.sqlite"
conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close()
for cols, name in [
(eda.ACTIVE_IMP_COLS, "active_imp_total"),
(eda.PASSIVE_IMP_COLS, "passive_imp_total"),
(eda.ACTIVE_CLICK_COLS, "active_click_total"),
(eda.PASSIVE_CLICK_COLS, "passive_click_total"),
(eda.ORDER_COLS, "orders_amt_total"),
]:
df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = (
df.groupby("id")
.agg(
imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
)
.merge(contact_days, on="id", how="left")
.reset_index()
)
client["ctr_all"] = eda.safe_divide(client["click_total"], client["imp_total"])
client["cr_click2order"] = eda.safe_divide(client["orders_amt_total"], client["click_total"])
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
client["high_ctr"] = (client["ctr_all"] >= client["ctr_all"].quantile(0.75)).astype(int)
client["has_order"] = (client["orders_amt_total"] > 0).astype(int)
# Summary
summary = client[["imp_total", "click_total", "orders_amt_total", "contact_days", "avg_imp_per_day", "ctr_all", "cr_click2order"]].describe().T
print("Summary\n", summary)
missing = client.isna().mean().sort_values(ascending=False)
print("Missing\n", missing.head(10))
# Correlations and Mann-Whitney
corr_ctr = stats.spearmanr(client["avg_imp_per_day"], client["ctr_all"])
corr_cr = stats.spearmanr(client["avg_imp_per_day"], client["cr_click2order"])
q1 = client["avg_imp_per_day"].quantile(0.25)
q4 = client["avg_imp_per_day"].quantile(0.75)
low = client.loc[client["avg_imp_per_day"] <= q1, "ctr_all"].dropna()
high = client.loc[client["avg_imp_per_day"] >= q4, "ctr_all"].dropna()
wu = stats.mannwhitneyu(low, high, alternative="greater")
print({"spearman_ctr": corr_ctr, "spearman_cr": corr_cr, "mw_low_gt_high": wu})
# Bin stats and dual-axis plot
bins = pd.qcut(client["avg_imp_per_day"], 10, duplicates="drop")
stats_bin = client.groupby(bins, observed=False)[["ctr_all", "cr_click2order"]].median().reset_index().rename(columns={"index": "bin"})
stats_bin["avg_imp_per_day"] = client.groupby(bins, observed=False)["avg_imp_per_day"].median().values
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)
fig, ax1 = plt.subplots(figsize=(12, 5))
ax2 = ax1.twinx()
ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
ax1.set_ylabel("CTR")
ax2.set_ylabel("CR click→order")
ax1.set_xlabel("avg_imp_per_day bins")
plt.xticks(rotation=35)
ax1.set_title("CTR and CR by deciles of avg_imp_per_day")
fig.tight_layout()
plt.savefig(project_root / "main_hypot" / "stat_bins.png", dpi=150)
print("Saved plot stat_bins.png")