# Files: dano2025/main_hypot/best_model_and_plots.py
# 2025-12-14 17:30:01 +03:00
#
# 144 lines · 4.4 KiB · Python

import sqlite3
from pathlib import Path
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from statsmodels.nonparametric.smoothers_lowess import lowess
# Global plot styling: seaborn "whitegrid" theme and a 10x6 default figure size.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)
# Parent of the directory that contains this file; base for the paths below.
project_root = Path(__file__).resolve().parent.parent
# Put preanalysis_old_bad/ on sys.path so eda_utils can be imported as `eda`.
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402
# SQLite database that main() passes to load_client_level().
DB_PATH = project_root / "dataset" / "ds.sqlite"
# Directory where plot_density_scatter() saves its PNG files.
OUT_DIR = project_root / "main_hypot"
# Column plotted on the x axis (per-client impressions per contact day).
X_COL = "avg_imp_per_day"
# Column plotted on the y axis (per-client order total).
Y_COL = "orders_amt_total"
X_MAX = 18  # trim the long tail of impressions so the point clouds are easier to read
# Colour of the scatter points.
SCATTER_COLOR = "#2c7bb6"
def load_client_level(db_path: Path) -> pd.DataFrame:
    """Collect per-client aggregates without averaging over x.

    Reads the whole ``communications`` table, sums the impression, click and
    order column groups listed in ``eda`` into per-row totals, and then
    aggregates by client ``id``: total impressions, total orders and the
    number of distinct ``business_dt`` values (contact days).

    Args:
        db_path: Path to the SQLite database holding ``communications``.

    Returns:
        A new frame with the columns ``id``, ``X_COL`` (total impressions
        divided by contact days) and ``Y_COL`` (total orders). Rows with NaN
        are dropped and only clients with ``X_COL <= X_MAX`` are kept.
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(
            "select * from communications",
            conn,
            parse_dates=["business_dt"],
        )
    finally:
        # Release the connection even when the query raises (e.g. the table
        # is missing); previously the close was skipped on errors.
        conn.close()

    # Collapse each group of source columns into a single per-row total.
    for cols, name in [
        (eda.ACTIVE_IMP_COLS, "active_imp_total"),
        (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
        (eda.ACTIVE_CLICK_COLS, "active_click_total"),
        (eda.PASSIVE_CLICK_COLS, "passive_click_total"),
        (eda.ORDER_COLS, "orders_amt_total"),
    ]:
        df[name] = df[cols].sum(axis=1)
    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    client = (
        df.groupby("id")
        .agg(
            imp_total=("imp_total", "sum"),
            orders_amt_total=("orders_amt_total", "sum"),
            contact_days=("business_dt", "nunique"),
        )
        .reset_index()
    )

    # NOTE(review): eda.safe_divide is assumed to guard against a zero
    # denominator (no valid contact days) -- confirm in eda_utils.
    client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
    client[Y_COL] = client["orders_amt_total"]
    client = client[["id", X_COL, Y_COL]].dropna()

    in_range = client[client[X_COL] <= X_MAX].copy()
    print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
    return in_range
def remove_outliers(
    df: pd.DataFrame,
    iqr_k: float = 1.5,
    *,
    lower_q: float = 0.05,
    upper_q: float = 0.95,
) -> pd.DataFrame:
    """Drop outliers using inter-quantile fences, separately for x and y.

    For each of ``X_COL`` and ``Y_COL`` the fences are
    ``q_low - iqr_k * (q_high - q_low)`` and
    ``q_high + iqr_k * (q_high - q_low)``, where ``q_low`` / ``q_high`` are
    the ``lower_q`` / ``upper_q`` quantiles of that column. With the default
    5th/95th percentiles this is wider than the classic Q1/Q3 IQR rule; pass
    ``lower_q=0.25, upper_q=0.75`` to get the textbook IQR fences. The lower
    fence is clamped at 0, and a row is kept only when both its x and y
    values fall inside their fences (bounds inclusive).

    Args:
        df: Frame containing the ``X_COL`` and ``Y_COL`` columns.
        iqr_k: Multiplier applied to the inter-quantile spread.
        lower_q: Lower quantile in ``[0, 1]`` used as the fence base.
        upper_q: Upper quantile in ``[0, 1]``; must exceed ``lower_q``.

    Returns:
        A copy of the rows of ``df`` that lie within both fences.

    Raises:
        ValueError: If the quantiles are outside ``[0, 1]`` or not ordered.
    """
    if not 0.0 <= lower_q < upper_q <= 1.0:
        raise ValueError(
            f"Expected 0 <= lower_q < upper_q <= 1, got {lower_q} and {upper_q}."
        )

    def bounds(series: pd.Series) -> tuple[float, float]:
        q_low, q_high = series.quantile([lower_q, upper_q])
        spread = q_high - q_low
        return q_low - iqr_k * spread, q_high + iqr_k * spread

    x_low, x_high = bounds(df[X_COL])
    y_low, y_high = bounds(df[Y_COL])

    # Both columns are non-negative quantities, so never let the lower fence
    # drop below zero.
    filtered = df[
        df[X_COL].between(max(0, x_low), x_high)
        & df[Y_COL].between(max(0, y_low), y_high)
    ].copy()
    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
    return filtered
def plot_density_scatter(
    df: pd.DataFrame,
    title: str,
    out_name: str,
    with_trend: bool = False,
    alpha: float = 0.08,
) -> None:
    """Draw a semi-transparent scatter of ``Y_COL`` vs ``X_COL`` and save it.

    Args:
        df: Frame containing the ``X_COL`` and ``Y_COL`` columns.
        title: Title placed above the axes.
        out_name: File name of the image, created inside ``OUT_DIR``.
        with_trend: When true, overlay a red LOWESS curve (``frac=0.3``)
            together with a legend.
        alpha: Opacity of the scatter points.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.scatterplot(
            data=df,
            x=X_COL,
            y=Y_COL,
            color=SCATTER_COLOR,
            s=20,
            alpha=alpha,
            linewidth=0,
            ax=ax,
        )
        if with_trend:
            # return_sorted=True yields (x, fitted y) pairs ordered by x, so
            # the columns can be drawn directly as a line.
            trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
            ax.plot(trend[:, 0], trend[:, 1], color="red", linewidth=2.5, label="LOWESS тренд")
            # The trend line is the only labeled artist, so the legend is
            # only meaningful on this branch.
            ax.legend()

        ax.set_xlim(0, X_MAX)
        ax.set_ylim(bottom=0)
        ax.set_xlabel("Среднее число показов в день")
        ax.set_ylabel("Число заказов за период (сумма)")
        ax.set_title(title)
        ax.grid(alpha=0.3)

        OUT_DIR.mkdir(parents=True, exist_ok=True)
        out_path = OUT_DIR / out_name
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        # Close the figure even when plotting or saving fails; otherwise it
        # stays registered in pyplot.
        plt.close(fig)
    print(f"Saved {out_path}")
def main() -> None:
    """Load client-level data and render the three scatter plots.

    The first plot uses every loaded client; the remaining two use the
    outlier-filtered data, the last one with the LOWESS trend overlaid.
    """
    clients = load_client_level(DB_PATH)
    plot_density_scatter(
        clients,
        title="Облако: заказы vs средние показы в день (все клиенты)",
        out_name="orders_vs_avg_imp_scatter.png",
    )

    # Outliers are removed only after the raw plot has been written, which
    # keeps the console messages in their original order.
    without_outliers = remove_outliers(clients)
    cleaned_plots = (
        {
            "title": "Облако без выбросов (IQR) заказы vs средние показы в день",
            "out_name": "orders_vs_avg_imp_scatter_clean.png",
        },
        {
            "title": "Облако без выбросов + тренд",
            "out_name": "orders_vs_avg_imp_scatter_trend.png",
            "with_trend": True,
            "alpha": 0.1,
        },
    )
    for plot_kwargs in cleaned_plots:
        plot_density_scatter(without_outliers, **plot_kwargs)


if __name__ == "__main__":
    main()