new plots

This commit is contained in:
dan
2025-12-14 17:30:01 +03:00
parent cfee72470c
commit 5cac173b2f
4 changed files with 116 additions and 117 deletions

View File

@@ -1,19 +1,29 @@
import sqlite3 import sqlite3
from pathlib import Path from pathlib import Path
import sys import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt from statsmodels.nonparametric.smoothers_lowess import lowess
sns.set_theme(style="whitegrid") sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5) plt.rcParams["figure.figsize"] = (10, 6)
project_root = Path(__file__).resolve().parent.parent project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad")) sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402 import eda_utils as eda # noqa: E402
db_path = project_root / "dataset" / "ds.sqlite" DB_PATH = project_root / "dataset" / "ds.sqlite"
OUT_DIR = project_root / "main_hypot"
X_COL = "avg_imp_per_day"
Y_COL = "orders_amt_total"
X_MAX = 18 # обрезаем длинный хвост по показам, чтобы облака было легче читать
SCATTER_COLOR = "#2c7bb6"
def load_client_level(db_path: Path) -> pd.DataFrame:
"""Собирает агрегаты по клиентам без усреднения по x."""
conn = sqlite3.connect(db_path) conn = sqlite3.connect(db_path)
df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"]) df = pd.read_sql_query("select * from communications", conn, parse_dates=["business_dt"])
conn.close() conn.close()
@@ -28,117 +38,106 @@ for cols, name in [
df[name] = df[cols].sum(axis=1) df[name] = df[cols].sum(axis=1)
df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"] df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]
df["click_total"] = df["active_click_total"] + df["passive_click_total"]
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")
client = ( client = (
df.groupby("id") df.groupby("id")
.agg( .agg(
imp_total=("imp_total", "sum"), imp_total=("imp_total", "sum"),
click_total=("click_total", "sum"),
orders_amt_total=("orders_amt_total", "sum"), orders_amt_total=("orders_amt_total", "sum"),
age=("age", "median"), contact_days=("business_dt", "nunique"),
gender_cd=("gender_cd", lambda s: s.mode().iat[0]),
device_platform_cd=("device_platform_cd", lambda s: s.mode().iat[0]),
) )
.merge(contact_days, on="id", how="left")
.reset_index() .reset_index()
) )
client["order_rate"] = eda.safe_divide(client["orders_amt_total"], client["imp_total"]) # orders / impressions
client["order_rate_pct"] = 100 * client["order_rate"] # чтобы шкала была человеческая
client["avg_imp_per_day"] = eda.safe_divide(client["imp_total"], client["contact_days"])
# Mean absolute orders for each exact avg_imp_per_day (no bins), sorted ascending client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
stats_imp = ( client[Y_COL] = client["orders_amt_total"]
client.groupby("avg_imp_per_day", as_index=False) client = client[["id", X_COL, Y_COL]].dropna()
.agg(
orders_mean=("orders_amt_total", "mean"), in_range = client[client[X_COL] <= X_MAX].copy()
n_clients=("id", "count"), print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
) return in_range
.sort_values("avg_imp_per_day")
def remove_outliers(
    df: pd.DataFrame,
    iqr_k: float = 1.5,
    *,
    lower_q: float = 0.05,
    upper_q: float = 0.95,
) -> pd.DataFrame:
    """Drop rows whose X_COL or Y_COL value falls outside a quantile fence.

    For each of the two columns independently, the fence is
    ``[q_low - iqr_k * spread, q_high + iqr_k * spread]`` where ``q_low`` and
    ``q_high`` are the ``lower_q`` / ``upper_q`` quantiles of that column and
    ``spread = q_high - q_low``. The lower edge is clamped at 0.

    NOTE: with the default quantiles (0.05 / 0.95) the spread is the 5th-95th
    percentile range, not the classical interquartile range, so the fence is
    considerably wider than a Tukey IQR fence. Pass ``lower_q=0.25,
    upper_q=0.75`` for the textbook IQR rule.

    Args:
        df: Frame containing the X_COL and Y_COL columns.
        iqr_k: Multiplier applied to the quantile spread on both sides.
        lower_q: Lower quantile used as the base of the fence.
        upper_q: Upper quantile used as the base of the fence.

    Returns:
        A copy of the rows of ``df`` that lie inside both fences (edges
        inclusive). ``df`` itself is not modified.
    """

    def _fence(series: pd.Series) -> tuple[float, float]:
        # Both quantiles are requested in one call so they come from the
        # same pass over the column.
        q_low, q_high = series.quantile([lower_q, upper_q])
        spread = q_high - q_low
        return q_low - iqr_k * spread, q_high + iqr_k * spread

    x_low, x_high = _fence(df[X_COL])
    y_low, y_high = _fence(df[Y_COL])

    # The lower edge is clamped at 0: a negative fence would never exclude
    # anything for non-negative data (presumably both columns are counts).
    keep = (
        df[X_COL].between(max(0, x_low), x_high)
        & df[Y_COL].between(max(0, y_low), y_high)
    )
    filtered = df[keep].copy()
    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
    return filtered
def plot_density_scatter(
df: pd.DataFrame,
title: str,
out_name: str,
with_trend: bool = False,
alpha: float = 0.08,
) -> None:
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
data=df,
x=X_COL,
y=Y_COL,
color=SCATTER_COLOR,
s=20,
alpha=alpha,
linewidth=0,
ax=ax,
) )
K_MULT = 2 # "в разы" -> 5x. Поменяй на 3/10 если хочешь if with_trend:
ABS_DY_MIN = 1 trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
X_MAX = 16 ax.plot(trend[:, 0], trend[:, 1], color="red", linewidth=2.5, label="LOWESS тренд")
ax.legend()
stats_imp = stats_imp.sort_values("avg_imp_per_day").reset_index(drop=True) ax.set_xlim(0, X_MAX)
ax.set_ylim(bottom=0)
ax.set_xlabel("Среднее число показов в день")
ax.set_ylabel("Число заказов за период (сумма)")
ax.set_title(title)
ax.grid(alpha=0.3)
# 1) cut by x OUT_DIR.mkdir(parents=True, exist_ok=True)
stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True) out_path = OUT_DIR / out_name
fig.tight_layout()
fig.savefig(out_path, dpi=150)
plt.close(fig)
print(f"Saved {out_path}")
# 2) detect vertical outliers by dy logic
before = len(stats_f)
y = stats_f["orders_mean"]
abs_dy = y.diff().abs()
prev3_mean = abs_dy.shift(1).rolling(window=3, min_periods=3).mean() def main() -> None:
ratio = abs_dy / (prev3_mean.replace(0, np.nan)) # avoid inf when prev mean == 0 client = load_client_level(DB_PATH)
is_outlier = (abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT) | (y > 5) plot_density_scatter(
# первые точки не могут нормально иметь "3 предыдущих дельты" client,
is_outlier = is_outlier.fillna(False) title="Облако: заказы vs средние показы в день (все клиенты)",
out_name="orders_vs_avg_imp_scatter.png",
stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
after = len(stats_f)
cleaned = before - after
print(f"{before} - {after}, cleaned: {cleaned}")
# --- smoothing (rolling mean on remaining points) ---
w = max(7, int(len(stats_f) * 0.05))
if w % 2 == 0:
w += 1
stats_f["orders_smooth"] = (
stats_f["orders_mean"]
.rolling(window=w, center=True, min_periods=1)
.mean()
)
# --- cost line (linear expenses) ---
# нормируем так, чтобы масштаб был сопоставим с заказами
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
# plot
plt.figure(figsize=(10, 8))
plt.plot(
stats_f["avg_imp_per_day"],
stats_f["orders_mean"],
marker="o",
linewidth=1,
alpha=0.3,
label="Среднее число заказов"
) )
plt.plot( cleaned = remove_outliers(client)
stats_f["avg_imp_per_day"], plot_density_scatter(
stats_f["orders_smooth"], cleaned,
color="red", title="Облако без выбросов (IQR) заказы vs средние показы в день",
linewidth=2.5, out_name="orders_vs_avg_imp_scatter_clean.png",
label="Сглаженный тренд заказов"
) )
plt.plot( plot_density_scatter(
stats_f["avg_imp_per_day"], cleaned,
stats_f["cost_line"], title="Облако без выбросов + тренд",
color="black", out_name="orders_vs_avg_imp_scatter_trend.png",
linestyle="--", with_trend=True,
linewidth=2, alpha=0.1,
label="Линейные расходы на показы"
) )
plt.xlabel("Среднее число показов в день")
plt.ylabel("Среднее число заказов")
plt.title("Зависимость заказов от интенсивности коммуникаций")
plt.legend() if __name__ == "__main__":
plt.grid(alpha=0.3) main()
plt.tight_layout()
plt.savefig(
project_root / "main_hypot" / "orders_vs_avg_imp_with_costs.png",
dpi=150
)
print("Saved orders_vs_avg_imp_with_costs.png")

Binary file not shown.

After

Width:  |  Height:  |  Size: 122 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 124 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 130 KiB