new plots

This commit is contained in:
dan
2025-12-14 17:30:01 +03:00
parent cfee72470c
commit 5cac173b2f
4 changed files with 116 additions and 117 deletions

View File

@@ -1,144 +1,143 @@
import sqlite3 import sqlite3
from pathlib import Path from pathlib import Path
import sys import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
import seaborn as sns import seaborn as sns
import matplotlib.pyplot as plt from statsmodels.nonparametric.smoothers_lowess import lowess
sns.set_theme(style="whitegrid") sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5) plt.rcParams["figure.figsize"] = (10, 6)
project_root = Path(__file__).resolve().parent.parent project_root = Path(__file__).resolve().parent.parent
sys.path.append(str(project_root / "preanalysis_old_bad")) sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda # noqa: E402 import eda_utils as eda # noqa: E402
# Locations, resolved relative to the repository root (`project_root`).
DB_PATH = project_root / "dataset" / "ds.sqlite"
OUT_DIR = project_root / "main_hypot"

# Column names used for the x and y axes of every scatter plot.
X_COL = "avg_imp_per_day"
Y_COL = "orders_amt_total"

# Trim the long tail of impressions so the point clouds are easier to read.
X_MAX = 18
SCATTER_COLOR = "#2c7bb6"
for cols, name in [
def load_client_level(db_path: Path) -> pd.DataFrame:
    """Build one row per client: average impressions per day and total orders.

    Reads the whole ``communications`` table from the SQLite file at
    *db_path*, sums the impression and order column groups named in
    ``eda_utils`` for every row, aggregates per ``id`` and derives ``X_COL``
    as total impressions divided by the number of distinct ``business_dt``
    values. Clients are kept as individual points; nothing is averaged across
    clients that share the same x value.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A new frame with the columns ``id``, ``X_COL`` and ``Y_COL``, without
        NaN rows and restricted to clients whose ``X_COL`` is at most
        ``X_MAX``.
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(
            "select * from communications", conn, parse_dates=["business_dt"]
        )
    finally:
        # Release the database handle even when the query itself fails.
        conn.close()

    # Only the totals that feed the returned columns are computed here.
    for cols, name in [
        (eda.ACTIVE_IMP_COLS, "active_imp_total"),
        (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
        (eda.ORDER_COLS, "orders_amt_total"),
    ]:
        df[name] = df[cols].sum(axis=1)

    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    client = (
        df.groupby("id")
        .agg(
            imp_total=("imp_total", "sum"),
            orders_amt_total=("orders_amt_total", "sum"),
            contact_days=("business_dt", "nunique"),
        )
        .reset_index()
    )

    # NOTE(review): eda.safe_divide is defined outside this file; presumably it
    # yields NaN for a zero denominator, which dropna() below then removes.
    client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
    client[Y_COL] = client["orders_amt_total"]
    client = client[["id", X_COL, Y_COL]].dropna()

    in_range = client[client[X_COL] <= X_MAX].copy()
    print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
    return in_range
# 1) cut by x
stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
def remove_outliers(
    df: pd.DataFrame,
    iqr_k: float = 1.5,
    *,
    lower_q: float = 0.05,
    upper_q: float = 0.95,
) -> pd.DataFrame:
    """Drop points outside quantile-based fences, separately for x and y.

    For each of ``X_COL`` and ``Y_COL`` the fences are
    ``q_low - iqr_k * (q_high - q_low)`` and
    ``q_high + iqr_k * (q_high - q_low)``, where ``q_low`` and ``q_high`` are
    the *lower_q* and *upper_q* quantiles of the column. With the defaults
    this is the 5th-95th percentile spread, which is wider than the classic
    interquartile range; pass ``lower_q=0.25, upper_q=0.75`` for textbook IQR
    fences. Lower fences are clamped at zero and both ends are inclusive.

    Args:
        df: Frame containing the ``X_COL`` and ``Y_COL`` columns.
        iqr_k: Multiplier applied to the quantile spread.
        lower_q: Lower quantile, in ``[0, 1)``.
        upper_q: Upper quantile, in ``(0, 1]`` and greater than *lower_q*.

    Returns:
        A copy of the rows whose x and y values both lie within the fences.

    Raises:
        ValueError: If the quantiles are not ordered within ``[0, 1]``.
    """
    if not 0.0 <= lower_q < upper_q <= 1.0:
        raise ValueError(
            f"expected 0 <= lower_q < upper_q <= 1, got {lower_q} and {upper_q}"
        )

    def bounds(series: pd.Series) -> tuple[float, float]:
        q_low, q_high = series.quantile([lower_q, upper_q])
        spread = q_high - q_low
        return q_low - iqr_k * spread, q_high + iqr_k * spread

    x_low, x_high = bounds(df[X_COL])
    y_low, y_high = bounds(df[Y_COL])

    # The lower fences are clamped at zero before filtering.
    keep = (
        df[X_COL].between(max(0, x_low), x_high)
        & df[Y_COL].between(max(0, y_low), y_high)
    )
    filtered = df[keep].copy()
    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
    return filtered
is_outlier = (abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT) | (y > 5)
# первые точки не могут нормально иметь "3 предыдущих дельты"
is_outlier = is_outlier.fillna(False)
def plot_density_scatter(
    df: pd.DataFrame,
    title: str,
    out_name: str,
    with_trend: bool = False,
    alpha: float = 0.08,
) -> None:
    """Draw a semi-transparent scatter of ``Y_COL`` against ``X_COL`` and save it.

    Args:
        df: Frame containing the ``X_COL`` and ``Y_COL`` columns.
        title: Title placed above the axes.
        out_name: File name of the image written inside ``OUT_DIR``.
        with_trend: When true, overlay a LOWESS curve fitted on *df*.
        alpha: Opacity of the scatter markers.
    """
    figure, axes = plt.subplots(figsize=(10, 6))

    marker_style = {"color": SCATTER_COLOR, "s": 20, "alpha": alpha, "linewidth": 0}
    sns.scatterplot(data=df, x=X_COL, y=Y_COL, ax=axes, **marker_style)

    if with_trend:
        # return_sorted=True gives an (n, 2) array: x in column 0, fit in column 1.
        fitted = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
        trend_x = fitted[:, 0]
        trend_y = fitted[:, 1]
        axes.plot(trend_x, trend_y, color="red", linewidth=2.5, label="LOWESS тренд")
        axes.legend()

    axes.set_xlim(0, X_MAX)
    axes.set_ylim(bottom=0)
    axes.set(
        xlabel="Среднее число показов в день",
        ylabel="Число заказов за период (сумма)",
        title=title,
    )
    axes.grid(alpha=0.3)
    figure.tight_layout()

    # Create the output folder on demand so a fresh checkout can run the script.
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    target = OUT_DIR / out_name
    figure.savefig(target, dpi=150)
    plt.close(figure)
    print(f"Saved {target}")
# нормируем так, чтобы масштаб был сопоставим с заказами
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
# plot
plt.figure(figsize=(10, 8))
def main() -> None:
    """Render the three scatter plots: raw cloud, cleaned cloud, cleaned cloud with trend."""
    all_clients = load_client_level(DB_PATH)

    # The raw cloud is drawn before cleaning so the two can be compared.
    plot_density_scatter(
        all_clients,
        title="Облако: заказы vs средние показы в день (все клиенты)",
        out_name="orders_vs_avg_imp_scatter.png",
    )

    without_outliers = remove_outliers(all_clients)

    # Both remaining figures share the cleaned frame and differ only in options.
    cleaned_plots = (
        {
            "title": "Облако без выбросов (IQR) заказы vs средние показы в день",
            "out_name": "orders_vs_avg_imp_scatter_clean.png",
        },
        {
            "title": "Облако без выбросов + тренд",
            "out_name": "orders_vs_avg_imp_scatter_trend.png",
            "with_trend": True,
            "alpha": 0.1,
        },
    )
    for options in cleaned_plots:
        plot_density_scatter(without_outliers, **options)
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig( if __name__ == "__main__":
project_root / "main_hypot" / "orders_vs_avg_imp_with_costs.png", main()
dpi=150
)
print("Saved orders_vs_avg_imp_with_costs.png")

Binary file not shown.

After

Width:  |  Height:  |  Size: 122 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 124 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 130 KiB