new plots
This commit is contained in:
@@ -1,144 +1,143 @@
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import numpy as np
|
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
import matplotlib.pyplot as plt
|
from statsmodels.nonparametric.smoothers_lowess import lowess
|
||||||
|
|
||||||
# Shared look for every figure produced by this script.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# Repository root: the directory two levels above this file.
project_root = Path(__file__).resolve().parent.parent
# eda_utils lives in preanalysis_old_bad, so that directory has to be on
# sys.path before the import below (hence the E402 suppression).
sys.path.append(str(project_root / "preanalysis_old_bad"))
import eda_utils as eda  # noqa: E402

DB_PATH = project_root / "dataset" / "ds.sqlite"  # SQLite file that is read
OUT_DIR = project_root / "main_hypot"  # directory the PNG files are written to
X_COL = "avg_imp_per_day"  # x axis: impressions per contact day
Y_COL = "orders_amt_total"  # y axis: summed orders per client
X_MAX = 18  # trim the long tail of impressions so the clouds are easier to read
SCATTER_COLOR = "#2c7bb6"
|
||||||
|
|
||||||
for cols, name in [
|
|
||||||
|
def load_client_level(db_path: Path) -> pd.DataFrame:
    """Build a per-client table of impression intensity and order totals.

    Reads every row of the ``communications`` table, sums the column groups
    listed in ``eda_utils`` into per-row totals, aggregates them per client
    ``id`` and derives the number of impressions per distinct contact day.
    Clients are kept as individual points; nothing is averaged over x.

    Args:
        db_path: Path to the SQLite database containing ``communications``.

    Returns:
        A new frame with the columns ``id``, ``X_COL`` and ``Y_COL``, limited
        to clients whose ``X_COL`` value is at most ``X_MAX``.
    """
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql_query(
            "select * from communications", conn, parse_dates=["business_dt"]
        )
    finally:
        # Close the connection even when the query fails; the sqlite3
        # connection object does not close itself on error.
        conn.close()

    # Per-row totals. Only the totals that feed the aggregation below are
    # computed; the click totals were never read afterwards.
    for cols, name in [
        (eda.ACTIVE_IMP_COLS, "active_imp_total"),
        (eda.PASSIVE_IMP_COLS, "passive_imp_total"),
        (eda.ORDER_COLS, "orders_amt_total"),
    ]:
        df[name] = df[cols].sum(axis=1)

    df["imp_total"] = df["active_imp_total"] + df["passive_imp_total"]

    client = (
        df.groupby("id")
        .agg(
            imp_total=("imp_total", "sum"),
            orders_amt_total=("orders_amt_total", "sum"),
            contact_days=("business_dt", "nunique"),
        )
        .reset_index()
    )

    # NOTE(review): safe_divide presumably guards against a zero denominator;
    # confirm in eda_utils. Rows left without a value are dropped below.
    client[X_COL] = eda.safe_divide(client["imp_total"], client["contact_days"])
    # A no-op while Y_COL == "orders_amt_total"; kept so the selection below
    # always finds a column named Y_COL.
    client[Y_COL] = client["orders_amt_total"]
    client = client[["id", X_COL, Y_COL]].dropna()

    in_range = client[client[X_COL] <= X_MAX].copy()
    print(f"Loaded {len(client)} clients; {len(in_range)} within x<={X_MAX} kept for plotting.")
    return in_range
|
||||||
|
|
||||||
# 1) cut by x
|
|
||||||
stats_f = stats_imp[stats_imp["avg_imp_per_day"] <= X_MAX].copy().reset_index(drop=True)
|
|
||||||
|
|
||||||
# 2) detect vertical outliers by dy logic
|
def remove_outliers(
    df: pd.DataFrame,
    iqr_k: float = 1.5,
    *,
    x_col: "str | None" = None,
    y_col: "str | None" = None,
    lower_q: float = 0.05,
    upper_q: float = 0.95,
) -> pd.DataFrame:
    """Drop rows whose x or y value lies outside a quantile-based fence.

    For each of the two columns the fence is
    ``[q_low - iqr_k * spread, q_high + iqr_k * spread]`` where ``q_low`` and
    ``q_high`` are the ``lower_q`` / ``upper_q`` quantiles and ``spread`` is
    their difference. With the default 0.05 / 0.95 quantiles this is wider
    than a classic IQR fence, which would use 0.25 / 0.75. The lower fence is
    clamped at 0 before filtering.

    Args:
        df: Frame holding the two columns to filter on.
        iqr_k: Multiplier applied to the quantile spread.
        x_col: Name of the x column; defaults to the module-level ``X_COL``.
        y_col: Name of the y column; defaults to the module-level ``Y_COL``.
        lower_q: Lower quantile used for the fence.
        upper_q: Upper quantile used for the fence.

    Returns:
        A copy of the rows that lie inside both fences (bounds inclusive).

    Raises:
        ValueError: If the quantiles are not ordered within ``[0, 1]``.
    """
    if not 0 <= lower_q < upper_q <= 1:
        raise ValueError("expected 0 <= lower_q < upper_q <= 1")

    # Resolve the module defaults lazily so explicit column names work even
    # where the constants are not defined.
    x_name = X_COL if x_col is None else x_col
    y_name = Y_COL if y_col is None else y_col

    def bounds(series: pd.Series) -> tuple[float, float]:
        q_low, q_high = series.quantile([lower_q, upper_q])
        spread = q_high - q_low
        return q_low - iqr_k * spread, q_high + iqr_k * spread

    x_low, x_high = bounds(df[x_name])
    y_low, y_high = bounds(df[y_name])

    keep = (
        df[x_name].between(max(0, x_low), x_high)
        & df[y_name].between(max(0, y_low), y_high)
    )
    filtered = df[keep].copy()
    print(f"Outlier cleaning: {len(df)} -> {len(filtered)} points (IQR k={iqr_k}).")
    return filtered
|
||||||
|
|
||||||
is_outlier = (abs_dy >= ABS_DY_MIN) & (ratio >= K_MULT) | (y > 5)
|
|
||||||
# первые точки не могут нормально иметь "3 предыдущих дельты"
|
|
||||||
is_outlier = is_outlier.fillna(False)
|
|
||||||
|
|
||||||
stats_f = stats_f.loc[~is_outlier].copy().reset_index(drop=True)
|
def plot_density_scatter(
    df: pd.DataFrame,
    title: str,
    out_name: str,
    with_trend: bool = False,
    alpha: float = 0.08,
) -> None:
    """Save a semi-transparent scatter of ``Y_COL`` against ``X_COL`` as a PNG.

    Args:
        df: Frame with the ``X_COL`` and ``Y_COL`` columns, one row per point.
        title: Title drawn above the axes.
        out_name: File name of the image; it is written inside ``OUT_DIR``.
        with_trend: When true, overlay a LOWESS curve (``frac=0.3``) in red.
        alpha: Marker opacity; low values make dense regions look darker.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.scatterplot(
            data=df,
            x=X_COL,
            y=Y_COL,
            color=SCATTER_COLOR,
            s=20,
            alpha=alpha,
            linewidth=0,
            ax=ax,
        )

        # There is nothing to smooth in an empty frame, so skip the trend
        # (and its legend) instead of handing LOWESS zero points.
        if with_trend and not df.empty:
            trend = lowess(df[Y_COL], df[X_COL], frac=0.3, return_sorted=True)
            ax.plot(trend[:, 0], trend[:, 1], color="red", linewidth=2.5, label="LOWESS тренд")
            ax.legend()

        ax.set_xlim(0, X_MAX)
        ax.set_ylim(bottom=0)
        ax.set_xlabel("Среднее число показов в день")
        ax.set_ylabel("Число заказов за период (сумма)")
        ax.set_title(title)
        ax.grid(alpha=0.3)

        OUT_DIR.mkdir(parents=True, exist_ok=True)
        out_path = OUT_DIR / out_name
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        # Release the figure even if plotting or saving raised; pyplot keeps
        # a reference to every figure until it is closed.
        plt.close(fig)
    print(f"Saved {out_path}")
|
||||||
# нормируем так, чтобы масштаб был сопоставим с заказами
|
|
||||||
c = stats_f["orders_smooth"].max() / stats_f["avg_imp_per_day"].max()
|
|
||||||
stats_f["cost_line"] = c * stats_f["avg_imp_per_day"]
|
|
||||||
|
|
||||||
# plot
|
|
||||||
plt.figure(figsize=(10, 8))
|
|
||||||
|
|
||||||
plt.plot(
|
def main() -> None:
    """Render the three scatter plots: raw, cleaned, and cleaned with trend."""
    raw = load_client_level(DB_PATH)
    plot_density_scatter(
        df=raw,
        title="Облако: заказы vs средние показы в день (все клиенты)",
        out_name="orders_vs_avg_imp_scatter.png",
    )

    # Outliers are removed only after the raw plot has been written; both
    # remaining figures are drawn from the same cleaned frame.
    trimmed = remove_outliers(raw)
    cleaned_plots = (
        {
            "title": "Облако без выбросов (IQR) заказы vs средние показы в день",
            "out_name": "orders_vs_avg_imp_scatter_clean.png",
        },
        {
            "title": "Облако без выбросов + тренд",
            "out_name": "orders_vs_avg_imp_scatter_trend.png",
            "with_trend": True,
            "alpha": 0.1,
        },
    )
    for plot_kwargs in cleaned_plots:
        plot_density_scatter(trimmed, **plot_kwargs)


if __name__ == "__main__":
    main()
|
||||||
dpi=150
|
|
||||||
)
|
|
||||||
|
|
||||||
print("Saved orders_vs_avg_imp_with_costs.png")
|
|
||||||
|
|||||||
BIN
main_hypot/orders_vs_avg_imp_scatter.png
Normal file
BIN
main_hypot/orders_vs_avg_imp_scatter.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 122 KiB |
BIN
main_hypot/orders_vs_avg_imp_scatter_clean.png
Normal file
BIN
main_hypot/orders_vs_avg_imp_scatter_clean.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 124 KiB |
BIN
main_hypot/orders_vs_avg_imp_scatter_trend.png
Normal file
BIN
main_hypot/orders_vs_avg_imp_scatter_trend.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 130 KiB |
Reference in New Issue
Block a user