Files
dano2025/spam_hypot/stat_analysis.py
2025-12-12 23:17:56 +03:00

88 lines
3.7 KiB
Python

import sqlite3
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
# Global plotting defaults for every figure produced by this script.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 5)
# Project root = the directory one level above the folder containing this file.
project_root = Path(__file__).resolve().parent.parent
# Make modules under <project_root>/preanalysis importable; the import below
# must therefore come after this path tweak (hence the E402 suppression).
# NOTE(review): presumably eda_utils lives in that folder — confirm.
sys.path.append(str(project_root / "preanalysis"))
import eda_utils as eda # noqa: E402
# Load the whole `communications` table from the project's SQLite dataset,
# parsing `business_dt` as datetimes.
db_path = project_root / "dataset" / "ds.sqlite"
# sqlite3.connect() silently creates an empty database when the file does not
# exist, which would surface later as a confusing "no such table" error and
# leave a stray file behind. Fail early with a clear message instead.
if not db_path.is_file():
    raise FileNotFoundError(f"SQLite dataset not found: {db_path}")
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(
        "select * from communications",
        conn,
        parse_dates=["business_dt"],
    )
finally:
    # Always release the connection, even if the query fails.
    conn.close()
# Row-level totals: each new column is the row-wise sum of one column group
# defined in eda_utils.
total_sources = {
    "active_imp_total": eda.ACTIVE_IMP_COLS,
    "passive_imp_total": eda.PASSIVE_IMP_COLS,
    "active_click_total": eda.ACTIVE_CLICK_COLS,
    "passive_click_total": eda.PASSIVE_CLICK_COLS,
    "orders_amt_total": eda.ORDER_COLS,
}
for total_name, source_cols in total_sources.items():
    df[total_name] = df[source_cols].sum(axis=1)

# Combined (active + passive) impressions and clicks per row.
df["imp_total"] = df["active_imp_total"].add(df["passive_imp_total"])
df["click_total"] = df["active_click_total"].add(df["passive_click_total"])
def _first_mode(values):
    """Return the most frequent non-null value of *values*, or NaN if none exists.

    ``Series.mode()`` returns an empty Series when every value is null, so an
    unconditional ``.iat[0]`` would raise ``IndexError`` for such a client.
    """
    modes = values.mode()
    return modes.iat[0] if not modes.empty else np.nan


# Number of distinct contact dates per client (indexed by `id`).
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

# One row per client: summed activity, median age, most frequent gender/platform,
# plus the contact-day count.
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", _first_mode),
        device_platform_cd=("device_platform_cd", _first_mode),
    )
    # Both sides are indexed by `id`; join aligns on that index (left join).
    .join(contact_days)
    .reset_index()
)
# Per-client ratio metrics, each computed as numerator / denominator through
# eda.safe_divide.
ratio_specs = {
    "ctr_all": ("click_total", "imp_total"),
    "cr_click2order": ("orders_amt_total", "click_total"),
    "avg_imp_per_day": ("imp_total", "contact_days"),
}
for metric, (numerator, denominator) in ratio_specs.items():
    client[metric] = eda.safe_divide(client[numerator], client[denominator])

# Binary flags: CTR at or above the 75th percentile, and at least one order.
ctr_p75 = client["ctr_all"].quantile(0.75)
client["high_ctr"] = client["ctr_all"].ge(ctr_p75).astype(int)
client["has_order"] = client["orders_amt_total"].gt(0).astype(int)
# Summary: descriptive statistics of the client-level metrics (one row per
# metric), followed by the ten columns with the highest share of missing values.
summary_cols = [
    "imp_total",
    "click_total",
    "orders_amt_total",
    "contact_days",
    "avg_imp_per_day",
    "ctr_all",
    "cr_click2order",
]
summary = client[summary_cols].describe().transpose()
print("Summary\n", summary)

missing_share = client.isna().mean()
missing = missing_share.sort_values(ascending=False)
print("Missing\n", missing.head(10))
# Correlations and Mann-Whitney
# scipy's spearmanr defaults to nan_policy="propagate": a single NaN in either
# input makes the whole result NaN. The ratio columns can hold NaN (the
# Mann-Whitney samples below already drop them), so each correlation is
# computed on the rows where both variables are present.
# NOTE(review): presumably eda.safe_divide yields NaN for zero denominators — confirm.
ctr_pairs = client[["avg_imp_per_day", "ctr_all"]].dropna()
cr_pairs = client[["avg_imp_per_day", "cr_click2order"]].dropna()
corr_ctr = stats.spearmanr(ctr_pairs["avg_imp_per_day"], ctr_pairs["ctr_all"])
corr_cr = stats.spearmanr(cr_pairs["avg_imp_per_day"], cr_pairs["cr_click2order"])

# Compare CTR between the bottom and top quartiles of daily impression load.
q1 = client["avg_imp_per_day"].quantile(0.25)
q4 = client["avg_imp_per_day"].quantile(0.75)
low = client.loc[client["avg_imp_per_day"] <= q1, "ctr_all"].dropna()
high = client.loc[client["avg_imp_per_day"] >= q4, "ctr_all"].dropna()
# One-sided test: is CTR in the low-load group stochastically greater than in
# the high-load group?
wu = stats.mannwhitneyu(low, high, alternative="greater")
print({"spearman_ctr": corr_ctr, "spearman_cr": corr_cr, "mw_low_gt_high": wu})
# Bin stats and dual-axis plot
# Split clients into (up to) ten quantile bins of avg_imp_per_day. The bin
# Series is named "bin" so that reset_index() yields a real `bin` column; the
# previous rename of a non-existent "index" column was a no-op and the interval
# column was then overwritten by the medians.
bins = pd.qcut(client["avg_imp_per_day"], 10, duplicates="drop").rename("bin")
binned = client.groupby(bins, observed=False)
stats_bin = binned[["ctr_all", "cr_click2order"]].median().reset_index()
stats_bin["avg_imp_per_day"] = binned["avg_imp_per_day"].median().values
# The x-axis label of each bin is its median avg_imp_per_day, rounded to 2 decimals.
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)

fig, ax1 = plt.subplots(figsize=(12, 5))
ax2 = ax1.twinx()
ax1.plot(stats_bin["bin_label"], stats_bin["ctr_all"], marker="o", color="#4c72b0", label="CTR")
ax2.plot(stats_bin["bin_label"], stats_bin["cr_click2order"], marker="s", color="#c44e52", label="CR")
ax1.set_ylabel("CTR")
ax2.set_ylabel("CR click→order")
ax1.set_xlabel("avg_imp_per_day bins")
# Rotate the labels on ax1 explicitly: plt.xticks() targets the current axes,
# which after twinx() is ax2, whose x-axis is hidden, so the visible labels
# would stay unrotated.
ax1.tick_params(axis="x", labelrotation=35)
ax1.set_title("CTR и CR по децилям avg_imp_per_day")
fig.tight_layout()
fig.savefig(project_root / "spam_hypot" / "stat_bins.png", dpi=150)
# Release the figure once it is written to disk.
plt.close(fig)
print("Saved plot stat_bins.png")