88 lines
3.7 KiB
Python
88 lines
3.7 KiB
Python
import sqlite3
|
|
from pathlib import Path
|
|
import sys
|
|
import numpy as np
|
|
import pandas as pd
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
from scipy import stats
|
|
|
|
# Global plotting defaults. The seaborn theme is applied first; the default
# figure size is set afterwards, in the same order as before.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})
|
|
|
|
# Project root: two directory levels above this script's resolved location.
project_root = Path(__file__).resolve().parents[1]

# Append <project_root>/preanalysis to the module search path so that modules
# stored in that directory can be imported by the statements that follow.
sys.path.append(str(project_root.joinpath("preanalysis")))
|
|
import eda_utils as eda # noqa: E402
|
|
|
|
# Location of the SQLite database, relative to the project root.
db_path = project_root / "dataset" / "ds.sqlite"

# Load the whole `communications` table into a DataFrame, parsing
# `business_dt` as datetimes. The connection is closed in `finally` so that a
# failing query (missing table, unreadable file, ...) does not leak the handle;
# previously close() was reached only when the read succeeded.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(
        "select * from communications",
        conn,
        parse_dates=["business_dt"],
    )
finally:
    conn.close()
|
|
|
|
# Row-wise totals: each new column is the sum, across one row, of the source
# columns listed in the matching eda_utils constant.
total_sources = {
    "active_imp_total": eda.ACTIVE_IMP_COLS,
    "passive_imp_total": eda.PASSIVE_IMP_COLS,
    "active_click_total": eda.ACTIVE_CLICK_COLS,
    "passive_click_total": eda.PASSIVE_CLICK_COLS,
    "orders_amt_total": eda.ORDER_COLS,
}
for total_name, source_cols in total_sources.items():
    df[total_name] = df[source_cols].sum(axis=1)

# Combined totals: active plus passive, for impressions and for clicks.
df["imp_total"] = df["active_imp_total"].add(df["passive_imp_total"])
df["click_total"] = df["active_click_total"].add(df["passive_click_total"])
|
|
|
|
def _first_mode(values):
    """Return the first mode of *values*, or NaN when there is none.

    ``Series.mode()`` drops missing values by default, so a group whose values
    are all missing yields an empty result. Indexing that with ``.iat[0]``
    raises ``IndexError`` and aborts the whole aggregation; returning NaN keeps
    such clients in the output with an explicit missing value instead.
    """
    modes = values.mode()
    if modes.empty:
        return np.nan
    return modes.iat[0]


# Number of distinct `business_dt` values per client id.
contact_days = df.groupby("id")["business_dt"].nunique().rename("contact_days")

# One row per client id: summed impression/click/order totals, median age,
# and the most frequent gender / device platform codes.
client = (
    df.groupby("id")
    .agg(
        imp_total=("imp_total", "sum"),
        click_total=("click_total", "sum"),
        orders_amt_total=("orders_amt_total", "sum"),
        age=("age", "median"),
        gender_cd=("gender_cd", _first_mode),
        device_platform_cd=("device_platform_cd", _first_mode),
    )
    # Both sides are indexed by `id`; the left merge attaches contact_days.
    .merge(contact_days, on="id", how="left")
    .reset_index()
)
|
|
|
|
# Per-client ratios, all computed through eda.safe_divide:
#   ctr_all         = clicks / impressions
#   cr_click2order  = orders_amt_total / clicks
#   avg_imp_per_day = impressions / contact days
ratio_specs = [
    ("ctr_all", "click_total", "imp_total"),
    ("cr_click2order", "orders_amt_total", "click_total"),
    ("avg_imp_per_day", "imp_total", "contact_days"),
]
for ratio_name, numerator, denominator in ratio_specs:
    client[ratio_name] = eda.safe_divide(client[numerator], client[denominator])

# 0/1 flags: CTR at or above its 75th percentile, and a positive order total.
ctr_p75 = client["ctr_all"].quantile(0.75)
client["high_ctr"] = client["ctr_all"].ge(ctr_p75).astype(int)
client["has_order"] = client["orders_amt_total"].gt(0).astype(int)
|
|
|
|
# Summary
# Descriptive statistics of the main client-level metrics, one metric per row.
summary_columns = [
    "imp_total",
    "click_total",
    "orders_amt_total",
    "contact_days",
    "avg_imp_per_day",
    "ctr_all",
    "cr_click2order",
]
summary = client[summary_columns].describe().transpose()
print("Summary\n", summary)

# Share of missing values per column, largest first; print the top ten.
missing = client.isna().mean().sort_values(ascending=False)
print("Missing\n", missing.iloc[:10])
|
|
|
|
# Correlations and Mann-Whitney
def _spearman_pairwise(frame, x_col, y_col):
    """Spearman correlation of two columns over rows where both are present.

    ``scipy.stats.spearmanr`` defaults to ``nan_policy="propagate"``, so a
    single NaN in either column turns the whole result into NaN. Rows with a
    missing value in either column are dropped before the call.
    """
    pairs = frame[[x_col, y_col]].dropna()
    return stats.spearmanr(pairs[x_col], pairs[y_col])


# The Mann-Whitney step below already drops NaN from ctr_all, so the ratio
# columns are expected to contain missing values; handle them here as well.
corr_ctr = _spearman_pairwise(client, "avg_imp_per_day", "ctr_all")
corr_cr = _spearman_pairwise(client, "avg_imp_per_day", "cr_click2order")

# Quartile cut-offs of avg_imp_per_day. NOTE: despite its name, `q4` is the
# 75th percentile, i.e. the lower edge of the top quartile.
q1 = client["avg_imp_per_day"].quantile(0.25)
q4 = client["avg_imp_per_day"].quantile(0.75)

# CTR of clients in the bottom and top quartiles of avg_imp_per_day.
low = client.loc[client["avg_imp_per_day"] <= q1, "ctr_all"].dropna()
high = client.loc[client["avg_imp_per_day"] >= q4, "ctr_all"].dropna()

# One-sided test: is CTR in the low group greater than in the high group?
wu = stats.mannwhitneyu(low, high, alternative="greater")
print({"spearman_ctr": corr_ctr, "spearman_cr": corr_cr, "mw_low_gt_high": wu})
|
|
|
|
# Bin stats and dual-axis plot
# Quantile bins of avg_imp_per_day (10 requested; duplicate bin edges are
# dropped, so fewer bins may come back).
bins = pd.qcut(client["avg_imp_per_day"], 10, duplicates="drop")

# Per-bin medians from a single groupby. The grouping key is renamed to "bin"
# so reset_index() emits the interval under that name. Previously the key
# column was called "avg_imp_per_day", the rename of "index" matched nothing,
# and the intervals were then overwritten by the per-bin medians.
stats_bin = (
    client.groupby(bins.rename("bin"), observed=False)[
        ["ctr_all", "cr_click2order", "avg_imp_per_day"]
    ]
    .median()
    .reset_index()
)
# X-axis label of each bin: its median avg_imp_per_day, rounded to 2 decimals.
stats_bin["bin_label"] = stats_bin["avg_imp_per_day"].round(2).astype(str)

# CTR on the left y-axis, CR on the right y-axis, sharing one x-axis.
fig, ax1 = plt.subplots(figsize=(12, 5))
ax2 = ax1.twinx()
(ctr_line,) = ax1.plot(
    stats_bin["bin_label"], stats_bin["ctr_all"],
    marker="o", color="#4c72b0", label="CTR",
)
(cr_line,) = ax2.plot(
    stats_bin["bin_label"], stats_bin["cr_click2order"],
    marker="s", color="#c44e52", label="CR",
)
ax1.set_ylabel("CTR")
ax2.set_ylabel("CR click→order")
ax1.set_xlabel("avg_imp_per_day bins")
# Rotate the labels on ax1, which owns the visible x-axis. plt.xticks() acts
# on the current axes (the twin, whose x-axis is hidden), so it had no
# visible effect.
ax1.tick_params(axis="x", labelrotation=35)
# The line labels were set but never shown; draw one legend for both lines on
# the top-most axes so it is not covered by the twin's artists.
ax2.legend(handles=[ctr_line, cr_line], loc="best")
ax1.set_title("CTR и CR по децилям avg_imp_per_day")
fig.tight_layout()

# Save the figure, creating the output directory if it does not exist yet.
plot_path = project_root / "spam_hypot" / "stat_bins.png"
plot_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(plot_path, dpi=150)
print("Saved plot stat_bins.png")
|