Files
dano2025/spam_hypot/01_stat_analysis.ipynb
2025-12-12 23:17:56 +03:00

189 lines
7.3 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"id": "4d7d3347",
"metadata": {},
"source": [
"# Спам-гипотеза: плотность показов vs CTR/CR\n",
"\n",
"Цель: проверить, что высокая плотность показов на контактный день снижает CTR и CR (спам-эффект)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7acbd1c8",
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"from pathlib import Path\n",
"import sys\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from scipy import stats\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"sns.set_theme(style=\"whitegrid\")\n",
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
"\n",
"# Walk up from the working directory to the first ancestor that contains `preanalysis`.\n",
"project_root = Path.cwd().resolve()\n",
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
"    project_root = project_root.parent\n",
"# Guard the append so that re-running this cell does not pile up duplicate sys.path entries.\n",
"preanalysis_dir = str(project_root / \"preanalysis\")\n",
"if preanalysis_dir not in sys.path:\n",
"    sys.path.append(preanalysis_dir)\n",
"import eda_utils as eda\n",
"\n",
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
"conn = sqlite3.connect(db_path)\n",
"try:\n",
"    df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
"finally:\n",
"    # Release the database handle even when the query raises.\n",
"    conn.close()\n",
"\n",
"# Row-wise totals: each derived column sums one group of source columns listed in eda_utils.\n",
"for cols, name in [\n",
"    (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
"    (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
"    (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
"    (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
"    (eda.ORDER_COLS, \"orders_amt_total\"),\n",
"]:\n",
"    df[name] = df[cols].sum(axis=1)\n",
"\n",
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
"\n",
"\n",
"def _first_mode(values):\n",
"    \"\"\"Return the most frequent non-null value of a group, or NaN when there is none.\"\"\"\n",
"    modes = values.mode()\n",
"    # Series.mode() drops NaN by default, so an all-NaN group yields an empty result\n",
"    # and a bare `.iat[0]` would raise IndexError.\n",
"    return modes.iat[0] if not modes.empty else np.nan\n",
"\n",
"\n",
"# One row per client id: summed activity, median age, most frequent gender/platform,\n",
"# and the number of distinct dates on which the client appears.\n",
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
"client = df.groupby(\"id\").agg(\n",
"    {\n",
"        \"imp_total\": \"sum\",\n",
"        \"click_total\": \"sum\",\n",
"        \"orders_amt_total\": \"sum\",\n",
"        \"age\": \"median\",\n",
"        \"gender_cd\": _first_mode,\n",
"        \"device_platform_cd\": _first_mode,\n",
"    }\n",
").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
"\n",
"# NOTE(review): eda.safe_divide is defined in eda_utils (not visible here); presumably it\n",
"# guards against zero denominators - confirm what it returns in that case.\n",
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
"# Binary flags: CTR at or above the 75th percentile, and at least one positive order amount.\n",
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
]
},
{
"cell_type": "markdown",
"id": "94eb2d26",
"metadata": {},
"source": [
"## Базовые статистики"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "287a09b4",
"metadata": {},
"outputs": [],
"source": [
"summary_cols = [\"imp_total\", \"click_total\", \"orders_amt_total\", \"contact_days\", \"avg_imp_per_day\", \"ctr_all\", \"cr_click2order\"]\n",
"summary = client[summary_cols].describe().T\n",
"# Share of missing values per column, largest first.\n",
"missing = client.isna().mean().sort_values(ascending=False)\n",
"# A bare tuple as the last expression is shown as a single plain-text repr;\n",
"# display() renders each object separately with its rich representation.\n",
"display(summary, missing.head(10))\n"
]
},
{
"cell_type": "markdown",
"id": "10cd44b7",
"metadata": {},
"source": [
"## Корреляции и тесты\n",
"Корреляция Спирмена между плотностью показов (`avg_imp_per_day`) и CTR/CR, а также односторонний тест Манна–Уитни для CTR: нижний квартиль плотности (Q1) против верхнего (Q4)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88714a03",
"metadata": {},
"outputs": [],
"source": [
"def _spearman_complete(frame, x_col, y_col):\n",
"    \"\"\"Spearman correlation of two columns over rows where both values are present.\"\"\"\n",
"    # spearmanr propagates NaN by default, so a single missing ratio would make the\n",
"    # whole result NaN; drop incomplete pairs first.\n",
"    pairs = frame[[x_col, y_col]].dropna()\n",
"    return stats.spearmanr(pairs[x_col], pairs[y_col])\n",
"\n",
"\n",
"corr_ctr = _spearman_complete(client, \"avg_imp_per_day\", \"ctr_all\")\n",
"corr_cr = _spearman_complete(client, \"avg_imp_per_day\", \"cr_click2order\")\n",
"\n",
"# q1 / q4 are the 25th and 75th percentile cut-offs of impression density, i.e. the\n",
"# upper bound of the lowest quartile and the lower bound of the highest one.\n",
"q1 = client[\"avg_imp_per_day\"].quantile(0.25)\n",
"q4 = client[\"avg_imp_per_day\"].quantile(0.75)\n",
"low = client.loc[client[\"avg_imp_per_day\"] <= q1, \"ctr_all\"].dropna()\n",
"high = client.loc[client[\"avg_imp_per_day\"] >= q4, \"ctr_all\"].dropna()\n",
"# One-sided test: is CTR in the low-density group stochastically greater than in the high-density group?\n",
"wu = stats.mannwhitneyu(low, high, alternative=\"greater\")\n",
"{ \"spearman_ctr\": corr_ctr, \"spearman_cr\": corr_cr, \"mw_low_gt_high\": wu }\n"
]
},
{
"cell_type": "markdown",
"id": "20d492fa",
"metadata": {},
"source": [
"## CTR и CR по децилям плотности показов\n",
"Клиенты разбиваются на децили по `avg_imp_per_day`; для каждого дециля считаются медианные CTR и CR (click→order). Если спам-эффект есть, обе метрики должны снижаться с ростом плотности показов."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "943f0d4b",
"metadata": {},
"outputs": [],
"source": [
"# The grouper gets its own name: qcut keeps the name `avg_imp_per_day`, and because the\n",
"# aggregation also outputs a column with that name, reset_index() would otherwise fail\n",
"# with \"cannot insert avg_imp_per_day, already exists\".\n",
"bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\").rename(\"imp_density_bin\")\n",
"# observed=False is spelled out because the grouper is categorical and pandas warns\n",
"# when the argument is left implicit.\n",
"stats_bin = client.groupby(bins, observed=False).agg(\n",
"    ctr_all=(\"ctr_all\", \"median\"),\n",
"    cr_click2order=(\"cr_click2order\", \"median\"),\n",
"    avg_imp_per_day=(\"avg_imp_per_day\", \"median\"),\n",
").reset_index()\n",
"# Each bin is labelled with the median impression density of its clients.\n",
"stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
"\n",
"fig, ax1 = plt.subplots(figsize=(12, 5))\n",
"ax2 = ax1.twinx()\n",
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\")\n",
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\")\n",
"ax1.set_xlabel(\"Медиана avg_imp_per_day в дециле\")\n",
"ax1.set_ylabel(\"CTR\")\n",
"ax2.set_ylabel(\"CR click→order\")\n",
"# After twinx() the current axes is ax2, whose x-axis is hidden, so plt.xticks(rotation=...)\n",
"# would not touch the visible labels; rotate the ticks on ax1 explicitly.\n",
"ax1.tick_params(axis=\"x\", rotation=35)\n",
"# Pin the two legends to opposite corners so they cannot be drawn on top of each other.\n",
"ax1.legend(loc=\"upper left\")\n",
"ax2.legend(loc=\"upper right\")\n",
"ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n",
"fig.tight_layout()\n",
"plt.show()\n",
"stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}