spam hypot

This commit is contained in:
dan
2025-12-12 23:17:56 +03:00
parent 174a96038f
commit ce595182b9
21 changed files with 2845 additions and 362 deletions

View File

@@ -0,0 +1,188 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "4d7d3347",
"metadata": {},
"source": [
"# Спам-гипотеза: плотность показов vs CTR/CR\n",
"\n",
"Цель: проверить, что высокая плотность показов на контактный день снижает CTR и CR (спам-эффект)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7acbd1c8",
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"from pathlib import Path\n",
"import sys\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from scipy import stats\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"sns.set_theme(style=\"whitegrid\")\n",
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
"\n",
"# Walk up from the working directory until a folder containing \"preanalysis\" is found\n",
"# (or the filesystem root is reached), so the notebook can be run from any subfolder.\n",
"project_root = Path.cwd().resolve()\n",
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
"    project_root = project_root.parent\n",
"preanalysis_dir = str(project_root / \"preanalysis\")\n",
"if preanalysis_dir not in sys.path:  # do not pile up duplicates when the cell is re-run\n",
"    sys.path.append(preanalysis_dir)\n",
"import eda_utils as eda\n",
"\n",
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
"# sqlite3.connect() silently creates an empty database for a missing path,\n",
"# so fail early with a clear message instead of a later \"no such table\" error.\n",
"if not db_path.exists():\n",
"    raise FileNotFoundError(f\"SQLite dataset not found: {db_path}\")\n",
"conn = sqlite3.connect(db_path)\n",
"try:\n",
"    df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
"finally:\n",
"    conn.close()  # release the handle even if the query fails\n",
"\n",
"# Per-row totals: sum each column group listed in eda_utils into a single column.\n",
"for cols, name in [\n",
"    (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
"    (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
"    (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
"    (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
"    (eda.ORDER_COLS, \"orders_amt_total\"),\n",
"]:\n",
"    df[name] = df[cols].sum(axis=1)\n",
"\n",
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
"\n",
"\n",
"def _first_mode(values):\n",
"    \"\"\"Return the most frequent non-null value, or NaN when every value is missing.\"\"\"\n",
"    modes = values.mode()\n",
"    # mode() is empty for an all-NaN group; .iat[0] would raise IndexError there.\n",
"    return modes.iat[0] if not modes.empty else np.nan\n",
"\n",
"\n",
"# One row per client id: summed totals, median age, most frequent categorical values\n",
"# and the number of distinct business_dt values (contact days).\n",
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
"client = df.groupby(\"id\").agg(\n",
"    {\n",
"        \"imp_total\": \"sum\",\n",
"        \"click_total\": \"sum\",\n",
"        \"orders_amt_total\": \"sum\",\n",
"        \"age\": \"median\",\n",
"        \"gender_cd\": _first_mode,\n",
"        \"device_platform_cd\": _first_mode,\n",
"    }\n",
").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
"\n",
"# Ratios go through eda.safe_divide (presumably guarding zero denominators; see eda_utils).\n",
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
"# Binary flags: CTR at or above its 75th percentile; at least one order.\n",
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
]
},
{
"cell_type": "markdown",
"id": "94eb2d26",
"metadata": {},
"source": [
"## Базовые статистики"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "287a09b4",
"metadata": {},
"outputs": [],
"source": [
"summary_cols = [\n",
"    \"imp_total\",\n",
"    \"click_total\",\n",
"    \"orders_amt_total\",\n",
"    \"contact_days\",\n",
"    \"avg_imp_per_day\",\n",
"    \"ctr_all\",\n",
"    \"cr_click2order\",\n",
"]\n",
"summary = client[summary_cols].describe().T\n",
"# Share of missing values per column, largest first.\n",
"missing = client.isna().mean().sort_values(ascending=False)\n",
"# display() renders each object with its own rich repr; a bare tuple as the last\n",
"# expression would be shown as a single plain-text repr of both objects.\n",
"display(summary, missing.head(10))\n"
]
},
{
"cell_type": "markdown",
"id": "10cd44b7",
"metadata": {},
"source": [
"## Корреляции и тесты\n",
"\n",
"Корреляция Спирмена между плотностью показов (`avg_imp_per_day`) и CTR/CR, а также односторонний тест Манна–Уитни по CTR между нижним (Q1) и верхним (Q4) квартилями плотности."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88714a03",
"metadata": {},
"outputs": [],
"source": [
"# Spearman needs complete pairs: with scipy's default nan_policy=\"propagate\" a single\n",
"# NaN in either column turns the whole result into NaN, so incomplete pairs are dropped.\n",
"# (The ratio columns may contain NaN; presumably that is why the Mann-Whitney inputs\n",
"# below are passed through dropna() as well.)\n",
"pairs_ctr = client[[\"avg_imp_per_day\", \"ctr_all\"]].dropna()\n",
"pairs_cr = client[[\"avg_imp_per_day\", \"cr_click2order\"]].dropna()\n",
"corr_ctr = stats.spearmanr(pairs_ctr[\"avg_imp_per_day\"], pairs_ctr[\"ctr_all\"])\n",
"corr_cr = stats.spearmanr(pairs_cr[\"avg_imp_per_day\"], pairs_cr[\"cr_click2order\"])\n",
"\n",
"# Density thresholds: q1 is the 25th percentile, q4 the 75th percentile, so `low` and\n",
"# `high` are the CTR values of the bottom and top density quartiles.\n",
"q1 = client[\"avg_imp_per_day\"].quantile(0.25)\n",
"q4 = client[\"avg_imp_per_day\"].quantile(0.75)\n",
"low = client.loc[client[\"avg_imp_per_day\"] <= q1, \"ctr_all\"].dropna()\n",
"high = client.loc[client[\"avg_imp_per_day\"] >= q4, \"ctr_all\"].dropna()\n",
"# One-sided alternative: CTR in the low-density group is stochastically greater.\n",
"wu = stats.mannwhitneyu(low, high, alternative=\"greater\")\n",
"{\"spearman_ctr\": corr_ctr, \"spearman_cr\": corr_cr, \"mw_low_gt_high\": wu}\n"
]
},
{
"cell_type": "markdown",
"id": "20d492fa",
"metadata": {},
"source": [
"## CTR и CR по децилям плотности показов\n",
"\n",
"Медианные CTR и CR (клик→заказ) в децилях `avg_imp_per_day`; по оси X — медианная плотность показов в дециле."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "943f0d4b",
"metadata": {},
"outputs": [],
"source": [
"# Split clients into up to ten quantile bins by impression density; duplicates=\"drop\"\n",
"# collapses bins whose edges coincide. The result is renamed because qcut keeps the\n",
"# source name \"avg_imp_per_day\": reset_index() below would otherwise collide with the\n",
"# aggregated column of the same name and raise ValueError.\n",
"bins = pd.qcut(client[\"avg_imp_per_day\"], 10, duplicates=\"drop\").rename(\"density_bin\")\n",
"# observed is passed explicitly: newer pandas warns when it is omitted for a\n",
"# categorical grouper.\n",
"stats_bin = client.groupby(bins, observed=False).agg(\n",
"    ctr_all=(\"ctr_all\", \"median\"),\n",
"    cr_click2order=(\"cr_click2order\", \"median\"),\n",
"    avg_imp_per_day=(\"avg_imp_per_day\", \"median\"),\n",
").reset_index()\n",
"# X-axis label of each bin: its median density rounded to two decimals.\n",
"stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
"\n",
"fig, ax1 = plt.subplots(figsize=(12, 5))\n",
"ax2 = ax1.twinx()  # second y-axis so CTR and CR keep their own scales\n",
"# legend=False suppresses the per-axis legends; one combined legend is built below.\n",
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"ctr_all\", marker=\"o\", ax=ax1, color=\"#4c72b0\", label=\"CTR\", legend=False)\n",
"sns.lineplot(data=stats_bin, x=\"bin_label\", y=\"cr_click2order\", marker=\"o\", ax=ax2, color=\"#c44e52\", label=\"CR\", legend=False)\n",
"ax1.set_xlabel(\"Медиана avg_imp_per_day в дециле\")\n",
"ax1.set_ylabel(\"CTR\")\n",
"ax2.set_ylabel(\"CR click→order\")\n",
"# Rotate through ax1: after twinx() the current axes is ax2, whose x-axis is hidden,\n",
"# so plt.xticks(rotation=...) would not reach the visible tick labels.\n",
"ax1.tick_params(axis=\"x\", rotation=35)\n",
"handles_ctr, labels_ctr = ax1.get_legend_handles_labels()\n",
"handles_cr, labels_cr = ax2.get_legend_handles_labels()\n",
"ax2.legend(handles_ctr + handles_cr, labels_ctr + labels_cr, loc=\"best\")\n",
"ax1.set_title(\"CTR и CR по децилям avg_imp_per_day\")\n",
"fig.tight_layout()\n",
"plt.show()\n",
"stats_bin[[\"bin_label\", \"ctr_all\", \"cr_click2order\"]]\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}