Files
dano2025/spam_hypot/03_best_model.ipynb
2025-12-12 23:17:56 +03:00

207 lines
7.9 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"id": "d88bf2d8",
"metadata": {},
"source": [
"# Спам-гипотеза: лучшая модель и визуализации\n",
"\n",
"Используем GradientBoostingClassifier (по AUC он лучше логистической регрессии) для подтверждения гипотезы."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "87f3f728",
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"import sys\n",
"from pathlib import Path\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"from scipy import stats\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.impute import SimpleImputer\n",
"from sklearn.metrics import roc_auc_score\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"\n",
"sns.set_theme(style=\"whitegrid\")\n",
"plt.rcParams[\"figure.figsize\"] = (10, 5)\n",
"\n",
"# Walk up from the working directory until a folder containing `preanalysis` is found\n",
"# (the loop also stops at the filesystem root), then make its helpers importable.\n",
"project_root = Path.cwd().resolve()\n",
"while not (project_root / \"preanalysis\").exists() and project_root.parent != project_root:\n",
"    project_root = project_root.parent\n",
"preanalysis_dir = str(project_root / \"preanalysis\")\n",
"if preanalysis_dir not in sys.path:  # keep re-runs of this cell from piling up duplicates\n",
"    sys.path.append(preanalysis_dir)\n",
"import eda_utils as eda\n",
"\n",
"db_path = project_root / \"dataset\" / \"ds.sqlite\"\n",
"# sqlite3.connect() silently creates an empty database when the file is missing,\n",
"# so fail early with a clear message instead of a confusing \"no such table\" error.\n",
"if not db_path.is_file():\n",
"    raise FileNotFoundError(f\"SQLite dataset not found: {db_path}\")\n",
"conn = sqlite3.connect(db_path)\n",
"try:\n",
"    df = pd.read_sql_query(\"select * from communications\", conn, parse_dates=[\"business_dt\"])\n",
"finally:\n",
"    # Close the connection even when the query fails.\n",
"    conn.close()\n",
"\n",
"# Collapse each group of source columns (lists provided by eda_utils) into one per-row total.\n",
"for cols, name in [\n",
"    (eda.ACTIVE_IMP_COLS, \"active_imp_total\"),\n",
"    (eda.PASSIVE_IMP_COLS, \"passive_imp_total\"),\n",
"    (eda.ACTIVE_CLICK_COLS, \"active_click_total\"),\n",
"    (eda.PASSIVE_CLICK_COLS, \"passive_click_total\"),\n",
"    (eda.ORDER_COLS, \"orders_amt_total\"),\n",
"]:\n",
"    df[name] = df[cols].sum(axis=1)\n",
"\n",
"df[\"imp_total\"] = df[\"active_imp_total\"] + df[\"passive_imp_total\"]\n",
"df[\"click_total\"] = df[\"active_click_total\"] + df[\"passive_click_total\"]\n",
"\n",
"\n",
"def _first_mode(values):\n",
"    \"\"\"Return the most frequent non-null value, or NaN when every value is null.\"\"\"\n",
"    modes = values.mode()\n",
"    return modes.iat[0] if not modes.empty else np.nan\n",
"\n",
"\n",
"# One row per client id: summed totals, median age, most frequent gender/device,\n",
"# plus the number of distinct business_dt values seen for that id.\n",
"contact_days = df.groupby(\"id\")[\"business_dt\"].nunique().rename(\"contact_days\")\n",
"client = df.groupby(\"id\").agg(\n",
"    {\n",
"        \"imp_total\": \"sum\",\n",
"        \"click_total\": \"sum\",\n",
"        \"orders_amt_total\": \"sum\",\n",
"        \"age\": \"median\",\n",
"        \"gender_cd\": _first_mode,\n",
"        \"device_platform_cd\": _first_mode,\n",
"    }\n",
").merge(contact_days, on=\"id\", how=\"left\").reset_index()\n",
"\n",
"# Ratio metrics; eda.safe_divide is presumably a zero-safe element-wise division\n",
"# (TODO confirm in eda_utils).\n",
"client[\"ctr_all\"] = eda.safe_divide(client[\"click_total\"], client[\"imp_total\"])\n",
"client[\"cr_click2order\"] = eda.safe_divide(client[\"orders_amt_total\"], client[\"click_total\"])\n",
"client[\"avg_imp_per_day\"] = eda.safe_divide(client[\"imp_total\"], client[\"contact_days\"])\n",
"# Target flags: CTR at or above the 75th percentile, and at least one order.\n",
"client[\"high_ctr\"] = (client[\"ctr_all\"] >= client[\"ctr_all\"].quantile(0.75)).astype(int)\n",
"client[\"has_order\"] = (client[\"orders_amt_total\"] > 0).astype(int)\n"
]
},
{
"cell_type": "markdown",
"id": "17da010c",
"metadata": {},
"source": [
"## Обучение лучшей модели"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "81433d7e",
"metadata": {},
"outputs": [],
"source": [
"num_cols = [\"avg_imp_per_day\", \"imp_total\", \"click_total\", \"age\"]\n",
"cat_cols = [\"gender_cd\", \"device_platform_cd\"]\n",
"\n",
"# Feature matrix (numeric columns first, then categoricals run through the eda normalizers).\n",
"# NOTE(review): the target `high_ctr` is derived from `ctr_all`, which is built from\n",
"# `click_total` and `imp_total`; both totals are also features here, so the model may largely\n",
"# reconstruct the target. Confirm this is intended before reading the AUC as evidence.\n",
"X = client[num_cols + cat_cols].copy()\n",
"X[\"gender_cd\"] = eda.normalize_gender(X[\"gender_cd\"])\n",
"X[\"device_platform_cd\"] = eda.normalize_device(X[\"device_platform_cd\"])\n",
"y = client[\"high_ctr\"]\n",
"\n",
"# Numeric features: median imputation followed by standardization; categoricals: one-hot,\n",
"# with categories unseen during fit encoded as all zeros.\n",
"pre = ColumnTransformer([\n",
"    (\"num\", Pipeline([(\"imputer\", SimpleImputer(strategy=\"median\")), (\"scaler\", StandardScaler())]), num_cols),\n",
"    (\"cat\", OneHotEncoder(handle_unknown=\"ignore\"), cat_cols),\n",
"])\n",
"\n",
"# GradientBoostingClassifier comes from sklearn.ensemble (imported in the first code cell).\n",
"best = Pipeline([(\"pre\", pre), (\"clf\", GradientBoostingClassifier(random_state=42))])\n",
"# Stratified 80/20 split with a fixed seed so the reported AUC is reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n",
"best.fit(X_train, y_train)\n",
"proba = best.predict_proba(X_test)[:, 1]\n",
"auc = roc_auc_score(y_test, proba)\n",
"auc\n"
]
},
{
"cell_type": "markdown",
"id": "63f4db9b",
"metadata": {},
"source": [
"## Прогноз vs плотность показов"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f48584b5",
"metadata": {},
"outputs": [],
"source": [
"# Sweep avg_imp_per_day over its observed range (50 points) while every other feature is held\n",
"# at a typical value: median for numeric columns, most frequent value for categoricals.\n",
"grid = pd.DataFrame({\"avg_imp_per_day\": np.linspace(client[\"avg_imp_per_day\"].min(), client[\"avg_imp_per_day\"].max(), 50)})\n",
"base = client.median(numeric_only=True)\n",
"# The model was fitted on categories passed through eda.normalize_gender / normalize_device,\n",
"# so the baseline values must be normalized the same way; a raw code unknown to the encoder\n",
"# would silently be encoded as all zeros (handle_unknown=\"ignore\").\n",
"base_gender = pd.Series(eda.normalize_gender(client[\"gender_cd\"])).mode().iat[0]\n",
"base_device = pd.Series(eda.normalize_device(client[\"device_platform_cd\"])).mode().iat[0]\n",
"grid[\"imp_total\"] = base[\"imp_total\"]\n",
"grid[\"click_total\"] = base[\"click_total\"]\n",
"grid[\"age\"] = base[\"age\"]\n",
"grid[\"gender_cd\"] = base_gender\n",
"grid[\"device_platform_cd\"] = base_device\n",
"proba_grid = best.predict_proba(grid)[:, 1]\n",
"\n",
"fig, ax = plt.subplots(figsize=(10, 4))\n",
"ax.plot(grid[\"avg_imp_per_day\"], proba_grid, marker=\"o\")\n",
"ax.set_xlabel(\"avg_imp_per_day\")\n",
"ax.set_ylabel(\"P(high CTR)\")\n",
"ax.set_title(\"Предсказанная вероятность высокого CTR vs плотность показов\")\n",
"fig.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"id": "32f73b44",
"metadata": {},
"source": [
"## График CTR и CR по тонким бинам (две оси)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bb4d0190",
"metadata": {},
"outputs": [],
"source": [
"# Quantile bins of impression density. The grouper is renamed because pd.qcut keeps the\n",
"# source name `avg_imp_per_day`, which would collide with the aggregated column of the same\n",
"# name and make reset_index() raise \"cannot insert avg_imp_per_day, already exists\".\n",
"bins = pd.qcut(client[\"avg_imp_per_day\"], 15, duplicates=\"drop\").rename(\"imp_bin\")\n",
"stats_bin = (\n",
"    client.groupby(bins, observed=True)  # explicit `observed` avoids the pandas default-change warning\n",
"    .agg({\"ctr_all\": \"median\", \"cr_click2order\": \"median\", \"avg_imp_per_day\": \"median\"})\n",
"    .reset_index()\n",
")\n",
"# Label each bin by its median impression density.\n",
"stats_bin[\"bin_label\"] = stats_bin[\"avg_imp_per_day\"].round(2).astype(str)\n",
"\n",
"fig, ax1 = plt.subplots(figsize=(12, 5))\n",
"ax2 = ax1.twinx()  # second y-axis so CTR and CR can use different scales\n",
"(ctr_line,) = ax1.plot(stats_bin[\"bin_label\"], stats_bin[\"ctr_all\"], marker=\"o\", color=\"#4c72b0\", label=\"CTR\")\n",
"(cr_line,) = ax2.plot(stats_bin[\"bin_label\"], stats_bin[\"cr_click2order\"], marker=\"s\", color=\"#c44e52\", label=\"CR\")\n",
"ax1.set_ylabel(\"CTR\")\n",
"ax2.set_ylabel(\"CR click→order\")\n",
"ax1.set_xlabel(\"avg_imp_per_day bins\")\n",
"# Rotate the labels on ax1 explicitly: after twinx() the current axes is ax2, whose x-axis is\n",
"# hidden, so plt.xticks(rotation=...) would not reach the visible tick labels.\n",
"ax1.tick_params(axis=\"x\", rotation=35)\n",
"# One legend for both axes so the `label=` values passed above are actually shown.\n",
"ax1.legend(handles=[ctr_line, cr_line], loc=\"best\")\n",
"ax1.set_title(\"CTR и CR по 15 бинам avg_imp_per_day\")\n",
"fig.tight_layout()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"id": "ebb2ca5e",
"metadata": {},
"source": [
"## Вывод\n",
"- AUC модели GradientBoosting выше, чем у логистической регрессии; `avg_imp_per_day` — ключевой драйвер: рост плотности показов снижает шанс попасть в верхний квартиль CTR.\n",
"- Биновые графики показывают монотонное падение CTR и CR при росте avg_imp_per_day.\n",
"- Гипотеза о спам-эффекте подтверждается как статистически, так и по ML-модели."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}